<?php

namespace BookStack\Entities\Tools;

use BookStack\Util\HtmlDocument;
use Closure;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMText;

/**
 * Finds include tags (text matching $includeTagRegex, such as "{{@123}}")
 * within a HTML document and swaps each one for the DOM nodes supplied
 * by the provided content closure.
 */
class PageIncludeParser
{
    protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/";

    /**
     * Elements to clean up and remove if left empty after a parsing operation.
     * @var DOMElement[]
     */
    protected array $toCleanup = [];

    /**
     * @param HtmlDocument $doc The document to search and update in place.
     * @param Closure(PageIncludeTag $tag): PageIncludeContent $pageContentForId
     *        Provides the content which should replace the given tag.
     */
    public function __construct(
        protected HtmlDocument $doc,
        protected Closure $pageContentForId,
    ) {
    }

    /**
     * Parse out the include tags.
     * Returns the count of new content DOM nodes added to the document.
     */
    public function parse(): int
    {
        $nodesAdded = 0;
        $tags = $this->locateAndIsolateIncludeTags();

        foreach ($tags as $tag) {
            // Closure::call() binds the closure's $this to this parser for the
            // duration of the call. Kept as-is since callers may depend on it.
            /** @var PageIncludeContent $content */
            $content = $this->pageContentForId->call($this, $tag);

            // Non-inline content is moved out of any paragraph which holds the tag,
            // either by splitting that paragraph or by placing the tag beside it.
            if (!$content->isInline()) {
                $parentP = $this->getParentParagraph($tag->domNode);
                $isWithinParentP = $parentP === $tag->domNode->parentNode;
                if ($parentP && $isWithinParentP) {
                    $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode);
                } elseif ($parentP) {
                    $this->moveTagNodeToBesideParent($tag, $parentP);
                }
            }

            $replacementNodes = $content->toDomNodes();
            $nodesAdded += count($replacementNodes);
            $this->replaceNodeWithNodes($tag->domNode, $replacementNodes);
        }

        $this->cleanup();

        return $nodesAdded;
    }

    /**
     * Locate include tags within the given document, isolating them to their
     * own nodes in the DOM for future targeted manipulation.
     * @return PageIncludeTag[]
     */
    protected function locateAndIsolateIncludeTags(): array
    {
        $includeHosts = $this->doc->queryXPath("//*[text()[contains(., '{{@')]]");
        $includeTags = [];

        /** @var DOMNode $node */
        foreach ($includeHosts as $node) {
            // Iterate a snapshot of the children: splitting a text node inserts
            // new siblings into this same parent, and we do not want those
            // insertions to disturb iteration over the live child list.
            $childNodes = [...$node->childNodes];

            /** @var DOMNode $childNode */
            foreach ($childNodes as $childNode) {
                if ($childNode->nodeName === '#text') {
                    array_push($includeTags, ...$this->splitTextNodesAtTags($childNode));
                }
            }
        }

        return $includeTags;
    }

    /**
     * Takes a text DOMNode and splits its text content at include tags
     * into multiple text nodes within the original parent.
     * Each tag ends up in a text node of its own, inserted before the original
     * node, with the original node left holding any text after the last tag.
     * Returns found PageIncludeTag references.
     * @return PageIncludeTag[]
     */
    protected function splitTextNodesAtTags(DOMNode $textNode): array
    {
        $includeTags = [];
        $text = $textNode->textContent;
        $parent = $textNode->parentNode;

        // PREG_OFFSET_CAPTURE reports byte offsets, hence the byte-based
        // substr()/strlen() calls below instead of their mb_* equivalents.
        preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE);

        $currentOffset = 0;
        foreach ($matches[0] as $index => $fullTagMatch) {
            $tagOuterContent = $fullTagMatch[0];
            $tagInnerContent = $matches[1][$index][0];
            $tagStartOffset = $fullTagMatch[1];

            // Carry over any plain text found between the previous tag and this one.
            if ($currentOffset < $tagStartOffset) {
                $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset);
                $parent->insertBefore($this->doc->createTextNode($previousText), $textNode);
            }

            $node = $parent->insertBefore($this->doc->createTextNode($tagOuterContent), $textNode);
            $includeTags[] = new PageIncludeTag($tagInnerContent, $node);
            $currentOffset = $tagStartOffset + strlen($tagOuterContent);
        }

        // Only touch the original node if at least one tag was split out of it.
        if ($currentOffset > 0) {
            $textNode->textContent = substr($text, $currentOffset);
        }

        return $includeTags;
    }

    /**
     * Replace the given node with all those in $replacements.
     * Replacements belonging to another document are imported (deep) first.
     * @param DOMNode[] $replacements
     */
    protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void
    {
        /** @var DOMDocument $targetDoc */
        $targetDoc = $toReplace->ownerDocument;

        foreach ($replacements as $replacement) {
            if ($replacement->ownerDocument !== $targetDoc) {
                $replacement = $targetDoc->importNode($replacement, true);
            }

            $toReplace->parentNode->insertBefore($replacement, $toReplace);
        }

        $toReplace->parentNode->removeChild($toReplace);
    }

    /**
     * Move a tag node to become a sibling of the given parent.
     * Will attempt to guess a position based upon the tag content within the parent:
     * a tag found in the first half of the parent text is placed before the parent,
     * otherwise it is placed after.
     */
    protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void
    {
        $parentText = $parent->textContent;
        $tagPos = strpos($parentText, $tag->tagContent);

        // strpos() returns false when the content is not found; handle that
        // explicitly (placing the tag before) instead of relying on how
        // false compares against a number.
        $before = $tagPos === false || $tagPos < (strlen($parentText) / 2);

        // The node the tag is leaving may be left empty by this move.
        $this->toCleanup[] = $tag->domNode->parentNode;

        if ($before) {
            $parent->parentNode->insertBefore($tag->domNode, $parent);
        } else {
            $parent->parentNode->insertBefore($tag->domNode, $parent->nextSibling);
        }
    }

    /**
     * Splits the given $parentNode at the location of the $domNode within it.
     * Attempts to replicate the original $parentNode as a shallow clone (without
     * its ID), moving the children found before $domNode into that clone, before
     * adding the $domNode between the clone and the original.
     */
    protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void
    {
        $children = [...$parentNode->childNodes];
        $splitPos = array_search($domNode, $children, true);
        if ($splitPos === false) {
            $splitPos = count($children) - 1;
        }

        // Shallow clone: copies the element and its attributes but not its children.
        $parentClone = $parentNode->cloneNode();
        $parentNode->parentNode->insertBefore($parentClone, $parentNode);
        $parentClone->removeAttribute('id');

        for ($i = 0; $i < $splitPos; $i++) {
            /** @var DOMNode $child */
            $child = $children[$i];
            $parentClone->appendChild($child);
        }

        $parentNode->parentNode->insertBefore($domNode, $parentNode);

        // Either half may have been left without content by the split.
        $this->toCleanup[] = $parentNode;
        $this->toCleanup[] = $parentClone;
    }

    /**
     * Get the parent paragraph of the given node, if existing.
     * The given node itself is returned if it is a paragraph.
     */
    protected function getParentParagraph(DOMNode $parent): ?DOMNode
    {
        for ($node = $parent; $node !== null; $node = $node->parentNode) {
            if (strtolower($node->nodeName) === 'p') {
                return $node;
            }
        }

        return null;
    }

    /**
     * Cleanup after a parse operation.
     * Removes stranded elements we may have left during the parse, walking
     * upwards to also remove ancestors which become empty as a result.
     */
    protected function cleanup(): void
    {
        foreach ($this->toCleanup as $element) {
            $element->normalize();
            while ($element->parentNode && !$element->hasChildNodes()) {
                $parent = $element->parentNode;
                $parent->removeChild($element);
                $element = $parent;
            }
        }

        // Reset so that a later parse() does not re-process stale elements.
        $this->toCleanup = [];
    }
}