X-Git-Url: http://source.bookstackapp.com/bookstack/blobdiff_plain/04d21c8a97da9463e6bedda620ecf1282722ea3e..refs/pull/5685/head:/app/Entities/Tools/PageIncludeParser.php

diff --git a/app/Entities/Tools/PageIncludeParser.php b/app/Entities/Tools/PageIncludeParser.php
index 63d3ea8d6..e0b89f158 100644
--- a/app/Entities/Tools/PageIncludeParser.php
+++ b/app/Entities/Tools/PageIncludeParser.php
@@ -4,54 +4,217 @@ namespace BookStack\Entities\Tools;
 
 use BookStack\Util\HtmlDocument;
 use Closure;
+use DOMDocument;
+use DOMElement;
+use DOMNode;
+use DOMText;
 
 class PageIncludeParser
 {
     protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/";
 
+    /**
+     * Elements queued during parsing for cleanup(): each is normalized, then removed if left without child nodes.
+     * @var DOMElement[] Appended to by moveTagNodeToBesideParent() and splitNodeAtChildNode().
+     */
+    protected array $toCleanup = [];
+
+    /**
+     * @param Closure(PageIncludeTag $tag): PageContent $pageContentForId NOTE(review): parse() treats the result as PageIncludeContent; confirm the declared return type.
+     */
     public function __construct(
-        protected string $pageHtml,
+        protected HtmlDocument $doc,
         protected Closure $pageContentForId,
     ) {
     }
 
-    public function parse(): string
+    /**
+     * Parse out the include tags, replacing each with the nodes of the content returned by $pageContentForId.
+     * Returns the count of new content DOM nodes added to the document.
+     */
+    public function parse(): int
+    {
+        $nodesAdded = 0;
+        $tags = $this->locateAndIsolateIncludeTags(); // Each tag now sits in its own text node
+
+        foreach ($tags as $tag) {
+            /** @var PageIncludeContent $content */
+            $content = $this->pageContentForId->call($this, $tag); // Closure runs bound to this parser instance
+
+            if (!$content->isInline()) { // Non-inline content is lifted out of any enclosing paragraph
+                $parentP = $this->getParentParagraph($tag->domNode);
+                $isWithinParentP = $parentP === $tag->domNode->parentNode; // Tag is a direct child of the paragraph
+                if ($parentP && $isWithinParentP) {
+                    $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode);
+                } else if ($parentP) {
+                    $this->moveTagNodeToBesideParent($tag, $parentP); // Nested deeper within the paragraph
+                }
+            }
+
+            $replacementNodes = $content->toDomNodes();
+            $nodesAdded += count($replacementNodes);
+            $this->replaceNodeWithNodes($tag->domNode, $replacementNodes);
+        }
+
+        $this->cleanup(); // Remove elements left empty by the splits/moves above
+
+        return $nodesAdded;
+    }
+
+    /**
+     * Locate include tags within this parser's document, isolating each into its
+     * own text node in the DOM for future targeted manipulation.
+     * @return PageIncludeTag[]
+     */
+    protected function locateAndIsolateIncludeTags(): array
+    {
+        $includeHosts = $this->doc->queryXPath("//*[text()[contains(., '{{@')]]"); // Elements with a direct text child containing '{{@'
+        $includeTags = [];
+
+        /** @var DOMNode $node */
+        foreach ($includeHosts as $node) {
+            /** @var DOMNode $childNode */
+            foreach ($node->childNodes as $childNode) {
+                if ($childNode->nodeName === '#text') { // Only text nodes are scanned for tag text
+                    array_push($includeTags, ...$this->splitTextNodesAtTags($childNode));
+                }
+            }
+        }
+
+        return $includeTags;
+    }
+
+    /**
+     * Takes a text DOMNode and splits its text content at include tags
+     * into multiple text nodes within the original parent.
+     * Returns a PageIncludeTag per match, each referencing its newly isolated text node; the original node keeps only the trailing text.
+     * @return PageIncludeTag[]
+     */
+    protected function splitTextNodesAtTags(DOMNode $textNode): array
+    {
+        $includeTags = [];
+        $text = $textNode->textContent;
+        preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE); // Offsets are in bytes, matching substr()/strlen() below
+
+        $currentOffset = 0; // Byte position in $text just past the last tag handled
+        foreach ($matches[0] as $index => $fullTagMatch) {
+            $tagOuterContent = $fullTagMatch[0];
+            $tagInnerContent = $matches[1][$index][0];
+            $tagStartOffset = $fullTagMatch[1];
+
+            if ($currentOffset < $tagStartOffset) { // Keep plain text found before this tag as its own node
+                $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset);
+                $textNode->parentNode->insertBefore($this->doc->createTextNode($previousText), $textNode);
+            }
+
+            $node = $textNode->parentNode->insertBefore($this->doc->createTextNode($tagOuterContent), $textNode); // Tag text isolated in its own node
+            $includeTags[] = new PageIncludeTag($tagInnerContent, $node);
+            $currentOffset = $tagStartOffset + strlen($tagOuterContent);
+        }
+
+        if ($currentOffset > 0) { // At least one tag matched: leave only the trailing text in the original node
+            $textNode->textContent = substr($text, $currentOffset);
+        }
+
+        return $includeTags;
+    }
+
+    /**
+     * Replace the given node with all those in $replacements, inserted in order at its position.
+     * @param DOMNode[] $replacements
+     */
+    protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void
+    {
+        /** @var DOMDocument $targetDoc */
+        $targetDoc = $toReplace->ownerDocument;
+
+        foreach ($replacements as $replacement) {
+            if ($replacement->ownerDocument !== $targetDoc) { // Nodes from another document are imported first
+                $replacement = $targetDoc->importNode($replacement, true); // Deep import, descendants included
+            }
+
+            $toReplace->parentNode->insertBefore($replacement, $toReplace);
+        }
+
+        $toReplace->parentNode->removeChild($toReplace); // Original node is dropped even when $replacements is empty
+    }
+
+    /**
+     * Move a tag node to become a sibling of the given parent.
+     * Placed before the parent when the tag content first occurs within the first half of the parent's text, otherwise after it.
+     */
+    protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void
     {
-        $html = new HtmlDocument($this->pageHtml);
+        $parentText = $parent->textContent;
+        $tagPos = strpos($parentText, $tag->tagContent);
+        $before = $tagPos < (strlen($parentText) / 2); // Byte-based positional guess
+        $this->toCleanup[] = $tag->domNode->parentNode; // Old parent may be left empty once the tag is moved out
 
-        $includeHosts = $html->queryXPath("//body//*[contains(text(), '{{@')]");
-        $node = $includeHosts->item(0);
+        if ($before) {
+            $parent->parentNode->insertBefore($tag->domNode, $parent);
+        } else {
+            $parent->parentNode->insertBefore($tag->domNode, $parent->nextSibling); // A null nextSibling appends at the end
+        }
+    }
 
-        // One of the direct child textnodes of the "$includeHosts" should be
-        // the one with the include tag within.
-        $textNode = $node->childNodes->item(0);
+    /**
+     * Splits the given $parentNode at the location of the $domNode within it.
+     * A shallow clone of $parentNode (minus its id) is inserted before it and receives the children
+     * that precede $domNode; $domNode is then moved out to sit between the clone and the original.
+     */
+    protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void
+    {
+        $children = [...$parentNode->childNodes]; // Snapshot, since the live list changes as children move
+        $splitPos = array_search($domNode, $children, true);
+        if ($splitPos === false) { // $domNode is not a direct child: treat the last child as the split point
+            $splitPos = count($children) - 1;
+        }
 
-        // TODO:
-        // Hunt down the specific text nodes with matches
-        // Split out tag text node from rest of content
-        // Fetch tag content->
-          // If range or top-block: delete tag text node, [Promote to top-block], delete old top-block if empty
-          // If inline: Replace current text node with new text or elem
-        // !! "Range" or "inline" status should come from tag parser and content fetcher, not guessed direct from content
-        //     since we could have a range of inline elements
+        $parentClone = $parentNode->cloneNode(); // Shallow clone: no children copied
+        $parentNode->parentNode->insertBefore($parentClone, $parentNode);
+        $parentClone->removeAttribute('id'); // Presumably so the id is not duplicated in the document
 
-        // [Promote to top-block]
-        // Tricky operation.
-        // Can throw in before or after current top-block depending on relative position
-        // Could [Split] top-block but complex past a single level depth.
-        // Maybe [Split] if one level depth, otherwise default to before/after block
-        // Should work for the vast majority of cases, and not for those which would
-        // technically be invalid in-editor anyway.
+        for ($i = 0; $i < $splitPos; $i++) {
+            /** @var DOMNode $child */
+            $child = $children[$i];
+            $parentClone->appendChild($child); // Appending moves the child out of $parentNode
+        }
 
-        // [Split]
-        // Copy original top-block node type and attrs (apart from ID)
-        // Move nodes after promoted tag-node into copy
-        // Insert copy after original (after promoted top-block eventually)
+        $parentNode->parentNode->insertBefore($domNode, $parentNode); // Lands between the clone and the original
 
-        // Notes: May want to eventually parse through backwards, which should avoid issues
-        // in changes affecting the next tag, where tags may be in the same/adjacent nodes.
+        $this->toCleanup[] = $parentNode; // Either half may now be empty; cleanup() removes it if so
+        $this->toCleanup[] = $parentClone;
+    }
 
+    /**
+     * Get the nearest <p> for the given node: the node itself if it is one, otherwise its closest <p> ancestor, or null.
+     */
+    protected function getParentParagraph(DOMNode $parent): ?DOMNode
+    {
+        do {
+            if (strtolower($parent->nodeName) === 'p') {
+                return $parent;
+            }
 
-        return $html->getBodyInnerHtml();
+            $parent = $parent->parentNode; // Step up one level
+        } while ($parent !== null);
+
+        return null;
+    }
+
+    /**
+     * Cleanup after a parse operation.
+     * Removes queued elements left without child nodes, continuing up through any ancestors emptied as a result.
+     */
+    protected function cleanup(): void
+    {
+        foreach ($this->toCleanup as $element) {
+            $element->normalize(); // Merges adjacent text nodes and drops empty ones
+            while ($element->parentNode && !$element->hasChildNodes()) { // Remove if childless, then re-check the parent
+                $parent = $element->parentNode;
+                $parent->removeChild($element);
+                $element = $parent;
+            }
+        }
     }
 }