BookStack Code Mirror - bookstack/blob - app/Entities/Tools/PageIncludeParser.php
Includes: Updated logic regarding parent block els, added tests
[bookstack] / app / Entities / Tools / PageIncludeParser.php
1 <?php
2
3 namespace BookStack\Entities\Tools;
4
5 use BookStack\Util\HtmlDocument;
6 use Closure;
7 use DOMDocument;
8 use DOMElement;
9 use DOMNode;
10 use DOMText;
11
class PageIncludeParser
{
    /**
     * Pattern used to find include tags, written as "{{@...}}", within text.
     * The first capture group holds the inner tag content, which has to
     * start with a digit.
     */
    protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/";

    /**
     * Elements to clean up and remove if left empty after a parsing operation.
     * @var DOMElement[]
     */
    protected array $toCleanup = [];

    /**
     * @param string  $pageHtml         The HTML to search for include tags.
     * @param Closure $pageContentForId Called with the page ID read from each tag;
     *                                  its result is used as the HTML to include.
     */
    public function __construct(
        protected string $pageHtml,
        protected Closure $pageContentForId,
    ) {
    }

    /**
     * Parse out the include tags, replacing each with the content
     * provided for it, and return the resulting body HTML.
     */
    public function parse(): string
    {
        $doc = new HtmlDocument($this->pageHtml);

        $tags = $this->locateAndIsolateIncludeTags($doc);

        foreach ($tags as $tag) {
            $htmlContent = $this->pageContentForId->call($this, $tag->getPageId());
            $content = new PageIncludeContent($htmlContent, $tag);

            // Non-inline content is moved out of any paragraph which holds the tag:
            // the paragraph is split when the tag is its direct child, otherwise
            // the tag is moved to sit beside the paragraph.
            if (!$content->isInline()) {
                $parentP = $this->getParentParagraph($tag->domNode);
                $isWithinParentP = $parentP === $tag->domNode->parentNode;
                if ($parentP && $isWithinParentP) {
                    $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode);
                } elseif ($parentP) {
                    $this->moveTagNodeToBesideParent($tag, $parentP);
                }
            }

            $this->replaceNodeWithNodes($tag->domNode, $content->toDomNodes());
        }

        $this->cleanup();

        return $doc->getBodyInnerHtml();
    }

    /**
     * Locate include tags within the given document, isolating them to their
     * own nodes in the DOM for future targeted manipulation.
     * @return PageIncludeTag[]
     */
    protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array
    {
        $includeHosts = $doc->queryXPath("//body//*[text()[contains(., '{{@')]]");
        $includeTags = [];

        /** @var DOMNode $node */
        foreach ($includeHosts as $node) {
            // Iterate over a snapshot of the children since splitting a text node
            // inserts new siblings into this same parent while we are looping.
            /** @var DOMNode $childNode */
            foreach ([...$node->childNodes] as $childNode) {
                if ($childNode->nodeName === '#text') {
                    array_push($includeTags, ...$this->splitTextNodesAtTags($childNode));
                }
            }
        }

        return $includeTags;
    }

    /**
     * Takes a text DOMNode and splits its text content at include tags
     * into multiple text nodes within the original parent.
     * Each tag ends up in a text node of its own, any text ahead of a tag is
     * placed in a new preceding text node, and the original node is left
     * holding only the text which follows the last tag.
     * Returns found PageIncludeTag references.
     * @return PageIncludeTag[]
     */
    protected function splitTextNodesAtTags(DOMNode $textNode): array
    {
        $includeTags = [];
        $text = $textNode->textContent;
        preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE);

        // Offsets reported by preg_match_all() are byte offsets, which is what
        // substr() and strlen() work with below.
        $currentOffset = 0;
        foreach ($matches[0] as $index => $fullTagMatch) {
            $tagOuterContent = $fullTagMatch[0];
            $tagInnerContent = $matches[1][$index][0];
            $tagStartOffset = $fullTagMatch[1];

            if ($currentOffset < $tagStartOffset) {
                $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset);
                $textNode->parentNode->insertBefore(new DOMText($previousText), $textNode);
            }

            $node = $textNode->parentNode->insertBefore(new DOMText($tagOuterContent), $textNode);
            $includeTags[] = new PageIncludeTag($tagInnerContent, $node);
            $currentOffset = $tagStartOffset + strlen($tagOuterContent);
        }

        // Only rewrite the original node when at least one tag was split out of it.
        if ($currentOffset > 0) {
            $textNode->textContent = substr($text, $currentOffset);
        }

        return $includeTags;
    }

    /**
     * Replace the given node with all those in $replacements.
     * Replacements which belong to another document are deep-copied into
     * the document of the node being replaced before they are inserted.
     * @param DOMNode[] $replacements
     */
    protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void
    {
        /** @var DOMDocument $targetDoc */
        $targetDoc = $toReplace->ownerDocument;

        foreach ($replacements as $replacement) {
            if ($replacement->ownerDocument !== $targetDoc) {
                // importNode() is used instead of adoptNode() since the latter
                // is only functional from PHP 8.3 onwards.
                $replacement = $targetDoc->importNode($replacement, true);
            }

            $toReplace->parentNode->insertBefore($replacement, $toReplace);
        }

        $toReplace->parentNode->removeChild($toReplace);
    }

    /**
     * Move a tag node to become a sibling of the given parent.
     * Will attempt to guess a position based upon the tag content within the parent:
     * the tag is placed before the parent when its content is found within the
     * first half of the parent's text, and after the parent otherwise.
     */
    protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void
    {
        $parentText = $parent->textContent;
        $tagPos = strpos($parentText, $tag->tagContent);
        $before = $tagPos < (strlen($parentText) / 2);

        if ($before) {
            $parent->parentNode->insertBefore($tag->domNode, $parent);
        } else {
            // A null nextSibling makes insertBefore() append to the end.
            $parent->parentNode->insertBefore($tag->domNode, $parent->nextSibling);
        }
    }

    /**
     * Splits the given $parentNode at the location of the $domNode within it.
     * A shallow copy of $parentNode, minus any "id" attribute, is inserted ahead
     * of it to receive the children which precede $domNode. The $domNode itself
     * is then moved to sit between the copy and the original $parentNode.
     * Both elements are registered for cleanup in case either is left empty.
     */
    protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void
    {
        $children = [...$parentNode->childNodes];
        $splitPos = array_search($domNode, $children, true);
        if ($splitPos === false) {
            $splitPos = count($children) - 1;
        }

        $parentClone = $parentNode->cloneNode();
        if (!($parentClone instanceof DOMElement)) {
            // cloneNode() is declared to return a DOMNode (or false); without
            // an element copy to move children into there is nothing to split.
            return;
        }

        $parentNode->parentNode->insertBefore($parentClone, $parentNode);
        // Avoid ending up with the same ID on two elements.
        $parentClone->removeAttribute('id');

        for ($i = 0; $i < $splitPos; $i++) {
            /** @var DOMNode $child */
            $child = $children[$i];
            $parentClone->appendChild($child);
        }

        $parentNode->parentNode->insertBefore($domNode, $parentNode);

        $this->toCleanup[] = $parentNode;
        $this->toCleanup[] = $parentClone;
    }

    /**
     * Get the parent paragraph of the given node, if existing.
     * The given node itself is checked first, followed by each of its ancestors.
     */
    protected function getParentParagraph(DOMNode $parent): ?DOMNode
    {
        do {
            if (strtolower($parent->nodeName) === 'p') {
                return $parent;
            }

            // parentNode is used instead of parentElement since the latter
            // property only exists from PHP 8.3 onwards.
            $parent = $parent->parentNode;
        } while ($parent !== null);

        return null;
    }

    /**
     * Cleanup after a parse operation.
     * Removes stranded elements we may have left during the parse.
     */
    protected function cleanup(): void
    {
        foreach ($this->toCleanup as $element) {
            // Merge adjacent text nodes and drop empty ones so that
            // an element holding only empty text counts as empty.
            $element->normalize();
            if ($element->parentNode && !$element->hasChildNodes()) {
                $element->parentNode->removeChild($element);
            }
        }

        // Start afresh so a later parse does not revisit these elements.
        $this->toCleanup = [];
    }
}