X-Git-Url: http://source.bookstackapp.com/bookstack/blobdiff_plain/c88eb729a499197e8b3ab9d5019b4426a65d3d41..refs/pull/5721/head:/app/Entities/Tools/PageIncludeParser.php diff --git a/app/Entities/Tools/PageIncludeParser.php b/app/Entities/Tools/PageIncludeParser.php index 5ce847d6c..e0b89f158 100644 --- a/app/Entities/Tools/PageIncludeParser.php +++ b/app/Entities/Tools/PageIncludeParser.php @@ -13,39 +13,52 @@ class PageIncludeParser { protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/"; + /** + * Elements to clean up and remove if left empty after a parsing operation. + * @var DOMElement[] + */ + protected array $toCleanup = []; + + /** + * @param Closure(PageIncludeTag $tag): PageIncludeContent $pageContentForId + */ public function __construct( - protected string $pageHtml, + protected HtmlDocument $doc, protected Closure $pageContentForId, ) { } - public function parse(): string + /** + * Parse out the include tags. + * Returns the count of new content DOM nodes added to the document. 
+ */ + public function parse(): int { - $doc = new HtmlDocument($this->pageHtml); - - $tags = $this->locateAndIsolateIncludeTags($doc); - $topLevel = [...$doc->getBodyChildren()]; + $nodesAdded = 0; + $tags = $this->locateAndIsolateIncludeTags(); foreach ($tags as $tag) { - $htmlContent = $this->pageContentForId->call($this, $tag->getPageId()); - $content = new PageIncludeContent($htmlContent, $tag); + /** @var PageIncludeContent $content */ + $content = $this->pageContentForId->call($this, $tag); if (!$content->isInline()) { - $isParentTopLevel = in_array($tag->domNode->parentNode, $topLevel, true); - if ($isParentTopLevel) { + $parentP = $this->getParentParagraph($tag->domNode); + $isWithinParentP = $parentP === $tag->domNode->parentNode; + if ($parentP && $isWithinParentP) { $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode); - } else { - $this->promoteTagNodeToBody($tag, $doc->getBody()); + } else if ($parentP) { + $this->moveTagNodeToBesideParent($tag, $parentP); } } - $this->replaceNodeWithNodes($tag->domNode, $content->toDomNodes()); + $replacementNodes = $content->toDomNodes(); + $nodesAdded += count($replacementNodes); + $this->replaceNodeWithNodes($tag->domNode, $replacementNodes); } - // TODO Notes: May want to eventually parse through backwards, which should avoid issues - // in changes affecting the next tag, where tags may be in the same/adjacent nodes. + $this->cleanup(); - return $doc->getBodyInnerHtml(); + return $nodesAdded; } /** @@ -53,14 +66,14 @@ class PageIncludeParser * own nodes in the DOM for future targeted manipulation. 
* @return PageIncludeTag[] */ - protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array + protected function locateAndIsolateIncludeTags(): array { - $includeHosts = $doc->queryXPath("//body//*[contains(text(), '{{@')]"); + $includeHosts = $this->doc->queryXPath("//*[text()[contains(., '{{@')]]"); $includeTags = []; /** @var DOMNode $node */ - /** @var DOMNode $childNode */ foreach ($includeHosts as $node) { + /** @var DOMNode $childNode */ foreach ($node->childNodes as $childNode) { if ($childNode->nodeName === '#text') { array_push($includeTags, ...$this->splitTextNodesAtTags($childNode)); @@ -91,10 +104,10 @@ class PageIncludeParser if ($currentOffset < $tagStartOffset) { $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset); - $textNode->parentNode->insertBefore(new DOMText($previousText), $textNode); + $textNode->parentNode->insertBefore($this->doc->createTextNode($previousText), $textNode); } - $node = $textNode->parentNode->insertBefore(new DOMText($tagOuterContent), $textNode); + $node = $textNode->parentNode->insertBefore($this->doc->createTextNode($tagOuterContent), $textNode); $includeTags[] = new PageIncludeTag($tagInnerContent, $node); $currentOffset = $tagStartOffset + strlen($tagOuterContent); } @@ -107,6 +120,7 @@ class PageIncludeParser } /** + * Replace the given node with all those in $replacements * @param DOMNode[] $replacements */ protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void @@ -116,7 +130,7 @@ class PageIncludeParser foreach ($replacements as $replacement) { if ($replacement->ownerDocument !== $targetDoc) { - $replacement = $targetDoc->adoptNode($replacement); + $replacement = $targetDoc->importNode($replacement, true); } $toReplace->parentNode->insertBefore($replacement, $toReplace); @@ -125,51 +139,82 @@ class PageIncludeParser $toReplace->parentNode->removeChild($toReplace); } - protected function promoteTagNodeToBody(PageIncludeTag 
$tag, DOMNode $body): void + /** + * Move a tag node to become a sibling of the given parent. + * Will attempt to guess a position based upon the tag content within the parent. + */ + protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void { - /** @var DOMNode $topParent */ - $topParent = $tag->domNode->parentNode; - while ($topParent->parentNode !== $body) { - $topParent = $topParent->parentNode; - } - - $parentText = $topParent->textContent; + $parentText = $parent->textContent; $tagPos = strpos($parentText, $tag->tagContent); $before = $tagPos < (strlen($parentText) / 2); + $this->toCleanup[] = $tag->domNode->parentNode; if ($before) { - $body->insertBefore($tag->domNode, $topParent); + $parent->parentNode->insertBefore($tag->domNode, $parent); } else { - $body->insertBefore($tag->domNode, $topParent->nextSibling); + $parent->parentNode->insertBefore($tag->domNode, $parent->nextSibling); } } + /** + * Splits the given $parentNode at the location of the $domNode within it. + * Attempts to replicate the original $parentNode, moving some of the parent's + * children in where needed, before adding the $domNode between. 
+ */ protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void { $children = [...$parentNode->childNodes]; - $splitPos = array_search($domNode, $children, true) ?: count($children); + $splitPos = array_search($domNode, $children, true); + if ($splitPos === false) { + $splitPos = count($children) - 1; + } + $parentClone = $parentNode->cloneNode(); + $parentNode->parentNode->insertBefore($parentClone, $parentNode); $parentClone->removeAttribute('id'); - /** @var DOMNode $child */ for ($i = 0; $i < $splitPos; $i++) { - $child = $children[0]; + /** @var DOMNode $child */ + $child = $children[$i]; $parentClone->appendChild($child); } - if ($parentClone->hasChildNodes()) { - $parentNode->parentNode->insertBefore($parentClone, $parentNode); - } - $parentNode->parentNode->insertBefore($domNode, $parentNode); - $parentClone->normalize(); - $parentNode->normalize(); - if (!$parentNode->hasChildNodes()) { - $parentNode->remove(); - } - if (!$parentClone->hasChildNodes()) { - $parentClone->remove(); + $this->toCleanup[] = $parentNode; + $this->toCleanup[] = $parentClone; + } + + /** + * Get the parent paragraph of the given node, if existing. + */ + protected function getParentParagraph(DOMNode $parent): ?DOMNode + { + do { + if (strtolower($parent->nodeName) === 'p') { + return $parent; + } + + $parent = $parent->parentNode; + } while ($parent !== null); + + return null; + } + + /** + * Cleanup after a parse operation. + * Removes stranded elements we may have left during the parse. + */ + protected function cleanup(): void + { + foreach ($this->toCleanup as $element) { + $element->normalize(); + while ($element->parentNode && !$element->hasChildNodes()) { + $parent = $element->parentNode; + $parent->removeChild($element); + $element = $parent; + } } } }