BookStack Code Mirror - bookstack/blob - app/Entities/Tools/PageIncludeParser.php

   1 <?php
   2
   3 namespace BookStack\Entities\Tools;
   4
   5 use BookStack\Util\HtmlDocument;
   6 use Closure;
   7 use DOMDocument;
   8 use DOMElement;
   9 use DOMNode;
  10 use DOMText;
  11
  12 class PageIncludeParser
  13 {
  14     protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/";
  15
  16     /**
  17      * Elements to clean up and remove if left empty after a parsing operation.
  18      * @var DOMElement[]
  19      */
  20     protected array $toCleanup = [];
  21
  22     public function __construct(
  23         protected string $pageHtml,
  24         protected Closure $pageContentForId,
  25     ) {
  26     }
  27
  28     /**
  29      * Parse out the include tags.
  30      */
  31     public function parse(): string
  32     {
  33         $doc = new HtmlDocument($this->pageHtml);
  34
  35         $tags = $this->locateAndIsolateIncludeTags($doc);
  36
  37         foreach ($tags as $tag) {
  38             $htmlContent = $this->pageContentForId->call($this, $tag->getPageId());
  39             $content = new PageIncludeContent($htmlContent, $tag);
  40
  41             if (!$content->isInline()) {
  42                 $parentP = $this->getParentParagraph($tag->domNode);
  43                 $isWithinParentP = $parentP === $tag->domNode->parentNode;
  44                 if ($parentP && $isWithinParentP) {
  45                     $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode);
  46                 } else if ($parentP) {
  47                     $this->moveTagNodeToBesideParent($tag, $parentP);
  48                 }
  49             }
  50
  51             $this->replaceNodeWithNodes($tag->domNode, $content->toDomNodes());
  52         }
  53
  54         $this->cleanup();
  55
  56         return $doc->getBodyInnerHtml();
  57     }
  58
  59     /**
  60      * Locate include tags within the given document, isolating them to their
  61      * own nodes in the DOM for future targeted manipulation.
  62      * @return PageIncludeTag[]
  63      */
  64     protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array
  65     {
  66         $includeHosts = $doc->queryXPath("//body//*[text()[contains(., '{{@')]]");
  67         $includeTags = [];
  68
  69         /** @var DOMNode $node */
  70         /** @var DOMNode $childNode */
  71         foreach ($includeHosts as $node) {
  72             foreach ($node->childNodes as $childNode) {
  73                 if ($childNode->nodeName === '#text') {
  74                     array_push($includeTags, ...$this->splitTextNodesAtTags($childNode));
  75                 }
  76             }
  77         }
  78
  79         return $includeTags;
  80     }
  81
  82     /**
  83      * Takes a text DOMNode and splits its text content at include tags
  84      * into multiple text nodes within the original parent.
  85      * Returns found PageIncludeTag references.
  86      * @return PageIncludeTag[]
  87      */
  88     protected function splitTextNodesAtTags(DOMNode $textNode): array
  89     {
  90         $includeTags = [];
  91         $text = $textNode->textContent;
  92         preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE);
  93
  94         $currentOffset = 0;
  95         foreach ($matches[0] as $index => $fullTagMatch) {
  96             $tagOuterContent = $fullTagMatch[0];
  97             $tagInnerContent = $matches[1][$index][0];
  98             $tagStartOffset = $fullTagMatch[1];
  99
 100             if ($currentOffset < $tagStartOffset) {
 101                 $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset);
 102                 $textNode->parentNode->insertBefore(new DOMText($previousText), $textNode);
 103             }
 104
 105             $node = $textNode->parentNode->insertBefore(new DOMText($tagOuterContent), $textNode);
 106             $includeTags[] = new PageIncludeTag($tagInnerContent, $node);
 107             $currentOffset = $tagStartOffset + strlen($tagOuterContent);
 108         }
 109
 110         if ($currentOffset > 0) {
 111             $textNode->textContent = substr($text, $currentOffset);
 112         }
 113
 114         return $includeTags;
 115     }
 116
 117     /**
 118      * Replace the given node with all those in $replacements
 119      * @param DOMNode[] $replacements
 120      */
 121     protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void
 122     {
 123         /** @var DOMDocument $targetDoc */
 124         $targetDoc = $toReplace->ownerDocument;
 125
 126         foreach ($replacements as $replacement) {
 127             if ($replacement->ownerDocument !== $targetDoc) {
 128                 $replacement = $targetDoc->adoptNode($replacement);
 129             }
 130
 131             $toReplace->parentNode->insertBefore($replacement, $toReplace);
 132         }
 133
 134         $toReplace->parentNode->removeChild($toReplace);
 135     }
 136
 137     /**
 138      * Move a tag node to become a sibling of the given parent.
 139      * Will attempt to guess a position based upon the tag content within the parent.
 140      */
 141     protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void
 142     {
 143         $parentText = $parent->textContent;
 144         $tagPos = strpos($parentText, $tag->tagContent);
 145         $before = $tagPos < (strlen($parentText) / 2);
 146
 147         if ($before) {
 148             $parent->parentNode->insertBefore($tag->domNode, $parent);
 149         } else {
 150             $parent->parentNode->insertBefore($tag->domNode, $parent->nextSibling);
 151         }
 152     }
 153
 154     /**
 155      * Splits the given $parentNode at the location of the $domNode within it.
 156      * Attempts replicate the original $parentNode, moving some of their parent
 157      * children in where needed, before adding the $domNode between.
 158      */
 159     protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void
 160     {
 161         $children = [...$parentNode->childNodes];
 162         $splitPos = array_search($domNode, $children, true);
 163         if ($splitPos === false) {
 164             $splitPos = count($children) - 1;
 165         }
 166
 167         $parentClone = $parentNode->cloneNode();
 168         $parentNode->parentNode->insertBefore($parentClone, $parentNode);
 169         $parentClone->removeAttribute('id');
 170
 171         /** @var DOMNode $child */
 172         for ($i = 0; $i < $splitPos; $i++) {
 173             $child = $children[$i];
 174             $parentClone->appendChild($child);
 175         }
 176
 177         $parentNode->parentNode->insertBefore($domNode, $parentNode);
 178
 179         $this->toCleanup[] = $parentNode;
 180         $this->toCleanup[] = $parentClone;
 181     }
 182
 183     /**
 184      * Get the parent paragraph of the given node, if existing.
 185      */
 186     protected function getParentParagraph(DOMNode $parent): ?DOMNode
 187     {
 188         do {
 189             if (strtolower($parent->nodeName) === 'p') {
 190                 return $parent;
 191             }
 192
 193             $parent = $parent->parentElement;
 194         } while ($parent !== null);
 195
 196         return null;
 197     }
 198
 199     /**
 200      * Cleanup after a parse operation.
 201      * Removes stranded elements we may have left during the parse.
 202      */
 203     protected function cleanup(): void
 204     {
 205         foreach ($this->toCleanup as $element) {
 206             $element->normalize();
 207             if ($element->parentNode && !$element->hasChildNodes()) {
 208                 $element->parentNode->removeChild($element);
 209             }
 210         }
 211     }
 212 }