use BookStack\Entities\Tools\Markdown\HtmlToMarkdown;
use BookStack\Uploads\ImageService;
use BookStack\Util\CspService;
-use DOMDocument;
+use BookStack\Util\HtmlDocument;
use DOMElement;
-use DOMXPath;
use Exception;
use Throwable;
protected function htmlToPdf(string $html): string
{
$html = $this->containHtml($html);
- $html = $this->replaceIframesWithLinks($html);
- $html = $this->openDetailElements($html);
+ $doc = new HtmlDocument();
+ $doc->loadCompleteHtml($html);
- return $this->pdfGenerator->fromHtml($html);
+ $this->replaceIframesWithLinks($doc);
+ $this->openDetailElements($doc);
+ $cleanedHtml = $doc->getHtml();
+
+ return $this->pdfGenerator->fromHtml($cleanedHtml);
}
/**
* Within the given HTML content, Open any detail blocks.
*/
- protected function openDetailElements(string $html): string
+ protected function openDetailElements(HtmlDocument $doc): void
{
- libxml_use_internal_errors(true);
-
- $doc = new DOMDocument();
- $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
- $xPath = new DOMXPath($doc);
-
- $details = $xPath->query('//details');
+ $details = $doc->queryXPath('//details');
/** @var DOMElement $detail */
foreach ($details as $detail) {
$detail->setAttribute('open', 'open');
}
-
- return $doc->saveHTML();
}
/**
- * Within the given HTML content, replace any iframe elements
+ * Within the given HTML document, replace any iframe elements
* with anchor links within paragraph blocks.
*/
- protected function replaceIframesWithLinks(string $html): string
+ protected function replaceIframesWithLinks(HtmlDocument $doc): void
{
- libxml_use_internal_errors(true);
-
- $doc = new DOMDocument();
- $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
- $xPath = new DOMXPath($doc);
+ $iframes = $doc->queryXPath('//iframe');
- $iframes = $xPath->query('//iframe');
/** @var DOMElement $iframe */
foreach ($iframes as $iframe) {
$link = $iframe->getAttribute('src');
$paragraph->appendChild($anchor);
$iframe->parentNode->replaceChild($paragraph, $iframe);
}
-
- return $doc->saveHTML();
}
/**
use BookStack\Uploads\ImageRepo;
use BookStack\Uploads\ImageService;
use BookStack\Util\HtmlContentFilter;
-use DOMDocument;
+use BookStack\Util\HtmlDocument;
use DOMElement;
use DOMNode;
use DOMNodeList;
-use DOMXPath;
use Illuminate\Support\Str;
class PageContent
return $htmlText;
}
- $doc = $this->loadDocumentFromHtml($htmlText);
- $container = $doc->documentElement;
- $body = $container->childNodes->item(0);
- $childNodes = $body->childNodes;
- $xPath = new DOMXPath($doc);
+ $doc = new HtmlDocument($htmlText);
// Get all img elements with image data blobs
- $imageNodes = $xPath->query('//img[contains(@src, \'data:image\')]');
+ $imageNodes = $doc->queryXPath('//img[contains(@src, \'data:image\')]');
foreach ($imageNodes as $imageNode) {
$imageSrc = $imageNode->getAttribute('src');
$newUrl = $this->base64ImageUriToUploadedImageUrl($imageSrc);
$imageNode->setAttribute('src', $newUrl);
}
- // Generate inner html as a string
- $html = '';
- foreach ($childNodes as $childNode) {
- $html .= $doc->saveHTML($childNode);
- }
-
- return $html;
+ return $doc->getBodyInnerHtml();
}
/**
return $htmlText;
}
- $doc = $this->loadDocumentFromHtml($htmlText);
- $container = $doc->documentElement;
- $body = $container->childNodes->item(0);
- $childNodes = $body->childNodes;
- $xPath = new DOMXPath($doc);
+ $doc = new HtmlDocument($htmlText);
// Map to hold used ID references
$idMap = [];
// Map to hold changing ID references
$changeMap = [];
- $this->updateIdsRecursively($body, 0, $idMap, $changeMap);
- $this->updateLinks($xPath, $changeMap);
+ $this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap);
+ $this->updateLinks($doc, $changeMap);
- // Generate inner html as a string
- $html = '';
- foreach ($childNodes as $childNode) {
- $html .= $doc->saveHTML($childNode);
- }
-
- // Perform required string-level tweaks
+ // Generate inner html as a string & perform required string-level tweaks
+ $html = $doc->getBodyInnerHtml();
$html = str_replace(' ', ' ', $html);
return $html;
* Update the all links in the given xpath to apply requires changes within the
* given $changeMap array.
*/
- protected function updateLinks(DOMXPath $xpath, array $changeMap): void
+ protected function updateLinks(HtmlDocument $doc, array $changeMap): void
{
if (empty($changeMap)) {
return;
}
- $links = $xpath->query('//body//*//*[@href]');
+ $links = $doc->queryXPath('//body//*//*[@href]');
/** @var DOMElement $domElem */
foreach ($links as $domElem) {
$href = ltrim($domElem->getAttribute('href'), '#');
return [];
}
- $doc = $this->loadDocumentFromHtml($htmlContent);
- $xPath = new DOMXPath($doc);
- $headers = $xPath->query('//h1|//h2|//h3|//h4|//h5|//h6');
+ $doc = new HtmlDocument($htmlContent);
+ $headers = $doc->queryXPath('//h1|//h2|//h3|//h4|//h5|//h6');
- return $headers ? $this->headerNodesToLevelList($headers) : [];
+ return $headers->count() === 0 ? [] : $this->headerNodesToLevelList($headers);
}
/**
protected function fetchSectionOfPage(Page $page, string $sectionId): string
{
$topLevelTags = ['table', 'ul', 'ol', 'pre'];
- $doc = $this->loadDocumentFromHtml($page->html);
+ $doc = new HtmlDocument($page->html);
// Search included content for the id given and blank out if not exists.
$matchingElem = $doc->getElementById($sectionId);
// Otherwise replace the content with the found content
// Checks if the top-level wrapper should be included by matching on tag types
- $innerContent = '';
$isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags);
if ($isTopLevel) {
- $innerContent .= $doc->saveHTML($matchingElem);
- } else {
- foreach ($matchingElem->childNodes as $childNode) {
- $innerContent .= $doc->saveHTML($childNode);
- }
+ return $doc->getNodeOuterHtml($matchingElem);
}
- libxml_clear_errors();
-
- return $innerContent;
- }
-
- /**
- * Create and load a DOMDocument from the given html content.
- */
- protected function loadDocumentFromHtml(string $html): DOMDocument
- {
- libxml_use_internal_errors(true);
- $doc = new DOMDocument();
- $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
- $doc->loadHTML($html);
- return $doc;
+ return $doc->getNodeInnerHtml($matchingElem);
}
}
use BookStack\References\ModelResolvers\CrossLinkModelResolver;
use BookStack\References\ModelResolvers\PageLinkModelResolver;
use BookStack\References\ModelResolvers\PagePermalinkModelResolver;
-use DOMDocument;
-use DOMXPath;
+use BookStack\Util\HtmlDocument;
class CrossLinkParser
{
{
$links = [];
- $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
- libxml_use_internal_errors(true);
- $doc = new DOMDocument();
- $doc->loadHTML($html);
-
- $xPath = new DOMXPath($doc);
- $anchors = $xPath->query('//a[@href]');
+ $doc = new HtmlDocument($html);
+ $anchors = $doc->queryXPath('//a[@href]');
/** @var \DOMElement $anchor */
foreach ($anchors as $anchor) {
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Entities\Repos\RevisionRepo;
-use DOMDocument;
-use DOMXPath;
+use BookStack\Util\HtmlDocument;
class ReferenceUpdater
{
- protected ReferenceFetcher $referenceFetcher;
- protected RevisionRepo $revisionRepo;
-
- public function __construct(ReferenceFetcher $referenceFetcher, RevisionRepo $revisionRepo)
- {
- $this->referenceFetcher = $referenceFetcher;
- $this->revisionRepo = $revisionRepo;
+ public function __construct(
+ protected ReferenceFetcher $referenceFetcher,
+ protected RevisionRepo $revisionRepo
+ ) {
}
public function updateEntityPageReferences(Entity $entity, string $oldLink)
return $html;
}
- $html = '<body>' . $html . '</body>';
- libxml_use_internal_errors(true);
- $doc = new DOMDocument();
- $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
-
- $xPath = new DOMXPath($doc);
- $anchors = $xPath->query('//a[@href]');
+ $doc = new HtmlDocument($html);
+ $anchors = $doc->queryXPath('//a[@href]');
/** @var \DOMElement $anchor */
foreach ($anchors as $anchor) {
$anchor->setAttribute('href', $updated);
}
- $html = '';
- $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
- foreach ($topElems as $child) {
- $html .= $doc->saveHTML($child);
- }
-
- return $html;
+ return $doc->getBodyInnerHtml();
}
}
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
-use DOMDocument;
+use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Support\Collection;
'h6' => 1.5,
];
- $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
$html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
+ $doc = new HtmlDocument($html);
- libxml_use_internal_errors(true);
- $doc = new DOMDocument();
- $doc->loadHTML($html);
-
- $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
/** @var DOMNode $child */
- foreach ($topElems as $child) {
+ foreach ($doc->getBodyChildren() as $child) {
$nodeName = $child->nodeName;
$termCounts = $this->textToTermCountMap(trim($child->textContent));
foreach ($termCounts as $term => $count) {
*/
protected function generateTermScoreMapFromTags(array $tags): array
{
- $scoreMap = [];
$names = [];
$values = [];
namespace BookStack\Util;
use DOMAttr;
-use DOMDocument;
use DOMElement;
use DOMNodeList;
-use DOMXPath;
class HtmlContentFilter
{
return $html;
}
- $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
- libxml_use_internal_errors(true);
- $doc = new DOMDocument();
- $doc->loadHTML($html);
- $xPath = new DOMXPath($doc);
+ $doc = new HtmlDocument($html);
// Remove standard script tags
- $scriptElems = $xPath->query('//script');
+ $scriptElems = $doc->queryXPath('//script');
static::removeNodes($scriptElems);
// Remove clickable links to JavaScript URI
- $badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']');
+ $badLinks = $doc->queryXPath('//*[' . static::xpathContains('@href', 'javascript:') . ']');
static::removeNodes($badLinks);
// Remove forms with calls to JavaScript URI
- $badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
+ $badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
static::removeNodes($badForms);
// Remove meta tag to prevent external redirects
- $metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']');
+ $metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
static::removeNodes($metaTags);
// Remove data or JavaScript iFrames
- $badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
+ $badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
static::removeNodes($badIframes);
// Remove attributes, within svg children, hiding JavaScript or data uris.
// A bunch of svg element and attribute combinations expose xss possibilities.
// For example, SVG animate tag can exploit javascript in values.
- $badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
+ $badValuesAttrs = $doc->queryXPath('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
static::removeAttributes($badValuesAttrs);
// Remove elements with a xlink:href attribute
// Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
- $xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]');
+ $xlinkHrefAttributes = $doc->queryXPath('//@*[contains(name(), \'xlink:href\')]');
static::removeAttributes($xlinkHrefAttributes);
// Remove 'on*' attributes
- $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
+ $onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
static::removeAttributes($onAttributes);
- $html = '';
- $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
- foreach ($topElems as $child) {
- $html .= $doc->saveHTML($child);
- }
-
- return $html;
+ return $doc->getBodyInnerHtml();
}
/**
--- /dev/null
+<?php
+
+namespace BookStack\Util;
+
+use DOMDocument;
+use DOMElement;
+use DOMNode;
+use DOMNodeList;
+use DOMXPath;
+
+/**
+ * HtmlDocument is a thin wrapper around DOMDocument built
+ * specifically for loading, querying and generating HTML content.
+ *
+ * Content is always loaded with an explicit UTF-8 encoding hint. Parse errors
+ * raised by libxml are buffered internally (not emitted as PHP warnings) and
+ * the buffer is cleared after every load so it cannot grow without bound.
+ */
+class HtmlDocument
+{
+    protected DOMDocument $document;
+    protected ?DOMXPath $xpath = null;
+    protected int $loadOptions;
+
+    /**
+     * @param string $partialHtml Optional partial (body-level) HTML to load immediately.
+     * @param int    $loadOptions Bitmask of LIBXML_* options passed to every load call.
+     */
+    public function __construct(string $partialHtml = '', int $loadOptions = 0)
+    {
+        // Note: this toggles a process-wide libxml setting and is not restored.
+        libxml_use_internal_errors(true);
+        $this->document = new DOMDocument();
+        $this->loadOptions = $loadOptions;
+
+        // Compare strictly against the empty string: a plain truthiness check
+        // would wrongly skip valid content such as the string "0".
+        if ($partialHtml !== '') {
+            $this->loadPartialHtml($partialHtml);
+        }
+    }
+
+    /**
+     * Load some HTML content that's part of a document (e.g. body content)
+     * into the current document. The content is wrapped in a body element.
+     */
+    public function loadPartialHtml(string $html): void
+    {
+        $this->loadHtmlString('<?xml encoding="utf-8" ?><body>' . $html . '</body>');
+    }
+
+    /**
+     * Load a complete page of HTML content into the document.
+     */
+    public function loadCompleteHtml(string $html): void
+    {
+        $this->loadHtmlString('<?xml encoding="utf-8" ?>' . $html);
+    }
+
+    /**
+     * Parse the given, already-prefixed, HTML string into the wrapped document.
+     */
+    protected function loadHtmlString(string $prefixedHtml): void
+    {
+        $this->document->loadHTML($prefixedHtml, $this->loadOptions);
+
+        // Drop any cached XPath helper so the next query is built
+        // against the newly loaded content.
+        $this->xpath = null;
+
+        // Errors are buffered due to libxml_use_internal_errors(true);
+        // release them so repeated loads do not accumulate memory.
+        libxml_clear_errors();
+    }
+
+    /**
+     * Start an XPath query on the current document.
+     *
+     * @throws \InvalidArgumentException If the expression fails to execute.
+     */
+    public function queryXPath(string $expression): DOMNodeList
+    {
+        $this->xpath ??= new DOMXPath($this->document);
+
+        $result = $this->xpath->query($expression);
+        if ($result === false) {
+            throw new \InvalidArgumentException("XPath query for expression [$expression] failed to execute");
+        }
+
+        return $result;
+    }
+
+    /**
+     * Create a new DOMElement instance within the document.
+     * The element is owned by the document but not attached to the tree.
+     *
+     * @throws \InvalidArgumentException If the element could not be created.
+     */
+    public function createElement(string $localName, string $value = ''): DOMElement
+    {
+        $element = $this->document->createElement($localName, $value);
+
+        if ($element === false) {
+            throw new \InvalidArgumentException("Failed to create element of name [$localName] and value [$value]");
+        }
+
+        return $element;
+    }
+
+    /**
+     * Get an element within the document of the given ID.
+     * Returns null when no element carries that ID.
+     */
+    public function getElementById(string $elementId): ?DOMElement
+    {
+        return $this->document->getElementById($elementId);
+    }
+
+    /**
+     * Get the DOMNode that represents the HTML body.
+     *
+     * @throws \RuntimeException If the document has no body, such as when no content has been loaded.
+     */
+    public function getBody(): DOMNode
+    {
+        $body = $this->document->getElementsByTagName('body')->item(0);
+
+        if ($body === null) {
+            throw new \RuntimeException('The HTML document has no body element; Has any content been loaded?');
+        }
+
+        return $body;
+    }
+
+    /**
+     * Get the nodes that are a direct child of the body.
+     * This is usually all the content nodes if loaded partially.
+     */
+    public function getBodyChildren(): DOMNodeList
+    {
+        return $this->getBody()->childNodes;
+    }
+
+    /**
+     * Get the inner HTML content of the body.
+     * This is usually all the content if loaded partially.
+     */
+    public function getBodyInnerHtml(): string
+    {
+        return $this->getNodeInnerHtml($this->getBody());
+    }
+
+    /**
+     * Get the HTML content of the whole document.
+     */
+    public function getHtml(): string
+    {
+        return $this->document->saveHTML();
+    }
+
+    /**
+     * Get the inner HTML for the given node.
+     * Built by serializing each direct child of the node in order.
+     */
+    public function getNodeInnerHtml(DOMNode $node): string
+    {
+        $html = '';
+
+        foreach ($node->childNodes as $childNode) {
+            $html .= $this->document->saveHTML($childNode);
+        }
+
+        return $html;
+    }
+
+    /**
+     * Get the outer HTML for the given node.
+     */
+    public function getNodeOuterHtml(DOMNode $node): string
+    {
+        return $this->document->saveHTML($node);
+    }
+}
namespace BookStack\Util;
-use DOMDocument;
use DOMElement;
use DOMNodeList;
-use DOMXPath;
class HtmlNonceApplicator
{
- protected static $placeholder = '[CSP_NONCE_VALUE]';
+ protected static string $placeholder = '[CSP_NONCE_VALUE]';
/**
* Prepare the given HTML content with nonce attributes including a placeholder
return $html;
}
- $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
- libxml_use_internal_errors(true);
- $doc = new DOMDocument();
- $doc->loadHTML($html, LIBXML_SCHEMA_CREATE);
- $xPath = new DOMXPath($doc);
+ // LIBXML_SCHEMA_CREATE was found to be required here otherwise
+ // the PHP DOMDocument handling will attempt to format/close
+ // HTML tags within scripts and therefore change JS content.
+ $doc = new HtmlDocument($html, LIBXML_SCHEMA_CREATE);
// Apply to scripts
- $scriptElems = $xPath->query('//script');
+ $scriptElems = $doc->queryXPath('//script');
static::addNonceAttributes($scriptElems, static::$placeholder);
// Apply to styles
- $styleElems = $xPath->query('//style');
+ $styleElems = $doc->queryXPath('//style');
static::addNonceAttributes($styleElems, static::$placeholder);
- $returnHtml = '';
- $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
- foreach ($topElems as $child) {
- $content = $doc->saveHTML($child);
- $returnHtml .= $content;
- }
-
- return $returnHtml;
+ return $doc->getBodyInnerHtml();
}
/**