BookStack Code Mirror - bookstack/blobdiff - app/Entities/Tools/PageContent.php

index 45bfe8fa1740bf0c9c33cbd2432967a74cd73547..b1c750adbdd6a3a3645c75e91b0f44c9e940bc6c 100644 (file)

--- a/app/Entities/Tools/PageContent.php

+++ b/app/Entities/Tools/PageContent.php

@@ -12,6 +12,8 @@ use BookStack\Uploads\ImageRepo;

use BookStack\Uploads\ImageService;

use BookStack\Util\HtmlContentFilter;

use DOMDocument;

+use DOMElement;

+use DOMNode;

use DOMNodeList;

use DOMXPath;

use Illuminate\Support\Str;

@@ -107,15 +109,35 @@ class PageContent

/**

* Convert all inline base64 content to uploaded image files.

+ * Regex is used only to locate the start of each data-uri definition;

+ * the remainder of the data uri is then read by manually looping over the content.

+ * Attempting to capture the whole data uri using regex can cause PHP

+ * PCRE limits to be hit with larger, multi-MB files.

*/

protected function extractBase64ImagesFromMarkdown(string $markdown)

{

$matches = [];

- preg_match_all('/!\[.*?]\(.*?(data:image\/.*?)[)"\s]/', $markdown, $matches);

+ $contentLength = strlen($markdown);

+ $replacements = [];

+ preg_match_all('/!\[.*?]\(.*?(data:image\/.{1,6};base64,)/', $markdown, $matches, PREG_OFFSET_CAPTURE);

+ foreach ($matches[1] as $base64MatchPair) {

+ [$dataUri, $index] = $base64MatchPair;

+ for ($i = strlen($dataUri) + $index; $i < $contentLength; $i++) {

+ $char = $markdown[$i];

+ if ($char === ')' || $char === ' ' || $char === "\n" || $char === '"') {

+ break;

+ }

+ $dataUri .= $char;

+ }

+ $newUrl = $this->base64ImageUriToUploadedImageUrl($dataUri);

+ $replacements[] = [$dataUri, $newUrl];

+ }

- foreach ($matches[1] as $base64Match) {

- $newUrl = $this->base64ImageUriToUploadedImageUrl($base64Match);

- $markdown = str_replace($base64Match, $newUrl, $markdown);

+ foreach ($replacements as [$dataUri, $newUrl]) {

+ $markdown = str_replace($dataUri, $newUrl, $markdown);

}

return $markdown;

@@ -156,7 +178,7 @@ class PageContent

/**

* Parse a base64 image URI into the data and extension.

- * @return array{extension: array, data: string}

+ * @return array{extension: string, data: string}

*/

protected function parseBase64ImageUri(string $uri): array

{

@@ -193,6 +215,15 @@ class PageContent

}

+ // Set ids on nested header nodes

+ $nestedHeaders = $xPath->query('//body//*//h1|//body//*//h2|//body//*//h3|//body//*//h4|//body//*//h5|//body//*//h6');

+ foreach ($nestedHeaders as $nestedHeader) {

+ [$oldId, $newId] = $this->setUniqueId($nestedHeader, $idMap);

+ if ($newId && $newId !== $oldId) {

+ $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);

+ }

+ }

// Ensure no duplicate ids within child items

$idElems = $xPath->query('//body//*//*[@id]');

foreach ($idElems as $domElem) {

@@ -208,6 +239,9 @@ class PageContent

$html .= $doc->saveHTML($childNode);

}

+ // Perform required string-level tweaks

+ $html = str_replace("\xc2\xa0", '&#160;', $html);

return $html;

}

@@ -228,9 +262,9 @@ class PageContent

* A map of existing IDs should be passed in to check for current existence.

* Returns a pair of strings in the format [old_id, new_id].

*/

- protected function setUniqueId(\DOMNode $element, array &$idMap): array

+ protected function setUniqueId(DOMNode $element, array &$idMap): array

{

- if (get_class($element) !== 'DOMElement') {

+ if (!$element instanceof DOMElement) {

return ['', ''];

}

@@ -242,7 +276,7 @@ class PageContent

return [$existingId, $existingId];

}

- // Create an unique id for the element

+ // Create a unique id for the element

// Uses the content as a basis to ensure output is the same every time

// the same content is passed through.

$contentId = 'bkmrk-' . mb_substr(strtolower(preg_replace('/\s+/', '-', trim($element->nodeValue))), 0, 20);

@@ -312,7 +346,7 @@ class PageContent

protected function headerNodesToLevelList(DOMNodeList $nodeList): array

{

- $tree = collect($nodeList)->map(function ($header) {

+ $tree = collect($nodeList)->map(function (DOMElement $header) {

$text = trim(str_replace("\xc2\xa0", '', $header->nodeValue));

$text = mb_substr($text, 0, 100);