Prevented PCRE limit issues in markdown base64 extraction

author Dan Brown <redacted>

Sun, 6 Feb 2022 07:51:38 +0000 (07:51 +0000)

committer Dan Brown <redacted>

Sun, 6 Feb 2022 07:51:38 +0000 (07:51 +0000)
author Dan Brown <redacted>
Sun, 6 Feb 2022 07:51:38 +0000 (07:51 +0000)
committer Dan Brown <redacted>
Sun, 6 Feb 2022 07:51:38 +0000 (07:51 +0000)
diff --git a/app/Entities/Tools/PageContent.php b/app/Entities/Tools/PageContent.php

index b95131fce2e25561c22131d3d81e3dc4d30a8aac..dbb62021cae18c18e5233bb9a04e27da0874c4e3 100644 (file)
--- a/app/Entities/Tools/PageContent.php
+++ b/app/Entities/Tools/PageContent.php
@@ -109,15 +109,35 @@ class PageContent
  
      /**
       * Convert all inline base64 content to uploaded image files.
+     * Regex is used to locate the start of data-uri definitions then
+     * manual looping over content is done to parse the whole data uri.
+     * Attempting to capture the whole data uri using regex can cause PHP
+     * PCRE limits to be hit with larger, multi-MB, files.
       */
      protected function extractBase64ImagesFromMarkdown(string $markdown)
      {
          $matches = [];
-        preg_match_all('/!\[.*?]\(.*?(data:image\/.*?)[)"\s]/', $markdown, $matches);
+        $contentLength = strlen($markdown);
+        $replacements = [];
+        preg_match_all('/!\[.*?]\(.*?(data:image\/.{1,6};base64,)/', $markdown, $matches, PREG_OFFSET_CAPTURE);
+
+        foreach ($matches[1] as $base64MatchPair) {
+            [$dataUri, $index] = $base64MatchPair;
+
+            for ($i = strlen($dataUri) + $index; $i < $contentLength; $i++) {
+                $char = $markdown[$i];
+                if ($char === ')' || $char === ' ' || $char === "\n" || $char === '"') {
+                    break;
+                }
+                $dataUri .= $char;
+            }
+
+            $newUrl = $this->base64ImageUriToUploadedImageUrl($dataUri);
+            $replacements[] = [$dataUri, $newUrl];
+        }
  
-        foreach ($matches[1] as $base64Match) {
-            $newUrl = $this->base64ImageUriToUploadedImageUrl($base64Match);
-            $markdown = str_replace($base64Match, $newUrl, $markdown);
+        foreach ($replacements as [$dataUri, $newUrl]) {
+            $markdown = str_replace($dataUri, $newUrl, $markdown);
          }
  
          return $markdown;
diff --git a/tests/Entity/PageContentTest.php b/tests/Entity/PageContentTest.php

index 9524186c8f1cdc64f0b9cd43f215515948cf98b1..cf1ecd84d44782d48ca5e7958edc210598ef4002 100644 (file)
--- a/tests/Entity/PageContentTest.php
+++ b/tests/Entity/PageContentTest.php
@@ -657,6 +657,39 @@ class PageContentTest extends TestCase
          $this->deleteImage($imagePath);
      }
  
+    public function test_markdown_base64_extract_not_limited_by_pcre_limits()
+    {
+        $pcreBacktrackLimit = ini_get("pcre.backtrack_limit");
+        $pcreRecursionLimit = ini_get("pcre.recursion_limit");
+
+        $this->asEditor();
+        $page = Page::query()->first();
+
+        ini_set("pcre.backtrack_limit", "500");
+        ini_set("pcre.recursion_limit", "500");
+
+        $content = str_repeat('a', 5000);
+        $base64Content = base64_encode($content);
+
+        $this->put($page->getUrl(), [
+            'name'     => $page->name, 'summary' => '',
+            'markdown' => 'test ![test](data:image/jpeg;base64,' . $base64Content . ') ![test](data:image/jpeg;base64,' . $base64Content . ')',
+        ]);
+
+        $page->refresh();
+        $this->assertStringMatchesFormat('<p%A>test <img src="http://localhost/uploads/images/gallery/%A.jpeg" alt="test"> <img src="http://localhost/uploads/images/gallery/%A.jpeg" alt="test">%A</p>%A', $page->html);
+
+        $matches = [];
+        preg_match('/src="https:\/\/p.rizon.top:443\/http\/localhost(.*?)"/', $page->html, $matches);
+        $imagePath = $matches[1];
+        $imageFile = public_path($imagePath);
+        $this->assertEquals($content, file_get_contents($imageFile));
+
+        $this->deleteImage($imagePath);
+        ini_set("pcre.backtrack_limit", $pcreBacktrackLimit);
+        ini_set("pcre.recursion_limit", $pcreRecursionLimit);
+    }
+
      public function test_base64_images_within_markdown_blanked_if_not_supported_extension_for_extract()
      {
          $this->asEditor();
author	Dan Brown <redacted>
	Sun, 6 Feb 2022 07:51:38 +0000 (07:51 +0000)
committer	Dan Brown <redacted>
	Sun, 6 Feb 2022 07:51:38 +0000 (07:51 +0000)
app/Entities/Tools/PageContent.php		patch | blob | history
tests/Entity/PageContentTest.php		patch | blob | history