BookStack Code Mirror - bookstack/commitdiff
Refactored the code for ExportService to use DomDocument. fix/video-export 908/head
author abijeet <redacted>
Sun, 27 Jan 2019 10:23:51 +0000 (15:53 +0530)
committer abijeet <redacted>
Sun, 27 Jan 2019 10:23:51 +0000 (15:53 +0530)
Fixes #883

Also handling more scenarios.

app/Entities/ExportService.php
app/Exceptions/ExportException.php [new file with mode: 0644]
app/Http/Controllers/PageController.php
resources/lang/en/entities.php
resources/lang/en/errors.php
tests/Entity/ExportTest.php

index d07c093f1e32f5a1dbd9e41c8b97036be5556fb4..1ad360b86c49269cbdeaf0add842b0e5a307666f 100644 (file)
@@ -2,15 +2,14 @@
 
 use BookStack\Entities\Repos\EntityRepo;
 use BookStack\Uploads\ImageService;
+use BookStack\Exceptions\ExportException;
 
 class ExportService
 {
-
-    const VIDEO_REGEX = "/\<video.*?\>\<source.*?\ src\=(\")(.*?)(\").*?><\/video>/";
-    const YOUTUBE_REGEX = "/\<iframe.*src\=(\'|\")(\/\/www\.youtube\.com.*?)(\'|\").*?><\/iframe>/";
-    const VIMEO_REGEX = "/\<iframe.*src\=(\'|\")(\/\/player\.vimeo\.com.*?)(\'|\").*?><\/iframe>/";
-    const GOOGLE_MAP_REGEX = "/\<iframe.*src\=(\'|\")(\/\/maps\.google\.com.*?)(\'|\").*?><\/iframe>/";
-    const DAILYMOTION_REGEX = "/\<iframe.*src\=(\'|\")(\/\/www\.dailymotion\.com.*?)(\'|\").*?><\/iframe>/";
+    protected $contentMatching = [
+        'video' => ["www.youtube.com", "player.vimeo.com", "www.dailymotion.com"],
+        'map' => ['maps.google.com']
+    ];
 
     protected $entityRepo;
     protected $imageService;
@@ -80,16 +79,17 @@ class ExportService
     /**
      * Convert a page to a PDF file.
      * @param Page $page
+     * @param bool $isTesting Forwarded to htmlToPdf(); when true the contained HTML is returned instead of PDF output.
      * @return mixed|string
      * @throws \Throwable
      */
-    public function pageToPdf(Page $page)
+    public function pageToPdf(Page $page, bool $isTesting = false)
     {
         $this->entityRepo->renderPage($page);
         $html = view('pages/pdf', [
             'page' => $page
         ])->render();
-        return $this->htmlToPdf($html);
+        return $this->htmlToPdf($html, $isTesting);
     }
 
     /**
@@ -130,12 +130,16 @@ class ExportService
     /**
      * Convert normal webpage HTML to a PDF.
      * @param $html
+     * @param $isTesting
      * @return string
      * @throws \Exception
      */
-    protected function htmlToPdf($html)
+    protected function htmlToPdf($html, $isTesting = false)
     {
         $containedHtml = $this->containHtml($html, true);
+        if ($isTesting) {
+            return $containedHtml;
+        }
         $useWKHTML = config('snappy.pdf.binary') !== false;
         if ($useWKHTML) {
             $pdf = \SnappyPDF::loadHTML($containedHtml);
@@ -151,56 +155,62 @@ class ExportService
      * @param $htmlContent
      * @param bool $isPDF
      * @return mixed|string
-     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
+     * @throws \BookStack\Exceptions\ExportException
      */
-    protected function containHtml($htmlContent, $isPDF = false)
+    protected function containHtml(string $htmlContent, bool $isPDF = false) : string
     {
-        $imageTagsOutput = [];
-        preg_match_all("/\<img.*src\=(\'|\")(.*?)(\'|\").*?\>/i", $htmlContent, $imageTagsOutput);
-
-        // Replace image src with base64 encoded image strings
-        if (isset($imageTagsOutput[0]) && count($imageTagsOutput[0]) > 0) {
-            foreach ($imageTagsOutput[0] as $index => $imgMatch) {
-                $oldImgTagString = $imgMatch;
-                $srcString = $imageTagsOutput[2][$index];
-                $imageEncoded = $this->imageService->imageUriToBase64($srcString);
-                if ($imageEncoded === null) {
-                    $imageEncoded = $srcString;
-                }
-                $newImgTagString = str_replace($srcString, $imageEncoded, $oldImgTagString);
-                $htmlContent = str_replace($oldImgTagString, $newImgTagString, $htmlContent);
+        $dom = $this->getDom($htmlContent);
+        if ($dom === false) {
+            throw new ExportException(trans('errors.dom_parse_error'));
+        }
+
+        // Replace image src with base64 encoded image strings.
+        // Images that cannot be encoded (null result) keep their original src.
+        // Attribute changes are applied to the DOM directly; the document is
+        // only serialised once, at the end of this method.
+        $images = $dom->getElementsByTagName('img');
+        foreach ($images as $img) {
+            $base64String = $this->imageService->imageUriToBase64($img->getAttribute('src'));
+            if ($base64String !== null) {
+                $img->setAttribute('src', $base64String);
             }
         }
 
-        $linksOutput = [];
-        preg_match_all("/\<a.*href\=(\'|\")(.*?)(\'|\").*?\>/i", $htmlContent, $linksOutput);
-
-        // Replace image src with base64 encoded image strings
-        if (isset($linksOutput[0]) && count($linksOutput[0]) > 0) {
-            foreach ($linksOutput[0] as $index => $linkMatch) {
-                $oldLinkString = $linkMatch;
-                $srcString = $linksOutput[2][$index];
-                if (strpos(trim($srcString), 'http') !== 0) {
-                    $newSrcString = url($srcString);
-                    $newLinkString = str_replace($srcString, $newSrcString, $oldLinkString);
-                    $htmlContent = str_replace($oldLinkString, $newLinkString, $htmlContent);
-                }
+        // Replace all relative hrefs with absolute URLs.
+        $links = $dom->getElementsByTagName('a');
+        foreach ($links as $link) {
+            // Anchors without an href (named / id-only anchors) are not links;
+            // giving them one would turn them into links to the site root.
+            if (!$link->hasAttribute('href')) {
+                continue;
+            }
+
+            $href = $link->getAttribute('href');
+            if (strpos(trim($href), 'http') !== 0) {
+                $link->setAttribute('href', url($href));
             }
         }
 
-        // Replace problems caused by TinyMCE removing the protocol for YouTube, Google Maps, DailyMotion and Vimeo
-        if ($isPDF) {
-            $callback = [$this, 'replaceContentPDF'];
-            $htmlContent = $this->replaceLinkedTags(self::VIDEO_REGEX, $htmlContent, $callback, 'Video');
-        } else {
-            $callback = [$this, 'replaceContentHtml'];
+        // Fix the src of video, audio and iframe tags. For PDF output these
+        // elements are swapped for a labelled link to the source instead.
+        $xpath = new \DOMXPath($dom);
+        $srcElements = $xpath->query('//video | //audio | //iframe');
+        foreach ($srcElements as $element) {
+            $element = $this->fixRelativeSrc($element);
+
+            if ($isPDF) {
+                $src = $element->getAttribute('src');
+                $label = $this->getContentLabel($src);
+
+                $div = $dom->createElement('div');
+                $textNode = $dom->createTextNode($label);
+
+                $anchor = $dom->createElement('a');
+                $anchor->setAttribute('href', $src);
+                $anchor->textContent = $src;
+
+                $div->appendChild($textNode);
+                $div->appendChild($anchor);
+
+                $element->parentNode->replaceChild($div, $element);
+            }
         }
-        $htmlContent = $this->replaceLinkedTags(self::YOUTUBE_REGEX, $htmlContent, $callback, 'Video');
-        $htmlContent = $this->replaceLinkedTags(self::GOOGLE_MAP_REGEX, $htmlContent, $callback, 'Map');
-        $htmlContent = $this->replaceLinkedTags(self::DAILYMOTION_REGEX, $htmlContent, $callback, 'Video');
-        $htmlContent = $this->replaceLinkedTags(self::VIMEO_REGEX, $htmlContent, $callback, 'Video');
 
-        return $htmlContent;
+        return $dom->saveHTML();
     }
 
     /**
@@ -208,21 +218,43 @@ class ExportService
      * This method filters any bad looking content to provide a nice final output.
      * @param Page $page
      * @return mixed
+     * @throws \BookStack\Exceptions\ExportException
      */
     public function pageToPlainText(Page $page)
     {
         $html = $this->entityRepo->renderPage($page);
+        $dom = $this->getDom($html);
+
+        if ($dom === false) {
+            throw new ExportException(trans('errors.dom_parse_error'));
+        }
 
-        $callback = [$this, 'replaceContentText'];
-        // Replace video tag in PDF
-        $html = $this->replaceLinkedTags(self::VIDEO_REGEX, $html, $callback, 'Video');
-        // Replace problems caused by TinyMCE removing the protocol for YouTube, Google Maps, DailyMotion and Vimeo
-        $html = $this->replaceLinkedTags(self::YOUTUBE_REGEX, $html, $callback, 'Video');
-        $html = $this->replaceLinkedTags(self::GOOGLE_MAP_REGEX, $html, $callback, 'Map');
-        $html = $this->replaceLinkedTags(self::DAILYMOTION_REGEX, $html, $callback, 'Video');
-        $html = $this->replaceLinkedTags(self::VIMEO_REGEX, $html, $callback, 'Video');
+        // handle anchor tags.
+        $links = $dom->getElementsByTagName('a');
+        foreach ($links as $link) {
+            $href = $link->getAttribute('href');
+            if (strpos(trim($href), 'http') !== 0) {
+                $newHref = url($href);
+                $link->setAttribute('href', $newHref);
+            }
 
-        $text = strip_tags($html);
+            $link->textContent = trim($link->textContent . " ($href)");
+            $dom->saveHTML();
+        }
+
+        $xmlDoc = new \DOMXPath($dom);
+        $srcElements = $xmlDoc->query('//video | //audio | //iframe | //img');
+        foreach ($srcElements as $element) {
+            $element = $this->fixRelativeSrc($element);
+            $fixedSrc = $element->getAttribute('src');
+            $label = $this->getContentLabel($fixedSrc);
+            $finalLabel = "\n\n$label $fixedSrc\n\n";
+
+            $textNode = $dom->createTextNode($finalLabel);
+            $element->parentNode->replaceChild($textNode, $element);
+        }
+
+        $text = strip_tags($dom->saveHTML());
         // Replace multiple spaces with single spaces
         $text = preg_replace('/\ {2,}/', ' ', $text);
         // Reduce multiple horrid whitespace characters.
@@ -267,56 +299,36 @@ class ExportService
         return $text;
     }
 
-    /**
-     * Can be used to replace certain tags that cause problems such as the TinyMCE video tag
-     * modification that have to be undone.
-     * See - https://github.com/tinymce/tinymce/blob/0f7a0f12667bde6eae9377b50b797f4479aa1ac7/src/plugins/media/main/ts/core/UrlPatterns.ts#L22
-     * @param String $regex
-     * @param String $htmlContent
-     * @param array $callback
-     * @param String $contentLabel
-     * @return String $htmlContent - Modified html content
-     */
-    protected function replaceLinkedTags($regex, $htmlContent, $callback, $contentLabel = '') {
-        $iframeOutput = [];
-        preg_match_all($regex, $htmlContent, $iframeOutput);
-        if (isset($iframeOutput[0]) && count($iframeOutput[0]) > 0) {
-            foreach ($iframeOutput[0] as $index => $iframeMatch) {
-                $htmlContent = call_user_func($callback, $htmlContent, $iframeOutput, $index, $contentLabel);
-            }
-        }
-        return $htmlContent;
+    /**
+     * Parse an HTML string into a DOMDocument.
+     * Markup warnings raised by libxml while parsing are collected internally
+     * and discarded so imperfect page HTML does not surface as PHP warnings.
+     * @param string $htmlContent
+     * @return \DOMDocument
+     * @throws ExportException if the HTML could not be loaded
+     */
+    protected function getDom(string $htmlContent) : \DOMDocument
+    {
+        $dom = new \DOMDocument();
+
+        // loadHTML() rejects an empty string outright (warning on PHP 7,
+        // ValueError on PHP 8), so an empty page yields an empty document.
+        if (trim($htmlContent) === '') {
+            return $dom;
+        }
+
+        // See - https://stackoverflow.com/a/17559716/903324
+        // Restore the previous libxml error mode afterwards so this helper
+        // does not change error handling for the rest of the request.
+        $previousErrorMode = libxml_use_internal_errors(true);
+        $loaded = $dom->loadHTML($htmlContent);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previousErrorMode);
+
+        if ($loaded === false) {
+            throw new ExportException(trans('errors.dom_parse_error'));
+        }
+
+        return $dom;
     }
 
-    protected function replaceContentHtml($htmlContent, $iframeOutput, $index, $contentLabel) {
-        $srcString = $iframeOutput[2][$index];
-        $newSrcString = $srcString;
-        if (strpos($srcString, 'http') !== 0) {
-            $newSrcString = 'https:' . $srcString;
+    /**
+     * Ensure the src attribute of the given element carries a protocol.
+     * Values that do not already start with "http" are prefixed with "https:",
+     * which repairs protocol-relative embeds such as "//www.youtube.com/...".
+     * Elements with a missing or blank src are returned untouched.
+     * @param \DOMElement $element
+     * @return \DOMElement The same element instance, modified in place.
+     */
+    protected function fixRelativeSrc(\DOMElement $element): \DOMElement
+    {
+        // Work on the trimmed value so stray whitespace never ends up between
+        // the added protocol and the rest of the URL.
+        $src = trim($element->getAttribute('src'));
+
+        // Nothing to fix: no src at all (would otherwise become a bare
+        // "https:") or the src already has a http(s) protocol.
+        if ($src === '' || strpos($src, 'http') === 0) {
+            return $element;
         }
-        $htmlContent = str_replace($srcString, $newSrcString, $htmlContent);
-        return $htmlContent;
+
+        $element->setAttribute('src', 'https:' . $src);
+        return $element;
     }
 
-    protected function replaceContentPDF($htmlContent, $iframeOutput, $index, $contentLabel) {
-        $srcString = $iframeOutput[2][$index];
-        $newSrcString = $srcString;
-        if (strpos($srcString, 'http') !== 0) {
-            $newSrcString = 'https:' . $srcString;
-        }
-        $finalHtmlString = "$contentLabel: <a href='$newSrcString'>$newSrcString</a>";
-        $htmlContent = str_replace($iframeOutput[0][$index], $finalHtmlString, $htmlContent);
-        return $htmlContent;
-    }
 
-    protected function replaceContentText($htmlContent, $iframeOutput, $index, $contentLabel) {
-        $srcString = $iframeOutput[2][$index];
-        $newSrcString = $srcString;
-        if (strpos($srcString, 'http') !== 0) {
-            $newSrcString = 'https:' . $srcString;
+    /**
+     * Get the translated label used to describe an embed in exported output.
+     * The src is checked against the host names listed in $contentMatching;
+     * the key of the first group containing a matching host selects the
+     * "entities.<key>" translation. A generic label is used when nothing matches.
+     * @param string $src
+     * @return string
+     */
+    protected function getContentLabel(string $src) : string
+    {
+        foreach ($this->contentMatching as $key => $possibleValues) {
+            foreach ($possibleValues as $value) {
+                // strpos() returns 0 for a match at the very start of the
+                // string, so compare against false rather than truthiness.
+                if (strpos($src, $value) !== false) {
+                    return trans("entities.$key");
+                }
+            }
         }
-        $finalHtmlString = "$contentLabel: $newSrcString";
-        $htmlContent = str_replace($iframeOutput[0][$index], $finalHtmlString, $htmlContent);
-        return $htmlContent;
+        return trans('entities.embedded_content');
     }
 }
diff --git a/app/Exceptions/ExportException.php b/app/Exceptions/ExportException.php
new file mode 100644 (file)
index 0000000..74090d8
--- /dev/null
@@ -0,0 +1,6 @@
+<?php namespace BookStack\Exceptions;
+
+/**
+ * Exception thrown by ExportService when page HTML cannot be processed for export.
+ */
+class ExportException extends PrettyException
+{
+
+}
index 74595443b130c5137ec5608e89591749e794e2a6..3b3de5ffa5a829785eb62e90ba0da539b365889d 100644 (file)
@@ -495,13 +495,15 @@ class PageController extends Controller
      * https://github.com/barryvdh/laravel-dompdf
      * @param string $bookSlug
      * @param string $pageSlug
+     * @param Request $request
      * @return \Illuminate\Http\Response
      */
-    public function exportPdf($bookSlug, $pageSlug)
+    public function exportPdf($bookSlug, $pageSlug, Request $request)
     {
+        // NOTE(review): 'isTesting' is read from the query string of any request,
+        // so a caller can make this endpoint return the contained HTML under a
+        // .pdf filename. Confirm this test hook should be reachable outside tests.
+        $isTesting = $request->query('isTesting');
         $page = $this->pageRepo->getPageBySlug($pageSlug, $bookSlug);
         $page->html = $this->pageRepo->renderPage($page);
-        $pdfContent = $this->exportService->pageToPdf($page);
+        // Any non-empty flag value is passed on as pageToPdf()'s $isTesting argument.
+        $pdfContent = $this->exportService->pageToPdf($page, !empty($isTesting));
         return $this->downloadResponse($pdfContent, $pageSlug . '.pdf');
     }
 
index 2a64f57a3700c113814faa01ba8e6d6cbf9b95dd..6ca8863f09bc102f595b3690bf3bd235a7b5a31a 100644 (file)
@@ -289,5 +289,11 @@ return [
     // Revision
     'revision_delete_confirm' => 'Are you sure you want to delete this revision?',
     'revision_delete_success' => 'Revision deleted',
-    'revision_cannot_delete_latest' => 'Cannot delete the latest revision.'
+    'revision_cannot_delete_latest' => 'Cannot delete the latest revision.',
+
+    // PDF / Text Embeds
+    'video' => 'Video: ',
+    'map' => 'Map: ',
+    'embedded_content' => 'Embedded Content: '
+
 ];
\ No newline at end of file
index b91a0c3e11c96cfdc6f85d4f9940c472476610b2..7393360ef731a10bde278ff72b0dec0748ab6eef 100644 (file)
@@ -81,4 +81,7 @@ return [
     'app_down' => ':appName is down right now',
     'back_soon' => 'It will be back up soon.',
 
+    // Export errors
+    'dom_parse_error' => 'There was an error while exporting the page. This maybe caused due to the HTML structure of the page.'
+
 ];
index 9228c3bad0655e2549ebaf8596642421ea1b4552..9a3034defdf0ec4f89e3c49a0a4776a54c9c57f5 100644 (file)
@@ -157,4 +157,29 @@ class ExportTest extends TestCase
 
     }
 
+    /**
+     * Embedded video/iframe elements must not survive into PDF export HTML;
+     * each one should be replaced by a link to its protocol-fixed source.
+     */
+    public function test_pdf_export_no_video_iframe() {
+        $page = Page::first();
+        $page->html = '<p id="bkmrk-%C2%A0-0">&nbsp;</p>' .
+            '<p id="bkmrk-%C2%A0-1"><iframe src="//www.youtube.com/embed/LkFt_fp7FmE" width="560" height="314" allowfullscreen="allowfullscreen"></iframe></p>' .
+            '<p id="bkmrk-"><video src="//player.vimeo.com/video/276396369?title=0&amp;amp;byline=0" width="425" height="350" allowfullscreen="allowfullscreen"></video></p>' .
+            '<p id="bkmrk--0"><iframe style="border: 0;" src="//maps.google.com/embed?testquery=true" width="600" height="450" frameborder="0" allowfullscreen="allowfullscreen"></iframe></p>' .
+            '<p id="bkmrk--1"><iframe src="//www.dailymotion.com/embed/video/x2rqgfm" width="480" height="432" frameborder="0" allowfullscreen="allowfullscreen"></iframe></p>' .
+            '<p id="bkmrk-%C2%A0-2">&nbsp;</p>';
+
+        $page->save();
+
+        $this->asEditor();
+        $resp = $this->get($page->getUrl('/export/pdf?isTesting=true'));
+        $resp->assertStatus(200);
+
+        // The embed elements themselves must be gone...
+        $removedTags = [
+            '</video>',
+            '</iframe>'
+        ];
+
+        foreach ($removedTags as $removedTag) {
+            $resp->assertDontSee($removedTag);
+        }
+
+        // ...and replaced by links to the protocol-fixed sources, so content
+        // that was silently dropped would not pass this test.
+        $expectedLinks = [
+            'https://www.youtube.com/embed/LkFt_fp7FmE',
+            'https://maps.google.com/embed?testquery=true',
+            'https://www.dailymotion.com/embed/video/x2rqgfm'
+        ];
+
+        foreach ($expectedLinks as $expectedLink) {
+            $resp->assertSee($expectedLink);
+        }
+    }
+
 }
\ No newline at end of file