use BookStack\Entities\Tools\Markdown\HtmlToMarkdown;
use BookStack\Uploads\ImageService;
use BookStack\Util\CspService;
-use DOMDocument;
+use BookStack\Util\HtmlDocument;
use DOMElement;
-use DOMXPath;
use Exception;
use Throwable;
protected function htmlToPdf(string $html): string
{
$html = $this->containHtml($html);
- $html = $this->replaceIframesWithLinks($html);
- $html = $this->openDetailElements($html);
+ $doc = new HtmlDocument();
+ $doc->loadCompleteHtml($html);
- return $this->pdfGenerator->fromHtml($html);
+ $this->replaceIframesWithLinks($doc);
+ $this->openDetailElements($doc);
+ $cleanedHtml = $doc->getHtml();
+
+ return $this->pdfGenerator->fromHtml($cleanedHtml);
}
/**
* Within the given HTML content, Open any detail blocks.
*/
- protected function openDetailElements(string $html): string
+ protected function openDetailElements(HtmlDocument $doc): void
{
- libxml_use_internal_errors(true);
-
- $doc = new DOMDocument();
- $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
- $xPath = new DOMXPath($doc);
-
- $details = $xPath->query('//details');
+ $details = $doc->queryXPath('//details');
/** @var DOMElement $detail */
foreach ($details as $detail) {
$detail->setAttribute('open', 'open');
}
-
- return $doc->saveHTML();
}
/**
- * Within the given HTML content, replace any iframe elements
+ * Within the given HTML document, replace any iframe elements
* with anchor links within paragraph blocks.
*/
- protected function replaceIframesWithLinks(string $html): string
+ protected function replaceIframesWithLinks(HtmlDocument $doc): void
{
- libxml_use_internal_errors(true);
-
- $doc = new DOMDocument();
- $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
- $xPath = new DOMXPath($doc);
+ $iframes = $doc->queryXPath('//iframe');
- $iframes = $xPath->query('//iframe');
/** @var DOMElement $iframe */
foreach ($iframes as $iframe) {
$link = $iframe->getAttribute('src');
$paragraph->appendChild($anchor);
$iframe->parentNode->replaceChild($paragraph, $iframe);
}
-
- return $doc->saveHTML();
}
/**
foreach ($imageTagsOutput[0] as $index => $imgMatch) {
$oldImgTagString = $imgMatch;
$srcString = $imageTagsOutput[2][$index];
- $imageEncoded = $this->imageService->imageUriToBase64($srcString);
+ $imageEncoded = $this->imageService->imageUrlToBase64($srcString);
if ($imageEncoded === null) {
$imageEncoded = $srcString;
}
* Converts the page contents into simple plain text.
* This method filters any bad looking content to provide a nice final output.
*/
- public function pageToPlainText(Page $page): string
+ public function pageToPlainText(Page $page, bool $pageRendered = false, bool $fromParent = false): string
{
- $html = (new PageContent($page))->render();
- $text = strip_tags($html);
+ $html = $pageRendered ? $page->html : (new PageContent($page))->render();
+ // Add a space before each tag so that text from adjacent
+ // elements stays separated after the tags are stripped.
+ $html = str_replace('<', " <", $html);
+ $text = trim(strip_tags($html));
// Replace multiple spaces with single spaces
- $text = preg_replace('/\ {2,}/', ' ', $text);
+ $text = preg_replace('/ {2,}/', ' ', $text);
// Reduce multiple horrid whitespace characters.
$text = preg_replace('/(\x0A|\xA0|\x0A|\r|\n){2,}/su', "\n\n", $text);
$text = html_entity_decode($text);
// Add title
- $text = $page->name . "\n\n" . $text;
+ $text = $page->name . ($fromParent ? "\n" : "\n\n") . $text;
return $text;
}
*/
public function chapterToPlainText(Chapter $chapter): string
{
- $text = $chapter->name . "\n\n";
- $text .= $chapter->description . "\n\n";
+ $text = $chapter->name . "\n" . $chapter->description;
+ $text = trim($text) . "\n\n";
+
+ $parts = [];
foreach ($chapter->getVisiblePages() as $page) {
- $text .= $this->pageToPlainText($page);
+ $parts[] = $this->pageToPlainText($page, false, true);
}
- return $text;
+ return $text . implode("\n\n", $parts);
}
/**
*/
public function bookToPlainText(Book $book): string
{
- $bookTree = (new BookContents($book))->getTree(false, false);
- $text = $book->name . "\n\n";
+ $bookTree = (new BookContents($book))->getTree(false, true);
+ $text = $book->name . "\n" . $book->description;
+ $text = rtrim($text) . "\n\n";
+
+ $parts = [];
foreach ($bookTree as $bookChild) {
if ($bookChild->isA('chapter')) {
- $text .= $this->chapterToPlainText($bookChild);
+ $parts[] = $this->chapterToPlainText($bookChild);
} else {
- $text .= $this->pageToPlainText($bookChild);
+ $parts[] = $this->pageToPlainText($bookChild, true, true);
}
}
- return $text;
+ return $text . implode("\n\n", $parts);
}
/**
public function chapterToMarkdown(Chapter $chapter): string
{
$text = '# ' . $chapter->name . "\n\n";
- $text .= $chapter->description . "\n\n";
+
+ $description = (new HtmlToMarkdown($chapter->descriptionHtml()))->convert();
+ if ($description) {
+ $text .= $description . "\n\n";
+ }
+
foreach ($chapter->pages as $page) {
$text .= $this->pageToMarkdown($page) . "\n\n";
}
{
$bookTree = (new BookContents($book))->getTree(false, true);
$text = '# ' . $book->name . "\n\n";
+
+ $description = (new HtmlToMarkdown($book->descriptionHtml()))->convert();
+ if ($description) {
+ $text .= $description . "\n\n";
+ }
+
foreach ($bookTree as $bookChild) {
if ($bookChild instanceof Chapter) {
$text .= $this->chapterToMarkdown($bookChild) . "\n\n";