BookStack Code Mirror - bookstack/blob - app/Entities/Tools/PageContent.php

1 <?php

3 namespace BookStack\Entities\Tools;

5 use BookStack\Entities\Models\Page;

6 use BookStack\Entities\Tools\Markdown\CustomListItemRenderer;

7 use BookStack\Entities\Tools\Markdown\CustomStrikeThroughExtension;

8 use BookStack\Exceptions\ImageUploadException;

9 use BookStack\Facades\Theme;

10 use BookStack\Theming\ThemeEvents;

11 use BookStack\Uploads\ImageRepo;

12 use BookStack\Util\HtmlContentFilter;

13 use DOMDocument;

14 use DOMNodeList;

15 use DOMXPath;

16 use Illuminate\Support\Str;

17 use League\CommonMark\Block\Element\ListItem;

18 use League\CommonMark\CommonMarkConverter;

19 use League\CommonMark\Environment;

20 use League\CommonMark\Extension\Table\TableExtension;

21 use League\CommonMark\Extension\TaskList\TaskListExtension;

23 class PageContent

24 {

/**
 * The page whose content is read and written by this helper.
 *
 * @var Page
 */
protected $page;

/**
 * Bind this content helper to the given page.
 */
public function __construct(Page $page)
{
    $this->page = $page;
}

/**
 * Replace the page content using the provided HTML.
 *
 * Inline base64 images are extracted first, the result is formatted and
 * stored as the page HTML, the plain-text copy is regenerated from it and
 * the stored markdown is blanked.
 */
public function setNewHTML(string $html)
{
    $withoutInlineImages = $this->extractBase64ImagesFromHtml($html);
    $formatted = $this->formatHtml($withoutInlineImages);

    $this->page->html = $formatted;
    // The plain text is derived from the HTML assigned just above.
    $this->page->text = $this->toPlainText();
    $this->page->markdown = '';
}

/**
 * Replace the page content using the provided Markdown.
 *
 * Inline base64 images are extracted first; the resulting markdown is stored
 * and also converted to HTML, which is formatted and stored alongside a
 * regenerated plain-text copy.
 */
public function setNewMarkdown(string $markdown)
{
    $storedMarkdown = $this->extractBase64ImagesFromMarkdown($markdown);
    $this->page->markdown = $storedMarkdown;

    $renderedHtml = $this->markdownToHtml($storedMarkdown);
    $this->page->html = $this->formatHtml($renderedHtml);
    // The plain text is derived from the HTML assigned just above.
    $this->page->text = $this->toPlainText();
}

/**
 * Render the given Markdown content as a HTML string.
 *
 * A CommonMark environment is built with the table, task-list and custom
 * strike-through extensions, then offered to the theme system which may
 * hand back a replacement environment.
 */
protected function markdownToHtml(string $markdown): string
{
    $environment = Environment::createCommonMarkEnvironment();

    $extensions = [
        new TableExtension(),
        new TaskListExtension(),
        new CustomStrikeThroughExtension(),
    ];
    foreach ($extensions as $extension) {
        $environment->addExtension($extension);
    }

    // A null result from the theme event means "keep the environment as built".
    $themedEnvironment = Theme::dispatch(ThemeEvents::COMMONMARK_ENVIRONMENT_CONFIGURE, $environment);
    if ($themedEnvironment !== null) {
        $environment = $themedEnvironment;
    }

    $converter = new CommonMarkConverter([], $environment);

    // Registered after the converter is created, matching the established order of operations.
    $environment->addBlockRenderer(ListItem::class, new CustomListItemRenderer(), 10);

    return $converter->convertToHtml($markdown);
}

/**
 * Swap base64 image data found in img "src" attributes for the URLs of
 * newly saved images, returning the updated HTML.
 *
 * Content that is empty or holds no "data:image" text is returned untouched.
 */
protected function extractBase64ImagesFromHtml(string $htmlText): string
{
    $mayHoldImageData = !empty($htmlText) && strpos($htmlText, 'data:image') !== false;
    if (!$mayHoldImageData) {
        return $htmlText;
    }

    $doc = $this->loadDocumentFromHtml($htmlText);
    // The loaded content is wrapped in a body element, the first child of the document root.
    $body = $doc->documentElement->childNodes->item(0);
    $xPath = new DOMXPath($doc);

    // Re-point every img element carrying an image data blob
    foreach ($xPath->query('//img[contains(@src, \'data:image\')]') as $imageNode) {
        $uploadedUrl = $this->base64ImageUriToUploadedImageUrl($imageNode->getAttribute('src'));
        $imageNode->setAttribute('src', $uploadedUrl);
    }

    // Serialise the body's children back to a HTML string
    $html = '';
    foreach ($body->childNodes as $childNode) {
        $html .= $doc->saveHTML($childNode);
    }

    return $html;
}

106

/**
 * Convert all inline base64 image content within the given markdown to uploaded image files.
 *
 * Each distinct data URI found in an image reference is uploaded once and
 * every occurrence of it in the markdown is replaced with the resulting URL
 * (or an empty string where the upload helper yields none).
 */
protected function extractBase64ImagesFromMarkdown(string $markdown): string
{
    $matches = [];
    preg_match_all('/!\[.*?]\(.*?(data:image\/.*?)[)"\s]/', $markdown, $matches);

    // str_replace() swaps every occurrence of a URI in one go, so a URI that
    // appears several times must only be uploaded once to avoid orphan uploads.
    foreach (array_unique($matches[1]) as $base64Match) {
        $newUrl = $this->base64ImageUriToUploadedImageUrl($base64Match);
        $markdown = str_replace($base64Match, $newUrl, $markdown);
    }

    return $markdown;
}

122

/**
 * Store the image described by the given base64 image URI and return the URL of the saved image.
 * An empty string is returned when the URI holds no data, its extension is
 * not supported, or saving the image raises an ImageUploadException.
 */
protected function base64ImageUriToUploadedImageUrl(string $uri): string
{
    $imageRepo = app()->make(ImageRepo::class);
    $imageInfo = $this->parseBase64ImageUri($uri);

    // Reject missing content first, then unsupported extensions
    $hasData = !empty($imageInfo['data']);
    if (!$hasData || !$imageRepo->imageExtensionSupported($imageInfo['extension'])) {
        return '';
    }

    // Store under a randomised name within the gallery of the current page
    $imageName = 'embedded-image-' . Str::random(8) . '.' . $imageInfo['extension'];

    try {
        $image = $imageRepo->saveNewFromData($imageName, $imageInfo['data'], 'gallery', $this->page->id);
    } catch (ImageUploadException $exception) {
        return '';
    }

    return $image->url;
}

148

/**
 * Parse a base64 image URI into the data and extension.
 *
 * The part before the first comma is treated as the data definition, from
 * which the lower-cased extension is read; the remainder is base64-decoded.
 * A URI without a comma yields empty data rather than raising an
 * undefined-offset error.
 *
 * @return array{extension: string, data: string}
 */
protected function parseBase64ImageUri(string $uri): array
{
    // Pad so that both parts always exist, even when no comma is present.
    [$dataDefinition, $base64ImageData] = array_pad(explode(',', $uri, 2), 2, '');
    $extension = strtolower(preg_split('/[\/;]/', $dataDefinition)[1] ?? '');

    return [
        'extension' => $extension,
        // base64_decode() gives false on failure; normalise that to an empty string.
        'data'      => base64_decode($base64ImageData) ?: '',
    ];
}

162

/**
 * Formats a page's html to be tagged correctly within the system.
 *
 * Every top-level element is given a unique id, ids on nested elements are
 * de-duplicated, and links that pointed at a replaced id are re-pointed to
 * the new one. Empty input is returned as-is.
 */
protected function formatHtml(string $htmlText): string
{
    if (empty($htmlText)) {
        return $htmlText;
    }

    $doc = $this->loadDocumentFromHtml($htmlText);
    $container = $doc->documentElement;
    $body = $container->childNodes->item(0);
    $childNodes = $body->childNodes;
    $xPath = new DOMXPath($doc);

    // Set ids on top-level nodes
    $idMap = [];
    foreach ($childNodes as $childNode) {
        [$oldId, $newId] = $this->setUniqueId($childNode, $idMap);
        // Only re-point links when an actual id was replaced. An element that had
        // no id reports an empty old id, which would otherwise make every plain
        // "#" link get rewritten to target that element.
        if ($oldId !== '' && $newId && $newId !== $oldId) {
            $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
        }
    }

    // Ensure no duplicate ids within child items
    $idElems = $xPath->query('//body//*//*[@id]');
    foreach ($idElems as $domElem) {
        [$oldId, $newId] = $this->setUniqueId($domElem, $idMap);
        if ($oldId !== '' && $newId && $newId !== $oldId) {
            $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
        }
    }

    // Generate inner html as a string
    $html = '';
    foreach ($childNodes as $childNode) {
        $html .= $doc->saveHTML($childNode);
    }

    return $html;
}

204

/**
 * Update all links pointing at the $old location to instead point to $new.
 *
 * Only elements nested at least two levels below the body are considered,
 * and a link is updated only when its href is exactly equal to $old.
 */
protected function updateLinks(DOMXPath $xpath, string $old, string $new)
{
    // The href is compared in PHP instead of being embedded in the XPath
    // expression, so values containing quote characters are matched exactly
    // and cannot break or alter the query.
    $linkCandidates = $xpath->query('//body//*//*[@href]');
    foreach ($linkCandidates as $domElem) {
        if ($domElem->getAttribute('href') === $old) {
            $domElem->setAttribute('href', $new);
        }
    }
}

216

/**
 * Ensure the given node carries an id that is unique within $idMap.
 *
 * $idMap holds the ids already in use as keys and is updated with the id
 * this element ends up with. The result is the pair [old_id, new_id];
 * nodes that are not plain DOMElement instances give ['', ''].
 */
protected function setUniqueId(\DOMNode $element, array &$idMap): array
{
    if (get_class($element) !== 'DOMElement') {
        return ['', ''];
    }

    // Keep an existing id when it has the expected prefix and is not yet taken.
    $existingId = $element->getAttribute('id');
    $hasExpectedPrefix = strpos($existingId, 'bkmrk') === 0;
    if ($hasExpectedPrefix && !isset($idMap[$existingId])) {
        $idMap[$existingId] = true;

        return [$existingId, $existingId];
    }

    // Derive the id from the element's text so identical content always
    // produces identical output.
    $dashedText = preg_replace('/\s+/', '-', trim($element->nodeValue));
    $contentId = 'bkmrk-' . mb_substr(strtolower($dashedText), 0, 20);
    $newId = urlencode($contentId);

    // Append an incrementing suffix until an unused id is found.
    for ($suffix = 0; isset($idMap[$newId]); $suffix++) {
        $newId = urlencode($contentId . '-' . $suffix);
    }

    $element->setAttribute('id', $newId);
    $idMap[$newId] = true;

    return [$existingId, $newId];
}

253

/**
 * Produce a plain-text version of this page: the page is rendered with
 * include tags blanked, tags are stripped and HTML entities are decoded.
 */
protected function toPlainText(): string
{
    $renderedWithoutIncludes = $this->render(true);
    $withoutTags = strip_tags($renderedWithoutIncludes);

    return html_entity_decode($withoutTags);
}

263

/**
 * Render the page for viewing.
 *
 * Include tags are either blanked (when $blankIncludes is true) or expanded
 * with content from the referenced pages. Unless the "app.allow_content_scripts"
 * config option is enabled, scripts are then removed from the final content.
 */
public function render(bool $blankIncludes = false): string
{
    $content = $this->page->html ?? '';

    // Resolve include tags first: expanded includes insert the stored html of
    // other pages, and that html must pass through the script filter as well.
    if ($blankIncludes) {
        $content = $this->blankPageIncludes($content);
    } else {
        $content = $this->parsePageIncludes($content);
    }

    if (!config('app.allow_content_scripts')) {
        $content = HtmlContentFilter::removeScripts($content);
    }

    return $content;
}

283

/**
 * Build the navigation menu entries from the h1-h6 headers found in the given HTML.
 * Empty content, or a failed header query, gives an empty list.
 */
public function getNavigation(string $htmlContent): array
{
    if (empty($htmlContent)) {
        return [];
    }

    $xPath = new DOMXPath($this->loadDocumentFromHtml($htmlContent));
    $headers = $xPath->query('//h1|//h2|//h3|//h4|//h5|//h6');
    if (!$headers) {
        return [];
    }

    return $this->headerNodesToLevelList($headers);
}

299

/**
 * Turn a list of header nodes into an array of readable header details
 * (nodeName, level, link and text), skipping headers without text and
 * shifting levels so that the smallest level in use becomes 1.
 */
protected function headerNodesToLevelList(DOMNodeList $nodeList): array
{
    $describeHeader = function ($header) {
        // Drop non-breaking spaces before trimming, then cap the text length.
        $withoutNbsp = str_replace("\xc2\xa0", '', $header->nodeValue);
        $text = mb_substr(trim($withoutNbsp), 0, 100);

        return [
            'nodeName' => strtolower($header->nodeName),
            'level'    => intval(str_replace('h', '', $header->nodeName)),
            'link'     => '#' . $header->getAttribute('id'),
            'text'     => $text,
        ];
    };

    $hasText = function ($header) {
        return mb_strlen($header['text']) > 0;
    };

    $tree = collect($nodeList)->map($describeHeader)->filter($hasText);

    // Shift headers if only smaller headers have been used
    $smallestLevel = $tree->pluck('level')->min();
    $levelChange = ($smallestLevel - 1);

    $shifted = $tree->map(function ($header) use ($levelChange) {
        $header['level'] -= ($levelChange);

        return $header;
    });

    return $shifted->toArray();
}

330

/**
 * Strip every page include tag from the given HTML, leaving nothing in its place.
 */
protected function blankPageIncludes(string $html): string
{
    $includeTagPattern = "/{{@\s?([0-9].*?)}}/";
    $withoutIncludes = preg_replace($includeTagPattern, '', $html);

    return $withoutIncludes;
}

338

/**
 * Parse any include tags "{{@<page_id>#section}}" to be part of the page.
 *
 * Each tag is replaced with the html of the referenced visible page, or with
 * only the requested section of it when a "#section" part is given. Tags
 * referencing a page that cannot be found are removed.
 */
protected function parsePageIncludes(string $html): string
{
    $matches = [];
    preg_match_all("/{{@\s?([0-9].*?)}}/", $html, $matches);

    foreach ($matches[1] as $index => $includeId) {
        $fullMatch = $matches[0][$index];
        $splitInclude = explode('#', $includeId, 2);

        // Get page id from reference. The pattern only matches references that
        // start with a digit, so intval() always yields an integer id here.
        $pageId = intval($splitInclude[0]);

        // Find page and skip this if page not found
        /** @var ?Page $matchedPage */
        $matchedPage = Page::visible()->find($pageId);
        if ($matchedPage === null) {
            $html = str_replace($fullMatch, '', $html);
            continue;
        }

        // If we only have page id, just insert all page html and continue.
        // A page without stored html contributes an empty string.
        if (count($splitInclude) === 1) {
            $html = str_replace($fullMatch, $matchedPage->html ?? '', $html);
            continue;
        }

        // Otherwise insert only the requested section of the page
        $innerContent = $this->fetchSectionOfPage($matchedPage, $splitInclude[1]);
        $html = str_replace($fullMatch, trim($innerContent), $html);
    }

    return $html;
}

378

/**
 * Fetch the content from a specific section of the given page.
 *
 * The section is the element of the page html whose id equals $sectionId.
 * For table, ul and ol elements the element itself is returned; for any
 * other element only its children are returned. An empty string is returned
 * when no element has the given id.
 */
protected function fetchSectionOfPage(Page $page, string $sectionId): string
{
    $topLevelTags = ['table', 'ul', 'ol'];
    // A page without stored html is handled as empty content.
    $doc = $this->loadDocumentFromHtml($page->html ?? '');

    // Search included content for the id given and blank out if not exists.
    $matchingElem = $doc->getElementById($sectionId);
    if ($matchingElem === null) {
        // Discard buffered parse errors on this path too.
        libxml_clear_errors();

        return '';
    }

    // Otherwise replace the content with the found content
    // Checks if the top-level wrapper should be included by matching on tag types
    $innerContent = '';
    $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags, true);
    if ($isTopLevel) {
        $innerContent .= $doc->saveHTML($matchingElem);
    } else {
        foreach ($matchingElem->childNodes as $childNode) {
            $innerContent .= $doc->saveHTML($childNode);
        }
    }
    libxml_clear_errors();

    return $innerContent;
}

408

/**
 * Create and load a DOMDocument from the given html content.
 *
 * The content is wrapped in a body element before loading. libxml errors
 * are collected internally instead of being emitted, and the collected
 * errors are discarded once loading is complete.
 */
protected function loadDocumentFromHtml(string $html): DOMDocument
{
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = '<body>' . $html . '</body>';

    // Non-ASCII characters are passed to the parser as numeric entities.
    // This replaces the "HTML-ENTITIES" mbstring encoding, deprecated as of PHP 8.2.
    $doc->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));

    // Parse warnings are not inspected, so do not let them pile up in the libxml buffer.
    libxml_clear_errors();

    return $doc;
}

421 }