BookStack Code Mirror - bookstack/blob - app/Entities/Tools/PageContent.php

1 <?php namespace BookStack\Entities\Tools;

3 use BookStack\Auth\Permissions\PermissionService;

4 use BookStack\Entities\Models\Page;

5 use BookStack\Entities\Tools\Markdown\CustomStrikeThroughExtension;

6 use BookStack\Facades\Theme;

7 use BookStack\Theming\ThemeEvents;

8 use BookStack\Uploads\Image;

9 use BookStack\Uploads\ImageRepo;

10 use BookStack\Uploads\ImageService;

11 use DOMDocument;

12 use DOMNodeList;

13 use DOMXPath;

14 use League\CommonMark\CommonMarkConverter;

15 use League\CommonMark\Environment;

16 use League\CommonMark\Extension\Table\TableExtension;

17 use League\CommonMark\Extension\TaskList\TaskListExtension;

19 class PageContent

20 {

22 protected $page;

24 /**

25 * PageContent constructor.

26 */

27 public function __construct(Page $page)

28 {

29 $this->page = $page;

30 }

32 /**

33 * Update the content of the page with new provided HTML.

34 */

35 public function setNewHTML(string $html)

36 {

37 $html = $this->saveBase64Images($this->page, $html);

38 $this->page->html = $this->formatHtml($html);

39 $this->page->text = $this->toPlainText();

40 $this->page->markdown = '';

41 }

43 /**

44 * Update the content of the page with new provided Markdown content.

45 */

46 public function setNewMarkdown(string $markdown)

47 {

48 $this->page->markdown = $markdown;

49 $html = $this->markdownToHtml($markdown);

50 $this->page->html = $this->formatHtml($html);

51 $this->page->text = $this->toPlainText();

52 }

54 /**

55 * Convert the given Markdown content to a HTML string.

56 */

57 protected function markdownToHtml(string $markdown): string

58 {

59 $environment = Environment::createCommonMarkEnvironment();

60 $environment->addExtension(new TableExtension());

61 $environment->addExtension(new TaskListExtension());

62 $environment->addExtension(new CustomStrikeThroughExtension());

63 $environment = Theme::dispatch(ThemeEvents::COMMONMARK_ENVIRONMENT_CONFIGURE, $environment) ?? $environment;

64 $converter = new CommonMarkConverter([], $environment);

65 return $converter->convertToHtml($markdown);

66 }

68 /**

69 * Convert all base64 image data to saved images

70 */

71 public function saveBase64Images(Page $page, string $htmlText): string

72 {

73 if ($htmlText == '') {

74 return $htmlText;

75 }

77 libxml_use_internal_errors(true);

78 $doc = new DOMDocument();

79 $doc->loadHTML(mb_convert_encoding($htmlText, 'HTML-ENTITIES', 'UTF-8'));

80 $container = $doc->documentElement;

81 $body = $container->childNodes->item(0);

82 $childNodes = $body->childNodes;

83 $xPath = new DOMXPath($doc);

85 // Get all img elements with image data blobs

86 $imageNodes = $xPath->query('//img[contains(@src, \'data:image\')]');

87 foreach($imageNodes as $imageNode) {

88 $imageSrc = $imageNode->getAttribute('src');

90 # Parse base64 data

91 $result = preg_match('"data:image/[a-zA-Z]*(;base64,[a-zA-Z0-9+/\\= ]*)"', $imageSrc, $matches);

93 if($result === 1) {

94 $base64ImageData = $matches[1];

96 $image = new Image();

97 $imageService = app()->make(ImageService::class);

98 $permissionService = app(PermissionService::class);

99 $imageRepo = new ImageRepo(new Image(), $imageService, $permissionService, $page);

100

101 # Use existing saveDrawing method used for Drawio diagrams

102 $image = $imageRepo->saveDrawing($base64ImageData, $page->id);

103

104 // Create a new img element with the saved image URI

105 $newNode = $doc->createElement('img');

106 $newNode->setAttribute('src', $image->path);

107

108 // Replace the old img element

109 $imageNode->parentNode->replaceChild($newNode, $imageNode);

110 }

111 }

112

113 // Generate inner html as a string

114 $html = '';

115 foreach ($childNodes as $childNode) {

116 $html .= $doc->saveHTML($childNode);

117 }

118

119 return $html;

120 }

121

122 /**

123 * Formats a page's html to be tagged correctly within the system.

124 */

125 protected function formatHtml(string $htmlText): string

126 {

127 if ($htmlText == '') {

128 return $htmlText;

129 }

130

131 libxml_use_internal_errors(true);

132 $doc = new DOMDocument();

133 $doc->loadHTML(mb_convert_encoding($htmlText, 'HTML-ENTITIES', 'UTF-8'));

134

135 $container = $doc->documentElement;

136 $body = $container->childNodes->item(0);

137 $childNodes = $body->childNodes;

138 $xPath = new DOMXPath($doc);

139

140 // Set ids on top-level nodes

141 $idMap = [];

142 foreach ($childNodes as $index => $childNode) {

143 [$oldId, $newId] = $this->setUniqueId($childNode, $idMap);

144 if ($newId && $newId !== $oldId) {

145 $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);

146 }

147 }

148

149 // Ensure no duplicate ids within child items

150 $idElems = $xPath->query('//body//*//*[@id]');

151 foreach ($idElems as $domElem) {

152 [$oldId, $newId] = $this->setUniqueId($domElem, $idMap);

153 if ($newId && $newId !== $oldId) {

154 $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);

155 }

156 }

157

158 // Generate inner html as a string

159 $html = '';

160 foreach ($childNodes as $childNode) {

161 $html .= $doc->saveHTML($childNode);

162 }

163

164 return $html;

165 }

166

167 /**

168 * Update the all links to the $old location to instead point to $new.

169 */

170 protected function updateLinks(DOMXPath $xpath, string $old, string $new)

171 {

172 $old = str_replace('"', '', $old);

173 $matchingLinks = $xpath->query('//body//*//*[@href="'.$old.'"]');

174 foreach ($matchingLinks as $domElem) {

175 $domElem->setAttribute('href', $new);

176 }

177 }

178

179 /**

180 * Set a unique id on the given DOMElement.

181 * A map for existing ID's should be passed in to check for current existence.

182 * Returns a pair of strings in the format [old_id, new_id]

183 */

184 protected function setUniqueId(\DOMNode $element, array &$idMap): array

185 {

186 if (get_class($element) !== 'DOMElement') {

187 return ['', ''];

188 }

189

190 // Stop if there's an existing valid id that has not already been used.

191 $existingId = $element->getAttribute('id');

192 if (strpos($existingId, 'bkmrk') === 0 && !isset($idMap[$existingId])) {

193 $idMap[$existingId] = true;

194 return [$existingId, $existingId];

195 }

196

197 // Create an unique id for the element

198 // Uses the content as a basis to ensure output is the same every time

199 // the same content is passed through.

200 $contentId = 'bkmrk-' . mb_substr(strtolower(preg_replace('/\s+/', '-', trim($element->nodeValue))), 0, 20);

201 $newId = urlencode($contentId);

202 $loopIndex = 0;

203

204 while (isset($idMap[$newId])) {

205 $newId = urlencode($contentId . '-' . $loopIndex);

206 $loopIndex++;

207 }

208

209 $element->setAttribute('id', $newId);

210 $idMap[$newId] = true;

211 return [$existingId, $newId];

212 }

213

214 /**

215 * Get a plain-text visualisation of this page.

216 */

217 protected function toPlainText(): string

218 {

219 $html = $this->render(true);

220 return html_entity_decode(strip_tags($html));

221 }

222

223 /**

224 * Render the page for viewing

225 */

226 public function render(bool $blankIncludes = false) : string

227 {

228 $content = $this->page->html;

229

230 if (!config('app.allow_content_scripts')) {

231 $content = $this->escapeScripts($content);

232 }

233

234 if ($blankIncludes) {

235 $content = $this->blankPageIncludes($content);

236 } else {

237 $content = $this->parsePageIncludes($content);

238 }

239

240 return $content;

241 }

242

243 /**

244 * Parse the headers on the page to get a navigation menu

245 */

246 public function getNavigation(string $htmlContent): array

247 {

248 if (empty($htmlContent)) {

249 return [];

250 }

251

252 libxml_use_internal_errors(true);

253 $doc = new DOMDocument();

254 $doc->loadHTML(mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8'));

255 $xPath = new DOMXPath($doc);

256 $headers = $xPath->query("//h1|//h2|//h3|//h4|//h5|//h6");

257

258 return $headers ? $this->headerNodesToLevelList($headers) : [];

259 }

260

261 /**

262 * Convert a DOMNodeList into an array of readable header attributes

263 * with levels normalised to the lower header level.

264 */

265 protected function headerNodesToLevelList(DOMNodeList $nodeList): array

266 {

267 $tree = collect($nodeList)->map(function ($header) {

268 $text = trim(str_replace("\xc2\xa0", '', $header->nodeValue));

269 $text = mb_substr($text, 0, 100);

270

271 return [

272 'nodeName' => strtolower($header->nodeName),

273 'level' => intval(str_replace('h', '', $header->nodeName)),

274 'link' => '#' . $header->getAttribute('id'),

275 'text' => $text,

276 ];

277 })->filter(function ($header) {

278 return mb_strlen($header['text']) > 0;

279 });

280

281 // Shift headers if only smaller headers have been used

282 $levelChange = ($tree->pluck('level')->min() - 1);

283 $tree = $tree->map(function ($header) use ($levelChange) {

284 $header['level'] -= ($levelChange);

285 return $header;

286 });

287

288 return $tree->toArray();

289 }

290

291 /**

292 * Remove any page include tags within the given HTML.

293 */

294 protected function blankPageIncludes(string $html) : string

295 {

296 return preg_replace("/{{@\s?([0-9].*?)}}/", '', $html);

297 }

298

299 /**

300 * Parse any include tags "{{@<page_id>#section}}" to be part of the page.

301 */

302 protected function parsePageIncludes(string $html) : string

303 {

304 $matches = [];

305 preg_match_all("/{{@\s?([0-9].*?)}}/", $html, $matches);

306

307 foreach ($matches[1] as $index => $includeId) {

308 $fullMatch = $matches[0][$index];

309 $splitInclude = explode('#', $includeId, 2);

310

311 // Get page id from reference

312 $pageId = intval($splitInclude[0]);

313 if (is_nan($pageId)) {

314 continue;

315 }

316

317 // Find page and skip this if page not found

318 $matchedPage = Page::visible()->find($pageId);

319 if ($matchedPage === null) {

320 $html = str_replace($fullMatch, '', $html);

321 continue;

322 }

323

324 // If we only have page id, just insert all page html and continue.

325 if (count($splitInclude) === 1) {

326 $html = str_replace($fullMatch, $matchedPage->html, $html);

327 continue;

328 }

329

330 // Create and load HTML into a document

331 $innerContent = $this->fetchSectionOfPage($matchedPage, $splitInclude[1]);

332 $html = str_replace($fullMatch, trim($innerContent), $html);

333 }

334

335 return $html;

336 }

337

338

339 /**

340 * Fetch the content from a specific section of the given page.

341 */

342 protected function fetchSectionOfPage(Page $page, string $sectionId): string

343 {

344 $topLevelTags = ['table', 'ul', 'ol'];

345 $doc = new DOMDocument();

346 libxml_use_internal_errors(true);

347 $doc->loadHTML(mb_convert_encoding('<body>'.$page->html.'</body>', 'HTML-ENTITIES', 'UTF-8'));

348

349 // Search included content for the id given and blank out if not exists.

350 $matchingElem = $doc->getElementById($sectionId);

351 if ($matchingElem === null) {

352 return '';

353 }

354

355 // Otherwise replace the content with the found content

356 // Checks if the top-level wrapper should be included by matching on tag types

357 $innerContent = '';

358 $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags);

359 if ($isTopLevel) {

360 $innerContent .= $doc->saveHTML($matchingElem);

361 } else {

362 foreach ($matchingElem->childNodes as $childNode) {

363 $innerContent .= $doc->saveHTML($childNode);

364 }

365 }

366 libxml_clear_errors();

367

368 return $innerContent;

369 }

370

371 /**

372 * Escape script tags within HTML content.

373 */

374 protected function escapeScripts(string $html) : string

375 {

376 if (empty($html)) {

377 return $html;

378 }

379

380 libxml_use_internal_errors(true);

381 $doc = new DOMDocument();

382 $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));

383 $xPath = new DOMXPath($doc);

384

385 // Remove standard script tags

386 $scriptElems = $xPath->query('//script');

387 foreach ($scriptElems as $scriptElem) {

388 $scriptElem->parentNode->removeChild($scriptElem);

389 }

390

391 // Remove clickable links to JavaScript URI

392 $badLinks = $xPath->query('//*[contains(@href, \'javascript:\')]');

393 foreach ($badLinks as $badLink) {

394 $badLink->parentNode->removeChild($badLink);

395 }

396

397 // Remove forms with calls to JavaScript URI

398 $badForms = $xPath->query('//*[contains(@action, \'javascript:\')] | //*[contains(@formaction, \'javascript:\')]');

399 foreach ($badForms as $badForm) {

400 $badForm->parentNode->removeChild($badForm);

401 }

402

403 // Remove meta tag to prevent external redirects

404 $metaTags = $xPath->query('//meta[contains(@content, \'url\')]');

405 foreach ($metaTags as $metaTag) {

406 $metaTag->parentNode->removeChild($metaTag);

407 }

408

409 // Remove data or JavaScript iFrames

410 $badIframes = $xPath->query('//*[contains(@src, \'data:\')] | //*[contains(@src, \'javascript:\')] | //*[@srcdoc]');

411 foreach ($badIframes as $badIframe) {

412 $badIframe->parentNode->removeChild($badIframe);

413 }

414

415 // Remove 'on*' attributes

416 $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');

417 foreach ($onAttributes as $attr) {

418 /** @var \DOMAttr $attr*/

419 $attrName = $attr->nodeName;

420 $attr->parentNode->removeAttribute($attrName);

421 }

422

423 $html = '';

424 $topElems = $doc->documentElement->childNodes->item(0)->childNodes;

425 foreach ($topElems as $child) {

426 $html .= $doc->saveHTML($child);

427 }

428

429 return $html;

430 }

431 }