BookStack Code Mirror - bookstack/blob - app/Entities/Tools/PageContent.php

1 <?php namespace BookStack\Entities\Tools;

3 use BookStack\Entities\Models\Page;

4 use BookStack\Entities\Tools\Markdown\CustomStrikeThroughExtension;

5 use BookStack\Facades\Theme;

6 use BookStack\Theming\ThemeEvents;

7 use DOMDocument;

8 use DOMNodeList;

9 use DOMXPath;

10 use League\CommonMark\CommonMarkConverter;

11 use League\CommonMark\Environment;

12 use League\CommonMark\Extension\Table\TableExtension;

13 use League\CommonMark\Extension\TaskList\TaskListExtension;

15 class PageContent

16 {

/**
 * The page whose content this helper reads and updates.
 *
 * @var Page
 */
protected $page;

/**
 * Create a content helper bound to the given page.
 */
public function __construct(Page $page)
{
    $this->page = $page;
}

/**
 * Replace the page content using the provided HTML.
 * The HTML is passed through formatHtml(), the plain-text copy is
 * regenerated from the result, and any stored markdown is blanked.
 */
public function setNewHTML(string $html)
{
    $page = $this->page;

    // The text copy is derived from the stored html, so html must be set first.
    $page->html = $this->formatHtml($html);
    $page->text = $this->toPlainText();
    $page->markdown = '';
}

/**
 * Replace the page content using the provided Markdown.
 * The markdown is stored as given, converted to HTML, formatted via
 * formatHtml(), and the plain-text copy is regenerated from the result.
 */
public function setNewMarkdown(string $markdown)
{
    $page = $this->page;

    $page->markdown = $markdown;
    $page->html = $this->formatHtml($this->markdownToHtml($markdown));

    // The text copy is derived from the stored html, so this runs last.
    $page->text = $this->toPlainText();
}

/**
 * Render the given Markdown to an HTML string.
 * Uses a CommonMark environment with the table, task-list and custom
 * strike-through extensions registered. The environment is offered to the
 * theme system, which may hand back a replacement to use instead.
 */
protected function markdownToHtml(string $markdown): string
{
    $commonMarkEnv = Environment::createCommonMarkEnvironment();

    $extensionClasses = [
        TableExtension::class,
        TaskListExtension::class,
        CustomStrikeThroughExtension::class,
    ];
    foreach ($extensionClasses as $extensionClass) {
        $commonMarkEnv->addExtension(new $extensionClass());
    }

    // Keep our own environment unless the theme event returns a non-null value.
    $themedEnv = Theme::dispatch(ThemeEvents::COMMONMARK_ENVIRONMENT_CONFIGURE, $commonMarkEnv);
    if ($themedEnv !== null) {
        $commonMarkEnv = $themedEnv;
    }

    return (new CommonMarkConverter([], $commonMarkEnv))->convertToHtml($markdown);
}

/**
 * Format a page's HTML so that its elements are tagged with unique ids.
 *
 * Every top-level element is given a unique id, nested elements that already
 * carry an id are de-duplicated, and links that pointed at an id which has
 * been changed are updated to point at the replacement id.
 *
 * @param string $htmlText Raw HTML content for the page.
 * @return string The formatted inner HTML; an empty input is returned as-is.
 */
protected function formatHtml(string $htmlText): string
{
    if ($htmlText == '') {
        return $htmlText;
    }

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    // Wrap in an explicit <body> so that leading elements such as <style>,
    // <script> or <meta> are not hoisted into an implied <head>, which would
    // otherwise cause the real body content to be dropped from the output.
    $doc->loadHTML(mb_convert_encoding('<body>' . $htmlText . '</body>', 'HTML-ENTITIES', 'UTF-8'));
    // Parse warnings are collected internally; discard them so they do not pile up.
    libxml_clear_errors();

    $body = $doc->getElementsByTagName('body')->item(0);
    if ($body === null) {
        return '';
    }

    $childNodes = $body->childNodes;
    $xPath = new DOMXPath($doc);

    // Set ids on top-level nodes
    $idMap = [];
    foreach ($childNodes as $childNode) {
        [$oldId, $newId] = $this->setUniqueId($childNode, $idMap);
        // Only re-point links when there was a previous id to link to;
        // an empty old id would otherwise match every plain "#" link.
        if ($oldId !== '' && $newId && $newId !== $oldId) {
            $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
        }
    }

    // Ensure no duplicate ids within child items
    $idElems = $xPath->query('//body//*//*[@id]');
    foreach ($idElems as $domElem) {
        [$oldId, $newId] = $this->setUniqueId($domElem, $idMap);
        if ($oldId !== '' && $newId && $newId !== $oldId) {
            $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
        }
    }

    // Generate inner html as a string
    $html = '';
    foreach ($childNodes as $childNode) {
        $html .= $doc->saveHTML($childNode);
    }

    return $html;
}

107

/**
 * Re-point links from the $old location to the $new location.
 * Double quotes are stripped from $old before it is placed in the XPath
 * expression. Matching is done on elements nested beneath another element
 * within the body whose href equals $old exactly.
 */
protected function updateLinks(DOMXPath $xpath, string $old, string $new)
{
    $safeOld = str_replace('"', '', $old);
    $query = '//body//*//*[@href="' . $safeOld . '"]';

    foreach ($xpath->query($query) as $linkElem) {
        $linkElem->setAttribute('href', $new);
    }
}

119

/**
 * Ensure the given node carries an id that is unique within $idMap.
 * The map of ids seen so far is passed by reference and gains the id
 * that ends up on the element.
 * Returns a pair of strings in the format [old_id, new_id]; a node that is
 * not exactly a DOMElement is left alone and yields ['', ''].
 */
protected function setUniqueId(\DOMNode $element, array &$idMap): array
{
    if ('DOMElement' !== get_class($element)) {
        return ['', ''];
    }

    // An existing "bkmrk"-prefixed id is kept as long as nothing else has claimed it.
    $currentId = $element->getAttribute('id');
    $isReusable = strpos($currentId, 'bkmrk') === 0 && !isset($idMap[$currentId]);
    if ($isReusable) {
        $idMap[$currentId] = true;
        return [$currentId, $currentId];
    }

    // Derive the id from the element's text content so that the same
    // content produces the same id each time it is processed.
    $slug = preg_replace('/\s+/', '-', trim($element->nodeValue));
    $baseId = 'bkmrk-' . mb_substr(strtolower($slug), 0, 20);

    // Append an increasing numeric suffix until an unused id is found.
    $candidate = urlencode($baseId);
    for ($suffix = 0; isset($idMap[$candidate]); $suffix++) {
        $candidate = urlencode($baseId . '-' . $suffix);
    }

    $element->setAttribute('id', $candidate);
    $idMap[$candidate] = true;

    return [$currentId, $candidate];
}

154

/**
 * Produce a plain-text version of this page.
 * The page is rendered with include tags blanked, tags are stripped
 * and HTML entities are decoded.
 */
protected function toPlainText(): string
{
    $rendered = $this->render(true);
    $withoutTags = strip_tags($rendered);

    return html_entity_decode($withoutTags);
}

163

/**
 * Render the page content for viewing.
 *
 * Include tags are resolved first (or blanked when $blankIncludes is true)
 * and script escaping is applied afterwards, so that HTML pulled in from
 * other pages through include tags is escaped along with this page's own
 * content. Escaping is skipped when the 'app.allow_content_scripts' config
 * option is truthy.
 *
 * @param bool $blankIncludes When true, include tags are removed instead of expanded.
 * @return string The rendered HTML.
 */
public function render(bool $blankIncludes = false) : string
{
    $content = $this->page->html;

    if ($blankIncludes) {
        $content = $this->blankPageIncludes($content);
    } else {
        $content = $this->parsePageIncludes($content);
    }

    // Must run after includes are expanded; running it before would let
    // scripts within included page content through unfiltered.
    if (!config('app.allow_content_scripts')) {
        $content = $this->escapeScripts($content);
    }

    return $content;
}

183

/**
 * Build a navigation menu from the headers found in the given HTML.
 * Returns an empty array when the content is empty or the header
 * query does not produce a result.
 */
public function getNavigation(string $htmlContent): array
{
    if (empty($htmlContent)) {
        return [];
    }

    libxml_use_internal_errors(true);
    $document = new DOMDocument();
    $document->loadHTML(mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8'));

    $headerNodes = (new DOMXPath($document))->query("//h1|//h2|//h3|//h4|//h5|//h6");
    if (!$headerNodes) {
        return [];
    }

    return $this->headerNodesToLevelList($headerNodes);
}

201

/**
 * Turn a list of header nodes into an array of header details
 * (nodeName, level, link and text), dropping headers without text
 * and shifting levels so that the smallest level in use becomes 1.
 */
protected function headerNodesToLevelList(DOMNodeList $nodeList): array
{
    $headers = collect($nodeList)
        ->map(function ($node) {
            // Remove non-breaking spaces, trim, then cap the label at 100 characters.
            $label = str_replace("\xc2\xa0", '', $node->nodeValue);
            $label = mb_substr(trim($label), 0, 100);

            return [
                'nodeName' => strtolower($node->nodeName),
                'level' => intval(str_replace('h', '', $node->nodeName)),
                'link' => '#' . $node->getAttribute('id'),
                'text' => $label,
            ];
        })
        ->filter(function ($entry) {
            return mb_strlen($entry['text']) > 0;
        });

    // Shift headers if only smaller headers have been used
    $levelOffset = $headers->pluck('level')->min() - 1;

    return $headers->map(function ($entry) use ($levelOffset) {
        $entry['level'] = $entry['level'] - $levelOffset;
        return $entry;
    })->toArray();
}

231

/**
 * Strip every page include tag out of the given HTML.
 * A tag is "{{@", an optional whitespace character, content starting
 * with a digit, then "}}".
 */
protected function blankPageIncludes(string $html) : string
{
    $includeTagPattern = "/{{@\s?([0-9].*?)}}/";

    return preg_replace($includeTagPattern, '', $html);
}

239

/**
 * Parse any include tags "{{@<page_id>#section}}" to be part of the page.
 *
 * Each tag is replaced with the HTML of the referenced page, or with just
 * the referenced section when a "#section" part is given. Tags whose page
 * cannot be found through the visible() query are replaced with nothing.
 *
 * @param string $html HTML that may contain include tags.
 * @return string The HTML with include tags replaced.
 */
protected function parsePageIncludes(string $html) : string
{
    $matches = [];
    preg_match_all("/{{@\s?([0-9].*?)}}/", $html, $matches);

    foreach ($matches[1] as $index => $includeId) {
        $fullMatch = $matches[0][$index];
        $splitInclude = explode('#', $includeId, 2);

        // Get page id from reference.
        // The pattern only matches references starting with a digit and
        // intval() always returns an integer, so no further validity check applies.
        $pageId = intval($splitInclude[0]);

        // Find page and blank this tag if page not found
        $matchedPage = Page::visible()->find($pageId);
        if ($matchedPage === null) {
            $html = str_replace($fullMatch, '', $html);
            continue;
        }

        // If we only have page id, just insert all page html and continue.
        if (count($splitInclude) === 1) {
            $html = str_replace($fullMatch, $matchedPage->html, $html);
            continue;
        }

        // Otherwise insert only the content of the referenced section
        $innerContent = $this->fetchSectionOfPage($matchedPage, $splitInclude[1]);
        $html = str_replace($fullMatch, trim($innerContent), $html);
    }

    return $html;
}

278

279

/**
 * Fetch the content from a specific section of the given page.
 *
 * The section is located by element id within the page's HTML. For table,
 * ul and ol elements the element itself is returned; for any other element
 * only its child content is returned.
 *
 * @param Page   $page      Page whose HTML is searched.
 * @param string $sectionId Id of the element to extract.
 * @return string HTML of the section, or an empty string when no element has that id.
 */
protected function fetchSectionOfPage(Page $page, string $sectionId): string
{
    $topLevelTags = ['table', 'ul', 'ol'];
    $doc = new DOMDocument();
    libxml_use_internal_errors(true);
    $doc->loadHTML(mb_convert_encoding('<body>'.$page->html.'</body>', 'HTML-ENTITIES', 'UTF-8'));
    // Discard collected parse warnings straight away so that the early
    // return below cannot leave them sitting in the libxml error buffer.
    libxml_clear_errors();

    // Search included content for the id given and blank out if not exists.
    $matchingElem = $doc->getElementById($sectionId);
    if ($matchingElem === null) {
        return '';
    }

    // Otherwise replace the content with the found content
    // Checks if the top-level wrapper should be included by matching on tag types
    $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags, true);
    if ($isTopLevel) {
        return $doc->saveHTML($matchingElem);
    }

    $innerContent = '';
    foreach ($matchingElem->childNodes as $childNode) {
        $innerContent .= $doc->saveHTML($childNode);
    }

    return $innerContent;
}

311

/**
 * Escape script tags within HTML content.
 *
 * Removes, in order: script elements; elements whose href, action or
 * formaction contains "javascript:"; meta elements whose content contains
 * "url"; elements whose src contains "data:" or "javascript:" or which have
 * a srcdoc attribute; and finally all attributes whose name starts with "on".
 * Value matching ignores ASCII letter case so that variants such as
 * "JavaScript:" are caught as well.
 *
 * @param string $html HTML content to filter.
 * @return string The filtered HTML; empty input is returned as-is.
 */
protected function escapeScripts(string $html) : string
{
    if (empty($html)) {
        return $html;
    }

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    // Wrap in an explicit <body> so that leading elements such as <style> or
    // <meta> are not hoisted into an implied <head>, which would otherwise
    // cause the real body content to be missing from the output below.
    $doc->loadHTML(mb_convert_encoding('<body>' . $html . '</body>', 'HTML-ENTITIES', 'UTF-8'));
    // Parse warnings are collected internally; discard them so they do not pile up.
    libxml_clear_errors();
    $xPath = new DOMXPath($doc);

    // Builds an XPath 1.0 "contains" test that lower-cases ASCII letters of the
    // attribute value first, since XPath 1.0 has no case-insensitive comparison.
    $containsLower = function (string $attribute, string $needle): string {
        return "contains(translate(" . $attribute
            . ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '" . $needle . "')";
    };

    $removalQueries = [
        // Remove standard script tags
        '//script',
        // Remove clickable links to JavaScript URI
        '//*[' . $containsLower('@href', 'javascript:') . ']',
        // Remove forms with calls to JavaScript URI
        '//*[' . $containsLower('@action', 'javascript:') . '] | //*[' . $containsLower('@formaction', 'javascript:') . ']',
        // Remove meta tag to prevent external redirects
        '//meta[' . $containsLower('@content', 'url') . ']',
        // Remove data or JavaScript iFrames
        '//*[' . $containsLower('@src', 'data:') . '] | //*[' . $containsLower('@src', 'javascript:') . '] | //*[@srcdoc]',
    ];

    foreach ($removalQueries as $removalQuery) {
        foreach ($xPath->query($removalQuery) as $badElem) {
            if ($badElem->parentNode !== null) {
                $badElem->parentNode->removeChild($badElem);
            }
        }
    }

    // Remove 'on*' attributes
    $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
    foreach ($onAttributes as $attr) {
        /** @var \DOMAttr $attr*/
        $attrName = $attr->nodeName;
        $attr->parentNode->removeAttribute($attrName);
    }

    $body = $doc->getElementsByTagName('body')->item(0);
    if ($body === null) {
        return '';
    }

    $html = '';
    foreach ($body->childNodes as $child) {
        $html .= $doc->saveHTML($child);
    }

    return $html;
}

372 }