BookStack Code Mirror - bookstack/blob - app/Entities/Tools/PageContent.php

1 <?php namespace BookStack\Entities\Tools;

3 use BookStack\Entities\Models\Page;

4 use DOMDocument;

5 use DOMNodeList;

6 use DOMXPath;

7 use League\CommonMark\CommonMarkConverter;

class PageContent
{
    /**
     * The page whose content is read and written by this tool.
     *
     * @var Page
     */
    protected $page;

    /**
     * PageContent constructor.
     */
    public function __construct(Page $page)
    {
        $this->page = $page;
    }

    /**
     * Update the content of the page with new provided HTML.
     * The html is id-tagged via formatHtml(), the plain-text copy is
     * regenerated from it and any stored markdown is blanked.
     */
    public function setNewHTML(string $html)
    {
        $this->page->html = $this->formatHtml($html);
        $this->page->text = $this->toPlainText();
        $this->page->markdown = '';
    }

    /**
     * Update the content of the page with new provided Markdown content.
     * The markdown is stored as given, alongside the HTML and plain-text
     * forms generated from it.
     */
    public function setNewMarkdown(string $markdown)
    {
        $this->page->markdown = $markdown;
        $this->page->html = $this->formatHtml($this->markdownToHtml($markdown));
        $this->page->text = $this->toPlainText();
    }

    /**
     * Convert the given Markdown content to a HTML string.
     */
    protected function markdownToHtml(string $markdown): string
    {
        $converter = new CommonMarkConverter();

        return $converter->convertToHtml($markdown);
    }

    /**
     * Parse a HTML fragment into a DOMDocument.
     * The fragment is wrapped in an explicit <body> so that leading elements
     * such as <style>, <script> or <meta> are kept within the body instead of
     * being hoisted into an implied <head> by the parser.
     * Parser warnings are collected internally and cleared straight away so
     * they are neither emitted nor left to accumulate.
     */
    protected function loadDocumentFromHtml(string $html): DOMDocument
    {
        libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML(mb_convert_encoding('<body>' . $html . '</body>', 'HTML-ENTITIES', 'UTF-8'));
        libxml_clear_errors();

        return $doc;
    }

    /**
     * Serialise each node of the given list, in order, into one HTML string.
     */
    protected function nodesToHtml(DOMDocument $doc, DOMNodeList $nodes): string
    {
        $html = '';
        foreach ($nodes as $node) {
            $html .= $doc->saveHTML($node);
        }

        return $html;
    }

    /**
     * Get the HTML of everything inside the document's <body> element.
     * Returns an empty string if the document has no body.
     */
    protected function bodyInnerHtml(DOMDocument $doc): string
    {
        $body = $doc->getElementsByTagName('body')->item(0);

        return $body === null ? '' : $this->nodesToHtml($doc, $body->childNodes);
    }

    /**
     * Formats a page's html to be tagged correctly within the system.
     * Every top-level element is given a unique "bkmrk" id, ids on deeper
     * elements are made unique, and in-page links that pointed at a changed id
     * are updated to the new id.
     */
    protected function formatHtml(string $htmlText): string
    {
        if ($htmlText === '') {
            return $htmlText;
        }

        $doc = $this->loadDocumentFromHtml($htmlText);
        $body = $doc->getElementsByTagName('body')->item(0);
        if ($body === null) {
            return $htmlText;
        }

        $childNodes = $body->childNodes;
        $xPath = new DOMXPath($doc);
        $idMap = [];

        // Set ids on top-level nodes
        foreach ($childNodes as $childNode) {
            $this->assignUniqueId($childNode, $idMap, $xPath);
        }

        // Ensure no duplicate ids within child items
        foreach ($xPath->query('//body//*//*[@id]') as $domElem) {
            $this->assignUniqueId($domElem, $idMap, $xPath);
        }

        // Generate inner html as a string
        return $this->nodesToHtml($doc, $childNodes);
    }

    /**
     * Give the node a unique id and re-point links that used its previous id.
     * Links are only updated when the node actually had a previous id, since
     * an empty previous id would otherwise match plain href="#" links.
     */
    protected function assignUniqueId(\DOMNode $node, array &$idMap, DOMXPath $xPath): void
    {
        [$oldId, $newId] = $this->setUniqueId($node, $idMap);
        if ($oldId !== '' && $newId !== '' && $newId !== $oldId) {
            $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
        }
    }

    /**
     * Update the all links to the $old location to instead point to $new.
     * Only elements nested at least two levels below the body are matched.
     */
    protected function updateLinks(DOMXPath $xpath, string $old, string $new)
    {
        // Double quotes are stripped so the value cannot break out of the XPath string literal.
        $old = str_replace('"', '', $old);
        $matchingLinks = $xpath->query('//body//*//*[@href="' . $old . '"]');
        foreach ($matchingLinks as $domElem) {
            $domElem->setAttribute('href', $new);
        }
    }

    /**
     * Set a unique id on the given DOMElement.
     * A map for existing ID's should be passed in to check for current existence.
     * Returns a pair of strings in the format [old_id, new_id].
     * Nodes that are not elements are ignored and yield ['', ''].
     */
    protected function setUniqueId(\DOMNode $element, array &$idMap): array
    {
        if (!($element instanceof \DOMElement)) {
            return ['', ''];
        }

        // Stop if there's an existing valid id that has not already been used.
        $existingId = $element->getAttribute('id');
        if (strpos($existingId, 'bkmrk') === 0 && !isset($idMap[$existingId])) {
            $idMap[$existingId] = true;

            return [$existingId, $existingId];
        }

        // Create an unique id for the element.
        // Uses the content as a basis to ensure output is the same every time
        // the same content is passed through.
        $slug = strtolower(preg_replace('/\s+/', '-', trim($element->nodeValue)));
        $contentId = 'bkmrk-' . mb_substr($slug, 0, 20);
        $newId = urlencode($contentId);

        // Append an incrementing suffix until the id is not already taken.
        for ($suffix = 0; isset($idMap[$newId]); $suffix++) {
            $newId = urlencode($contentId . '-' . $suffix);
        }

        $element->setAttribute('id', $newId);
        $idMap[$newId] = true;

        return [$existingId, $newId];
    }

    /**
     * Get a plain-text visualisation of this page.
     * Built from the rendered html with include tags blanked, tags stripped
     * and entities decoded.
     */
    protected function toPlainText(): string
    {
        $html = $this->render(true);

        return html_entity_decode(strip_tags($html));
    }

    /**
     * Render the page for viewing.
     * Include tags are either removed ($blankIncludes) or replaced with the
     * referenced page content. Unless the 'app.allow_content_scripts' config
     * option is set, script content is then filtered from the combined result
     * so that content pulled in from other pages is filtered too.
     */
    public function render(bool $blankIncludes = false) : string
    {
        $content = (string) $this->page->html;

        $content = $blankIncludes
            ? $this->blankPageIncludes($content)
            : $this->parsePageIncludes($content);

        if (!config('app.allow_content_scripts')) {
            $content = $this->escapeScripts($content);
        }

        return $content;
    }

    /**
     * Parse the headers on the page to get a navigation menu.
     * Returns a list of arrays with 'nodeName', 'level', 'link' and 'text' keys.
     */
    public function getNavigation(string $htmlContent): array
    {
        if (empty($htmlContent)) {
            return [];
        }

        $doc = $this->loadDocumentFromHtml($htmlContent);
        $xPath = new DOMXPath($doc);
        $headers = $xPath->query('//h1|//h2|//h3|//h4|//h5|//h6');

        return $headers ? $this->headerNodesToLevelList($headers) : [];
    }

    /**
     * Convert a DOMNodeList into an array of readable header attributes
     * with levels normalised to the lower header level.
     * Headers without any text are left out.
     */
    protected function headerNodesToLevelList(DOMNodeList $nodeList): array
    {
        $tree = collect($nodeList)->map(function ($header) {
            // Non-breaking spaces are removed before trimming, then the text is capped at 100 characters.
            $text = trim(str_replace("\xc2\xa0", '', $header->nodeValue));
            $text = mb_substr($text, 0, 100);

            return [
                'nodeName' => strtolower($header->nodeName),
                'level'    => intval(str_replace('h', '', $header->nodeName)),
                'link'     => '#' . $header->getAttribute('id'),
                'text'     => $text,
            ];
        })->filter(function ($header) {
            return mb_strlen($header['text']) > 0;
        });

        // Shift headers if only smaller headers have been used
        $levelChange = ($tree->pluck('level')->min() - 1);
        $tree = $tree->map(function ($header) use ($levelChange) {
            $header['level'] -= $levelChange;

            return $header;
        });

        return $tree->toArray();
    }

    /**
     * Remove any page include tags within the given HTML.
     */
    protected function blankPageIncludes(string $html) : string
    {
        return preg_replace("/{{@\s?([0-9].*?)}}/", '', $html);
    }

    /**
     * Parse any include tags "{{@<page_id>#section}}" to be part of the page.
     * Tags referencing a page that cannot be found via the visible() scope are removed.
     */
    protected function parsePageIncludes(string $html) : string
    {
        $matches = [];
        preg_match_all("/{{@\s?([0-9].*?)}}/", $html, $matches);

        foreach ($matches[1] as $index => $includeId) {
            $fullMatch = $matches[0][$index];
            $splitInclude = explode('#', $includeId, 2);

            // Get page id from reference.
            // The pattern guarantees a leading digit so this is always an integer.
            $pageId = intval($splitInclude[0]);

            // Find page and blank the tag if the page is not found
            $matchedPage = Page::visible()->find($pageId);
            if ($matchedPage === null) {
                $html = str_replace($fullMatch, '', $html);
                continue;
            }

            // If we only have page id, just insert all page html and continue.
            if (count($splitInclude) === 1) {
                $html = str_replace($fullMatch, $matchedPage->html, $html);
                continue;
            }

            // Otherwise insert only the referenced section of the page
            $innerContent = $this->fetchSectionOfPage($matchedPage, $splitInclude[1]);
            $html = str_replace($fullMatch, trim($innerContent), $html);
        }

        return $html;
    }

    /**
     * Fetch the content from a specific section of the given page.
     * Returns an empty string if no element with the given id exists.
     */
    protected function fetchSectionOfPage(Page $page, string $sectionId): string
    {
        $topLevelTags = ['table', 'ul', 'ol'];
        $doc = $this->loadDocumentFromHtml((string) $page->html);

        // Search included content for the id given and blank out if not exists.
        $matchingElem = $doc->getElementById($sectionId);
        if ($matchingElem === null) {
            return '';
        }

        // Tables and lists are included together with their wrapping element,
        // for anything else only the inner content is used.
        if (in_array(strtolower($matchingElem->nodeName), $topLevelTags, true)) {
            return $doc->saveHTML($matchingElem);
        }

        return $this->nodesToHtml($doc, $matchingElem->childNodes);
    }

    /**
     * Build an XPath condition checking that $property contains $value,
     * ignoring the letter-case of the characters that make up $value.
     * The $value is expected to be a fixed string without quote characters.
     */
    protected function xpathContains(string $property, string $value): string
    {
        $lower = strtolower($value);
        $upper = strtoupper($value);

        return "contains(translate({$property}, '{$upper}', '{$lower}'), '{$lower}')";
    }

    /**
     * Detach every node in the given list from its parent.
     */
    protected function removeNodes(DOMNodeList $nodes): void
    {
        foreach ($nodes as $node) {
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    /**
     * Escape script tags within HTML content.
     * Removes script elements, elements using javascript: or data: URIs,
     * redirecting meta tags, srcdoc frames and all 'on*' attributes.
     * URI scheme checks ignore letter-case so "JavaScript:" is caught as well.
     */
    protected function escapeScripts(string $html) : string
    {
        if (empty($html)) {
            return $html;
        }

        $doc = $this->loadDocumentFromHtml($html);
        $xPath = new DOMXPath($doc);

        $jsHref = $this->xpathContains('@href', 'javascript:');
        $jsAction = $this->xpathContains('@action', 'javascript:');
        $jsFormAction = $this->xpathContains('@formaction', 'javascript:');
        $metaUrl = $this->xpathContains('@content', 'url');
        $dataSrc = $this->xpathContains('@src', 'data:');
        $jsSrc = $this->xpathContains('@src', 'javascript:');

        $removalQueries = [
            // Remove standard script tags
            '//script',
            // Remove clickable links to JavaScript URI
            "//*[{$jsHref}]",
            // Remove forms with calls to JavaScript URI
            "//*[{$jsAction}] | //*[{$jsFormAction}]",
            // Remove meta tag to prevent external redirects
            "//meta[{$metaUrl}]",
            // Remove data or JavaScript iFrames
            "//*[{$dataSrc}] | //*[{$jsSrc}] | //*[@srcdoc]",
        ];

        foreach ($removalQueries as $query) {
            $this->removeNodes($xPath->query($query));
        }

        // Remove 'on*' attributes
        $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
        foreach ($onAttributes as $attr) {
            /** @var \DOMAttr $attr */
            $attr->parentNode->removeAttribute($attr->nodeName);
        }

        return $this->bodyInnerHtml($doc);
    }

    /**
     * Retrieve first image in page content and return the source URL.
     * Returns null when the page has no content or contains no image.
     *
     * @return string|null
     */
    public function fetchFirstImage()
    {
        $htmlContent = (string) $this->page->html;
        if ($htmlContent === '') {
            return null;
        }

        $doc = $this->loadDocumentFromHtml($htmlContent);
        $firstImage = $doc->getElementsByTagName('img')->item(0);

        return $firstImage !== null ? $firstImage->getAttribute('src') : null;
    }
}