BookStack Code Mirror - bookstack/blob - app/Entities/Tools/PageContent.php

1 <?php namespace BookStack\Entities\Tools;

3 use BookStack\Entities\Models\Page;

4 use BookStack\Entities\Tools\Markdown\CustomStrikeThroughExtension;

5 use BookStack\Facades\Theme;

6 use BookStack\Theming\ThemeEvents;

7 use BookStack\Util\HtmlContentFilter;

8 use DOMDocument;

9 use DOMNodeList;

10 use DOMXPath;

11 use League\CommonMark\CommonMarkConverter;

12 use League\CommonMark\Environment;

13 use League\CommonMark\Extension\Table\TableExtension;

14 use League\CommonMark\Extension\TaskList\TaskListExtension;

16 class PageContent

17 {

/**
 * The page whose content is read and written by this helper.
 *
 * @var Page
 */
protected $page;

/**
 * Create a content helper bound to the given page.
 */
public function __construct(Page $page)
{
    $this->page = $page;
}

/**
 * Replace the page's content using the provided HTML.
 * The formatted HTML is stored on the page, the plain-text copy is
 * regenerated from it, and the markdown source is set to an empty string.
 */
public function setNewHTML(string $html)
{
    $formatted = $this->formatHtml($html);

    $page = $this->page;
    $page->html = $formatted;
    // Must follow the html assignment: the text is produced by rendering the page's html.
    $page->text = $this->toPlainText();
    $page->markdown = '';
}

/**
 * Replace the page's content using the provided Markdown.
 * The markdown is stored as given, then converted to HTML which is
 * formatted and stored, and finally the plain-text copy is regenerated.
 */
public function setNewMarkdown(string $markdown)
{
    $page = $this->page;

    $page->markdown = $markdown;
    $page->html = $this->formatHtml($this->markdownToHtml($markdown));
    // Must follow the html assignment: the text is produced by rendering the page's html.
    $page->text = $this->toPlainText();
}

/**
 * Turn the given Markdown content into a HTML string.
 * A CommonMark environment is built with the table, task-list and custom
 * strike-through extensions. The environment is then passed through the
 * COMMONMARK_ENVIRONMENT_CONFIGURE theme event; a non-null result from that
 * dispatch is used in place of the environment built here.
 */
protected function markdownToHtml(string $markdown): string
{
    $environment = Environment::createCommonMarkEnvironment();

    $extensions = [
        new TableExtension(),
        new TaskListExtension(),
        new CustomStrikeThroughExtension(),
    ];
    foreach ($extensions as $extension) {
        $environment->addExtension($extension);
    }

    $themedEnvironment = Theme::dispatch(ThemeEvents::COMMONMARK_ENVIRONMENT_CONFIGURE, $environment);
    if ($themedEnvironment !== null) {
        $environment = $themedEnvironment;
    }

    return (new CommonMarkConverter([], $environment))->convertToHtml($markdown);
}

/**
 * Formats a page's html to be tagged correctly within the system.
 *
 * Each top-level element is given a unique id (an existing "bkmrk"-prefixed
 * id is kept when it has not already been used), ids found deeper within the
 * content are de-duplicated, and links that targeted a replaced id are
 * updated to point at the new one.
 *
 * @param string $htmlText Page HTML to format.
 * @return string The HTML with ids applied. An empty string, or input the
 *                parser produces no root element for, is returned unchanged.
 */
protected function formatHtml(string $htmlText): string
{
    if ($htmlText === '') {
        return $htmlText;
    }

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML(mb_convert_encoding($htmlText, 'HTML-ENTITIES', 'UTF-8'));
    // Parse errors are collected internally because of the call above;
    // discard them so they do not accumulate across calls.
    libxml_clear_errors();

    $container = $doc->documentElement;
    if ($container === null) {
        return $htmlText;
    }

    // Collect the content nodes from every section the parser created.
    // The body is not guaranteed to be the first child of the root: content
    // starting with tags such as <style> or <script> results in an implied
    // <head> being created ahead of the <body>.
    $topLevelNodes = [];
    foreach ($container->childNodes as $section) {
        if (!($section instanceof \DOMElement)) {
            continue;
        }
        foreach ($section->childNodes as $node) {
            $topLevelNodes[] = $node;
        }
    }

    $xPath = new DOMXPath($doc);

    // Set ids on top-level nodes
    $idMap = [];
    foreach ($topLevelNodes as $childNode) {
        [$oldId, $newId] = $this->setUniqueId($childNode, $idMap);
        // Only follow links when there was a previous id to link to. Without
        // this, an element that had no id would cause every bare "#" link to
        // be re-pointed at its newly generated id.
        if ($oldId !== '' && $newId && $newId !== $oldId) {
            $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
        }
    }

    // Ensure no duplicate ids within child items
    $idElems = $xPath->query('//body//*//*[@id]');
    foreach ($idElems as $domElem) {
        [$oldId, $newId] = $this->setUniqueId($domElem, $idMap);
        if ($oldId !== '' && $newId && $newId !== $oldId) {
            $this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
        }
    }

    // Generate inner html as a string
    $html = '';
    foreach ($topLevelNodes as $childNode) {
        $html .= $doc->saveHTML($childNode);
    }

    return $html;
}

108

/**
 * Re-point every matching link from the $old location to the $new one.
 * Only elements matched by the nested-content XPath below are considered,
 * and their href must equal $old exactly (after double quotes are removed).
 */
protected function updateLinks(DOMXPath $xpath, string $old, string $new)
{
    // Double quotes are stripped so the value cannot terminate the quoted XPath literal early.
    $target = str_replace('"', '', $old);
    $expression = '//body//*//*[@href="' . $target . '"]';

    foreach ($xpath->query($expression) as $link) {
        $link->setAttribute('href', $new);
    }
}

120

/**
 * Ensure the given node carries an id that is unique within $idMap.
 * The map is keyed by id and is updated in place with any id that is
 * kept or generated here.
 * Returns a pair of strings in the format [old_id, new_id]; nodes that
 * are not exactly a DOMElement are left alone and yield ['', ''].
 */
protected function setUniqueId(\DOMNode $element, array &$idMap): array
{
    if (get_class($element) !== 'DOMElement') {
        return ['', ''];
    }

    // Keep an id that already has the expected prefix, provided nothing else has claimed it.
    $currentId = $element->getAttribute('id');
    $hasBookmarkPrefix = strpos($currentId, 'bkmrk') === 0;
    if ($hasBookmarkPrefix && !isset($idMap[$currentId])) {
        $idMap[$currentId] = true;
        return [$currentId, $currentId];
    }

    // Derive the id from the element's text so that the same content
    // produces the same id each time it is processed.
    $slug = preg_replace('/\s+/', '-', trim($element->nodeValue));
    $baseId = 'bkmrk-' . mb_substr(strtolower($slug), 0, 20);

    // Append an incrementing suffix until the id is not already taken.
    $candidate = urlencode($baseId);
    for ($suffix = 0; isset($idMap[$candidate]); $suffix++) {
        $candidate = urlencode($baseId . '-' . $suffix);
    }

    $element->setAttribute('id', $candidate);
    $idMap[$candidate] = true;

    return [$currentId, $candidate];
}

155

/**
 * Produce a plain-text version of this page.
 * The page is rendered with include tags blanked, tags are stripped
 * from the result and HTML entities are then decoded.
 */
protected function toPlainText(): string
{
    $withoutTags = strip_tags($this->render(true));

    return html_entity_decode($withoutTags);
}

164

/**
 * Render the page's stored HTML for viewing.
 * Scripts are removed unless the 'app.allow_content_scripts' config
 * option is enabled. Include tags are then either removed (when
 * $blankIncludes is true) or replaced with the referenced content.
 *
 * NOTE(review): script removal runs before include tags are expanded, so
 * HTML inserted by parsePageIncludes() does not pass through that filter
 * here. Confirm this is intended.
 */
public function render(bool $blankIncludes = false) : string
{
    $content = $this->page->html;

    $scriptsAllowed = config('app.allow_content_scripts');
    if (!$scriptsAllowed) {
        $content = HtmlContentFilter::removeScripts($content);
    }

    return $blankIncludes
        ? $this->blankPageIncludes($content)
        : $this->parsePageIncludes($content);
}

184

/**
 * Parse the headers on the page to get a navigation menu.
 *
 * @param string $htmlContent HTML to search for h1-h6 elements.
 * @return array One entry per header, as built by headerNodesToLevelList().
 *               Empty when the content is empty or the header query fails.
 */
public function getNavigation(string $htmlContent): array
{
    if (empty($htmlContent)) {
        return [];
    }

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML(mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8'));
    // Parse errors are collected internally because of the call above;
    // discard them so they do not accumulate across calls.
    libxml_clear_errors();

    $xPath = new DOMXPath($doc);
    $headers = $xPath->query("//h1|//h2|//h3|//h4|//h5|//h6");

    return $headers ? $this->headerNodesToLevelList($headers) : [];
}

202

/**
 * Build an array of header details from the given DOMNodeList.
 * Each entry holds the node name, numeric level, a '#'-prefixed link to the
 * header's id and the header text (non-breaking spaces removed, trimmed and
 * limited to 100 characters). Headers without text are left out, and levels
 * are shifted so that the smallest level in use becomes 1.
 */
protected function headerNodesToLevelList(DOMNodeList $nodeList): array
{
    $headers = collect($nodeList)
        ->map(function ($node) {
            $label = str_replace("\xc2\xa0", '', $node->nodeValue);
            $label = mb_substr(trim($label), 0, 100);

            return [
                'nodeName' => strtolower($node->nodeName),
                'level' => intval(str_replace('h', '', $node->nodeName)),
                'link' => '#' . $node->getAttribute('id'),
                'text' => $label,
            ];
        })
        ->filter(function ($entry) {
            return mb_strlen($entry['text']) > 0;
        });

    // Amount to subtract so that the lowest level used ends up as level 1.
    $offset = $headers->pluck('level')->min() - 1;

    return $headers->map(function ($entry) use ($offset) {
        $entry['level'] -= $offset;
        return $entry;
    })->toArray();
}

232

/**
 * Strip every page include tag (such as "{{@12}}" or "{{@12#section}}")
 * out of the given HTML, leaving nothing in its place.
 */
protected function blankPageIncludes(string $html) : string
{
    $includeTagPattern = "/{{@\s?([0-9].*?)}}/";

    return preg_replace($includeTagPattern, '', $html);
}

240

/**
 * Parse any include tags "{{@<page_id>#section}}" to be part of the page.
 *
 * Each tag is replaced by the stored html of the referenced page, or by a
 * single section of it when a "#section" id is given. Tags that reference
 * a page not found via the visible-pages query are removed.
 *
 * @param string $html HTML that may contain include tags.
 * @return string The HTML with all include tags replaced or removed.
 */
protected function parsePageIncludes(string $html) : string
{
    $matches = [];
    preg_match_all("/{{@\s?([0-9].*?)}}/", $html, $matches);

    foreach ($matches[1] as $index => $includeId) {
        $fullMatch = $matches[0][$index];

        // A tag repeated in the content has already been replaced in full by
        // an earlier iteration, so skip the lookup when nothing is left to swap.
        if (strpos($html, $fullMatch) === false) {
            continue;
        }

        $splitInclude = explode('#', $includeId, 2);

        // Get page id from reference.
        // The pattern above only captures references starting with a digit
        // and intval() always returns an integer, so no further validity
        // check is needed here.
        $pageId = intval($splitInclude[0]);

        // Find page and remove the tag if the page is not found
        $matchedPage = Page::visible()->find($pageId);
        if ($matchedPage === null) {
            $html = str_replace($fullMatch, '', $html);
            continue;
        }

        // If we only have page id, just insert all page html and continue.
        if (count($splitInclude) === 1) {
            $html = str_replace($fullMatch, $matchedPage->html, $html);
            continue;
        }

        // Otherwise insert only the requested section of the page
        $innerContent = $this->fetchSectionOfPage($matchedPage, $splitInclude[1]);
        $html = str_replace($fullMatch, trim($innerContent), $html);
    }

    return $html;
}

279

280

/**
 * Fetch the content from a specific section of the given page.
 *
 * @param Page   $page      Page whose stored html is searched.
 * @param string $sectionId Id attribute of the element to extract.
 * @return string For a table, ul or ol element the whole element is
 *                returned; for any other element only its inner content is.
 *                An empty string is returned when no element has the id.
 */
protected function fetchSectionOfPage(Page $page, string $sectionId): string
{
    $topLevelTags = ['table', 'ul', 'ol'];
    $doc = new DOMDocument();
    libxml_use_internal_errors(true);
    $doc->loadHTML(mb_convert_encoding('<body>'.$page->html.'</body>', 'HTML-ENTITIES', 'UTF-8'));

    // Search included content for the id given; the result stays blank if it does not exist.
    $innerContent = '';
    $matchingElem = $doc->getElementById($sectionId);

    if ($matchingElem !== null) {
        // Checks if the top-level wrapper should be included by matching on tag types
        $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags, true);
        if ($isTopLevel) {
            $innerContent .= $doc->saveHTML($matchingElem);
        } else {
            foreach ($matchingElem->childNodes as $childNode) {
                $innerContent .= $doc->saveHTML($childNode);
            }
        }
    }

    // Always discard the collected parse errors, including when no section was found.
    libxml_clear_errors();

    return $innerContent;
}

312 }