3 namespace BookStack\Entities\Tools;
5 use BookStack\Actions\Tag;
6 use BookStack\Entities\EntityProvider;
7 use BookStack\Entities\Models\Entity;
8 use BookStack\Entities\Models\Page;
9 use BookStack\Entities\Models\SearchTerm;
12 use Illuminate\Support\Collection;
/**
 * A list of delimiter characters used to break-up parsed content into terms for indexing.
 *
 * @var string
 */
public static $delimiters = " \n\t.,!?:;()[]{}<>`'\"";

/**
 * Provider queried for the entity models that are walked when re-indexing everything.
 *
 * @var EntityProvider
 */
protected $entityProvider;
/**
 * @param EntityProvider $entityProvider Provider whose all() result is iterated when re-indexing every entity.
 */
public function __construct(EntityProvider $entityProvider)
{
    $this->entityProvider = $entityProvider;
}
/**
 * Index the given entity.
 *
 * Terms already stored for the entity are deleted first. The freshly generated
 * terms are then inserted in batches, so that an entity producing a very large
 * number of terms does not end up in a single oversized insert statement.
 */
public function indexEntity(Entity $entity)
{
    $this->deleteEntityTerms($entity);
    $terms = $this->entityToTermDataArray($entity);

    // Same batch size as indexEntities(); nothing is inserted when there are no terms.
    foreach (array_chunk($terms, 500) as $termChunk) {
        SearchTerm::query()->insert($termChunk);
    }
}
/**
 * Index multiple Entities at once.
 *
 * Term data for every given entity is gathered into one list, which is then
 * inserted in chunks of 500 rows. This method does not delete terms already
 * stored for the entities.
 *
 * @param Entity[] $entities
 */
public function indexEntities(array $entities)
{
    $terms = [];
    foreach ($entities as $entity) {
        $entityTerms = $this->entityToTermDataArray($entity);
        array_push($terms, ...$entityTerms);
    }

    $chunkedTerms = array_chunk($terms, 500);
    foreach ($chunkedTerms as $termChunk) {
        SearchTerm::query()->insert($termChunk);
    }
}
/**
 * Delete and re-index the terms for all entities in the system.
 * Can take a callback which is used for reporting progress.
 * Callback receives three arguments:
 * - An instance of the model being processed.
 * - The number that have been processed so far.
 * - The total number of that model to be processed.
 *
 * @param callable(Entity, int, int):void|null $progressCallback
 */
// Rebuilds the whole search index: all stored terms are removed, then each entity
// model returned by the entity provider has its records re-indexed in chunks.
72 public function indexAllEntities(?callable $progressCallback = null)
// Clear every existing search term before re-indexing.
74 SearchTerm::query()->truncate();
76 foreach ($this->entityProvider->all() as $entityModel) {
// Pages are indexed from their 'html' field; every other entity type from 'description'.
77 $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
// Only the columns read during indexing are selected.
78 $selectFields = ['id', 'name', $indexContentField];
// NOTE(review): this total is counted with withTrashed(), while the chunked query below
// does not call withTrashed() — confirm whether trashed records are meant to be indexed/reported.
79 $total = $entityModel->newQuery()->withTrashed()->count();
// NOTE(review): $chunkSize and $processed are captured by the closure below, but their
// initialisation is not visible in this excerpt — verify against the full file.
// $processed is captured by reference so the running count persists across chunk calls.
83 $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
84 $this->indexEntities($entities->all());
// Advance by a full chunk, capped at the total.
85 $processed = min($processed + $chunkSize, $total);
// Report progress only when a callback was supplied.
87 if (is_callable($progressCallback)) {
88 $progressCallback($entityModel, $processed, $total);
// Load the records chunk-by-chunk with a limited set of tag columns eager-loaded,
// passing each chunk to the callback above.
92 $entityModel->newQuery()
93 ->select($selectFields)
94 ->with(['tags:id,name,value,entity_id,entity_type'])
95 ->chunk($chunkSize, $chunkCallback);
/**
 * Delete related Entity search terms.
 *
 * Removes everything reachable through the entity's searchTerms() relation.
 */
public function deleteEntityTerms(Entity $entity)
{
    $entity->searchTerms()->delete();
}
/**
 * Create a scored term array from the given text, where the keys are the terms
 * and the values are their scores.
 *
 * Each term's score is its occurrence count in the text multiplied by $scoreAdjustment.
 *
 * NOTE(review): some callers pass a value derived from the entity's searchFactor;
 * confirm that value is always integral, since this parameter is typed int.
 *
 * @return array<string, int>
 */
protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
{
    $termMap = $this->textToTermCountMap($text);

    // Scale every occurrence count in place.
    foreach ($termMap as $term => $count) {
        $termMap[$term] = $count * $scoreAdjustment;
    }

    return $termMap;
}
/**
 * Create a scored term array from the given HTML, where the keys are the terms
 * and the values are their scores.
 *
 * @return array<string, int>
 */
// Builds a term => score map from HTML content, weighting the terms found in each
// top-level element by a multiplier looked up from that element's node name.
130 protected function generateTermScoreMapFromHtml(string $html): array
// Score multipliers keyed by node name; names not present fall back to 1 in the lookup below.
// NOTE(review): the map's entries and the initialisation of $scoresByTerm are not visible in this excerpt.
137 $elementScoreAdjustmentMap = [
// Wrap the content in a <body> element before parsing.
146 $html = '<body>' . $html . '</body>';
// Have libxml collect parse errors internally.
// NOTE(review): the previous libxml error setting is not restored within the visible code.
147 libxml_use_internal_errors(true);
148 $doc = new DOMDocument();
// NOTE(review): the 'HTML-ENTITIES' target encoding of mb_convert_encoding() is deprecated
// as of PHP 8.2 — confirm against the supported PHP versions.
149 $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
// Children of the document element's first child — presumably the <body> wrapper added above.
151 $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
152 /** @var DOMNode $child */
153 foreach ($topElems as $child) {
154 $nodeName = $child->nodeName;
// Count terms over the full text content of this top-level node.
155 $termCounts = $this->textToTermCountMap(trim($child->textContent));
156 foreach ($termCounts as $term => $count) {
// Weight the count by the element multiplier, then add it to the term's running score.
157 $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
158 $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
162 return $scoresByTerm;
/**
 * Create a scored term map from the given set of entity tags.
 *
 * Terms found in tag names are weighted by 3 and terms found in tag values by 5;
 * the two maps are then merged with scores for shared terms added together.
 *
 * @param Tag[] $tags
 *
 * @return array<string, int>
 */
protected function generateTermScoreMapFromTags(array $tags): array
{
    $names = [];
    $values = [];

    foreach ($tags as $tag) {
        $names[] = $tag->name;
        $values[] = $tag->value;
    }

    // Names and values are each joined with spaces so they tokenise as separate terms.
    $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3);
    $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5);

    return $this->mergeTermScoreMaps($nameMap, $valueMap);
}
/**
 * For the given text, return an array where the keys are the unique term words
 * and the values are the frequency of that term.
 *
 * The text is split on the characters in static::$delimiters. Terms are kept
 * exactly as they appear in the text; no case folding is applied here.
 *
 * @return array<string, int>
 */
protected function textToTermCountMap(string $text): array
{
    $tokenMap = []; // {TextToken => OccurrenceCount}
    $splitChars = static::$delimiters;

    // strtok() keeps internal state: the first call takes the text, later calls only the delimiters.
    $token = strtok($text, $splitChars);

    while ($token !== false) {
        $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
        $token = strtok($splitChars);
    }

    return $tokenMap;
}
/**
 * For the given entity, Generate an array of term data details.
 * Is the raw term data, not instances of SearchTerm models.
 *
 * Name terms are weighted by 40 times the entity's searchFactor. Body terms come
 * from the page HTML for pages, or from the description for any other entity type.
 * Tag terms are merged in as well.
 *
 * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
 */
protected function entityToTermDataArray(Entity $entity): array
{
    $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
    $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all());

    if ($entity instanceof Page) {
        // Guard against a null body in the same way as the description below.
        $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html ?? '');
    } else {
        $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description ?? '', $entity->searchFactor);
    }

    $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap);

    $dataArray = [];
    $entityId = $entity->id;
    $entityType = $entity->getMorphClass();
    foreach ($mergedScoreMap as $term => $score) {
        $dataArray[] = [
            // PHP turns numeric-string array keys into ints; cast back to the documented string type.
            'term'        => (string) $term,
            'score'       => $score,
            'entity_type' => $entityType,
            'entity_id'   => $entityId,
        ];
    }

    return $dataArray;
}
/**
 * For the given term data arrays, Merge their contents by term
 * while combining any scores.
 *
 * A term present in several maps ends up with the sum of its scores.
 *
 * @param array<string, int>[] ...$scoreMaps
 *
 * @return array<string, int>
 */
protected function mergeTermScoreMaps(...$scoreMaps): array
{
    $mergedMap = [];

    foreach ($scoreMaps as $scoreMap) {
        foreach ($scoreMap as $term => $score) {
            $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
        }
    }

    return $mergedMap;
}