3 namespace BookStack\Search;
5 use BookStack\Activity\Models\Tag;
6 use BookStack\Entities\EntityProvider;
7 use BookStack\Entities\Models\Entity;
8 use BookStack\Entities\Models\Page;
9 use BookStack\Util\HtmlDocument;
11 use Illuminate\Database\Eloquent\Builder;
12 use Illuminate\Support\Collection;
// The delimiter set covers whitespace (space, newline, tab), common punctuation,
// bracket pairs, the backtick, single/double quotes and guillemets.
17 * A list of delimiter characters used to break-up parsed content into terms for indexing.
19 public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
22 * A list of delimiters which are commonly used within a single term but can also indicate a break between terms.
23 * The indexer will index the full term containing these delimiters, plus the terms produced by splitting on them.
// Both soft delimiters ('.' and '-') are also present in $delimiters.
25 public static string $softDelimiters = ".-";
/**
 * Create a new search index instance.
 * The entity provider is stored as a protected property via constructor promotion.
 */
public function __construct(protected EntityProvider $entityProvider)
{
}
/**
 * Rebuild the search terms for a single entity.
 * Any terms already stored for the entity are removed before the fresh ones are inserted.
 */
public function indexEntity(Entity $entity): void
{
    $this->deleteEntityTerms($entity);

    $this->insertTerms(
        $this->entityToTermDataArray($entity)
    );
}
/**
 * Index a batch of entities in one go.
 * Term data is gathered for every entity first, then handed to insertTerms() in a single call.
 * Unlike indexEntity(), this does not delete existing terms beforehand.
 *
 * @param Entity[] $entities
 */
public function indexEntities(array $entities): void
{
    $collectedTerms = [];

    foreach ($entities as $entity) {
        foreach ($this->entityToTermDataArray($entity) as $termData) {
            $collectedTerms[] = $termData;
        }
    }

    $this->insertTerms($collectedTerms);
}
59 * Delete and re-index the terms for all entities in the system.
60 * Can take a callback which is used for reporting progress.
61 * Callback receives three arguments:
62 * - An instance of the model being processed
63 * - The number that have been processed so far.
64 * - The total number of that model to be processed.
66 * @param callable(Entity, int, int):void|null $progressCallback
68 public function indexAllEntities(?callable $progressCallback = null): void
// Start from an empty index: all existing search terms are removed up-front.
70 SearchTerm::query()->truncate();
// Each entity model type supplied by the provider is processed in turn.
72 foreach ($this->entityProvider->all() as $entityModel) {
// Pages are indexed from their 'html' column; every other type from 'description'.
73 $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
74 $selectFields = ['id', 'name', $indexContentField];
75 /** @var Builder<Entity> $query */
76 $query = $entityModel->newQuery();
// NOTE(review): this total includes trashed rows, while the chunked query further down is built
// without withTrashed(). If that query excludes trashed rows by default, the reported total can be
// higher than what is actually processed - confirm which of the two is intended.
77 $total = $query->withTrashed()->count();
// NOTE(review): $processed and $chunkSize are captured below, but their initialisation is not
// visible in this excerpt.
81 $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
82 $this->indexEntities($entities->all());
// Progress advances by a whole chunk size each time, capped at $total, rather than by the
// number of entities actually present in the chunk.
83 $processed = min($processed + $chunkSize, $total);
// The callback is optional; it is only invoked when one was supplied.
85 if (is_callable($progressCallback)) {
86 $progressCallback($entityModel, $processed, $total);
// Only the columns needed for indexing are selected, with tags eager-loaded using a reduced
// column list, and the results are handed to the callback chunk by chunk.
90 $entityModel->newQuery()
91 ->select($selectFields)
92 ->with(['tags:id,name,value,entity_id,entity_type'])
93 ->chunk($chunkSize, $chunkCallback);
/**
 * Remove every search term linked to the given entity.
 */
public function deleteEntityTerms(Entity $entity): void
{
    $termsRelation = $entity->searchTerms();
    $termsRelation->delete();
}
/**
 * Store the given raw term rows in the database.
 * Rows are written in batches of 500 to remain within database limits.
 *
 * @param array[] $terms
 */
protected function insertTerms(array $terms): void
{
    foreach (array_chunk($terms, 500) as $batch) {
        SearchTerm::query()->insert($batch);
    }
}
/**
 * Build a term => score map from plain text.
 * Each term's occurrence count is multiplied by the score adjustment and rounded down.
 *
 * @param string $text            Text to split into terms.
 * @param float  $scoreAdjustment Multiplier applied to every term's occurrence count.
 *
 * @return array<string, int>
 */
protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
    $scoreMap = [];

    foreach ($this->textToTermCountMap($text) as $term => $count) {
        // floor() returns a float; cast so the scores match the documented int type.
        $scoreMap[$term] = (int) floor($count * $scoreAdjustment);
    }

    return $scoreMap;
}
136 * Create a scored term array from the given HTML, where the keys are the terms
137 * and the values are their scores.
139 * @return array<string, int>
141 protected function generateTermScoreMapFromHtml(string $html): array
// Per-element score multipliers, looked up by node name further down.
// NOTE(review): the map entries and the initialisation of $scoresByTerm are not visible in this excerpt.
148 $elementScoreAdjustmentMap = [
// Line-break tags (matched case-insensitively) become newlines, which are one of the indexing
// delimiters - presumably so that text either side of a <br> is not joined into one term.
157 $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
158 $doc = new HtmlDocument($html);
160 /** @var DOMNode $child */
// Each node returned by getBodyChildren() is scored as a whole, using its own node name
// and its full text content.
161 foreach ($doc->getBodyChildren() as $child) {
162 $nodeName = $child->nodeName;
163 $text = trim($child->textContent);
// Non-breaking spaces are not in the delimiter list, so they are swapped for regular spaces.
164 $text = str_replace("\u{00A0}", ' ', $text);
165 $termCounts = $this->textToTermCountMap($text);
166 foreach ($termCounts as $term => $count) {
// Node names without an entry in the adjustment map use a multiplier of 1.
167 $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
// Scores accumulate across nodes when the same term appears in more than one.
168 $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
172 return $scoresByTerm;
/**
 * Build a term => score map from a set of entity tags.
 * Tag names are scored with an adjustment of 3 and tag values with an adjustment of 5,
 * then both maps are merged.
 *
 * @param Tag[] $tags
 *
 * @return array<string, int|float>
 */
protected function generateTermScoreMapFromTags(array $tags): array
{
    $nameText = implode(' ', array_map(fn ($tag) => $tag->name, $tags));
    $valueText = implode(' ', array_map(fn ($tag) => $tag->value, $tags));

    return $this->mergeTermScoreMaps(
        $this->generateTermScoreMapFromText($nameText, 3),
        $this->generateTermScoreMapFromText($valueText, 5)
    );
}
199 * For the given text, return an array where the keys are the unique term words
200 * and the values are the frequency of that term.
202 * @return array<string, int>
204 protected function textToTermCountMap(string $text): array
206 $tokenMap = []; // {TextToken => OccurrenceCount}
207 $softDelims = static::$softDelimiters;
// The text is tokenised using the full delimiter list.
208 $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
// NOTE(review): $extendedToken and $extendedLen are used below, but their initialisation and
// some of their updates are not visible in this excerpt.
212 $token = $tokenizer->next();
// The loop relies on next() returning false once there are no more tokens.
214 while ($token !== false) {
// Presumably the delimiter that was found just before the current token - confirm against the tokenizer.
215 $delim = $tokenizer->previousDelimiter();
// A non-empty token that follows a soft delimiter extends the combined term being built,
// keeping the delimiter in place (so "a" then "-" + "b" becomes "a-b").
217 if ($delim && str_contains($softDelims, $delim) && $token !== '') {
218 $extendedToken .= $delim . $token;
// Otherwise the combined term built so far is counted, but only when $extendedLen is above 1,
// and a new combined term starts from the current token.
221 if ($extendedLen > 1) {
222 $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
224 $extendedToken = $token;
// The individual token is counted in its own right as well.
229 $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
232 $token = $tokenizer->next();
// Count any combined term still pending once the tokens have run out.
235 if ($extendedLen > 1) {
236 $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
/**
 * Produce the raw search term rows for an entity.
 * Rows are plain arrays of term data, not SearchTerm model instances.
 * Scores are combined from the entity name, its body (page HTML, or the description
 * attribute for other entity types) and its tags.
 *
 * @return array{term: string, score: int|float, entity_id: int, entity_type: string}[]
 */
protected function entityToTermDataArray(Entity $entity): array
{
    $nameScores = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
    $tagScores = $this->generateTermScoreMapFromTags($entity->tags->all());
    $bodyScores = ($entity instanceof Page)
        ? $this->generateTermScoreMapFromHtml($entity->html)
        : $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);

    $combinedScores = $this->mergeTermScoreMaps($nameScores, $bodyScores, $tagScores);

    $id = $entity->id;
    $morphClass = $entity->getMorphClass();

    $rows = [];
    foreach ($combinedScores as $term => $score) {
        $rows[] = [
            'term'        => $term,
            'score'       => $score,
            'entity_type' => $morphClass,
            'entity_id'   => $id,
        ];
    }

    return $rows;
}
277 * For the given term data arrays, Merge their contents by term
278 * while combining any scores.
280 * @param array<string, int>[] ...$scoreMaps
282 * @return array<string, int>
284 protected function mergeTermScoreMaps(...$scoreMaps): array
// NOTE(review): the initialisation of $mergedMap and the return statement are not visible in this excerpt.
// Every map passed in is walked in argument order.
288 foreach ($scoreMaps as $scoreMap) {
289 foreach ($scoreMap as $term => $score) {
// Scores for a term found in more than one map are summed; a term seen for the first time starts from 0.
290 $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;