3 namespace BookStack\Search;
5 use BookStack\Activity\Models\Tag;
6 use BookStack\Entities\EntityProvider;
7 use BookStack\Entities\Models\Entity;
8 use BookStack\Entities\Models\Page;
9 use BookStack\Search\Vectors\StoreEntityVectorsJob;
10 use BookStack\Search\Vectors\VectorQueryServiceProvider;
11 use BookStack\Util\HtmlDocument;
13 use Illuminate\Database\Eloquent\Builder;
14 use Illuminate\Support\Collection;
19 * A list of delimiter characters used to break-up parsed content into terms for indexing.
21 public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
24 * A list of delimiters which are commonly used within a single term but can also indicate a break between terms.
25 * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
27 public static string $softDelimiters = ".-";
// Constructor: receives the EntityProvider as a promoted, protected property.
// It is read as $this->entityProvider by indexAllEntities() to enumerate entity model types.
29 public function __construct(
30 protected EntityProvider $entityProvider,
/**
 * Rebuild the search index data for a single entity.
 *
 * Any existing terms for the entity are removed before the freshly
 * generated terms are inserted. When vector querying is enabled, a job
 * is also dispatched to store vectors for the entity.
 */
public function indexEntity(Entity $entity): void
{
    $this->deleteEntityTerms($entity);
    $this->insertTerms($this->entityToTermDataArray($entity));

    if (!VectorQueryServiceProvider::isEnabled()) {
        return;
    }

    dispatch(new StoreEntityVectorsJob($entity));
}
/**
 * Index a batch of entities, writing all of their terms in one combined insert.
 *
 * This method does not delete any existing terms itself.
 * When vector querying is enabled, a vector storage job is dispatched
 * for each entity in the batch.
 *
 * @param Entity[] $entities
 */
public function indexEntities(array $entities): void
{
    $termSets = [];
    $storeVectors = VectorQueryServiceProvider::isEnabled();

    foreach ($entities as $item) {
        $termSets[] = $this->entityToTermDataArray($item);

        if ($storeVectors) {
            dispatch(new StoreEntityVectorsJob($item));
        }
    }

    // Flatten the per-entity term lists into a single list for insertion.
    $this->insertTerms(array_merge([], ...$termSets));
}
71 * Delete and re-index the terms for all entities in the system.
72 * Can take a callback which is used for reporting progress.
73 * Callback receives three arguments:
74 * - An instance of the model being processed
75 * - The number that have been processed so far.
76 * - The total number of that model to be processed.
78 * @param callable(Entity, int, int):void|null $progressCallback
80 public function indexAllEntities(?callable $progressCallback = null): void
// Remove every existing search term up-front; the loop below rebuilds them.
82 SearchTerm::query()->truncate();
// Process one entity model type at a time, as supplied by the entity provider.
84 foreach ($this->entityProvider->all() as $entityModel) {
// Pages are indexed from their 'html' column; every other type from 'description'.
85 $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
// Restrict the chunked select below to these columns.
86 $selectFields = ['id', 'name', $indexContentField];
87 /** @var Builder<Entity> $query */
88 $query = $entityModel->newQuery();
// NOTE(review): this total is counted with withTrashed(), while the chunked query further down
// does not call withTrashed(). If that query excludes trashed rows, the total reported to the
// progress callback may be higher than the number of rows actually indexed — confirm.
89 $total = $query->withTrashed()->count();
// NOTE(review): $chunkSize and $processed are used below, but their initial values are not
// visible in this extract.
// Per-chunk handler: index the chunk, then advance and report progress.
93 $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
94 $this->indexEntities($entities->all());
// Progress advances by the configured chunk size (not the chunk's actual count), capped at $total.
95 $processed = min($processed + $chunkSize, $total);
// Only report progress when a callback was supplied.
97 if (is_callable($progressCallback)) {
98 $progressCallback($entityModel, $processed, $total);
// Walk this model type in chunks, eager-loading only the listed tag columns.
102 $entityModel->newQuery()
103 ->select($selectFields)
104 ->with(['tags:id,name,value,entity_id,entity_type'])
105 ->chunk($chunkSize, $chunkCallback);
/**
 * Remove the search terms related to the given entity.
 */
public function deleteEntityTerms(Entity $entity): void
{
    $termsRelation = $entity->searchTerms();
    $termsRelation->delete();
}
/**
 * Write the given term rows to the database.
 * Rows are inserted in batches of 500 to remain within database limits.
 *
 * @param array[] $terms
 */
protected function insertTerms(array $terms): void
{
    foreach (array_chunk($terms, 500) as $batch) {
        SearchTerm::query()->insert($batch);
    }
}
/**
 * Build a term => score map from plain text.
 * Each term's score is its occurrence count multiplied by the given
 * adjustment, rounded down.
 *
 * Scores are whole numbers, but are float-typed since floor() returns a float.
 *
 * @return array<string, float>
 */
protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
    // array_map() keeps the term keys intact when given a single array.
    return array_map(
        fn ($occurrences) => floor($occurrences * $scoreAdjustment),
        $this->textToTermCountMap($text),
    );
}
148 * Create a scored term array from the given HTML, where the keys are the terms
149 * and the values are their scores.
151 * @return array<string, int>
153 protected function generateTermScoreMapFromHtml(string $html): array
// Lookup of node name => score multiplier; node names without an entry fall back to 1 below.
// NOTE(review): the entries of this map are not visible in this extract.
160 $elementScoreAdjustmentMap = [
// Swap <br> variants (matched case-insensitively) for newlines before parsing — presumably so
// that text either side of a break is still split into separate terms; confirm.
169 $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
170 $doc = new HtmlDocument($html);
172 /** @var DOMNode $child */
// Score each node returned by getBodyChildren() separately, based on its own node name.
173 foreach ($doc->getBodyChildren() as $child) {
174 $nodeName = $child->nodeName;
// Count the terms found in the node's trimmed text content.
175 $termCounts = $this->textToTermCountMap(trim($child->textContent));
176 foreach ($termCounts as $term => $count) {
// Weight the count by the node's multiplier, defaulting to 1 for unlisted node names.
177 $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
// Accumulate, so a term found in several nodes sums its scores.
178 $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
182 return $scoresByTerm;
/**
 * Build a term => score map from a set of entity tags.
 * Terms found in tag names are weighted by 3, and terms found in
 * tag values are weighted by 5, before both maps are merged.
 *
 * @param Tag[] $tags
 *
 * @return array<string, float>
 */
protected function generateTermScoreMapFromTags(array $tags): array
{
    $nameText = implode(' ', array_map(fn ($tag) => $tag->name, $tags));
    $valueText = implode(' ', array_map(fn ($tag) => $tag->value, $tags));

    return $this->mergeTermScoreMaps(
        $this->generateTermScoreMapFromText($nameText, 3),
        $this->generateTermScoreMapFromText($valueText, 5),
    );
}
209 * For the given text, return an array where the keys are the unique term words
210 * and the values are the frequency of that term.
212 * @return array<string, int>
214 protected function textToTermCountMap(string $text): array
216 $tokenMap = []; // {TextToken => OccurrenceCount}
217 $softDelims = static::$softDelimiters;
// Tokenise using the full delimiter set; soft delimiters get extra handling in the loop below.
218 $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
// NOTE(review): $extendedToken and $extendedLen are used below, but their initialisation and
// some of their updates are not visible in this extract.
222 $token = $tokenizer->next();
// The loop treats a false return from next() as the end of the tokens.
224 while ($token !== false) {
// Delimiter reported by the tokenizer as "previous" — presumably the one preceding this token;
// confirm against SearchTextTokenizer.
225 $delim = $tokenizer->previousDelimiter();
// A non-empty token that follows a soft delimiter extends the pending compound term,
// keeping the delimiter in place (e.g. "a" then "b" after "-" builds "a-b").
227 if ($delim && str_contains($softDelims, $delim) && $token !== '') {
228 $extendedToken .= $delim . $token;
// Count the pending compound term only when $extendedLen exceeds 1
// (presumably on the branch where the current token does not extend it — confirm).
231 if ($extendedLen > 1) {
232 $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
// Start a new candidate compound term from the current token.
234 $extendedToken = $token;
// Count the individual token itself.
239 $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
242 $token = $tokenizer->next();
// Flush a compound term that is still pending once the tokens run out.
245 if ($extendedLen > 1) {
246 $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
/**
 * Produce the raw term rows for the given entity.
 * These are plain arrays of term data, not SearchTerm model instances.
 *
 * Name terms are weighted by 40 times the entity's search factor.
 * Page bodies are scored from their HTML, while other entity types are
 * scored from their description text using the entity's search factor.
 * Tag terms are scored separately and merged in.
 *
 * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
 */
protected function entityToTermDataArray(Entity $entity): array
{
    $nameScores = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
    $tagScores = $this->generateTermScoreMapFromTags($entity->tags->all());
    $bodyScores = ($entity instanceof Page)
        ? $this->generateTermScoreMapFromHtml($entity->html)
        : $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);

    // Merge order (name, body, tags) is kept as it decides the order of the resulting rows.
    $combinedScores = $this->mergeTermScoreMaps($nameScores, $bodyScores, $tagScores);
    $id = $entity->id;
    $type = $entity->getMorphClass();

    $rows = [];
    foreach ($combinedScores as $term => $score) {
        $rows[] = [
            'term'        => $term,
            'score'       => $score,
            'entity_type' => $type,
            'entity_id'   => $id,
        ];
    }

    return $rows;
}
/**
 * Combine any number of term => score maps into a single map.
 * Where a term appears in more than one map, its scores are summed.
 *
 * @param array<string, int|float> ...$scoreMaps
 *
 * @return array<string, int|float>
 */
protected function mergeTermScoreMaps(...$scoreMaps): array
{
    $combined = [];

    foreach ($scoreMaps as $map) {
        foreach ($map as $word => $points) {
            // Seed unseen terms at zero so every score is added the same way.
            $combined[$word] ??= 0;
            $combined[$word] += $points;
        }
    }

    return $combined;
}