3 namespace BookStack\Search;
5 use BookStack\Activity\Models\Tag;
6 use BookStack\Entities\EntityProvider;
7 use BookStack\Entities\Models\Entity;
8 use BookStack\Entities\Models\Page;
9 use BookStack\Util\HtmlDocument;
11 use Illuminate\Database\Eloquent\Builder;
12 use Illuminate\Support\Collection;
// Review note: this string is passed unchanged to strtok() as its delimiter list
// (see textToTermCountMap). strtok() works on bytes, so the multi-byte characters
// at the end of the list contribute each of their bytes as a separate delimiter.
17 * A list of delimiter characters used to break-up parsed content into terms for indexing.
19 public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"«»";
/**
 * @param EntityProvider $entityProvider Supplies the entity models that are iterated when re-indexing everything.
 */
public function __construct(
    protected EntityProvider $entityProvider,
) {
}
/**
 * Rebuild the search terms for a single entity.
 * Any terms already stored for the entity are removed before the newly
 * generated terms are inserted.
 */
public function indexEntity(Entity $entity): void
{
    $this->deleteEntityTerms($entity);
    $this->insertTerms($this->entityToTermDataArray($entity));
}
/**
 * Index a batch of entities in one pass.
 * Term data for every given entity is gathered first and then inserted together.
 * Existing terms are not removed by this method.
 *
 * @param Entity[] $entities
 */
public function indexEntities(array $entities): void
{
    $collectedTerms = [];

    foreach ($entities as $entity) {
        foreach ($this->entityToTermDataArray($entity) as $termData) {
            $collectedTerms[] = $termData;
        }
    }

    $this->insertTerms($collectedTerms);
}
// Rebuilds the whole search index: truncates the SearchTerm table, then walks each
// entity model returned by the EntityProvider and re-indexes its records in chunks.
// NOTE(review): some short lines of this method are not visible in this extract
// (for example the assignments that initialise $chunkSize and $processed) —
// check them against the full file before changing this method.
53 * Delete and re-index the terms for all entities in the system.
54 * Can take a callback which is used for reporting progress.
55 * Callback receives three arguments:
56 * - An instance of the model being processed
57 * - The number that have been processed so far.
58 * - The total number of that model to be processed.
60 * @param callable(Entity, int, int):void|null $progressCallback
62 public function indexAllEntities(?callable $progressCallback = null): void
64 SearchTerm::query()->truncate();
66 foreach ($this->entityProvider->all() as $entityModel) {
// Pages are indexed from their 'html' column; every other entity type from 'description'.
67 $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
68 $selectFields = ['id', 'name', $indexContentField];
69 /** @var Builder<Entity> $query */
70 $query = $entityModel->newQuery();
// NOTE(review): the total is counted with withTrashed(), while the chunked query
// further down is built without it — presumably soft-deleted records are counted
// here but never indexed, so $processed may not reach $total. Confirm the intent.
71 $total = $query->withTrashed()->count();
75 $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
76 $this->indexEntities($entities->all());
// Progress advances by a whole chunk at a time and is capped at $total,
// rather than being counted per entity.
77 $processed = min($processed + $chunkSize, $total);
79 if (is_callable($progressCallback)) {
80 $progressCallback($entityModel, $processed, $total);
// Only the columns needed for indexing are selected, and tags are eager-loaded
// with a reduced column list.
84 $entityModel->newQuery()
85 ->select($selectFields)
86 ->with(['tags:id,name,value,entity_id,entity_type'])
87 ->chunk($chunkSize, $chunkCallback);
/**
 * Remove every search term linked to the given entity.
 */
public function deleteEntityTerms(Entity $entity): void
{
    $termRelation = $entity->searchTerms();
    $termRelation->delete();
}
/**
 * Write the given term rows to the database.
 * Rows are inserted in batches of 500 so that a single insert stays within database limits.
 *
 * @param array[] $terms
 */
protected function insertTerms(array $terms): void
{
    foreach (array_chunk($terms, 500) as $batch) {
        SearchTerm::query()->insert($batch);
    }
}
/**
 * Build a map of term => score from plain text.
 * A term's score is its occurrence count multiplied by the given adjustment,
 * rounded down to a whole number.
 *
 * @return array<string, float> Whole-number scores; floor() returns them as floats.
 */
protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
    $counts = $this->textToTermCountMap($text);

    // array_map() keeps the term keys because only a single array is passed.
    return array_map(
        static fn ($count) => floor($count * $scoreAdjustment),
        $counts,
    );
}
// Builds a term => score map from HTML. For each element yielded by getBodyChildren(),
// every term found in the element's text adds (count * adjustment for that element's
// node name, defaulting to 1) to the running score for the term.
// NOTE(review): the entries of $elementScoreAdjustmentMap and the initialisation of
// $scoresByTerm are not visible in this extract — confirm them against the full file.
130 * Create a scored term array from the given HTML, where the keys are the terms
131 * and the values are their scores.
133 * @returns array<string, int>
135 protected function generateTermScoreMapFromHtml(string $html): array
142 $elementScoreAdjustmentMap = [
// <br> variants are turned into newlines (a term delimiter) so that text on either
// side of a line break is not joined into a single term.
151 $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
152 $doc = new HtmlDocument($html);
154 /** @var DOMNode $child */
// Presumably only the direct children of the body are visited here (verify in
// HtmlDocument); text of nested elements is still included via textContent.
155 foreach ($doc->getBodyChildren() as $child) {
156 $nodeName = $child->nodeName;
157 $termCounts = $this->textToTermCountMap(trim($child->textContent));
158 foreach ($termCounts as $term => $count) {
159 $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
160 $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
164 return $scoresByTerm;
/**
 * Build a map of term => score from a set of entity tags.
 * Terms found in tag names are weighted by 3 and terms found in tag values by 5;
 * the two maps are then merged with scores for shared terms added together.
 *
 * @param Tag[] $tags
 *
 * @return array<string, int>
 */
protected function generateTermScoreMapFromTags(array $tags): array
{
    $tagNames = [];
    $tagValues = [];

    foreach ($tags as $tag) {
        $tagNames[] = $tag->name;
        $tagValues[] = $tag->value;
    }

    return $this->mergeTermScoreMaps(
        $this->generateTermScoreMapFromText(implode(' ', $tagNames), 3),
        $this->generateTermScoreMapFromText(implode(' ', $tagValues), 5),
    );
}
/**
 * For the given text, return a map where the keys are the unique terms found
 * and the values are the number of times each term occurs.
 *
 * The text is split on any character in static::$delimiters. Splitting is done
 * per UTF-8 character rather than per byte: strtok() treats every byte of its
 * delimiter list as a delimiter, so multi-byte delimiters such as « and » would
 * otherwise cut through unrelated multi-byte characters that share one of their
 * bytes (for example "ë") and leave invalid partial sequences behind as terms.
 *
 * @return array<string, int>
 */
protected function textToTermCountMap(string $text): array
{
    $splitChars = static::$delimiters;
    $tokens = false;

    if ($splitChars !== '') {
        // Character class of all delimiters; runs of delimiters count as one split
        // and empty pieces are dropped, matching how strtok() skips them.
        $pattern = '/[' . preg_quote($splitChars, '/') . ']+/u';
        $tokens = preg_split($pattern, $text, -1, PREG_SPLIT_NO_EMPTY);
    }

    if ($tokens === false) {
        // preg_split() fails on text that is not valid UTF-8 (and a character class
        // cannot be built from an empty delimiter list); keep the previous byte-wise
        // strtok() behaviour for those inputs instead of losing their terms.
        $tokens = [];
        $token = strtok($text, $splitChars);
        while ($token !== false) {
            $tokens[] = $token;
            $token = strtok($splitChars);
        }
    }

    // {TextToken => OccurrenceCount}
    return array_count_values($tokens);
}
/**
 * Produce the raw search term rows for the given entity.
 * The rows are plain arrays ready for insertion, not SearchTerm model instances.
 *
 * Name terms are weighted at 40 times the entity's search factor. Body terms come
 * from the HTML for pages, and from the description (weighted by the search factor)
 * for every other entity type. Tag terms are scored separately and all three maps
 * are merged into one score per term.
 *
 * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
 */
protected function entityToTermDataArray(Entity $entity): array
{
    $nameScores = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
    $tagScores = $this->generateTermScoreMapFromTags($entity->tags->all());

    $bodyScores = $entity instanceof Page
        ? $this->generateTermScoreMapFromHtml($entity->html)
        : $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);

    $combinedScores = $this->mergeTermScoreMaps($nameScores, $bodyScores, $tagScores);

    $type = $entity->getMorphClass();
    $id = $entity->id;

    $rows = [];
    foreach ($combinedScores as $term => $score) {
        $rows[] = [
            'term'        => $term,
            'score'       => $score,
            'entity_type' => $type,
            'entity_id'   => $id,
        ];
    }

    return $rows;
}
// Adds up scores per term across every score map passed in; a term not yet present
// in the running result starts from a score of 0.
// NOTE(review): this method runs to the end of the visible extract — the
// initialisation of $mergedMap and the return statement are not shown here.
248 * For the given term data arrays, Merge their contents by term
249 * while combining any scores.
251 * @param array<string, int>[] ...$scoreMaps
253 * @returns array<string, int>
255 protected function mergeTermScoreMaps(...$scoreMaps): array
259 foreach ($scoreMaps as $scoreMap) {
260 foreach ($scoreMap as $term => $score) {
261 $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;