BookStack Code Mirror - bookstack/blob - app/Entities/Tools/SearchIndex.php
Added page content parsing to up-rank header text in search
[bookstack] / app / Entities / Tools / SearchIndex.php
1 <?php
2
3 namespace BookStack\Entities\Tools;
4
5 use BookStack\Entities\EntityProvider;
6 use BookStack\Entities\Models\Entity;
7 use BookStack\Entities\Models\Page;
8 use BookStack\Entities\Models\SearchTerm;
9 use DOMDocument;
10 use DOMNode;
11 use Illuminate\Support\Collection;
12
class SearchIndex
{
    /**
     * Provider used to obtain one model instance per entity type
     * when re-indexing everything.
     *
     * @var EntityProvider
     */
    protected $entityProvider;

    public function __construct(EntityProvider $entityProvider)
    {
        $this->entityProvider = $entityProvider;
    }

    /**
     * Index the given entity.
     * Existing terms for the entity are removed first, then the freshly
     * generated terms are stored.
     */
    public function indexEntity(Entity $entity)
    {
        $this->deleteEntityTerms($entity);
        $this->insertTerms($this->entityToTermDataArray($entity));
    }

    /**
     * Index multiple Entities at once.
     * Existing terms are NOT removed by this method; it only adds
     * the generated terms for each given entity.
     *
     * @param Entity[] $entities
     */
    public function indexEntities(array $entities)
    {
        $terms = [];
        foreach ($entities as $entity) {
            // Append row-by-row; an entity may legitimately produce no terms.
            foreach ($this->entityToTermDataArray($entity) as $termData) {
                $terms[] = $termData;
            }
        }

        $this->insertTerms($terms);
    }

    /**
     * Delete and re-index the terms for all entities in the system.
     * Can take a callback which is used for reporting progress.
     * Callback receives three arguments:
     * - An instance of the model being processed
     * - The number that have been processed so far.
     * - The total number of that model to be processed.
     *
     * @param callable(Entity, int, int)|null $progressCallback
     */
    public function indexAllEntities(?callable $progressCallback = null)
    {
        SearchTerm::query()->truncate();

        foreach ($this->entityProvider->all() as $entityModel) {
            // Pages are indexed from their HTML, everything else from its description.
            $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
            $selectFields = ['id', 'name', $indexContentField];
            // NOTE(review): the total is counted with trashed records included while
            // the chunked query below uses the model's default query; confirm whether
            // trashed entities are meant to be indexed / counted.
            $total = $entityModel->newQuery()->withTrashed()->count();
            $chunkSize = 250;
            $processed = 0;

            $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
                $this->indexEntities($entities->all());
                // The last chunk may be smaller than $chunkSize, so cap at the total.
                $processed = min($processed + $chunkSize, $total);

                if (is_callable($progressCallback)) {
                    $progressCallback($entityModel, $processed, $total);
                }
            };

            $entityModel->newQuery()
                ->select($selectFields)
                ->chunk($chunkSize, $chunkCallback);
        }
    }

    /**
     * Delete related Entity search terms.
     */
    public function deleteEntityTerms(Entity $entity)
    {
        $entity->searchTerms()->delete();
    }

    /**
     * Store the given raw term data rows, split over multiple insert
     * queries so a single entity with many unique terms cannot produce
     * one oversized query.
     *
     * @param array{term: string, score: int|float, entity_id: int, entity_type: string}[] $terms
     */
    protected function insertTerms(array $terms): void
    {
        foreach (array_chunk($terms, 500) as $termChunk) {
            SearchTerm::query()->insert($termChunk);
        }
    }

    /**
     * Create a scored term array from the given text, where the keys are the terms
     * and the values are their scores (occurrence count multiplied by the
     * given score adjustment).
     *
     * @return array<string, int>
     */
    protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
    {
        $termMap = $this->textToTermCountMap($text);

        foreach ($termMap as $term => $count) {
            $termMap[$term] = $count * $scoreAdjustment;
        }

        return $termMap;
    }

    /**
     * Create a scored term array from the given HTML, where the keys are the terms
     * and the values are their scores.
     * Only the top-level elements of the content are inspected; terms found within
     * a top-level header element are up-ranked by a per-level multiplier.
     *
     * @return array<string, int|float>
     */
    protected function generateTermScoreMapFromHtml(string $html): array
    {
        if ($html === '') {
            return [];
        }

        $elementScoreAdjustmentMap = [
            'h1' => 10,
            'h2' => 5,
            'h3' => 4,
            'h4' => 3,
            'h5' => 2,
            'h6' => 1.5,
        ];

        $html = '<body>' . $html . '</body>';

        // Parse warnings for real-world HTML are expected and not useful here.
        // Collect them internally, then discard them and restore the previous
        // setting so the buffer does not grow during bulk indexing and the
        // process-wide libxml state is left as it was found.
        $previousErrorSetting = libxml_use_internal_errors(true);

        try {
            $doc = new DOMDocument();
            $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorSetting);
        }

        // Expected structure is <html><body>...content...</body></html>.
        $root = $doc->documentElement;
        $body = $root !== null ? $root->childNodes->item(0) : null;
        if ($body === null) {
            return [];
        }

        $scoresByTerm = [];
        /** @var DOMNode $child */
        foreach ($body->childNodes as $child) {
            $multiplier = $elementScoreAdjustmentMap[$child->nodeName] ?? 1;
            $termCounts = $this->textToTermCountMap(trim($child->textContent));

            foreach ($termCounts as $term => $count) {
                $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + ($count * $multiplier);
            }
        }

        return $scoresByTerm;
    }

    /**
     * For the given text, return an array where the keys are the unique term words
     * and the values are the frequency of that term.
     * Terms are case-sensitive and split on whitespace and common punctuation.
     *
     * @return array<string, int>
     */
    protected function textToTermCountMap(string $text): array
    {
        $tokenMap = []; // {TextToken => OccurrenceCount}
        // "\r" is included so CRLF line endings do not leave a trailing
        // carriage return attached to the last word of a line.
        $splitChars = " \r\n\t.,!?:;()[]{}<>`'\"";
        $token = strtok($text, $splitChars);

        while ($token !== false) {
            $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
            $token = strtok($splitChars);
        }

        return $tokenMap;
    }

    /**
     * For the given entity, Generate an array of term data details.
     * Is the raw term data, not instances of SearchTerm models.
     * Name terms are weighted 40x relative to non-page body terms, both scaled
     * by the entity's search factor.
     *
     * @return array{term: string, score: int|float, entity_id: int, entity_type: string}[]
     */
    protected function entityToTermDataArray(Entity $entity): array
    {
        // Missing (null) text attributes are treated as empty text rather than
        // triggering a type error on the string-typed helpers.
        $nameTermsMap = $this->generateTermScoreMapFromText($entity->name ?? '', 40 * $entity->searchFactor);

        if ($entity instanceof Page) {
            $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html ?? '');
        } else {
            $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description ?? '', $entity->searchFactor);
        }

        $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap);

        $dataArray = [];
        $entityId = $entity->id;
        $entityType = $entity->getMorphClass();
        foreach ($mergedScoreMap as $term => $score) {
            $dataArray[] = [
                'term' => $term,
                'score' => $score,
                'entity_type' => $entityType,
                'entity_id' => $entityId,
            ];
        }

        return $dataArray;
    }

    /**
     * For the given term data arrays, Merge their contents by term
     * while combining any scores.
     *
     * @param array<string, int|float>[] ...$scoreMaps
     *
     * @return array<string, int|float>
     */
    protected function mergeTermScoreMaps(...$scoreMaps): array
    {
        $mergedMap = [];

        foreach ($scoreMaps as $scoreMap) {
            foreach ($scoreMap as $term => $score) {
                $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
            }
        }

        return $mergedMap;
    }
}