3 namespace BookStack\Entities\Tools;
5 use BookStack\Entities\EntityProvider;
6 use BookStack\Entities\Models\Entity;
7 use BookStack\Entities\Models\Page;
8 use BookStack\Entities\Models\SearchTerm;
11 use Illuminate\Support\Collection;
// EntityProvider injected through the constructor; indexAllEntities() calls ->all() on it
// to obtain the entity models that get re-indexed.
19 protected $entityProvider;
// Stores the given EntityProvider on the instance for later use by indexAllEntities().
21 public function __construct(EntityProvider $entityProvider)
23 $this->entityProvider = $entityProvider;
27 * Index the given entity.
// Re-indexes a single entity: removes its existing search terms, then inserts a freshly
// generated set.
// NOTE(review): the delete and the insert are separate statements and no transaction is
// visible here - confirm a failed insert cannot leave the entity without any terms.
29 public function indexEntity(Entity $entity)
// Clear the entity's current terms first, since the insert below only adds rows.
31 $this->deleteEntityTerms($entity);
// Raw row arrays (not SearchTerm model instances) built from the entity's name and body.
32 $terms = $this->entityToTermDataArray($entity);
// All rows for this entity go in as one bulk insert; unlike indexEntities(), no chunking is applied.
33 SearchTerm::query()->insert($terms);
37 * Index multiple Entities at once.
39 * @param Entity[] $entities
// Generates term rows for every given entity and bulk-inserts them in batches.
// Unlike indexEntity(), nothing is deleted here; indexAllEntities() truncates the terms
// before calling this, other callers presumably need to clear old terms themselves.
// NOTE(review): $terms is appended to below but its initialisation is not visible in this
// listing - confirm it starts as an empty array.
41 public function indexEntities(array $entities)
44 foreach ($entities as $entity) {
45 $entityTerms = $this->entityToTermDataArray($entity);
// Spread-append this entity's rows onto the combined row list.
46 array_push($terms, ...$entityTerms);
// Split the combined rows into batches of 500 and run one insert per batch.
49 $chunkedTerms = array_chunk($terms, 500);
50 foreach ($chunkedTerms as $termChunk) {
51 SearchTerm::query()->insert($termChunk);
56 * Delete and re-index the terms for all entities in the system.
57 * Can take a callback which is used for reporting progress.
58 * Callback receives three arguments:
59 * - An instance of the model being processed
60 * - The number that have been processed so far.
61 * - The total number of that model to be processed.
63 * @param callable(Entity, int, int)|null $progressCallback
// Rebuilds the whole search index: truncates the search terms, then for each entity model
// from the provider, loads its rows in chunks, indexes each chunk and reports progress
// through the optional callback as (model, processed, total).
65 public function indexAllEntities(?callable $progressCallback = null)
// Remove every existing term up front; indexEntities() below only inserts.
67 SearchTerm::query()->truncate();
69 foreach ($this->entityProvider->all() as $entityModel) {
// Pages are indexed from their 'html' field, every other entity type from 'description'.
70 $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
// Restrict the query to the id, name and content columns.
// NOTE(review): entityToTermDataArray() also reads $entity->searchFactor, which is not in
// this list - confirm it is a model property rather than a database column.
71 $selectFields = ['id', 'name', $indexContentField];
// NOTE(review): the total includes soft-deleted rows (withTrashed) while the chunked query
// below does not add withTrashed - confirm the two agree, otherwise the reported progress
// may not line up with $total.
72 $total = $entityModel->newQuery()->withTrashed()->count();
// NOTE(review): $chunkSize and $processed are captured below, but their initial values are
// not visible in this listing.
76 $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
77 $this->indexEntities($entities->all());
// Progress advances by the nominal chunk size rather than the number of rows received,
// so it is clamped to $total to avoid overshooting on a partially filled chunk.
78 $processed = min($processed + $chunkSize, $total);
// The callback is optional; only report when one was supplied.
80 if (is_callable($progressCallback)) {
81 $progressCallback($entityModel, $processed, $total);
// Stream the model's rows through the callback, $chunkSize rows at a time.
85 $entityModel->newQuery()
86 ->select($selectFields)
87 ->chunk($chunkSize, $chunkCallback);
92 * Delete related Entity search terms.
// Deletes the search terms returned by the given entity's searchTerms() query.
94 public function deleteEntityTerms(Entity $entity)
96 $entity->searchTerms()->delete();
100 * Create a scored term array from the given text, where the keys are the terms
101 * and the values are their scores.
103 * @return array<string, int>
// Counts the terms in $text and multiplies every count by $scoreAdjustment to obtain scores.
// NOTE(review): entityToTermDataArray() passes `40 * $entity->searchFactor` and
// `$entity->searchFactor` here and documents scores as float; if searchFactor can be
// fractional, the `int` parameter type would truncate it (an implicit conversion that is
// deprecated from PHP 8.1) - confirm whether `float` was intended.
// NOTE(review): no return statement is visible in this listing; presumably $termMap is returned.
105 protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
107 $termMap = $this->textToTermCountMap($text);
// Convert counts to scores in place; the term keys are left untouched.
109 foreach ($termMap as $term => $count) {
110 $termMap[$term] = $count * $scoreAdjustment;
117 * Create a scored term array from the given HTML, where the keys are the terms
118 * and the values are their scores.
120 * @return array<string, int>
// Parses the given HTML and scores its terms per top-level element, weighting each
// element's term counts by a multiplier looked up from the element's tag name.
// NOTE(review): the lines between the signature and the score map (and the initialisation
// of $scoresByTerm) are not visible in this listing.
122 protected function generateTermScoreMapFromHtml(string $html): array
// Tag name => score multiplier. Tags without an entry fall back to 1 where the map is read.
// NOTE(review): the map entries themselves are not visible in this listing.
129 $elementScoreAdjustmentMap = [
// Wrap the fragment in an explicit body element so the content nodes can be reached below
// as the children of the document element's first child.
138 $html = '<body>' . $html . '</body>';
// Collect libxml parse errors internally instead of raising PHP warnings for imperfect markup.
// NOTE(review): the previous setting is not restored and libxml_clear_errors() is not called
// in the visible code - confirm errors do not accumulate across a bulk re-index.
139 libxml_use_internal_errors(true);
140 $doc = new DOMDocument();
// Encode non-ASCII characters as HTML entities before parsing, presumably because
// loadHTML() does not assume UTF-8 input.
// NOTE(review): the 'HTML-ENTITIES' encoding of mb_convert_encoding is deprecated as of PHP 8.2.
141 $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
// Take the first child of the document element (expected to be the body wrapper added
// above) and iterate its direct children only.
// NOTE(review): item(0) is dereferenced without a null check.
143 $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
144 /** @var DOMNode $child */
145 foreach ($topElems as $child) {
146 $nodeName = $child->nodeName;
// textContent covers the element's whole subtree, so nested text is scored with the
// weight of its top-level ancestor.
147 $termCounts = $this->textToTermCountMap(trim($child->textContent));
148 foreach ($termCounts as $term => $count) {
// Occurrences multiplied by the tag's weight, defaulting to 1 for unlisted tags.
149 $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
// Accumulate, so a term's final score is the sum over every element it appears in.
150 $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
154 return $scoresByTerm;
158 * For the given text, return an array where the keys are the unique term words
159 * and the values are the frequency of that term.
161 * @return array<string, int>
// Splits $text into tokens on whitespace and common punctuation and builds a
// token => occurrence count map. No case folding is applied in the visible code, so
// differently cased spellings are counted as separate terms.
163 protected function textToTermCountMap(string $text): array
165 $tokenMap = []; // {TextToken => OccurrenceCount}
// Delimiter set: space, newline, tab, and punctuation/bracket/quote characters.
166 $splitChars = " \n\t.,!?:;()[]{}<>`'\"";
// strtok() keeps internal state: the first call receives the string, and subsequent
// calls pass only the delimiters to continue through that same string.
167 $token = strtok($text, $splitChars);
// strtok() returns false once no tokens remain.
169 while ($token !== false) {
// Make sure the key exists before it is counted.
170 if (!isset($tokenMap[$token])) {
171 $tokenMap[$token] = 0;
// NOTE(review): the count increment and the return statement are not visible in this listing.
174 $token = strtok($splitChars);
181 * For the given entity, generate an array of term data details.
182 * This is the raw term data, not instances of SearchTerm models.
184 * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
// Builds the raw search term rows for one entity by scoring the terms of its name and of
// its body content, then merging both score maps.
186 protected function entityToTermDataArray(Entity $entity): array
// Terms from the name get a multiplier of 40 times the entity's searchFactor.
188 $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
// Pages are scored from their HTML with per-element weighting; the other visible branch
// scores the plain description, multiplied by searchFactor.
// NOTE(review): the HTML branch does not apply $entity->searchFactor - confirm that is intended.
190 if ($entity instanceof Page) {
191 $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
193 $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description, $entity->searchFactor);
// Terms present in both name and body end up with the sum of both scores.
196 $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap);
// Read once outside the loop; every row of this entity carries the same id and type.
199 $entityId = $entity->id;
200 $entityType = $entity->getMorphClass();
// One row per distinct term.
// NOTE(review): the row construction is only partly visible in this listing; the entries
// other than 'entity_type' and 'entity_id' cannot be confirmed from here.
201 foreach ($mergedScoreMap as $term => $score) {
205 'entity_type' => $entityType,
206 'entity_id' => $entityId,
215 * For the given term data arrays, merge their contents by term
216 * while combining any scores.
218 * @param array<string, int> ...$scoreMaps
220 * @return array<string, int>
// Merges any number of term => score maps into one, adding the scores together for terms
// that occur in more than one map.
// NOTE(review): the initialisation of $mergedMap and the return statement are not visible
// in this listing.
222 protected function mergeTermScoreMaps(...$scoreMaps): array
226 foreach ($scoreMaps as $scoreMap) {
227 foreach ($scoreMap as $term => $score) {
// A term seen for the first time starts from 0 via the null-coalescing default.
228 $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;