X-Git-Url: http://source.bookstackapp.com/bookstack/blobdiff_plain/85db812feaae5f36ea6214931cec4adb67a9cb39..refs/pull/3043/head:/app/Entities/Tools/SearchIndex.php diff --git a/app/Entities/Tools/SearchIndex.php b/app/Entities/Tools/SearchIndex.php index 81a5022ce..d748c1695 100644 --- a/app/Entities/Tools/SearchIndex.php +++ b/app/Entities/Tools/SearchIndex.php @@ -1,84 +1,98 @@ -`'\""; /** * @var EntityProvider */ protected $entityProvider; - - public function __construct(SearchTerm $searchTerm, EntityProvider $entityProvider) + public function __construct(EntityProvider $entityProvider) { - $this->searchTerm = $searchTerm; $this->entityProvider = $entityProvider; } - /** * Index the given entity. */ public function indexEntity(Entity $entity) { $this->deleteEntityTerms($entity); - $nameTerms = $this->generateTermArrayFromText($entity->name, 5 * $entity->searchFactor); - $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor); - $terms = array_merge($nameTerms, $bodyTerms); - foreach ($terms as $index => $term) { - $terms[$index]['entity_type'] = $entity->getMorphClass(); - $terms[$index]['entity_id'] = $entity->id; - } - $this->searchTerm->newQuery()->insert($terms); + $terms = $this->entityToTermDataArray($entity); + SearchTerm::query()->insert($terms); } /** - * Index multiple Entities at once + * Index multiple Entities at once. + * * @param Entity[] $entities */ - protected function indexEntities(array $entities) + public function indexEntities(array $entities) { $terms = []; foreach ($entities as $entity) { - $nameTerms = $this->generateTermArrayFromText($entity->name, 5 * $entity->searchFactor); - $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor); - foreach (array_merge($nameTerms, $bodyTerms) as $term) { - $term['entity_id'] = $entity->id; - $term['entity_type'] = $entity->getMorphClass(); - $terms[] = $term; - } + $entityTerms = $this->entityToTermDataArray($entity); + array_push($terms, ...$entityTerms); } $chunkedTerms = array_chunk($terms, 500); foreach ($chunkedTerms as $termChunk) { - $this->searchTerm->newQuery()->insert($termChunk); + SearchTerm::query()->insert($termChunk); } } /** * Delete and re-index the terms for all entities in the system. + * Can take a callback which is used for reporting progress. + * Callback receives three arguments: + * - An instance of the model being processed + * - The number that have been processed so far. + * - The total number of that model to be processed. + * + * @param callable(Entity, int, int)|null $progressCallback */ - public function indexAllEntities() + public function indexAllEntities(?callable $progressCallback = null) { - $this->searchTerm->newQuery()->truncate(); + SearchTerm::query()->truncate(); foreach ($this->entityProvider->all() as $entityModel) { - $selectFields = ['id', 'name', $entityModel->textField]; + $indexContentField = $entityModel instanceof Page ? 'html' : 'description'; + $selectFields = ['id', 'name', $indexContentField]; + $total = $entityModel->newQuery()->withTrashed()->count(); + $chunkSize = 250; + $processed = 0; + + $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) { + $this->indexEntities($entities->all()); + $processed = min($processed + $chunkSize, $total); + + if (is_callable($progressCallback)) { + $progressCallback($entityModel, $processed, $total); + } + }; + $entityModel->newQuery() - ->withTrashed() ->select($selectFields) - ->chunk(1000, function (Collection $entities) { - $this->indexEntities($entities->all()); - }); + ->with(['tags:id,name,value,entity_id,entity_type']) + ->chunk($chunkSize, $chunkCallback); } } @@ -91,12 +105,97 @@ class SearchIndex } /** - * Create a scored term array from the given text. + * Create a scored term array from the given text, where the keys are the terms + * and the values are their scores. + * + * @returns array + */ + protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array + { + $termMap = $this->textToTermCountMap($text); + + foreach ($termMap as $term => $count) { + $termMap[$term] = $count * $scoreAdjustment; + } + + return $termMap; + } + + /** + * Create a scored term array from the given HTML, where the keys are the terms + * and the values are their scores. + * + * @returns array + */ + protected function generateTermScoreMapFromHtml(string $html): array + { + if (empty($html)) { + return []; + } + + $scoresByTerm = []; + $elementScoreAdjustmentMap = [ + 'h1' => 10, + 'h2' => 5, + 'h3' => 4, + 'h4' => 3, + 'h5' => 2, + 'h6' => 1.5, + ]; + + $html = '' . $html . ''; + libxml_use_internal_errors(true); + $doc = new DOMDocument(); + $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); + + $topElems = $doc->documentElement->childNodes->item(0)->childNodes; + /** @var DOMNode $child */ + foreach ($topElems as $child) { + $nodeName = $child->nodeName; + $termCounts = $this->textToTermCountMap(trim($child->textContent)); + foreach ($termCounts as $term => $count) { + $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1); + $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange; + } + } + + return $scoresByTerm; + } + + /** + * Create a scored term map from the given set of entity tags. + * + * @param Tag[] $tags + * + * @returns array */ - protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array + protected function generateTermScoreMapFromTags(array $tags): array + { + $scoreMap = []; + $names = []; + $values = []; + + foreach ($tags as $tag) { + $names[] = $tag->name; + $values[] = $tag->value; + } + + $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3); + $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5); + + return $this->mergeTermScoreMaps($nameMap, $valueMap); + } + + /** + * For the given text, return an array where the keys are the unique term words + * and the values are the frequency of that term. + * + * @returns array + */ + protected function textToTermCountMap(string $text): array { $tokenMap = []; // {TextToken => OccurrenceCount} - $splitChars = " \n\t.,!?:;()[]{}<>`'\""; + $splitChars = static::$delimiters; $token = strtok($text, $splitChars); while ($token !== false) { @@ -107,14 +206,61 @@ class SearchIndex $token = strtok($splitChars); } - $terms = []; - foreach ($tokenMap as $token => $count) { - $terms[] = [ - 'term' => $token, - 'score' => $count * $scoreAdjustment + return $tokenMap; + } + + /** + * For the given entity, Generate an array of term data details. + * Is the raw term data, not instances of SearchTerm models. + * + * @returns array{term: string, score: float, entity_id: int, entity_type: string}[] + */ + protected function entityToTermDataArray(Entity $entity): array + { + $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor); + $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all()); + + if ($entity instanceof Page) { + $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html); + } else { + $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description ?? '', $entity->searchFactor); + } + + $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap); + + $dataArray = []; + $entityId = $entity->id; + $entityType = $entity->getMorphClass(); + foreach ($mergedScoreMap as $term => $score) { + $dataArray[] = [ + 'term' => $term, + 'score' => $score, + 'entity_type' => $entityType, + 'entity_id' => $entityId, ]; } - return $terms; + return $dataArray; + } + + /** + * For the given term data arrays, Merge their contents by term + * while combining any scores. + * + * @param array[] ...$scoreMaps + * + * @returns array + */ + protected function mergeTermScoreMaps(...$scoreMaps): array + { + $mergedMap = []; + + foreach ($scoreMaps as $scoreMap) { + foreach ($scoreMap as $term => $score) { + $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score; + } + } + + return $mergedMap; } }