X-Git-Url: http://source.bookstackapp.com/bookstack/blobdiff_plain/820be162f5bfb31f69f0122a61755fdd8623275f..refs/pull/3391/head:/app/Entities/Tools/SearchIndex.php

diff --git a/app/Entities/Tools/SearchIndex.php b/app/Entities/Tools/SearchIndex.php
index 50e471bc9..d43d98207 100644
--- a/app/Entities/Tools/SearchIndex.php
+++ b/app/Entities/Tools/SearchIndex.php
@@ -2,13 +2,24 @@
 
 namespace BookStack\Entities\Tools;
 
+use BookStack\Actions\Tag;
 use BookStack\Entities\EntityProvider;
 use BookStack\Entities\Models\Entity;
+use BookStack\Entities\Models\Page;
 use BookStack\Entities\Models\SearchTerm;
+use DOMDocument;
+use DOMNode;
+use Illuminate\Database\Eloquent\Builder;
 use Illuminate\Support\Collection;
 
 class SearchIndex
 {
+    /**
+     * A list of delimiter characters used to break-up parsed content into terms for indexing.
+     *
+     * @var string
+     */
+    public static $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
 
     /**
      * @var EntityProvider
@@ -57,15 +68,18 @@ class SearchIndex
      * - The number that have been processed so far.
      * - The total number of that model to be processed.
      *
-     * @param callable(Entity, int, int)|null $progressCallback
+     * @param callable(Entity, int, int):void|null $progressCallback
      */
     public function indexAllEntities(?callable $progressCallback = null)
     {
         SearchTerm::query()->truncate();
 
         foreach ($this->entityProvider->all() as $entityModel) {
-            $selectFields = ['id', 'name', $entityModel->textField];
-            $total = $entityModel->newQuery()->withTrashed()->count();
+            $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
+            $selectFields = ['id', 'name', $indexContentField];
+            /** @var Builder $query */
+            $query = $entityModel->newQuery();
+            $total = $query->withTrashed()->count();
             $chunkSize = 250;
             $processed = 0;
 
@@ -80,6 +94,7 @@ class SearchIndex
 
             $entityModel->newQuery()
                 ->select($selectFields)
+                ->with(['tags:id,name,value,entity_id,entity_type'])
                 ->chunk($chunkSize, $chunkCallback);
         }
     }
@@ -93,14 +108,97 @@ class SearchIndex
     }
 
     /**
-     * Create a scored term array from the given text.
+     * Create a scored term array from the given text, where the keys are the terms
+     * and the values are their scores.
+     *
+     * @returns array
+     */
+    protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
+    {
+        $termMap = $this->textToTermCountMap($text);
+
+        foreach ($termMap as $term => $count) {
+            $termMap[$term] = $count * $scoreAdjustment;
+        }
+
+        return $termMap;
+    }
+
+    /**
+     * Create a scored term array from the given HTML, where the keys are the terms
+     * and the values are their scores.
+     *
+     * @returns array
+     */
+    protected function generateTermScoreMapFromHtml(string $html): array
+    {
+        if (empty($html)) {
+            return [];
+        }
+
+        $scoresByTerm = [];
+        $elementScoreAdjustmentMap = [
+            'h1' => 10,
+            'h2' => 5,
+            'h3' => 4,
+            'h4' => 3,
+            'h5' => 2,
+            'h6' => 1.5,
+        ];
+
+        $html = '<body>' . $html . '</body>';
+        libxml_use_internal_errors(true);
+        $doc = new DOMDocument();
+        $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
+
+        $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
+        /** @var DOMNode $child */
+        foreach ($topElems as $child) {
+            $nodeName = $child->nodeName;
+            $termCounts = $this->textToTermCountMap(trim($child->textContent));
+            foreach ($termCounts as $term => $count) {
+                $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
+                $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
+            }
+        }
+
+        return $scoresByTerm;
+    }
+
+    /**
+     * Create a scored term map from the given set of entity tags.
      *
-     * @returns array{term: string, score: float}
+     * @param Tag[] $tags
+     *
+     * @returns array
      */
-    protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array
+    protected function generateTermScoreMapFromTags(array $tags): array
+    {
+        $scoreMap = [];
+        $names = [];
+        $values = [];
+
+        foreach ($tags as $tag) {
+            $names[] = $tag->name;
+            $values[] = $tag->value;
+        }
+
+        $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3);
+        $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5);
+
+        return $this->mergeTermScoreMaps($nameMap, $valueMap);
+    }
+
+    /**
+     * For the given text, return an array where the keys are the unique term words
+     * and the values are the frequency of that term.
+     *
+     * @returns array
+     */
+    protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = " \n\t.,!?:;()[]{}<>`'\"";
+        $splitChars = static::$delimiters;
         $token = strtok($text, $splitChars);
 
         while ($token !== false) {
@@ -111,34 +209,61 @@ class SearchIndex
             $token = strtok($splitChars);
         }
 
-        $terms = [];
-        foreach ($tokenMap as $token => $count) {
-            $terms[] = [
-                'term' => $token,
-                'score' => $count * $scoreAdjustment,
-            ];
-        }
-
-        return $terms;
+        return $tokenMap;
     }
 
     /**
      * For the given entity, Generate an array of term data details.
      * Is the raw term data, not instances of SearchTerm models.
      *
-     * @returns array{term: string, score: float}[]
+     * @returns array{term: string, score: float, entity_id: int, entity_type: string}[]
      */
     protected function entityToTermDataArray(Entity $entity): array
     {
-        $nameTerms = $this->generateTermArrayFromText($entity->name, 40 * $entity->searchFactor);
-        $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
-        $termData = array_merge($nameTerms, $bodyTerms);
+        $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
+        $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all());
+
+        if ($entity instanceof Page) {
+            $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
+        } else {
+            $bodyTermsMap = $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);
+        }
 
-        foreach ($termData as $index => $term) {
-            $termData[$index]['entity_type'] = $entity->getMorphClass();
-            $termData[$index]['entity_id'] = $entity->id;
+        $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap);
+
+        $dataArray = [];
+        $entityId = $entity->id;
+        $entityType = $entity->getMorphClass();
+        foreach ($mergedScoreMap as $term => $score) {
+            $dataArray[] = [
+                'term' => $term,
+                'score' => $score,
+                'entity_type' => $entityType,
+                'entity_id' => $entityId,
+            ];
+        }
+
+        return $dataArray;
+    }
+
+    /**
+     * For the given term data arrays, Merge their contents by term
+     * while combining any scores.
+     *
+     * @param array[] ...$scoreMaps
+     *
+     * @returns array
+     */
+    protected function mergeTermScoreMaps(...$scoreMaps): array
+    {
+        $mergedMap = [];
+
+        foreach ($scoreMaps as $scoreMap) {
+            foreach ($scoreMap as $term => $score) {
+                $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
+            }
         }
 
-        return $termData;
+        return $mergedMap;
     }
 }