X-Git-Url: http://source.bookstackapp.com/bookstack/blobdiff_plain/9135a85de4eef32a91c7a3ee0aa405ed454e5a4c..refs/pull/5591/head:/app/Search/SearchIndex.php diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php index 54ed95ebb..36f71f6cc 100644 --- a/app/Search/SearchIndex.php +++ b/app/Search/SearchIndex.php @@ -2,11 +2,11 @@ namespace BookStack\Search; -use BookStack\Actions\Tag; +use BookStack\Activity\Models\Tag; use BookStack\Entities\EntityProvider; use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Page; -use DOMDocument; +use BookStack\Util\HtmlDocument; use DOMNode; use Illuminate\Database\Eloquent\Builder; use Illuminate\Support\Collection; @@ -15,29 +15,28 @@ class SearchIndex { /** * A list of delimiter characters used to break-up parsed content into terms for indexing. - * - * @var string */ - public static $delimiters = " \n\t.,!?:;()[]{}<>`'\""; + public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»"; /** - * @var EntityProvider + * A list of delimiter which could be commonly used within a single term and also indicate a break between terms. + * The indexer will index the full term with these delimiters, plus the terms split via these delimiters. */ - protected $entityProvider; + public static string $softDelimiters = ".-"; - public function __construct(EntityProvider $entityProvider) - { - $this->entityProvider = $entityProvider; + public function __construct( + protected EntityProvider $entityProvider + ) { } /** * Index the given entity. */ - public function indexEntity(Entity $entity) + public function indexEntity(Entity $entity): void { $this->deleteEntityTerms($entity); $terms = $this->entityToTermDataArray($entity); - SearchTerm::query()->insert($terms); + $this->insertTerms($terms); } /** @@ -45,7 +44,7 @@ class SearchIndex * * @param Entity[] $entities */ - public function indexEntities(array $entities) + public function indexEntities(array $entities): void { $terms = []; foreach ($entities as $entity) { @@ -53,10 +52,7 @@ class SearchIndex array_push($terms, ...$entityTerms); } - $chunkedTerms = array_chunk($terms, 500); - foreach ($chunkedTerms as $termChunk) { - SearchTerm::query()->insert($termChunk); - } + $this->insertTerms($terms); } /** @@ -69,7 +65,7 @@ class SearchIndex * * @param callable(Entity, int, int):void|null $progressCallback */ - public function indexAllEntities(?callable $progressCallback = null) + public function indexAllEntities(?callable $progressCallback = null): void { SearchTerm::query()->truncate(); @@ -101,11 +97,24 @@ class SearchIndex /** * Delete related Entity search terms. */ - public function deleteEntityTerms(Entity $entity) + public function deleteEntityTerms(Entity $entity): void { $entity->searchTerms()->delete(); } + /** + * Insert the given terms into the database. + * Chunks through the given terms to remain within database limits. + * @param array[] $terms + */ + protected function insertTerms(array $terms): void + { + $chunkedTerms = array_chunk($terms, 500); + foreach ($chunkedTerms as $termChunk) { + SearchTerm::query()->insert($termChunk); + } + } + /** * Create a scored term array from the given text, where the keys are the terms * and the values are their scores. @@ -145,16 +154,11 @@ class SearchIndex 'h6' => 1.5, ]; - $html = '' . $html . ''; $html = str_ireplace(['
', '
', '
'], "\n", $html); + $doc = new HtmlDocument($html); - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); - - $topElems = $doc->documentElement->childNodes->item(0)->childNodes; /** @var DOMNode $child */ - foreach ($topElems as $child) { + foreach ($doc->getBodyChildren() as $child) { $nodeName = $child->nodeName; $termCounts = $this->textToTermCountMap(trim($child->textContent)); foreach ($termCounts as $term => $count) { @@ -175,7 +179,6 @@ class SearchIndex */ protected function generateTermScoreMapFromTags(array $tags): array { - $scoreMap = []; $names = []; $values = []; @@ -199,15 +202,36 @@ class SearchIndex protected function textToTermCountMap(string $text): array { $tokenMap = []; // {TextToken => OccurrenceCount} - $splitChars = static::$delimiters; - $token = strtok($text, $splitChars); + $softDelims = static::$softDelimiters; + $tokenizer = new SearchTextTokenizer($text, static::$delimiters); + $extendedToken = ''; + $extendedLen = 0; + + $token = $tokenizer->next(); while ($token !== false) { - if (!isset($tokenMap[$token])) { - $tokenMap[$token] = 0; + $delim = $tokenizer->previousDelimiter(); + + if ($delim && str_contains($softDelims, $delim) && $token !== '') { + $extendedToken .= $delim . $token; + $extendedLen++; + } else { + if ($extendedLen > 1) { + $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1; + } + $extendedToken = $token; + $extendedLen = 1; } - $tokenMap[$token]++; - $token = strtok($splitChars); + + if ($token) { + $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1; + } + + $token = $tokenizer->next(); + } + + if ($extendedLen > 1) { + $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1; } return $tokenMap;