From: Dan Brown Date: Fri, 14 Feb 2025 19:25:59 +0000 (+0000) Subject: Merge branch 'inv-hareesh/development' into search_index_updates X-Git-Tag: v25.02~1^2~14^2~1 X-Git-Url: http://source.bookstackapp.com/bookstack/commitdiff_plain/c291d27c19b69641c067da3aefc0623016f34471?ds=inline;hp=-c Merge branch 'inv-hareesh/development' into search_index_updates --- c291d27c19b69641c067da3aefc0623016f34471 diff --combined app/Search/SearchIndex.php index a8bd2c4b2,e10219e2d..36f71f6cc --- a/app/Search/SearchIndex.php +++ b/app/Search/SearchIndex.php @@@ -16,13 -16,7 +16,13 @@@ class SearchInde /** * A list of delimiter characters used to break-up parsed content into terms for indexing. */ - public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\""; - public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"«»"; ++ public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»"; + + /** + * A list of delimiters which could be commonly used within a single term and also indicate a break between terms. + * The indexer will index the full term with these delimiters, plus the terms split via these delimiters. + */ + public static string $softDelimiters = ".-"; public function __construct( protected EntityProvider $entityProvider @@@ -202,36 -196,15 +202,36 @@@ protected function textToTermCountMap(string $text): array { $tokenMap = []; // {TextToken => OccurrenceCount} - $splitChars = static::$delimiters; - $token = strtok($text, $splitChars); + $softDelims = static::$softDelimiters; + $tokenizer = new SearchTextTokenizer($text, static::$delimiters); + $extendedToken = ''; + $extendedLen = 0; + + $token = $tokenizer->next(); while ($token !== false) { - if (!isset($tokenMap[$token])) { - $tokenMap[$token] = 0; + $delim = $tokenizer->previousDelimiter(); + + if ($delim && str_contains($softDelims, $delim) && $token !== '') { + $extendedToken .= $delim . 
$token; + $extendedLen++; + } else { + if ($extendedLen > 1) { + $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1; + } + $extendedToken = $token; + $extendedLen = 1; } - $tokenMap[$token]++; - $token = strtok($splitChars); + + if ($token) { + $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1; + } + + $token = $tokenizer->next(); + } + + if ($extendedLen > 1) { + $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1; } return $tokenMap;