BookStack Code Mirror - bookstack/commitdiff
Merge branch 'inv-hareesh/development' into search_index_updates
author Dan Brown <redacted>
Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)
committer Dan Brown <redacted>
Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)
1  2 
app/Search/SearchIndex.php

index a8bd2c4b28575643b152f8cf0c0fd4e436d2599f,e10219e2d2f4a63521a86cf52a6b0672a93beb0e..36f71f6ccc7759cd09d3de29e8464a187c1507a6
@@@ -16,13 -16,7 +16,13 @@@ class SearchInde
      /**
       * A list of delimiter characters used to break-up parsed content into terms for indexing.
       */
-     public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
 -    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"«»";
++    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
 +
 +    /**
 +     * A list of delimiters which could be commonly used within a single term and also indicate a break between terms.
 +     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
 +     */
 +    public static string $softDelimiters = ".-";
  
      public function __construct(
          protected EntityProvider $entityProvider
      protected function textToTermCountMap(string $text): array
      {
          $tokenMap = []; // {TextToken => OccurrenceCount}
 -        $splitChars = static::$delimiters;
 -        $token = strtok($text, $splitChars);
 +        $softDelims = static::$softDelimiters;
 +        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
 +        $extendedToken = '';
 +        $extendedLen = 0;
 +
 +        $token = $tokenizer->next();
  
          while ($token !== false) {
 -            if (!isset($tokenMap[$token])) {
 -                $tokenMap[$token] = 0;
 +            $delim = $tokenizer->previousDelimiter();
 +
 +            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
 +                $extendedToken .= $delim . $token;
 +                $extendedLen++;
 +            } else {
 +                if ($extendedLen > 1) {
 +                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
 +                }
 +                $extendedToken = $token;
 +                $extendedLen = 1;
              }
 -            $tokenMap[$token]++;
 -            $token = strtok($splitChars);
 +
 +            if ($token) {
 +                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
 +            }
 +
 +            $token = $tokenizer->next();
 +        }
 +
 +        if ($extendedLen > 1) {
 +            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
          }
  
          return $tokenMap;