Merge branch 'inv-hareesh/development' into search_index_updates

author Dan Brown <redacted>

Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)

committer Dan Brown <redacted>

Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)
author Dan Brown <redacted>
Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)
committer Dan Brown <redacted>
Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)
diff --combined app/Search/SearchIndex.php

index a8bd2c4b28575643b152f8cf0c0fd4e436d2599f,e10219e2d2f4a63521a86cf52a6b0672a93beb0e..36f71f6ccc7759cd09d3de29e8464a187c1507a6
--- 1/app/Search/SearchIndex.php
--- 2/app/Search/SearchIndex.php
+++ b/app/Search/SearchIndex.php
@@@ -16,13 -16,7 +16,13 @@@ class SearchInde
       /**
        * A list of delimiter characters used to break-up parsed content into terms for indexing.
        */
-     public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
- -    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"«»";
++    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
+ +
+ +    /**
+ +     * A list of delimiters which could be commonly used within a single term and also indicate a break between terms.
+ +     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
+ +     */
+ +    public static string $softDelimiters = ".-";
   
       public function __construct(
           protected EntityProvider $entityProvider
@@@ -202,36 -196,15 +202,36 @@@
       protected function textToTermCountMap(string $text): array
       {
           $tokenMap = []; // {TextToken => OccurrenceCount}
- -        $splitChars = static::$delimiters;
- -        $token = strtok($text, $splitChars);
+ +        $softDelims = static::$softDelimiters;
+ +        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+ +        $extendedToken = '';
+ +        $extendedLen = 0;
+ +
+ +        $token = $tokenizer->next();
   
           while ($token !== false) {
- -            if (!isset($tokenMap[$token])) {
- -                $tokenMap[$token] = 0;
+ +            $delim = $tokenizer->previousDelimiter();
+ +
+ +            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+ +                $extendedToken .= $delim . $token;
+ +                $extendedLen++;
+ +            } else {
+ +                if ($extendedLen > 1) {
+ +                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+ +                }
+ +                $extendedToken = $token;
+ +                $extendedLen = 1;
               }
- -            $tokenMap[$token]++;
- -            $token = strtok($splitChars);
+ +
+ +            if ($token) {
+ +                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+ +            }
+ +
+ +            $token = $tokenizer->next();
+ +        }
+ +
+ +        if ($extendedLen > 1) {
+ +            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
           }
   
           return $tokenMap;
author	Dan Brown <redacted>
	Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)
committer	Dan Brown <redacted>
	Fri, 14 Feb 2025 19:25:59 +0000 (19:25 +0000)