]> BookStack Code Mirror - bookstack/blobdiff - app/Search/SearchIndex.php
added routes for zip export
[bookstack] / app / Search / SearchIndex.php
index d9fc4e7aadce9aabc3b8b7aa70e17f6edf8d5af3..36f71f6ccc7759cd09d3de29e8464a187c1507a6 100644 (file)
@@ -16,7 +16,13 @@ class SearchIndex
     /**
      * A list of delimiter characters used to break-up parsed content into terms for indexing.
      */
-    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
+
+    /**
+     * A list of delimiter which could be commonly used within a single term and also indicate a break between terms.
+     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
+     */
+    public static string $softDelimiters = ".-";
 
     public function __construct(
         protected EntityProvider $entityProvider
@@ -30,7 +36,7 @@ class SearchIndex
     {
         $this->deleteEntityTerms($entity);
         $terms = $this->entityToTermDataArray($entity);
-        SearchTerm::query()->insert($terms);
+        $this->insertTerms($terms);
     }
 
     /**
@@ -46,10 +52,7 @@ class SearchIndex
             array_push($terms, ...$entityTerms);
         }
 
-        $chunkedTerms = array_chunk($terms, 500);
-        foreach ($chunkedTerms as $termChunk) {
-            SearchTerm::query()->insert($termChunk);
-        }
+        $this->insertTerms($terms);
     }
 
     /**
@@ -99,6 +102,19 @@ class SearchIndex
         $entity->searchTerms()->delete();
     }
 
+    /**
+     * Insert the given terms into the database.
+     * Chunks through the given terms to remain within database limits.
+     * @param array[] $terms
+     */
+    protected function insertTerms(array $terms): void
+    {
+        $chunkedTerms = array_chunk($terms, 500);
+        foreach ($chunkedTerms as $termChunk) {
+            SearchTerm::query()->insert($termChunk);
+        }
+    }
+
     /**
      * Create a scored term array from the given text, where the keys are the terms
      * and the values are their scores.
@@ -186,15 +202,36 @@ class SearchIndex
     protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = static::$delimiters;
-        $token = strtok($text, $splitChars);
+        $softDelims = static::$softDelimiters;
+        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+        $extendedToken = '';
+        $extendedLen = 0;
+
+        $token = $tokenizer->next();
 
         while ($token !== false) {
-            if (!isset($tokenMap[$token])) {
-                $tokenMap[$token] = 0;
+            $delim = $tokenizer->previousDelimiter();
+
+            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+                $extendedToken .= $delim . $token;
+                $extendedLen++;
+            } else {
+                if ($extendedLen > 1) {
+                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+                }
+                $extendedToken = $token;
+                $extendedLen = 1;
             }
-            $tokenMap[$token]++;
-            $token = strtok($splitChars);
+
+            if ($token) {
+                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+            }
+
+            $token = $tokenizer->next();
+        }
+
+        if ($extendedLen > 1) {
+            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
         }
 
         return $tokenMap;