BookStack Code Mirror - bookstack/commitdiff
Searching: Added custom tokenizer that considers soft delimiters.
author Dan Brown <redacted>
Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)
committer Dan Brown <redacted>
Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)
This changes indexing so that a.b now indexes as "a", "b" AND "a.b"
instead of just the first two, for periods and hyphens, so terms
containing those characters can be searched within.

Adds hyphens as a delimiter - #2095

app/Search/SearchIndex.php
app/Search/SearchOptions.php
app/Search/SearchTextTokenizer.php [new file with mode: 0644]
tests/Search/SearchIndexingTest.php

index c7d9d6502e272ed4edae6b2cf6d402b7ea43464b..a8bd2c4b28575643b152f8cf0c0fd4e436d2599f 100644 (file)
@@ -16,7 +16,13 @@ class SearchIndex
     /**
      * A list of delimiter characters used to break-up parsed content into terms for indexing.
      */
-    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
+
+    /**
+     * A list of delimiters which could commonly be used within a single term and also indicate a break between terms.
+     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
+     */
+    public static string $softDelimiters = ".-";
 
     public function __construct(
         protected EntityProvider $entityProvider
@@ -196,15 +202,36 @@ class SearchIndex
     protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = static::$delimiters;
-        $token = strtok($text, $splitChars);
+        $softDelims = static::$softDelimiters;
+        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+        $extendedToken = '';
+        $extendedLen = 0;
+
+        $token = $tokenizer->next();
 
         while ($token !== false) {
-            if (!isset($tokenMap[$token])) {
-                $tokenMap[$token] = 0;
+            $delim = $tokenizer->previousDelimiter();
+
+            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+                $extendedToken .= $delim . $token;
+                $extendedLen++;
+            } else {
+                if ($extendedLen > 1) {
+                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+                }
+                $extendedToken = $token;
+                $extendedLen = 1;
             }
-            $tokenMap[$token]++;
-            $token = strtok($splitChars);
+
+            if ($token) {
+                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+            }
+
+            $token = $tokenizer->next();
+        }
+
+        if ($extendedLen > 1) {
+            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
         }
 
         return $tokenMap;
index a6f82029920ee7dd0a0ce1de8416a3498764f11b..bf527d9c3058c1a87f988212da0d086c63fe66e6 100644 (file)
@@ -181,7 +181,7 @@ class SearchOptions
     protected static function parseStandardTermString(string $termString): array
     {
         $terms = explode(' ', $termString);
-        $indexDelimiters = SearchIndex::$delimiters;
+        $indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters)));
         $parsed = [
             'terms'  => [],
             'exacts' => [],
diff --git a/app/Search/SearchTextTokenizer.php b/app/Search/SearchTextTokenizer.php
new file mode 100644 (file)
index 0000000..f43fd56
--- /dev/null
@@ -0,0 +1,70 @@
+<?php
+
+namespace BookStack\Search;
+
+/**
+ * A custom text tokenizer which records & provides insight needed for our search indexing.
+ * We used to use basic strtok() but this class does the following which that lacked:
+ * - Tracks and provides the current/previous delimiter that we've stopped at.
+ * - Returns empty tokens upon parsing a delimiter.
+ *
+ * Text is scanned byte-by-byte, so every delimiter is expected to be a single-byte character.
+ */
+class SearchTextTokenizer
+{
+    /** Byte offset in the text at which the next token starts. */
+    protected int $currentIndex = 0;
+
+    /** Byte length of the text being tokenized. */
+    protected int $length;
+
+    /** Delimiter which ended the most recently returned token ('' if it ran to the end of the text). */
+    protected string $currentDelimiter = '';
+
+    /** Delimiter which preceded the most recently returned token ('' if it started the text). */
+    protected string $previousDelimiter = '';
+
+    /**
+     * @param string $text       The text to split into tokens.
+     * @param string $delimiters A list of characters, each of which acts as a delimiter.
+     */
+    public function __construct(
+        protected string $text,
+        protected string $delimiters = ' '
+    ) {
+        $this->length = strlen($this->text);
+    }
+
+    /**
+     * Get the delimiter which ended the token last returned by next().
+     * Returns an empty string before the first token, or when that token ran to the end of the text.
+     */
+    public function currentDelimiter(): string
+    {
+        return $this->currentDelimiter;
+    }
+
+    /**
+     * Get the delimiter found directly before the token last returned by next().
+     * Returns an empty string when that token started at the beginning of the text.
+     */
+    public function previousDelimiter(): string
+    {
+        return $this->previousDelimiter;
+    }
+
+    /**
+     * Get the next token between delimiters.
+     * An empty string is returned where a delimiter is found with no content before it.
+     * Returns false if there's no further tokens; tracked delimiters are left untouched in that case.
+     */
+    public function next(): string|false
+    {
+        if ($this->currentIndex >= $this->length) {
+            return false;
+        }
+
+        // Measure the run of non-delimiter bytes from the current position.
+        $tokenLength = strcspn($this->text, $this->delimiters, $this->currentIndex);
+        $token = substr($this->text, $this->currentIndex, $tokenLength);
+        $delimiterIndex = $this->currentIndex + $tokenLength;
+
+        $this->previousDelimiter = $this->currentDelimiter;
+
+        if ($delimiterIndex < $this->length) {
+            // Stopped on a delimiter: record it and resume scanning just after it.
+            $this->currentDelimiter = $this->text[$delimiterIndex];
+            $this->currentIndex = $delimiterIndex + 1;
+        } else {
+            // Ran to the end of the text. The token is always non-empty here and is returned
+            // as-is, so that a trailing "0" is not mistaken for the end of input.
+            $this->currentDelimiter = '';
+            $this->currentIndex = $this->length;
+        }
+
+        return $token;
+    }
+}
index 43219a4ed9876f1de2732b65833744b036eff5f1..6933813b608a0f6c8559e3dc6d49bc1d087ad5a0 100644 (file)
@@ -74,4 +74,20 @@ class SearchIndexingTest extends TestCase
         $this->assertEquals(3, $scoreByTerm->get('Animal'));
         $this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
     }
+
+    /**
+     * Terms joined by a period or hyphen should be indexed both in their joined form and as their
+     * separate parts, while a dangling delimiter, or one separated from the following word by
+     * whitespace or an element boundary, must not produce a joined term.
+     */
+    public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index()
+    {
+        $html = '<p>super.duper awesome-beans big- barry cheese.</p><p>biscuits</p><p>a-bs</p>';
+        $page = $this->entities->newPage(['html' => $html]);
+        $indexedScores = $page->searchTerms()->pluck('score', 'term');
+
+        $shouldBeIndexed = [
+            'super', 'duper', 'super.duper',
+            'awesome-beans', 'awesome', 'beans',
+            'big', 'barry', 'cheese', 'biscuits',
+            'a-bs', 'a', 'bs',
+        ];
+        $shouldBeAbsent = ['big-', 'big-barry', 'cheese.', 'cheese.biscuits'];
+
+        foreach ($shouldBeIndexed as $term) {
+            $this->assertNotNull($indexedScores->get($term), "Failed asserting that \"$term\" is indexed");
+        }
+
+        foreach ($shouldBeAbsent as $term) {
+            $this->assertNull($indexedScores->get($term), "Failed asserting that \"$term\" is not indexed");
+        }
+    }
 }