Searching: Added custom tokenizer that considers soft delimiters.

author Dan Brown <redacted>

Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)

committer Dan Brown <redacted>

Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)
author Dan Brown <redacted>
Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)
committer Dan Brown <redacted>
Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)
diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php

index c7d9d6502e272ed4edae6b2cf6d402b7ea43464b..a8bd2c4b28575643b152f8cf0c0fd4e436d2599f 100644 (file)
--- a/app/Search/SearchIndex.php
+++ b/app/Search/SearchIndex.php
@@ -16,7 +16,13 @@ class SearchIndex
      /**
       * A list of delimiter characters used to break-up parsed content into terms for indexing.
       */
-    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
+
+    /**
+     * A list of delimiters which could be commonly used within a single term and also indicate a break between terms.
+     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
+     */
+    public static string $softDelimiters = ".-";
  
      public function __construct(
          protected EntityProvider $entityProvider
@@ -196,15 +202,36 @@ class SearchIndex
      protected function textToTermCountMap(string $text): array
      {
          $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = static::$delimiters;
-        $token = strtok($text, $splitChars);
+        $softDelims = static::$softDelimiters;
+        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+        $extendedToken = '';
+        $extendedLen = 0;
+
+        $token = $tokenizer->next();
  
          while ($token !== false) {
-            if (!isset($tokenMap[$token])) {
-                $tokenMap[$token] = 0;
+            $delim = $tokenizer->previousDelimiter();
+
+            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+                $extendedToken .= $delim . $token;
+                $extendedLen++;
+            } else {
+                if ($extendedLen > 1) {
+                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+                }
+                $extendedToken = $token;
+                $extendedLen = 1;
              }
-            $tokenMap[$token]++;
-            $token = strtok($splitChars);
+
+            if ($token) {
+                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+            }
+
+            $token = $tokenizer->next();
+        }
+
+        if ($extendedLen > 1) {
+            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
          }
  
          return $tokenMap;
diff --git a/app/Search/SearchOptions.php b/app/Search/SearchOptions.php

index a6f82029920ee7dd0a0ce1de8416a3498764f11b..bf527d9c3058c1a87f988212da0d086c63fe66e6 100644 (file)
--- a/app/Search/SearchOptions.php
+++ b/app/Search/SearchOptions.php
@@ -181,7 +181,7 @@ class SearchOptions
      protected static function parseStandardTermString(string $termString): array
      {
          $terms = explode(' ', $termString);
-        $indexDelimiters = SearchIndex::$delimiters;
+        $indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters)));
          $parsed = [
              'terms'  => [],
              'exacts' => [],
diff --git a/app/Search/SearchTextTokenizer.php b/app/Search/SearchTextTokenizer.php

new file mode 100644 (file)

index 0000000..f43fd56
--- /dev/null
+++ b/app/Search/SearchTextTokenizer.php
@@ -0,0 +1,70 @@
+<?php
+
+namespace BookStack\Search;
+
+/**
+ * A custom text tokenizer which records & provides insight needed for our search indexing.
+ * It replaces a basic strtok() loop, adding two behaviours which strtok() lacks:
+ * - Tracks the delimiter found after (current) and before (previous) the latest token.
+ * - Returns an empty-string token when a delimiter is hit with no characters before it.
+ */
+class SearchTextTokenizer
+{
+    protected int $currentIndex = 0; // Byte offset in $text where the next scan begins.
+    protected int $length; // Byte length of $text, via strlen().
+    protected string $currentDelimiter = ''; // Delimiter which ended the latest token ('' if none).
+    protected string $previousDelimiter = ''; // Delimiter which ended the token before that.
+
+    public function __construct(
+        protected string $text,
+        protected string $delimiters = ' '
+    ) {
+        $this->length = strlen($this->text);
+    }
+
+    /**
+     * Get the delimiter which ended the latest token; '' if it ran to the end of the text or nothing was read.
+     */
+    public function currentDelimiter(): string
+    {
+        return $this->currentDelimiter;
+    }
+
+    /**
+     * Get the delimiter which came directly before the latest token; '' when there was none.
+     */
+    public function previousDelimiter(): string
+    {
+        return $this->previousDelimiter;
+    }
+
+    /**
+     * Get the next token between delimiters, which is '' when a delimiter is reached with nothing before it.
+     * Returns false if there's no further tokens; compare strictly since "0" and '' are valid tokens.
+     */
+    public function next(): string|false
+    {
+        $token = '';
+
+        for ($i = $this->currentIndex; $i < $this->length; $i++) {
+            $char = $this->text[$i]; // Byte-wise read; assumes single-byte (ASCII) delimiters.
+            if (str_contains($this->delimiters, $char)) {
+                $this->previousDelimiter = $this->currentDelimiter;
+                $this->currentDelimiter = $char;
+                $this->currentIndex = $i + 1;
+                return $token;
+            }
+
+            $token .= $char;
+        }
+
+        if ($token !== '') { // Strict check so a trailing "0" is not lost as falsy.
+            $this->currentIndex = $this->length;
+            $this->previousDelimiter = $this->currentDelimiter;
+            $this->currentDelimiter = '';
+            return $token;
+        }
+
+        return false;
+    }
+}
diff --git a/tests/Search/SearchIndexingTest.php b/tests/Search/SearchIndexingTest.php

index 43219a4ed9876f1de2732b65833744b036eff5f1..6933813b608a0f6c8559e3dc6d49bc1d087ad5a0 100644 (file)
--- a/tests/Search/SearchIndexingTest.php
+++ b/tests/Search/SearchIndexingTest.php
@@ -74,4 +74,20 @@ class SearchIndexingTest extends TestCase
          $this->assertEquals(3, $scoreByTerm->get('Animal'));
          $this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
      }
+
+    public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index()
+    {
+        $page = $this->entities->newPage(['html' => '<p>super.duper awesome-beans big- barry cheese.</p><p>biscuits</p><p>a-bs</p>']);
+
+        $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); // Looked up by term below; a null result means the term is not indexed.
+        $expected = ['super', 'duper', 'super.duper', 'awesome-beans', 'awesome', 'beans', 'big', 'barry', 'cheese', 'biscuits', 'a-bs', 'a', 'bs']; // Joined forms plus each of their split parts.
+        foreach ($expected as $term) {
+            $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed");
+        }
+
+        $nonExpected = ['big-', 'big-barry', 'cheese.', 'cheese.biscuits']; // A "-" or "." followed by a space or a paragraph break must not join terms.
+        foreach ($nonExpected as $term) {
+            $this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed");
+        }
+    }
  }
author	Dan Brown <redacted>
	Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)
committer	Dan Brown <redacted>
	Fri, 14 Feb 2025 19:01:51 +0000 (19:01 +0000)
app/Search/SearchIndex.php		patch \| blob \| history
app/Search/SearchOptions.php		patch \| blob \| history
app/Search/SearchTextTokenizer.php	[new file with mode: 0644]	patch \| blob
tests/Search/SearchIndexingTest.php		patch \| blob \| history