/**
* A list of delimiter characters used to break-up parsed content into terms for indexing.
*/
- public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+ public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
+
+ /**
+ * A list of delimiters which are commonly used within a single term but can also indicate a break between terms.
+ * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
+ */
+ public static string $softDelimiters = ".-";
public function __construct(
protected EntityProvider $entityProvider
protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
- $splitChars = static::$delimiters;
- $token = strtok($text, $splitChars);
+ $softDelims = static::$softDelimiters;
+ $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+ // $extendedToken accumulates tokens which are joined only by soft delimiters
+ // (for example "super.duper"); $extendedLen counts how many tokens it holds.
+ $extendedToken = '';
+ $extendedLen = 0;
+
+ $token = $tokenizer->next();
while ($token !== false) {
- if (!isset($tokenMap[$token])) {
- $tokenMap[$token] = 0;
+ $delim = $tokenizer->previousDelimiter();
+
+ // A non-empty token directly preceded by a soft delimiter extends the joined term.
+ if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+ $extendedToken .= $delim . $token;
+ $extendedLen++;
+ } else {
+ // The joined run has ended; only count it if it actually joined multiple tokens.
+ if ($extendedLen > 1) {
+ $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+ }
+ $extendedToken = $token;
+ $extendedLen = 1;
}
- $tokenMap[$token]++;
- $token = strtok($splitChars);
+
+ // Compare against '' instead of relying on truthiness: the term "0" is falsy
+ // in PHP but is a valid term which must still be counted.
+ if ($token !== '') {
+ $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+ }
+
+ $token = $tokenizer->next();
+ }
+
+ // Flush any joined term still pending once the text has been fully consumed.
+ if ($extendedLen > 1) {
+ $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
}
return $tokenMap;
protected static function parseStandardTermString(string $termString): array
{
$terms = explode(' ', $termString);
- $indexDelimiters = SearchIndex::$delimiters;
+ $indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters)));
$parsed = [
'terms' => [],
'exacts' => [],
--- /dev/null
+<?php
+
+namespace BookStack\Search;
+
+/**
+ * A custom text tokenizer which records & provides insight needed for our search indexing.
+ * We used to use basic strtok() but this class does the following which that lacked:
+ * - Tracks and provides the current/previous delimiter that we've stopped at.
+ * - Returns empty tokens upon parsing a delimiter.
+ */
+class SearchTextTokenizer
+{
+    /** Byte offset within the text at which the next call to next() resumes scanning. */
+    protected int $currentIndex = 0;
+
+    /** Length of the text in bytes, as reported by strlen() at construction. */
+    protected int $length;
+
+    /** Delimiter which ended the most recently returned token; '' if the text end was reached instead. */
+    protected string $currentDelimiter = '';
+
+    /** Delimiter which ended the token before the most recently returned one; '' if there was none. */
+    protected string $previousDelimiter = '';
+
+    /**
+     * @param string $text       The text to split into tokens.
+     * @param string $delimiters Characters to split on. The text is scanned byte-by-byte,
+     *                           so each delimiter is expected to be a single-byte character.
+     */
+    public function __construct(
+        protected string $text,
+        protected string $delimiters = ' '
+    ) {
+        $this->length = strlen($this->text);
+    }
+
+    /**
+     * Get the delimiter which ended the token most recently returned by next().
+     * Returns an empty string if that token was ended by the end of the text,
+     * or if next() has not yet been called.
+     */
+    public function currentDelimiter(): string
+    {
+        return $this->currentDelimiter;
+    }
+
+    /**
+     * Get the delimiter which directly preceded the token most recently returned by next().
+     * Returns an empty string if no delimiter came before that token.
+     */
+    public function previousDelimiter(): string
+    {
+        return $this->previousDelimiter;
+    }
+
+    /**
+     * Get the next token between delimiters.
+     * An empty string is returned when a delimiter is hit without any preceding
+     * token content (for example, on consecutive delimiters).
+     * Returns false if there are no further tokens.
+     */
+    public function next(): string|false
+    {
+        $token = '';
+
+        for ($i = $this->currentIndex; $i < $this->length; $i++) {
+            $char = $this->text[$i];
+            if (str_contains($this->delimiters, $char)) {
+                $this->previousDelimiter = $this->currentDelimiter;
+                $this->currentDelimiter = $char;
+                $this->currentIndex = $i + 1;
+
+                return $token;
+            }
+
+            $token .= $char;
+        }
+
+        // Reached the end of the text without hitting a delimiter.
+        // Compare strictly against '' since a trailing token of "0" is falsy in PHP
+        // but must still be returned as a token.
+        if ($token !== '') {
+            $this->currentIndex = $this->length;
+            $this->previousDelimiter = $this->currentDelimiter;
+            $this->currentDelimiter = '';
+
+            return $token;
+        }
+
+        return false;
+    }
+}
$this->assertEquals(3, $scoreByTerm->get('Animal'));
$this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
}
+
+    /**
+     * Terms joined by in-word punctuation should be indexed both in their joined form
+     * and as their separate parts, while punctuation trailing a term, or spanning
+     * whitespace or element boundaries, should not produce a joined term.
+     */
+    public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index()
+    {
+        $html = '<p>super.duper awesome-beans big- barry cheese.</p><p>biscuits</p><p>a-bs</p>';
+        $page = $this->entities->newPage(['html' => $html]);
+        $termScores = $page->searchTerms()->pluck('score', 'term');
+
+        $shouldBeIndexed = ['super', 'duper', 'super.duper', 'awesome-beans', 'awesome', 'beans', 'big', 'barry', 'cheese', 'biscuits', 'a-bs', 'a', 'bs'];
+        $shouldNotBeIndexed = ['big-', 'big-barry', 'cheese.', 'cheese.biscuits'];
+
+        // Every expected term must have a score entry in the index.
+        foreach ($shouldBeIndexed as $term) {
+            $score = $termScores->get($term);
+            $this->assertNotNull($score, "Failed asserting that \"$term\" is indexed");
+        }
+
+        // None of the unwanted joined forms may have a score entry.
+        foreach ($shouldNotBeIndexed as $term) {
+            $score = $termScores->get($term);
+            $this->assertNull($score, "Failed asserting that \"$term\" is not indexed");
+        }
+    }
}