BookStack Code Mirror - bookstack/commitdiff
Improved vector text chunking vectors 5552/head
author Dan Brown <redacted>
Tue, 19 Aug 2025 10:04:14 +0000 (11:04 +0100)
committer Dan Brown <redacted>
Tue, 19 Aug 2025 10:04:14 +0000 (11:04 +0100)
app/Search/SearchController.php
app/Search/Vectors/EntityVectorGenerator.php
app/Search/Vectors/TextChunker.php [new file with mode: 0644]
tests/Search/TextChunkerTest.php [new file with mode: 0644]

index a688385e7c37d43f90e1084e8415d37bffea80e6..b5b2b76b679fc51d95fe56766dda9571741fa007 100644 (file)
@@ -141,8 +141,12 @@ class SearchController extends Controller
         return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
     }
 
+    /**
+     * Perform a vector/LLM-based query search.
+     */
     public function searchQuery(Request $request, VectorSearchRunner $runner)
     {
+        // TODO - Validate if query system is active
         $query = $request->get('query', '');
 
         if ($query) {
index 9563694a321838539a0a9bf7c7daff5c97a172bc..5f2a7c17817689a0e389caf832191d9c51f18124 100644 (file)
@@ -2,6 +2,7 @@
 
 namespace BookStack\Search\Vectors;
 
+use BookStack\Activity\Models\Tag;
 use BookStack\Entities\Models\Entity;
 use BookStack\Search\Vectors\Services\VectorQueryService;
 use Illuminate\Support\Facades\DB;
@@ -47,8 +48,10 @@ class EntityVectorGenerator
             ];
         }
 
-        // TODO - Chunk inserts
-        SearchVector::query()->insert($toInsert);
+        $chunks = array_chunk($toInsert, 500);
+        foreach ($chunks as $chunk) {
+            SearchVector::query()->insert($chunk);
+        }
     }
 
     /**
@@ -69,16 +72,16 @@ class EntityVectorGenerator
      */
     protected function chunkText(string $text): array
     {
-        // TODO - Join adjacent smaller chunks up
-        return array_filter(array_map(function (string $section): string {
-            return trim($section);
-        }, explode("\n", $text)));
+        return (new TextChunker(500, ["\n", '.', ' ', '']))->chunk($text);
     }
 
     protected function entityToPlainText(Entity $entity): string
     {
-        $text = $entity->name . "\n\n" . $entity->{$entity->textField};
-        // TODO - Add tags
-        return $text;
+        $tags = $entity->tags()->get();
+        $tagText = $tags->map(function (Tag $tag) {
+            return $tag->name . ': ' . $tag->value;
+        })->join('\n');
+
+        return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField};
     }
 }
diff --git a/app/Search/Vectors/TextChunker.php b/app/Search/Vectors/TextChunker.php
new file mode 100644 (file)
index 0000000..3ddf1ad
--- /dev/null
@@ -0,0 +1,77 @@
+<?php
+
+namespace BookStack\Search\Vectors;
+
+use InvalidArgumentException;
+
+/**
+ * Splits a given string into smaller chunks based on specified delimiters
+ * and a predefined maximum chunk size. Delimiters are worked through in the
+ * given order: text is split on the first delimiter, and any part that is still
+ * larger than the chunk size is split further using the next delimiter, and so on.
+ * The resulting pieces are then packed, in their original order and with their
+ * original delimiters, into chunks which do not exceed the chunk size.
+ *
+ * The last delimiter is always an empty string to ensure text can always be broken down.
+ * Lengths are measured in UTF-8 characters so that multibyte characters are never cut apart.
+ */
+class TextChunker
+{
+    protected const ENCODING = 'UTF-8';
+
+    /**
+     * @param int      $chunkSize      Maximum length of a returned chunk, in characters.
+     * @param string[] $delimiterOrder Delimiters to split upon, in order of preference.
+     *
+     * @throws InvalidArgumentException If the chunk size is lower than 1.
+     */
+    public function __construct(
+        protected int $chunkSize,
+        protected array $delimiterOrder,
+    ) {
+        if ($this->chunkSize < 1) {
+            throw new InvalidArgumentException('Chunk size must be greater than 0');
+        }
+
+        // Normalise to a zero-indexed list and guarantee the empty-string fallback.
+        $this->delimiterOrder = array_values($this->delimiterOrder);
+        if (end($this->delimiterOrder) !== '') {
+            $this->delimiterOrder[] = '';
+        }
+    }
+
+    /**
+     * Break the given text down into chunks no longer than the chunk size.
+     * Delimiters sitting on the edge of a chunk are removed, and chunks left
+     * without any content are not returned.
+     *
+     * @return string[]
+     */
+    public function chunk(string $text): array
+    {
+        $chunks = [];
+        $current = '';       // Raw text of the chunk being built, delimiters included
+        $currentLength = 0;  // Length of $current in characters
+        $edgeDelimiter = ''; // Last non-empty delimiter added to $current
+
+        foreach ($this->splitIntoPieces($text, 0) as [$content, $delimiter]) {
+            $contentLength = mb_strlen($content, self::ENCODING);
+
+            // Start a new chunk when this piece would push the current one over the limit.
+            // The trailing delimiter of the piece is not counted here, since it is
+            // stripped again if the piece ends up being last in its chunk.
+            if ($current !== '' && $currentLength + $contentLength > $this->chunkSize) {
+                $finished = $this->stripDelimiter($current, $edgeDelimiter);
+                if ($finished !== '') {
+                    $chunks[] = $finished;
+                }
+
+                $current = '';
+                $currentLength = 0;
+                $edgeDelimiter = '';
+            }
+
+            $current .= $content . $delimiter;
+            $currentLength += $contentLength + mb_strlen($delimiter, self::ENCODING);
+            if ($delimiter !== '') {
+                $edgeDelimiter = $delimiter;
+            }
+        }
+
+        $finished = $this->stripDelimiter($current, $edgeDelimiter);
+        if ($finished !== '') {
+            $chunks[] = $finished;
+        }
+
+        return $chunks;
+    }
+
+    /**
+     * Split text into ordered pieces, none longer than the chunk size, using the
+     * delimiter at the given level and falling through to later delimiters for
+     * any part which is still too large.
+     * Each piece is paired with the delimiter which followed it in the original text
+     * (an empty string where nothing followed it), so that joining every piece and
+     * delimiter back together reproduces the input exactly.
+     *
+     * @return array<int, array{0: string, 1: string}>
+     */
+    protected function splitIntoPieces(string $text, int $level): array
+    {
+        $delimiter = $this->delimiterOrder[$level] ?? '';
+        $pieces = [];
+
+        // Final fallback: hard split at the chunk size, on character boundaries.
+        if ($delimiter === '') {
+            foreach (mb_str_split($text, $this->chunkSize, self::ENCODING) as $part) {
+                $pieces[] = [$part, ''];
+            }
+
+            return $pieces;
+        }
+
+        $parts = explode($delimiter, $text);
+        $lastIndex = count($parts) - 1;
+
+        foreach ($parts as $index => $part) {
+            // Every part except the last was followed by the delimiter in the source text.
+            $trailing = ($index === $lastIndex) ? '' : $delimiter;
+
+            if (mb_strlen($part, self::ENCODING) <= $this->chunkSize) {
+                $pieces[] = [$part, $trailing];
+                continue;
+            }
+
+            // Too large for a single chunk, so break it down using the next delimiter.
+            // An oversized part is never empty, so at least one sub-piece is returned.
+            $subPieces = $this->splitIntoPieces($part, $level + 1);
+            // The last sub-piece is followed by this level's delimiter, not a deeper one.
+            $subPieces[count($subPieces) - 1][1] = $trailing;
+            foreach ($subPieces as $subPiece) {
+                $pieces[] = $subPiece;
+            }
+        }
+
+        return $pieces;
+    }
+
+    /**
+     * Remove all whole occurrences of the given delimiter from both ends of the chunk.
+     * The delimiter is matched as a complete string rather than as a set of characters,
+     * so multi-character and multibyte delimiters never eat into neighbouring content.
+     */
+    protected function stripDelimiter(string $chunk, string $delimiter): string
+    {
+        if ($delimiter === '') {
+            return $chunk;
+        }
+
+        $length = strlen($delimiter);
+        while (str_starts_with($chunk, $delimiter)) {
+            $chunk = substr($chunk, $length);
+        }
+        while (str_ends_with($chunk, $delimiter)) {
+            $chunk = substr($chunk, 0, -$length);
+        }
+
+        return $chunk;
+    }
+}
diff --git a/tests/Search/TextChunkerTest.php b/tests/Search/TextChunkerTest.php
new file mode 100644 (file)
index 0000000..f78bf11
--- /dev/null
@@ -0,0 +1,47 @@
+<?php
+
+namespace Search;
+
+use BookStack\Search\Vectors\TextChunker;
+use Tests\TestCase;
+
+class TextChunkerTest extends TestCase
+{
+    /**
+     * Without any delimiters, text is cut into pieces of the chunk size.
+     */
+    public function test_it_chunks_text()
+    {
+        $result = (new TextChunker(3, []))->chunk('123456789');
+
+        $this->assertEquals(['123', '456', '789'], $result);
+    }
+
+    /**
+     * A chunk size below one is rejected at construction.
+     */
+    public function test_chunk_size_must_be_greater_than_zero()
+    {
+        $this->expectException(\InvalidArgumentException::class);
+
+        new TextChunker(-5, []);
+    }
+
+    /**
+     * Oversized parts are broken down using each following delimiter in turn.
+     */
+    public function test_it_works_through_given_delimiters()
+    {
+        $result = (new TextChunker(5, ['-', '.', '']))->chunk('12-3456.789abcdefg');
+
+        $this->assertEquals(['12', '3456', '789ab', 'cdefg'], $result);
+    }
+
+    /**
+     * Neighbouring parts are combined where they fit within the chunk size.
+     */
+    public function test_it_attempts_to_pack_chunks()
+    {
+        $result = (new TextChunker(8, [' ', '']))->chunk('123 456 789 abc def');
+
+        $this->assertEquals(['123 456', '789 abc', 'def'], $result);
+    }
+
+    /**
+     * Pieces produced by a later delimiter are packed alongside earlier parts.
+     */
+    public function test_it_attempts_to_pack_using_subchunks()
+    {
+        $result = (new TextChunker(8, [' ', '-', '']))->chunk('123 456-789abc');
+
+        $this->assertEquals(['123 456', '789abc'], $result);
+    }
+}