From: Dan Brown
Date: Tue, 19 Aug 2025 10:04:14 +0000 (+0100)
Subject: Improved vector text chunking
X-Git-Url: http://source.bookstackapp.com/bookstack/commitdiff_plain/refs/pull/5552/head?ds=sidebyside

Improved vector text chunking
---

diff --git a/app/Search/SearchController.php b/app/Search/SearchController.php
index a688385e7..b5b2b76b6 100644
--- a/app/Search/SearchController.php
+++ b/app/Search/SearchController.php
@@ -141,8 +141,12 @@ class SearchController extends Controller
         return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
     }
 
+    /**
+     * Perform a vector/LLM-based query search.
+     */
     public function searchQuery(Request $request, VectorSearchRunner $runner)
     {
+        // TODO - Validate if query system is active
         $query = $request->get('query', '');
 
         if ($query) {
diff --git a/app/Search/Vectors/EntityVectorGenerator.php b/app/Search/Vectors/EntityVectorGenerator.php
index 9563694a3..5f2a7c178 100644
--- a/app/Search/Vectors/EntityVectorGenerator.php
+++ b/app/Search/Vectors/EntityVectorGenerator.php
@@ -2,6 +2,7 @@
 
 namespace BookStack\Search\Vectors;
 
+use BookStack\Activity\Models\Tag;
 use BookStack\Entities\Models\Entity;
 use BookStack\Search\Vectors\Services\VectorQueryService;
 use Illuminate\Support\Facades\DB;
@@ -47,8 +48,10 @@ class EntityVectorGenerator
             ];
         }
 
-        // TODO - Chunk inserts
-        SearchVector::query()->insert($toInsert);
+        $chunks = array_chunk($toInsert, 500);
+        foreach ($chunks as $chunk) {
+            SearchVector::query()->insert($chunk);
+        }
     }
 
     /**
@@ -69,16 +72,16 @@ class EntityVectorGenerator
      */
     protected function chunkText(string $text): array
     {
-        // TODO - Join adjacent smaller chunks up
-        return array_filter(array_map(function (string $section): string {
-            return trim($section);
-        }, explode("\n", $text)));
+        return (new TextChunker(500, ["\n", '.', ' ', '']))->chunk($text);
     }
 
     protected function entityToPlainText(Entity $entity): string
     {
-        $text = $entity->name . "\n\n" . $entity->{$entity->textField};
-        // TODO - Add tags
-        return $text;
+        $tags = $entity->tags()->get();
+        $tagText = $tags->map(function (Tag $tag) {
+            return $tag->name . ': ' . $tag->value;
+        })->join("\n");
+
+        return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField};
     }
 }
diff --git a/app/Search/Vectors/TextChunker.php b/app/Search/Vectors/TextChunker.php
new file mode 100644
index 000000000..3ddf1ad4a
--- /dev/null
+++ b/app/Search/Vectors/TextChunker.php
@@ -0,0 +1,77 @@
+delimiterOrder) === 0 || $this->delimiterOrder[count($this->delimiterOrder) - 1] !== '') {
+            $this->delimiterOrder[] = '';
+        }
+
+        if ($this->chunkSize < 1) {
+            throw new InvalidArgumentException('Chunk size must be greater than 0');
+        }
+    }
+
+    public function chunk(string $text): array
+    {
+        $delimiter = $this->delimiterOrder[0];
+        $delimiterLength = strlen($delimiter);
+        $lines = ($delimiter === '') ? str_split($text, $this->chunkSize) : explode($delimiter, $text);
+
+        $cChunk = ''; // Current chunk
+        $cLength = 0; // Current chunk length
+        $chunks = []; // Chunks to return
+        $lDelim = ''; // Last delimiter
+
+        foreach ($lines as $index => $line) {
+            $lineLength = strlen($line);
+            if ($cLength + $lineLength + $delimiterLength <= $this->chunkSize) {
+                $cChunk .= $line . $delimiter;
+                $cLength += $lineLength + $delimiterLength;
+                $lDelim = $delimiter;
+            } else if ($lineLength <= $this->chunkSize) {
+                $chunks[] = trim($cChunk, $delimiter);
+                $cChunk = $line . $delimiter;
+                $cLength = $lineLength + $delimiterLength;
+                $lDelim = $delimiter;
+            } else {
+                $subChunks = new static($this->chunkSize, array_slice($this->delimiterOrder, 1));
+                $subDelimiter = $this->delimiterOrder[1] ?? '';
+                $subDelimiterLength = strlen($subDelimiter);
+                foreach ($subChunks->chunk($line) as $subChunk) {
+                    $chunkLength = strlen($subChunk);
+                    if ($cLength + $chunkLength + $subDelimiterLength <= $this->chunkSize) {
+                        $cChunk .= $subChunk . $subDelimiter;
+                        $cLength += $chunkLength + $subDelimiterLength;
+                        $lDelim = $subDelimiter;
+                    } else {
+                        $chunks[] = trim($cChunk, $lDelim);
+                        $cChunk = $subChunk . $subDelimiter;
+                        $cLength = $chunkLength + $subDelimiterLength;
+                        $lDelim = $subDelimiter;
+                    }
+                }
+            }
+        }
+
+        if ($cChunk !== '') {
+            $chunks[] = trim($cChunk, $lDelim);
+        }
+
+        return $chunks;
+    }
+}
diff --git a/tests/Search/TextChunkerTest.php b/tests/Search/TextChunkerTest.php
new file mode 100644
index 000000000..f78bf11a4
--- /dev/null
+++ b/tests/Search/TextChunkerTest.php
@@ -0,0 +1,47 @@
+chunk('123456789');
+
+        $this->assertEquals(['123', '456', '789'], $chunks);
+    }
+
+    public function test_chunk_size_must_be_greater_than_zero()
+    {
+        $this->expectException(\InvalidArgumentException::class);
+        $chunker = new TextChunker(-5, []);
+    }
+
+    public function test_it_works_through_given_delimiters()
+    {
+        $chunker = new TextChunker(5, ['-', '.', '']);
+        $chunks = $chunker->chunk('12-3456.789abcdefg');
+
+        $this->assertEquals(['12', '3456', '789ab', 'cdefg'], $chunks);
+    }
+
+    public function test_it_attempts_to_pack_chunks()
+    {
+        $chunker = new TextChunker(8, [' ', '']);
+        $chunks = $chunker->chunk('123 456 789 abc def');
+
+        $this->assertEquals(['123 456', '789 abc', 'def'], $chunks);
+    }
+
+    public function test_it_attempts_to_pack_using_subchunks()
+    {
+        $chunker = new TextChunker(8, [' ', '-', '']);
+        $chunks = $chunker->chunk('123 456-789abc');
+
+        $this->assertEquals(['123 456', '789abc'], $chunks);
+    }
+}