return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
}
+ /**
+ * Perform a vector/LLM-based query search.
+ */
public function searchQuery(Request $request, VectorSearchRunner $runner)
{
+ // TODO - Validate if query system is active
$query = $request->get('query', '');
if ($query) {
namespace BookStack\Search\Vectors;
+use BookStack\Activity\Models\Tag;
use BookStack\Entities\Models\Entity;
use BookStack\Search\Vectors\Services\VectorQueryService;
use Illuminate\Support\Facades\DB;
];
}
- // TODO - Chunk inserts
- SearchVector::query()->insert($toInsert);
+ $chunks = array_chunk($toInsert, 500);
+ foreach ($chunks as $chunk) {
+ SearchVector::query()->insert($chunk);
+ }
}
/**
*/
protected function chunkText(string $text): array
{
- // TODO - Join adjacent smaller chunks up
- return array_filter(array_map(function (string $section): string {
- return trim($section);
- }, explode("\n", $text)));
+        // Limit each chunk to 500 bytes, splitting on progressively finer
+        // boundaries (lines, full stops, spaces, then raw size) until pieces fit.
+        $chunker = new TextChunker(500, ["\n", '.', ' ', '']);
+
+        return $chunker->chunk($text);
}
protected function entityToPlainText(Entity $entity): string
{
- $text = $entity->name . "\n\n" . $entity->{$entity->textField};
- // TODO - Add tags
- return $text;
+        // Render each tag as "name: value" on its own line. The separator must be
+        // double-quoted: a single-quoted '\n' is a literal backslash followed by "n",
+        // not a newline, which would glue all tags onto one line.
+        $tagText = $entity->tags()->get()
+            ->map(fn (Tag $tag): string => $tag->name . ': ' . $tag->value)
+            ->join("\n");
+
+        // Layout: entity name, then the tag lines, then the entity's main text content.
+        return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField};
}
}
--- /dev/null
+<?php
+
+namespace BookStack\Search\Vectors;
+
+use InvalidArgumentException;
+
+/**
+ * Splits a given string into smaller chunks based on specified delimiters
+ * and a predefined maximum chunk size, measured in bytes. Text is split on the
+ * first delimiter, and any piece which is still too large is broken down further
+ * using each following delimiter in turn. Adjacent pieces are then packed back
+ * together, re-joined by the delimiter which originally separated them, for as
+ * long as the result fits within the chunk size.
+ *
+ * The last delimiter is always an empty string, which splits purely by size,
+ * to ensure text can always be broken down.
+ */
+class TextChunker
+{
+    /**
+     * @param int      $chunkSize      Maximum size, in bytes, of each returned chunk.
+     * @param string[] $delimiterOrder Delimiters to split on, coarsest first.
+     *
+     * @throws InvalidArgumentException If the chunk size is less than 1.
+     */
+    public function __construct(
+        protected int $chunkSize,
+        protected array $delimiterOrder,
+    ) {
+        if ($this->chunkSize < 1) {
+            throw new InvalidArgumentException('Chunk size must be greater than 0');
+        }
+
+        // Re-index so delimiters can be addressed by depth, then guarantee
+        // the size-based fallback delimiter is present at the end.
+        $this->delimiterOrder = array_values($this->delimiterOrder);
+        $lastIndex = count($this->delimiterOrder) - 1;
+        if ($lastIndex < 0 || $this->delimiterOrder[$lastIndex] !== '') {
+            $this->delimiterOrder[] = '';
+        }
+    }
+
+    /**
+     * Break the given text into chunks no larger than the configured chunk size.
+     * Empty chunks are never returned, and delimiters which fall on
+     * a chunk boundary are dropped rather than kept at the chunk edges.
+     *
+     * @return string[]
+     */
+    public function chunk(string $text): array
+    {
+        $chunks = [];
+        $current = ''; // Chunk being built, never ending in a delimiter
+        $pending = ''; // Delimiter(s) which followed $current in the original text
+
+        foreach ($this->splitToPieces($text, 0, '') as [$piece, $following]) {
+            if ($piece === '') {
+                // Consecutive delimiters: remember them so they are restored if more
+                // text joins this chunk, but never start or end a chunk with them.
+                if ($current !== '') {
+                    $pending .= $following;
+                }
+                continue;
+            }
+
+            if ($current === '') {
+                $current = $piece;
+            } elseif (strlen($current) + strlen($pending) + strlen($piece) <= $this->chunkSize) {
+                $current .= $pending . $piece;
+            } else {
+                $chunks[] = $current;
+                $current = $piece;
+            }
+
+            $pending = $following;
+        }
+
+        if ($current !== '') {
+            $chunks[] = $current;
+        }
+
+        return $chunks;
+    }
+
+    /**
+     * Recursively split text into pieces which each fit within the chunk size,
+     * pairing every piece with the delimiter that followed it in the source text.
+     *
+     * @param string $text     Text to split.
+     * @param int    $depth    Index into the delimiter order to split with.
+     * @param string $trailing Delimiter which followed $text as a whole.
+     *
+     * @return array<int, array{0: string, 1: string}>
+     */
+    protected function splitToPieces(string $text, int $depth, string $trailing): array
+    {
+        $delimiter = $this->delimiterOrder[$depth] ?? '';
+        $parts = ($delimiter === '') ? $this->splitBySize($text) : explode($delimiter, $text);
+        $lastIndex = count($parts) - 1;
+        $pieces = [];
+
+        foreach ($parts as $index => $part) {
+            // The final part is followed by whatever followed the text as a whole.
+            $following = ($index === $lastIndex) ? $trailing : $delimiter;
+
+            if (strlen($part) <= $this->chunkSize) {
+                $pieces[] = [$part, $following];
+                continue;
+            }
+
+            // Still too large, so break down using the next delimiter. This always
+            // terminates since the final '' delimiter yields parts within the size.
+            foreach ($this->splitToPieces($part, $depth + 1, $following) as $subPiece) {
+                $pieces[] = $subPiece;
+            }
+        }
+
+        return $pieces;
+    }
+
+    /**
+     * Split text into parts of at most the chunk size in bytes, avoiding
+     * cuts in the middle of a UTF-8 multi-byte character where possible.
+     *
+     * @return string[]
+     */
+    protected function splitBySize(string $text): array
+    {
+        $parts = [];
+        $length = strlen($text);
+        $offset = 0;
+
+        while ($offset < $length) {
+            $end = min($offset + $this->chunkSize, $length);
+
+            // A byte of the form 10xxxxxx continues a UTF-8 character, so cutting
+            // directly before it would split that character. Step back to its start.
+            while ($end < $length && $end > $offset && (ord($text[$end]) & 0xC0) === 0x80) {
+                $end--;
+            }
+
+            if ($end === $offset) {
+                // No character boundary within the size limit (chunk size smaller than
+                // the character, or malformed input), so fall back to a raw byte cut.
+                $end = min($offset + $this->chunkSize, $length);
+            }
+
+            $parts[] = substr($text, $offset, $end - $offset);
+            $offset = $end;
+        }
+
+        return $parts;
+    }
+}
--- /dev/null
+<?php
+
+namespace Search;
+
+use BookStack\Search\Vectors\TextChunker;
+use Tests\TestCase;
+
+/**
+ * Covers the splitting and packing behaviour of the TextChunker.
+ */
+class TextChunkerTest extends TestCase
+{
+    public function test_it_chunks_text(): void
+    {
+        // With no delimiters given, text is split purely by chunk size.
+        $chunker = new TextChunker(3, []);
+
+        $this->assertSame(['123', '456', '789'], $chunker->chunk('123456789'));
+    }
+
+    public function test_chunk_size_must_be_greater_than_zero(): void
+    {
+        $this->expectException(\InvalidArgumentException::class);
+
+        new TextChunker(-5, []);
+    }
+
+    public function test_chunk_size_of_zero_is_rejected(): void
+    {
+        // Zero is the boundary value for the "greater than 0" rule.
+        $this->expectException(\InvalidArgumentException::class);
+
+        new TextChunker(0, []);
+    }
+
+    public function test_it_works_through_given_delimiters(): void
+    {
+        // '-' is used first, then '.', then size-based splitting for what remains too large.
+        $chunker = new TextChunker(5, ['-', '.', '']);
+
+        $this->assertSame(['12', '3456', '789ab', 'cdefg'], $chunker->chunk('12-3456.789abcdefg'));
+    }
+
+    public function test_it_attempts_to_pack_chunks(): void
+    {
+        // Adjacent pieces are re-joined while they fit within the chunk size.
+        $chunker = new TextChunker(8, [' ', '']);
+
+        $this->assertSame(['123 456', '789 abc', 'def'], $chunker->chunk('123 456 789 abc def'));
+    }
+
+    public function test_it_attempts_to_pack_using_subchunks(): void
+    {
+        // A piece produced by a finer delimiter can be packed with the preceding piece.
+        $chunker = new TextChunker(8, [' ', '-', '']);
+
+        $this->assertSame(['123 456', '789abc'], $chunker->chunk('123 456-789abc'));
+    }
+}