]> BookStack Code Mirror - bookstack/commitdiff
Vectors: Added command to regenerate for all vectors 5552/head
authorDan Brown <redacted>
Tue, 25 Mar 2025 19:38:32 +0000 (19:38 +0000)
committerDan Brown <redacted>
Tue, 25 Mar 2025 19:38:32 +0000 (19:38 +0000)
Also made the OpenAI embedding and query models configurable.
Tested that the system scales using 86k vector entries.

app/Config/services.php
app/Console/Commands/RegenerateVectorsCommand.php [new file with mode: 0644]
app/Search/Vectors/Services/OpenAiVectorQueryService.php
app/Search/Vectors/VectorQueryServiceProvider.php
app/Search/Vectors/VectorSearchRunner.php
database/migrations/2025_03_24_155748_create_search_vectors_table.php

index a34b243f07d20606ad79a6b46cf80e286ac8c9b8..aafe0bacc99e32eef6bfcf07feb5bf129ed66703 100644 (file)
@@ -30,6 +30,8 @@ return [
     'openai' => [
         'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'),
         'key' => env('OPENAI_KEY', ''),
+        'embedding_model' => env('OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'),
+        'query_model' => env('OPENAI_QUERY_MODEL', 'gpt-4o'),
     ],
 
     'github'   => [
diff --git a/app/Console/Commands/RegenerateVectorsCommand.php b/app/Console/Commands/RegenerateVectorsCommand.php
new file mode 100644 (file)
index 0000000..700d053
--- /dev/null
@@ -0,0 +1,46 @@
+<?php
+
+namespace BookStack\Console\Commands;
+
+use BookStack\Entities\EntityProvider;
+use BookStack\Entities\Models\Entity;
+use BookStack\Search\Vectors\SearchVector;
+use BookStack\Search\Vectors\StoreEntityVectorsJob;
+use Illuminate\Console\Command;
+
+/**
+ * Console command which deletes all stored search vectors and then
+ * dispatches a StoreEntityVectorsJob for every entity of every type.
+ */
+class RegenerateVectorsCommand extends Command
+{
+    /**
+     * The name and signature of the console command.
+     *
+     * @var string
+     */
+    protected $signature = 'bookstack:regenerate-vectors';
+
+    /**
+     * The console command description.
+     *
+     * @var string
+     */
+    protected $description = 'Re-index vectors for all content in the system';
+
+    /**
+     * Execute the console command.
+     *
+     * Removes every existing SearchVector row up-front, then walks each
+     * entity type given by the EntityProvider in chunks of 100 entities,
+     * dispatching one StoreEntityVectorsJob per entity.
+     *
+     * @return int Exit code; success once all jobs have been dispatched.
+     */
+    public function handle(EntityProvider $entityProvider): int
+    {
+        // TODO - Add confirmation before run regarding deletion/time/effort/api-cost etc...
+        SearchVector::query()->delete();
+
+        foreach ($entityProvider->all() as $type => $typeInstance) {
+            $this->info("Creating jobs to store vectors for {$type} data...");
+
+            // Work through the entities 100 at a time rather than fetching them all in one go.
+            $typeInstance->newQuery()->chunkById(100, static function (iterable $entities): void {
+                /** @var Entity $entity */
+                foreach ($entities as $entity) {
+                    dispatch(new StoreEntityVectorsJob($entity));
+                }
+            });
+        }
+
+        return self::SUCCESS;
+    }
+}
index e0e145f3ad7cd03908ba1de2133ebbb593640edd..fea4d5c144515d8dc445c94329484d3ad28e7bbf 100644 (file)
@@ -6,17 +6,26 @@ use BookStack\Http\HttpRequestService;
 
 class OpenAiVectorQueryService implements VectorQueryService
 {
+    protected string $key;
+    protected string $endpoint;
+    protected string $embeddingModel;
+    protected string $queryModel;
+
     public function __construct(
-        protected string $endpoint,
-        protected string $key,
+        protected array $options,
         protected HttpRequestService $http,
     ) {
+        // TODO - Some kind of validation of options
+        $this->key = $this->options['key'] ?? '';
+        $this->endpoint = $this->options['endpoint'] ?? '';
+        $this->embeddingModel = $this->options['embedding_model'] ?? '';
+        $this->queryModel = $this->options['query_model'] ?? '';
     }
 
     protected function jsonRequest(string $method, string $uri, array $data): array
     {
         $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
-        $client = $this->http->buildClient(10);
+        $client = $this->http->buildClient(30);
         $request = $this->http->jsonRequest($method, $fullUrl, $data)
             ->withHeader('Authorization', 'Bearer ' . $this->key);
 
@@ -28,7 +37,7 @@ class OpenAiVectorQueryService implements VectorQueryService
     {
         $response = $this->jsonRequest('POST', 'v1/embeddings', [
             'input' => $text,
-            'model' => 'text-embedding-3-small',
+            'model' => $this->embeddingModel,
         ]);
 
         return $response['data'][0]['embedding'];
@@ -39,15 +48,15 @@ class OpenAiVectorQueryService implements VectorQueryService
         $formattedContext = implode("\n", $context);
 
         $response = $this->jsonRequest('POST', 'v1/chat/completions', [
-            'model' => 'gpt-4o',
+            'model' => $this->queryModel,
             'messages' => [
                 [
                     'role' => 'developer',
-                    'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response.'
+                    'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response. Don\'t try to converse or continue the conversation.'
                 ],
                 [
                     'role' => 'user',
-                    'content' => "Provide a response to the below given QUERY using the below given CONTEXT\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}",
+                    'content' => "Provide a response to the below given QUERY using the below given CONTEXT. The CONTEXT is split into parts via lines. Ignore any nonsensical lines of CONTEXT.\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}",
                 ]
             ],
         ]);
index c700307e1f303aff9a7ba061622b2019e31483e6..eae7149d03cdb6420ac10586320e0b77b620ea5a 100644 (file)
@@ -18,9 +18,7 @@ class VectorQueryServiceProvider
         $service = $this->getServiceName();
 
         if ($service === 'openai') {
-            $key = config('services.openai.key');
-            $endpoint = config('services.openai.endpoint');
-            return new OpenAiVectorQueryService($endpoint, $key, $this->http);
+            return new OpenAiVectorQueryService(config('services.openai'), $this->http);
         }
 
         throw new \Exception("No '{$service}' LLM service found");
index db28779e403c573250673958c26e8370b65a7627..53b1a4bd6962293cf045bba7d5ae716290aae353 100644 (file)
@@ -19,6 +19,7 @@ class VectorSearchRunner
         $topMatches = SearchVector::query()->select('text', 'entity_type', 'entity_id')
             ->selectRaw('VEC_DISTANCE_COSINE(VEC_FROMTEXT("[' . implode(',', $queryVector) . ']"), embedding) as distance')
             ->orderBy('distance', 'asc')
+            ->having('distance', '<', 0.6)
             ->limit(10)
             ->get();
 
index 1b552b22c9a69d24be61e1dd8c530ac6d52e7791..0ae67c2256fc593184d3f15bd95bab5c046968cc 100644 (file)
@@ -21,6 +21,8 @@ return new class extends Migration
         });
 
         $table = DB::getTablePrefix() . 'search_vectors';
+
+        // TODO - Vector size might need to be dynamic
         DB::statement("ALTER TABLE {$table} ADD COLUMN (embedding VECTOR(1536) NOT NULL)");
         DB::statement("ALTER TABLE {$table} ADD VECTOR INDEX (embedding) DISTANCE=cosine");
     }