Also made the embedding and query models configurable via environment variables.
Tested that the system scales by running it against ~86k vector entries.
'openai' => [
'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'),
'key' => env('OPENAI_KEY', ''),
+ 'embedding_model' => env('OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'),
+ 'query_model' => env('OPENAI_QUERY_MODEL', 'gpt-4o'),
],
'github' => [
--- /dev/null
+<?php
+
+namespace BookStack\Console\Commands;
+
+use BookStack\Entities\EntityProvider;
+use BookStack\Entities\Models\Entity;
+use BookStack\Search\Vectors\SearchVector;
+use BookStack\Search\Vectors\StoreEntityVectorsJob;
+use Illuminate\Console\Command;
+
+/**
+ * Artisan command that wipes all stored search vectors and queues
+ * background jobs to regenerate them for every entity in the system.
+ *
+ * NOTE(review): regeneration is asynchronous — vectors are rebuilt only as
+ * the queued StoreEntityVectorsJob jobs are processed, not by the time this
+ * command exits, so vector search results are incomplete until the queue drains.
+ */
+class RegenerateVectorsCommand extends Command
+{
+    /**
+     * The name and signature of the console command.
+     *
+     * @var string
+     */
+    protected $signature = 'bookstack:regenerate-vectors';
+
+    /**
+     * The console command description.
+     *
+     * @var string
+     */
+    protected $description = 'Re-index vectors for all content in the system';
+
+    /**
+     * Execute the console command.
+     *
+     * Deletes every existing SearchVector row, then dispatches one
+     * StoreEntityVectorsJob per entity, chunked 100 entities at a time,
+     * for each entity type exposed by the EntityProvider.
+     */
+    public function handle(EntityProvider $entityProvider)
+    {
+        // TODO - Add confirmation before run regarding deletion/time/effort/api-cost etc...
+        // Remove all existing vectors up-front; content is not vector-searchable
+        // again until the jobs dispatched below have re-populated the table.
+        SearchVector::query()->delete();
+
+        $types = $entityProvider->all();
+        foreach ($types as $type => $typeInstance) {
+            $this->info("Creating jobs to store vectors for {$type} data...");
+            // chunkById avoids loading every entity of a type into memory at once.
+            $typeInstance->newQuery()->chunkById(100, function ($entities) {
+                /** @var Entity[] $entities */
+                foreach ($entities as $entity) {
+                    dispatch(new StoreEntityVectorsJob($entity));
+                }
+            });
+        }
+    }
+}
 class OpenAiVectorQueryService implements VectorQueryService
 {
+    // Values unpacked from the $options config array in the constructor.
+    // Sourced from config('services.openai') — see config hunk for keys.
+    protected string $key;
+    protected string $endpoint;
+    protected string $embeddingModel;
+    protected string $queryModel;
+
     public function __construct(
-        protected string $endpoint,
-        protected string $key,
+        protected array $options,
         protected HttpRequestService $http,
     ) {
+        // TODO - Some kind of validation of options
+        // NOTE(review): a missing option silently becomes '' here, which will
+        // only surface later as a failed API request rather than a clear
+        // configuration error — validation above would fail fast instead.
+        $this->key = $this->options['key'] ?? '';
+        $this->endpoint = $this->options['endpoint'] ?? '';
+        $this->embeddingModel = $this->options['embedding_model'] ?? '';
+        $this->queryModel = $this->options['query_model'] ?? '';
     }
protected function jsonRequest(string $method, string $uri, array $data): array
{
$fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
- $client = $this->http->buildClient(10);
+ $client = $this->http->buildClient(30);
$request = $this->http->jsonRequest($method, $fullUrl, $data)
->withHeader('Authorization', 'Bearer ' . $this->key);
{
$response = $this->jsonRequest('POST', 'v1/embeddings', [
'input' => $text,
- 'model' => 'text-embedding-3-small',
+ 'model' => $this->embeddingModel,
]);
return $response['data'][0]['embedding'];
$formattedContext = implode("\n", $context);
$response = $this->jsonRequest('POST', 'v1/chat/completions', [
- 'model' => 'gpt-4o',
+ 'model' => $this->queryModel,
'messages' => [
[
'role' => 'developer',
- 'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response.'
+ 'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response. Don\'t try to converse or continue the conversation.'
],
[
'role' => 'user',
- 'content' => "Provide a response to the below given QUERY using the below given CONTEXT\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}",
+ 'content' => "Provide a response to the below given QUERY using the below given CONTEXT. The CONTEXT is split into parts via lines. Ignore any nonsensical lines of CONTEXT.\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}",
]
],
]);
$service = $this->getServiceName();
if ($service === 'openai') {
- $key = config('services.openai.key');
- $endpoint = config('services.openai.endpoint');
- return new OpenAiVectorQueryService($endpoint, $key, $this->http);
+ return new OpenAiVectorQueryService(config('services.openai'), $this->http);
}
throw new \Exception("No '{$service}' LLM service found");
$topMatches = SearchVector::query()->select('text', 'entity_type', 'entity_id')
->selectRaw('VEC_DISTANCE_COSINE(VEC_FROMTEXT("[' . implode(',', $queryVector) . ']"), embedding) as distance')
->orderBy('distance', 'asc')
+ ->having('distance', '<', 0.6)
->limit(10)
->get();
});
$table = DB::getTablePrefix() . 'search_vectors';
+
+    // TODO - Vector size is hard-coded for 1536-dimension embeddings (e.g. the
+    // default text-embedding-3-small model); may need to be dynamic to match
+    // the configured embedding model's output dimensions.
DB::statement("ALTER TABLE {$table} ADD COLUMN (embedding VECTOR(1536) NOT NULL)");
DB::statement("ALTER TABLE {$table} ADD VECTOR INDEX (embedding) DISTANCE=cosine");
}