Merge a023bed41d0219d08d7dbce52948e3e5c3528381 into fa566f156ad8998d67a63be2856dafc7ce277d88

This commit is contained in:
Dan Brown 2025-04-02 16:23:33 -03:00 committed by GitHub
commit 8d6577c094
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 439 additions and 1 deletions

View File

@ -22,6 +22,18 @@ return [
// Callback URL for social authentication methods
'callback_url' => env('APP_URL', false),
// LLM Service
// Options: openai
'llm' => env('LLM_SERVICE', ''),
// OpenAI API-compatible service details
'openai' => [
'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'),
'key' => env('OPENAI_KEY', ''),
'embedding_model' => env('OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'),
'query_model' => env('OPENAI_QUERY_MODEL', 'gpt-4o'),
],
'github' => [
'client_id' => env('GITHUB_APP_ID', false),
'client_secret' => env('GITHUB_APP_SECRET', false),

View File

@ -0,0 +1,46 @@
<?php
namespace BookStack\Console\Commands;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Search\Vectors\SearchVector;
use BookStack\Search\Vectors\StoreEntityVectorsJob;
use Illuminate\Console\Command;
class RegenerateVectorsCommand extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'bookstack:regenerate-vectors';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Re-index vectors for all content in the system';

    /**
     * Execute the console command.
     *
     * Removes every stored search vector, then queues a StoreEntityVectorsJob
     * for each entity of each type exposed by the entity provider.
     *
     * @return int Command exit code; FAILURE when no LLM service is configured.
     */
    public function handle(EntityProvider $entityProvider): int
    {
        // Without a configured service every queued job would fail, so bail out
        // before the existing vectors are deleted.
        if (!\BookStack\Search\Vectors\VectorQueryServiceProvider::isEnabled()) {
            $this->error('No LLM service is configured; existing vectors have been left untouched.');

            return self::FAILURE;
        }

        // TODO - Add confirmation before run regarding deletion/time/effort/api-cost etc...
        SearchVector::query()->delete();

        foreach ($entityProvider->all() as $type => $typeInstance) {
            $this->info("Creating jobs to store vectors for {$type} data...");

            // Work through the table in id-ordered chunks of 100 to keep memory use bounded.
            $typeInstance->newQuery()->chunkById(100, function ($entities) {
                /** @var Entity[] $entities */
                foreach ($entities as $entity) {
                    dispatch(new StoreEntityVectorsJob($entity));
                }
            });
        }

        return self::SUCCESS;
    }
}

View File

@ -6,6 +6,7 @@ use BookStack\Entities\Queries\PageQueries;
use BookStack\Entities\Queries\QueryPopular;
use BookStack\Entities\Tools\SiblingFetcher;
use BookStack\Http\Controller;
use BookStack\Search\Vectors\VectorSearchRunner;
use Illuminate\Http\Request;
class SearchController extends Controller
@ -139,4 +140,19 @@ class SearchController extends Controller
return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
}
/**
 * Show the LLM-backed search query page.
 *
 * Reads the "query" request parameter and, when it holds non-empty text,
 * passes it to the vector search runner. The view receives the runner's
 * result array, or null when there was nothing to search for.
 */
public function searchQuery(Request $request, VectorSearchRunner $runner)
{
    $query = $request->get('query', '');

    // The parameter can arrive as an array ("?query[]=x") or as null; the runner
    // only accepts a string, so anything else is treated as "no query".
    $query = is_string($query) ? trim($query) : '';

    $results = ($query !== '') ? $runner->run($query) : null;

    return view('search.query', [
        'results' => $results,
    ]);
}
}

View File

@ -6,6 +6,8 @@ use BookStack\Activity\Models\Tag;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Search\Vectors\StoreEntityVectorsJob;
use BookStack\Search\Vectors\VectorQueryServiceProvider;
use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
@ -25,7 +27,7 @@ class SearchIndex
public static string $softDelimiters = ".-";
public function __construct(
protected EntityProvider $entityProvider
protected EntityProvider $entityProvider,
) {
}
@ -37,6 +39,10 @@ class SearchIndex
$this->deleteEntityTerms($entity);
$terms = $this->entityToTermDataArray($entity);
$this->insertTerms($terms);
if (VectorQueryServiceProvider::isEnabled()) {
dispatch(new StoreEntityVectorsJob($entity));
}
}
/**
@ -47,9 +53,15 @@ class SearchIndex
public function indexEntities(array $entities): void
{
$terms = [];
$vectorQueryEnabled = VectorQueryServiceProvider::isEnabled();
foreach ($entities as $entity) {
$entityTerms = $this->entityToTermDataArray($entity);
array_push($terms, ...$entityTerms);
if ($vectorQueryEnabled) {
dispatch(new StoreEntityVectorsJob($entity));
}
}
$this->insertTerms($terms);

View File

@ -0,0 +1,84 @@
<?php
namespace BookStack\Search\Vectors;
use BookStack\Entities\Models\Entity;
use BookStack\Search\Vectors\Services\VectorQueryService;
use Illuminate\Support\Facades\DB;
/**
 * Builds embedding vectors for an entity's text content and stores them
 * as SearchVector rows, replacing any rows previously stored for that entity.
 */
class EntityVectorGenerator
{
    public function __construct(
        protected VectorQueryServiceProvider $vectorQueryServiceProvider
    ) {
    }

    /**
     * Generate embeddings for the given entity and persist them.
     * Existing vectors for the entity are only removed once the new
     * embeddings have been fetched from the vector service.
     */
    public function generateAndStore(Entity $entity): void
    {
        $vectorService = $this->vectorQueryServiceProvider->get();

        $text = $this->entityToPlainText($entity);
        $chunks = $this->chunkText($text);
        $embeddings = $this->chunksToEmbeddings($chunks, $vectorService);

        $this->deleteExistingEmbeddingsForEntity($entity);
        $this->storeEmbeddings($embeddings, $chunks, $entity);
    }

    /**
     * Remove all stored vectors belonging to the given entity.
     */
    protected function deleteExistingEmbeddingsForEntity(Entity $entity): void
    {
        SearchVector::query()
            ->where('entity_type', '=', $entity->getMorphClass())
            ->where('entity_id', '=', $entity->id)
            ->delete();
    }

    /**
     * Insert one row per embedding, pairing each with the text chunk at the same index.
     *
     * @param array<int, array<int, float|int|string>> $embeddings
     * @param array<int, string>                        $textChunks
     */
    protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $entity): void
    {
        $toInsert = [];

        foreach ($embeddings as $index => $embedding) {
            // The vector is embedded in a raw SQL expression, so force every component
            // to a float first: only numeric literals can then reach the statement.
            $components = array_map(static fn ($value): float => (float) $value, $embedding);

            $toInsert[] = [
                'entity_id'   => $entity->id,
                'entity_type' => $entity->getMorphClass(),
                'embedding'   => DB::raw('VEC_FROMTEXT("[' . implode(',', $components) . ']")'),
                'text'        => $textChunks[$index],
            ];
        }

        if (empty($toInsert)) {
            return;
        }

        // TODO - Chunk inserts
        SearchVector::query()->insert($toInsert);
    }

    /**
     * Request an embedding vector for every chunk, keeping the chunk's array key.
     *
     * @param string[] $chunks
     * @return array<int, float[]> Embedding vectors keyed identically to $chunks.
     */
    protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQueryService): array
    {
        $embeddings = [];
        foreach ($chunks as $index => $chunk) {
            $embeddings[$index] = $vectorQueryService->generateEmbeddings($chunk);
        }

        return $embeddings;
    }

    /**
     * Split text into trimmed, non-empty lines.
     * Keys of the returned array are the original line positions, so they may not be sequential.
     *
     * @return string[]
     */
    protected function chunkText(string $text): array
    {
        // TODO - Join adjacent smaller chunks up
        $sections = array_map(
            static fn (string $section): string => trim($section),
            explode("\n", $text)
        );

        // Compare against '' explicitly: a bare array_filter() would also discard a line of "0".
        return array_filter($sections, static fn (string $section): bool => $section !== '');
    }

    /**
     * Combine the entity name and its text field content into a single plain-text string.
     */
    protected function entityToPlainText(Entity $entity): string
    {
        $text = $entity->name . "\n\n" . $entity->{$entity->textField};
        // TODO - Add tags
        return $text;
    }
}

View File

@ -0,0 +1,16 @@
<?php
namespace BookStack\Search\Vectors;
use Illuminate\Database\Eloquent\Model;
/**
 * Eloquent model for a single stored search vector: one chunk of entity
 * text together with the embedding generated for it.
 *
 * @property string $entity_type Morph class of the entity the text came from.
 * @property int $entity_id Id of the entity the text came from.
 * @property string $text The text chunk the embedding was generated from.
 * @property string $embedding Stored vector value; NOTE(review): typed as string here, confirm how the DB driver returns VECTOR columns.
 */
class SearchVector extends Model
{
// No created_at/updated_at handling for this model.
public $timestamps = false;
}

View File

@ -0,0 +1,66 @@
<?php
namespace BookStack\Search\Vectors\Services;
use BookStack\Http\HttpRequestService;
/**
 * VectorQueryService implementation which talks to an OpenAI API-compatible
 * HTTP service for both embedding generation and chat-completion queries.
 */
class OpenAiVectorQueryService implements VectorQueryService
{
    protected string $key;
    protected string $endpoint;
    protected string $embeddingModel;
    protected string $queryModel;

    /**
     * @param array $options Service options; the 'key', 'endpoint', 'embedding_model'
     *                       and 'query_model' entries are read, each defaulting to ''.
     */
    public function __construct(
        protected array $options,
        protected HttpRequestService $http,
    ) {
        // TODO - Some kind of validation of options
        $this->key = $this->options['key'] ?? '';
        $this->endpoint = $this->options['endpoint'] ?? '';
        $this->embeddingModel = $this->options['embedding_model'] ?? '';
        $this->queryModel = $this->options['query_model'] ?? '';
    }

    /**
     * Send a JSON request to the configured endpoint, authenticated with the
     * configured key as a bearer token, and return the decoded JSON response body.
     *
     * @throws \RuntimeException When the response body does not decode to a JSON array/object.
     */
    protected function jsonRequest(string $method, string $uri, array $data): array
    {
        // Normalise slashes so the endpoint and URI join with exactly one separator.
        $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
        $client = $this->http->buildClient(30);
        $request = $this->http->jsonRequest($method, $fullUrl, $data)
            ->withHeader('Authorization', 'Bearer ' . $this->key);

        $response = $client->sendRequest($request);
        $decoded = json_decode($response->getBody()->getContents(), true);

        // json_decode() gives null (or a scalar) for empty/invalid bodies, which cannot
        // satisfy the array return type; fail with a message describing what happened.
        if (!is_array($decoded)) {
            throw new \RuntimeException(sprintf(
                'LLM service returned a non-JSON response (HTTP %d) for request to "%s"',
                $response->getStatusCode(),
                $uri
            ));
        }

        return $decoded;
    }

    /**
     * Generate an embedding vector for the given text using the configured embedding model.
     *
     * @return float[]
     *
     * @throws \RuntimeException When the response does not contain an embedding.
     */
    public function generateEmbeddings(string $text): array
    {
        $response = $this->jsonRequest('POST', 'v1/embeddings', [
            'input' => $text,
            'model' => $this->embeddingModel,
        ]);

        $embedding = $response['data'][0]['embedding'] ?? null;
        if (!is_array($embedding) || empty($embedding)) {
            // Surface the service's own error message when one was provided.
            $detail = $response['error']['message'] ?? null;
            $detail = is_string($detail) ? $detail : 'no embedding data in response';

            throw new \RuntimeException("Failed to generate embeddings: {$detail}");
        }

        return $embedding;
    }

    /**
     * Ask the configured query model to answer the input using the given context lines.
     * Returns an empty string when the response holds no message content.
     *
     * @param string[] $context
     */
    public function query(string $input, array $context): string
    {
        $formattedContext = implode("\n", $context);

        $response = $this->jsonRequest('POST', 'v1/chat/completions', [
            'model' => $this->queryModel,
            'messages' => [
                [
                    'role' => 'developer',
                    'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response. Don\'t try to converse or continue the conversation.'
                ],
                [
                    'role' => 'user',
                    'content' => "Provide a response to the below given QUERY using the below given CONTEXT. The CONTEXT is split into parts via lines. Ignore any nonsensical lines of CONTEXT.\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}",
                ]
            ],
        ]);

        return $response['choices'][0]['message']['content'] ?? '';
    }
}

View File

@ -0,0 +1,21 @@
<?php
namespace BookStack\Search\Vectors\Services;
/**
 * Contract for a service which can both produce embedding vectors
 * for text and answer a query via an LLM using supplied context.
 */
interface VectorQueryService
{
/**
 * Generate embedding vectors from the given chunk of text.
 *
 * @param string $text The text to generate an embedding for.
 * @return float[] The components of the generated embedding vector.
 */
public function generateEmbeddings(string $text): array;
/**
 * Query the LLM service using the given user input, and
 * relevant context text retrieved locally via a vector search.
 * Returns the response output text from the LLM.
 *
 * @param string $input The user-provided query text.
 * @param string[] $context Text snippets to provide to the LLM alongside the query.
 * @return string The response text from the LLM.
 */
public function query(string $input, array $context): string;
}

View File

@ -0,0 +1,28 @@
<?php
namespace BookStack\Search\Vectors;
use BookStack\Entities\Models\Entity;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
/**
 * Queued job which generates and stores search vectors for a single entity.
 */
class StoreEntityVectorsJob implements ShouldQueue
{
    use Queueable;

    /**
     * Silently discard the job if the entity no longer exists when the job
     * is picked up, instead of failing with a model-not-found error.
     * An entity can be removed between the job being queued and processed.
     */
    public bool $deleteWhenMissingModels = true;

    /**
     * Create a new job instance.
     */
    public function __construct(
        protected Entity $entity
    ) {
    }

    /**
     * Execute the job.
     * The generator is resolved and passed in by the caller of handle().
     */
    public function handle(EntityVectorGenerator $generator): void
    {
        $generator->generateAndStore($this->entity);
    }
}

View File

@ -0,0 +1,36 @@
<?php
namespace BookStack\Search\Vectors;
use BookStack\Http\HttpRequestService;
use BookStack\Search\Vectors\Services\OpenAiVectorQueryService;
use BookStack\Search\Vectors\Services\VectorQueryService;
/**
 * Resolves the VectorQueryService implementation selected via the
 * 'services.llm' configuration value.
 */
class VectorQueryServiceProvider
{
    public function __construct(
        protected HttpRequestService $http,
    ) {
    }

    /**
     * Build the configured vector query service.
     *
     * @throws \Exception When the configured service name is not supported.
     */
    public function get(): VectorQueryService
    {
        $service = static::getServiceName();

        return match ($service) {
            // Fall back to an empty option set if the openai config block is absent.
            'openai' => new OpenAiVectorQueryService(config('services.openai') ?? [], $this->http),
            default  => throw new \Exception("No '{$service}' LLM service found"),
        };
    }

    /**
     * Get the configured service name, lower-cased and trimmed.
     * A missing or null configuration value results in an empty string.
     */
    protected static function getServiceName(): string
    {
        // Cast first: config() returns null when the key is absent, and passing
        // null to strtolower() is deprecated.
        return strtolower(trim((string) config('services.llm', '')));
    }

    /**
     * Check whether any LLM service name has been configured.
     * This does not verify that the configured name is a supported service.
     */
    public static function isEnabled(): bool
    {
        return static::getServiceName() !== '';
    }
}

View File

@ -0,0 +1,34 @@
<?php
namespace BookStack\Search\Vectors;
/**
 * Runs a vector similarity search for a text query, then asks the
 * LLM service to respond to the query using the matched text as context.
 */
class VectorSearchRunner
{
    public function __construct(
        protected VectorQueryServiceProvider $vectorQueryServiceProvider
    ) {
    }

    /**
     * Run the given query.
     *
     * @return array{llm_result: string, entity_matches: array} The LLM response text, and the
     *         matched vector rows (text, entity_type, entity_id, distance) as arrays.
     */
    public function run(string $query): array
    {
        $queryService = $this->vectorQueryServiceProvider->get();
        $queryVector = $queryService->generateEmbeddings($query);

        // Coerce each component to a float and hand the vector text to the database as a
        // bound parameter, so service-provided data is never concatenated into SQL.
        $components = array_map(static fn ($value): float => (float) $value, $queryVector);
        $vectorText = '[' . implode(',', $components) . ']';

        // TODO - Apply permissions
        // TODO - Join models
        $topMatches = SearchVector::query()->select('text', 'entity_type', 'entity_id')
            ->selectRaw('VEC_DISTANCE_COSINE(VEC_FROMTEXT(?), embedding) as distance', [$vectorText])
            ->orderBy('distance', 'asc')
            ->having('distance', '<', 0.6)
            ->limit(10)
            ->get();

        $matchesText = array_values(array_map(
            static fn (SearchVector $match) => $match->text,
            $topMatches->all()
        ));
        $llmResult = $queryService->query($query, $matchesText);

        return [
            'llm_result' => $llmResult,
            'entity_matches' => $topMatches->toArray(),
        ];
    }
}

View File

@ -0,0 +1,37 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Run the migrations.
     * Creates the search_vectors table, then adds the vector column and
     * its index via raw statements.
     */
    public function up(): void
    {
        // TODO - Handle compatibility with older databases that don't support vectors
        Schema::create('search_vectors', function (Blueprint $table) {
            $table->string('entity_type', 100);
            $table->integer('entity_id');
            $table->text('text');

            $table->index(['entity_type', 'entity_id']);
        });

        // The DB facade is referenced by its full class name since this file does not
        // import it; that avoids depending on a global "DB" class alias being registered.
        // Raw statements bypass the schema builder, so the table prefix is applied manually.
        $table = \Illuminate\Support\Facades\DB::getTablePrefix() . 'search_vectors';

        // TODO - Vector size might need to be dynamic
        \Illuminate\Support\Facades\DB::statement("ALTER TABLE {$table} ADD COLUMN (embedding VECTOR(1536) NOT NULL)");
        \Illuminate\Support\Facades\DB::statement("ALTER TABLE {$table} ADD VECTOR INDEX (embedding) DISTANCE=cosine");
    }

    /**
     * Reverse the migrations.
     */
    public function down(): void
    {
        Schema::dropIfExists('search_vectors');
    }
};

View File

@ -0,0 +1,29 @@
@extends('layouts.simple')
@section('body')
{{-- Search query page: a simple query form, followed by the results when provided. --}}
<div class="container mt-xl" id="search-system">
{{-- Submits the "query" field via GET to /search/query --}}
<form action="{{ url('/search/query') }}" method="get">
<input name="query" type="text">
<button class="button">Query</button>
</form>
{{-- $results is expected to hold 'llm_result' and 'entity_matches' keys when truthy --}}
@if($results)
<h2>Results</h2>
<h3>LLM Output</h3>
<p>{{ $results['llm_result'] }}</p>
<h3>Entity Matches</h3>
{{-- Each match provides entity_type, entity_id, distance and the matched text --}}
@foreach($results['entity_matches'] as $match)
<div>
<div><strong>{{ $match['entity_type'] }}:{{ $match['entity_id'] }}; Distance: {{ $match['distance'] }}</strong></div>
<details>
<summary>match text</summary>
<div>{{ $match['text'] }}</div>
</details>
</div>
@endforeach
@endif
</div>
@stop

View File

@ -187,6 +187,7 @@ Route::middleware('auth')->group(function () {
// Search
Route::get('/search', [SearchController::class, 'search']);
Route::get('/search/query', [SearchController::class, 'searchQuery']);
Route::get('/search/book/{bookId}', [SearchController::class, 'searchBook']);
Route::get('/search/chapter/{bookId}', [SearchController::class, 'searchChapter']);
Route::get('/search/entity/siblings', [SearchController::class, 'searchSiblings']);