mirror of https://github.com/sysown/proxysql
These documents serve as blueprints for implementing RAG (Retrieval-Augmented Generation) capabilities in ProxySQL: - schema.sql: Database schema for RAG implementation - rag_ingest.cpp: PoC ingester blueprint to be integrated into ProxySQL - architecture-data-model.md: Data model architecture for RAG - architecture-runtime-retrieval.md: Runtime retrieval architecture - mcp-tools.md: MCP tools integration design - sql-examples.md: SQL usage examples for RAG - embeddings-design.md: Embeddings design for vector search These files will guide the upcoming RAG implementation in ProxySQL.pull/5318/head
parent
994bafa31f
commit
803115f504
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,172 @@
|
||||
-- ============================================================
|
||||
-- ProxySQL RAG Index Schema (SQLite)
|
||||
-- v0: documents + chunks + FTS5 + sqlite3-vec embeddings
|
||||
-- ============================================================
|
||||
|
||||
-- Connection settings applied before any of the DDL below runs.
-- Journal in write-ahead-log mode.
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
-- Enforce the REFERENCES clauses declared on the rag_* tables.
PRAGMA foreign_keys = ON;
|
||||
|
||||
-- ============================================================
|
||||
-- 1) rag_sources: control plane
|
||||
-- Defines where to fetch from + how to transform + chunking.
|
||||
-- ============================================================
|
||||
CREATE TABLE IF NOT EXISTS rag_sources (
  source_id       INTEGER,
  -- Unique source label, e.g. "stack_posts".
  name            TEXT    NOT NULL,
  enabled         INTEGER NOT NULL DEFAULT 1,

  -- Backend to retrieve from (PoC: connect directly; later can be "via ProxySQL").
  -- backend_type is "mysql" | "postgres" | ...
  backend_type    TEXT    NOT NULL,
  backend_host    TEXT    NOT NULL,
  backend_port    INTEGER NOT NULL,
  backend_user    TEXT    NOT NULL,
  backend_pass    TEXT    NOT NULL,
  -- Database/schema name on the backend.
  backend_db      TEXT    NOT NULL,

  -- Source table (e.g. "posts") and its primary-key column (e.g. "Id").
  table_name      TEXT    NOT NULL,
  pk_column       TEXT    NOT NULL,

  -- Optional ingestion filter; appended to the SELECT as WHERE <where_sql>,
  -- e.g. "PostTypeId IN (1,2)".
  where_sql       TEXT,

  -- REQUIRED: JSON spec mapping a source row to rag_documents fields
  -- (doc_id, title/body concat, metadata pick/rename, etc.).
  doc_map_json    TEXT    NOT NULL,

  -- REQUIRED: JSON chunking strategy (enabled, chunk_size, overlap, etc.).
  chunking_json   TEXT    NOT NULL,

  -- Optional: JSON embedding strategy (how to build embedding input text).
  -- Can stay NULL/empty in v0 and be defined later without schema changes.
  embedding_json  TEXT,

  created_at      INTEGER NOT NULL DEFAULT (unixepoch()),
  updated_at      INTEGER NOT NULL DEFAULT (unixepoch()),

  PRIMARY KEY (source_id),
  UNIQUE (name)
);
|
||||
|
||||
-- Secondary indexes on rag_sources: by enabled flag, and by backend location.
CREATE INDEX IF NOT EXISTS idx_rag_sources_enabled ON rag_sources (enabled);

CREATE INDEX IF NOT EXISTS idx_rag_sources_backend ON rag_sources (
  backend_type,
  backend_host,
  backend_port,
  backend_db,
  table_name
);
|
||||
|
||||
|
||||
-- ============================================================
|
||||
-- 2) rag_documents: canonical documents
|
||||
-- One document per source row (e.g. one per posts.Id).
|
||||
-- ============================================================
|
||||
CREATE TABLE IF NOT EXISTS rag_documents (
  -- Stable document key, e.g. "posts:12345".
  doc_id         TEXT,
  source_id      INTEGER NOT NULL,
  -- Copy of rag_sources.name, kept here for convenience.
  source_name    TEXT    NOT NULL,
  -- Primary key of the source row as JSON, e.g. {"Id":12345}.
  pk_json        TEXT    NOT NULL,

  title          TEXT,
  body           TEXT,
  -- JSON object.
  metadata_json  TEXT    NOT NULL DEFAULT '{}',

  updated_at     INTEGER NOT NULL DEFAULT (unixepoch()),
  deleted        INTEGER NOT NULL DEFAULT 0,

  PRIMARY KEY (doc_id),
  FOREIGN KEY (source_id) REFERENCES rag_sources (source_id)
);
|
||||
|
||||
-- Secondary indexes on rag_documents, both led by source_id.
CREATE INDEX IF NOT EXISTS idx_rag_documents_source_updated ON rag_documents (source_id, updated_at);

CREATE INDEX IF NOT EXISTS idx_rag_documents_source_deleted ON rag_documents (source_id, deleted);
|
||||
|
||||
|
||||
-- ============================================================
|
||||
-- 3) rag_chunks: chunked content
|
||||
-- The unit we index in FTS and vectors.
|
||||
-- ============================================================
|
||||
CREATE TABLE IF NOT EXISTS rag_chunks (
  -- Chunk key, e.g. "posts:12345#0".
  chunk_id       TEXT,
  doc_id         TEXT    NOT NULL,
  source_id      INTEGER NOT NULL,

  -- Position of the chunk within its document: 0..N-1.
  chunk_index    INTEGER NOT NULL,
  title          TEXT,
  body           TEXT    NOT NULL,

  -- Optional per-chunk metadata (e.g. offsets, has_code, section label).
  metadata_json  TEXT    NOT NULL DEFAULT '{}',

  updated_at     INTEGER NOT NULL DEFAULT (unixepoch()),
  deleted        INTEGER NOT NULL DEFAULT 0,

  PRIMARY KEY (chunk_id),
  FOREIGN KEY (doc_id) REFERENCES rag_documents (doc_id),
  FOREIGN KEY (source_id) REFERENCES rag_sources (source_id)
);
|
||||
|
||||
-- A document can hold only one chunk at each chunk_index.
CREATE UNIQUE INDEX IF NOT EXISTS uq_rag_chunks_doc_idx ON rag_chunks (doc_id, chunk_index);

-- Non-unique lookups: by (source, document) and by the deleted flag.
CREATE INDEX IF NOT EXISTS idx_rag_chunks_source_doc ON rag_chunks (source_id, doc_id);

CREATE INDEX IF NOT EXISTS idx_rag_chunks_deleted ON rag_chunks (deleted);
|
||||
|
||||
|
||||
-- ============================================================
|
||||
-- 4) rag_fts_chunks: FTS5 index (stores its own copy of the indexed text; not contentless)
|
||||
-- Maintained explicitly by the ingester.
|
||||
-- Notes:
|
||||
-- - chunk_id is stored but UNINDEXED.
|
||||
-- - Use bm25(rag_fts_chunks) for ranking.
|
||||
-- ============================================================
|
||||
-- Full-text index over chunk title and body.
-- chunk_id is carried in each row but declared UNINDEXED, so it is not tokenized.
CREATE VIRTUAL TABLE IF NOT EXISTS rag_fts_chunks USING fts5(
  chunk_id UNINDEXED,
  title,
  body,
  tokenize = 'unicode61'
);
|
||||
|
||||
|
||||
-- ============================================================
|
||||
-- 5) rag_vec_chunks: sqlite3-vec index
|
||||
-- Stores embeddings per chunk for vector search.
|
||||
--
|
||||
-- IMPORTANT:
|
||||
-- - dimension must match your embedding model (example: 1536).
|
||||
-- - metadata columns are included to help join/filter.
|
||||
-- ============================================================
|
||||
-- Vector index: one embedding per chunk plus a few lookup columns.
--   embedding                     : float vector of 1536 dimensions; change the
--                                   dimension here if you use another one
--   chunk_id                      : join key back to rag_chunks
--   doc_id, source_id, updated_at : optional convenience columns
CREATE VIRTUAL TABLE IF NOT EXISTS rag_vec_chunks USING vec0(
  embedding float[1536],
  chunk_id TEXT,
  doc_id TEXT,
  source_id INTEGER,
  updated_at INTEGER
);
|
||||
|
||||
-- Optional: convenience view for debugging / SQL access patterns
|
||||
-- One row per live chunk joined to its parent document. A row is hidden when
-- either the chunk or its document is flagged deleted.
CREATE VIEW IF NOT EXISTS rag_chunk_view AS
SELECT
  c.chunk_id,
  c.doc_id,
  c.source_id,
  d.source_name,
  d.pk_json,
  -- The chunk title wins; fall back to the document title when it is NULL.
  COALESCE(c.title, d.title) AS title,
  c.body,
  d.metadata_json AS doc_metadata_json,
  c.metadata_json AS chunk_metadata_json,
  c.updated_at,
  -- chunk_id is TEXT, so "posts:12345#10" sorts before "posts:12345#2".
  -- Expose chunk_index so callers can read a document's chunks in order.
  -- It is appended last to keep the positions of the existing columns.
  c.chunk_index
FROM rag_chunks AS c
INNER JOIN rag_documents AS d
  ON d.doc_id = c.doc_id
WHERE c.deleted = 0
  AND d.deleted = 0;
|
||||
|
||||
|
||||
-- ============================================================
|
||||
-- 6) (Optional) sync state placeholder for later incremental ingestion
|
||||
-- Not used in v0, but reserving it avoids later schema churn.
|
||||
-- ============================================================
|
||||
CREATE TABLE IF NOT EXISTS rag_sync_state (
  source_id    INTEGER,
  -- 'poll' | 'cdc'
  mode         TEXT NOT NULL DEFAULT 'poll',
  -- Watermark/checkpoint, stored as JSON.
  cursor_json  TEXT NOT NULL DEFAULT '{}',
  last_ok_at   INTEGER,
  last_error   TEXT,

  PRIMARY KEY (source_id),
  FOREIGN KEY (source_id) REFERENCES rag_sources (source_id)
);
|
||||
|
||||
@ -0,0 +1,348 @@
|
||||
# ProxySQL RAG Index — SQL Examples (FTS, Vectors, Hybrid)
|
||||
|
||||
This file provides concrete SQL examples for querying the ProxySQL-hosted SQLite RAG index directly (for debugging, internal dashboards, or SQL-native applications).
|
||||
|
||||
The **preferred interface for AI agents** remains MCP tools (`mcp-tools.md`). SQL access should typically be restricted to trusted callers.
|
||||
|
||||
Assumed tables:
|
||||
- `rag_documents`
|
||||
- `rag_chunks`
|
||||
- `rag_fts_chunks` (FTS5)
|
||||
- `rag_vec_chunks` (sqlite3-vec vec0 table)
|
||||
|
||||
---
|
||||
|
||||
## 0. Common joins and inspection
|
||||
|
||||
### 0.1 Inspect one document and its chunks
|
||||
```sql
|
||||
SELECT * FROM rag_documents WHERE doc_id = 'posts:12345';
|
||||
SELECT * FROM rag_chunks WHERE doc_id = 'posts:12345' ORDER BY chunk_index;
|
||||
```
|
||||
|
||||
### 0.2 Use the convenience view (if enabled)
|
||||
```sql
|
||||
SELECT * FROM rag_chunk_view WHERE doc_id = 'posts:12345' ORDER BY chunk_id;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 1. FTS5 examples
|
||||
|
||||
### 1.1 Basic FTS search (top 10)
|
||||
```sql
|
||||
SELECT
|
||||
f.chunk_id,
|
||||
bm25(rag_fts_chunks) AS score_fts_raw
|
||||
FROM rag_fts_chunks f
|
||||
WHERE rag_fts_chunks MATCH 'json_extract mysql'
|
||||
ORDER BY score_fts_raw
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
### 1.2 Join FTS results to chunk text and document metadata
|
||||
```sql
|
||||
SELECT
|
||||
f.chunk_id,
|
||||
bm25(rag_fts_chunks) AS score_fts_raw,
|
||||
c.doc_id,
|
||||
COALESCE(c.title, d.title) AS title,
|
||||
c.body AS chunk_body,
|
||||
d.metadata_json AS doc_metadata_json
|
||||
FROM rag_fts_chunks f
|
||||
JOIN rag_chunks c ON c.chunk_id = f.chunk_id
|
||||
JOIN rag_documents d ON d.doc_id = c.doc_id
|
||||
WHERE rag_fts_chunks MATCH 'json_extract mysql'
|
||||
AND c.deleted = 0 AND d.deleted = 0
|
||||
ORDER BY score_fts_raw
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
### 1.3 Apply a source filter (by source_id)
|
||||
```sql
|
||||
SELECT
|
||||
f.chunk_id,
|
||||
bm25(rag_fts_chunks) AS score_fts_raw
|
||||
FROM rag_fts_chunks f
|
||||
JOIN rag_chunks c ON c.chunk_id = f.chunk_id
|
||||
WHERE rag_fts_chunks MATCH 'replication lag'
|
||||
AND c.source_id = 1
|
||||
ORDER BY score_fts_raw
|
||||
LIMIT 20;
|
||||
```
|
||||
|
||||
### 1.4 Phrase queries, boolean operators (FTS5)
|
||||
```sql
|
||||
-- phrase
|
||||
SELECT chunk_id FROM rag_fts_chunks
|
||||
WHERE rag_fts_chunks MATCH '"group replication"'
|
||||
LIMIT 20;
|
||||
|
||||
-- boolean: term1 AND term2
|
||||
SELECT chunk_id FROM rag_fts_chunks
|
||||
WHERE rag_fts_chunks MATCH 'mysql AND deadlock'
|
||||
LIMIT 20;
|
||||
|
||||
-- boolean: term1 NOT term2
|
||||
SELECT chunk_id FROM rag_fts_chunks
|
||||
WHERE rag_fts_chunks MATCH 'mysql NOT mariadb'
|
||||
LIMIT 20;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Vector search examples (sqlite3-vec)
|
||||
|
||||
Vector SQL varies slightly depending on sqlite3-vec build and how you bind vectors.
|
||||
Below are **two patterns** you can implement in ProxySQL.
|
||||
|
||||
### 2.1 Pattern A (recommended): ProxySQL computes embeddings; SQL receives a bound vector
|
||||
In this pattern, ProxySQL:
|
||||
1) Computes the query embedding in C++
|
||||
2) Executes SQL with a bound parameter `:qvec` representing the embedding
|
||||
|
||||
A typical “nearest neighbors” query shape is:
|
||||
|
||||
```sql
|
||||
-- PSEUDOCODE: adapt to sqlite3-vec's exact operator/function in your build.
|
||||
SELECT
|
||||
v.chunk_id,
|
||||
v.distance AS distance_raw
|
||||
FROM rag_vec_chunks v
|
||||
WHERE v.embedding MATCH :qvec
|
||||
ORDER BY distance_raw
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
Then join to chunks:
|
||||
```sql
|
||||
-- PSEUDOCODE: join with content and metadata
|
||||
SELECT
|
||||
v.chunk_id,
|
||||
v.distance AS distance_raw,
|
||||
c.doc_id,
|
||||
c.body AS chunk_body,
|
||||
d.metadata_json AS doc_metadata_json
|
||||
FROM (
|
||||
SELECT chunk_id, distance
|
||||
FROM rag_vec_chunks
|
||||
WHERE embedding MATCH :qvec
|
||||
ORDER BY distance
|
||||
LIMIT 10
|
||||
) v
|
||||
JOIN rag_chunks c ON c.chunk_id = v.chunk_id
|
||||
JOIN rag_documents d ON d.doc_id = c.doc_id;
|
||||
```
|
||||
|
||||
### 2.2 Pattern B (debug): store a query vector in a temporary table
|
||||
This is useful when you want to run vector queries manually in SQL without MCP support.
|
||||
|
||||
```sql
|
||||
CREATE TEMP TABLE tmp_query_vec(qvec BLOB);
-- Insert the query vector (float32 array blob). The insertion is usually done by tooling, not manually.
-- INSERT INTO tmp_query_vec VALUES (X'...');

-- PSEUDOCODE: use tmp_query_vec.qvec as the query embedding.
-- The vector is read through a scalar subquery rather than a comma join, so
-- rag_vec_chunks stays the only table in FROM.
-- NOTE(review): sqlite3-vec KNN queries need the row limit on the vec0 table
-- itself; `k = 10` states it explicitly instead of relying on LIMIT being
-- passed down through a join. Confirm against your sqlite3-vec build.
SELECT
  v.chunk_id,
  v.distance
FROM rag_vec_chunks AS v
WHERE v.embedding MATCH (SELECT t.qvec FROM tmp_query_vec AS t LIMIT 1)
  AND k = 10
ORDER BY v.distance;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Hybrid search examples
|
||||
|
||||
Hybrid retrieval is best implemented in the MCP layer because it mixes ranking systems and needs careful bounding.
|
||||
However, you can approximate hybrid behavior using SQL to validate logic.
|
||||
|
||||
### 3.1 Hybrid Mode A: Parallel FTS + Vector then fuse (RRF)
|
||||
|
||||
#### Step 1: FTS top 50 (ranked)
|
||||
```sql
|
||||
WITH fts AS (
|
||||
SELECT
|
||||
f.chunk_id,
|
||||
bm25(rag_fts_chunks) AS score_fts_raw
|
||||
FROM rag_fts_chunks f
|
||||
WHERE rag_fts_chunks MATCH :fts_query
|
||||
ORDER BY score_fts_raw
|
||||
LIMIT 50
|
||||
)
|
||||
SELECT * FROM fts;
|
||||
```
|
||||
|
||||
#### Step 2: Vector top 50 (ranked)
|
||||
```sql
|
||||
WITH vec AS (
|
||||
SELECT
|
||||
v.chunk_id,
|
||||
v.distance AS distance_raw
|
||||
FROM rag_vec_chunks v
|
||||
WHERE v.embedding MATCH :qvec
|
||||
ORDER BY v.distance
|
||||
LIMIT 50
|
||||
)
|
||||
SELECT * FROM vec;
|
||||
```
|
||||
|
||||
#### Step 3: Fuse via Reciprocal Rank Fusion (RRF)
|
||||
In SQL you need ranks. SQLite supports window functions such as `ROW_NUMBER()` since version 3.25.0.
|
||||
|
||||
```sql
|
||||
-- Reciprocal Rank Fusion (RRF) of the FTS and vector top-50 lists.
--   :fts_query  FTS5 query string
--   :qvec       query embedding (PSEUDOCODE: adapt MATCH to your sqlite3-vec build)
WITH
-- Top 50 FTS hits. The cut-off uses an explicit ORDER BY so that LIMIT keeps
-- the 50 best-scoring rows instead of an arbitrary 50.
fts_top AS (
  SELECT
    f.chunk_id,
    bm25(rag_fts_chunks) AS score_fts_raw
  FROM rag_fts_chunks AS f
  WHERE rag_fts_chunks MATCH :fts_query
  ORDER BY score_fts_raw
  LIMIT 50
),
-- Rank the FTS hits (1 = best; lower bm25() is better). Ranking runs over the
-- already-selected rows, so the window function is kept out of the FTS query.
fts AS (
  SELECT
    chunk_id,
    score_fts_raw,
    ROW_NUMBER() OVER (ORDER BY score_fts_raw, chunk_id) AS rank_fts
  FROM fts_top
),
-- Top 50 nearest vectors, selected the same way.
vec_top AS (
  SELECT
    v.chunk_id,
    v.distance AS distance_raw
  FROM rag_vec_chunks AS v
  WHERE v.embedding MATCH :qvec
  ORDER BY v.distance
  LIMIT 50
),
-- Rank the vector hits (1 = nearest).
vec AS (
  SELECT
    chunk_id,
    distance_raw,
    ROW_NUMBER() OVER (ORDER BY distance_raw, chunk_id) AS rank_vec
  FROM vec_top
),
-- Union of both lists; a chunk found by only one side has NULLs for the other.
merged AS (
  SELECT
    COALESCE(fts.chunk_id, vec.chunk_id) AS chunk_id,
    fts.rank_fts,
    vec.rank_vec,
    fts.score_fts_raw,
    vec.distance_raw
  FROM fts
  FULL OUTER JOIN vec
    ON vec.chunk_id = fts.chunk_id
),
-- RRF score with k = 60. A list that did not return the chunk contributes 0.
rrf AS (
  SELECT
    chunk_id,
    score_fts_raw,
    distance_raw,
    rank_fts,
    rank_vec,
    COALESCE(1.0 / (60.0 + rank_fts), 0.0)
      + COALESCE(1.0 / (60.0 + rank_vec), 0.0) AS score_rrf
  FROM merged
)
SELECT
  r.chunk_id,
  r.score_rrf,
  c.doc_id,
  c.body AS chunk_body
FROM rrf AS r
INNER JOIN rag_chunks AS c
  ON c.chunk_id = r.chunk_id
-- chunk_id breaks ties so repeated runs return the same order.
ORDER BY r.score_rrf DESC, r.chunk_id
LIMIT 10;
|
||||
```
|
||||
|
||||
**Important**: SQLite supports `FULL OUTER JOIN` only from version 3.39.0 onward; older builds reject this query.
|
||||
For production, implement the merge/fuse in C++ (MCP layer). This SQL is illustrative.
|
||||
|
||||
### 3.2 Hybrid Mode B: Broad FTS then vector rerank (candidate generation)
|
||||
|
||||
#### Step 1: FTS candidate set (top 200)
|
||||
```sql
|
||||
WITH candidates AS (
|
||||
SELECT
|
||||
f.chunk_id,
|
||||
bm25(rag_fts_chunks) AS score_fts_raw
|
||||
FROM rag_fts_chunks f
|
||||
WHERE rag_fts_chunks MATCH :fts_query
|
||||
ORDER BY score_fts_raw
|
||||
LIMIT 200
|
||||
)
|
||||
SELECT * FROM candidates;
|
||||
```
|
||||
|
||||
#### Step 2: Vector rerank within candidates
|
||||
Conceptually:
|
||||
- Join candidates to `rag_vec_chunks` and compute distance to `:qvec`
|
||||
- Keep top 10
|
||||
|
||||
```sql
|
||||
-- Rerank the FTS candidate set by vector distance to :qvec.
-- PSEUDOCODE: assumes sqlite3-vec's scalar vec_distance_L2() is available and
-- that rag_vec_chunks uses the default (L2) metric; adapt to your build.
WITH candidates AS (
  SELECT
    f.chunk_id
  FROM rag_fts_chunks AS f
  WHERE rag_fts_chunks MATCH :fts_query
  ORDER BY bm25(rag_fts_chunks)
  LIMIT 200
),
reranked AS (
  -- A KNN `MATCH :qvec ... LIMIT 10` here would first pick the 10 nearest
  -- chunks of the whole index and only then intersect them with the
  -- candidates, usually leaving few or no rows. Computing the distance for
  -- each candidate row reranks exactly the candidate set.
  -- NOTE(review): this reads rag_vec_chunks without a KNN constraint, which is
  -- presumably a scan; acceptable for validation, not for production.
  SELECT
    v.chunk_id,
    vec_distance_L2(v.embedding, :qvec) AS distance_raw
  FROM candidates AS cand
  INNER JOIN rag_vec_chunks AS v
    ON v.chunk_id = cand.chunk_id
  ORDER BY distance_raw
  LIMIT 10
)
SELECT
  r.chunk_id,
  r.distance_raw,
  ch.doc_id,
  ch.body
FROM reranked AS r
INNER JOIN rag_chunks AS ch
  ON ch.chunk_id = r.chunk_id
-- Keep the reranked order in the final result.
ORDER BY r.distance_raw;
|
||||
```
|
||||
|
||||
As above, the exact `MATCH :qvec` syntax may need adaptation to your sqlite3-vec build; implement vector query execution in C++ and keep SQL as internal glue.
|
||||
|
||||
---
|
||||
|
||||
## 4. Common “application-friendly” queries
|
||||
|
||||
### 4.1 Return doc_id + score + title only (no bodies)
|
||||
```sql
|
||||
-- Top 20 FTS hits with doc_id, title and raw score; chunk bodies are omitted.
-- :q is the FTS5 query string.
SELECT
  f.chunk_id,
  c.doc_id,
  COALESCE(c.title, d.title) AS title,
  bm25(rag_fts_chunks) AS score_fts_raw
FROM rag_fts_chunks AS f
INNER JOIN rag_chunks AS c
  ON c.chunk_id = f.chunk_id
INNER JOIN rag_documents AS d
  ON d.doc_id = c.doc_id
WHERE rag_fts_chunks MATCH :q
  -- Skip soft-deleted chunks and documents, as example 1.2 does.
  AND c.deleted = 0
  AND d.deleted = 0
ORDER BY score_fts_raw
LIMIT 20;
|
||||
```
|
||||
|
||||
### 4.2 Return top doc_ids (deduplicate by doc_id)
|
||||
```sql
|
||||
-- Top 20 documents for :q, one row per doc_id, scored by the document's
-- best-matching chunk.
WITH ranked_chunks AS (
  -- Best 200 live chunks; soft-deleted chunks and documents are skipped so
  -- they cannot take up slots in the candidate list.
  SELECT
    c.doc_id,
    bm25(rag_fts_chunks) AS score_fts_raw
  FROM rag_fts_chunks AS f
  INNER JOIN rag_chunks AS c
    ON c.chunk_id = f.chunk_id
  INNER JOIN rag_documents AS d
    ON d.doc_id = c.doc_id
  WHERE rag_fts_chunks MATCH :q
    AND c.deleted = 0
    AND d.deleted = 0
  ORDER BY score_fts_raw
  LIMIT 200
)
-- Lower bm25() is better, so MIN() picks each document's best chunk score.
SELECT
  doc_id,
  MIN(score_fts_raw) AS best_score
FROM ranked_chunks
GROUP BY doc_id
ORDER BY best_score
LIMIT 20;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Practical guidance
|
||||
|
||||
- Use SQL mode mainly for debugging and internal tooling.
|
||||
- Prefer MCP tools for agent interaction:
|
||||
- stable schemas
|
||||
- strong guardrails
|
||||
- consistent hybrid scoring
|
||||
- Implement hybrid fusion in C++ (not in SQL) to avoid dialect limitations and to keep scoring correct.
|
||||
Loading…
Reference in new issue