mirror of https://github.com/sysown/proxysql
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
173 lines
6.2 KiB
173 lines
6.2 KiB
-- ============================================================
|
|
-- ProxySQL RAG Index Schema (SQLite)
|
|
-- v0: documents + chunks + FTS5 + sqlite3-vec embeddings
|
|
-- ============================================================
|
|
|
|
PRAGMA foreign_keys = ON;
|
|
PRAGMA journal_mode = WAL;
|
|
PRAGMA synchronous = NORMAL;
|
|
|
|
-- ============================================================
|
|
-- 1) rag_sources: control plane
|
|
-- Defines where to fetch from + how to transform + chunking.
|
|
-- ============================================================
|
|
CREATE TABLE IF NOT EXISTS rag_sources (
|
|
source_id INTEGER PRIMARY KEY,
|
|
name TEXT NOT NULL UNIQUE, -- e.g. "stack_posts"
|
|
enabled INTEGER NOT NULL DEFAULT 1,
|
|
|
|
-- Where to retrieve from (PoC: connect directly; later can be "via ProxySQL")
|
|
backend_type TEXT NOT NULL, -- "mysql" | "postgres" | ...
|
|
backend_host TEXT NOT NULL,
|
|
backend_port INTEGER NOT NULL,
|
|
backend_user TEXT NOT NULL,
|
|
backend_pass TEXT NOT NULL,
|
|
backend_db TEXT NOT NULL, -- database/schema name
|
|
|
|
table_name TEXT NOT NULL, -- e.g. "posts"
|
|
pk_column TEXT NOT NULL, -- e.g. "Id"
|
|
|
|
-- Optional: restrict ingestion; appended to SELECT as WHERE <where_sql>
|
|
where_sql TEXT, -- e.g. "PostTypeId IN (1,2)"
|
|
|
|
-- REQUIRED: mapping from source row -> rag_documents fields
|
|
-- JSON spec describing doc_id, title/body concat, metadata pick/rename, etc.
|
|
doc_map_json TEXT NOT NULL,
|
|
|
|
-- REQUIRED: chunking strategy (enabled, chunk_size, overlap, etc.)
|
|
chunking_json TEXT NOT NULL,
|
|
|
|
-- Optional: embedding strategy (how to build embedding input text)
|
|
-- In v0 you can keep it NULL/empty; define later without schema changes.
|
|
embedding_json TEXT,
|
|
|
|
created_at INTEGER NOT NULL DEFAULT (unixepoch()),
|
|
updated_at INTEGER NOT NULL DEFAULT (unixepoch())
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_rag_sources_enabled
|
|
ON rag_sources(enabled);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_rag_sources_backend
|
|
ON rag_sources(backend_type, backend_host, backend_port, backend_db, table_name);
|
|
|
|
|
|
-- ============================================================
|
|
-- 2) rag_documents: canonical documents
|
|
-- One document per source row (e.g. one per posts.Id).
|
|
-- ============================================================
|
|
CREATE TABLE IF NOT EXISTS rag_documents (
|
|
doc_id TEXT PRIMARY KEY, -- stable: e.g. "posts:12345"
|
|
source_id INTEGER NOT NULL REFERENCES rag_sources(source_id),
|
|
source_name TEXT NOT NULL, -- copy of rag_sources.name for convenience
|
|
pk_json TEXT NOT NULL, -- e.g. {"Id":12345}
|
|
|
|
title TEXT,
|
|
body TEXT,
|
|
metadata_json TEXT NOT NULL DEFAULT '{}', -- JSON object
|
|
|
|
updated_at INTEGER NOT NULL DEFAULT (unixepoch()),
|
|
deleted INTEGER NOT NULL DEFAULT 0
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_rag_documents_source_updated
|
|
ON rag_documents(source_id, updated_at);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_rag_documents_source_deleted
|
|
ON rag_documents(source_id, deleted);
|
|
|
|
|
|
-- ============================================================
|
|
-- 3) rag_chunks: chunked content
|
|
-- The unit we index in FTS and vectors.
|
|
-- ============================================================
|
|
CREATE TABLE IF NOT EXISTS rag_chunks (
|
|
chunk_id TEXT PRIMARY KEY, -- e.g. "posts:12345#0"
|
|
doc_id TEXT NOT NULL REFERENCES rag_documents(doc_id),
|
|
source_id INTEGER NOT NULL REFERENCES rag_sources(source_id),
|
|
|
|
chunk_index INTEGER NOT NULL, -- 0..N-1
|
|
title TEXT,
|
|
body TEXT NOT NULL,
|
|
|
|
-- Optional per-chunk metadata (e.g. offsets, has_code, section label)
|
|
metadata_json TEXT NOT NULL DEFAULT '{}',
|
|
|
|
updated_at INTEGER NOT NULL DEFAULT (unixepoch()),
|
|
deleted INTEGER NOT NULL DEFAULT 0
|
|
);
|
|
|
|
CREATE UNIQUE INDEX IF NOT EXISTS uq_rag_chunks_doc_idx
|
|
ON rag_chunks(doc_id, chunk_index);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_rag_chunks_source_doc
|
|
ON rag_chunks(source_id, doc_id);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_rag_chunks_deleted
|
|
ON rag_chunks(deleted);
|
|
|
|
|
|
-- ============================================================
|
|
-- 4) rag_fts_chunks: FTS5 index (contentless)
|
|
-- Maintained explicitly by the ingester.
|
|
-- Notes:
|
|
-- - chunk_id is stored but UNINDEXED.
|
|
-- - Use bm25(rag_fts_chunks) for ranking.
|
|
-- ============================================================
|
|
CREATE VIRTUAL TABLE IF NOT EXISTS rag_fts_chunks
|
|
USING fts5(
|
|
chunk_id UNINDEXED,
|
|
title,
|
|
body,
|
|
tokenize = 'unicode61'
|
|
);
|
|
|
|
|
|
-- ============================================================
|
|
-- 5) rag_vec_chunks: sqlite3-vec index
|
|
-- Stores embeddings per chunk for vector search.
|
|
--
|
|
-- IMPORTANT:
|
|
-- - dimension must match your embedding model (example: 1536).
|
|
-- - metadata columns are included to help join/filter.
|
|
-- ============================================================
|
|
CREATE VIRTUAL TABLE IF NOT EXISTS rag_vec_chunks
|
|
USING vec0(
|
|
embedding float[1536], -- change if you use another dimension
|
|
chunk_id TEXT, -- join key back to rag_chunks
|
|
doc_id TEXT, -- optional convenience
|
|
source_id INTEGER, -- optional convenience
|
|
updated_at INTEGER -- optional convenience
|
|
);
|
|
|
|
-- Optional: convenience view for debugging / SQL access patterns
|
|
CREATE VIEW IF NOT EXISTS rag_chunk_view AS
|
|
SELECT
|
|
c.chunk_id,
|
|
c.doc_id,
|
|
c.source_id,
|
|
d.source_name,
|
|
d.pk_json,
|
|
COALESCE(c.title, d.title) AS title,
|
|
c.body,
|
|
d.metadata_json AS doc_metadata_json,
|
|
c.metadata_json AS chunk_metadata_json,
|
|
c.updated_at
|
|
FROM rag_chunks c
|
|
JOIN rag_documents d ON d.doc_id = c.doc_id
|
|
WHERE c.deleted = 0 AND d.deleted = 0;
|
|
|
|
|
|
-- ============================================================
|
|
-- 6) (Optional) sync state placeholder for later incremental ingestion
|
|
-- Not used in v0, but reserving it avoids later schema churn.
|
|
-- ============================================================
|
|
CREATE TABLE IF NOT EXISTS rag_sync_state (
|
|
source_id INTEGER PRIMARY KEY REFERENCES rag_sources(source_id),
|
|
mode TEXT NOT NULL DEFAULT 'poll', -- 'poll' | 'cdc'
|
|
cursor_json TEXT NOT NULL DEFAULT '{}', -- watermark/checkpoint
|
|
last_ok_at INTEGER,
|
|
last_error TEXT
|
|
);
|
|
|