#!/usr/bin/env bash
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${ROOT_DIR}/.." && pwd)"
MYSQL_BIN="${MYSQL_BIN:-mysql}"
# SQLite Server (MySQL protocol gateway) connection parameters
SQLITE_SERVER_HOST="${SQLITE_SERVER_HOST:-127.0.0.1}"
SQLITE_SERVER_PORT="${SQLITE_SERVER_PORT:-6030}"
SQLITE_SERVER_USER="${SQLITE_SERVER_USER:-root}"
SQLITE_SERVER_PASS="${SQLITE_SERVER_PASS:-root}"
SQLITE_SERVER_DB="${SQLITE_SERVER_DB:-rag_db}"
# MySQL backend connection parameters (for sample data)
MYSQL_HOST="${MYSQL_HOST:-127.0.0.1}"
MYSQL_PORT="${MYSQL_PORT:-3306}"
MYSQL_USER="${MYSQL_USER:-root}"
MYSQL_PASS="${MYSQL_PASS:-root}"
# rag_ingest binary
RAG_INGEST="${RAG_INGEST:-${ROOT_DIR}/rag_ingest}"
# Embedding provider configuration (for phase 4/5)
EMBEDDING_PROVIDER="${EMBEDDING_PROVIDER:-stub}"
EMBEDDING_DIM="${EMBEDDING_DIM:-1536}"
OPENAI_API_BASE="${OPENAI_API_BASE:-https://api.synthetic.new/openai/v1}"
OPENAI_API_KEY="${OPENAI_API_KEY:-}"
OPENAI_MODEL="${OPENAI_MODEL:-hf:nomic-ai/nomic-embed-text-v1.5}"
OPENAI_EMBEDDING_DIM="${OPENAI_EMBEDDING_DIM:-}"
if [[ -z "${OPENAI_EMBEDDING_DIM}" ]]; then
if [[ "${OPENAI_MODEL}" == "hf:nomic-ai/nomic-embed-text-v1.5" ]]; then
OPENAI_EMBEDDING_DIM=768
else
OPENAI_EMBEDDING_DIM="${EMBEDDING_DIM}"
fi
fi
# Uncomment to test OpenAI-compatible embeddings
# export EMBEDDING_PROVIDER=openai
# export EMBEDDING_DIM=1536
# export OPENAI_API_BASE="https://api.synthetic.new/openai/v1"
# export OPENAI_API_KEY="your_api_key_here"
# export OPENAI_MODEL="hf:nomic-ai/nomic-embed-text-v1.5"
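# All of the settings above can also be overridden per invocation (values illustrative):
#   EMBEDDING_PROVIDER=openai OPENAI_API_KEY=your_key bash <this-script>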
# Helper: Run SQL via SQLite Server (MySQL protocol)
run_sqlite_server() {
  local sql="$1"
  "${MYSQL_BIN}" \
    -h"${SQLITE_SERVER_HOST}" -P"${SQLITE_SERVER_PORT}" \
    -u"${SQLITE_SERVER_USER}" -p"${SQLITE_SERVER_PASS}" \
    -D"${SQLITE_SERVER_DB}" \
    -e "${sql}"
}
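# Usage example (as in the phases below): run_sqlite_server "SELECT COUNT(*) FROM rag_documents;"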
# Helper: Get single value from SQLite Server
run_sqlite_server_value() {
  local sql="$1"
  local db="${2:-${SQLITE_SERVER_DB}}"
  "${MYSQL_BIN}" \
    -h"${SQLITE_SERVER_HOST}" -P"${SQLITE_SERVER_PORT}" \
    -u"${SQLITE_SERVER_USER}" -p"${SQLITE_SERVER_PASS}" \
    -D"${db}" \
    -N -s -e "${sql}"
}
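# Note: -N (--skip-column-names) and -s (--silent) make the mysql client emit the bare
# value only, which is what assert_eq compares against.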
# Helper: Run SQL via SQLite Server with custom database
run_sqlite_server_with_db() {
  local db="$1"
  local sql="$2"
  "${MYSQL_BIN}" \
    -h"${SQLITE_SERVER_HOST}" -P"${SQLITE_SERVER_PORT}" \
    -u"${SQLITE_SERVER_USER}" -p"${SQLITE_SERVER_PASS}" \
    -D"${db}" \
    -e "${sql}"
}
# Helper: Initialize schema and insert source configuration
apply_schema_and_source() {
  local where_sql="$1"
  local load_schema="$2"
  local chunking_json_override="${3:-}"
  local embedding_json_override="${4:-}"
  echo "==> SQLite Server: ${SQLITE_SERVER_HOST}:${SQLITE_SERVER_PORT}/${SQLITE_SERVER_DB}"
  echo "==> load_schema: ${load_schema}"
  echo "==> where_sql: ${where_sql:-<empty>}"
  local chunking_json_value='{"enabled":false,"unit":"chars","chunk_size":4000,"overlap":400,"min_chunk_size":800}'
  if [[ -n "${chunking_json_override}" ]]; then
    chunking_json_value="${chunking_json_override}"
  fi
  echo "==> chunking_json: ${chunking_json_value}"
  local embedding_json_value='{"enabled":false}'
  if [[ -n "${embedding_json_override}" ]]; then
    embedding_json_value="${embedding_json_override}"
  fi
  echo "==> embedding_json: ${embedding_json_value}"
  if [[ "${load_schema}" == "true" ]]; then
    # Initialize schema using rag_ingest init
    "${RAG_INGEST}" init \
      -h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
      -u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
      -D "${SQLITE_SERVER_DB}"
    # Insert rag_sources configuration
    run_sqlite_server "
      INSERT INTO rag_sources (
        name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db,
        table_name, pk_column, where_sql, doc_map_json, chunking_json, embedding_json
      ) VALUES (
        'test_posts', 1,
        'mysql', '${MYSQL_HOST}', ${MYSQL_PORT}, '${MYSQL_USER}', '${MYSQL_PASS}', 'rag_test',
        'posts', 'Id', '${where_sql}',
        '{\"doc_id\":{\"format\":\"posts:{Id}\"},\"title\":{\"concat\":[{\"col\":\"Title\"}]},\"body\":{\"concat\":[{\"col\":\"Body\"}]},\"metadata\":{\"pick\":[\"Tags\",\"Score\"],\"rename\":{\"Tags\":\"tags\",\"Score\":\"score\"}}}',
        '${chunking_json_value}',
        '${embedding_json_value}'
      );
    "
  else
    # Update existing source
    run_sqlite_server "
      UPDATE rag_sources
      SET chunking_json='${chunking_json_value}',
          embedding_json='${embedding_json_value}',
          where_sql='${where_sql}',
          backend_host='${MYSQL_HOST}',
          backend_port=${MYSQL_PORT},
          backend_user='${MYSQL_USER}',
          backend_pass='${MYSQL_PASS}'
      WHERE source_id=1;
    "
  fi
}
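# Usage example: apply_schema_and_source "Score >= 7" "false" installs a WHERE filter
# on the existing source without reloading the schema (see Phase 2 below).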
# Helper: Import sample data to MySQL backend
import_mysql_seed() {
  "${MYSQL_BIN}" \
    -h"${MYSQL_HOST}" -P"${MYSQL_PORT}" \
    -u"${MYSQL_USER}" -p"${MYSQL_PASS}" \
    < "${ROOT_DIR}/sample_mysql.sql"
}
# Helper: Run SQL on MySQL backend
run_mysql_sql() {
  local sql="$1"
  "${MYSQL_BIN}" \
    -h"${MYSQL_HOST}" -P"${MYSQL_PORT}" \
    -u"${MYSQL_USER}" -p"${MYSQL_PASS}" \
    -e "${sql}"
}
# Helper: Assert equality
assert_eq() {
  local label="$1"
  local expected="$2"
  local actual="$3"
  if [[ "${expected}" != "${actual}" ]]; then
    echo "FAIL: ${label} expected ${expected}, got ${actual}" >&2
    exit 1
  fi
  echo "OK: ${label} = ${actual}"
}
# Helper: FTS count via SQLite Server
fts_count() {
  local q="$1"
  run_sqlite_server_value "SELECT COUNT(*) FROM rag_fts_chunks WHERE rag_fts_chunks MATCH '${q}';"
}
# Helper: FTS BM25 top result via SQLite Server
fts_bm25_top() {
  local q="$1"
  run_sqlite_server_value "SELECT chunk_id FROM rag_fts_chunks WHERE rag_fts_chunks MATCH '${q}' ORDER BY bm25(rag_fts_chunks) LIMIT 1;"
}
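# In SQLite FTS5, bm25() returns more-negative scores for better matches, so the
# ascending ORDER BY above yields the best-ranked chunk first.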
# Helper: Vector self-match via SQLite Server
vec_self_match() {
  local chunk_id="$1"
  run_sqlite_server_value "SELECT chunk_id FROM rag_vec_chunks WHERE embedding MATCH (SELECT embedding FROM rag_vec_chunks WHERE chunk_id='${chunk_id}') ORDER BY distance LIMIT 1;"
}
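# sqlite-vec KNN semantics: "embedding MATCH <vector> ... ORDER BY distance LIMIT k"
# performs a nearest-neighbor scan, so a chunk queried with its own embedding should
# rank itself first (distance 0); phases 4 and 5 use this as a sanity check.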
# Helper: Print sample data from SQLite Server
print_samples() {
  echo "==> Sample rag_documents"
  # metadata_json keys are lowercased by the rename in doc_map_json, hence $.score
  run_sqlite_server "
    SELECT doc_id, source_id, substr(title,1,40) AS title, json_extract(metadata_json,'$.score') AS score
    FROM rag_documents ORDER BY doc_id LIMIT 5;
  "
  echo "==> Sample rag_chunks"
  run_sqlite_server "
    SELECT chunk_id, doc_id, chunk_index, substr(body,1,50) AS body
    FROM rag_chunks ORDER BY chunk_id LIMIT 5;
  "
  echo "==> Sample rag_fts_chunks matches for 'ProxySQL'"
  run_sqlite_server "
    SELECT chunk_id, substr(title,1,40) AS title
    FROM rag_fts_chunks WHERE rag_fts_chunks MATCH 'ProxySQL' ORDER BY chunk_id LIMIT 5;
  "
}
# Helper: Cleanup - drop all RAG tables (schema is recreated later via rag_ingest init)
cleanup_db() {
  # Ignore errors if the tables do not exist yet
  run_sqlite_server "
    DROP TABLE IF EXISTS rag_sync_state;
    DROP TABLE IF EXISTS rag_fts_chunks;
    DROP TABLE IF EXISTS rag_vec_chunks;
    DROP VIEW IF EXISTS rag_chunk_view;
    DROP TABLE IF EXISTS rag_chunks;
    DROP TABLE IF EXISTS rag_documents;
    DROP TABLE IF EXISTS rag_sources;
  " 2>/dev/null || true
}
# Cleanup before starting
cleanup_db
# ===========================================================================
# Phase 1: load schema + source, chunking disabled, no where filter
# ===========================================================================
echo "=========================================="
echo "Phase 1: Initial ingestion (no chunking)"
echo "=========================================="
apply_schema_and_source "" "true"
# Seed MySQL backend
import_mysql_seed
# Run rag_ingest
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
# Validate counts (sample_mysql.sql seeds 10 rows)
DOCS_COUNT="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_COUNT="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_chunks;")"
FTS_COUNT="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_fts_chunks;")"
VEC_COUNT="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_vec_chunks;")"
assert_eq "rag_documents" "10" "${DOCS_COUNT}"
assert_eq "rag_chunks (chunking disabled)" "10" "${CHUNKS_COUNT}"
assert_eq "rag_fts_chunks" "10" "${FTS_COUNT}"
assert_eq "rag_vec_chunks (embedding disabled)" "0" "${VEC_COUNT}"
print_samples
# FTS tests (phase 1)
FTS_PHRASE_1="$(fts_count '"ProxySQL adds MCP"')"
FTS_SHORT_1="$(fts_count 'Short')"
FTS_TAG_1="$(fts_count 'Tag')"
FTS_BM25_1="$(fts_bm25_top 'ProxySQL')"
assert_eq "fts phrase (ProxySQL adds MCP)" "1" "${FTS_PHRASE_1}"
assert_eq "fts term (Short)" "1" "${FTS_SHORT_1}"
assert_eq "fts term (Tag)" "1" "${FTS_TAG_1}"
assert_eq "fts bm25 top (ProxySQL)" "posts:3#0" "${FTS_BM25_1}"
# ===========================================================================
# Phase 1a: update skip behavior (existing docs are not updated)
# ===========================================================================
echo "=========================================="
echo "Phase 1a: Verify existing docs are not updated"
echo "=========================================="
run_mysql_sql "USE rag_test; UPDATE posts SET Title='Hello RAG UPDATED' WHERE Id=1;"
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
TITLE_AFTER_UPDATE="$(run_sqlite_server_value "SELECT title FROM rag_documents WHERE doc_id='posts:1';")"
assert_eq "rag_documents title unchanged on update" "Hello RAG" "${TITLE_AFTER_UPDATE}"
# Reset MySQL data after update test
import_mysql_seed
# ===========================================================================
# Phase 1b: rag_sync_state watermark (incremental ingestion)
# ===========================================================================
echo "=========================================="
echo "Phase 1b: Watermark-based incremental sync"
echo "=========================================="
SYNC_COL_1="$(run_sqlite_server_value "SELECT json_extract(cursor_json,'$.column') FROM rag_sync_state WHERE source_id=1;")"
SYNC_VAL_1="$(run_sqlite_server_value "SELECT json_extract(cursor_json,'$.value') FROM rag_sync_state WHERE source_id=1;")"
assert_eq "rag_sync_state column" "Id" "${SYNC_COL_1}"
assert_eq "rag_sync_state value (initial)" "10" "${SYNC_VAL_1}"
# Delete one doc to verify watermark prevents backfill
run_sqlite_server "DELETE FROM rag_vec_chunks WHERE chunk_id LIKE 'posts:5#%';"
run_sqlite_server "DELETE FROM rag_fts_chunks WHERE chunk_id LIKE 'posts:5#%';"
run_sqlite_server "DELETE FROM rag_chunks WHERE doc_id='posts:5';"
run_sqlite_server "DELETE FROM rag_documents WHERE doc_id='posts:5';"
DOCS_AFTER_DELETE="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
assert_eq "rag_documents after delete" "9" "${DOCS_AFTER_DELETE}"
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
DOCS_AFTER_REINGEST="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_AFTER_REINGEST="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_chunks;")"
FTS_AFTER_REINGEST="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_fts_chunks;")"
assert_eq "rag_documents after watermark reingest" "9" "${DOCS_AFTER_REINGEST}"
assert_eq "rag_chunks after watermark reingest" "9" "${CHUNKS_AFTER_REINGEST}"
assert_eq "rag_fts_chunks after watermark reingest" "9" "${FTS_AFTER_REINGEST}"
# Insert a new source row and ensure only it is ingested
run_mysql_sql "USE rag_test; INSERT INTO posts (Id, Title, Body, Tags, Score, CreationDate, UpdatedAt) VALUES (11, 'Watermark New', 'This row should be ingested via watermark.', 'wm,test', 1, '2024-01-14 10:00:00', '2024-01-14 11:00:00');"
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
DOCS_AFTER_NEW="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
SYNC_VAL_2="$(run_sqlite_server_value "SELECT json_extract(cursor_json,'$.value') FROM rag_sync_state WHERE source_id=1;")"
assert_eq "rag_documents after new row" "10" "${DOCS_AFTER_NEW}"
assert_eq "rag_sync_state value (after new row)" "11" "${SYNC_VAL_2}"
# Reset sync state for subsequent phases
run_sqlite_server "DELETE FROM rag_sync_state;"
# Reset MySQL data after watermark insert
import_mysql_seed
# ===========================================================================
# Phase 1c: UpdatedAt-based watermark filtering
# ===========================================================================
echo "=========================================="
echo "Phase 1c: UpdatedAt-based watermark filtering"
echo "=========================================="
run_sqlite_server "DELETE FROM rag_vec_chunks;"
run_sqlite_server "DELETE FROM rag_fts_chunks;"
run_sqlite_server "DELETE FROM rag_chunks;"
run_sqlite_server "DELETE FROM rag_documents;"
run_sqlite_server "INSERT OR REPLACE INTO rag_sync_state(source_id, mode, cursor_json, last_ok_at, last_error) VALUES (1, 'poll', '{\"column\":\"UpdatedAt\",\"value\":\"2024-01-10 10:00:00\"}', NULL, NULL);"
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
DOCS_UPDATED_AT="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
SYNC_UPDATED_AT="$(run_sqlite_server_value "SELECT json_extract(cursor_json,'$.value') FROM rag_sync_state WHERE source_id=1;")"
assert_eq "rag_documents (UpdatedAt watermark)" "1" "${DOCS_UPDATED_AT}"
assert_eq "rag_sync_state value (UpdatedAt)" "2024-01-12 09:30:00" "${SYNC_UPDATED_AT}"
# Reset sync state for subsequent phases
run_sqlite_server "DELETE FROM rag_sync_state;"
# ===========================================================================
# Phase 2: apply where filter, re-ingest after cleanup
# ===========================================================================
echo "=========================================="
echo "Phase 2: WHERE filter (Score >= 7)"
echo "=========================================="
run_sqlite_server "DELETE FROM rag_vec_chunks;"
run_sqlite_server "DELETE FROM rag_fts_chunks;"
run_sqlite_server "DELETE FROM rag_chunks;"
run_sqlite_server "DELETE FROM rag_documents;"
apply_schema_and_source "Score >= 7" "false"
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
DOCS_COUNT_2="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_COUNT_2="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_chunks;")"
FTS_COUNT_2="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_fts_chunks;")"
VEC_COUNT_2="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_vec_chunks;")"
# In sample_mysql: Score >= 7 matches Id 1,3,5,7,9 => 5 docs
assert_eq "rag_documents (where_sql)" "5" "${DOCS_COUNT_2}"
assert_eq "rag_chunks (where_sql)" "5" "${CHUNKS_COUNT_2}"
assert_eq "rag_fts_chunks (where_sql)" "5" "${FTS_COUNT_2}"
assert_eq "rag_vec_chunks (where_sql, embedding disabled)" "0" "${VEC_COUNT_2}"
print_samples
# FTS tests (phase 2)
FTS_PROXYSQL_2="$(fts_count 'ProxySQL')"
FTS_HIGH_2="$(fts_count 'High')"
FTS_LOW_2="$(fts_count 'Low')"
FTS_BM25_2="$(fts_bm25_top 'High')"
assert_eq "fts term (ProxySQL)" "1" "${FTS_PROXYSQL_2}"
assert_eq "fts term (High)" "1" "${FTS_HIGH_2}"
assert_eq "fts term (Low)" "0" "${FTS_LOW_2}"
assert_eq "fts bm25 top (High)" "posts:9#0" "${FTS_BM25_2}"
# ===========================================================================
# Phase 3: enable chunking and ensure rows split into multiple chunks
# ===========================================================================
echo "=========================================="
echo "Phase 3: Enable chunking"
echo "=========================================="
run_sqlite_server "DELETE FROM rag_sync_state;"
run_sqlite_server "DELETE FROM rag_vec_chunks;"
run_sqlite_server "DELETE FROM rag_fts_chunks;"
run_sqlite_server "DELETE FROM rag_chunks;"
run_sqlite_server "DELETE FROM rag_documents;"
apply_schema_and_source "" "false" '{"enabled":true,"unit":"chars","chunk_size":50,"overlap":10,"min_chunk_size":10}'
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
DOCS_COUNT_3="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_COUNT_3="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_chunks;")"
LONG_DOC_CHUNKS="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_chunks WHERE doc_id='posts:5';")"
assert_eq "rag_documents (chunking enabled)" "10" "${DOCS_COUNT_3}"
if [[ "${CHUNKS_COUNT_3}" -le "${DOCS_COUNT_3}" ]]; then
echo "FAIL: rag_chunks should be greater than rag_documents when chunking enabled" >&2
exit 1
fi
if [[ "${LONG_DOC_CHUNKS}" -le "1" ]]; then
echo "FAIL: posts:5 should produce multiple chunks" >&2
exit 1
fi
print_samples
# ===========================================================================
# Phase 4: enable embeddings (stub) and validate vec rows
# ===========================================================================
echo "=========================================="
echo "Phase 4: Enable embeddings (stub provider)"
echo "=========================================="
run_sqlite_server "DELETE FROM rag_sync_state;"
run_sqlite_server "DELETE FROM rag_vec_chunks;"
run_sqlite_server "DELETE FROM rag_fts_chunks;"
run_sqlite_server "DELETE FROM rag_chunks;"
run_sqlite_server "DELETE FROM rag_documents;"
apply_schema_and_source "" "false" '' "{\"enabled\":true,\"provider\":\"${EMBEDDING_PROVIDER}\",\"dim\":${EMBEDDING_DIM},\"input\":{\"concat\":[{\"col\":\"Title\"},{\"lit\":\"\\n\"},{\"chunk_body\":true}]}}"
"${RAG_INGEST}" ingest \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
DOCS_COUNT_4="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_COUNT_4="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_chunks;")"
VEC_COUNT_4="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_vec_chunks;")"
assert_eq "rag_documents (embeddings enabled)" "10" "${DOCS_COUNT_4}"
assert_eq "rag_chunks (embeddings enabled)" "10" "${CHUNKS_COUNT_4}"
assert_eq "rag_vec_chunks (embeddings enabled)" "10" "${VEC_COUNT_4}"
VEC_MATCH_1="$(vec_self_match 'posts:1#0')"
assert_eq "vec self-match (posts:1#0)" "posts:1#0" "${VEC_MATCH_1}"
print_samples
# ===========================================================================
# Phase 5: optional OpenAI-compatible embeddings test (requires env vars)
# ===========================================================================
echo "=========================================="
echo "Phase 5: OpenAI-compatible embeddings (optional)"
echo "=========================================="
if [[ -n "${OPENAI_API_BASE}" && -n "${OPENAI_API_KEY}" ]]; then
# Cleanup existing tables for OpenAI test
run_sqlite_server "
DROP TABLE IF EXISTS rag_sync_state;
DROP TABLE IF EXISTS rag_fts_chunks;
DROP TABLE IF EXISTS rag_vec_chunks;
DROP VIEW IF EXISTS rag_chunk_view;
DROP TABLE IF EXISTS rag_chunks;
DROP TABLE IF EXISTS rag_documents;
DROP TABLE IF EXISTS rag_sources;
"
# Initialize schema with rag_ingest init
"${RAG_INGEST}" init \
-h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
-u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
-D "${SQLITE_SERVER_DB}"
# Recreate vec0 table with correct dimensions for the OpenAI model
# (rag_ingest init creates 1536 by default, but nomic-embed-text-v1.5 uses 768)
run_sqlite_server "
DROP TABLE IF EXISTS rag_vec_chunks;
CREATE VIRTUAL TABLE rag_vec_chunks USING vec0(
embedding float[${OPENAI_EMBEDDING_DIM}],
chunk_id TEXT,
doc_id TEXT,
source_id INTEGER,
updated_at INTEGER
);
"
  # Insert source with OpenAI embedding config
  openai_embedding_json="{\"enabled\":true,\"provider\":\"openai\",\"api_base\":\"${OPENAI_API_BASE}\",\"api_key\":\"${OPENAI_API_KEY}\",\"model\":\"${OPENAI_MODEL}\",\"dim\":${OPENAI_EMBEDDING_DIM},\"input\":{\"concat\":[{\"col\":\"Title\"},{\"lit\":\"\\n\"},{\"chunk_body\":true}]}}"
  run_sqlite_server "
    INSERT INTO rag_sources (
      name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db,
      table_name, pk_column, where_sql, doc_map_json, chunking_json, embedding_json
    ) VALUES (
      'test_posts_openai', 1,
      'mysql', '${MYSQL_HOST}', ${MYSQL_PORT}, '${MYSQL_USER}', '${MYSQL_PASS}', 'rag_test',
      'posts', 'Id', '',
      '{\"doc_id\":{\"format\":\"posts:{Id}\"},\"title\":{\"concat\":[{\"col\":\"Title\"}]},\"body\":{\"concat\":[{\"col\":\"Body\"}]},\"metadata\":{\"pick\":[\"Tags\",\"Score\"],\"rename\":{\"Tags\":\"tags\",\"Score\":\"score\"}}}',
      '{\"enabled\":false,\"unit\":\"chars\",\"chunk_size\":4000,\"overlap\":400,\"min_chunk_size\":800}',
      '${openai_embedding_json}'
    );
  "
  # Run ingestion
  "${RAG_INGEST}" ingest \
    -h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
    -u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
    -D "${SQLITE_SERVER_DB}"
  # Validate results
  DOCS_COUNT_5="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_documents;")"
  CHUNKS_COUNT_5="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_chunks;")"
  VEC_COUNT_5="$(run_sqlite_server_value "SELECT COUNT(*) FROM rag_vec_chunks;")"
  assert_eq "rag_documents (openai embeddings)" "10" "${DOCS_COUNT_5}"
  assert_eq "rag_chunks (openai embeddings)" "10" "${CHUNKS_COUNT_5}"
  assert_eq "rag_vec_chunks (openai embeddings)" "10" "${VEC_COUNT_5}"
  print_samples
  # Test vector self-match (verify embeddings work correctly)
  VEC_MATCH_5="$(vec_self_match 'posts:1#0')"
  assert_eq "vec self-match (posts:1#0) with OpenAI embeddings" "posts:1#0" "${VEC_MATCH_5}"
  # Test semantic vector search using the rag_ingest query command
  echo "==> Testing semantic vector search with OpenAI embeddings..."
  QUERY_OUTPUT="$("${RAG_INGEST}" query \
    -h "${SQLITE_SERVER_HOST}" -P "${SQLITE_SERVER_PORT}" \
    -u "${SQLITE_SERVER_USER}" -p "${SQLITE_SERVER_PASS}" \
    -D "${SQLITE_SERVER_DB}" \
    --text="ProxySQL RAG" \
    --source-id=1 \
    --limit=3 2>&1)"
  # Verify the query returned results
  if echo "${QUERY_OUTPUT}" | grep -q "chunk_id"; then
    echo "OK: Semantic vector search returned results"
  else
    echo "FAIL: Semantic vector search did not return results" >&2
    echo "Query output: ${QUERY_OUTPUT}" >&2
    exit 1
  fi
  echo "OK: OpenAI embeddings test completed"
else
  echo "==> OpenAI embeddings test skipped (set OPENAI_API_KEY to enable)"
fi
echo ""
echo "=========================================="
echo "All tests passed."
echo "=========================================="