Added rag_chunks and rag_fts_chunks test

pull/5318/head
Rahim Kanji 4 months ago
parent 2f6b058f7b
commit d47e196fc6

@ -34,6 +34,7 @@ apply_schema_and_source() {
local db="$1"
local where_sql="$2"
local load_schema="$3"
local chunking_json_override="${4:-}"
local schema_cmd=""
if [[ "${load_schema}" == "true" ]]; then
@ -43,14 +44,22 @@ apply_schema_and_source() {
echo "==> SQLite DB: ${db}"
echo "==> load_schema: ${load_schema}"
echo "==> where_sql: ${where_sql:-<empty>}"
echo "==> chunking_json: {\"enabled\":false,\"unit\":\"chars\",\"chunk_size\":4000,\"overlap\":400,\"min_chunk_size\":800}"
local chunking_json_value='{"enabled":false,"unit":"chars","chunk_size":4000,"overlap":400,"min_chunk_size":800}'
if [[ -n "${chunking_json_override}" ]]; then
chunking_json_value="${chunking_json_override}"
fi
echo "==> chunking_json: ${chunking_json_value}"
echo "==> embedding_json: {\"enabled\":false}"
"${SQLITE_BIN}" "${db}" <<SQL
.load ${VEC_EXT}
.bail on
.mode list
.separator |
.nullvalue NULL
${schema_cmd}
UPDATE rag_sources
SET chunking_json='{"enabled":false,"unit":"chars","chunk_size":4000,"overlap":400,"min_chunk_size":800}'
UPDATE rag_sources
SET chunking_json='${chunking_json_value}'
WHERE source_id=1;
UPDATE rag_sources
SET embedding_json='{"enabled":false}'
@ -79,6 +88,28 @@ assert_eq() {
echo "OK: ${label} = ${actual}"
}
# Count the rows of rag_fts_chunks that match a full-text query.
# Arguments: $1 - SQLite database path, passed through to run_sqlite
#            $2 - FTS MATCH expression, e.g. 'Short' or '"ProxySQL adds MCP"'
# Outputs:   whatever run_sqlite prints for the COUNT(*) query
fts_count() {
  local db="$1"
  local q="$2"
  # The query is embedded in a single-quoted SQL string literal, so every
  # apostrophe must be doubled; otherwise a term such as O'Reilly would
  # terminate the literal early and produce invalid SQL.
  local sq="'"
  local q_sql="${q//${sq}/${sq}${sq}}"
  run_sqlite "${db}" "SELECT COUNT(*) FROM rag_fts_chunks WHERE rag_fts_chunks MATCH '${q_sql}';"
}
# Print the chunk_id of the best-ranked rag_fts_chunks row for a query,
# ordering by bm25(rag_fts_chunks) and keeping only the first row.
# Arguments: $1 - SQLite database path, passed through to run_sqlite
#            $2 - FTS MATCH expression
# Outputs:   whatever run_sqlite prints for the query
fts_bm25_top() {
  local db="$1"
  local q="$2"
  # Double embedded apostrophes so the query stays inside its single-quoted
  # SQL string literal instead of ending it prematurely.
  local sq="'"
  local q_sql="${q//${sq}/${sq}${sq}}"
  run_sqlite "${db}" "SELECT chunk_id FROM rag_fts_chunks WHERE rag_fts_chunks MATCH '${q_sql}' ORDER BY bm25(rag_fts_chunks) LIMIT 1;"
}
# Dump up to five sample rows from rag_documents, rag_chunks and the
# rag_fts_chunks matches for 'ProxySQL', each preceded by a "==>" banner.
# Arguments: $1 - SQLite database path, passed through to run_sqlite
# Outputs:   banners plus whatever run_sqlite prints for each query
print_samples() {
  local sample_db="$1"

  printf '%s\n' "==> Sample rag_documents"
  # \$ keeps the JSON path '$.Score' literal inside the double-quoted SQL.
  run_sqlite "${sample_db}" \
    "SELECT doc_id, source_id, substr(title,1,40) AS title, json_extract(metadata_json,'\$.Score') AS score FROM rag_documents ORDER BY doc_id LIMIT 5;"

  printf '%s\n' "==> Sample rag_chunks"
  run_sqlite "${sample_db}" \
    "SELECT chunk_id, doc_id, chunk_index, substr(body,1,50) AS body FROM rag_chunks ORDER BY chunk_id LIMIT 5;"

  printf '%s\n' "==> Sample rag_fts_chunks matches for 'ProxySQL'"
  run_sqlite "${sample_db}" \
    "SELECT chunk_id, substr(title,1,40) AS title FROM rag_fts_chunks WHERE rag_fts_chunks MATCH 'ProxySQL' ORDER BY chunk_id LIMIT 5;"
}
# Delete the file named by DB1.
# Globals: DB1 (read) - path of the file to remove
cleanup_db() {
  # "--" ends option parsing so a path that begins with "-" is still treated
  # as a file name; -f keeps an already-missing file from being an error.
  rm -f -- "${DB1}"
}
@ -105,6 +136,19 @@ assert_eq "rag_chunks (chunking disabled)" "10" "${CHUNKS_COUNT}"
assert_eq "rag_fts_chunks" "10" "${FTS_COUNT}"
assert_eq "rag_vec_chunks (embedding disabled)" "0" "${VEC_COUNT}"
print_samples "${DB1}"
# FTS tests (phase 1): full-text checks against the current contents of DB1.
# fts_count prints the number of rag_fts_chunks rows matching a query and
# fts_bm25_top prints the chunk_id ranked first by bm25().
# The inner double quotes in the first query make it an FTS phrase search
# rather than three independent terms.
FTS_PHRASE_1="$(fts_count "${DB1}" '"ProxySQL adds MCP"')"
FTS_SHORT_1="$(fts_count "${DB1}" 'Short')"
FTS_TAG_1="$(fts_count "${DB1}" 'Tag')"
FTS_BM25_1="$(fts_bm25_top "${DB1}" 'ProxySQL')"
# Each of the three queries is expected to hit exactly one chunk.
assert_eq "fts phrase (ProxySQL adds MCP)" "1" "${FTS_PHRASE_1}"
assert_eq "fts term (Short)" "1" "${FTS_SHORT_1}"
assert_eq "fts term (Tag)" "1" "${FTS_TAG_1}"
# The best bm25 match for 'ProxySQL' is expected to be chunk posts:3#0.
assert_eq "fts bm25 top (ProxySQL)" "posts:3#0" "${FTS_BM25_1}"
# Phase 2: apply where filter, re-ingest after cleanup
run_sqlite "${DB1}" "DELETE FROM rag_vec_chunks;"
run_sqlite "${DB1}" "DELETE FROM rag_fts_chunks;"
@ -125,4 +169,42 @@ assert_eq "rag_chunks (where_sql)" "5" "${CHUNKS_COUNT_2}"
assert_eq "rag_fts_chunks (where_sql)" "5" "${FTS_COUNT_2}"
assert_eq "rag_vec_chunks (where_sql, embedding disabled)" "0" "${VEC_COUNT_2}"
print_samples "${DB1}"
# FTS tests (phase 2): full-text checks after the where_sql-filtered re-ingest.
FTS_PROXYSQL_2="$(fts_count "${DB1}" 'ProxySQL')"
FTS_HIGH_2="$(fts_count "${DB1}" 'High')"
FTS_LOW_2="$(fts_count "${DB1}" 'Low')"
FTS_BM25_2="$(fts_bm25_top "${DB1}" 'High')"
assert_eq "fts term (ProxySQL)" "1" "${FTS_PROXYSQL_2}"
assert_eq "fts term (High)" "1" "${FTS_HIGH_2}"
# 'Low' must not match anything here; presumably the where_sql filter
# excluded the row that contains it (the filter itself is not shown here).
assert_eq "fts term (Low)" "0" "${FTS_LOW_2}"
# The best bm25 match for 'High' is expected to be chunk posts:9#0.
assert_eq "fts bm25 top (High)" "posts:9#0" "${FTS_BM25_2}"
# Phase 3: enable chunking and ensure rows split into multiple chunks
# Empty all four RAG tables so the ingest below starts from a clean slate.
run_sqlite "${DB1}" "DELETE FROM rag_vec_chunks;"
run_sqlite "${DB1}" "DELETE FROM rag_fts_chunks;"
run_sqlite "${DB1}" "DELETE FROM rag_chunks;"
run_sqlite "${DB1}" "DELETE FROM rag_documents;"
# Arguments: db, where_sql (empty), load_schema ("false"), and a
# chunking_json override that turns chunking on with 50-char chunks,
# 10-char overlap and a 10-char minimum chunk size.
apply_schema_and_source "${DB1}" "" "false" '{"enabled":true,"unit":"chars","chunk_size":50,"overlap":10,"min_chunk_size":10}'
"${ROOT_DIR}/rag_ingest" "${DB1}"
DOCS_COUNT_3="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_COUNT_3="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")"
# posts:5 is the document this test expects to be split into several chunks.
LONG_DOC_CHUNKS="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks WHERE doc_id='posts:5';")"
assert_eq "rag_documents (chunking enabled)" "10" "${DOCS_COUNT_3}"
# -le inside [[ ]] compares the operands as integers: fail unless there are
# strictly more chunks than documents.
if [[ "${CHUNKS_COUNT_3}" -le "${DOCS_COUNT_3}" ]]; then
echo "FAIL: rag_chunks should be greater than rag_documents when chunking enabled" >&2
exit 1
fi
# Fail unless posts:5 produced at least two chunks.
if [[ "${LONG_DOC_CHUNKS}" -le "1" ]]; then
echo "FAIL: posts:5 should produce multiple chunks" >&2
exit 1
fi
print_samples "${DB1}"
echo "All tests passed."

Loading…
Cancel
Save