From d47e196fc6b1daf03a761ddd0a34f810c601e722 Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Thu, 22 Jan 2026 14:07:30 +0500 Subject: [PATCH] Added rag_chunks and rag_fts_chunks test --- RAG_POC/test_rag_ingest.sh | 88 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/RAG_POC/test_rag_ingest.sh b/RAG_POC/test_rag_ingest.sh index 0bf508375..5992b0d4b 100755 --- a/RAG_POC/test_rag_ingest.sh +++ b/RAG_POC/test_rag_ingest.sh @@ -34,6 +34,7 @@ apply_schema_and_source() { local db="$1" local where_sql="$2" local load_schema="$3" + local chunking_json_override="${4:-}" local schema_cmd="" if [[ "${load_schema}" == "true" ]]; then @@ -43,14 +44,22 @@ apply_schema_and_source() { echo "==> SQLite DB: ${db}" echo "==> load_schema: ${load_schema}" echo "==> where_sql: ${where_sql:-}" - echo "==> chunking_json: {\"enabled\":false,\"unit\":\"chars\",\"chunk_size\":4000,\"overlap\":400,\"min_chunk_size\":800}" + local chunking_json_value='{"enabled":false,"unit":"chars","chunk_size":4000,"overlap":400,"min_chunk_size":800}' + if [[ -n "${chunking_json_override}" ]]; then + chunking_json_value="${chunking_json_override}" + fi + echo "==> chunking_json: ${chunking_json_value}" echo "==> embedding_json: {\"enabled\":false}" "${SQLITE_BIN}" "${db}" < Sample rag_documents" + run_sqlite "${db}" "SELECT doc_id, source_id, substr(title,1,40) AS title, json_extract(metadata_json,'$.Score') AS score FROM rag_documents ORDER BY doc_id LIMIT 5;" + echo "==> Sample rag_chunks" + run_sqlite "${db}" "SELECT chunk_id, doc_id, chunk_index, substr(body,1,50) AS body FROM rag_chunks ORDER BY chunk_id LIMIT 5;" + echo "==> Sample rag_fts_chunks matches for 'ProxySQL'" + run_sqlite "${db}" "SELECT chunk_id, substr(title,1,40) AS title FROM rag_fts_chunks WHERE rag_fts_chunks MATCH 'ProxySQL' ORDER BY chunk_id LIMIT 5;" +} + cleanup_db() { rm -f "${DB1}" } @@ -105,6 +136,19 @@ assert_eq "rag_chunks (chunking disabled)" "10" "${CHUNKS_COUNT}" assert_eq "rag_fts_chunks" "10" "${FTS_COUNT}" assert_eq "rag_vec_chunks (embedding disabled)" "0" "${VEC_COUNT}" +print_samples "${DB1}" + +# FTS tests (phase 1) +FTS_PHRASE_1="$(fts_count "${DB1}" '"ProxySQL adds MCP"')" +FTS_SHORT_1="$(fts_count "${DB1}" 'Short')" +FTS_TAG_1="$(fts_count "${DB1}" 'Tag')" +FTS_BM25_1="$(fts_bm25_top "${DB1}" 'ProxySQL')" + +assert_eq "fts phrase (ProxySQL adds MCP)" "1" "${FTS_PHRASE_1}" +assert_eq "fts term (Short)" "1" "${FTS_SHORT_1}" +assert_eq "fts term (Tag)" "1" "${FTS_TAG_1}" +assert_eq "fts bm25 top (ProxySQL)" "posts:3#0" "${FTS_BM25_1}" + # Phase 2: apply where filter, re-ingest after cleanup run_sqlite "${DB1}" "DELETE FROM rag_vec_chunks;" run_sqlite "${DB1}" "DELETE FROM rag_fts_chunks;" @@ -125,4 +169,42 @@ assert_eq "rag_chunks (where_sql)" "5" "${CHUNKS_COUNT_2}" assert_eq "rag_fts_chunks (where_sql)" "5" "${FTS_COUNT_2}" assert_eq "rag_vec_chunks (where_sql, embedding disabled)" "0" "${VEC_COUNT_2}" +print_samples "${DB1}" + +# FTS tests (phase 2) +FTS_PROXYSQL_2="$(fts_count "${DB1}" 'ProxySQL')" +FTS_HIGH_2="$(fts_count "${DB1}" 'High')" +FTS_LOW_2="$(fts_count "${DB1}" 'Low')" +FTS_BM25_2="$(fts_bm25_top "${DB1}" 'High')" + +assert_eq "fts term (ProxySQL)" "1" "${FTS_PROXYSQL_2}" +assert_eq "fts term (High)" "1" "${FTS_HIGH_2}" +assert_eq "fts term (Low)" "0" "${FTS_LOW_2}" +assert_eq "fts bm25 top (High)" "posts:9#0" "${FTS_BM25_2}" + +# Phase 3: enable chunking and ensure rows split into multiple chunks +run_sqlite "${DB1}" "DELETE FROM rag_vec_chunks;" +run_sqlite "${DB1}" "DELETE FROM rag_fts_chunks;" +run_sqlite "${DB1}" "DELETE FROM rag_chunks;" +run_sqlite "${DB1}" "DELETE FROM rag_documents;" + +apply_schema_and_source "${DB1}" "" "false" '{"enabled":true,"unit":"chars","chunk_size":50,"overlap":10,"min_chunk_size":10}' +"${ROOT_DIR}/rag_ingest" "${DB1}" + +DOCS_COUNT_3="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")" +CHUNKS_COUNT_3="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")" +LONG_DOC_CHUNKS="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks WHERE doc_id='posts:5';")" + +assert_eq "rag_documents (chunking enabled)" "10" "${DOCS_COUNT_3}" +if [[ "${CHUNKS_COUNT_3}" -le "${DOCS_COUNT_3}" ]]; then + echo "FAIL: rag_chunks should be greater than rag_documents when chunking enabled" >&2 + exit 1 +fi +if [[ "${LONG_DOC_CHUNKS}" -le "1" ]]; then + echo "FAIL: posts:5 should produce multiple chunks" >&2 + exit 1 +fi + +print_samples "${DB1}" + echo "All tests passed."