# NOTE(review): as stored, this file is NOT a runnable shell script.
#
# It holds a git format-patch style email (commit 2f6b058f..., subject
# "[PATCH] Added test_rag_ingest.sh") whose diff creates
# RAG_POC/test_rag_ingest.sh with mode 100755. Every added script line still
# carries the diff's leading "+", and the original line breaks have been
# collapsed into spaces. The hunk header declares 128 added lines.
#
# What the embedded script visibly does:
#   - Enables `set -euo pipefail`, derives ROOT_DIR from BASH_SOURCE and
#     REPO_ROOT as its parent directory.
#   - Reads SQLITE_BIN, MYSQL_BIN, MYSQL_HOST, MYSQL_PORT, MYSQL_USER and
#     MYSQL_PASS from the environment, each with a default.
#   - Exits 1 with a "FATAL" message on stderr when vec0.so is not a regular
#     file at ${REPO_ROOT}/deps/sqlite3/sqlite3/vec0.so.
#   - Phase 1: removes ${DB1}, calls apply_schema_and_source with an empty
#     where filter and load_schema "true", calls import_mysql_seed, runs
#     ${ROOT_DIR}/rag_ingest on ${DB1}, then asserts row counts of 10, 10, 10
#     and 0 for rag_documents, rag_chunks, rag_fts_chunks and rag_vec_chunks.
#   - Phase 2: deletes all rows from those four tables, calls
#     apply_schema_and_source with "Score >= 7" and load_schema "false",
#     re-runs rag_ingest and asserts counts of 5, 5, 5 and 0.
#
# Known damage (recover from the original commit; do not reconstruct by hand):
#   - Text is missing in two places, each starting right after
#     `"${SQLITE_BIN}" "${db}" <`. The first gap resumes at
#     ` SQLite DB: ${db}"`, the second at `&2`. Presumably here-documents and
#     everything up to the next ">" were stripped -- TODO confirm.
#   - Consequently the input fed to sqlite in run_sqlite, the header of
#     apply_schema_and_source and the SQL it executes, and the header and
#     comparison of assert_eq are not present here.
#   - import_mysql_seed is called but no definition of it is visible.
From 2f6b058f7b227c3cbfddd5ee8fca72a4ef1fc91b Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Thu, 22 Jan 2026 13:51:22 +0500 Subject: [PATCH] Added test_rag_ingest.sh --- RAG_POC/test_rag_ingest.sh | 128 +++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100755 RAG_POC/test_rag_ingest.sh diff --git a/RAG_POC/test_rag_ingest.sh b/RAG_POC/test_rag_ingest.sh new file mode 100755 index 000000000..0bf508375 --- /dev/null +++ b/RAG_POC/test_rag_ingest.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${ROOT_DIR}/.." && pwd)" + +SQLITE_BIN="${SQLITE_BIN:-${REPO_ROOT}/deps/sqlite3/sqlite3/sqlite3}" +MYSQL_BIN="${MYSQL_BIN:-mysql}" + +MYSQL_HOST="${MYSQL_HOST:-127.0.0.1}" +MYSQL_PORT="${MYSQL_PORT:-3306}" +MYSQL_USER="${MYSQL_USER:-root}" +MYSQL_PASS="${MYSQL_PASS:-root}" + +DB1="${ROOT_DIR}/rag_ingest_test.db" + +VEC_EXT="${REPO_ROOT}/deps/sqlite3/sqlite3/vec0.so" + +if [[ ! 
-f "${VEC_EXT}" ]]; then + echo "FATAL: vec0.so not found at ${VEC_EXT}" >&2 + exit 1 +fi + +run_sqlite() { + local db="$1" + local sql="$2" + "${SQLITE_BIN}" "${db}" < SQLite DB: ${db}" + echo "==> load_schema: ${load_schema}" + echo "==> where_sql: ${where_sql:-}" + echo "==> chunking_json: {\"enabled\":false,\"unit\":\"chars\",\"chunk_size\":4000,\"overlap\":400,\"min_chunk_size\":800}" + echo "==> embedding_json: {\"enabled\":false}" + + "${SQLITE_BIN}" "${db}" <&2 + exit 1 + fi + echo "OK: ${label} = ${actual}" +} + +cleanup_db() { + rm -f "${DB1}" +} + +cleanup_db + +# Phase 1: load schema + source, chunking disabled, no where filter +apply_schema_and_source "${DB1}" "" "true" + +# Seed MySQL +import_mysql_seed + +# Run rag_ingest +"${ROOT_DIR}/rag_ingest" "${DB1}" + +# Validate counts (sample_mysql has 10 rows) +DOCS_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")" +CHUNKS_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")" +FTS_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_fts_chunks;")" +VEC_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_vec_chunks;")" + +assert_eq "rag_documents" "10" "${DOCS_COUNT}" +assert_eq "rag_chunks (chunking disabled)" "10" "${CHUNKS_COUNT}" +assert_eq "rag_fts_chunks" "10" "${FTS_COUNT}" +assert_eq "rag_vec_chunks (embedding disabled)" "0" "${VEC_COUNT}" + +# Phase 2: apply where filter, re-ingest after cleanup +run_sqlite "${DB1}" "DELETE FROM rag_vec_chunks;" +run_sqlite "${DB1}" "DELETE FROM rag_fts_chunks;" +run_sqlite "${DB1}" "DELETE FROM rag_chunks;" +run_sqlite "${DB1}" "DELETE FROM rag_documents;" + +apply_schema_and_source "${DB1}" "Score >= 7" "false" +"${ROOT_DIR}/rag_ingest" "${DB1}" + +DOCS_COUNT_2="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")" +CHUNKS_COUNT_2="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")" +FTS_COUNT_2="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_fts_chunks;")" +VEC_COUNT_2="$(run_sqlite "${DB1}" "SELECT COUNT(*) 
FROM rag_vec_chunks;")" + +# In sample_mysql: Score >= 7 matches Id 1,3,5,7,9 => 5 docs +assert_eq "rag_documents (where_sql)" "5" "${DOCS_COUNT_2}" +assert_eq "rag_chunks (where_sql)" "5" "${CHUNKS_COUNT_2}" +assert_eq "rag_fts_chunks (where_sql)" "5" "${FTS_COUNT_2}" +assert_eq "rag_vec_chunks (where_sql, embedding disabled)" "0" "${VEC_COUNT_2}" + +echo "All tests passed."