mirror of https://github.com/sysown/proxysql
commit
342272367d
@ -0,0 +1,131 @@
|
||||
# Embedding Testing Plan
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. MySQL server running with test database
|
||||
2. OpenAI-compatible embedding service accessible
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# From repository root
|
||||
cd RAG_POC
|
||||
|
||||
# Step 1: Set your embedding service credentials
|
||||
export OPENAI_API_BASE="https://your-embedding-service.com/v1"
|
||||
export OPENAI_API_KEY="your-api-key-here"
|
||||
export OPENAI_MODEL="your-model-name"
|
||||
export OPENAI_EMBEDDING_DIM=1536 # Adjust based on your model
|
||||
|
||||
# Step 2: Run the test
|
||||
./test_rag_ingest.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### OpenAI API
|
||||
```bash
|
||||
export OPENAI_API_BASE="https://api.openai.com/v1"
|
||||
export OPENAI_API_KEY="sk-your-openai-key"
|
||||
export OPENAI_MODEL="text-embedding-3-small"
|
||||
export OPENAI_EMBEDDING_DIM=1536
|
||||
```
|
||||
|
||||
### Azure OpenAI
|
||||
```bash
|
||||
export OPENAI_API_BASE="https://your-resource.openai.azure.com/openai/deployments/your-deployment"
|
||||
export OPENAI_API_KEY="your-azure-key"
|
||||
export OPENAI_MODEL="text-embedding-ada-002" # Your deployment name
|
||||
export OPENAI_EMBEDDING_DIM=1536
|
||||
```
|
||||
|
||||
### Other OpenAI-compatible services
|
||||
```bash
|
||||
# Any service with OpenAI-compatible API
|
||||
export OPENAI_API_BASE="https://your-service.com/v1"
|
||||
export OPENAI_API_KEY="your-key"
|
||||
export OPENAI_MODEL="model-name"
|
||||
export OPENAI_EMBEDDING_DIM=dim # e.g., 768, 1536, 3072
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What the Test Does
|
||||
|
||||
**Phase 4** (runs automatically when the `OPENAI_*` environment variables are set):
|
||||
1. Creates RAG database with schema
|
||||
2. Configures embedding with your credentials
|
||||
3. Ingests 10 documents from MySQL
|
||||
4. Generates embeddings via your service
|
||||
5. Verifies:
|
||||
- 10 documents created
|
||||
- 10 chunks created
|
||||
- **10 embeddings created**
|
||||
- Vector self-match works (search finds itself)
|
||||
|
||||
---
|
||||
|
||||
## Expected Output
|
||||
|
||||
```
|
||||
==> embedding_json: {"enabled":true,"provider":"openai","api_base":"https://...","api_key":"***","model":"...","dim":1536,"input":{"concat":[{"col":"Title"},{"lit":"\n"},{"chunk_body":true}]}}
|
||||
Ingesting source_id=1 name=test_source backend=mysql table=posts
|
||||
Done source test_source ingested_docs=10 skipped_docs=0
|
||||
OK: rag_documents (embeddings enabled) = 10
|
||||
OK: rag_chunks (embeddings enabled) = 10
|
||||
OK: rag_vec_chunks (embeddings enabled) = 10
|
||||
OK: vec self-match (posts:1#0) = posts:1#0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Verification Queries
|
||||
|
||||
After the test, manually verify:
|
||||
|
||||
```bash
|
||||
sqlite3 rag_ingest_test_openai.db <<SQL
|
||||
.load ../deps/sqlite3/sqlite3/vec0.so
|
||||
|
||||
-- All chunks have embeddings?
|
||||
SELECT 'Missing embeddings: ' || COUNT(*) FROM rag_chunks c
|
||||
LEFT JOIN rag_vec_chunks v ON c.chunk_id = v.chunk_id
|
||||
WHERE v.chunk_id IS NULL;
|
||||
-- Expected: 0
|
||||
|
||||
-- Sample embeddings
|
||||
SELECT chunk_id, substr(hex(substr(embedding,1,4)),1,8) AS vec_prefix
|
||||
FROM rag_vec_chunks LIMIT 5;
SQL
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Error: "Failed to generate embeddings"
|
||||
- Check `OPENAI_API_BASE` is correct
|
||||
- Check `OPENAI_API_KEY` is valid
|
||||
- Check `OPENAI_MODEL` exists in your service
|
||||
|
||||
### Error: "Dimension mismatch"
|
||||
- Set `OPENAI_EMBEDDING_DIM` to match your model
|
||||
- Common dimensions: 768, 1536, 3072
|
||||
|
||||
### Timeout errors
|
||||
- The test uses a 20-second request timeout (configurable via `timeout_ms` in `embedding_json`)
|
||||
- Check network connectivity to embedding service
|
||||
|
||||
---
|
||||
|
||||
## Testing Different Batch Sizes
|
||||
|
||||
To test the batching implementation, you can modify the test temporarily:
|
||||
|
||||
```bash
|
||||
# Edit test_rag_ingest.sh, line ~339, add batch_size:
|
||||
# {"enabled":true,"provider":"openai",...,"batch_size":32}
|
||||
```
|
||||
|
||||
Then observe the number of API calls in your embedding service dashboard.
|
||||
@ -0,0 +1,439 @@
|
||||
# RAG Ingestion Tool - Usage Guide
|
||||
|
||||
## Overview
|
||||
|
||||
`rag_ingest` reads data from MySQL, transforms it, chunks documents, builds full-text search indexes, and optionally generates vector embeddings for semantic search.
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# 1. Build the tool (from repository root)
|
||||
cd RAG_POC
|
||||
make
|
||||
|
||||
# 2. Create a RAG database with schema
|
||||
./rag_ingest /path/to/rag.db # First run creates schema automatically
|
||||
|
||||
# 3. Configure your data source (via SQL)
|
||||
sqlite3 /path/to/rag.db < setup_source.sql
|
||||
|
||||
# 4. Run ingestion
|
||||
./rag_ingest /path/to/rag.db
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step Guide
|
||||
|
||||
### Step 1: Create the RAG Database
|
||||
|
||||
```bash
|
||||
# From repository root
|
||||
cd RAG_POC
|
||||
|
||||
# Create empty database and load schema
|
||||
sqlite3 rag_index.db < schema.sql
|
||||
|
||||
# Verify schema loaded
|
||||
sqlite3 rag_index.db ".tables"
|
||||
# Expected output:
|
||||
# rag_chunks rag_fts_chunks rag_sources
|
||||
# rag_documents rag_sync_state rag_vec_chunks
|
||||
```
|
||||
|
||||
### Step 2: Configure Your Data Source
|
||||
|
||||
Insert a source configuration into `rag_sources`:
|
||||
|
||||
```sql
|
||||
-- Minimal configuration (no chunking, no embeddings)
|
||||
INSERT INTO rag_sources (
|
||||
name,
|
||||
enabled,
|
||||
backend_type,
|
||||
backend_host,
|
||||
backend_port,
|
||||
backend_user,
|
||||
backend_pass,
|
||||
backend_db,
|
||||
table_name,
|
||||
pk_column
|
||||
) VALUES (
|
||||
'my_mysql_data', -- Human-readable name
|
||||
1, -- enabled (1=enabled, 0=disabled)
|
||||
'mysql', -- backend type (only 'mysql' supported)
|
||||
'127.0.0.1', -- MySQL host
|
||||
3306, -- MySQL port
|
||||
'root', -- MySQL username
|
||||
'mypassword', -- MySQL password
|
||||
'my_database', -- MySQL database name
|
||||
'posts', -- Table name to read from
|
||||
'Id' -- Primary key column
|
||||
);
|
||||
```
|
||||
|
||||
### Step 3: Run Ingestion
|
||||
|
||||
```bash
|
||||
./rag_ingest rag_index.db
|
||||
```
|
||||
|
||||
**What happens:**
|
||||
1. Connects to MySQL using credentials from `rag_sources`
|
||||
2. Executes `SELECT * FROM posts`
|
||||
3. For each row:
|
||||
- Creates a document in `rag_documents`
|
||||
- Creates a chunk in `rag_chunks` (1 per document when chunking disabled)
|
||||
- Creates FTS entry in `rag_fts_chunks`
|
||||
4. Updates `rag_sync_state` with the max primary key value
|
||||
|
||||
---
|
||||
|
||||
## Common Configurations
|
||||
|
||||
### Configuration 1: Basic Ingestion (No Chunking, No Embeddings)
|
||||
|
||||
```sql
|
||||
INSERT INTO rag_sources (name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db, table_name, pk_column)
|
||||
VALUES ('basic_source', 1, 'mysql', '127.0.0.1', 3306, 'root', 'pass', 'mydb', 'posts', 'Id');
|
||||
|
||||
-- chunking_json and embedding_json default to disabled
|
||||
```
|
||||
|
||||
**Result:** 1 chunk per document, FTS only, no vectors.
|
||||
|
||||
---
|
||||
|
||||
### Configuration 2: Enable Chunking
|
||||
|
||||
Chunking splits long documents into smaller pieces for better retrieval precision.
|
||||
|
||||
```sql
|
||||
INSERT INTO rag_sources (name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db, table_name, pk_column, chunking_json)
|
||||
VALUES (
|
||||
'chunked_source',
|
||||
1,
|
||||
'mysql',
|
||||
'127.0.0.1',
|
||||
3306,
|
||||
'root',
|
||||
'pass',
|
||||
'mydb',
|
||||
'posts',
|
||||
'Id',
|
||||
'{
|
||||
"enabled": true,
|
||||
"unit": "chars",
|
||||
"chunk_size": 4000,
|
||||
"overlap": 400,
|
||||
"min_chunk_size": 800
|
||||
}'
|
||||
);
|
||||
```
|
||||
|
||||
**Result:** Documents split into ~4000-character chunks with 400-character overlap.
|
||||
|
||||
---
|
||||
|
||||
### Configuration 3: Enable Chunking + Embeddings (Stub)
|
||||
|
||||
For testing without an external embedding service.
|
||||
|
||||
```sql
|
||||
INSERT INTO rag_sources (name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db, table_name, pk_column, chunking_json, embedding_json)
|
||||
VALUES (
|
||||
'embedded_source_stub',
|
||||
1,
|
||||
'mysql',
|
||||
'127.0.0.1',
|
||||
3306,
|
||||
'root',
|
||||
'pass',
|
||||
'mydb',
|
||||
'posts',
|
||||
'Id',
|
||||
'{
|
||||
"enabled": true,
|
||||
"unit": "chars",
|
||||
"chunk_size": 4000,
|
||||
"overlap": 400,
|
||||
"min_chunk_size": 800
|
||||
}',
|
||||
'{
|
||||
"enabled": true,
|
||||
"provider": "stub",
|
||||
"dim": 1536
|
||||
}'
|
||||
);
|
||||
```
|
||||
|
||||
**Result:** Pseudo-embeddings generated instantly (no API call). Good for testing.
|
||||
|
||||
---
|
||||
|
||||
### Configuration 4: Enable Chunking + Real Embeddings
|
||||
|
||||
With an OpenAI-compatible embedding service.
|
||||
|
||||
```sql
|
||||
INSERT INTO rag_sources (name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db, table_name, pk_column, chunking_json, embedding_json)
|
||||
VALUES (
|
||||
'embedded_source_real',
|
||||
1,
|
||||
'mysql',
|
||||
'127.0.0.1',
|
||||
3306,
|
||||
'root',
|
||||
'pass',
|
||||
'mydb',
|
||||
'posts',
|
||||
'Id',
|
||||
'{
|
||||
"enabled": true,
|
||||
"unit": "chars",
|
||||
"chunk_size": 4000,
|
||||
"overlap": 400,
|
||||
"min_chunk_size": 800
|
||||
}',
|
||||
'{
|
||||
"enabled": true,
|
||||
"provider": "openai",
|
||||
"api_base": "https://api.openai.com/v1",
|
||||
"api_key": "sk-your-api-key",
|
||||
"model": "text-embedding-3-small",
|
||||
"dim": 1536,
|
||||
"batch_size": 16,
|
||||
"timeout_ms": 20000
|
||||
}'
|
||||
);
|
||||
```
|
||||
|
||||
**Result:** Real embeddings generated via OpenAI API in batches of 16.
|
||||
|
||||
---
|
||||
|
||||
## Configuration Reference
|
||||
|
||||
### chunking_json
|
||||
|
||||
| Field | Type | Default | Description |
|
||||
|-------|------|---------|-------------|
|
||||
| `enabled` | boolean | `true` | Enable/disable chunking |
|
||||
| `unit` | string | `"chars"` | Unit of measurement (only `"chars"` supported) |
|
||||
| `chunk_size` | integer | `4000` | Target size of each chunk |
|
||||
| `overlap` | integer | `400` | Overlap between consecutive chunks |
|
||||
| `min_chunk_size` | integer | `800` | Minimum size to avoid tiny tail chunks |
|
||||
|
||||
### embedding_json
|
||||
|
||||
| Field | Type | Default | Description |
|
||||
|-------|------|---------|-------------|
|
||||
| `enabled` | boolean | `false` | Enable/disable embedding generation |
|
||||
| `provider` | string | `"stub"` | `"stub"` or `"openai"` |
|
||||
| `model` | string | `"unknown"` | Model name (for observability) |
|
||||
| `dim` | integer | `1536` | Vector dimension |
|
||||
| `api_base` | string | - | API base URL (for `provider="openai"`) |
|
||||
| `api_key` | string | - | API authentication key |
|
||||
| `batch_size` | integer | `16` | Maximum chunks per API call |
|
||||
| `timeout_ms` | integer | `20000` | Request timeout in milliseconds |
|
||||
| `input` | object | - | Embedding input template (optional) |
|
||||
|
||||
### embedding_json.input (Advanced)
|
||||
|
||||
Controls what text is embedded. Example:
|
||||
|
||||
```json
|
||||
{
|
||||
"enabled": true,
|
||||
"provider": "openai",
|
||||
"dim": 1536,
|
||||
"input": {
|
||||
"concat": [
|
||||
{"col": "Title"},
|
||||
{"lit": "\nTags: "},
|
||||
{"col": "Tags"},
|
||||
{"lit": "\n\n"},
|
||||
{"chunk_body": true}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Result:** Embeds: `{Title}\nTags: {Tags}\n\n{ChunkBody}`
|
||||
|
||||
---
|
||||
|
||||
## Document Transformation (doc_map_json)
|
||||
|
||||
By default, all columns from the source table are available. To map columns to document fields:
|
||||
|
||||
```sql
|
||||
INSERT INTO rag_sources (name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db, table_name, pk_column, doc_map_json)
|
||||
VALUES (
|
||||
'mapped_source',
|
||||
1,
|
||||
'mysql',
|
||||
'127.0.0.1',
|
||||
3306,
|
||||
'root',
|
||||
'pass',
|
||||
'mydb',
|
||||
'posts',
|
||||
'Id',
|
||||
'{
|
||||
"title": {"expr": "concat(Title, '' - '', Subtitle)"},
|
||||
"body": {"col": "Content"},
|
||||
"metadata": {"expr": "json_object(''id'', Id, ''score'', Score, ''tags'', Tags)"}
|
||||
}'
|
||||
);
|
||||
```
|
||||
|
||||
**Result:** Custom mapping from MySQL columns to document fields.
|
||||
|
||||
---
|
||||
|
||||
## Filtering (where_sql)
|
||||
|
||||
Only ingest rows matching a WHERE clause:
|
||||
|
||||
```sql
|
||||
UPDATE rag_sources
|
||||
SET where_sql = 'Score >= 7 AND CreationDate >= ''2024-01-01'''
|
||||
WHERE source_id = 1;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Running Ingestion
|
||||
|
||||
### Single Run
|
||||
|
||||
```bash
|
||||
./rag_ingest rag_index.db
|
||||
```
|
||||
|
||||
### Incremental Runs (Watermark)
|
||||
|
||||
The tool tracks the last processed primary key value in `rag_sync_state`. Subsequent runs only fetch new rows.
|
||||
|
||||
```bash
|
||||
# First run: ingests all rows
|
||||
./rag_ingest rag_index.db
|
||||
|
||||
# Second run: only ingests new rows
|
||||
./rag_ingest rag_index.db
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Monitoring Progress
|
||||
|
||||
```bash
|
||||
# Progress is printed to stderr
|
||||
./rag_ingest rag_index.db
|
||||
# Output:
|
||||
# Ingesting source_id=1 name=my_source backend=mysql table=posts
|
||||
# progress: ingested_docs=1000 skipped_docs=50
|
||||
# progress: ingested_docs=2000 skipped_docs=100
|
||||
# Done source my_source ingested_docs=2500 skipped_docs=120
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
sqlite3 rag_index.db <<SQL
|
||||
.load ../deps/sqlite3/sqlite3/vec0.so
|
||||
|
||||
-- Check counts
|
||||
SELECT 'documents' AS type, COUNT(*) FROM rag_documents
|
||||
UNION ALL
|
||||
SELECT 'chunks', COUNT(*) FROM rag_chunks
|
||||
UNION ALL
|
||||
SELECT 'fts_entries', COUNT(*) FROM rag_fts_chunks
|
||||
UNION ALL
|
||||
SELECT 'vectors', COUNT(*) FROM rag_vec_chunks;
|
||||
|
||||
-- Check sync state
|
||||
SELECT source_id, mode, cursor_json FROM rag_sync_state;
|
||||
SQL
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### Workflow 1: Initial Setup
|
||||
|
||||
```bash
|
||||
# 1. Create database
|
||||
sqlite3 rag.db < schema.sql
|
||||
|
||||
# 2. Add source
|
||||
sqlite3 rag.db "INSERT INTO rag_sources (name, enabled, backend_type, backend_host, backend_port, backend_user, backend_pass, backend_db, table_name, pk_column, chunking_json)
|
||||
VALUES ('my_data', 1, 'mysql', 'localhost', 3306, 'root', 'pass', 'mydb', 'posts', 'Id', '{\"enabled\":true,\"chunk_size\":4000,\"overlap\":400}');"
|
||||
|
||||
# 3. Ingest
|
||||
./rag_ingest rag.db
|
||||
```
|
||||
|
||||
### Workflow 2: Re-run with New Configuration
|
||||
|
||||
```bash
|
||||
# 1. Update source configuration
|
||||
sqlite3 rag.db "UPDATE rag_sources SET chunking_json='{\"enabled\":true,\"chunk_size\":2000}' WHERE source_id=1;"
|
||||
|
||||
# 2. Clear existing data (optional - to re-chunk with new settings)
|
||||
sqlite3 rag.db "DELETE FROM rag_vec_chunks; DELETE FROM rag_fts_chunks; DELETE FROM rag_chunks; DELETE FROM rag_documents; DELETE FROM rag_sync_state;"
|
||||
|
||||
# 3. Re-ingest
|
||||
./rag_ingest rag.db
|
||||
```
|
||||
|
||||
### Workflow 3: Add Embeddings to Existing Data
|
||||
|
||||
```bash
|
||||
# 1. Enable embeddings on existing source
|
||||
sqlite3 rag.db "UPDATE rag_sources SET embedding_json='{\"enabled\":true,\"provider\":\"stub\",\"dim\":1536}' WHERE source_id=1;"
|
||||
|
||||
# 2. Clear sync state (so it re-processes all rows)
|
||||
sqlite3 rag.db "DELETE FROM rag_sync_state WHERE source_id=1;"
|
||||
|
||||
# 3. Clear vectors only (keep documents and chunks)
|
||||
sqlite3 rag.db "DELETE FROM rag_vec_chunks;"
|
||||
|
||||
# 4. Re-ingest (will skip existing documents, but generate embeddings)
|
||||
./rag_ingest rag.db
|
||||
```
|
||||
|
||||
**Note:** v0 skips documents that already exist. To regenerate embeddings, clear `rag_documents` or use `WHERE` clause.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "MySQL query failed"
|
||||
|
||||
- Verify MySQL credentials in `rag_sources`
|
||||
- Check MySQL server is running
|
||||
- Verify table and column names exist
|
||||
|
||||
### "Failed to load vec0 extension"
|
||||
|
||||
- Ensure `RAG_VEC0_EXT` environment variable points to valid `vec0.so`
|
||||
- Or run: `export RAG_VEC0_EXT=/path/to/vec0.so`
|
||||
|
||||
### "Failed to generate embeddings"
|
||||
|
||||
- Check `embedding_json` configuration
|
||||
- For `provider="openai"`: verify `api_base`, `api_key`, `model`
|
||||
- Check network connectivity to embedding service
|
||||
- Increase `timeout_ms` if needed
|
||||
|
||||
### "No enabled sources found"
|
||||
|
||||
- Run: `SELECT * FROM rag_sources WHERE enabled = 1;`
|
||||
- Ensure `enabled = 1` for your source
|
||||
@ -0,0 +1,33 @@
|
||||
# Build rules for the rag_ingest proof-of-concept binary.
#
# The tool links against artefacts that live in the repository's deps/ tree
# (one directory above this Makefile): the SQLite amalgamation object and a
# static libcurl, plus the MariaDB client library found through LIBDIRS.
# This Makefile has no rules to build those artefacts.

CXX ?= g++
CXXFLAGS ?= -std=c++17 -O2

ROOT_DIR := ..

# Header search paths for the bundled dependencies.
INCLUDES := \
  -I$(ROOT_DIR)/deps/json \
  -I$(ROOT_DIR)/deps/mariadb-client-library/mariadb_client/include \
  -I$(ROOT_DIR)/deps/sqlite3/sqlite-amalgamation-3500400 \
  -I$(ROOT_DIR)/deps/curl/curl/include

LIBDIRS := \
  -L$(ROOT_DIR)/deps/mariadb-client-library/mariadb_client/libmariadb

SQLITE3_OBJ := $(ROOT_DIR)/deps/sqlite3/sqlite-amalgamation-3500400/sqlite3.o

# Use static libcurl
CURL_STATIC_LIB := $(ROOT_DIR)/deps/curl/curl/lib/.libs/libcurl.a

LIBS := -lmariadbclient -lssl -lcrypto -lcrypt -ldl -lpthread $(CURL_STATIC_LIB) -lz

TARGET := rag_ingest
SOURCES := rag_ingest.cpp

.PHONY: all clean

all: $(TARGET)

# The SQLite object and the static libcurl archive are listed as prerequisites
# so that (a) the binary is relinked whenever either of them is rebuilt and
# (b) a missing artefact is reported by make itself ("No rule to make
# target ...") instead of surfacing as an opaque linker error.  The recipe
# names $(SOURCES) explicitly (rather than $^) so the resulting command line
# is unchanged by the extra prerequisites.
$(TARGET): $(SOURCES) $(SQLITE3_OBJ) $(CURL_STATIC_LIB)
	$(CXX) $(CXXFLAGS) $(INCLUDES) $(LIBDIRS) $(SQLITE3_OBJ) $(SOURCES) -o $@ $(LIBS)

clean:
	rm -f $(TARGET)
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,29 @@
|
||||
-- Sample MySQL dataset for rag_ingest testing
|
||||
-- Creates a simple posts table and inserts a few rows.
|
||||
|
||||
CREATE DATABASE IF NOT EXISTS rag_test;
|
||||
USE rag_test;
|
||||
|
||||
DROP TABLE IF EXISTS posts;
|
||||
|
||||
CREATE TABLE posts (
|
||||
Id BIGINT NOT NULL PRIMARY KEY,
|
||||
Title VARCHAR(255) NOT NULL,
|
||||
Body TEXT NOT NULL,
|
||||
Tags VARCHAR(255) NULL,
|
||||
Score INT NOT NULL DEFAULT 0,
|
||||
CreationDate DATETIME NOT NULL,
|
||||
UpdatedAt DATETIME NULL
|
||||
);
|
||||
|
||||
INSERT INTO posts (Id, Title, Body, Tags, Score, CreationDate, UpdatedAt) VALUES
|
||||
(1, 'Hello RAG', 'This is the first test document. It contains sample text for chunking.', 'rag,test', 10, '2024-01-01 10:00:00', '2024-01-02 12:00:00'),
|
||||
(2, 'Second Doc', 'A second document body. It has more text to ensure chunking works across boundaries.', 'example,docs', 5, '2024-01-03 09:30:00', '2024-01-03 11:00:00'),
|
||||
(3, 'ProxySQL RAG', 'ProxySQL adds MCP and RAG support. This row is for ingestion testing.', 'proxysql,rag', 7, '2024-01-05 08:15:00', NULL),
|
||||
(4, 'Short Note', 'Tiny.', 'misc', 1, '2024-01-06 13:00:00', NULL),
|
||||
(5, 'Chunk Stress', 'This row contains a longer body to force multiple chunk boundaries when chunking is enabled. Repeat: This row contains a longer body to force multiple chunk boundaries when chunking is enabled.', 'long,chunk', 12, '2024-01-07 18:45:00', '2024-01-08 07:10:00'),
|
||||
(6, 'Filter Candidate', 'This document should be filtered out by a high score threshold.', 'filter,test', 2, '2024-01-09 14:20:00', NULL),
|
||||
(7, 'Tag Variation', 'Contains tags and mixed content for metadata pick/rename testing.', 'rag,meta,tag', 9, '2024-01-10 09:00:00', '2024-01-10 10:00:00'),
|
||||
(8, 'Null Updated', 'Document with NULL UpdatedAt for null handling in source.', 'nulls', 6, '2024-01-11 16:30:00', NULL),
|
||||
(9, 'High Score', 'This is a high score document for where_sql tests.', 'score,high', 20, '2024-01-12 08:00:00', '2024-01-12 09:30:00'),
|
||||
(10, 'Low Score', 'Low score entry to test filters.', 'score,low', 0, '2024-01-13 12:00:00', NULL);
|
||||
@ -0,0 +1,38 @@
|
||||
-- Sample SQLite setup for rag_ingest testing
|
||||
-- Inserts a sample rag_sources row that points to the MySQL sample.
|
||||
-- Note: schema.sql must be loaded separately before this script.
|
||||
|
||||
-- Insert a sample source (source_id = 1) pointing at the rag_test.posts table
-- created by sample_mysql.sql.
--
-- Note on the JSON columns: SQLite string literals do not interpret
-- backslashes, so the text stored here is exactly what the JSON parser sees.
-- A JSON newline escape is therefore written with a single backslash (\n);
-- doubling it (\\n) would decode to a literal backslash followed by 'n'.
INSERT INTO rag_sources (
  source_id,
  name,
  enabled,
  backend_type,
  backend_host,
  backend_port,
  backend_user,
  backend_pass,
  backend_db,
  table_name,
  pk_column,
  where_sql,
  doc_map_json,
  chunking_json,
  embedding_json
) VALUES (
  1,
  'mysql_posts',
  1,
  'mysql',
  '127.0.0.1',
  3306,
  'root',
  'root',
  'rag_test',
  'posts',
  'Id',
  -- no row filter
  '',
  -- doc_id is "posts:<Id>"; title/body come straight from the source columns
  '{"doc_id":{"format":"posts:{Id}"},"title":{"concat":[{"col":"Title"}]},"body":{"concat":[{"col":"Body"}]},"metadata":{"pick":["Id","Tags","Score","CreationDate"],"rename":{"CreationDate":"CreationDate"}}}',
  '{"enabled":true,"unit":"chars","chunk_size":4000,"overlap":400,"min_chunk_size":800}',
  -- embedding input: "<Title>\nTags: <Tags>\n\n<chunk body>"
  '{"enabled":true,"dim":1536,"model":"text-embedding-3-large","input":{"concat":[{"col":"Title"},{"lit":"\nTags: "},{"col":"Tags"},{"lit":"\n\n"},{"chunk_body":true}]}}'
);
|
||||
@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env bash
#
# Integration test driver for rag_ingest.
# This first section enables strict mode, locates the script and repository
# directories, and fills in defaults for every setting that the caller did not
# provide through the environment.
set -euo pipefail

# Directory containing this script, and the repository root one level above.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${ROOT_DIR}/.." && pwd)"

# Client binaries (each overridable from the environment).
: "${SQLITE_BIN:=${REPO_ROOT}/deps/sqlite3/sqlite3/sqlite3}"
: "${MYSQL_BIN:=mysql}"

# MySQL connection settings.
: "${MYSQL_HOST:=127.0.0.1}"
: "${MYSQL_PORT:=3306}"
: "${MYSQL_USER:=root}"
: "${MYSQL_PASS:=root}"

# Embedding provider configuration (for phase 4/5)
: "${EMBEDDING_PROVIDER:=stub}"
: "${EMBEDDING_DIM:=1536}"
: "${OPENAI_API_BASE:=}"
: "${OPENAI_API_KEY:=}"
: "${OPENAI_MODEL:=hf:nomic-ai/nomic-embed-text-v1.5}"
: "${OPENAI_EMBEDDING_DIM:=}"

# When no explicit OpenAI dimension was given, use 768 for the default nomic
# model and fall back to EMBEDDING_DIM for any other model.
if [[ -z "${OPENAI_EMBEDDING_DIM}" ]]; then
  case "${OPENAI_MODEL}" in
    "hf:nomic-ai/nomic-embed-text-v1.5")
      OPENAI_EMBEDDING_DIM=768
      ;;
    *)
      OPENAI_EMBEDDING_DIM="${EMBEDDING_DIM}"
      ;;
  esac
fi
|
||||
|
||||
# OpenAI-compatible embedding settings.
#
# OPENAI_API_BASE and OPENAI_API_KEY are exported so child processes can see
# them.  Values already supplied by the caller are kept; the placeholder
# endpoint/key below are used only when the caller provided nothing.
# (Previously these two exports were unconditional and silently replaced
# whatever the caller had set.)
#
# Uncomment the remaining lines to force the OpenAI-compatible provider.
# export EMBEDDING_PROVIDER=openai
# export EMBEDDING_DIM=1536
export OPENAI_API_BASE="${OPENAI_API_BASE:-https://api.synthetic.new/openai/v1}"
export OPENAI_API_KEY="${OPENAI_API_KEY:-your_api_key_here}"
# export OPENAI_MODEL="hf:nomic-ai/nomic-embed-text-v1.5"
|
||||
|
||||
# SQLite databases created (and removed) by this test run.
DB1="${ROOT_DIR}/rag_ingest_test.db"
DB_OPENAI="${ROOT_DIR}/rag_ingest_test_openai.db"

# Path to the vec0 SQLite extension; also exported as RAG_VEC0_EXT.
VEC_EXT="${REPO_ROOT}/deps/sqlite3/sqlite3/vec0.so"
RAG_VEC0_EXT="${VEC_EXT}"
export RAG_VEC0_EXT

# Nothing below can work without the extension, so stop right away.
[[ -f "${VEC_EXT}" ]] || {
  echo "FATAL: vec0.so not found at ${VEC_EXT}" >&2
  exit 1
}
|
||||
|
||||
#######################################
# Run SQL against a SQLite database with the vec0 extension loaded first.
# Globals:   SQLITE_BIN (read), VEC_EXT (read)
# Arguments: $1 - database file
#            $2 - SQL text (and/or dot-commands) to execute
# Outputs:   whatever the sqlite shell prints
# Returns:   exit status of the sqlite shell
#######################################
run_sqlite() {
  local database="$1"
  local statements="$2"
  # Feed ".load <ext>" followed by the caller's SQL on stdin.
  "${SQLITE_BIN}" "${database}" <<<".load ${VEC_EXT}"$'\n'"${statements}"
}
|
||||
|
||||
#######################################
# Optionally load the schema and sample source into a database, then set the
# chunking, embedding and where_sql configuration of source_id=1.
# Globals:   SQLITE_BIN, VEC_EXT, ROOT_DIR (all read)
# Arguments: $1 - database file
#            $2 - where_sql value (may be empty)
#            $3 - "true" to .read the schema and sample_sqlite.sql first
#            $4 - optional chunking_json override
#            $5 - optional embedding_json override
#            $6 - optional schema file to use instead of ${ROOT_DIR}/schema.sql
# Outputs:   a "==>" summary of the effective settings, plus sqlite output
# Returns:   exit status of the sqlite shell
# Note:      values are spliced verbatim into single-quoted SQL literals, so
#            any single quote inside them must already be doubled by the caller.
#######################################
apply_schema_and_source() {
  local db="$1"
  local where_sql="$2"
  local load_schema="$3"
  local chunking_json_override="${4:-}"
  local embedding_json_override="${5:-}"
  local schema_override_path="${6:-}"

  # Dot-commands that load the schema and the sample source (empty when the
  # caller did not ask for a schema load).
  local schema_cmd=""
  if [[ "${load_schema}" == "true" ]]; then
    local schema_file="${schema_override_path:-${ROOT_DIR}/schema.sql}"
    schema_cmd=".read ${schema_file}"$'\n'".read ${ROOT_DIR}/sample_sqlite.sql"
  fi

  # Defaults: chunking off, embeddings off. An empty override means "default".
  local default_chunking='{"enabled":false,"unit":"chars","chunk_size":4000,"overlap":400,"min_chunk_size":800}'
  local default_embedding='{"enabled":false}'
  local chunking_json_value="${chunking_json_override:-${default_chunking}}"
  local embedding_json_value="${embedding_json_override:-${default_embedding}}"

  echo "==> SQLite DB: ${db}"
  echo "==> load_schema: ${load_schema}"
  echo "==> where_sql: ${where_sql:-<empty>}"
  echo "==> chunking_json: ${chunking_json_value}"
  echo "==> embedding_json: ${embedding_json_value}"

  "${SQLITE_BIN}" "${db}" <<SQL
.load ${VEC_EXT}
.bail on
.mode list
.separator |
.nullvalue NULL
${schema_cmd}
UPDATE rag_sources
SET chunking_json='${chunking_json_value}'
WHERE source_id=1;
UPDATE rag_sources
SET embedding_json='${embedding_json_value}'
WHERE source_id=1;
UPDATE rag_sources
SET where_sql='${where_sql}'
WHERE source_id=1;
SQL
}
|
||||
|
||||
#######################################
# (Re)load the sample MySQL dataset by piping sample_mysql.sql to the client.
# Globals:   MYSQL_BIN, MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASS,
#            ROOT_DIR (all read)
# Arguments: none
# Returns:   exit status of the mysql client
#######################################
import_mysql_seed() {
  local -a conn_args=(
    -h"${MYSQL_HOST}"
    -P"${MYSQL_PORT}"
    -u"${MYSQL_USER}"
    -p"${MYSQL_PASS}"
  )
  "${MYSQL_BIN}" "${conn_args[@]}" < "${ROOT_DIR}/sample_mysql.sql"
}
|
||||
|
||||
#######################################
# Execute a SQL string on the MySQL server through the client's -e option.
# Globals:   MYSQL_BIN, MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASS (read)
# Arguments: $1 - SQL text to execute
# Returns:   exit status of the mysql client
#######################################
run_mysql_sql() {
  local statement="$1"
  local -a conn_args=(
    -h"${MYSQL_HOST}"
    -P"${MYSQL_PORT}"
    -u"${MYSQL_USER}"
    -p"${MYSQL_PASS}"
  )
  "${MYSQL_BIN}" "${conn_args[@]}" -e "${statement}"
}
|
||||
|
||||
#######################################
# Compare two strings; abort the whole script on mismatch.
# Arguments: $1 - label describing the check
#            $2 - expected value
#            $3 - actual value
# Outputs:   "OK: ..." on stdout when equal, "FAIL: ..." on stderr otherwise
# Returns:   0 when equal; exits the shell with status 1 when different
#######################################
assert_eq() {
  local label="$1" expected="$2" actual="$3"

  if [[ "${expected}" == "${actual}" ]]; then
    echo "OK: ${label} = ${actual}"
    return 0
  fi

  echo "FAIL: ${label} expected ${expected}, got ${actual}" >&2
  exit 1
}
|
||||
|
||||
#######################################
# Count rows of rag_fts_chunks matching a full-text query.
# Arguments: $1 - database file
#            $2 - FTS MATCH expression (spliced verbatim into the SQL literal)
# Outputs:   the count, as printed by run_sqlite
#######################################
fts_count() {
  local database="$1"
  local match_expr="$2"
  local query="SELECT COUNT(*) FROM rag_fts_chunks WHERE rag_fts_chunks MATCH '${match_expr}';"
  run_sqlite "${database}" "${query}"
}
|
||||
|
||||
#######################################
# Return the chunk_id ranked first by bm25() for a full-text query.
# Arguments: $1 - database file
#            $2 - FTS MATCH expression (spliced verbatim into the SQL literal)
# Outputs:   the top chunk_id, as printed by run_sqlite
#######################################
fts_bm25_top() {
  local database="$1"
  local match_expr="$2"
  local query="SELECT chunk_id FROM rag_fts_chunks WHERE rag_fts_chunks MATCH '${match_expr}' ORDER BY bm25(rag_fts_chunks) LIMIT 1;"
  run_sqlite "${database}" "${query}"
}
|
||||
|
||||
#######################################
# Search rag_vec_chunks using the stored embedding of one chunk as the query
# vector and return the closest chunk_id.
# Arguments: $1 - database file
#            $2 - chunk_id whose embedding is used as the query
# Outputs:   the nearest chunk_id, as printed by run_sqlite
#######################################
vec_self_match() {
  local database="$1"
  local probe_chunk="$2"
  local query="SELECT chunk_id FROM rag_vec_chunks WHERE embedding MATCH (SELECT embedding FROM rag_vec_chunks WHERE chunk_id='${probe_chunk}') ORDER BY distance LIMIT 1;"
  run_sqlite "${database}" "${query}"
}
|
||||
|
||||
#######################################
# Print a few rows from the documents, chunks and FTS tables for eyeballing.
# Arguments: $1 - database file
# Outputs:   three "==>" headings, each followed by up to five sample rows
#######################################
print_samples() {
  local database="$1"
  local docs_sql="SELECT doc_id, source_id, substr(title,1,40) AS title, json_extract(metadata_json,'$.Score') AS score FROM rag_documents ORDER BY doc_id LIMIT 5;"
  local chunks_sql="SELECT chunk_id, doc_id, chunk_index, substr(body,1,50) AS body FROM rag_chunks ORDER BY chunk_id LIMIT 5;"
  local fts_sql="SELECT chunk_id, substr(title,1,40) AS title FROM rag_fts_chunks WHERE rag_fts_chunks MATCH 'ProxySQL' ORDER BY chunk_id LIMIT 5;"

  echo "==> Sample rag_documents"
  run_sqlite "${database}" "${docs_sql}"

  echo "==> Sample rag_chunks"
  run_sqlite "${database}" "${chunks_sql}"

  echo "==> Sample rag_fts_chunks matches for 'ProxySQL'"
  run_sqlite "${database}" "${fts_sql}"
}
|
||||
|
||||
#######################################
# Remove the SQLite databases left behind by a previous test run.
# Globals:   DB1 (read), DB_OPENAI (read)
# Arguments: none
#######################################
cleanup_db() {
  local stale_db
  for stale_db in "${DB1}" "${DB_OPENAI}"; do
    rm -f "${stale_db}"
  done
}
|
||||
|
||||
cleanup_db
|
||||
|
||||
# Phase 1: load schema + source, chunking disabled, no where filter
|
||||
apply_schema_and_source "${DB1}" "" "true"
|
||||
|
||||
# Seed MySQL
|
||||
import_mysql_seed
|
||||
|
||||
# Run rag_ingest
|
||||
"${ROOT_DIR}/rag_ingest" "${DB1}"
|
||||
|
||||
# Validate counts (sample_mysql has 10 rows)
|
||||
DOCS_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
|
||||
CHUNKS_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")"
|
||||
FTS_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_fts_chunks;")"
|
||||
VEC_COUNT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_vec_chunks;")"
|
||||
|
||||
assert_eq "rag_documents" "10" "${DOCS_COUNT}"
|
||||
assert_eq "rag_chunks (chunking disabled)" "10" "${CHUNKS_COUNT}"
|
||||
assert_eq "rag_fts_chunks" "10" "${FTS_COUNT}"
|
||||
assert_eq "rag_vec_chunks (embedding disabled)" "0" "${VEC_COUNT}"
|
||||
|
||||
print_samples "${DB1}"
|
||||
|
||||
# FTS tests (phase 1)
|
||||
FTS_PHRASE_1="$(fts_count "${DB1}" '"ProxySQL adds MCP"')"
|
||||
FTS_SHORT_1="$(fts_count "${DB1}" 'Short')"
|
||||
FTS_TAG_1="$(fts_count "${DB1}" 'Tag')"
|
||||
FTS_BM25_1="$(fts_bm25_top "${DB1}" 'ProxySQL')"
|
||||
|
||||
assert_eq "fts phrase (ProxySQL adds MCP)" "1" "${FTS_PHRASE_1}"
|
||||
assert_eq "fts term (Short)" "1" "${FTS_SHORT_1}"
|
||||
assert_eq "fts term (Tag)" "1" "${FTS_TAG_1}"
|
||||
assert_eq "fts bm25 top (ProxySQL)" "posts:3#0" "${FTS_BM25_1}"
|
||||
|
||||
# Phase 1a: update skip behavior (existing docs are not updated)
|
||||
run_mysql_sql "USE rag_test; UPDATE posts SET Title='Hello RAG UPDATED' WHERE Id=1;"
|
||||
"${ROOT_DIR}/rag_ingest" "${DB1}"
|
||||
TITLE_AFTER_UPDATE="$(run_sqlite "${DB1}" "SELECT title FROM rag_documents WHERE doc_id='posts:1';")"
|
||||
assert_eq "rag_documents title unchanged on update" "Hello RAG" "${TITLE_AFTER_UPDATE}"
|
||||
|
||||
# Reset MySQL data after update test
|
||||
import_mysql_seed
|
||||
|
||||
# Phase 1b: rag_sync_state watermark (incremental ingestion)
|
||||
SYNC_COL_1="$(run_sqlite "${DB1}" "SELECT json_extract(cursor_json,'$.column') FROM rag_sync_state WHERE source_id=1;")"
|
||||
SYNC_VAL_1="$(run_sqlite "${DB1}" "SELECT json_extract(cursor_json,'$.value') FROM rag_sync_state WHERE source_id=1;")"
|
||||
|
||||
assert_eq "rag_sync_state column" "Id" "${SYNC_COL_1}"
|
||||
assert_eq "rag_sync_state value (initial)" "10" "${SYNC_VAL_1}"
|
||||
|
||||
# Delete one doc to verify watermark prevents backfill
|
||||
run_sqlite "${DB1}" "DELETE FROM rag_vec_chunks WHERE chunk_id LIKE 'posts:5#%';"
|
||||
run_sqlite "${DB1}" "DELETE FROM rag_fts_chunks WHERE chunk_id LIKE 'posts:5#%';"
|
||||
run_sqlite "${DB1}" "DELETE FROM rag_chunks WHERE doc_id='posts:5';"
|
||||
run_sqlite "${DB1}" "DELETE FROM rag_documents WHERE doc_id='posts:5';"
|
||||
|
||||
DOCS_AFTER_DELETE="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
|
||||
assert_eq "rag_documents after delete" "9" "${DOCS_AFTER_DELETE}"
|
||||
|
||||
"${ROOT_DIR}/rag_ingest" "${DB1}"
|
||||
|
||||
DOCS_AFTER_REINGEST="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
|
||||
CHUNKS_AFTER_REINGEST="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")"
|
||||
FTS_AFTER_REINGEST="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_fts_chunks;")"
|
||||
|
||||
assert_eq "rag_documents after watermark reingest" "9" "${DOCS_AFTER_REINGEST}"
|
||||
assert_eq "rag_chunks after watermark reingest" "9" "${CHUNKS_AFTER_REINGEST}"
|
||||
assert_eq "rag_fts_chunks after watermark reingest" "9" "${FTS_AFTER_REINGEST}"
|
||||
|
||||
# Add one brand-new source row (Id 11) and ingest again. Expected outcome:
# the document count goes from 9 to 10 and the stored cursor value becomes 11.
wm_new_row_sql="USE rag_test; INSERT INTO posts (Id, Title, Body, Tags, Score, CreationDate, UpdatedAt) VALUES (11, 'Watermark New', 'This row should be ingested via watermark.', 'wm,test', 1, '2024-01-14 10:00:00', '2024-01-14 11:00:00');"
run_mysql_sql "${wm_new_row_sql}"

"${ROOT_DIR}/rag_ingest" "${DB1}"

# "\$" keeps the JSON path ($.value) literal inside the double-quoted string.
wm_cursor_value_sql="SELECT json_extract(cursor_json,'\$.value') FROM rag_sync_state WHERE source_id=1;"

DOCS_AFTER_NEW="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
SYNC_VAL_2="$(run_sqlite "${DB1}" "${wm_cursor_value_sql}")"

assert_eq "rag_documents after new row" "10" "${DOCS_AFTER_NEW}"
assert_eq "rag_sync_state value (after new row)" "11" "${SYNC_VAL_2}"

# Leave a clean slate for the following phases: forget the cursor ...
run_sqlite "${DB1}" "DELETE FROM rag_sync_state;"

# ... and re-import the MySQL seed data after the extra row inserted above.
import_mysql_seed
|
||||
|
||||
# Phase 1c: UpdatedAt-based watermark filtering
# Start from an empty RAG store, pre-seed the sync cursor on the UpdatedAt
# column, and check that a single document is ingested and the cursor moves
# forward to the value asserted below.
for p1c_table in rag_vec_chunks rag_fts_chunks rag_chunks rag_documents; do
  run_sqlite "${DB1}" "DELETE FROM ${p1c_table};"
done

# Cursor JSON kept in single quotes so no escaping of the inner quotes is needed.
p1c_cursor_json='{"column":"UpdatedAt","value":"2024-01-10 10:00:00"}'
run_sqlite "${DB1}" "INSERT OR REPLACE INTO rag_sync_state(source_id, mode, cursor_json, last_ok_at, last_error) VALUES (1, 'poll', '${p1c_cursor_json}', NULL, NULL);"

"${ROOT_DIR}/rag_ingest" "${DB1}"

p1c_cursor_value_sql="SELECT json_extract(cursor_json,'\$.value') FROM rag_sync_state WHERE source_id=1;"

DOCS_UPDATED_AT="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
SYNC_UPDATED_AT="$(run_sqlite "${DB1}" "${p1c_cursor_value_sql}")"

assert_eq "rag_documents (UpdatedAt watermark)" "1" "${DOCS_UPDATED_AT}"
assert_eq "rag_sync_state value (UpdatedAt)" "2024-01-12 09:30:00" "${SYNC_UPDATED_AT}"

# Reset sync state for subsequent phases
run_sqlite "${DB1}" "DELETE FROM rag_sync_state;"
|
||||
|
||||
# Phase 2: apply where filter, re-ingest after cleanup
# Empties the RAG tables, reconfigures the source with the "Score >= 7" filter
# and verifies document/chunk/FTS counts plus a few FTS lookups.
for p2_table in rag_vec_chunks rag_fts_chunks rag_chunks rag_documents; do
  run_sqlite "${DB1}" "DELETE FROM ${p2_table};"
done

p2_where_filter="Score >= 7"
apply_schema_and_source "${DB1}" "${p2_where_filter}" "false"
"${ROOT_DIR}/rag_ingest" "${DB1}"

p2_count_prefix="SELECT COUNT(*) FROM"
DOCS_COUNT_2="$(run_sqlite "${DB1}" "${p2_count_prefix} rag_documents;")"
CHUNKS_COUNT_2="$(run_sqlite "${DB1}" "${p2_count_prefix} rag_chunks;")"
FTS_COUNT_2="$(run_sqlite "${DB1}" "${p2_count_prefix} rag_fts_chunks;")"
VEC_COUNT_2="$(run_sqlite "${DB1}" "${p2_count_prefix} rag_vec_chunks;")"

# In sample_mysql: Score >= 7 matches Id 1,3,5,7,9 => 5 docs
assert_eq "rag_documents (where_sql)" "5" "${DOCS_COUNT_2}"
assert_eq "rag_chunks (where_sql)" "5" "${CHUNKS_COUNT_2}"
assert_eq "rag_fts_chunks (where_sql)" "5" "${FTS_COUNT_2}"
# No vector rows are expected in this phase.
assert_eq "rag_vec_chunks (where_sql, embedding disabled)" "0" "${VEC_COUNT_2}"

print_samples "${DB1}"

# FTS tests (phase 2): per-term match counts and the top-ranked chunk for 'High'.
FTS_PROXYSQL_2="$(fts_count "${DB1}" 'ProxySQL')"
FTS_HIGH_2="$(fts_count "${DB1}" 'High')"
FTS_LOW_2="$(fts_count "${DB1}" 'Low')"
FTS_BM25_2="$(fts_bm25_top "${DB1}" 'High')"

assert_eq "fts term (ProxySQL)" "1" "${FTS_PROXYSQL_2}"
assert_eq "fts term (High)" "1" "${FTS_HIGH_2}"
assert_eq "fts term (Low)" "0" "${FTS_LOW_2}"
assert_eq "fts bm25 top (High)" "posts:9#0" "${FTS_BM25_2}"
|
||||
|
||||
# Phase 3: enable chunking and ensure rows split into multiple chunks
# Clears the sync state and all RAG tables, re-applies the source with a
# character-based chunking configuration, ingests, and then checks that
#   * all 10 documents are present,
#   * there are strictly more chunks than documents, and
#   * posts:5 in particular produced more than one chunk.
for p3_table in rag_sync_state rag_vec_chunks rag_fts_chunks rag_chunks rag_documents; do
  run_sqlite "${DB1}" "DELETE FROM ${p3_table};"
done

p3_chunking_json='{"enabled":true,"unit":"chars","chunk_size":50,"overlap":10,"min_chunk_size":10}'
apply_schema_and_source "${DB1}" "" "false" "${p3_chunking_json}"
"${ROOT_DIR}/rag_ingest" "${DB1}"

DOCS_COUNT_3="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_COUNT_3="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")"
LONG_DOC_CHUNKS="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks WHERE doc_id='posts:5';")"

assert_eq "rag_documents (chunking enabled)" "10" "${DOCS_COUNT_3}"

# The checks are phrased as "fail unless the expected relation holds" so they
# fail closed: if a count is not a valid integer (e.g. error text), `[[ ... ]]`
# returns non-zero and the test fails instead of silently passing.
if ! [[ "${CHUNKS_COUNT_3}" -gt "${DOCS_COUNT_3}" ]]; then
  echo "FAIL: rag_chunks should be greater than rag_documents when chunking enabled" >&2
  exit 1
fi
if ! [[ "${LONG_DOC_CHUNKS}" -gt "1" ]]; then
  echo "FAIL: posts:5 should produce multiple chunks" >&2
  exit 1
fi

print_samples "${DB1}"
|
||||
|
||||
# Phase 4: enable embeddings (stub) and validate vec rows
# Clears the sync state and all RAG tables, re-applies the source with an
# embedding configuration built from EMBEDDING_PROVIDER / EMBEDDING_DIM,
# ingests, and expects one vector row per chunk (10/10/10).
for p4_table in rag_sync_state rag_vec_chunks rag_fts_chunks rag_chunks rag_documents; do
  run_sqlite "${DB1}" "DELETE FROM ${p4_table};"
done

# Embedding input spec: Title, a literal "\n" (backslash-n, as JSON escape), then the chunk body.
p4_embed_input='{"concat":[{"col":"Title"},{"lit":"\n"},{"chunk_body":true}]}'
# printf %s inserts the arguments verbatim (no backslash processing).
printf -v p4_embed_json '{"enabled":true,"provider":"%s","dim":%s,"input":%s}' \
  "${EMBEDDING_PROVIDER}" "${EMBEDDING_DIM}" "${p4_embed_input}"

apply_schema_and_source "${DB1}" "" "false" '' "${p4_embed_json}"
"${ROOT_DIR}/rag_ingest" "${DB1}"

DOCS_COUNT_4="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_documents;")"
CHUNKS_COUNT_4="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_chunks;")"
VEC_COUNT_4="$(run_sqlite "${DB1}" "SELECT COUNT(*) FROM rag_vec_chunks;")"

assert_eq "rag_documents (embeddings enabled)" "10" "${DOCS_COUNT_4}"
assert_eq "rag_chunks (embeddings enabled)" "10" "${CHUNKS_COUNT_4}"
assert_eq "rag_vec_chunks (embeddings enabled)" "10" "${VEC_COUNT_4}"

# The self-match lookup for posts:1#0 must return that same chunk id.
VEC_MATCH_1="$(vec_self_match "${DB1}" 'posts:1#0')"
assert_eq "vec self-match (posts:1#0)" "posts:1#0" "${VEC_MATCH_1}"

print_samples "${DB1}"
|
||||
|
||||
# Phase 5: optional OpenAI-compatible embeddings test (requires env vars)
# Runs only when both OPENAI_API_BASE and OPENAI_API_KEY are non-empty;
# otherwise the phase is skipped with a notice. Also reads OPENAI_MODEL and
# OPENAI_EMBEDDING_DIM. The ":-" defaults keep the guard safe when the
# variables are unset and the shell runs with `set -u`.
if [[ -n "${OPENAI_API_BASE:-}" && -n "${OPENAI_API_KEY:-}" ]]; then
  # The dimension is interpolated into a sed expression and into JSON below,
  # so reject anything that is not a plain positive integer up front.
  if [[ ! "${OPENAI_EMBEDDING_DIM:-}" =~ ^[1-9][0-9]*$ ]]; then
    echo "FAIL: OPENAI_EMBEDDING_DIM must be a positive integer (got '${OPENAI_EMBEDDING_DIM:-}')" >&2
    exit 1
  fi

  # Derive a schema whose vector column matches the configured dimension.
  OPENAI_SCHEMA_TMP="${ROOT_DIR}/schema_openai_tmp.sql"
  sed "s/embedding float\[1536\]/embedding float[${OPENAI_EMBEDDING_DIM}]/" "${ROOT_DIR}/schema.sql" > "${OPENAI_SCHEMA_TMP}"

  apply_schema_and_source "${DB_OPENAI}" "" "true" '' "{\"enabled\":true,\"provider\":\"openai\",\"api_base\":\"${OPENAI_API_BASE}\",\"api_key\":\"${OPENAI_API_KEY}\",\"model\":\"${OPENAI_MODEL}\",\"dim\":${OPENAI_EMBEDDING_DIM},\"input\":{\"concat\":[{\"col\":\"Title\"},{\"lit\":\"\\n\"},{\"chunk_body\":true}]}}" "${OPENAI_SCHEMA_TMP}"
  # The temporary schema is only passed to apply_schema_and_source in this
  # script, so remove it right away; a failing step below then cannot leave it
  # behind. NOTE(review): assumes nothing else reads this file - confirm.
  rm -f "${OPENAI_SCHEMA_TMP}"

  "${ROOT_DIR}/rag_ingest" "${DB_OPENAI}"

  DOCS_COUNT_5="$(run_sqlite "${DB_OPENAI}" "SELECT COUNT(*) FROM rag_documents;")"
  CHUNKS_COUNT_5="$(run_sqlite "${DB_OPENAI}" "SELECT COUNT(*) FROM rag_chunks;")"
  VEC_COUNT_5="$(run_sqlite "${DB_OPENAI}" "SELECT COUNT(*) FROM rag_vec_chunks;")"

  assert_eq "rag_documents (openai embeddings)" "10" "${DOCS_COUNT_5}"
  assert_eq "rag_chunks (openai embeddings)" "10" "${CHUNKS_COUNT_5}"
  assert_eq "rag_vec_chunks (openai embeddings)" "10" "${VEC_COUNT_5}"

  print_samples "${DB_OPENAI}"
else
  echo "==> OpenAI embeddings test skipped (set OPENAI_API_BASE and OPENAI_API_KEY)"
fi

echo "All tests passed."
|
||||
Loading…
Reference in new issue