From ffdb334dc30929103603f1bbdb02cdeed8f73373 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Wed, 24 Dec 2025 05:41:46 +0000 Subject: [PATCH] Add WHERE filters to prevent empty input errors and fix SQL syntax Changes: - Filter Posts by PostTypeId IN (1,2) (Questions and Answers) - Filter by minimum text length > 30 characters (Title + Body) - Update get_total_posts to count only eligible posts for accurate progress - Fix SQL syntax error in process_batch WHERE clause - Update documentation with filter details Rationale: - Empty or very short text causes embedding generation failures - PostTypeId 1,2 are most relevant content (Questions and Answers) - Ensures consistent counting between total, remaining, and processed --- scripts/process_posts_embeddings.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/scripts/process_posts_embeddings.py b/scripts/process_posts_embeddings.py index 57fdda807..c736588b5 100755 --- a/scripts/process_posts_embeddings.py +++ b/scripts/process_posts_embeddings.py @@ -5,6 +5,10 @@ Process Posts table embeddings using sqlite-rembed in ProxySQL SQLite3 server. Connects to SQLite3 server via MySQL connector, configures API client, and processes unembedded Posts rows in batches of 10. +Filters applied: +- Only PostTypeId IN (1,2) (Questions and Answers) +- Minimum text length > 30 characters (Title + Body) + Prerequisites: 1. Posts table must exist (copied from MySQL) 2. Posts_embeddings virtual table must exist: @@ -115,7 +119,9 @@ def get_remaining_count(conn): SELECT COUNT(*) FROM Posts LEFT JOIN Posts_embeddings ON Posts.rowid = Posts_embeddings.rowid - WHERE Posts_embeddings.rowid IS NULL; + WHERE Posts.PostTypeId IN (1,2) + AND LENGTH(COALESCE(Posts.Title || '', '') || Posts.Body) > 30 + AND Posts_embeddings.rowid IS NULL; """ try: @@ -133,11 +139,16 @@ def get_remaining_count(conn): raise def get_total_posts(conn): - """Get total number of Posts.""" + """Get total number of eligible Posts (PostTypeId 1,2 with text length > 30).""" cursor = conn.cursor() try: - cursor.execute("SELECT COUNT(*) FROM Posts;") + cursor.execute(""" + SELECT COUNT(*) + FROM Posts + WHERE PostTypeId IN (1,2) + AND LENGTH(COALESCE(Posts.Title || '', '') || Posts.Body) > 30; + """) result = cursor.fetchone() if result and result[0] is not None: total = int(result[0]) @@ -160,7 +171,9 @@ def process_batch(conn, args): COALESCE(Posts.Title || ' ', '') || Posts.Body) as embedding FROM Posts LEFT JOIN Posts_embeddings ON Posts.rowid = Posts_embeddings.rowid - WHERE Posts_embeddings.rowid IS NULL + WHERE Posts.PostTypeId IN (1,2) + AND LENGTH(COALESCE(Posts.Title || '', '') || Posts.Body) > 30 + AND Posts_embeddings.rowid IS NULL LIMIT {args.batch_size}; """ @@ -287,4 +300,4 @@ def main(): conn.close() if __name__ == "__main__": - main() \ No newline at end of file + main()