From 62cbd6c71e313c82a30fa2a498cedfea1e76ff78 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 8 Jan 2026 12:39:07 +0000 Subject: [PATCH] Fix issues identified in AI code review - Fix combined_search: Build SQL query dynamically based on search_term - Only include MATCH relevance when search_term is provided - Fix parameter binding (fulltext_params vs params) - Fix search_term/query parameter mismatch in run_demo - combined_search mode now correctly receives query from kwargs - Add environment variable support for database credentials - nlp_search_demo.py: DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME - stackexchange_posts.py: SOURCE_DB_* and TARGET_DB_* variables - Fix README CreationDate SQL example to use JSON_EXTRACT - Add zero division checks in get_table_stats and similarity_search_preparation --- scripts/README.md | 4 +-- scripts/nlp_search_demo.py | 61 ++++++++++++++++++++++++---------- scripts/stackexchange_posts.py | 23 +++++++------ 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 9a93d4d79..897520b52 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -107,10 +107,10 @@ WHERE JSON_CONTAINS(Tags, '"mysql"') AND JSON_CONTAINS(Tags, '"performance"'); ```sql -- Search within date range -SELECT PostId, Title, CreationDate +SELECT PostId, Title, JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate FROM processed_posts WHERE MATCH(SearchText) AGAINST('python' IN BOOLEAN MODE) -AND CreationDate BETWEEN '2023-01-01' AND '2023-12-31'; +AND JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) BETWEEN '2023-01-01' AND '2023-12-31'; ``` ## Performance Tips diff --git a/scripts/nlp_search_demo.py b/scripts/nlp_search_demo.py index 3ba796e78..234b87f44 100755 --- a/scripts/nlp_search_demo.py +++ b/scripts/nlp_search_demo.py @@ -20,6 +20,7 @@ from typing import List, Dict, Any, Set, Tuple import argparse import time import sys +import os class NLPSearchDemo: @@ -80,7 +81,10 @@ class NLPSearchDemo: print(f"\nšŸ“Š Table Statistics:") print(f" Total posts: {total_posts:,}") - print(f" Posts with tags: {posts_with_tags:,} ({posts_with_tags/total_posts*100:.1f}%)") + if total_posts > 0: + print(f" Posts with tags: {posts_with_tags:,} ({posts_with_tags/total_posts*100:.1f}%)") + else: + print(f" Posts with tags: {posts_with_tags:,}") print(f" Date range: {date_range['earliest'][:10]} to {date_range['latest'][:10]}") print(f" Unique tags: {len(all_tags):,}") @@ -253,17 +257,37 @@ class NLPSearchDemo: # Build WHERE clause where_clause = " AND ".join(conditions) if conditions else "1=1" + # Build SELECT clause dynamically - only include relevance if search_term is provided + if search_term: + select_clause = """ + SELECT + PostId, + TitleText, + JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate, + JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson, + MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance, + CreatedAt + """ + order_clause = "ORDER BY relevance DESC, CreatedAt DESC" + # Add search_term again for the SELECT clause's MATCH + fulltext_params = [search_term] + params + [limit] + else: + select_clause = """ + SELECT + PostId, + TitleText, + JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate, + JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson, + CreatedAt + """ + order_clause = "ORDER BY CreatedAt DESC" + fulltext_params = params + [limit] + sql = f""" - SELECT - PostId, - TitleText, - JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate, - JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson, - MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance, - CreatedAt + {select_clause} FROM processed_posts WHERE {where_clause} - ORDER BY relevance DESC, CreatedAt DESC + {order_clause} LIMIT %s """ @@ -271,7 +295,7 @@ class NLPSearchDemo: try: # First try full-text search - cursor.execute(sql, params) + cursor.execute(sql, fulltext_params) results = cursor.fetchall() search_method = "combined" except Error: @@ -384,7 +408,8 @@ class NLPSearchDemo: all_text.append(combined) print(f" Total text length: {sum(len(text) for text in all_text):,} characters") - print(f" Average text length: {sum(len(text) for text in all_text) / len(all_text):,.0f} characters") + if all_text: + print(f" Average text length: {sum(len(text) for text in all_text) / len(all_text):,.0f} characters") return results @@ -417,7 +442,7 @@ class NLPSearchDemo: limit = kwargs.get('limit', 10) self.tag_search(conn, tags, operator, limit) elif mode == "combined": - search_term = kwargs.get('search_term', None) + search_term = kwargs.get('query', None) tags = kwargs.get('tags', None) date_from = kwargs.get('date_from', None) date_to = kwargs.get('date_to', None) @@ -436,13 +461,13 @@ class NLPSearchDemo: def main(): - # Default configuration + # Default configuration (can be overridden by environment variables) config = { - "host": "127.0.0.1", - "port": 3306, - "user": "stackexchange", - "password": "my-password", - "database": "stackexchange_post", + "host": os.getenv("DB_HOST", "127.0.0.1"), + "port": int(os.getenv("DB_PORT", "3306")), + "user": os.getenv("DB_USER", "stackexchange"), + "password": os.getenv("DB_PASSWORD", "my-password"), + "database": os.getenv("DB_NAME", "stackexchange_post"), "use_pure": True, "ssl_disabled": True } diff --git a/scripts/stackexchange_posts.py b/scripts/stackexchange_posts.py index 211c0cd4d..70584e0a2 100755 --- a/scripts/stackexchange_posts.py +++ b/scripts/stackexchange_posts.py @@ -19,6 +19,7 @@ from typing import List, Dict, Any, Set, Tuple import argparse import time import sys +import os class StackExchangeProcessor: def __init__(self, source_config: Dict[str, Any], target_config: Dict[str, Any]): @@ -396,23 +397,23 @@ class StackExchangeProcessor: print("\nšŸ”Œ Database connections closed") def main(): - # Default configurations + # Default configurations (can be overridden by environment variables) source_config = { - "host": "127.0.0.1", - "port": 3306, - "user": "stackexchange", - "password": "my-password", - "database": "stackexchange", + "host": os.getenv("SOURCE_DB_HOST", "127.0.0.1"), + "port": int(os.getenv("SOURCE_DB_PORT", "3306")), + "user": os.getenv("SOURCE_DB_USER", "stackexchange"), + "password": os.getenv("SOURCE_DB_PASSWORD", "my-password"), + "database": os.getenv("SOURCE_DB_NAME", "stackexchange"), "use_pure": True, "ssl_disabled": True } target_config = { - "host": "127.0.0.1", - "port": 3306, - "user": "stackexchange", - "password": "my-password", - "database": "stackexchange_post", + "host": os.getenv("TARGET_DB_HOST", "127.0.0.1"), + "port": int(os.getenv("TARGET_DB_PORT", "3306")), + "user": os.getenv("TARGET_DB_USER", "stackexchange"), + "password": os.getenv("TARGET_DB_PASSWORD", "my-password"), + "database": os.getenv("TARGET_DB_NAME", "stackexchange_post"), "use_pure": True, "ssl_disabled": True }