Fix issues identified in AI code review

- Fix combined_search: Build SQL query dynamically based on search_term
  - Only include MATCH relevance when search_term is provided
  - Fix parameter binding (fulltext_params vs params)
- Fix search_term/query parameter mismatch in run_demo
  - combined_search mode now correctly receives query from kwargs
- Add environment variable support for database credentials
  - nlp_search_demo.py: DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME
  - stackexchange_posts.py: SOURCE_DB_* and TARGET_DB_* variables
- Fix README CreationDate SQL example to use JSON_EXTRACT
- Add zero division checks in get_table_stats and similarity_search_preparation
2 months ago · 62cbd6c71e
parent ecfff09633
commit 62cbd6c71e
3 changed files with 57 additions and 31 deletions
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -107,10 +107,10 @@ WHERE JSON_CONTAINS(Tags, '"mysql"') AND JSON_CONTAINS(Tags, '"performance"');

 ```sql
 -- Search within date range
-SELECT PostId, Title, CreationDate
+SELECT PostId, Title, JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate
 FROM processed_posts
 WHERE MATCH(SearchText) AGAINST('python' IN BOOLEAN MODE)
-AND CreationDate BETWEEN '2023-01-01' AND '2023-12-31';
+AND JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) BETWEEN '2023-01-01' AND '2023-12-31';
 ```

 ## Performance Tips
--- a/scripts/nlp_search_demo.py
+++ b/scripts/nlp_search_demo.py
@@ -20,6 +20,7 @@ from typing import List, Dict, Any, Set, Tuple
 import argparse
 import time
 import sys
+import os


 class NLPSearchDemo:
@@ -80,7 +81,10 @@ class NLPSearchDemo:

            print(f"\n📊 Table Statistics:")
            print(f"   Total posts: {total_posts:,}")
-            print(f"   Posts with tags: {posts_with_tags:,} ({posts_with_tags/total_posts*100:.1f}%)")
+            if total_posts > 0:
+                print(f"   Posts with tags: {posts_with_tags:,} ({posts_with_tags/total_posts*100:.1f}%)")
+            else:
+                print(f"   Posts with tags: {posts_with_tags:,}")
            print(f"   Date range: {date_range['earliest'][:10]} to {date_range['latest'][:10]}")
            print(f"   Unique tags: {len(all_tags):,}")

@@ -253,17 +257,37 @@ class NLPSearchDemo:
            # Build WHERE clause
            where_clause = " AND ".join(conditions) if conditions else "1=1"

+            # Build SELECT clause dynamically - only include relevance if search_term is provided
+            if search_term:
+                select_clause = """
+                SELECT
+                    PostId,
+                    TitleText,
+                    JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
+                    JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
+                    MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance,
+                    CreatedAt
+                """
+                order_clause = "ORDER BY relevance DESC, CreatedAt DESC"
+                # Add search_term again for the SELECT clause's MATCH
+                fulltext_params = [search_term] + params + [limit]
+            else:
+                select_clause = """
+                SELECT
+                    PostId,
+                    TitleText,
+                    JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
+                    JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
+                    CreatedAt
+                """
+                order_clause = "ORDER BY CreatedAt DESC"
+                fulltext_params = params + [limit]
+
            sql = f"""
-            SELECT
-                PostId,
-                TitleText,
-                JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
-                JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
-                MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance,
-                CreatedAt
+            {select_clause}
            FROM processed_posts
            WHERE {where_clause}
-            ORDER BY relevance DESC, CreatedAt DESC
+            {order_clause}
            LIMIT %s
            """

@@ -271,7 +295,7 @@ class NLPSearchDemo:

            try:
                # First try full-text search
-                cursor.execute(sql, params)
+                cursor.execute(sql, fulltext_params)
                results = cursor.fetchall()
                search_method = "combined"
            except Error:
@@ -384,7 +408,8 @@ class NLPSearchDemo:
                    all_text.append(combined)

            print(f"   Total text length: {sum(len(text) for text in all_text):,} characters")
-            print(f"   Average text length: {sum(len(text) for text in all_text) / len(all_text):,.0f} characters")
+            if all_text:
+                print(f"   Average text length: {sum(len(text) for text in all_text) / len(all_text):,.0f} characters")

            return results

@@ -417,7 +442,7 @@ class NLPSearchDemo:
                limit = kwargs.get('limit', 10)
                self.tag_search(conn, tags, operator, limit)
            elif mode == "combined":
-                search_term = kwargs.get('search_term', None)
+                search_term = kwargs.get('query', None)
                tags = kwargs.get('tags', None)
                date_from = kwargs.get('date_from', None)
                date_to = kwargs.get('date_to', None)
@@ -436,13 +461,13 @@ class NLPSearchDemo:


 def main():
-    # Default configuration
+    # Default configuration (can be overridden by environment variables)
    config = {
-        "host": "127.0.0.1",
-        "port": 3306,
-        "user": "stackexchange",
-        "password": "my-password",
-        "database": "stackexchange_post",
+        "host": os.getenv("DB_HOST", "127.0.0.1"),
+        "port": int(os.getenv("DB_PORT", "3306")),
+        "user": os.getenv("DB_USER", "stackexchange"),
+        "password": os.getenv("DB_PASSWORD", "my-password"),
+        "database": os.getenv("DB_NAME", "stackexchange_post"),
        "use_pure": True,
        "ssl_disabled": True
    }
--- a/scripts/stackexchange_posts.py
+++ b/scripts/stackexchange_posts.py
@@ -19,6 +19,7 @@ from typing import List, Dict, Any, Set, Tuple
 import argparse
 import time
 import sys
+import os

 class StackExchangeProcessor:
    def __init__(self, source_config: Dict[str, Any], target_config: Dict[str, Any]):
@@ -396,23 +397,23 @@ class StackExchangeProcessor:
            print("\n🔌 Database connections closed")

 def main():
-    # Default configurations
+    # Default configurations (can be overridden by environment variables)
    source_config = {
-        "host": "127.0.0.1",
-        "port": 3306,
-        "user": "stackexchange",
-        "password": "my-password",
-        "database": "stackexchange",
+        "host": os.getenv("SOURCE_DB_HOST", "127.0.0.1"),
+        "port": int(os.getenv("SOURCE_DB_PORT", "3306")),
+        "user": os.getenv("SOURCE_DB_USER", "stackexchange"),
+        "password": os.getenv("SOURCE_DB_PASSWORD", "my-password"),
+        "database": os.getenv("SOURCE_DB_NAME", "stackexchange"),
        "use_pure": True,
        "ssl_disabled": True
    }

    target_config = {
-        "host": "127.0.0.1",
-        "port": 3306,
-        "user": "stackexchange",
-        "password": "my-password",
-        "database": "stackexchange_post",
+        "host": os.getenv("TARGET_DB_HOST", "127.0.0.1"),
+        "port": int(os.getenv("TARGET_DB_PORT", "3306")),
+        "user": os.getenv("TARGET_DB_USER", "stackexchange"),
+        "password": os.getenv("TARGET_DB_PASSWORD", "my-password"),
+        "database": os.getenv("TARGET_DB_NAME", "stackexchange_post"),
        "use_pure": True,
        "ssl_disabled": True
    }