Fix issues identified in AI code review

- Fix combined_search: Build SQL query dynamically based on search_term
  - Only include MATCH relevance when search_term is provided
  - Fix parameter binding (fulltext_params vs params)
- Fix search_term/query parameter mismatch in run_demo
  - combined_search mode now correctly receives query from kwargs
- Add environment variable support for database credentials
  - nlp_search_demo.py: DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME
  - stackexchange_posts.py: SOURCE_DB_* and TARGET_DB_* variables
- Fix README CreationDate SQL example to use JSON_EXTRACT
- Add division-by-zero checks in get_table_stats and similarity_search_preparation
pull/5310/head
Rene Cannao 2 months ago
parent ecfff09633
commit 62cbd6c71e

@ -107,10 +107,10 @@ WHERE JSON_CONTAINS(Tags, '"mysql"') AND JSON_CONTAINS(Tags, '"performance"');
```sql
-- Search within date range
SELECT PostId, Title, CreationDate
SELECT PostId, Title, JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate
FROM processed_posts
WHERE MATCH(SearchText) AGAINST('python' IN BOOLEAN MODE)
AND CreationDate BETWEEN '2023-01-01' AND '2023-12-31';
AND JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) BETWEEN '2023-01-01' AND '2023-12-31';
```
## Performance Tips

@ -20,6 +20,7 @@ from typing import List, Dict, Any, Set, Tuple
import argparse
import time
import sys
import os
class NLPSearchDemo:
@ -80,7 +81,10 @@ class NLPSearchDemo:
print(f"\n📊 Table Statistics:")
print(f" Total posts: {total_posts:,}")
print(f" Posts with tags: {posts_with_tags:,} ({posts_with_tags/total_posts*100:.1f}%)")
if total_posts > 0:
print(f" Posts with tags: {posts_with_tags:,} ({posts_with_tags/total_posts*100:.1f}%)")
else:
print(f" Posts with tags: {posts_with_tags:,}")
print(f" Date range: {date_range['earliest'][:10]} to {date_range['latest'][:10]}")
print(f" Unique tags: {len(all_tags):,}")
@ -253,17 +257,37 @@ class NLPSearchDemo:
# Build WHERE clause
where_clause = " AND ".join(conditions) if conditions else "1=1"
# Build SELECT clause dynamically - only include relevance if search_term is provided
if search_term:
select_clause = """
SELECT
PostId,
TitleText,
JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance,
CreatedAt
"""
order_clause = "ORDER BY relevance DESC, CreatedAt DESC"
# Add search_term again for the SELECT clause's MATCH
fulltext_params = [search_term] + params + [limit]
else:
select_clause = """
SELECT
PostId,
TitleText,
JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
CreatedAt
"""
order_clause = "ORDER BY CreatedAt DESC"
fulltext_params = params + [limit]
sql = f"""
SELECT
PostId,
TitleText,
JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance,
CreatedAt
{select_clause}
FROM processed_posts
WHERE {where_clause}
ORDER BY relevance DESC, CreatedAt DESC
{order_clause}
LIMIT %s
"""
@ -271,7 +295,7 @@ class NLPSearchDemo:
try:
# First try full-text search
cursor.execute(sql, params)
cursor.execute(sql, fulltext_params)
results = cursor.fetchall()
search_method = "combined"
except Error:
@ -384,7 +408,8 @@ class NLPSearchDemo:
all_text.append(combined)
print(f" Total text length: {sum(len(text) for text in all_text):,} characters")
print(f" Average text length: {sum(len(text) for text in all_text) / len(all_text):,.0f} characters")
if all_text:
print(f" Average text length: {sum(len(text) for text in all_text) / len(all_text):,.0f} characters")
return results
@ -417,7 +442,7 @@ class NLPSearchDemo:
limit = kwargs.get('limit', 10)
self.tag_search(conn, tags, operator, limit)
elif mode == "combined":
search_term = kwargs.get('search_term', None)
search_term = kwargs.get('query', None)
tags = kwargs.get('tags', None)
date_from = kwargs.get('date_from', None)
date_to = kwargs.get('date_to', None)
@ -436,13 +461,13 @@ class NLPSearchDemo:
def main():
# Default configuration
# Default configuration (can be overridden by environment variables)
config = {
"host": "127.0.0.1",
"port": 3306,
"user": "stackexchange",
"password": "my-password",
"database": "stackexchange_post",
"host": os.getenv("DB_HOST", "127.0.0.1"),
"port": int(os.getenv("DB_PORT", "3306")),
"user": os.getenv("DB_USER", "stackexchange"),
"password": os.getenv("DB_PASSWORD", "my-password"),
"database": os.getenv("DB_NAME", "stackexchange_post"),
"use_pure": True,
"ssl_disabled": True
}

@ -19,6 +19,7 @@ from typing import List, Dict, Any, Set, Tuple
import argparse
import time
import sys
import os
class StackExchangeProcessor:
def __init__(self, source_config: Dict[str, Any], target_config: Dict[str, Any]):
@ -396,23 +397,23 @@ class StackExchangeProcessor:
print("\n🔌 Database connections closed")
def main():
# Default configurations
# Default configurations (can be overridden by environment variables)
source_config = {
"host": "127.0.0.1",
"port": 3306,
"user": "stackexchange",
"password": "my-password",
"database": "stackexchange",
"host": os.getenv("SOURCE_DB_HOST", "127.0.0.1"),
"port": int(os.getenv("SOURCE_DB_PORT", "3306")),
"user": os.getenv("SOURCE_DB_USER", "stackexchange"),
"password": os.getenv("SOURCE_DB_PASSWORD", "my-password"),
"database": os.getenv("SOURCE_DB_NAME", "stackexchange"),
"use_pure": True,
"ssl_disabled": True
}
target_config = {
"host": "127.0.0.1",
"port": 3306,
"user": "stackexchange",
"password": "my-password",
"database": "stackexchange_post",
"host": os.getenv("TARGET_DB_HOST", "127.0.0.1"),
"port": int(os.getenv("TARGET_DB_PORT", "3306")),
"user": os.getenv("TARGET_DB_USER", "stackexchange"),
"password": os.getenv("TARGET_DB_PASSWORD", "my-password"),
"database": os.getenv("TARGET_DB_NAME", "stackexchange_post"),
"use_pure": True,
"ssl_disabled": True
}

Loading…
Cancel
Save