From ecfff0963371edeffa8264bc63d81ab0540d6970 Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene@proxysql.com>
Date: Wed, 7 Jan 2026 09:28:03 +0000
Subject: [PATCH] Add NLP search demo script with comprehensive search
 capabilities

- Create nlp_search_demo.py with full-text, boolean, tag-based, and combined search
- Include statistics and similarity search preparation modes
- Add proper error handling with MySQL fallback mechanisms
- Fix syntax errors and indentation issues
- Update README.md with correct command usage examples
- Support --mode parameter with appropriate sub-parameters
- Demonstrate various search techniques on processed StackExchange posts
---
 scripts/README.md          |  32 +++
 scripts/nlp_search_demo.py | 529 +++++++++++++++++++++++++++++++++++++
 2 files changed, 561 insertions(+)
 create mode 100755 scripts/nlp_search_demo.py

diff --git a/scripts/README.md b/scripts/README.md
index af4f5ad7e..9a93d4d79 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -192,6 +192,14 @@ pip install mysql-connector-python
 
 The `scripts/` directory also contains other utility scripts:
 
+- `nlp_search_demo.py` - Demonstrate various search techniques on processed posts:
+  - Full-text search with MySQL
+  - Boolean search with operators
+  - Tag-based JSON queries
+  - Combined search approaches
+  - Statistics and search analytics
+  - Data preparation for future semantic search
+
 - `add_mysql_user.sh` - Add/replace MySQL users in ProxySQL
 - `change_host_status.sh` - Change host status in ProxySQL
 - `flush_query_cache.sh` - Flush ProxySQL query cache
@@ -199,6 +207,30 @@ The `scripts/` directory also contains other utility scripts:
 - `proxysql_config.sh` - Configure ProxySQL settings
 - `stats_scrapper.py` - Scrape statistics from ProxySQL
 
+## Search Examples
+
+### Using the NLP Search Demo
+
+```bash
+# Show search statistics
+python3 nlp_search_demo.py --mode stats
+
+# Full-text search
+python3 nlp_search_demo.py --mode full-text --query "mysql performance optimization"
+
+# Boolean search with operators
+python3 nlp_search_demo.py --mode boolean --query "+database -oracle"
+
+# Search by tags
+python3 nlp_search_demo.py --mode tags --tags mysql performance --operator AND
+
+# Combined search with text and tags
+python3 nlp_search_demo.py --mode combined --query "python optimization" --tags python
+
+# Prepare data for semantic search
+python3 nlp_search_demo.py --mode similarity --query "machine learning"
+```
+
 ## License
 
 Internal use only.
diff --git a/scripts/nlp_search_demo.py b/scripts/nlp_search_demo.py
new file mode 100755
index 000000000..3ba796e78
--- /dev/null
+++ b/scripts/nlp_search_demo.py
@@ -0,0 +1,529 @@
+#!/usr/bin/env python3
+"""
+NLP Search Demo for StackExchange Posts
+
+Demonstrates various search techniques on processed posts:
+- Full-text search with MySQL
+- Boolean search with operators
+- Tag-based JSON queries
+- Combined search approaches
+- Statistics and search analytics
+- Data preparation for future semantic search
+"""
+
+import mysql.connector
+from mysql.connector import Error, OperationalError
+import json
+import re
+import html
+from typing import List, Dict, Any, Set, Tuple
+import argparse
+import time
+import sys
+
+
+class NLPSearchDemo:
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config
+        self.stop_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
+            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
+            'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
+            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
+        }
+
+    def connect(self):
+        """Create database connection."""
+        try:
+            conn = mysql.connector.connect(**self.config)
+            print("✅ Connected to database")
+            return conn
+        except Error as e:
+            print(f"❌ Connection error: {e}")
+            return None
+
+    def get_table_stats(self, conn):
+        """Get statistics about the processed_posts table."""
+        cursor = conn.cursor(dictionary=True)
+
+        try:
+            # Basic table stats
+            cursor.execute("SELECT COUNT(*) as total_posts FROM processed_posts")
+            total_posts = cursor.fetchone()['total_posts']
+
+            cursor.execute("SELECT COUNT(*) as posts_with_tags FROM processed_posts WHERE Tags IS NOT NULL AND Tags != '[]'")
+            posts_with_tags = cursor.fetchone()['posts_with_tags']
+
+            cursor.execute("SELECT MIN(JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate'))) as earliest, "
+                         "MAX(JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate'))) as latest "
+                         "FROM processed_posts")
+            date_range = cursor.fetchone()
+
+            # Get unique tags
+            cursor.execute("""
+                SELECT DISTINCT Tags
+                FROM processed_posts
+                WHERE Tags IS NOT NULL AND Tags != '[]'
+                LIMIT 1000
+            """)
+            tags_data = cursor.fetchall()
+
+            # Extract all unique tags
+            all_tags = set()
+            for row in tags_data:
+                if row['Tags']:
+                    try:
+                        tags_list = json.loads(row['Tags'])
+                        all_tags.update(tags_list)
+                    except:
+                        pass
+
+            print(f"\n📊 Table Statistics:")
+            print(f"   Total posts: {total_posts:,}")
+            print(f"   Posts with tags: {posts_with_tags:,} ({posts_with_tags/total_posts*100:.1f}%)")
+            print(f"   Date range: {date_range['earliest'][:10]} to {date_range['latest'][:10]}")
+            print(f"   Unique tags: {len(all_tags):,}")
+
+            if all_tags:
+                print(f"   Top tags: {', '.join(sorted(list(all_tags))[:20])}")
+
+        except Error as e:
+            print(f"❌ Error getting stats: {e}")
+        finally:
+            cursor.close()
+
+    def full_text_search(self, conn, query: str, limit: int = 10) -> List[Dict[str, Any]]:
+        """Perform full-text search with MySQL."""
+        cursor = conn.cursor(dictionary=True)
+
+        start_time = time.time()
+        try:
+            sql = """
+            SELECT PostId, TitleText, MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance
+            FROM processed_posts
+            WHERE MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE)
+            ORDER BY relevance DESC, CreatedAt DESC LIMIT %s
+            """
+            cursor.execute(sql, (query, query, limit))
+            results = cursor.fetchall()
+            search_method = "full-text"
+        except Error:
+            sql = """
+            SELECT PostId, TitleText, CreatedAt
+            FROM processed_posts
+            WHERE SearchText LIKE %s OR TitleText LIKE %s OR BodyText LIKE %s
+            ORDER BY CreatedAt DESC LIMIT %s
+            """
+            search_term = f"%{query}%"
+            cursor.execute(sql, (search_term, search_term, search_term, limit))
+            results = cursor.fetchall()
+            search_method = "LIKE"
+
+        elapsed = time.time() - start_time
+
+        print(f"🔍 {search_method.title()} search for '{query}' ({elapsed:.3f}s):")
+        for i, row in enumerate(results, 1):
+            print(f"  {i}. [{row['PostId']}] {row['TitleText'][:80]}...")
+
+        print(f"📊 Found {len(results)} results in {elapsed:.3f} seconds")
+        return results
+
+    def boolean_search(self, conn, query: str, limit: int = 10) -> List[Dict[str, Any]]:
+        """Perform boolean search with operators."""
+        cursor = conn.cursor(dictionary=True)
+        start_time = time.time()
+
+        try:
+            # Try boolean mode first
+            sql = """
+            SELECT PostId, TitleText,
+                   MATCH(SearchText) AGAINST(%s IN BOOLEAN MODE) as relevance
+            FROM processed_posts
+            WHERE MATCH(SearchText) AGAINST(%s IN BOOLEAN MODE)
+            ORDER BY relevance DESC, CreatedAt DESC LIMIT %s
+            """
+            cursor.execute(sql, (query, query, limit))
+            results = cursor.fetchall()
+            search_method = "boolean"
+        except Error:
+            # Fallback to LIKE search
+            sql = """
+            SELECT PostId, TitleText, CreatedAt
+            FROM processed_posts
+            WHERE SearchText LIKE %s
+            ORDER BY CreatedAt DESC LIMIT %s
+            """
+            search_term = f"%{query}%"
+            cursor.execute(sql, (search_term, limit))
+            results = cursor.fetchall()
+            search_method = "LIKE"
+
+        elapsed = time.time() - start_time
+
+        print(f"🔍 Boolean search for '{query}' ({elapsed:.3f}s):")
+        for i, row in enumerate(results, 1):
+            print(f"  {i}. [{row['PostId']}] {row['TitleText'][:80]}...")
+
+        print(f"📊 Found {len(results)} results in {elapsed:.3f} seconds")
+        return results
+
+    def tag_search(self, conn, tags: List[str], operator: str = "AND", limit: int = 10) -> List[Dict[str, Any]]:
+        """Search by tags using JSON functions."""
+        cursor = conn.cursor(dictionary=True)
+
+        try:
+            # Build JSON_CONTAINS conditions
+            conditions = []
+            params = []
+
+            for tag in tags:
+                conditions.append(f"JSON_CONTAINS(Tags, %s)")
+                params.append(f'"{tag}"')
+
+            if operator.upper() == "AND":
+                where_clause = " AND ".join(conditions)
+            else:  # OR
+                where_clause = " OR ".join(conditions)
+
+            sql = f"""
+            SELECT
+                PostId,
+                TitleText,
+                JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
+                CreatedAt
+            FROM processed_posts
+            WHERE {where_clause}
+            ORDER BY CreatedAt DESC
+            LIMIT %s
+            """
+
+            start_time = time.time()
+            cursor.execute(sql, params + [limit])
+            results = cursor.fetchall()
+            search_method = "JSON_CONTAINS"
+
+            elapsed = time.time() - start_time
+
+            tag_str = " AND ".join(tags) if operator == "AND" else " OR ".join(tags)
+            print(f"🏷️  Tag search for {tag_str} ({elapsed:.3f}s):")
+            for i, row in enumerate(results, 1):
+                found_tags = json.loads(row['TagsJson']) if row['TagsJson'] else []
+                print(f"  {i}. [{row['PostId']}] {row['TitleText'][:80]}...")
+                print(f"     All tags: {', '.join(found_tags[:5])}{'...' if len(found_tags) > 5 else ''}")
+                print()
+
+            print(f"📊 Found {len(results)} results in {elapsed:.3f} seconds")
+            return results
+
+        except Error as e:
+            print(f"❌ Tag search error: {e}")
+            return []
+        finally:
+            cursor.close()
+
+    def combined_search(self, conn, search_term: str = None, tags: List[str] = None,
+                        date_from: str = None, date_to: str = None, limit: int = 10) -> List[Dict[str, Any]]:
+        """Combined search with full-text, tags, and date filtering."""
+        cursor = conn.cursor(dictionary=True)
+
+        try:
+            conditions = []
+            params = []
+
+            # Full-text search condition
+            if search_term:
+                conditions.append("MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE)")
+                params.append(search_term)
+
+            # Tag conditions
+            if tags:
+                for tag in tags:
+                    conditions.append("JSON_CONTAINS(Tags, %s)")
+                    params.append(f'"{tag}"')
+
+            # Date conditions
+            if date_from:
+                conditions.append("JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) >= %s")
+                params.append(date_from)
+
+            if date_to:
+                conditions.append("JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) <= %s")
+                params.append(date_to)
+
+            # Build WHERE clause
+            where_clause = " AND ".join(conditions) if conditions else "1=1"
+
+            sql = f"""
+            SELECT
+                PostId,
+                TitleText,
+                JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
+                JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
+                MATCH(SearchText) AGAINST(%s IN NATURAL LANGUAGE MODE) as relevance,
+                CreatedAt
+            FROM processed_posts
+            WHERE {where_clause}
+            ORDER BY relevance DESC, CreatedAt DESC
+            LIMIT %s
+            """
+
+            start_time = time.time()
+
+            try:
+                # First try full-text search
+                cursor.execute(sql, params)
+                results = cursor.fetchall()
+                search_method = "combined"
+            except Error:
+                # Fallback to LIKE search
+                conditions = []
+                like_params = []
+
+                # Add search term condition
+                if search_term:
+                    conditions.append("(SearchText LIKE %s OR TitleText LIKE %s OR BodyText LIKE %s)")
+                    like_params.extend([f"%{search_term}%"] * 3)
+
+                # Add tag conditions
+                if tags:
+                    for tag in tags:
+                        conditions.append("JSON_CONTAINS(Tags, %s)")
+                        like_params.append(f'"{tag}"')
+
+                # Add date conditions
+                if date_from:
+                    conditions.append("JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) >= %s")
+                    like_params.append(date_from)
+
+                if date_to:
+                    conditions.append("JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) <= %s")
+                    like_params.append(date_to)
+
+                where_clause = " AND ".join(conditions) if conditions else "1=1"
+
+                like_sql = f"""
+                SELECT
+                    PostId,
+                    TitleText,
+                    JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.CreationDate')) as CreationDate,
+                    JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson,
+                    CreatedAt
+                FROM processed_posts
+                WHERE {where_clause}
+                ORDER BY CreatedAt DESC
+                LIMIT %s
+                """
+
+                like_params.append(limit)
+                cursor.execute(like_sql, like_params)
+                results = cursor.fetchall()
+                search_method = "LIKE"
+
+            elapsed = time.time() - start_time
+
+            print(f"🔍 {search_method.title()} search ({elapsed:.3f}s):")
+            print(f"   Search term: {search_term or 'None'}")
+            print(f"   Tags: {tags or 'None'}")
+            print(f"   Date range: {date_from or 'beginning'} to {date_to or 'end'}")
+            print()
+
+            for i, row in enumerate(results, 1):
+                found_tags = json.loads(row['TagsJson']) if row['TagsJson'] else []
+                relevance = row.get('relevance', 0.0) if search_method == "combined" else "N/A"
+
+                print(f"  {i}. [{row['PostId']}] {row['TitleText'][:80]}...")
+                print(f"     Tags: {', '.join(found_tags[:3])}{'...' if len(found_tags) > 3 else ''}")
+                print(f"     Created: {row['CreationDate']}")
+                if search_method == "combined":
+                    print(f"     Relevance: {relevance:.3f}")
+                print()
+
+            print(f"📊 Found {len(results)} results in {elapsed:.3f} seconds")
+            return results
+
+        except Error as e:
+            print(f"❌ Combined search error: {e}")
+            return []
+        finally:
+            cursor.close()
+
+    def similarity_search_preparation(self, conn, query: str, limit: int = 20) -> List[Dict[str, Any]]:
+        """Prepare data for future semantic search by extracting relevant terms."""
+        cursor = conn.cursor(dictionary=True)
+
+        try:
+            # Search and return results with text content for future embedding generation
+            sql = """
+            SELECT
+                PostId,
+                TitleText,
+                BodyText,
+                RepliesText,
+                JSON_UNQUOTE(JSON_EXTRACT(JsonData, '$.Tags')) as TagsJson
+            FROM processed_posts
+            WHERE SearchText LIKE %s
+            ORDER BY CreatedAt DESC
+            LIMIT %s
+            """
+
+            search_term = f"%{query}%"
+            cursor.execute(sql, (search_term, limit))
+            results = cursor.fetchall()
+
+            print(f"🔍 Preparation for semantic search on '{query}':")
+            print(f"   Found {len(results)} relevant posts")
+
+            # Extract text for future embeddings
+            all_text = []
+            for row in results:
+                title = row['TitleText'] or ''
+                body = row['BodyText'] or ''
+                replies = row['RepliesText'] or ''
+                combined = f"{title} {body} {replies}".strip()
+                if combined:
+                    all_text.append(combined)
+
+            print(f"   Total text length: {sum(len(text) for text in all_text):,} characters")
+            print(f"   Average text length: {sum(len(text) for text in all_text) / len(all_text):,.0f} characters")
+
+            return results
+
+        except Error as e:
+            print(f"❌ Similarity search preparation error: {e}")
+            return []
+        finally:
+            cursor.close()
+
+    def run_demo(self, mode: str = "stats", **kwargs):
+        """Run the search demo with specified mode."""
+        conn = self.connect()
+        if not conn:
+            return
+
+        try:
+            if mode == "stats":
+                self.get_table_stats(conn)
+            elif mode == "full-text":
+                query = kwargs.get('query', '')
+                limit = kwargs.get('limit', 10)
+                self.full_text_search(conn, query, limit)
+            elif mode == "boolean":
+                query = kwargs.get('query', '')
+                limit = kwargs.get('limit', 10)
+                self.boolean_search(conn, query, limit)
+            elif mode == "tags":
+                tags = kwargs.get('tags', [])
+                operator = kwargs.get('operator', 'AND')
+                limit = kwargs.get('limit', 10)
+                self.tag_search(conn, tags, operator, limit)
+            elif mode == "combined":
+                search_term = kwargs.get('search_term', None)
+                tags = kwargs.get('tags', None)
+                date_from = kwargs.get('date_from', None)
+                date_to = kwargs.get('date_to', None)
+                limit = kwargs.get('limit', 10)
+                self.combined_search(conn, search_term, tags, date_from, date_to, limit)
+            elif mode == "similarity":
+                query = kwargs.get('query', '')
+                limit = kwargs.get('limit', 20)
+                self.similarity_search_preparation(conn, query, limit)
+            else:
+                print(f"❌ Unknown mode: {mode}")
+                print("Available modes: stats, full-text, boolean, tags, combined, similarity")
+        finally:
+            if conn and conn.is_connected():
+                conn.close()
+
+
+def main():
+    # Default configuration
+    config = {
+        "host": "127.0.0.1",
+        "port": 3306,
+        "user": "stackexchange",
+        "password": "my-password",
+        "database": "stackexchange_post",
+        "use_pure": True,
+        "ssl_disabled": True
+    }
+
+    parser = argparse.ArgumentParser(description="NLP Search Demo for StackExchange Posts")
+
+    parser.add_argument("--host", default=config['host'], help="Database host")
+    parser.add_argument("--port", type=int, default=config['port'], help="Database port")
+    parser.add_argument("--user", default=config['user'], help="Database user")
+    parser.add_argument("--password", default=config['password'], help="Database password")
+    parser.add_argument("--database", default=config['database'], help="Database name")
+
+    parser.add_argument("--mode", default="stats",
+                       choices=["stats", "full-text", "boolean", "tags", "combined", "similarity"],
+                       help="Search mode to demonstrate")
+
+    parser.add_argument("--limit", type=int, default=10, help="Number of results to return")
+    parser.add_argument("--operator", default="AND", choices=["AND", "OR"], help="Tag operator")
+
+    parser.add_argument("--query", help="Search query for text-based searches")
+    parser.add_argument("--tags", nargs='+', help="Tags to search for")
+    parser.add_argument("--date-from", help="Start date (YYYY-MM-DD)")
+    parser.add_argument("--date-to", help="End date (YYYY-MM-DD)")
+
+    parser.add_argument("--stats", action="store_true", help="Show table statistics")
+    parser.add_argument("--verbose", action="store_true", help="Show detailed output")
+
+    args = parser.parse_args()
+
+    # Override configuration with command line arguments
+    config.update({
+        "host": args.host,
+        "port": args.port,
+        "user": args.user,
+        "password": args.password,
+        "database": args.database
+    })
+
+    # Handle legacy --stats flag
+    if args.stats:
+        args.mode = "stats"
+
+    print("🔍 NLP Search Demo for StackExchange Posts")
+    print("=" * 50)
+    print(f"Database: {config['host']}:{config['port']}/{config['database']}")
+    print(f"Mode: {args.mode}")
+    print("=" * 50)
+
+    # Create demo instance and run
+    demo = NLPSearchDemo(config)
+
+    # Prepare kwargs based on mode
+    kwargs = {
+        'limit': args.limit,
+        'operator': args.operator,
+        'query': args.query,
+        'tags': args.tags,
+        'date_from': args.date_from,
+        'date_to': args.date_to
+    }
+
+    # Remove None values
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+
+    # If mode is text-based and no query provided, use the mode as query
+    if args.mode in ["full-text", "boolean", "similarity"] and not args.query:
+        # For compatibility with command-line usage like: python3 script.py --full-text "mysql optimization"
+        if len(sys.argv) > 2 and sys.argv[1] == "--mode" and len(sys.argv) > 4:
+            # Find the actual query after the mode
+            mode_index = sys.argv.index("--mode")
+            if mode_index + 2 < len(sys.argv):
+                query_index = mode_index + 2
+                query_parts = []
+                while query_index < len(sys.argv) and not sys.argv[query_index].startswith("--"):
+                    query_parts.append(sys.argv[query_index])
+                    query_index += 1
+                if query_parts:
+                    kwargs['query'] = ' '.join(query_parts)
+
+    demo.run_demo(args.mode, **kwargs)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file