From a0e72aed039be52cf8c691027e784b9a07156fd3 Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene@proxysql.com>
Date: Mon, 19 Jan 2026 01:23:50 +0000
Subject: [PATCH] feat: Add related_objects support to two-phase discovery

Update two-phase discovery prompts and script to populate related_objects
when creating question templates.

Changes:
- Updated two_phase_discovery_prompt.md:
  - Added related_objects parameter to llm.question_template_add tool description
  - Added instruction to extract table/view names from example_sql
  - Added example showing proper related_objects format
- Updated two_phase_user_prompt.md:
  - Added example showing how to extract and pass related_objects
- Updated two_phase_discovery.py:
  - Made --schema parameter required (not optional)
  - Updated usage examples to show required --schema flag
  - Removed empty string fallback for schema_filter

This ensures that question templates created during discovery include
the related_objects field, enabling efficient object schema retrieval
when templates are searched via llm.search with include_objects=true.
---
 .../prompts/two_phase_discovery_prompt.md              |  9 ++++++++-
 .../prompts/two_phase_user_prompt.md                   |  4 +++-
 .../ClaudeCode_Headless/two_phase_discovery.py         | 10 +++++++---
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md
index 4907c6acd..c2032dabd 100644
--- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md
+++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md
@@ -71,7 +71,9 @@ Build semantic understanding of an already-harvested MySQL schema by:
     - Arguments: `agent_run_id`, `run_id`, `metric_key`, `title`, `description`, `domain_key`, `grain`, `unit`, `sql_template`, `depends`, `confidence`
 
 14. **`llm.question_template_add`** - Add question template
-    - Arguments: `agent_run_id`, `run_id`, `title`, `question_nl`, `template`, `example_sql`, `confidence`
+    - Arguments: `agent_run_id`, `run_id`, `title`, `question_nl`, `template`, `example_sql`, `related_objects`, `confidence`
+    - **IMPORTANT**: Always extract the table/view names referenced by `example_sql` or `template` and pass them as `related_objects` (a JSON array of object names)
+    - Example: if the SQL is `SELECT * FROM Customer JOIN Invoice ...`, `related_objects` should be `["Customer", "Invoice"]`
 
 15. **`llm.note_add`** - Add durable note
     - Arguments: `agent_run_id`, `run_id`, `scope`, `object_id`, `domain_key`, `title`, `body`, `tags`
@@ -146,6 +148,11 @@ Create:
 1. 10–30 metrics (`llm.metric_upsert`) with metric_key, description, dependencies; add SQL templates only if confident
 2. 15–50 question templates (`llm.question_template_add`) mapping NL → structured plan; include example SQL only when confident
 
+**For question templates, ALWAYS populate `related_objects`:**
+- Extract the table/view names referenced by `example_sql` or `template`
+- Pass as JSON array: `["Customer", "Invoice", "InvoiceLine"]`
+- This enables efficient fetching of object details when templates are retrieved
+
 Metrics/templates must reference the objects/columns you have summarized, not guesses.
 
 ## Quality Rules
diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md
index a64e72a93..faf549708 100644
--- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md
+++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md
@@ -109,7 +109,9 @@ for each metric:
     call llm.metric_upsert(agent_run_id, run_id, metric_key, title, description, sql_template, depends, confidence=0.7)
 
 for each question template:
-    call llm.question_template_add(agent_run_id, run_id, title, question_nl, template, example_sql, confidence=0.7)
+    # Extract the table/view names referenced by example_sql or template (derive them per template)
+    related_objects = ["Customer", "Invoice", "InvoiceLine"]  # example value: JSON array of object names
+    call llm.question_template_add(agent_run_id, run_id, title, question_nl, template, example_sql, related_objects, confidence=0.7)
 
 # Final summary
 call llm.note_add(agent_run_id, run_id, "global", title="Database Summary", body="...", tags=["final"])
diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py
index f568fb967..e687211e4 100755
--- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py
+++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py
@@ -38,8 +38,11 @@ Examples:
   # Discovery specific schema
   %(prog)s --mcp-config mcp_config.json --schema sales
 
+  # Discover a specific schema (--schema is REQUIRED)
+  %(prog)s --mcp-config mcp_config.json --schema Chinook
+
   # With custom model
-  %(prog)s --mcp-config mcp_config.json --model claude-3-opus-20240229 --schema production
+  %(prog)s --mcp-config mcp_config.json --schema sales --model claude-3-opus-20240229
         """
     )
 
@@ -50,7 +53,8 @@ Examples:
     )
     parser.add_argument(
         "--schema",
-        help="Restrict discovery to one MySQL schema/database (optional)"
+        required=True,
+        help="MySQL schema/database to discover (REQUIRED)"
     )
     parser.add_argument(
         "--model",
@@ -108,7 +112,7 @@ Examples:
                 "params": {
                         "name": "discovery.run_static",
                         "arguments": {
-                            "schema_filter": args.schema if args.schema else ""
+                            "schema_filter": args.schema
                         }
                 }
             }