From a0e72aed039be52cf8c691027e784b9a07156fd3 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Mon, 19 Jan 2026 01:23:50 +0000 Subject: [PATCH] feat: Add related_objects support to two-phase discovery Update two-phase discovery prompts and script to populate related_objects when creating question templates. Changes: - Updated two_phase_discovery_prompt.md: - Added related_objects parameter to llm.question_template_add tool description - Added instruction to extract table/view names from example_sql - Added example showing proper related_objects format - Updated two_phase_user_prompt.md: - Added example showing how to extract and pass related_objects - Updated two_phase_discovery.py: - Made --schema parameter required (not optional) - Updated usage examples to show required --schema flag - Removed empty string fallback for schema_filter This ensures that question templates created during discovery include the related_objects field, enabling efficient object schema retrieval when templates are searched via llm.search with include_objects=true. --- .../prompts/two_phase_discovery_prompt.md | 9 ++++++++- .../prompts/two_phase_user_prompt.md | 4 +++- .../ClaudeCode_Headless/two_phase_discovery.py | 10 +++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md index 4907c6acd..c2032dabd 100644 --- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md +++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md @@ -71,7 +71,9 @@ Build semantic understanding of an already-harvested MySQL schema by: - Arguments: `agent_run_id`, `run_id`, `metric_key`, `title`, `description`, `domain_key`, `grain`, `unit`, `sql_template`, `depends`, `confidence` 14. 
**`llm.question_template_add`** - Add question template - - Arguments: `agent_run_id`, `run_id`, `title`, `question_nl`, `template`, `example_sql`, `confidence` + - Arguments: `agent_run_id`, `run_id`, `title`, `question_nl`, `template`, `example_sql`, `related_objects`, `confidence` + - **IMPORTANT**: Always extract table/view names from `example_sql` or `template` and pass them as `related_objects` (JSON array of object names) + - Example: If SQL is "SELECT * FROM Customer JOIN Invoice...", related_objects should be ["Customer", "Invoice"] 15. **`llm.note_add`** - Add durable note - Arguments: `agent_run_id`, `run_id`, `scope`, `object_id`, `domain_key`, `title`, `body`, `tags` @@ -146,6 +148,11 @@ Create: 1. 10–30 metrics (`llm.metric_upsert`) with metric_key, description, dependencies; add SQL templates only if confident 2. 15–50 question templates (`llm.question_template_add`) mapping NL → structured plan; include example SQL only when confident +**For question templates, ALWAYS populate `related_objects`:** +- Extract table/view names from the `example_sql` or `template` +- Pass as JSON array: `["Customer", "Invoice", "InvoiceLine"]` +- This enables efficient fetching of object details when templates are retrieved + Metrics/templates must reference the objects/columns you have summarized, not guesses. 
## Quality Rules diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md index a64e72a93..faf549708 100644 --- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md +++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md @@ -109,7 +109,9 @@ for each metric: call llm.metric_upsert(agent_run_id, run_id, metric_key, title, description, sql_template, depends, confidence=0.7) for each question template: - call llm.question_template_add(agent_run_id, run_id, title, question_nl, template, example_sql, confidence=0.7) + # Extract table/view names from example_sql or template + related_objects = ["Customer", "Invoice", "InvoiceLine"] # JSON array of object names + call llm.question_template_add(agent_run_id, run_id, title, question_nl, template, example_sql, related_objects, confidence=0.7) # Final summary call llm.note_add(agent_run_id, run_id, "global", title="Database Summary", body="...", tags=["final"]) diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py index f568fb967..e687211e4 100755 --- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py +++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py @@ -38,8 +38,11 @@ Examples: # Discovery specific schema %(prog)s --mcp-config mcp_config.json --schema sales + # Discovery specific schema (REQUIRED) + %(prog)s --mcp-config mcp_config.json --schema Chinook + # With custom model - %(prog)s --mcp-config mcp_config.json --model claude-3-opus-20240229 --schema production + %(prog)s --mcp-config mcp_config.json --schema sales --model claude-3-opus-20240229 """ ) @@ -50,7 +53,8 @@ Examples: ) parser.add_argument( "--schema", - help="Restrict discovery to one MySQL schema/database (optional)" + required=True, + 
help="MySQL schema/database to discover (REQUIRED)" ) parser.add_argument( "--model", @@ -108,7 +112,7 @@ Examples: "params": { "name": "discovery.run_static", "arguments": { - "schema_filter": args.schema if args.schema else "" + "schema_filter": args.schema } } }