From ade0130e67e88b9052d0dc4e76d48d98d3328485 Mon Sep 17 00:00:00 2001
From: Rene Cannao
Date: Tue, 17 Feb 2026 01:40:30 +0000
Subject: [PATCH] discoveryagent: update Claude Code headless flow for target_id-scoped MCP tools

Enhance the existing Claude Code custom-prompt discovery examples to
match the new MCP contract introduced by multi-target routing and
target-scoped catalog semantics.

Changes included:

- static_harvest.sh
  - added required --target-id argument (with MCP_TARGET_ID env fallback)
  - always sends target_id in discovery.run_static arguments
  - updated usage/help/examples to include target_id

- two_phase_discovery.py
  - added required --target-id argument
  - static-harvest bootstrap call now passes target_id
  - improved run_id error guidance with target_id-aware commands
  - injects <TARGET_ID> placeholder into user prompt template
  - improved runtime logging to display target_id

- Prompt templates
  - two_phase_discovery_prompt.md:
    - catalog/agent/llm tool signatures updated to include target_id
    - Stage 0 and workflow text updated to use provided target_id + run_id
    - removed contradictory instruction that asked to call discovery.run_static in Phase 2
  - two_phase_user_prompt.md:
    - added target_id input section
    - all example calls updated to pass target_id
    - start instructions now require target_id + run_id

- README and utility example
  - README quick-start curl and script examples now include target_id
  - test_catalog.sh now accepts/prints target_id and passes it to catalog/llm calls

Validation:
- bash -n passed for updated shell scripts
- python3 -m py_compile passed for two_phase_discovery.py
---
 .../ClaudeCode_Headless/README.md             |  4 +-
 .../prompts/two_phase_discovery_prompt.md     | 40 +++++++++----------
 .../prompts/two_phase_user_prompt.md          | 30 +++++++-------
 .../ClaudeCode_Headless/static_harvest.sh     | 30 ++++++++++----
 .../ClaudeCode_Headless/test_catalog.sh       |  5 +++
 .../two_phase_discovery.py                    | 25 ++++++++----
 6 files changed, 83 insertions(+), 51 deletions(-)

diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/README.md b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/README.md
index 621bc4ed1..282dbecfe 100644
--- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/README.md
+++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/README.md
@@ -35,7 +35,7 @@ cd scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/
 # Phase 1: Static harvest (no Claude Code needed)
 
 # Option A: Using the convenience script (recommended)
-./static_harvest.sh --schema test
+./static_harvest.sh --target-id tap_mysql_default --schema test
 
 # Option B: Using curl directly
 curl -k -X POST https://localhost:6071/mcp/query \
@@ -47,6 +47,7 @@ curl -k -X POST https://localhost:6071/mcp/query \
   "params": {
     "name": "discovery.run_static",
     "arguments": {
+      "target_id": "tap_mysql_default",
       "schema_filter": "test"
     }
   }
@@ -56,6 +57,7 @@ curl -k -X POST https://localhost:6071/mcp/query \
 cp mcp_config.example.json mcp_config.json
 ./two_phase_discovery.py \
   --mcp-config mcp_config.json \
+  --target-id tap_mysql_default \
   --schema test \
   --dry-run  # Preview without executing
 ```
diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md
index c2032dabd..6e7421b45 100644
--- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md
+++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_discovery_prompt.md
@@ -11,7 +11,7 @@ You are a Database Discovery Agent operating in 
Phase 2 (LLM Analysis) of a two- ## Goal Build semantic understanding of an already-harvested MySQL schema by: -1. Finding the latest completed harvest run_id +1. Using the provided `target_id` and completed `run_id` 2. Reading harvested catalog data via catalog tools 3. Creating semantic summaries, domains, metrics, and question templates via LLM tools @@ -28,21 +28,21 @@ Build semantic understanding of an already-harvested MySQL schema by: ### Catalog Tools (Reading Static Data) - USE THESE 1. **`catalog.search`** - FTS5 search over discovered objects - - Arguments: `run_id`, `query`, `limit`, `object_type`, `schema_name` + - Arguments: `target_id`, `run_id`, `query`, `limit`, `object_type`, `schema_name` 2. **`catalog.get_object`** - Get object with columns, indexes, FKs - - Arguments: `run_id`, `object_id` OR `object_key`, `include_definition`, `include_profiles` + - Arguments: `target_id`, `run_id`, `object_id` OR `object_key`, `include_definition`, `include_profiles` 3. **`catalog.list_objects`** - List objects (paged) - - Arguments: `run_id`, `schema_name`, `object_type`, `order_by`, `page_size`, `page_token` + - Arguments: `target_id`, `run_id`, `schema_name`, `object_type`, `order_by`, `page_size`, `page_token` 4. **`catalog.get_relationships`** - Get FKs, view deps, inferred relationships - - Arguments: `run_id`, `object_id` OR `object_key`, `include_inferred`, `min_confidence` + - Arguments: `target_id`, `run_id`, `object_id` OR `object_key`, `include_inferred`, `min_confidence` ### Agent Tracking Tools - USE THESE 5. **`agent.run_start`** - Create new LLM agent run bound to run_id - - Arguments: `run_id`, `model_name`, `prompt_hash`, `budget` + - Arguments: `target_id`, `run_id`, `model_name`, `prompt_hash`, `budget` 6. **`agent.run_finish`** - Mark agent run success/failed - Arguments: `agent_run_id`, `status`, `error` @@ -53,40 +53,40 @@ Build semantic understanding of an already-harvested MySQL schema by: ### LLM Memory Tools (Writing Semantic Data) - USE THESE 8. **`llm.summary_upsert`** - Store semantic summary for object - - Arguments: `agent_run_id`, `run_id`, `object_id`, `summary`, `confidence`, `status`, `sources` + - Arguments: `target_id`, `agent_run_id`, `run_id`, `object_id`, `summary`, `confidence`, `status`, `sources` 9. **`llm.summary_get`** - Get semantic summary for object - - Arguments: `run_id`, `object_id`, `agent_run_id`, `latest` + - Arguments: `target_id`, `run_id`, `object_id`, `agent_run_id`, `latest` 10. **`llm.relationship_upsert`** - Store inferred relationship - - Arguments: `agent_run_id`, `run_id`, `child_object_id`, `child_column`, `parent_object_id`, `parent_column`, `rel_type`, `confidence`, `evidence` + - Arguments: `target_id`, `agent_run_id`, `run_id`, `child_object_id`, `child_column`, `parent_object_id`, `parent_column`, `rel_type`, `confidence`, `evidence` 11. **`llm.domain_upsert`** - Create/update domain - - Arguments: `agent_run_id`, `run_id`, `domain_key`, `title`, `description`, `confidence` + - Arguments: `target_id`, `agent_run_id`, `run_id`, `domain_key`, `title`, `description`, `confidence` 12. **`llm.domain_set_members`** - Set domain members - - Arguments: `agent_run_id`, `run_id`, `domain_key`, `members` + - Arguments: `target_id`, `agent_run_id`, `run_id`, `domain_key`, `members` 13. 
**`llm.metric_upsert`** - Store metric definition - - Arguments: `agent_run_id`, `run_id`, `metric_key`, `title`, `description`, `domain_key`, `grain`, `unit`, `sql_template`, `depends`, `confidence` + - Arguments: `target_id`, `agent_run_id`, `run_id`, `metric_key`, `title`, `description`, `domain_key`, `grain`, `unit`, `sql_template`, `depends`, `confidence` 14. **`llm.question_template_add`** - Add question template - - Arguments: `agent_run_id`, `run_id`, `title`, `question_nl`, `template`, `example_sql`, `related_objects`, `confidence` + - Arguments: `target_id`, `run_id`, `title`, `question_nl`, `template`, `agent_run_id`, `example_sql`, `related_objects`, `confidence` - **IMPORTANT**: Always extract table/view names from `example_sql` or `template_json` and pass them as `related_objects` (JSON array of object names) - Example: If SQL is "SELECT * FROM Customer JOIN Invoice...", related_objects should be ["Customer", "Invoice"] 15. **`llm.note_add`** - Add durable note - - Arguments: `agent_run_id`, `run_id`, `scope`, `object_id`, `domain_key`, `title`, `body`, `tags` + - Arguments: `target_id`, `agent_run_id`, `run_id`, `scope`, `object_id`, `domain_key`, `title`, `body`, `tags` 16. **`llm.search`** - FTS over LLM artifacts - - Arguments: `run_id`, `query`, `limit` + - Arguments: `target_id`, `run_id`, `query`, `limit` ## Operating Mode: Staged Discovery (MANDATORY) ### Stage 0 — Start and Plan -1. **Find the latest completed run_id** - Use `catalog.list_objects` to list runs, or assume run_id from the context -2. Call `agent.run_start` with the run_id and your model name +1. Use the provided `target_id` and `run_id` from static harvest context +2. Call `agent.run_start` with `target_id`, `run_id`, and your model name 3. Record discovery plan via `agent.event_append` 4. Determine scope using `catalog.list_objects` and/or `catalog.search` 5. Define "working sets" of objects to process in batches @@ -204,9 +204,9 @@ You are done when: ## Summary: Two-Phase Workflow ``` -START: discovery.run_static → run_id +START: use provided target_id + run_id ↓ - agent.run_start(run_id) → agent_run_id + agent.run_start(target_id, run_id) → agent_run_id ↓ catalog.list_objects/search → understand scope ↓ @@ -219,4 +219,4 @@ START: discovery.run_static → run_id agent.run_finish(success) ``` -Begin now with Stage 0: call `discovery.run_static` and start the agent run. +Begin now with Stage 0: start the agent run using the provided `target_id` and `run_id`. 
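For orientation, the target_id-scoped calling convention the updated prompt describes comes down to sending `target_id` next to `run_id` in every `tools/call` payload. The sketch below is illustrative only: it reuses the JSON-RPC envelope and endpoint from the README quick start, and the location of `agent_run_id` in the response is an assumption, not the documented result shape.

```python
# Minimal sketch of a target_id-scoped MCP tool call (assumes the `requests`
# package is available; response handling is a guess, not the real API shape).
import json
import requests

MCP_ENDPOINT = "https://localhost:6071/mcp/query"  # endpoint from the README example


def call_tool(name, arguments):
    payload = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {"name": name, "arguments": arguments},
    }
    # verify=False mirrors the README's `curl -k` against the self-signed cert
    resp = requests.post(MCP_ENDPOINT, json=payload, verify=False, timeout=60)
    resp.raise_for_status()
    return resp.json()


# Stage 0: every catalog/agent/llm call now carries target_id alongside run_id.
result = call_tool("agent.run_start", {
    "target_id": "tap_mysql_default",
    "run_id": 10,                      # run_id produced by the Phase 1 static harvest
    "model_name": "claude-3.5-sonnet",
})
print(json.dumps(result, indent=2))    # inspect the reply to locate agent_run_id
```

curl users get the same behaviour by adding `"target_id"` to the `arguments` object, exactly as in the README quick start.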
diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md
index faf549708..4dfea0afe 100644
--- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md
+++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/prompts/two_phase_user_prompt.md
@@ -11,6 +11,7 @@ Perform LLM-driven discovery using the MCP catalog and persist your findings bac
 
 ## Inputs
 
+- **target_id**: `<TARGET_ID>` (required for all catalog/agent/llm tool calls)
 - **run_id**: **use the provided run_id from the static harvest**
 - **model_name**: `<MODEL_NAME>` - e.g., "claude-3.5-sonnet"
 - **desired coverage**:
@@ -21,7 +22,7 @@ Perform LLM-driven discovery using the MCP catalog and persist your findings bac
 ## Required Outputs (persisted via MCP)
 
 ### 1) Agent Run Tracking
-- Start an agent run bound to the provided run_id via `agent.run_start`
+- Start an agent run bound to the provided target/run via `agent.run_start`
 - Record discovery plan and budgets via `agent.event_append`
 - Finish the run via `agent.run_finish`
 
@@ -64,7 +65,8 @@ Perform LLM-driven discovery using the MCP catalog and persist your findings bac
 # Phase 2: LLM Agent Discovery - Start here
 
 run_id = <RUN_ID>
-call agent.run_start(run_id=run_id, model_name="<MODEL_NAME>")
+target_id = "<TARGET_ID>"
+call agent.run_start(target_id=target_id, run_id=run_id, model_name="<MODEL_NAME>")
 # → returns agent_run_id
 ```
 
@@ -72,8 +74,8 @@ call agent.run_start(run_id=run_id, model_name="<MODEL_NAME>")
 
 ```python
 # Understand what was harvested
-call catalog.list_objects(run_id=run_id, order_by="name", page_size=100)
-call catalog.search(run_id=run_id, query="", limit=25)
+call catalog.list_objects(target_id=target_id, run_id=run_id, order_by="name", page_size=100)
+call catalog.search(target_id=target_id, run_id=run_id, query="", limit=25)
 ```
 
 ### Step 3: Execute Staged Discovery
@@ -91,9 +93,9 @@ call agent.event_append(agent_run_id, "decision", {"plan": "...", "budgets": {..
# Stage 2: Summarize objects in batches for each batch: - call catalog.get_object(run_id, object_id, include_profiles=true) - call catalog.get_relationships(run_id, object_id) - call llm.summary_upsert(agent_run_id, run_id, object_id, summary={...}, confidence=0.8, sources={...}) + call catalog.get_object(target_id, run_id, object_id, include_profiles=true) + call catalog.get_relationships(target_id, run_id, object_id) + call llm.summary_upsert(target_id, agent_run_id, run_id, object_id, summary={...}, confidence=0.8, sources={...}) # Stage 3: Enhance relationships for each missing or unclear join: @@ -101,20 +103,20 @@ for each missing or unclear join: # Stage 4: Build domains for each domain (billing, sales, auth, etc.): - call llm.domain_upsert(agent_run_id, run_id, domain_key, title, description, confidence=0.8) - call llm.domain_set_members(agent_run_id, run_id, domain_key, members=[...]) + call llm.domain_upsert(target_id, agent_run_id, run_id, domain_key, title, description, confidence=0.8) + call llm.domain_set_members(target_id, agent_run_id, run_id, domain_key, members=[...]) # Stage 5: Create answerability artifacts for each metric: - call llm.metric_upsert(agent_run_id, run_id, metric_key, title, description, sql_template, depends, confidence=0.7) + call llm.metric_upsert(target_id, agent_run_id, run_id, metric_key, title, description, sql_template, depends, confidence=0.7) for each question template: # Extract table/view names from example_sql or template_json related_objects = ["Customer", "Invoice", "InvoiceLine"] # JSON array of object names - call llm.question_template_add(agent_run_id, run_id, title, question_nl, template, example_sql, related_objects, confidence=0.7) + call llm.question_template_add(target_id, run_id, title, question_nl, template, agent_run_id, example_sql, related_objects, confidence=0.7) # Final summary -call llm.note_add(agent_run_id, run_id, "global", title="Database Summary", body="...", tags=["final"]) +call llm.note_add(target_id, agent_run_id, run_id, "global", title="Database Summary", body="...", tags=["final"]) # Cleanup call agent.event_append(agent_run_id, "decision", {"status": "complete", "summaries": 50, "domains": 5, "metrics": 15, "templates": 25}) @@ -135,6 +137,6 @@ call agent.run_finish(agent_run_id, "success") ## Begin Now Start with Stage 0: -1. Use the provided run_id from the static harvest (DO NOT call discovery.run_static) -2. Call `agent.run_start` with that run_id +1. Use the provided target_id and run_id from the static harvest (DO NOT call discovery.run_static) +2. Call `agent.run_start` with target_id + run_id 3. Proceed with the discovery stages diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/static_harvest.sh b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/static_harvest.sh index 444020bb4..726dd38e1 100755 --- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/static_harvest.sh +++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/static_harvest.sh @@ -6,12 +6,12 @@ # No Claude Code required. 
# # Usage: -# ./static_harvest.sh [--schema SCHEMA] [--notes NOTES] [--endpoint URL] +# ./static_harvest.sh --target-id TARGET_ID [--schema SCHEMA] [--notes NOTES] [--endpoint URL] # # Examples: -# ./static_harvest.sh # Harvest all schemas -# ./static_harvest.sh --schema sales # Harvest specific schema -# ./static_harvest.sh --schema production --notes "Prod DB discovery" +# ./static_harvest.sh --target-id tap_mysql_default +# ./static_harvest.sh --target-id tap_mysql_default --schema sales +# ./static_harvest.sh --target-id tap_pgsql_default --schema public --notes "Prod DB discovery" # ./static_harvest.sh --endpoint https://192.168.1.100:6071/mcp/query set -e @@ -20,6 +20,7 @@ set -e ENDPOINT="${PROXYSQL_MCP_ENDPOINT:-https://127.0.0.1:6071/mcp/query}" SCHEMA_FILTER="" NOTES="" +TARGET_ID="${MCP_TARGET_ID:-}" # Parse arguments while [[ $# -gt 0 ]]; do @@ -28,6 +29,10 @@ while [[ $# -gt 0 ]]; do SCHEMA_FILTER="$2" shift 2 ;; + --target-id) + TARGET_ID="$2" + shift 2 + ;; --notes) NOTES="$2" shift 2 @@ -37,9 +42,10 @@ while [[ $# -gt 0 ]]; do shift 2 ;; -h|--help) - echo "Usage: $0 [--schema SCHEMA] [--notes NOTES] [--endpoint URL]" + echo "Usage: $0 --target-id TARGET_ID [--schema SCHEMA] [--notes NOTES] [--endpoint URL]" echo "" echo "Options:" + echo " --target-id ID Logical MCP target_id (required)" echo " --schema SCHEMA Restrict harvest to one MySQL schema (optional)" echo " --notes NOTES Optional notes for this discovery run" echo " --endpoint URL ProxySQL MCP endpoint (default: PROXYSQL_MCP_ENDPOINT env var or https://127.0.0.1:6071/mcp/query)" @@ -49,9 +55,9 @@ while [[ $# -gt 0 ]]; do echo " PROXYSQL_MCP_ENDPOINT Default MCP endpoint URL" echo "" echo "Examples:" - echo " $0 # Harvest all schemas" - echo " $0 --schema sales # Harvest specific schema" - echo " $0 --schema production --notes 'Prod DB discovery'" + echo " $0 --target-id tap_mysql_default" + echo " $0 --target-id tap_mysql_default --schema sales" + echo " $0 --target-id tap_pgsql_default --schema public --notes 'Prod DB discovery'" exit 0 ;; *) @@ -62,8 +68,15 @@ while [[ $# -gt 0 ]]; do esac done +if [[ -z "$TARGET_ID" ]]; then + echo "Error: --target-id is required" + echo "Use --help for usage information" + exit 1 +fi + # Build JSON arguments JSON_ARGS="{}" +JSON_ARGS=$(echo "$JSON_ARGS" | jq --arg target_id "$TARGET_ID" '. + {target_id: $target_id}') if [[ -n "$SCHEMA_FILTER" ]]; then JSON_ARGS=$(echo "$JSON_ARGS" | jq --arg schema "$SCHEMA_FILTER" '. + {schema_filter: $schema}') @@ -89,6 +102,7 @@ JSON_REQUEST=$(jq -n \ # Display what we're doing echo "=== Phase 1: Static Harvest ===" echo "Endpoint: $ENDPOINT" +echo "Target ID: $TARGET_ID" if [[ -n "$SCHEMA_FILTER" ]]; then echo "Schema: $SCHEMA_FILTER" else diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/test_catalog.sh b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/test_catalog.sh index 8abd98d05..25b747968 100755 --- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/test_catalog.sh +++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/test_catalog.sh @@ -7,10 +7,12 @@ set -e MCP_ENDPOINT="${PROXYSQL_MCP_ENDPOINT:-https://127.0.0.1:6071/mcp/query}" RUN_ID="${1:-10}" +TARGET_ID="${2:-${MCP_TARGET_ID:-tap_mysql_default}}" echo "=== Catalog Tools Test ===" echo "Using MCP endpoint: $MCP_ENDPOINT" echo "Using run_id: $RUN_ID" +echo "Using target_id: $TARGET_ID" echo "" echo "1. Testing catalog.list_objects..." 
@@ -24,6 +26,7 @@ curl -k -s -X POST "$MCP_ENDPOINT" \
       "name": "catalog.list_objects",
       "arguments": {
         "run_id": '$RUN_ID',
+        "target_id": "'$TARGET_ID'",
         "order_by": "name",
         "page_size": 5
       }
@@ -42,6 +45,7 @@ curl -k -s -X POST "$MCP_ENDPOINT" \
       "name": "catalog.get_object",
       "arguments": {
         "run_id": '$RUN_ID',
+        "target_id": "'$TARGET_ID'",
         "object_key": "codebase_community_template.users"
       }
     }
@@ -60,6 +64,7 @@ curl -k -s -X POST "$MCP_ENDPOINT" \
       "arguments": {
         "agent_run_id": 1,
         "run_id": '$RUN_ID',
+        "target_id": "'$TARGET_ID'",
         "object_id": 55,
         "summary": "{\"hypothesis\":\"Test user table\",\"grain\":\"one row per user\",\"primary_key\":[\"user_id\"],\"time_columns\":[\"created_at\"],\"example_questions\":[\"How many users do we have?\",\"Count users by registration date\"]}",
         "confidence": 0.9,
diff --git a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py
index e687211e4..b61438da7 100755
--- a/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py
+++ b/scripts/mcp/DiscoveryAgent/ClaudeCode_Headless/two_phase_discovery.py
@@ -33,16 +33,16 @@ def main():
         epilog="""
 Examples:
   # Discovery all schemas
-  %(prog)s --mcp-config mcp_config.json
+  %(prog)s --mcp-config mcp_config.json --target-id tap_mysql_default --schema test
 
   # Discovery specific schema
-  %(prog)s --mcp-config mcp_config.json --schema sales
+  %(prog)s --mcp-config mcp_config.json --target-id tap_mysql_default --schema sales
 
   # Discovery specific schema (REQUIRED)
-  %(prog)s --mcp-config mcp_config.json --schema Chinook
+  %(prog)s --mcp-config mcp_config.json --target-id tap_pgsql_default --schema public
 
   # With custom model
-  %(prog)s --mcp-config mcp_config.json --schema sales --model claude-3-opus-20240229
+  %(prog)s --mcp-config mcp_config.json --target-id tap_mysql_default --schema sales --model claude-3-opus-20240229
 """
     )
 
@@ -54,7 +54,12 @@ Examples:
     parser.add_argument(
         "--schema",
         required=True,
-        help="MySQL schema/database to discover (REQUIRED)"
+        help="Schema/database to discover (REQUIRED)"
+    )
+    parser.add_argument(
+        "--target-id",
+        required=True,
+        help="MCP target_id to use for static harvest and catalog/LLM tools (REQUIRED)"
     )
     parser.add_argument(
         "--model",
@@ -112,6 +117,7 @@ Examples:
             "params": {
                 "name": "discovery.run_static",
                 "arguments": {
+                    "target_id": args.target_id,
                     "schema_filter": args.schema
                 }
             }
@@ -137,11 +143,11 @@ Examples:
     if not run_id:
         print("Error: Could not determine run_id.", file=sys.stderr)
         print("Either:")
-        print("  1. Run: ./static_harvest.sh --schema <SCHEMA> first")
-        print("  2. Or use: ./two_phase_discovery.py --run-id <RUN_ID> --schema <SCHEMA>")
+        print("  1. Run: ./static_harvest.sh --target-id <TARGET_ID> --schema <SCHEMA> first")
+        print("  2. Or use: ./two_phase_discovery.py --run-id <RUN_ID> --target-id <TARGET_ID> --schema <SCHEMA>")
         sys.exit(1)
 
-    print(f"[*] Using run_id: {run_id} from existing static harvest")
+    print(f"[*] Using run_id: {run_id} for target_id: {args.target_id}")
 
     # Load prompts
     try:
@@ -155,6 +161,7 @@ Examples:
     # Replace placeholders in user prompt
     schema_filter = args.schema if args.schema else "all schemas"
     user_prompt = user_prompt.replace("<RUN_ID>", str(run_id))
+    user_prompt = user_prompt.replace("<TARGET_ID>", args.target_id)
     user_prompt = user_prompt.replace("<MODEL_NAME>", args.model)
     user_prompt = user_prompt.replace("<SCHEMA_FILTER>", schema_filter)
 
@@ -163,6 +170,7 @@ Examples:
         print("[DRY RUN] Two-Phase Database Discovery")
         print(f"  MCP Config: {args.mcp_config}")
         print(f"  Schema: {schema_filter}")
+        print(f"  Target ID: {args.target_id}")
         print(f"  Model: {args.model}")
         print(f"  Catalog Path: {args.catalog_path}")
         print()
@@ -193,6 +201,7 @@ Examples:
     # Launch Claude Code with the prompts
     print("[*] Launching Claude Code for two-phase discovery...")
     print(f"  Schema: {schema_filter}")
+    print(f"  Target ID: {args.target_id}")
     print(f"  Model: {args.model}")
     print(f"  Catalog: {args.catalog_path}")
     print(f"  MCP Config: {args.mcp_config}")
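To see the new contract end to end, the sketch below performs the same Phase 1 bootstrap that two_phase_discovery.py now issues: a target_id-scoped `discovery.run_static` call followed by extraction of the resulting run_id. It is a minimal sketch, not the script's actual logic; it assumes the `requests` package is available, reuses the JSON-RPC envelope from the README, and the `result.run_id` lookup is a guess about the response shape.

```python
# Illustrative Phase 1 bootstrap with the new target_id argument (assumptions:
# `requests` is installed; run_id extraction below is hypothetical).
import requests

MCP_ENDPOINT = "https://127.0.0.1:6071/mcp/query"  # default endpoint from the scripts

payload = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/call",
    "params": {
        "name": "discovery.run_static",
        "arguments": {
            "target_id": "tap_mysql_default",  # required by the new MCP contract
            "schema_filter": "test",
        },
    },
}

# verify=False plays the role of `curl -k` against the self-signed certificate.
resp = requests.post(MCP_ENDPOINT, json=payload, verify=False, timeout=300)
resp.raise_for_status()
body = resp.json()

# Hypothetical extraction -- adjust to the actual response; two_phase_discovery.py
# falls back to an explicit --run-id when the run cannot be determined.
run_id = body.get("result", {}).get("run_id")
print(f"Harvest complete; pass run_id={run_id} and the same target_id to Phase 2")
```

Whichever way the harvest is triggered, the value that matters downstream is the (target_id, run_id) pair: all Phase 2 catalog, agent, and llm calls are scoped to it.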