diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 081d42a2..58e7ae42 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -123,7 +123,7 @@ "name": "databases-on-aws", "source": "./plugins/databases-on-aws", "tags": ["aws", "database", "aurora", "dsql", "serverless", "postgresql"], - "version": "1.0.0" + "version": "1.1.0" }, { "category": "deployment", diff --git a/plugins/databases-on-aws/.claude-plugin/plugin.json b/plugins/databases-on-aws/.claude-plugin/plugin.json index a37712b7..ac479134 100644 --- a/plugins/databases-on-aws/.claude-plugin/plugin.json +++ b/plugins/databases-on-aws/.claude-plugin/plugin.json @@ -22,5 +22,5 @@ "license": "Apache-2.0", "name": "databases-on-aws", "repository": "https://github.com/awslabs/agent-plugins", - "version": "1.0.0" + "version": "1.1.0" } diff --git a/plugins/databases-on-aws/.codex-plugin/plugin.json b/plugins/databases-on-aws/.codex-plugin/plugin.json index 33831529..a797b3ba 100644 --- a/plugins/databases-on-aws/.codex-plugin/plugin.json +++ b/plugins/databases-on-aws/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "databases-on-aws", - "version": "1.0.0", + "version": "1.1.0", "description": "Expert database guidance for the AWS database portfolio. 
Design schemas, execute queries, handle migrations, and choose the right database for your workload.", "author": { "name": "Amazon Web Services", diff --git a/plugins/databases-on-aws/skills/dsql/SKILL.md b/plugins/databases-on-aws/skills/dsql/SKILL.md index 908a6514..ee3db2ba 100644 --- a/plugins/databases-on-aws/skills/dsql/SKILL.md +++ b/plugins/databases-on-aws/skills/dsql/SKILL.md @@ -153,13 +153,11 @@ defaults that may change — when a user's decision depends on an exact limit, v | Max indexes per table | 24 | `aurora dsql index limits` | | Max columns per index | 8 | `aurora dsql index limits` | | IDENTITY/SEQUENCE CACHE values | 1 or >= 65536 | `aurora dsql sequence cache` | +| Supported column data types | See docs | `aurora dsql supported data types` | -**When to verify:** Before recommending batch sizes, connection pool settings, or schema designs -where hitting a limit would cause failures. No need to verify for general guidance or when -the exact number doesn't affect the user's decision. +**When to verify:** Before recommending batch sizes, connection pool settings, or schema designs where hitting a limit would cause failures, or any time the exact number can affect the user's decision. -**Fallback:** If `awsknowledge` is unavailable, use the defaults above and note to the user -that limits should be verified against [DSQL documentation](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/). +**Fallback:** If `awsknowledge` is unavailable, use the defaults above and flag that limits should be verified against [DSQL documentation](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/). 
## CLI Scripts Available @@ -208,7 +206,7 @@ ALTER COLUMN TYPE, DROP COLUMN, DROP CONSTRAINT → Table Recreation Pattern (Wo - MUST include tenant_id in all tables - MUST use `CREATE INDEX ASYNC` exclusively - MUST issue each DDL in its own transact call: `transact(["CREATE TABLE ..."])` -- MUST store arrays/JSON as TEXT +- MUST serialize arrays as TEXT or JSON; cast back at query time (`string_to_array(text, ',')` or `jsonb_array_elements_text(json::jsonb)`) ### Workflow 2: Safe Data Migration diff --git a/plugins/databases-on-aws/skills/dsql/references/development-guide.md b/plugins/databases-on-aws/skills/dsql/references/development-guide.md index 5f47d3cb..d57f2245 100644 --- a/plugins/databases-on-aws/skills/dsql/references/development-guide.md +++ b/plugins/databases-on-aws/skills/dsql/references/development-guide.md @@ -13,7 +13,8 @@ effortless scaling, multi-region viability, among other advantages. - **REQUIRED: Follow DDL Guidelines** - Refer to [DDL Rules](#schema-ddl-rules) - **SHALL repeatedly generate fresh tokens** - Refer to [Connection Limits](auth/authentication-guide.md#connection-rules) - **ALWAYS use ASYNC indexes** - `CREATE INDEX ASYNC` is mandatory -- **MUST Serialize arrays/JSON as TEXT** - Store arrays/JSON as TEXT (comma separated, JSON.stringify) +- **MUST serialize arrays as TEXT or JSON** - see [Schema Design Rules](#schema-design-rules) +- **MUST cast to `JSONB` at query time** for JSONB operators — see [Supported Data Types](#supported-data-types) - **ALWAYS Batch within row limit** - maintain transaction limits (verify via `awsknowledge`: `aurora dsql transaction limits`) - **REQUIRED: Build and sanitize all SQL with `safe_query.build()`** - See [Input Validation](../mcp/tools/input-validation.md#required-pattern) - **MUST follow correct Application Layer Patterns** - when multi-tenant isolation or application referential integrity are required; refer to [Application Layer Patterns](#application-layer-patterns) @@ -53,9 +54,8 @@ 
effortless scaling, multi-region viability, among other advantages. ### Schema Design Rules -- MUST use **simple PostgreSQL types:** VARCHAR, TEXT, INTEGER, BOOLEAN, TIMESTAMP -- MUST store arrays as TEXT (comma-separated is recommended) -- MUST store JSON objects as TEXT (JSON.stringify) +- MUST verify column types via `awsknowledge`: `aurora dsql supported data types` or the [DSQL supported data types list](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html) +- MUST serialize arrays as TEXT or JSON; cast back at query time via `string_to_array(text, ',')` or `jsonb_array_elements_text(json::jsonb)` - ALWAYS include tenant_id in tables for multi-tenant isolation - SHOULD create async indexes for tenant_id and common query patterns @@ -124,9 +124,9 @@ UPDATE table SET c = 'default' WHERE c IS NULL; ← AFTER ADD COLUMN ### Supported Data Types -``` -VARCHAR, TEXT, INTEGER, DECIMAL, BOOLEAN, TIMESTAMP, UUID -``` +**MUST verify** column types against the [DSQL supported data types docs](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html) or via `awsknowledge`: `aurora dsql supported data types` — the supported set evolves, so do not treat any static list as exhaustive. 
+ +`JSONB`, arrays, and `INET` are **[runtime-only](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html#working-with-postgresql-compatibility-query-runtime)** — cast at query time ### Supported Key diff --git a/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md b/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md index 2b6a18ad..080035d2 100644 --- a/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md +++ b/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md @@ -54,7 +54,7 @@ async function batchInsert(pool, tenantId, items) { await client.query( `INSERT INTO entities (tenant_id, name, metadata) VALUES ($1, $2, $3)`, - [tenantId, item.name, JSON.stringify(item.metadata)] + [tenantId, item.name, item.metadata] ); } @@ -105,7 +105,7 @@ async function processBatches(pool, tenantId, batches, startIdx, step) { for (const item of batch) { await client.query( 'INSERT INTO entities (tenant_id, name, metadata) VALUES ($1, $2, $3)', - [tenantId, item.name, JSON.stringify(item.metadata)] + [tenantId, item.name, item.metadata] ); } diff --git a/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md b/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md index fc6d0b97..75ceac3a 100644 --- a/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md +++ b/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md @@ -129,9 +129,12 @@ INSERT INTO distributors VALUES (nextval('order_seq'), 'nothing'); --- -## Data Serialization +## Runtime-Only Types -**Pattern:** MUST store arrays and JSON as TEXT (runtime-only types). Per [DSQL docs](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html), cast to JSON at query time. 
+`JSONB`, arrays, and `INET` are [runtime-only](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html#working-with-postgresql-compatibility-query-runtime) — not valid as column types. + +- **MUST** serialize arrays as `TEXT` or `JSON` — use `TEXT` (comma-separated) for homogeneous short strings; use `JSON` when elements may contain commas or aren't homogeneous +- **MUST** cast back at query time — `string_to_array(text, ',')` for TEXT, `jsonb_array_elements_text(json::jsonb)` for JSON ```javascript function toTextArray(values) { @@ -142,32 +145,21 @@ function fromTextArray(textValue) { return textValue ? textValue.split(',').map(v => v.trim()) : []; } -function toTextJSON(object) { - return JSON.stringify(object); -} - -function fromTextJSON(textValue) { - if (!textValue) return null; - try { - return JSON.parse(textValue); - } catch (err) { - console.warn('Invalid JSON in column:', err.message); - return null; - } -} - const categoriesText = toTextArray(['backend', 'api', 'database']); await pool.query('INSERT INTO projects (project_id, categories) VALUES ($1, $2)', [projectId, categoriesText]); -const configText = toTextJSON({ theme: 'dark', notifications: true }); -await pool.query('INSERT INTO user_settings (user_id, preferences) VALUES ($1, $2)', [userId, configText]); +await pool.query( + 'INSERT INTO user_settings (user_id, preferences) VALUES ($1, $2)', + [userId, { theme: 'dark', notifications: true }], +); ``` Query-time operations: ```sql -SELECT user_id, preferences::jsonb->>'theme' as theme -FROM user_settings WHERE preferences::jsonb->>'notifications' = 'true'; +SELECT user_id, preferences::jsonb->>'theme' AS theme +FROM user_settings +WHERE preferences::jsonb->>'notifications' = 'true'; -SELECT project_id, string_to_array(categories, ',') as category_array FROM projects; +SELECT project_id, string_to_array(categories, ',') AS category_array FROM projects; ``` diff --git 
a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md index ae73389f..c1896c24 100644 --- a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md +++ b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md @@ -45,7 +45,7 @@ transact([ price DECIMAL(10,2) NOT NULL, category VARCHAR(255) DEFAULT 'other' CHECK (category IN ('electronics', 'clothing', 'food', 'other')), tags TEXT, - metadata TEXT, + metadata JSON, stock INTEGER DEFAULT 0 CHECK (stock >= 0), is_active BOOLEAN DEFAULT true, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, @@ -70,7 +70,7 @@ transact(["CREATE INDEX ASYNC idx_products_category ON products(tenant_id, categ | `MEDIUMTEXT` | `TEXT` | | `ENUM(...)` | `VARCHAR(255)` with `CHECK` constraint | | `SET(...)` | `TEXT` (comma-separated) | -| `JSON` | `TEXT` (JSON.stringify) | +| `JSON` | `JSON` | | `UNSIGNED` | `CHECK (col >= 0)` | | `TINYINT(1)` | `BOOLEAN` | | `DATETIME` | `TIMESTAMP` | @@ -99,7 +99,6 @@ transact(["CREATE INDEX ASYNC idx_products_category ON products(tenant_id, categ - **MUST convert** AUTO_INCREMENT to UUID with gen_random_uuid(), IDENTITY column with `GENERATED AS IDENTITY (CACHE ...)`, or explicit SEQUENCE -- ALWAYS use `GENERATED AS IDENTITY` for auto-incrementing columns (see [AUTO_INCREMENT Migration](ddl-auto-increment.md#auto_increment-migration)) - **MUST replace** ENUM with VARCHAR and CHECK constraint - **MUST replace** SET with TEXT (comma-separated) -- **MUST replace** JSON columns with TEXT - **MUST replace** FOREIGN KEY constraints with application-layer referential integrity - **MUST replace** ON UPDATE CURRENT_TIMESTAMP with application-layer updates - **MUST convert** all index creation to use CREATE INDEX ASYNC diff --git a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md 
b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md index bb15b42c..6eb9f615 100644 --- a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md +++ b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md @@ -97,7 +97,7 @@ Map MySQL data types to their DSQL equivalents. | MySQL Type | DSQL Equivalent | Notes | | -------------- | --------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | -| JSON | TEXT | MUST store as TEXT | +| JSON | `JSON` | Direct equivalent | | AUTO_INCREMENT | UUID with gen_random_uuid(), IDENTITY column, or SEQUENCE | See [AUTO_INCREMENT Migration](ddl-auto-increment.md#auto_increment-migration) for all three options | --- diff --git a/plugins/databases-on-aws/skills/dsql/references/onboarding.md b/plugins/databases-on-aws/skills/dsql/references/onboarding.md index 22a9e0bd..c1d87867 100644 --- a/plugins/databases-on-aws/skills/dsql/references/onboarding.md +++ b/plugins/databases-on-aws/skills/dsql/references/onboarding.md @@ -35,7 +35,7 @@ These guidelines apply when users say "Get started with DSQL" or similar phrases - Example: - "What column names would you like in this table?" - "What is the column name of the primary key?" - - "JSON must be serialized. Would you like to stringify the JSON to serialize it as TEXT?" + - "Would you like to store this in a `JSON` column, or serialize as TEXT?" 
**Examples:** @@ -252,7 +252,9 @@ cargo add aws-sdk-dsql tokio --features full - If yes, MUST verify DSQL compatibility: - No SERIAL types (use `GENERATED AS IDENTITY` with sequences, or UUID) - No foreign keys (implement in application) - - No array/JSON column types (serialize as TEXT) + - Serialize arrays as TEXT or JSON; cast back at query time (`string_to_array(text, ',')` / `jsonb_array_elements_text(json::jsonb)`) + - Cast to `JSONB` at query time for JSONB operators (`JSONB` is not a valid column type) + - Verify column types against the [supported data types list](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html) - Reference [`./development-guide.md`](./development-guide.md) for full constraints **If no schema found:** @@ -349,7 +351,7 @@ Let them know you're ready to help with more: **ALWAYS follow these rules:** 1. **Indexes:** Use `CREATE INDEX ASYNC` - synchronous index creation not supported -2. **Serialization:** Store arrays/JSON as TEXT (comma-separated or JSON.stringify) +2. **Runtime-only types:** Serialize arrays as TEXT or JSON; cast to `JSONB` at query time for JSONB operators 3. **Referential Integrity:** Implement foreign key validation in application code 4. **DDL Operations:** Execute one DDL per transaction, no mixing with DML 5. 
**Transaction Limits:** Maximum 3,000 row modifications, 10 MiB data size per transaction diff --git a/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md b/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md index bb0c0a22..ba540a16 100644 --- a/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md +++ b/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md @@ -52,6 +52,18 @@ Before referring to any listed errors, refer to the complete [DSQL troubleshooti - Use native TLS libraries (not OpenSSL 1.0.x) - Set `server_name_indication` to cluster endpoint in SSL config +## Cluster Lifecycle + +See [cluster lifecycle](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/cluster-lifecycle.html) for state definitions and behavior. + +### Error: "FATAL: unable to accept connection, waking up cluster, please retry later" + +The cluster is `INACTIVE` and waking up. Poll `aws dsql get-cluster --identifier <cluster-id> --region <region> --query status --output text` until `ACTIVE`, then retry. + +### Error: `FailedPrecondition` when backing up an `IDLE` / `INACTIVE` cluster + +Connect to the cluster to wake it, then retry the backup. + ## Incompatibility When migrating from PostgreSQL, remember DSQL doesn't support: @@ -83,10 +95,8 @@ See [full list of unsupported features](https://docs.aws.amazon.com/aurora-dsql/ **Cause:** Using TEXT[] or other array types **Solution:** -1. Change column to TEXT -2. Store as comma-separated: `"tag1,tag2,tag3"` -3. Or use JSON.stringify: `"["tag1","tag2","tag3"]"` -4. Deserialize in application layer +1. Change column to TEXT and store as comma-separated (`"tag1,tag2,tag3"`), or use a `JSON` column (`tags JSON`) +2. 
Deserialize in application layer; cast to `JSONB` at query time for JSONB operators ### Error: "Please use CREATE INDEX ASYNC" diff --git a/tools/evals/databases-on-aws/README.md b/tools/evals/databases-on-aws/README.md index 193d08ce..b520fc0b 100644 --- a/tools/evals/databases-on-aws/README.md +++ b/tools/evals/databases-on-aws/README.md @@ -14,7 +14,7 @@ scripts, and unit tests for that database's skill. tools/evals/databases-on-aws/ ├── README.md # This file — top-level index └── dsql/ # Aurora DSQL skill evals - ├── evals.json # Tier 2: functional evals (5 prompts, 20 assertions) + ├── evals.json # Tier 2: functional evals (9 prompts, 31 assertions) ├── trigger_evals.json # Tier 1: triggering evals (26 test cases) ├── safe_query_evals.json # Tier 3: safe_query enforcement (6 prompts, ~30 expectations) ├── query_explainability_evals.json # Workflow 8: query plan diagnostics (9 prompts, 70 assertions) @@ -68,15 +68,39 @@ python tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py \ --verbose ``` -**What it checks** (5 eval prompts, 20 assertions total): +Run a subset by ID (e.g., just the new type / lifecycle evals): -| Eval | Focus | Key assertions | -| ---------------------- | --------------------- | -------------------------------------------------------------------------- | -| 1. Transaction limits | MCP delegation | Calls `awsknowledge`, cites 3,000 row limit, recommends batching | -| 2. Multi-tenant schema | Correctness | Uses `tenant_id`, `CREATE INDEX ASYNC`, no foreign keys, separate DDL txns | -| 3. Index limits | MCP delegation | Calls `awsknowledge`, cites 24 index limit, suggests alternatives | -| 4. Python connection | Language routing | Recommends DSQL Python Connector, IAM auth, 15-min token expiry, SSL | -| 5. 
Column type change | DDL migration routing | Table Recreation Pattern, DROP TABLE warning, batching, user confirmation | +```bash +python tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py \ + --evals tools/evals/databases-on-aws/dsql/evals.json \ + --plugin-dir plugins/databases-on-aws \ + --output-dir /tmp/dsql-eval-results \ + --eval-ids 6,7,8,9 \ + --verbose +``` + +**What it checks** (9 eval prompts, 31 assertions total): + +| Eval | Focus | Grader | Key assertions | +| -------------------------- | --------------------- | --------- | ------------------------------------------------------------------------------------------------- | +| 1. Transaction limits | MCP delegation | regex | Calls `awsknowledge`, cites 3,000 row limit, recommends batching | +| 2. Multi-tenant schema | Correctness | regex | Uses `tenant_id`, `CREATE INDEX ASYNC`, no foreign keys, separate DDL txns | +| 3. Index limits | MCP delegation | regex | Calls `awsknowledge`, cites 24 index limit, suggests alternatives | +| 4. Python connection | Language routing | regex | Recommends DSQL Python Connector, IAM auth, 15-min token expiry, SSL | +| 5. Column type change | DDL migration routing | regex | Table Recreation Pattern, DROP TABLE warning, batching, user confirmation | +| 6. JSON column storage | Type guidance | LLM judge | Explains `::jsonb` cast, does not recommend `JSONB` as a column type | +| 7. Array storage | Type guidance | LLM judge | Flags `TEXT[]` / array column as unsupported, recommends TEXT or `JSON` column | +| 8. INACTIVE cluster error | Troubleshooting | LLM judge | Identifies INACTIVE state, uses `aws dsql get-cluster` to poll until `ACTIVE`, retries afterwards | +| 9. 
Backup on IDLE/INACTIVE | Troubleshooting | LLM judge | Identifies `FailedPrecondition`, connects to wake cluster to ACTIVE, retries backup | + +### Grader modes + +The runner supports two grading strategies; each eval declares which via `"llm_judge": true|false` (default `false`): + +- **Regex / tool-call** (evals 1-5): fast, cheap, deterministic. Good for verbatim tokens (`tenant_id`, `CREATE INDEX ASYNC`, the `3,000` row limit) and tool-invocation checks (`Calls awsknowledge with topic=X`). +- **LLM judge** (evals 6-9): runs `claude -p` once per expectation with the agent's final text, the user prompt, and the assertion. Returns `{passed, evidence}`. Good for semantic assertions where paraphrasing, negation, or synonym coverage makes regex brittle. Costs ~$0.01–0.05 per expectation; slower than regex. Use for assertions like "Does NOT recommend X" where the agent may phrase the refutation a hundred different ways. + +Pin the judge model independently of the subject model via `--judge-model` (defaults to the CLI default). Keep it stable across runs when bumping `--model` so grading stays comparable. ### Tier 3: Safe-Query Enforcement Evals diff --git a/tools/evals/databases-on-aws/dsql/evals.json b/tools/evals/databases-on-aws/dsql/evals.json index 706f0952..a4339000 100644 --- a/tools/evals/databases-on-aws/dsql/evals.json +++ b/tools/evals/databases-on-aws/dsql/evals.json @@ -60,6 +60,53 @@ "Mentions batching the data copy for tables exceeding 3,000 rows", "Requires or recommends user confirmation before destructive steps" ] + }, + { + "id": 6, + "prompt": "I need to store a JSON preferences blob per user in my DSQL users table. How should I model that column, and how do I query fields inside it?", + "expected_output": "Recommends either JSON or TEXT as a valid column type. 
Explains that JSONB is runtime-only and must be cast (preferences::jsonb->>'key') at query time for JSONB operators.", + "files": [], + "expectations": [ + "Mentions casting to jsonb at query time for JSONB operators", + "Does NOT claim JSONB is a valid column type (JSONB is runtime-only)" + ], + "llm_judge": true + }, + { + "id": 7, + "prompt": "I want to store a tags array per project in DSQL (e.g. ['backend','database','api']). Can I use TEXT[] for that?", + "expected_output": "Indicates TEXT[] / array column type is not supported in DSQL. Recommends storing arrays as TEXT (comma-separated) or inside a JSON column.", + "files": [], + "expectations": [ + "Indicates TEXT[] or array column type is not supported in DSQL", + "Recommends storing arrays as TEXT (comma-separated) or inside a JSON column" + ], + "llm_judge": true + }, + { + "id": 8, + "prompt": "My DSQL connection just failed with: FATAL: unable to accept connection, waking up cluster, please retry later. What's happening and what should I do?", + "expected_output": "Identifies the cluster as INACTIVE, explains that the first connection triggers the wake. Recommends polling cluster status via aws dsql get-cluster until ACTIVE, then retrying the connection.", + "files": [], + "expectations": [ + "Identifies the cluster is in INACTIVE state and waking up", + "Mentions aws dsql get-cluster for checking cluster status", + "Recommends polling until the cluster reaches ACTIVE state", + "Recommends retrying the connection after cluster becomes ACTIVE" + ], + "llm_judge": true + }, + { + "id": 9, + "prompt": "I ran `aws backup start-backup-job` on my DSQL cluster and got FailedPrecondition: Cluster is in state 'IDLE' and can't be backed up. How do I fix this?", + "expected_output": "Identifies that backups require an ACTIVE cluster. 
Recommends connecting to the cluster to transition it to ACTIVE, then retrying the backup.", + "files": [], + "expectations": [ + "Identifies that backups require the cluster to be in ACTIVE state", + "Recommends connecting to the cluster to wake it to ACTIVE", + "Recommends retrying the backup after cluster is ACTIVE" + ], + "llm_judge": true } ] } diff --git a/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py b/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py index 4ab8a585..82fc8a26 100644 --- a/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py +++ b/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py @@ -111,8 +111,109 @@ def run_prompt(prompt: str, plugin_dir: str, timeout: int = 180, model: str | No } -def grade_eval(eval_item: dict, run_result: dict) -> dict: - """Grade a single eval against its expectations.""" +def _llm_judge(prompt: str, result_text: str, expectation: str, model: str | None = None, timeout: int = 60) -> dict: + """Grade a single expectation via an LLM judge call (`claude -p`). + + Returns {"passed": bool, "evidence": str}. Used for semantic assertions where regex + grading is brittle (agent paraphrasing, negation handling, synonym coverage). The judge + sees the user prompt, the agent's final text, and the expectation, and returns a + one-line verdict. + """ + judge_prompt = ( + "You are grading a single assertion about an AI agent's answer. " + "Respond with a JSON object only, no prose, matching this schema:\n" + '{"passed": true|false, "evidence": ""}\n\n' + f"USER PROMPT TO AGENT:\n{prompt}\n\n" + f"AGENT'S FINAL ANSWER:\n{result_text}\n\n" + f"ASSERTION TO GRADE:\n{expectation}\n\n" + "Grade strictly: if the agent's answer supports the assertion, passed=true. " + "If the agent's answer contradicts or fails to address the assertion, passed=false. " + "For negative assertions (e.g. 
'Does NOT claim X'), passed=true only if the agent " + "clearly avoids X or actively refutes it; passed=false if the agent endorses X." + ) + cmd = ["claude", "-p", judge_prompt, "--output-format", "json", "--max-turns", "1"] + if model: + cmd.extend(["--model", model]) + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + try: + result = subprocess.run( # nosec B603 - cmd is a fixed list of literals + cmd, capture_output=True, text=True, timeout=timeout, env=env, + ) + except subprocess.TimeoutExpired: + return {"passed": False, "evidence": f"LLM judge timed out after {timeout}s"} + if result.returncode != 0: + return {"passed": False, "evidence": f"LLM judge exited {result.returncode}: {result.stderr[:200]}"} + # claude -p --output-format json returns a top-level object with `result` field containing the reply. + # Any parsing failure maps to `passed=False` so the grader fails-closed — never silently passes + # on malformed judge output. We catch the full Exception tree here (not just JSONDecodeError) + # because the judge reply may be a list, null, or otherwise non-dict shape, which would raise + # AttributeError/TypeError from `.get(...)` and crash the whole eval loop otherwise. + try: + outer = json.loads(result.stdout) + if not isinstance(outer, dict): + return {"passed": False, "evidence": f"LLM judge outer JSON not a dict: {type(outer).__name__}"} + reply = outer.get("result", "").strip() + # Extract the JSON verdict via brace-matching so nested `{` / `}` inside the `evidence` + # string (e.g. when the judge quotes a JSON snippet as proof) don't truncate the match. + # Fall through to the error path if no balanced object is found. 
+ verdict_text = _extract_balanced_json_object(reply) + if verdict_text is None: + return {"passed": False, "evidence": f"LLM judge reply did not contain JSON: {reply[:200]}"} + verdict = json.loads(verdict_text) + if not isinstance(verdict, dict): + return {"passed": False, "evidence": f"LLM judge verdict not an object: {str(verdict)[:100]}"} + return { + "passed": verdict.get("passed") is True,  # strict: only a real JSON `true` passes; bool("false") would be True and fail open + "evidence": str(verdict.get("evidence", ""))[:500], + } + except (json.JSONDecodeError, AttributeError, TypeError, KeyError) as e: + return {"passed": False, "evidence": f"LLM judge returned invalid JSON: {str(e)[:100]}"} + + +def _extract_balanced_json_object(s: str) -> str | None: + """Return the first balanced `{...}` substring in `s`, or None if none exists. + + Needed because LLM replies sometimes wrap the verdict in prose or fences AND the verdict's + `evidence` field may itself contain quoted `{}` characters that confuse naive regex matching. + Parses character-by-character, tracking brace depth, and respects string quoting. + """ + start = s.find("{") + if start == -1: + return None + depth = 0 + in_str = False + escape = False + for i in range(start, len(s)): + ch = s[i] + if escape: + escape = False + continue + if ch == "\\": + escape = True + continue + if ch == '"': + in_str = not in_str + continue + if in_str: + continue + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + return s[start:i + 1] + return None + + +def grade_eval(eval_item: dict, run_result: dict, judge_model: str | None = None) -> dict: + """Grade a single eval against its expectations. + + If `eval_item["llm_judge"]` is true, all expectations are graded via `_llm_judge`. + Otherwise, each expectation falls through to the regex-based elif chain below. 
+ Hybrid graders that work best with LLM semantic judgment (evals 6-9 in this suite) + set `llm_judge: true` in `evals.json`; evals grading on verbatim tokens or tool-call + presence (evals 1-5) keep regex-based grading where it is both sufficient and faster. + """ text = run_result["result_text"].lower() tool_calls = run_result["tool_calls"] @@ -124,11 +225,26 @@ def grade_eval(eval_item: dict, run_result: dict) -> dict: full_text += " " + json.dumps(msg).lower() expectations = [] + use_llm_judge = bool(eval_item.get("llm_judge", False)) for expectation_text in eval_item.get("expectations", []): passed = False evidence = "" + if use_llm_judge: + verdict = _llm_judge( + prompt=eval_item.get("prompt", ""), + result_text=run_result.get("result_text", ""), + expectation=expectation_text, + model=judge_model, + ) + expectations.append({ + "text": expectation_text, + "passed": verdict["passed"], + "evidence": verdict["evidence"], + }) + continue + exp_lower = expectation_text.lower() # --- Assertion: awsknowledge call with topic --- @@ -335,6 +451,10 @@ def grade_eval(eval_item: dict, run_result: dict) -> dict: evidence = "No alternatives suggested" # --- Fallback: keyword search --- + # Note: evals 6-9 assertions (JSONB column type, TEXT[], INACTIVE/backup lifecycle) + # are semantic — graded via `_llm_judge` when the eval sets `"llm_judge": true`. + # Regex branches below cover only evals 1-5 where verbatim tokens / tool-call topic + # matches are the right signal. See `_llm_judge` doc comment for rationale. 
else: keywords = re.findall(r'\b[a-z_]{3,}\b', exp_lower) significant = [k for k in keywords if k not in ( @@ -373,18 +493,44 @@ def main(): parser.add_argument("--evals", required=True, help="Path to evals.json") parser.add_argument("--plugin-dir", required=True, help="Path to the plugin directory") parser.add_argument("--output-dir", required=True, help="Directory to save results") - parser.add_argument("--model", default=None, help="Model to use") + parser.add_argument("--model", default=None, help="Model to use for the subject-under-test (the agent responding to eval prompts)") + parser.add_argument( + "--judge-model", + default=None, + help=( + "Model to use for the LLM judge on evals with `llm_judge: true`. Intentionally " + "separate from --model so that bumping the subject model does not silently swap " + "the judge and invalidate the regression baseline. Defaults to the claude CLI default." + ), + ) parser.add_argument("--timeout", type=int, default=180, help="Timeout per prompt in seconds") parser.add_argument("--verbose", action="store_true", help="Print progress") + parser.add_argument( + "--eval-ids", + type=lambda s: [int(x) for x in s.split(",")], + default=None, + help="Comma-separated list of eval IDs to run (default: all)", + ) args = parser.parse_args() evals_data = json.loads(Path(args.evals).read_text()) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) + eval_items = evals_data["evals"] + if args.eval_ids is not None: + requested = set(args.eval_ids) + eval_items = [e for e in eval_items if e["id"] in requested] + missing = requested - {e["id"] for e in eval_items} + if missing: + print(f"WARNING: eval IDs not found: {sorted(missing)}", file=sys.stderr) + if not eval_items: + print("ERROR: no matching eval IDs", file=sys.stderr) + return 1 + all_results = [] - for eval_item in evals_data["evals"]: + for eval_item in eval_items: eval_id = eval_item["id"] prompt = eval_item["prompt"] @@ -400,7 +546,7 @@ def main(): 
(eval_dir / "transcript.json").write_text(json.dumps(run_result, indent=2)) # Grade - grading = grade_eval(eval_item, run_result) + grading = grade_eval(eval_item, run_result, judge_model=args.judge_model) (eval_dir / "grading.json").write_text(json.dumps(grading, indent=2)) # Save timing @@ -456,4 +602,4 @@ def main(): if __name__ == "__main__": - main() + sys.exit(main() or 0)