diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 081d42a2..58e7ae42 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -123,7 +123,7 @@
       "name": "databases-on-aws",
       "source": "./plugins/databases-on-aws",
       "tags": ["aws", "database", "aurora", "dsql", "serverless", "postgresql"],
-      "version": "1.0.0"
+      "version": "1.1.0"
     },
     {
       "category": "deployment",
diff --git a/plugins/databases-on-aws/.claude-plugin/plugin.json b/plugins/databases-on-aws/.claude-plugin/plugin.json
index a37712b7..ac479134 100644
--- a/plugins/databases-on-aws/.claude-plugin/plugin.json
+++ b/plugins/databases-on-aws/.claude-plugin/plugin.json
@@ -22,5 +22,5 @@
   "license": "Apache-2.0",
   "name": "databases-on-aws",
   "repository": "https://github.com/awslabs/agent-plugins",
-  "version": "1.0.0"
+  "version": "1.1.0"
 }
diff --git a/plugins/databases-on-aws/.codex-plugin/plugin.json b/plugins/databases-on-aws/.codex-plugin/plugin.json
index 33831529..a797b3ba 100644
--- a/plugins/databases-on-aws/.codex-plugin/plugin.json
+++ b/plugins/databases-on-aws/.codex-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "databases-on-aws",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "description": "Expert database guidance for the AWS database portfolio. Design schemas, execute queries, handle migrations, and choose the right database for your workload.",
   "author": {
     "name": "Amazon Web Services",
diff --git a/plugins/databases-on-aws/skills/dsql/SKILL.md b/plugins/databases-on-aws/skills/dsql/SKILL.md
index 908a6514..ee3db2ba 100644
--- a/plugins/databases-on-aws/skills/dsql/SKILL.md
+++ b/plugins/databases-on-aws/skills/dsql/SKILL.md
@@ -153,13 +153,11 @@ defaults that may change — when a user's decision depends on an exact limit, v
 | Max indexes per table          | 24            | `aurora dsql index limits`         |
 | Max columns per index          | 8             | `aurora dsql index limits`         |
 | IDENTITY/SEQUENCE CACHE values | 1 or >= 65536 | `aurora dsql sequence cache`       |
+| Supported column data types    | See docs      | `aurora dsql supported data types` |
 
-**When to verify:** Before recommending batch sizes, connection pool settings, or schema designs
-where hitting a limit would cause failures. No need to verify for general guidance or when
-the exact number doesn't affect the user's decision.
+**When to verify:** Before recommending batch sizes, connection pool settings, or schema designs where hitting a limit would cause failures, and any time the exact number can affect the user's decision.
 
-**Fallback:** If `awsknowledge` is unavailable, use the defaults above and note to the user
-that limits should be verified against [DSQL documentation](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/).
+**Fallback:** If `awsknowledge` is unavailable, use the defaults above and flag to the user that limits should be verified against [DSQL documentation](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/).
 
 ## CLI Scripts Available
 
@@ -208,7 +206,7 @@ ALTER COLUMN TYPE, DROP COLUMN, DROP CONSTRAINT → Table Recreation Pattern (Wo
 - MUST include tenant_id in all tables
 - MUST use `CREATE INDEX ASYNC` exclusively
 - MUST issue each DDL in its own transact call: `transact(["CREATE TABLE ..."])`
-- MUST store arrays/JSON as TEXT
+- MUST serialize arrays as TEXT or JSON; cast back at query time (`string_to_array(text, ',')` or `jsonb_array_elements_text(json::jsonb)`)
 
 ### Workflow 2: Safe Data Migration
 
diff --git a/plugins/databases-on-aws/skills/dsql/references/development-guide.md b/plugins/databases-on-aws/skills/dsql/references/development-guide.md
index 5f47d3cb..d57f2245 100644
--- a/plugins/databases-on-aws/skills/dsql/references/development-guide.md
+++ b/plugins/databases-on-aws/skills/dsql/references/development-guide.md
@@ -13,7 +13,8 @@ effortless scaling, multi-region viability, among other advantages.
 - **REQUIRED: Follow DDL Guidelines** - Refer to [DDL Rules](#schema-ddl-rules)
 - **SHALL repeatedly generate fresh tokens** - Refer to [Connection Limits](auth/authentication-guide.md#connection-rules)
 - **ALWAYS use ASYNC indexes** - `CREATE INDEX ASYNC` is mandatory
-- **MUST Serialize arrays/JSON as TEXT** - Store arrays/JSON as TEXT (comma separated, JSON.stringify)
+- **MUST serialize arrays as TEXT or JSON** - see [Schema Design Rules](#schema-design-rules)
+- **MUST cast to `JSONB` at query time** for JSONB operators — see [Supported Data Types](#supported-data-types)
 - **ALWAYS Batch within row limit** - maintain transaction limits (verify via `awsknowledge`: `aurora dsql transaction limits`)
 - **REQUIRED: Build and sanitize all SQL with `safe_query.build()`** - See [Input Validation](../mcp/tools/input-validation.md#required-pattern)
 - **MUST follow correct Application Layer Patterns** - when multi-tenant isolation or application referential integrity are required; refer to [Application Layer Patterns](#application-layer-patterns)
@@ -53,9 +54,8 @@ effortless scaling, multi-region viability, among other advantages.
 
 ### Schema Design Rules
 
-- MUST use **simple PostgreSQL types:** VARCHAR, TEXT, INTEGER, BOOLEAN, TIMESTAMP
-- MUST store arrays as TEXT (comma-separated is recommended)
-- MUST store JSON objects as TEXT (JSON.stringify)
+- MUST verify column types via `awsknowledge`: `aurora dsql supported data types` or the [DSQL supported data types list](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html)
+- MUST serialize arrays as TEXT or JSON; cast back at query time via `string_to_array(text, ',')` or `jsonb_array_elements_text(json::jsonb)`
 - ALWAYS include tenant_id in tables for multi-tenant isolation
 - SHOULD create async indexes for tenant_id and common query patterns
 
@@ -124,9 +124,9 @@ UPDATE table SET c = 'default' WHERE c IS NULL;        ← AFTER ADD COLUMN
 
 ### Supported Data Types
 
-```
-VARCHAR, TEXT, INTEGER, DECIMAL, BOOLEAN, TIMESTAMP, UUID
-```
+**MUST verify** column types against the [DSQL supported data types docs](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html) or via `awsknowledge`: `aurora dsql supported data types` — the supported set evolves, so do not treat any static list as exhaustive.
+
+`JSONB`, arrays, and `INET` are **[runtime-only](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html#working-with-postgresql-compatibility-query-runtime)** — not valid as column types; cast to them at query time.
 
 ### Supported Key
 
diff --git a/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md b/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md
index 2b6a18ad..080035d2 100644
--- a/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md
+++ b/plugins/databases-on-aws/skills/dsql/references/examples/data-operations.md
@@ -54,7 +54,7 @@ async function batchInsert(pool, tenantId, items) {
         await client.query(
           `INSERT INTO entities (tenant_id, name, metadata)
           VALUES ($1, $2, $3)`,
-          [tenantId, item.name, JSON.stringify(item.metadata)]
+          [tenantId, item.name, item.metadata]
         );
       }
 
@@ -105,7 +105,7 @@ async function processBatches(pool, tenantId, batches, startIdx, step) {
       for (const item of batch) {
         await client.query(
           'INSERT INTO entities (tenant_id, name, metadata) VALUES ($1, $2, $3)',
-          [tenantId, item.name, JSON.stringify(item.metadata)]
+          [tenantId, item.name, item.metadata]
         );
       }
 
diff --git a/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md b/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md
index fc6d0b97..75ceac3a 100644
--- a/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md
+++ b/plugins/databases-on-aws/skills/dsql/references/examples/patterns.md
@@ -129,9 +129,12 @@ INSERT INTO distributors VALUES (nextval('order_seq'), 'nothing');
 
 ---
 
-## Data Serialization
+## Runtime-Only Types
 
-**Pattern:** MUST store arrays and JSON as TEXT (runtime-only types). Per [DSQL docs](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html), cast to JSON at query time.
+`JSONB`, arrays, and `INET` are [runtime-only](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html#working-with-postgresql-compatibility-query-runtime) — not valid as column types.
+
+- **MUST** serialize arrays as `TEXT` or `JSON` — use `TEXT` (comma-separated) for homogeneous short strings; use `JSON` when elements may contain commas or aren't homogeneous
+- **MUST** cast back at query time — `string_to_array(text, ',')` for TEXT, `jsonb_array_elements_text(json::jsonb)` for JSON
 
 ```javascript
 function toTextArray(values) {
@@ -142,32 +145,21 @@ function fromTextArray(textValue) {
   return textValue ? textValue.split(',').map(v => v.trim()) : [];
 }
 
-function toTextJSON(object) {
-  return JSON.stringify(object);
-}
-
-function fromTextJSON(textValue) {
-  if (!textValue) return null;
-  try {
-    return JSON.parse(textValue);
-  } catch (err) {
-    console.warn('Invalid JSON in column:', err.message);
-    return null;
-  }
-}
-
 const categoriesText = toTextArray(['backend', 'api', 'database']);
 await pool.query('INSERT INTO projects (project_id, categories) VALUES ($1, $2)', [projectId, categoriesText]);
 
-const configText = toTextJSON({ theme: 'dark', notifications: true });
-await pool.query('INSERT INTO user_settings (user_id, preferences) VALUES ($1, $2)', [userId, configText]);
+await pool.query(
+  'INSERT INTO user_settings (user_id, preferences) VALUES ($1, $2)',
+  [userId, { theme: 'dark', notifications: true }],
+);
 ```
 
 Query-time operations:
 
 ```sql
-SELECT user_id, preferences::jsonb->>'theme' as theme
-FROM user_settings WHERE preferences::jsonb->>'notifications' = 'true';
+SELECT user_id, preferences::jsonb->>'theme' AS theme
+FROM user_settings
+WHERE preferences::jsonb->>'notifications' = 'true';
 
-SELECT project_id, string_to_array(categories, ',') as category_array FROM projects;
+SELECT project_id, string_to_array(categories, ',') AS category_array FROM projects;
 ```
diff --git a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md
index ae73389f..c1896c24 100644
--- a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md
+++ b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/full-example.md
@@ -45,7 +45,7 @@ transact([
      price DECIMAL(10,2) NOT NULL,
      category VARCHAR(255) DEFAULT 'other' CHECK (category IN ('electronics', 'clothing', 'food', 'other')),
      tags TEXT,
-     metadata TEXT,
+     metadata JSON,
      stock INTEGER DEFAULT 0 CHECK (stock >= 0),
      is_active BOOLEAN DEFAULT true,
      created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
@@ -70,7 +70,7 @@ transact(["CREATE INDEX ASYNC idx_products_category ON products(tenant_id, categ
 | `MEDIUMTEXT`                  | `TEXT`                                                                                                                                                     |
 | `ENUM(...)`                   | `VARCHAR(255)` with `CHECK` constraint                                                                                                                     |
 | `SET(...)`                    | `TEXT` (comma-separated)                                                                                                                                   |
-| `JSON`                        | `TEXT` (JSON.stringify)                                                                                                                                    |
+| `JSON`                        | `JSON`                                                                                                                                                     |
 | `UNSIGNED`                    | `CHECK (col >= 0)`                                                                                                                                         |
 | `TINYINT(1)`                  | `BOOLEAN`                                                                                                                                                  |
 | `DATETIME`                    | `TIMESTAMP`                                                                                                                                                |
@@ -99,7 +99,6 @@ transact(["CREATE INDEX ASYNC idx_products_category ON products(tenant_id, categ
 - **MUST convert** AUTO_INCREMENT to UUID with gen_random_uuid(), IDENTITY column with `GENERATED AS IDENTITY (CACHE ...)`, or explicit SEQUENCE -- ALWAYS use `GENERATED AS IDENTITY` for auto-incrementing columns (see [AUTO_INCREMENT Migration](ddl-auto-increment.md#auto_increment-migration))
 - **MUST replace** ENUM with VARCHAR and CHECK constraint
 - **MUST replace** SET with TEXT (comma-separated)
-- **MUST replace** JSON columns with TEXT
 - **MUST replace** FOREIGN KEY constraints with application-layer referential integrity
 - **MUST replace** ON UPDATE CURRENT_TIMESTAMP with application-layer updates
 - **MUST convert** all index creation to use CREATE INDEX ASYNC
diff --git a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md
index bb15b42c..6eb9f615 100644
--- a/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md
+++ b/plugins/databases-on-aws/skills/dsql/references/mysql-migrations/type-mapping.md
@@ -97,7 +97,7 @@ Map MySQL data types to their DSQL equivalents.
 
 | MySQL Type     | DSQL Equivalent                                           | Notes                                                                                                |
 | -------------- | --------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- |
-| JSON           | TEXT                                                      | MUST store as TEXT                                                                                   |
+| JSON           | `JSON`                                                    | Direct equivalent                                                                                    |
 | AUTO_INCREMENT | UUID with gen_random_uuid(), IDENTITY column, or SEQUENCE | See [AUTO_INCREMENT Migration](ddl-auto-increment.md#auto_increment-migration) for all three options |
 
 ---
diff --git a/plugins/databases-on-aws/skills/dsql/references/onboarding.md b/plugins/databases-on-aws/skills/dsql/references/onboarding.md
index 22a9e0bd..c1d87867 100644
--- a/plugins/databases-on-aws/skills/dsql/references/onboarding.md
+++ b/plugins/databases-on-aws/skills/dsql/references/onboarding.md
@@ -35,7 +35,7 @@ These guidelines apply when users say "Get started with DSQL" or similar phrases
   - Example:
     - "What column names would you like in this table?"
     - "What is the column name of the primary key?"
-    - "JSON must be serialized. Would you like to stringify the JSON to serialize it as TEXT?"
+    - "Would you like to store this in a `JSON` column, or serialize as TEXT?"
 
 **Examples:**
 
@@ -252,7 +252,9 @@ cargo add aws-sdk-dsql tokio --features full
 - If yes, MUST verify DSQL compatibility:
   - No SERIAL types (use `GENERATED AS IDENTITY` with sequences, or UUID)
   - No foreign keys (implement in application)
-  - No array/JSON column types (serialize as TEXT)
+  - Serialize arrays as TEXT or JSON; cast back at query time (`string_to_array(text, ',')` / `jsonb_array_elements_text(json::jsonb)`)
+  - Cast to `JSONB` at query time for JSONB operators (`JSONB` is not a valid column type)
+  - Verify column types against the [supported data types list](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/working-with-postgresql-compatibility-supported-data-types.html)
   - Reference [`./development-guide.md`](./development-guide.md) for full constraints
 
 **If no schema found:**
@@ -349,7 +351,7 @@ Let them know you're ready to help with more:
 **ALWAYS follow these rules:**
 
 1. **Indexes:** Use `CREATE INDEX ASYNC` - synchronous index creation not supported
-2. **Serialization:** Store arrays/JSON as TEXT (comma-separated or JSON.stringify)
+2. **Runtime-only types:** Serialize arrays as TEXT or JSON; cast to `JSONB` at query time for JSONB operators
 3. **Referential Integrity:** Implement foreign key validation in application code
 4. **DDL Operations:** Execute one DDL per transaction, no mixing with DML
 5. **Transaction Limits:** Maximum 3,000 row modifications, 10 MiB data size per transaction
diff --git a/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md b/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md
index bb0c0a22..ba540a16 100644
--- a/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md
+++ b/plugins/databases-on-aws/skills/dsql/references/troubleshooting.md
@@ -52,6 +52,18 @@ Before referring to any listed errors, refer to the complete [DSQL troubleshooti
 - Use native TLS libraries (not OpenSSL 1.0.x)
 - Set `server_name_indication` to cluster endpoint in SSL config
 
+## Cluster Lifecycle
+
+See [cluster lifecycle](https://docs.aws.amazon.com/aurora-dsql/latest/userguide/cluster-lifecycle.html) for state definitions and behavior.
+
+### Error: "FATAL: unable to accept connection, waking up cluster, please retry later"
+
+The cluster is `INACTIVE` and waking up. Poll `aws dsql get-cluster --identifier <id> --region <region> --query status --output text` until `ACTIVE`, then retry.
+
+### Error: `FailedPrecondition` when backing up an `IDLE` / `INACTIVE` cluster
+
+Connect to the cluster to wake it, then retry the backup.
+
 ## Incompatibility
 
 When migrating from PostgreSQL, remember DSQL doesn't support:
@@ -83,10 +95,8 @@ See [full list of unsupported features](https://docs.aws.amazon.com/aurora-dsql/
 **Cause:** Using TEXT[] or other array types
 **Solution:**
 
-1. Change column to TEXT
-2. Store as comma-separated: `"tag1,tag2,tag3"`
-3. Or use JSON.stringify: `"["tag1","tag2","tag3"]"`
-4. Deserialize in application layer
+1. Change column to TEXT and store as comma-separated (`"tag1,tag2,tag3"`), or use a `JSON` column (`tags JSON`)
+2. Deserialize in application layer; cast to `JSONB` at query time for JSONB operators
 
 ### Error: "Please use CREATE INDEX ASYNC"
 
diff --git a/tools/evals/databases-on-aws/README.md b/tools/evals/databases-on-aws/README.md
index 193d08ce..b520fc0b 100644
--- a/tools/evals/databases-on-aws/README.md
+++ b/tools/evals/databases-on-aws/README.md
@@ -14,7 +14,7 @@ scripts, and unit tests for that database's skill.
 tools/evals/databases-on-aws/
 ├── README.md                        # This file — top-level index
 └── dsql/                            # Aurora DSQL skill evals
-    ├── evals.json                   # Tier 2: functional evals (5 prompts, 20 assertions)
+    ├── evals.json                   # Tier 2: functional evals (9 prompts, 31 assertions)
     ├── trigger_evals.json           # Tier 1: triggering evals (26 test cases)
     ├── safe_query_evals.json        # Tier 3: safe_query enforcement (6 prompts, ~30 expectations)
     ├── query_explainability_evals.json  # Workflow 8: query plan diagnostics (9 prompts, 70 assertions)
@@ -68,15 +68,39 @@ python tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py \
   --verbose
 ```
 
-**What it checks** (5 eval prompts, 20 assertions total):
+Run a subset by ID (e.g., just the new type / lifecycle evals):
 
-| Eval                   | Focus                 | Key assertions                                                             |
-| ---------------------- | --------------------- | -------------------------------------------------------------------------- |
-| 1. Transaction limits  | MCP delegation        | Calls `awsknowledge`, cites 3,000 row limit, recommends batching           |
-| 2. Multi-tenant schema | Correctness           | Uses `tenant_id`, `CREATE INDEX ASYNC`, no foreign keys, separate DDL txns |
-| 3. Index limits        | MCP delegation        | Calls `awsknowledge`, cites 24 index limit, suggests alternatives          |
-| 4. Python connection   | Language routing      | Recommends DSQL Python Connector, IAM auth, 15-min token expiry, SSL       |
-| 5. Column type change  | DDL migration routing | Table Recreation Pattern, DROP TABLE warning, batching, user confirmation  |
+```bash
+python tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py \
+  --evals tools/evals/databases-on-aws/dsql/evals.json \
+  --plugin-dir plugins/databases-on-aws \
+  --output-dir /tmp/dsql-eval-results \
+  --eval-ids 6,7,8,9 \
+  --verbose
+```
+
+**What it checks** (9 eval prompts, 31 assertions total):
+
+| Eval                       | Focus                 | Grader    | Key assertions                                                                                    |
+| -------------------------- | --------------------- | --------- | ------------------------------------------------------------------------------------------------- |
+| 1. Transaction limits      | MCP delegation        | regex     | Calls `awsknowledge`, cites 3,000 row limit, recommends batching                                  |
+| 2. Multi-tenant schema     | Correctness           | regex     | Uses `tenant_id`, `CREATE INDEX ASYNC`, no foreign keys, separate DDL txns                        |
+| 3. Index limits            | MCP delegation        | regex     | Calls `awsknowledge`, cites 24 index limit, suggests alternatives                                 |
+| 4. Python connection       | Language routing      | regex     | Recommends DSQL Python Connector, IAM auth, 15-min token expiry, SSL                              |
+| 5. Column type change      | DDL migration routing | regex     | Table Recreation Pattern, DROP TABLE warning, batching, user confirmation                         |
+| 6. JSON column storage     | Type guidance         | LLM judge | Explains `::jsonb` cast, does not recommend `JSONB` as a column type                              |
+| 7. Array storage           | Type guidance         | LLM judge | Flags `TEXT[]` / array column as unsupported, recommends TEXT or `JSON` column                    |
+| 8. INACTIVE cluster error  | Troubleshooting       | LLM judge | Identifies INACTIVE state, uses `aws dsql get-cluster` to poll until `ACTIVE`, retries afterwards |
+| 9. Backup on IDLE/INACTIVE | Troubleshooting       | LLM judge | Identifies `FailedPrecondition`, connects to wake cluster to ACTIVE, retries backup               |
+
+### Grader modes
+
+The runner supports two grading strategies; each eval declares which via `"llm_judge": true|false` (default `false`):
+
+- **Regex / tool-call** (evals 1-5): fast, cheap, deterministic. Good for verbatim tokens (`tenant_id`, `CREATE INDEX ASYNC`, the `3,000` row limit) and tool-invocation checks (`Calls awsknowledge with topic=X`).
+- **LLM judge** (evals 6-9): runs `claude -p` once per expectation with the agent's final text, the user prompt, and the assertion. Returns `{passed, evidence}`. Good for semantic assertions where paraphrasing, negation, or synonym coverage makes regex brittle. Costs ~$0.01–0.05 per expectation; slower than regex. Use for assertions like "Does NOT recommend X" where the agent may phrase the refutation a hundred different ways.
+
+Pin the judge model independently of the subject model via `--judge-model` (defaults to the CLI default). Keep it stable across runs when bumping `--model` so grading stays comparable.
 
 ### Tier 3: Safe-Query Enforcement Evals
 
diff --git a/tools/evals/databases-on-aws/dsql/evals.json b/tools/evals/databases-on-aws/dsql/evals.json
index 706f0952..a4339000 100644
--- a/tools/evals/databases-on-aws/dsql/evals.json
+++ b/tools/evals/databases-on-aws/dsql/evals.json
@@ -60,6 +60,53 @@
         "Mentions batching the data copy for tables exceeding 3,000 rows",
         "Requires or recommends user confirmation before destructive steps"
       ]
+    },
+    {
+      "id": 6,
+      "prompt": "I need to store a JSON preferences blob per user in my DSQL users table. How should I model that column, and how do I query fields inside it?",
+      "expected_output": "Recommends either JSON or TEXT as a valid column type. Explains that JSONB is runtime-only and must be cast (preferences::jsonb->>'key') at query time for JSONB operators.",
+      "files": [],
+      "expectations": [
+        "Mentions casting to jsonb at query time for JSONB operators",
+        "Does NOT claim JSONB is a valid column type (JSONB is runtime-only)"
+      ],
+      "llm_judge": true
+    },
+    {
+      "id": 7,
+      "prompt": "I want to store a tags array per project in DSQL (e.g. ['backend','database','api']). Can I use TEXT[] for that?",
+      "expected_output": "Indicates TEXT[] / array column type is not supported in DSQL. Recommends storing arrays as TEXT (comma-separated) or inside a JSON column.",
+      "files": [],
+      "expectations": [
+        "Indicates TEXT[] or array column type is not supported in DSQL",
+        "Recommends storing arrays as TEXT (comma-separated) or inside a JSON column"
+      ],
+      "llm_judge": true
+    },
+    {
+      "id": 8,
+      "prompt": "My DSQL connection just failed with: FATAL: unable to accept connection, waking up cluster, please retry later. What's happening and what should I do?",
+      "expected_output": "Identifies the cluster as INACTIVE, explains that the first connection triggers the wake. Recommends polling cluster status via aws dsql get-cluster until ACTIVE, then retrying the connection.",
+      "files": [],
+      "expectations": [
+        "Identifies the cluster is in INACTIVE state and waking up",
+        "Mentions aws dsql get-cluster for checking cluster status",
+        "Recommends polling until the cluster reaches ACTIVE state",
+        "Recommends retrying the connection after cluster becomes ACTIVE"
+      ],
+      "llm_judge": true
+    },
+    {
+      "id": 9,
+      "prompt": "I ran `aws backup start-backup-job` on my DSQL cluster and got FailedPrecondition: Cluster is in state 'IDLE' and can't be backed up. How do I fix this?",
+      "expected_output": "Identifies that backups require an ACTIVE cluster. Recommends connecting to the cluster to transition it to ACTIVE, then retrying the backup.",
+      "files": [],
+      "expectations": [
+        "Identifies that backups require the cluster to be in ACTIVE state",
+        "Recommends connecting to the cluster to wake it to ACTIVE",
+        "Recommends retrying the backup after cluster is ACTIVE"
+      ],
+      "llm_judge": true
     }
   ]
 }
diff --git a/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py b/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py
index 4ab8a585..82fc8a26 100644
--- a/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py
+++ b/tools/evals/databases-on-aws/dsql/scripts/run_functional_evals.py
@@ -111,8 +111,109 @@ def run_prompt(prompt: str, plugin_dir: str, timeout: int = 180, model: str | No
     }
 
 
-def grade_eval(eval_item: dict, run_result: dict) -> dict:
-    """Grade a single eval against its expectations."""
+def _llm_judge(prompt: str, result_text: str, expectation: str, model: str | None = None, timeout: int = 60) -> dict:
+    """Grade one expectation with an LLM judge (`claude -p`) and return {"passed": bool, "evidence": str}.
+
+    Used for semantic assertions where regex grading is brittle (paraphrasing, negation,
+    synonyms). The judge sees `prompt`, `result_text`, and `expectation`; `model` is passed as
+    `--model` when set and `timeout` bounds the subprocess in seconds. Fails closed: timeout,
+    non-zero exit, unparsable output, or a `passed` value other than JSON `true` all yield False.
+    """
+    judge_prompt = (
+        "You are grading a single assertion about an AI agent's answer. "
+        "Respond with a JSON object only, no prose, matching this schema:\n"
+        '{"passed": true|false, "evidence": "<under 200 chars explaining the verdict>"}\n\n'
+        f"USER PROMPT TO AGENT:\n{prompt}\n\n"
+        f"AGENT'S FINAL ANSWER:\n{result_text}\n\n"
+        f"ASSERTION TO GRADE:\n{expectation}\n\n"
+        "Grade strictly: if the agent's answer supports the assertion, passed=true. "
+        "If the agent's answer contradicts or fails to address the assertion, passed=false. "
+        "For negative assertions (e.g. 'Does NOT claim X'), passed=true only if the agent "
+        "clearly avoids X or actively refutes it; passed=false if the agent endorses X."
+    )
+    cmd = ["claude", "-p", judge_prompt, "--output-format", "json", "--max-turns", "1"]
+    if model:
+        cmd.extend(["--model", model])
+    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+    try:
+        result = subprocess.run(  # nosec B603 - argv list, no shell; prompt is one argument
+            cmd, capture_output=True, text=True, timeout=timeout, env=env,
+        )
+    except subprocess.TimeoutExpired:
+        return {"passed": False, "evidence": f"LLM judge timed out after {timeout}s"}
+    if result.returncode != 0:
+        return {"passed": False, "evidence": f"LLM judge exited {result.returncode}: {result.stderr[:200]}"}
+    # `claude -p --output-format json` returns a top-level object whose `result` field holds the reply.
+    # Any parsing failure maps to `passed=False` so the grader fails closed — it never silently
+    # passes on malformed judge output. Besides JSONDecodeError we catch AttributeError/TypeError/
+    # KeyError because `result` may be null or otherwise not a string, and `.strip()` on it would
+    # otherwise raise and crash the whole eval loop.
+    try:
+        outer = json.loads(result.stdout)
+        if not isinstance(outer, dict):
+            return {"passed": False, "evidence": f"LLM judge outer JSON not a dict: {type(outer).__name__}"}
+        reply = outer.get("result", "").strip()
+        # Extract the JSON verdict via brace-matching so nested `{` / `}` inside the `evidence`
+        # string (e.g. when the judge quotes a JSON snippet as proof) don't truncate the match.
+        # Fall through to the error path if no balanced object is found.
+        verdict_text = _extract_balanced_json_object(reply)
+        if verdict_text is None:
+            return {"passed": False, "evidence": f"LLM judge reply did not contain JSON: {reply[:200]}"}
+        verdict = json.loads(verdict_text)
+        if not isinstance(verdict, dict):
+            return {"passed": False, "evidence": f"LLM judge verdict not an object: {str(verdict)[:100]}"}
+        return {
+            "passed": verdict.get("passed") is True,  # strict: "false", 1, etc. must not pass
+            "evidence": str(verdict.get("evidence", ""))[:500],
+        }
+    except (json.JSONDecodeError, AttributeError, TypeError, KeyError) as e:
+        return {"passed": False, "evidence": f"LLM judge returned invalid JSON: {str(e)[:100]}"}
+
+
+def _extract_balanced_json_object(s: str) -> str | None:
+    """Return the first balanced `{...}` substring in `s`, or None if none exists.
+
+    Needed because LLM replies may wrap the verdict in prose or fences, and the `evidence` string
+    may itself contain `{}` characters that confuse naive regex matching. Scans from the first `{`
+    only, tracking brace depth and ignoring braces inside double-quoted strings; not validated as JSON.
+    """
+    start = s.find("{")
+    if start == -1:
+        return None
+    depth = 0
+    in_str = False
+    escape = False
+    for i in range(start, len(s)):
+        ch = s[i]
+        if escape:
+            escape = False
+            continue
+        if ch == "\\":  # backslash skips the next character, inside or outside a string
+            escape = True
+            continue
+        if ch == '"':
+            in_str = not in_str
+            continue
+        if in_str:  # braces inside a quoted string do not count toward depth
+            continue
+        if ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:  # matching close for the opening brace at `start`
+                return s[start:i + 1]
+    return None
+
+
+def grade_eval(eval_item: dict, run_result: dict, judge_model: str | None = None) -> dict:
+    """Grade a single eval against its expectations.
+
+    If `eval_item["llm_judge"]` is true, all expectations are graded via `_llm_judge`.
+    Otherwise, each expectation falls through to the regex-based elif chain below.
+    Evals whose assertions need LLM semantic judgment (evals 6-9 in this suite)
+    set `llm_judge: true` in `evals.json`; evals grading on verbatim tokens or tool-call
+    presence (evals 1-5) keep regex-based grading where it is both sufficient and faster.
+    """
     text = run_result["result_text"].lower()
     tool_calls = run_result["tool_calls"]
 
@@ -124,11 +225,26 @@ def grade_eval(eval_item: dict, run_result: dict) -> dict:
         full_text += " " + json.dumps(msg).lower()
 
     expectations = []
+    use_llm_judge = bool(eval_item.get("llm_judge", False))
 
     for expectation_text in eval_item.get("expectations", []):
         passed = False
         evidence = ""
 
+        if use_llm_judge:
+            verdict = _llm_judge(
+                prompt=eval_item.get("prompt", ""),
+                result_text=run_result.get("result_text", ""),
+                expectation=expectation_text,
+                model=judge_model,
+            )
+            expectations.append({
+                "text": expectation_text,
+                "passed": verdict["passed"],
+                "evidence": verdict["evidence"],
+            })
+            continue
+
         exp_lower = expectation_text.lower()
 
         # --- Assertion: awsknowledge call with topic ---
@@ -335,6 +451,10 @@ def grade_eval(eval_item: dict, run_result: dict) -> dict:
                 evidence = "No alternatives suggested"
 
         # --- Fallback: keyword search ---
+        # Note: evals 6-9 assertions (JSONB column type, TEXT[], INACTIVE/backup lifecycle)
+        # are semantic — graded via `_llm_judge` when the eval sets `"llm_judge": true`.
+        # The regex branches in this elif chain cover only evals 1-5, where verbatim tokens /
+        # tool-call topic matches are the right signal. See `_llm_judge` doc comment for rationale.
         else:
             keywords = re.findall(r'\b[a-z_]{3,}\b', exp_lower)
             significant = [k for k in keywords if k not in (
@@ -373,18 +493,44 @@ def main():
     parser.add_argument("--evals", required=True, help="Path to evals.json")
     parser.add_argument("--plugin-dir", required=True, help="Path to the plugin directory")
     parser.add_argument("--output-dir", required=True, help="Directory to save results")
-    parser.add_argument("--model", default=None, help="Model to use")
+    parser.add_argument("--model", default=None, help="Model to use for the subject-under-test (the agent responding to eval prompts)")
+    parser.add_argument(
+        "--judge-model",
+        default=None,
+        help=(
+            "Model to use for the LLM judge on evals with `llm_judge: true`. Intentionally "
+            "separate from --model so that bumping the subject model does not silently swap "
+            "the judge and invalidate the regression baseline. Defaults to the claude CLI default."
+        ),
+    )
     parser.add_argument("--timeout", type=int, default=180, help="Timeout per prompt in seconds")
     parser.add_argument("--verbose", action="store_true", help="Print progress")
+    parser.add_argument(
+        "--eval-ids",
+        type=lambda s: [int(x) for x in s.split(",")],
+        default=None,
+        help="Comma-separated list of eval IDs to run (default: all)",
+    )
     args = parser.parse_args()
 
     evals_data = json.loads(Path(args.evals).read_text())
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    eval_items = evals_data["evals"]
+    if args.eval_ids is not None:
+        requested = set(args.eval_ids)
+        eval_items = [e for e in eval_items if e["id"] in requested]
+        missing = requested - {e["id"] for e in eval_items}
+        if missing:
+            print(f"WARNING: eval IDs not found: {sorted(missing)}", file=sys.stderr)
+        if not eval_items:
+            print("ERROR: no matching eval IDs", file=sys.stderr)
+            return 1
+
     all_results = []
 
-    for eval_item in evals_data["evals"]:
+    for eval_item in eval_items:
         eval_id = eval_item["id"]
         prompt = eval_item["prompt"]
 
@@ -400,7 +546,7 @@ def main():
         (eval_dir / "transcript.json").write_text(json.dumps(run_result, indent=2))
 
         # Grade
-        grading = grade_eval(eval_item, run_result)
+        grading = grade_eval(eval_item, run_result, judge_model=args.judge_model)
         (eval_dir / "grading.json").write_text(json.dumps(grading, indent=2))
 
         # Save timing
@@ -456,4 +602,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    sys.exit(main() or 0)