Merged
9 changes: 6 additions & 3 deletions .github/workflows/ci.yml
@@ -122,8 +122,12 @@ jobs:
             echo "Tilt CI failed after 5 attempts, exiting."
             exit 1
           fi
-          echo "Tilt CI failed, retrying... ($i/3)"
-          # Wait a bit for resource to stabilize / cleanup
+          echo "Tilt CI failed, retrying... ($i/4)"
+
+          # Clean up stuck Helm releases before retrying
+          helm rollback testkube -n testkube 2>/dev/null || helm uninstall testkube -n testkube 2>/dev/null || true
+
+          # Wait a bit for resources to stabilize
           sleep 10
         done
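Standing alone, the retry-with-cleanup pattern in the hunk above can be sketched as follows. Only the Helm cleanup line and the echo messages come from the diff; the loop structure, the `retry_with_cleanup` name, and the `MAX_ATTEMPTS`/`SLEEP_SECS` knobs are assumptions:

```shell
#!/bin/sh
# Sketch of the CI retry loop with Helm cleanup between attempts.
# MAX_ATTEMPTS and SLEEP_SECS are assumed knobs, not from the workflow.
MAX_ATTEMPTS=${MAX_ATTEMPTS:-5}
SLEEP_SECS=${SLEEP_SECS:-10}

retry_with_cleanup() {
  i=1
  while :; do
    if "$@"; then
      return 0
    fi
    if [ "$i" -ge "$MAX_ATTEMPTS" ]; then
      echo "Tilt CI failed after $MAX_ATTEMPTS attempts, exiting."
      return 1
    fi
    echo "Tilt CI failed, retrying... ($i/$((MAX_ATTEMPTS - 1)))"
    # Clean up stuck Helm releases before retrying; ignore errors when
    # there is nothing to roll back or uninstall.
    helm rollback testkube -n testkube 2>/dev/null \
      || helm uninstall testkube -n testkube 2>/dev/null \
      || true
    # Wait a bit for resources to stabilize
    sleep "$SLEEP_SECS"
    i=$((i + 1))
  done
}
```

In the workflow this would be invoked as something like `retry_with_cleanup tilt ci` (hypothetical invocation).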

@@ -136,7 +140,6 @@ jobs:
           --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
           --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
           --config metrics="nv_accuracy context_recall" \
-          --config workflowName="Testworkflow-Name" \
           --config image="${{ steps.extract-tag.outputs.image-tag }}" \
           -n testkube \
           --watch
2 changes: 0 additions & 2 deletions CLAUDE.md
@@ -75,7 +75,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
   --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
   --config agentUrl="http://weather-agent.sample-agents:8000" \
   --config metrics="nv_accuracy context_recall" \
-  --config workflowName="Test-Run" \
   -n testkube

 # Watch workflow execution
@@ -155,7 +154,6 @@ Observability Backend (Grafana)
 - `agentUrl` - A2A endpoint of agent to evaluate
 - `model` - LLM model for RAGAS evaluation (e.g., `gemini-2.5-flash-lite`)
 - `metrics` - Space-separated RAGAS metrics (e.g., `faithfulness context_recall`)
-- `workflowName` - Label for published metrics
 - `otlpEndpoint` - OpenTelemetry collector URL (default: `http://lgtm.monitoring:4318`)
 - `image` - Docker image to use (default: `ghcr.io/agentic-layer/testbench/testworkflows:latest`)
29 changes: 24 additions & 5 deletions DetailedUsageAndTroubleshooting.md
@@ -164,12 +164,14 @@ Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring.
 **Syntax:**

 ```shell
-python3 scripts/publish.py <workflow_name> [otlp_endpoint]
+python3 scripts/publish.py <workflow_name> <execution_id> <execution_number> [otlp_endpoint]
 ```

 **Arguments:**

 - `workflow_name` (required): Name of the test workflow (used as metric label)
+- `execution_id` (required): Testkube execution ID for this workflow run
+- `execution_number` (required): Numeric execution number for this workflow run (used as X-axis in Grafana)
 - `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`)

 **Input:**
@@ -178,17 +180,34 @@ python3 scripts/publish.py <workflow_name> [otlp_endpoint]

 **Published Metrics:**

-Each RAGAS metric is published as a gauge with the workflow name as an attribute:
+Three gauge types are published to the OTLP endpoint:
+
+| Gauge Name | Description | Attributes |
+|------------|-------------|------------|
+| `testbench_evaluation_metric` | Per-sample evaluation scores | `name`, `workflow_name`, `execution_id`, `execution_number`, `trace_id`, `user_input_hash`, `user_input_truncated` |
+| `testbench_evaluation_token_usage` | Token counts from evaluation | `type` (input_tokens/output_tokens), `workflow_name`, `execution_id`, `execution_number` |
+| `testbench_evaluation_cost` | Total evaluation cost in USD | `workflow_name`, `execution_id`, `execution_number` |
+
+**Attribute Details:**
+
+- `user_input_hash`: 12-character SHA256 hash of the user input for stable identification across executions
+- `user_input_truncated`: First 50 characters of the user input with "..." suffix (for display in Grafana legends)

 **Example output:**

 ```
-ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85
-ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92
+testbench_evaluation_metric{name="faithfulness", workflow_name="weather-eval", execution_id="exec-123", execution_number=1, trace_id="abc123...", user_input_hash="a1b2c3d4e5f6", user_input_truncated="What is the weather like in New York?"} = 0.85
+testbench_evaluation_metric{name="context_recall", workflow_name="weather-eval", execution_id="exec-123", execution_number=1, trace_id="abc123...", user_input_hash="a1b2c3d4e5f6", user_input_truncated="What is the weather like in New York?"} = 1.0
+testbench_evaluation_token_usage{type="input_tokens", workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 1500
+testbench_evaluation_token_usage{type="output_tokens", workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 500
+testbench_evaluation_cost{workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 0.015
 ```

 **Notes:**

 - Sends metrics to `/v1/metrics` endpoint
 - Uses resource with `service.name="ragas-evaluation"`
+- The `trace_id` attribute links metrics to distributed traces for debugging
 - Forces flush to ensure delivery before exit
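The two derived attributes documented above can be sketched with the standard library. The function name is hypothetical, and the exact truncation rule is an assumption: the example output shows no `...` suffix for a short input, so this sketch only appends it when the input exceeds 50 characters:

```python
import hashlib


def user_input_attributes(user_input: str) -> dict:
    """Derive the two user-input attributes described above.

    A sketch; the function name and the only-truncate-when-long
    behaviour are assumptions, and publish.py may differ.
    """
    # 12-character prefix of the SHA256 hex digest: stable across executions
    digest = hashlib.sha256(user_input.encode("utf-8")).hexdigest()
    truncated = user_input if len(user_input) <= 50 else user_input[:50] + "..."
    return {
        "user_input_hash": digest[:12],
        "user_input_truncated": truncated,
    }


attrs = user_input_attributes("What is the weather like in New York?")
# attrs["user_input_hash"] is a stable 12-character id; this input is short
# enough that user_input_truncated carries the full text without "...".
```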

Expand Down Expand Up @@ -254,4 +273,4 @@ user_input,retrieved_contexts,reference
- Verify your dataset includes all required fields for the metrics you're using
- Check the RAGAS documentation for metric-specific requirements

----
----
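The notes in the publish.py section above (metrics POSTed to `/v1/metrics`, a resource with `service.name="ragas-evaluation"`) imply an OTLP/HTTP JSON body roughly like the one this stdlib-only sketch builds. The field names follow the OTLP specification; the scope name and helper function are assumptions, and the real script presumably delegates all of this to the OpenTelemetry SDK:

```python
import json
import time


def build_otlp_gauge_payload(name: str, value: float, attributes: dict) -> dict:
    """Build an OTLP/HTTP JSON body for a single gauge data point.

    A hand-rolled sketch of what lands at <endpoint>/v1/metrics; publish.py
    itself presumably uses the OpenTelemetry SDK rather than raw JSON.
    """
    return {
        "resourceMetrics": [{
            "resource": {
                "attributes": [{
                    "key": "service.name",
                    "value": {"stringValue": "ragas-evaluation"},
                }]
            },
            "scopeMetrics": [{
                "scope": {"name": "testbench"},  # scope name is an assumption
                "metrics": [{
                    "name": name,
                    "gauge": {
                        "dataPoints": [{
                            "timeUnixNano": str(time.time_ns()),
                            "asDouble": value,
                            "attributes": [
                                {"key": k, "value": {"stringValue": str(v)}}
                                for k, v in attributes.items()
                            ],
                        }]
                    },
                }],
            }],
        }]
    }


payload = build_otlp_gauge_payload(
    "testbench_evaluation_cost", 0.015,
    {"workflow_name": "weather-eval", "execution_id": "exec-123", "execution_number": 1},
)
body = json.dumps(payload)  # POST this to e.g. http://lgtm.monitoring:4318/v1/metrics
```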
19 changes: 15 additions & 4 deletions README.md
@@ -132,8 +132,8 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
   --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
   --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
   --config metrics="nv_accuracy context_recall" \
-  --config workflowName="Testworkflow-Name" \
   --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \
+  --config otlpEndpoint="http://lgtm.monitoring:4318" \
   -n testkube
 ```

@@ -144,7 +144,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
   --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
   --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
   --config metrics="nv_accuracy context_recall" \
-  --config workflowName="Testworkflow-Name" \
   --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \
   --config model="gemini/gemini-2.5-flash" \
   --config otlpEndpoint="http://otlp-endpoint:4093" \
@@ -176,8 +175,8 @@ uv run python3 scripts/run.py "http://localhost:11010"
 # 3. Evaluate responses with RAGAS metrics
 uv run python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy

-# 4. Publish metrics to OpenTelemetry
-uv run python3 scripts/publish.py "my-agent-evaluation"
+# 4. Publish metrics to OpenTelemetry (workflow_name, execution_id, execution_number)
+uv run python3 scripts/publish.py "my-agent-evaluation" "local-exec-001" 1
 ```

 ----
@@ -277,6 +276,18 @@ uv run pytest tests_e2e/test_e2e.py -v

 ## Development

+### Deployment Structure
+
+```
+deploy/
+  base/                 # Shared resources for all environments
+    templates/          # Testkube TestWorkflowTemplates
+    grafana-dashboards/ # Dashboard ConfigMaps (auto-discovered via grafana_dashboard label)
+  local/                # Local Tilt environment (uses LGTM all-in-one)
+  dev/                  # Dev cluster environment (uses Grafana sidecar for dashboard discovery)
+```
+
+
 ## Code Quality Standards

 ### Code Style:
6 changes: 3 additions & 3 deletions Tiltfile
@@ -1,7 +1,7 @@
 # -*- mode: Python -*-

-# Increase Kubernetes upsert timeout for CRD installations
-update_settings(max_parallel_updates=10)
+# Increase Kubernetes upsert timeout for CRD installations and slow Helm charts (testkube)
+update_settings(max_parallel_updates=10, k8s_upsert_timeout_secs=600)

 # Load .env file for environment variables
 load('ext://dotenv', 'dotenv')
@@ -39,7 +39,7 @@ k8s_yaml(kustomize('deploy/local'))

 k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000'])
 k8s_resource('weather-agent', port_forwards='11010:8000', labels=['agents'], resource_deps=['agent-runtime'])
-k8s_resource('lgtm', port_forwards=['11000:3000'])
+k8s_resource('lgtm', port_forwards=['11000:3000', '4318:4318'])
 k8s_resource('data-server', port_forwards='11020:8000')

 # Declare Testkube resources