Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,12 @@ jobs:
echo "Tilt CI failed after 5 attempts, exiting."
exit 1
fi
echo "Tilt CI failed, retrying... ($i/3)"
# Wait a bit for resource to stabilize / cleanup
echo "Tilt CI failed, retrying... ($i/4)"

# Clean up stuck Helm releases before retrying
helm rollback testkube -n testkube 2>/dev/null || helm uninstall testkube -n testkube 2>/dev/null || true

# Wait a bit for resources to stabilize
sleep 10
done

Expand All @@ -136,7 +140,6 @@ jobs:
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Testworkflow-Name" \
--config image="${{ steps.extract-tag.outputs.image-tag }}" \
-n testkube \
--watch
2 changes: 0 additions & 2 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://weather-agent.sample-agents:8000" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Test-Run" \
-n testkube

# Watch workflow execution
Expand Down Expand Up @@ -155,7 +154,6 @@ Observability Backend (Grafana)
- `agentUrl` - A2A endpoint of agent to evaluate
- `model` - LLM model for RAGAS evaluation (e.g., `gemini-2.5-flash-lite`)
- `metrics` - Space-separated RAGAS metrics (e.g., `faithfulness context_recall`)
- `workflowName` - Label for published metrics
- `otlpEndpoint` - OpenTelemetry collector URL (default: `http://lgtm.monitoring:4318`)
- `image` - Docker image to use (default: `ghcr.io/agentic-layer/testbench/testworkflows:latest`)

Expand Down
23 changes: 18 additions & 5 deletions DetailedUsageAndTroubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,13 @@ Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring.
**Syntax:**

```shell
python3 scripts/publish.py <workflow_name> [otlp_endpoint]
python3 scripts/publish.py <workflow_name> <execution_id> [otlp_endpoint]
```

**Arguments:**

- `workflow_name` (required): Name of the test workflow (used as metric label)
- `execution_id` (required): Testkube execution ID for this workflow run
- `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`)

**Input:**
Expand All @@ -178,17 +179,29 @@ python3 scripts/publish.py <workflow_name> [otlp_endpoint]

**Published Metrics:**

Each RAGAS metric is published as a gauge with the workflow name as an attribute:
Three gauge types are published to the OTLP endpoint:

| Gauge Name | Description | Attributes |
|------------|-------------|------------|
| `testbench_evaluation_metric` | Per-sample evaluation scores | `name`, `workflow_name`, `execution_id`, `trace_id` |
| `testbench_evaluation_token_usage` | Token counts from evaluation | `type` (input_tokens/output_tokens), `workflow_name`, `execution_id` |
| `testbench_evaluation_cost` | Total evaluation cost in USD | `workflow_name`, `execution_id` |

**Example output:**

```
ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85
ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92
testbench_evaluation_metric{name="faithfulness", workflow_name="weather-eval", execution_id="exec-123", trace_id="abc123..."} = 0.85
testbench_evaluation_metric{name="context_recall", workflow_name="weather-eval", execution_id="exec-123", trace_id="abc123..."} = 1.0
testbench_evaluation_token_usage{type="input_tokens", workflow_name="weather-eval", execution_id="exec-123"} = 1500
testbench_evaluation_token_usage{type="output_tokens", workflow_name="weather-eval", execution_id="exec-123"} = 500
testbench_evaluation_cost{workflow_name="weather-eval", execution_id="exec-123"} = 0.015
```

**Notes:**

- Sends metrics to the `/v1/metrics` OTLP HTTP endpoint
- Attaches an OpenTelemetry resource with `service.name="ragas-evaluation"`
- The `trace_id` attribute links metrics to distributed traces for debugging
- Forces a metrics flush to ensure delivery before the process exits


Expand Down Expand Up @@ -254,4 +267,4 @@ user_input,retrieved_contexts,reference
- Verify your dataset includes all required fields for the metrics you're using
- Check the RAGAS documentation for metric-specific requirements

----
----
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Testworkflow-Name" \
--config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \
-n testkube
```
Expand All @@ -144,7 +143,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Testworkflow-Name" \
--config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \
--config model="gemini/gemini-2.5-flash" \
--config otlpEndpoint="http://otlp-endpoint:4093" \
Expand Down Expand Up @@ -177,7 +175,7 @@ uv run python3 scripts/run.py "http://localhost:11010"
uv run python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy

# 4. Publish metrics to OpenTelemetry
uv run python3 scripts/publish.py "my-agent-evaluation"
uv run python3 scripts/publish.py "my-agent-evaluation" "local-exec-001"
```

----
Expand Down
6 changes: 3 additions & 3 deletions Tiltfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- mode: Python -*-

# Increase Kubernetes upsert timeout for CRD installations
update_settings(max_parallel_updates=10)
# Increase Kubernetes upsert timeout for CRD installations and slow Helm charts (testkube)
update_settings(max_parallel_updates=10, k8s_upsert_timeout_secs=600)

# Load .env file for environment variables
load('ext://dotenv', 'dotenv')
Expand Down Expand Up @@ -39,7 +39,7 @@ k8s_yaml(kustomize('deploy/local'))

k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000'])
k8s_resource('weather-agent', port_forwards='11010:8000', labels=['agents'], resource_deps=['agent-runtime'])
k8s_resource('lgtm', port_forwards=['11000:3000'])
k8s_resource('lgtm', port_forwards=['11000:3000', '4318:4318'])
k8s_resource('data-server', port_forwards='11020:8000')

# Declare Testkube resources
Expand Down
6 changes: 2 additions & 4 deletions deploy/base/templates/publish-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@ metadata:
spec:
# Configuration parameters that can be overridden
config:
workflowName:
type: string
description: "Name of the test workflow (used as label)"
otlpEndpoint:
type: string
description: "URL of the OTLP endpoint"
Expand All @@ -27,5 +24,6 @@ spec:
run:
args:
- publish.py
- "{{ config.workflowName }}"
- "{{ workflow.name }}"
- "{{ execution.id }}"
- "{{ config.otlpEndpoint }}"
3 changes: 3 additions & 0 deletions deploy/local/lgtm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ spec:
command:
- cat
- /tmp/ready
initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 30
volumeMounts:
- name: tempo-data
mountPath: /data/tempo
Expand Down
6 changes: 0 additions & 6 deletions deploy/local/ragas-evaluation-workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,6 @@ spec:
default: "nv_accuracy context_recall"

# Publishing configuration
workflowName:
type: string
description: "Workflow name for metrics"
default: "ragas-test-workflow"

otlpEndpoint:
type: string
description: "OTLP endpoint URL"
Expand Down Expand Up @@ -97,6 +92,5 @@ spec:
template:
name: ragas-publish-template
config:
workflowName: "{{ config.workflowName }}"
otlpEndpoint: "{{ config.otlpEndpoint }}"
image: "{{ config.image }}"
Loading