Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,12 @@ jobs:
echo "Tilt CI failed after 5 attempts, exiting."
exit 1
fi
echo "Tilt CI failed, retrying... ($i/3)"
# Wait a bit for resource to stabilize / cleanup
echo "Tilt CI failed, retrying... ($i/4)"

# Clean up stuck Helm releases before retrying
helm rollback testkube -n testkube 2>/dev/null || helm uninstall testkube -n testkube 2>/dev/null || true

# Wait a bit for resources to stabilize
sleep 10
done

Expand All @@ -136,7 +140,6 @@ jobs:
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Testworkflow-Name" \
--config image="${{ steps.extract-tag.outputs.image-tag }}" \
-n testkube \
--watch
2 changes: 0 additions & 2 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://weather-agent.sample-agents:8000" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Test-Run" \
-n testkube

# Watch workflow execution
Expand Down Expand Up @@ -155,7 +154,6 @@ Observability Backend (Grafana)
- `agentUrl` - A2A endpoint of agent to evaluate
- `model` - LLM model for RAGAS evaluation (e.g., `gemini-2.5-flash-lite`)
- `metrics` - Space-separated RAGAS metrics (e.g., `faithfulness context_recall`)
- `workflowName` - Label for published metrics
- `otlpEndpoint` - OpenTelemetry collector URL (default: `http://lgtm.monitoring:4318`)
- `image` - Docker image to use (default: `ghcr.io/agentic-layer/testbench/testworkflows:latest`)

Expand Down
23 changes: 18 additions & 5 deletions DetailedUsageAndTroubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,13 @@ Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring.
**Syntax:**

```shell
python3 scripts/publish.py <workflow_name> [otlp_endpoint]
python3 scripts/publish.py <workflow_name> <execution_id> [otlp_endpoint]
```

**Arguments:**

- `workflow_name` (required): Name of the test workflow (used as metric label)
- `execution_id` (required): Testkube execution ID for this workflow run
- `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`)

**Input:**
Expand All @@ -178,17 +179,29 @@ python3 scripts/publish.py <workflow_name> [otlp_endpoint]

**Published Metrics:**

Each RAGAS metric is published as a gauge with the workflow name as an attribute:
Three gauge types are published to the OTLP endpoint:

| Gauge Name | Description | Attributes |
|------------|-------------|------------|
| `testbench_evaluation_metric` | Per-sample evaluation scores | `name`, `workflow_name`, `execution_id`, `trace_id` |
| `testbench_evaluation_token_usage` | Token counts from evaluation | `type` (input_tokens/output_tokens), `workflow_name`, `execution_id` |
| `testbench_evaluation_cost` | Total evaluation cost in USD | `workflow_name`, `execution_id` |

**Example output:**

```
ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85
ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92
testbench_evaluation_metric{name="faithfulness", workflow_name="weather-eval", execution_id="exec-123", trace_id="abc123..."} = 0.85
testbench_evaluation_metric{name="context_recall", workflow_name="weather-eval", execution_id="exec-123", trace_id="abc123..."} = 1.0
testbench_evaluation_token_usage{type="input_tokens", workflow_name="weather-eval", execution_id="exec-123"} = 1500
testbench_evaluation_token_usage{type="output_tokens", workflow_name="weather-eval", execution_id="exec-123"} = 500
testbench_evaluation_cost{workflow_name="weather-eval", execution_id="exec-123"} = 0.015
```

**Notes:**

- Sends metrics to the `/v1/metrics` OTLP HTTP endpoint
- Attaches an OpenTelemetry resource with `service.name="ragas-evaluation"`
- The `trace_id` attribute links metrics to distributed traces for debugging
- Forces a metrics flush to ensure delivery before the process exits


Expand Down Expand Up @@ -254,4 +267,4 @@ user_input,retrieved_contexts,reference
- Verify your dataset includes all required fields for the metrics you're using
- Check the RAGAS documentation for metric-specific requirements

----
----
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Testworkflow-Name" \
--config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \
-n testkube
```
Expand All @@ -144,7 +143,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
--config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
--config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
--config metrics="nv_accuracy context_recall" \
--config workflowName="Testworkflow-Name" \
--config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \
--config model="gemini/gemini-2.5-flash" \
--config otlpEndpoint="http://otlp-endpoint:4093" \
Expand Down Expand Up @@ -177,7 +175,7 @@ uv run python3 scripts/run.py "http://localhost:11010"
uv run python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy

# 4. Publish metrics to OpenTelemetry
uv run python3 scripts/publish.py "my-agent-evaluation"
uv run python3 scripts/publish.py "my-agent-evaluation" "local-exec-001"
```

----
Expand Down
6 changes: 3 additions & 3 deletions Tiltfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- mode: Python -*-

# Increase Kubernetes upsert timeout for CRD installations
update_settings(max_parallel_updates=10)
# Increase Kubernetes upsert timeout for CRD installations and slow Helm charts (testkube)
update_settings(max_parallel_updates=10, k8s_upsert_timeout_secs=600)

# Load .env file for environment variables
load('ext://dotenv', 'dotenv')
Expand Down Expand Up @@ -39,7 +39,7 @@ k8s_yaml(kustomize('deploy/local'))

k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000'])
k8s_resource('weather-agent', port_forwards='11010:8000', labels=['agents'], resource_deps=['agent-runtime'])
k8s_resource('lgtm', port_forwards=['11000:3000'])
k8s_resource('lgtm', port_forwards=['11000:3000', '4318:4318'])
k8s_resource('data-server', port_forwards='11020:8000')

# Declare Testkube resources
Expand Down
6 changes: 2 additions & 4 deletions deploy/base/templates/publish-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@ metadata:
spec:
# Configuration parameters that can be overridden
config:
workflowName:
type: string
description: "Name of the test workflow (used as label)"
otlpEndpoint:
type: string
description: "URL of the OTLP endpoint"
Expand All @@ -27,5 +24,6 @@ spec:
run:
args:
- publish.py
- "{{ config.workflowName }}"
- "{{ workflow.name }}"
- "{{ execution.id }}"
- "{{ config.otlpEndpoint }}"
3 changes: 3 additions & 0 deletions deploy/local/lgtm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ spec:
command:
- cat
- /tmp/ready
initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 30
volumeMounts:
- name: tempo-data
mountPath: /data/tempo
Expand Down
6 changes: 0 additions & 6 deletions deploy/local/ragas-evaluation-workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,6 @@ spec:
default: "nv_accuracy context_recall"

# Publishing configuration
workflowName:
type: string
description: "Workflow name for metrics"
default: "ragas-test-workflow"

otlpEndpoint:
type: string
description: "OTLP endpoint URL"
Expand Down Expand Up @@ -97,6 +92,5 @@ spec:
template:
name: ragas-publish-template
config:
workflowName: "{{ config.workflowName }}"
otlpEndpoint: "{{ config.otlpEndpoint }}"
image: "{{ config.image }}"
Loading