diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fc0fed4..22cb5bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,8 +122,12 @@ jobs: echo "Tilt CI failed after 5 attempts, exiting." exit 1 fi - echo "Tilt CI failed, retrying... ($i/3)" - # Wait a bit for resource to stabilize / cleanup + echo "Tilt CI failed, retrying... ($i/4)" + + # Clean up stuck Helm releases before retrying + helm rollback testkube -n testkube 2>/dev/null || helm uninstall testkube -n testkube 2>/dev/null || true + + # Wait a bit for resources to stabilize sleep 10 done @@ -136,7 +140,6 @@ jobs: --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \ --config metrics="nv_accuracy context_recall" \ - --config workflowName="Testworkflow-Name" \ --config image="${{ steps.extract-tag.outputs.image-tag }}" \ -n testkube \ --watch diff --git a/CLAUDE.md b/CLAUDE.md index 179c46d..0558890 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -75,7 +75,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://weather-agent.sample-agents:8000" \ --config metrics="nv_accuracy context_recall" \ - --config workflowName="Test-Run" \ -n testkube # Watch workflow execution @@ -155,7 +154,6 @@ Observability Backend (Grafana) - `agentUrl` - A2A endpoint of agent to evaluate - `model` - LLM model for RAGAS evaluation (e.g., `gemini-2.5-flash-lite`) - `metrics` - Space-separated RAGAS metrics (e.g., `faithfulness context_recall`) -- `workflowName` - Label for published metrics - `otlpEndpoint` - OpenTelemetry collector URL (default: `http://lgtm.monitoring:4318`) - `image` - Docker image to use (default: `ghcr.io/agentic-layer/testbench/testworkflows:latest`) diff --git a/DetailedUsageAndTroubleshooting.md b/DetailedUsageAndTroubleshooting.md index 
1672c8e..93580ca 100644 --- a/DetailedUsageAndTroubleshooting.md +++ b/DetailedUsageAndTroubleshooting.md @@ -164,12 +164,14 @@ Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring. **Syntax:** ```shell -python3 scripts/publish.py <workflow_name> [otlp_endpoint] +python3 scripts/publish.py <workflow_name> <execution_id> <execution_number> [otlp_endpoint] ``` **Arguments:** - `workflow_name` (required): Name of the test workflow (used as metric label) +- `execution_id` (required): Testkube execution ID for this workflow run +- `execution_number` (required): Numeric execution number for this workflow run (used as X-axis in Grafana) - `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`) **Input:** @@ -178,17 +180,34 @@ python3 scripts/publish.py <workflow_name> [otlp_endpoint] **Published Metrics:** -Each RAGAS metric is published as a gauge with the workflow name as an attribute: +Three gauge types are published to the OTLP endpoint: + +| Gauge Name | Description | Attributes | +|------------|-------------|------------| +| `testbench_evaluation_metric` | Per-sample evaluation scores | `name`, `workflow_name`, `execution_id`, `execution_number`, `trace_id`, `user_input_hash`, `user_input_truncated` | +| `testbench_evaluation_token_usage` | Token counts from evaluation | `type` (input_tokens/output_tokens), `workflow_name`, `execution_id`, `execution_number` | +| `testbench_evaluation_cost` | Total evaluation cost in USD | `workflow_name`, `execution_id`, `execution_number` | + +**Attribute Details:** + +- `user_input_hash`: 12-character SHA256 hash of the user input for stable identification across executions +- `user_input_truncated`: First 50 characters of the user input with "..." 
suffix (for display in Grafana legends) + +**Example output:** ``` -ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85 -ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92 +testbench_evaluation_metric{name="faithfulness", workflow_name="weather-eval", execution_id="exec-123", execution_number=1, trace_id="abc123...", user_input_hash="a1b2c3d4e5f6", user_input_truncated="What is the weather like in New York?"} = 0.85 +testbench_evaluation_metric{name="context_recall", workflow_name="weather-eval", execution_id="exec-123", execution_number=1, trace_id="abc123...", user_input_hash="a1b2c3d4e5f6", user_input_truncated="What is the weather like in New York?"} = 1.0 +testbench_evaluation_token_usage{type="input_tokens", workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 1500 +testbench_evaluation_token_usage{type="output_tokens", workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 500 +testbench_evaluation_cost{workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 0.015 ``` **Notes:** - Sends metrics to `/v1/metrics` endpoint - Uses resource with `service.name="ragas-evaluation"` +- The `trace_id` attribute links metrics to distributed traces for debugging - Forces flush to ensure delivery before exit @@ -254,4 +273,4 @@ user_input,retrieved_contexts,reference - Verify your dataset includes all required fields for the metrics you're using - Check the RAGAS documentation for metric-specific requirements ----- \ No newline at end of file +---- diff --git a/README.md b/README.md index 46a7b24..d7c947e 100644 --- a/README.md +++ b/README.md @@ -132,8 +132,8 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \ --config metrics="nv_accuracy context_recall" \ - --config 
workflowName="Testworkflow-Name" \ --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \ + --config otlpEndpoint="http://lgtm.monitoring:4318" \ -n testkube ``` @@ -144,7 +144,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \ --config metrics="nv_accuracy context_recall" \ - --config workflowName="Testworkflow-Name" \ --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \ --config model="gemini/gemini-2.5-flash" \ --config otlpEndpoint="http://otlp-endpoint:4093" \ @@ -176,8 +175,8 @@ uv run python3 scripts/run.py "http://localhost:11010" # 3. Evaluate responses with RAGAS metrics uv run python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy -# 4. Publish metrics to OpenTelemetry -uv run python3 scripts/publish.py "my-agent-evaluation" +# 4. Publish metrics to OpenTelemetry (workflow_name, execution_id, execution_number) +uv run python3 scripts/publish.py "my-agent-evaluation" "local-exec-001" 1 ``` ---- @@ -277,6 +276,18 @@ uv run pytest tests_e2e/test_e2e.py -v ## Development +### Deployment Structure + +``` +deploy/ + base/ # Shared resources for all environments + templates/ # Testkube TestWorkflowTemplates + grafana-dashboards/ # Dashboard ConfigMaps (auto-discovered via grafana_dashboard label) + local/ # Local Tilt environment (uses LGTM all-in-one) + dev/ # Dev cluster environment (uses Grafana sidecar for dashboard discovery) +``` + + ## Code Quality Standards ### Code Style: diff --git a/Tiltfile b/Tiltfile index c852cb2..7829ff8 100644 --- a/Tiltfile +++ b/Tiltfile @@ -1,7 +1,7 @@ # -*- mode: Python -*- -# Increase Kubernetes upsert timeout for CRD installations -update_settings(max_parallel_updates=10) +# Increase Kubernetes upsert timeout for CRD installations and slow Helm charts (testkube) 
+update_settings(max_parallel_updates=10, k8s_upsert_timeout_secs=600) # Load .env file for environment variables load('ext://dotenv', 'dotenv') @@ -39,7 +39,7 @@ k8s_yaml(kustomize('deploy/local')) k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000']) k8s_resource('weather-agent', port_forwards='11010:8000', labels=['agents'], resource_deps=['agent-runtime']) -k8s_resource('lgtm', port_forwards=['11000:3000']) +k8s_resource('lgtm', port_forwards=['11000:3000', '4318:4318']) k8s_resource('data-server', port_forwards='11020:8000') # Declare Testkube resources diff --git a/deploy/base/grafana-dashboards/evaluation-dashboard.json b/deploy/base/grafana-dashboards/evaluation-dashboard.json new file mode 100644 index 0000000..eee1524 --- /dev/null +++ b/deploy/base/grafana-dashboards/evaluation-dashboard.json @@ -0,0 +1,639 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 4, + "panels": [], + "repeat": "metric", + "title": "Evaluation of $metric", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": false, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg(testbench_evaluation_metric{name=~\"$metric\", workflow_name=\"$workflow\"})", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "B" + } + ], + "title": "Average $metric over time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 50, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "pointShape": "square", + "pointSize": { + "fixed": 7 + }, + "pointStrokeWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "show": "points+lines" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + 
"x": 12, + "y": 1 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "mapping": "auto", + "series": [ + { + "frame": { + "matcher": { + "id": "byIndex", + "options": 0 + } + }, + "x": { + "matcher": { + "id": "byName", + "options": "execution_number" + } + }, + "y": { + "matcher": { + "id": "byName", + "options": "Value" + } + } + } + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg by(execution_number) (testbench_evaluation_metric{workflow_name=\"ragas-evaluation-workflow\", name=~\"$metric\"})", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "A" + } + ], + "title": "Average $metric per test", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "mode": "columns" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "enumConfig": { + "text": [ + "62" + ] + }, + "targetField": "execution_number" + } + ], + "fields": {} + } + }, + { + "id": "merge", + "options": {} + } + ], + "type": "xychart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + 
"spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max by (user_input_hash, user_input_truncated) (\n testbench_evaluation_metric{name=~\"$metric\", workflow_name=\"$workflow\"}\n )", + "hide": false, + "instant": false, + "legendFormat": "Question: \"{{user_input_truncated}}\"", + "range": true, + "refId": "A" + } + ], + "title": "Individual $metric results over time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 50, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "pointShape": "square", + "pointSize": { + "fixed": 7 + }, + "pointStrokeWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "show": "points+lines" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + 
"id": 13, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "mapping": "auto", + "series": [ + { + "x": { + "matcher": { + "id": "byName", + "options": "execution_number (lastNotNull)" + } + }, + "y": { + "matcher": { + "id": "byName", + "options": "Value (lastNotNull)" + } + } + } + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg by(user_input_hash, trace_id, user_input_truncated, execution_number) (testbench_evaluation_metric{workflow_name=\"ragas-evaluation-workflow\", name=~\"$metric\"})", + "hide": false, + "instant": false, + "legendFormat": "Question: \"{{user_input_truncated}}\"", + "range": true, + "refId": "A" + } + ], + "title": "Individual $metric results per test", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "mode": "columns" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "execution_number" + } + ], + "fields": {} + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Time": { + "aggregations": [ + "last" + ], + "operation": "aggregate" + }, + "Value": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "execution_number": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "trace_id": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "user_input_hash": { + "aggregations": [] + }, + "user_input_truncated": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "merge", + "options": {} + }, + { + "id": "seriesToRows", + "options": {} + }, + { + "id": "partitionByValues", + "options": { + "fields": [ + "user_input_truncated" + ], 
+ "keepFields": false, + "naming": { + "asLabels": false + } + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*?) Value \\(lastNotNull\\)", + "renamePattern": "$1" + } + } + ], + "type": "xychart" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "ragas-evaluation-workflow", + "value": "ragas-evaluation-workflow" + }, + "definition": "label_values(workflow_name)", + "description": "name of the TestWorkflow CR", + "label": "Workflow Name", + "name": "workflow", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(workflow_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 5, + "type": "query" + }, + { + "allowCustomValue": false, + "current": { + "text": "All", + "value": "$__all" + }, + "definition": "label_values(testbench_evaluation_metric,name)", + "includeAll": true, + "name": "metric", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(testbench_evaluation_metric,name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "2025-12-19T10:24:55.111Z", + "to": "2025-12-19T15:28:09.705Z" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Workflow Evaluations", + "uid": "g9lx75", + "version": 32 +} diff --git a/deploy/base/grafana-dashboards/kustomization.yaml b/deploy/base/grafana-dashboards/kustomization.yaml new file mode 100644 index 0000000..2a65176 --- /dev/null +++ b/deploy/base/grafana-dashboards/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring +configMapGenerator: + - name: grafana-testkube-dashboard + files: + - evaluation-dashboard.json + - testkube-dashboard.json +generatorOptions: + disableNameSuffixHash: true + labels: + grafana_dashboard: "1" diff --git 
a/deploy/base/grafana-dashboards/testkube-dashboard.json b/deploy/base/grafana-dashboards/testkube-dashboard.json new file mode 100644 index 0000000..85cf3fd --- /dev/null +++ b/deploy/base/grafana-dashboards/testkube-dashboard.json @@ -0,0 +1,316 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.1.2" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": {"type": "prometheus"}, + "description": "Executions distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": 
{ + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{result=\"passed\"}", + "hide": false, + "instant": false, + "interval": "1", + "legendFormat": "{{name}}", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "SUCCESSFUL workflow executions", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus"}, + "description": "Executions distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{result=\"failed\"}", + "hide": false, + "instant": false, + "interval": "1", + "legendFormat": "{{name}}", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "FAILED workflow executions", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus"}, + "description": "Executions 
distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{}", + "format": "table", + "hide": false, + "instant": false, + "interval": "1", + "intervalFactor": 10, + "legendFormat": "{{type}} created", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "Total workflow executions", + "transparent": true, + "type": "stat" + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Testkube", + "uid": "jMtrP3nnz", + "version": 2 +} diff --git a/deploy/base/kustomization.yaml b/deploy/base/kustomization.yaml index 7ddc4f1..4d09cdc 100644 --- a/deploy/base/kustomization.yaml +++ b/deploy/base/kustomization.yaml @@ -2,3 +2,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - templates/ + - grafana-dashboards diff --git a/deploy/base/templates/publish-template.yaml b/deploy/base/templates/publish-template.yaml index 32ea61b..74a4ea5 100644 --- a/deploy/base/templates/publish-template.yaml +++ b/deploy/base/templates/publish-template.yaml @@ -9,9 +9,6 @@ metadata: spec: # Configuration parameters that can be overridden config: - workflowName: - type: string - description: "Name of the test workflow (used as label)" 
otlpEndpoint: type: string description: "URL of the OTLP endpoint" @@ -27,5 +24,7 @@ spec: run: args: - publish.py - - "{{ config.workflowName }}" + - "{{ workflow.name }}" + - "{{ execution.id }}" + - "{{ execution.number }}" - "{{ config.otlpEndpoint }}" diff --git a/deploy/local/kustomization.yaml b/deploy/local/kustomization.yaml index fe7375c..4b95374 100644 --- a/deploy/local/kustomization.yaml +++ b/deploy/local/kustomization.yaml @@ -1,7 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - lgtm.yaml + - lgtm/ - weather-agent.yaml - data-server/ - ../base diff --git a/deploy/local/lgtm/kustomization.yaml b/deploy/local/lgtm/kustomization.yaml new file mode 100644 index 0000000..33c1c50 --- /dev/null +++ b/deploy/local/lgtm/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - lgtm.yaml + - lgtm-otel-scraping-config.yaml + - lgtm-grafana-dashboard-config.yaml diff --git a/deploy/local/lgtm/lgtm-grafana-dashboard-config.yaml b/deploy/local/lgtm/lgtm-grafana-dashboard-config.yaml new file mode 100644 index 0000000..32e05ac --- /dev/null +++ b/deploy/local/lgtm/lgtm-grafana-dashboard-config.yaml @@ -0,0 +1,20 @@ +# Grafana dashboard provisioning config. +# This tells Grafana where to find dashboard JSON files and how to organize them. +# Without this, Grafana won't scan any directories for dashboards. 
+# Reference: https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-provisioning + namespace: monitoring +data: + testbench-dashboards.yaml: | + apiVersion: 1 + providers: + - name: 'testbench-dashboards' + type: file + folder: 'Testbench' + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards/testbench-dashboards diff --git a/deploy/local/lgtm/lgtm-otel-scraping-config.yaml b/deploy/local/lgtm/lgtm-otel-scraping-config.yaml new file mode 100644 index 0000000..cbfe14d --- /dev/null +++ b/deploy/local/lgtm/lgtm-otel-scraping-config.yaml @@ -0,0 +1,79 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: otelcol-config + namespace: monitoring +data: + lgtm-otelcol-config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + cors: + allowed_origins: + - http://* + prometheus/collector: + config: + scrape_configs: + - job_name: "opentelemetry-collector" + scrape_interval: 1s + static_configs: + - targets: ["127.0.0.1:8888"] + prometheus/testkube: + config: + scrape_configs: + - job_name: "testkube-api" + scrape_interval: 15s + static_configs: + - targets: ["testkube-api-server.testkube:8088"] + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + path: "/ready" + + processors: + batch: + + exporters: + otlphttp/metrics: + endpoint: http://127.0.0.1:9090/api/v1/otlp + tls: + insecure: true + otlphttp/traces: + endpoint: http://127.0.0.1:4418 + tls: + insecure: true + otlphttp/logs: + endpoint: http://127.0.0.1:3100/otlp + tls: + insecure: true + otlp/profiles: + endpoint: http://127.0.0.1:4040 + tls: + insecure: true + + service: + extensions: [health_check] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/traces] + #exporters: [otlphttp/traces,debug/traces] + metrics: + receivers: [otlp, prometheus/collector, prometheus/testkube] + 
processors: [batch] + exporters: [otlphttp/metrics] + #exporters: [otlphttp/metrics,debug/metrics] + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/logs] + #exporters: [otlphttp/logs,debug/logs] + profiles: + receivers: [otlp] + exporters: [otlp/profiles] diff --git a/deploy/local/lgtm.yaml b/deploy/local/lgtm/lgtm.yaml similarity index 66% rename from deploy/local/lgtm.yaml rename to deploy/local/lgtm/lgtm.yaml index 7edc3b9..6ab19de 100644 --- a/deploy/local/lgtm.yaml +++ b/deploy/local/lgtm/lgtm.yaml @@ -58,6 +58,9 @@ spec: command: - cat - /tmp/ready + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 30 volumeMounts: - name: tempo-data mountPath: /data/tempo @@ -71,6 +74,15 @@ spec: mountPath: /data/prometheus - name: pyroscope-storage mountPath: /data/pyroscope + - name: grafana-dashboard-provisioning + # Provisioning config that tells Grafana to scan the testbench-dashboards directory + mountPath: /otel-lgtm/grafana/conf/provisioning/dashboards/testbench-dashboards.yaml + subPath: testbench-dashboards.yaml + - name: grafana-dashboards + mountPath: /etc/grafana/provisioning/dashboards/testbench-dashboards + - name: otelcol-config + mountPath: /otel-lgtm/otelcol-config.yaml + subPath: lgtm-otelcol-config.yaml volumes: - name: tempo-data emptyDir: {} @@ -84,3 +96,12 @@ spec: emptyDir: {} - name: pyroscope-storage emptyDir: {} + - name: grafana-dashboards + configMap: + name: grafana-testkube-dashboard + - name: grafana-dashboard-provisioning + configMap: + name: grafana-dashboard-provisioning + - name: otelcol-config + configMap: + name: otelcol-config diff --git a/deploy/local/ragas-evaluation-workflow.yaml b/deploy/local/ragas-evaluation-workflow.yaml index 5462082..d50c34a 100644 --- a/deploy/local/ragas-evaluation-workflow.yaml +++ b/deploy/local/ragas-evaluation-workflow.yaml @@ -49,15 +49,10 @@ spec: default: "nv_accuracy context_recall" # Publishing configuration - workflowName: - type: string - description: 
"Workflow name for metrics" - default: "ragas-test-workflow" - otlpEndpoint: type: string description: "OTLP endpoint URL" - default: "http://lgtm.monitoring:4318" + default: "http://otel-collector.monitoring.svc.cluster.local:4318" # Docker image image: @@ -97,6 +92,5 @@ spec: template: name: ragas-publish-template config: - workflowName: "{{ config.workflowName }}" otlpEndpoint: "{{ config.otlpEndpoint }}" image: "{{ config.image }}" diff --git a/scripts/publish.py b/scripts/publish.py index 599b66b..7f3ae96 100644 --- a/scripts/publish.py +++ b/scripts/publish.py @@ -1,7 +1,11 @@ import argparse +import hashlib import json import logging +import math +from dataclasses import dataclass from logging import Logger +from typing import Any, TypeGuard from opentelemetry import metrics from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter @@ -14,98 +18,193 @@ logger: Logger = logging.getLogger(__name__) -def get_overall_scores(file_path: str) -> dict[str, float]: - """Load the evaluation_scores.json file and return the 'overall_scores' metrics.""" +@dataclass +class EvaluationData: + """Container for all evaluation data to be published as metrics.""" + + individual_results: list[dict[str, Any]] + total_tokens: dict[str, int] + total_cost: float + + +def load_evaluation_data(file_path: str) -> EvaluationData: + """Load the evaluation_scores.json file and return the relevant data for metrics.""" with open(file_path, "r") as file: - return json.load(file).get("overall_scores", {}) + data = json.load(file) + return EvaluationData( + individual_results=data.get("individual_results", []), + total_tokens=data.get("total_tokens", {"input_tokens": 0, "output_tokens": 0}), + total_cost=data.get("total_cost", 0.0), + ) + + +def _is_metric_value(value: Any) -> TypeGuard[int | float]: + """Check if a value is a valid metric score (numeric and not NaN).""" + if not isinstance(value, (int, float)): + return False + if isinstance(value, float) and 
math.isnan(value): + return False + return True + +def _get_user_input_hash(user_input: str) -> str: + """Generate a short hash of the user input for stable identification.""" + return hashlib.sha256(user_input.encode()).hexdigest()[:12] -def create_and_push_metrics(overall_scores: dict[str, float], workflow_name: str, otlp_endpoint: str) -> None: + +def _get_user_input_truncated(user_input: str, max_length: int = 50) -> str: + """Truncate user input text for display in metric labels.""" + if len(user_input) <= max_length: + return user_input + return user_input[:max_length] + "..." + + +def create_and_push_metrics( + evaluation_data: EvaluationData, workflow_name: str, execution_id: str, execution_number: int, otlp_endpoint: str +) -> None: """ - Create OpenTelemetry metrics for each overall score and push via OTLP. + Create OpenTelemetry metrics for evaluation results and push via OTLP. + + Creates per-sample gauges for each metric, plus token usage and cost gauges. Args: - overall_scores: Dictionary of metric names to scores + evaluation_data: Container with individual results, token counts, and cost workflow_name: Name of the test workflow (used as label to distinguish workflows) + execution_id: Testkube execution ID for this workflow run otlp_endpoint: URL of the OTLP endpoint (e.g., 'http://localhost:4318') """ - # Ensure the endpoint has the correct protocol if not otlp_endpoint.startswith("http://") and not otlp_endpoint.startswith("https://"): otlp_endpoint = f"http://{otlp_endpoint}" - # Create OTLP exporter exporter = OTLPMetricExporter(endpoint=f"{otlp_endpoint}/v1/metrics") - - # Create a metric reader that exports immediately - reader = PeriodicExportingMetricReader( - exporter=exporter, - export_interval_millis=1000, # Export every second - ) - - # Create resource with workflow metadata + reader = PeriodicExportingMetricReader(exporter=exporter, export_interval_millis=1000) resource = Resource.create({"service.name": "ragas-evaluation", 
"workflow.name": workflow_name}) - - # Create MeterProvider with the exporter and resource provider = MeterProvider(resource=resource, metric_readers=[reader]) metrics.set_meter_provider(provider) - - # Get a meter meter = metrics.get_meter("ragas.evaluation", "1.0.0") - # Create and record metrics try: logger.info(f"Pushing metrics to OTLP endpoint at {otlp_endpoint}...") - for metric_name, score in overall_scores.items(): - # Create a Gauge - gauge = meter.create_gauge( - name=f"ragas_evaluation_{metric_name}", - description=f"Overall {metric_name} score from RAGAS evaluation", - unit="1", - ) + # Collect metric names from individual results (any numeric field is a metric) + metric_names: set[str] = set() + for result in evaluation_data.individual_results: + for key, value in result.items(): + if _is_metric_value(value): + metric_names.add(key) + + # Single gauge for all evaluation metrics, differentiated by 'name' attribute + metric_gauge = meter.create_gauge( + name="testbench_evaluation_metric", + description="Evaluation metric from RAGAS testbench", + unit="", + ) + + # Set per-sample values for each metric + for metric_name in sorted(metric_names): + for result in evaluation_data.individual_results: + score = result.get(metric_name) + if not _is_metric_value(score): + logger.debug(f"Skipping invalid metric value for {metric_name}: {score}") + continue + trace_id = result.get("trace_id") + if not trace_id: + logger.warning(f"Missing trace_id for sample in execution {execution_id}") + trace_id = "missing-trace-id" + user_input = result.get("user_input", "(user_input missing or invalid)") + attributes = { + "name": metric_name, + "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, + "trace_id": trace_id, + "user_input_hash": _get_user_input_hash(user_input), + "user_input_truncated": _get_user_input_truncated(user_input), + } + metric_gauge.set(score, attributes) + 
logger.info(f"testbench_evaluation_metric{attributes} = {score}") + + # Token usage gauge with 'type' attribute + token_gauge = meter.create_gauge( + name="testbench_evaluation_token_usage", + description="Token usage from RAGAS evaluation", + unit="", + ) + + input_tokens = evaluation_data.total_tokens.get("input_tokens", 0) + token_gauge.set( + input_tokens, + { + "type": "input_tokens", + "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, + }, + ) + logger.info( + f"testbench_evaluation_token_usage{{type=input_tokens, workflow_name={workflow_name}, execution_id={execution_id}, execution_number={execution_number}}} = {input_tokens}" + ) + + output_tokens = evaluation_data.total_tokens.get("output_tokens", 0) + token_gauge.set( + output_tokens, + { + "type": "output_tokens", + "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, + }, + ) + logger.info( + f"testbench_evaluation_token_usage{{type=output_tokens, workflow_name={workflow_name}, execution_id={execution_id}, execution_number={execution_number}}} = {output_tokens}" + ) + + # Total cost gauge + cost_gauge = meter.create_gauge( + name="testbench_evaluation_cost", + description="Total cost of RAGAS evaluation in USD", + unit="", + ) + cost_gauge.set( + evaluation_data.total_cost, + {"workflow_name": workflow_name, "execution_id": execution_id, "execution_number": execution_number}, + ) + logger.info( + f"testbench_evaluation_cost{{workflow_name={workflow_name}, execution_id={execution_id}, execution_number={execution_number}}} = {evaluation_data.total_cost}" + ) - # Set the gauge value with workflow_name as an attribute - gauge.set(score, {"workflow_name": workflow_name}) - logger.info(f"Set metric 'ragas_evaluation_{metric_name}{{workflow_name=\"{workflow_name}\"}}' to {score}") - - # Force flush to ensure metrics are sent provider.force_flush() - - logger.info("✓ Metrics successfully pushed via OTLP") + 
logger.info("Metrics successfully pushed via OTLP") except Exception as e: - logger.error(f"✗ Error pushing metrics via OTLP: {e}") + logger.error(f"Error pushing metrics via OTLP: {e}") raise finally: - # Shutdown the provider provider.shutdown() - logger.info("Published metrics:") - for metric_name, score in overall_scores.items(): - logger.info(f' - ragas_evaluation_{metric_name}{{workflow_name="{workflow_name}"}}: {score}') - -def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> None: +def publish_metrics( + input_file: str, workflow_name: str, execution_id: str, execution_number: int, otlp_endpoint: str +) -> None: """ Publish evaluation metrics via OpenTelemetry OTLP. Args: - input_file: Path to the evaluation scores + input_file: Path to the evaluation scores JSON file workflow_name: Name of the test workflow (e.g., 'weather-assistant-test'). + execution_id: Testkube execution ID for this workflow run. + execution_number: Number of the execution for the current workflow (e.g. 3) otlp_endpoint: URL of the OTLP endpoint (e.g., 'http://localhost:4318'). 
""" + logger.info(f"Loading evaluation data from {input_file}...") + evaluation_data = load_evaluation_data(input_file) - # Load overall scores from the evaluation file - logger.info(f"Loading evaluation scores from {input_file}...") - overall_scores = get_overall_scores(input_file) - - if not overall_scores: - logger.warning("No overall scores found in evaluation_scores.json") + if not evaluation_data.individual_results: + logger.warning("No individual results found in evaluation_scores.json") return - # Create and push OpenTelemetry metrics - logger.info(f"Creating OpenTelemetry metrics for {len(overall_scores)} scores...") - logger.info(f"Workflow: {workflow_name}") - create_and_push_metrics(overall_scores, workflow_name, otlp_endpoint) + logger.info(f"Publishing metrics for {len(evaluation_data.individual_results)} samples...") + logger.info(f"Workflow: {workflow_name}, Execution: {execution_id}") + create_and_push_metrics(evaluation_data, workflow_name, execution_id, execution_number, otlp_endpoint) if __name__ == "__main__": @@ -114,11 +213,12 @@ def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> Args: workflow_name: Name of the test workflow + execution_id: Testkube execution ID for this workflow run otlp_endpoint: (OPTIONAL) URL to the OTLP endpoint (default: localhost:4318) Examples: - python3 scripts/publish.py weather-assistant-test - python3 scripts/publish.py weather-assistant-test http://localhost:4318 + python3 scripts/publish.py weather-assistant-test exec-123 + python3 scripts/publish.py weather-assistant-test exec-123 http://localhost:4318 """ parser = argparse.ArgumentParser(description="Publish RAGAS evaluation metrics via OpenTelemetry OTLP") @@ -126,6 +226,14 @@ def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> "workflow_name", help="Name of the test workflow (e.g., 'weather-assistant-test')", ) + parser.add_argument( + "execution_id", + help="Testkube execution ID for this workflow 
run", + ) + parser.add_argument( + "execution_number", + help="Testkube execution number for this workflow run (for use as a *numeric* identifier in Grafana)", + ) parser.add_argument( "otlp_endpoint", nargs="?", @@ -135,9 +243,10 @@ def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> args = parser.parse_args() - # Call 'publish_metrics' with hardcoded input file and specified 'workflow_name' & 'otlp_endpoint' publish_metrics( input_file="data/results/evaluation_scores.json", workflow_name=args.workflow_name, + execution_id=args.execution_id, + execution_number=args.execution_number, otlp_endpoint=args.otlp_endpoint, ) diff --git a/scripts/run.py b/scripts/run.py index b93f592..d1860f4 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -16,11 +16,10 @@ ) from opentelemetry import trace from opentelemetry.trace import Status, StatusCode +from otel_setup import setup_otel from pydantic import BaseModel from ragas import Dataset, experiment -from otel_setup import setup_otel - # Set up module-level logger logging.basicConfig(level=logging.INFO) logger: Logger = logging.getLogger(__name__) diff --git a/tests/test_publish.py b/tests/test_publish.py index 9eab789..9cc5693 100644 --- a/tests/test_publish.py +++ b/tests/test_publish.py @@ -5,6 +5,7 @@ """ import json +import math import shutil import sys import tempfile @@ -14,7 +15,15 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) -from publish import create_and_push_metrics, get_overall_scores, publish_metrics +from publish import ( + EvaluationData, + _get_user_input_hash, + _get_user_input_truncated, + _is_metric_value, + create_and_push_metrics, + load_evaluation_data, + publish_metrics, +) # Mock classes for OpenTelemetry meter provider (used by HTTPXClientInstrumentor) @@ -58,13 +67,28 @@ def temp_dir(): @pytest.fixture def evaluation_scores_file(temp_dir): - """Create a test evaluation scores file""" + """Create a test evaluation scores file with individual 
results""" test_file = Path(temp_dir) / "evaluation_scores.json" test_data = { "overall_scores": {"faithfulness": 0.85, "answer_relevancy": 0.90}, - "individual_results": [], - "total_tokens": {"input_tokens": 0, "output_tokens": 0}, - "total_cost": 0.0, + "individual_results": [ + { + "user_input": "What is the weather?", + "response": "It is sunny.", + "faithfulness": 0.85, + "answer_relevancy": 0.90, + "trace_id": "a1b2c3d4e5f6789012345678901234aa", + }, + { + "user_input": "What is the time?", + "response": "It is noon.", + "faithfulness": 0.80, + "answer_relevancy": 0.95, + "trace_id": "b2c3d4e5f6a7890123456789012345bb", + }, + ], + "total_tokens": {"input_tokens": 1000, "output_tokens": 200}, + "total_cost": 0.05, } with open(test_file, "w") as f: @@ -90,10 +114,13 @@ def realistic_scores_file(temp_dir): "response": "It is sunny.", "faithfulness": 0.85, "answer_relevancy": 0.90, + "context_precision": 0.78, + "context_recall": 0.82, + "trace_id": "c3d4e5f6a7b8901234567890123456cc", } ], - "total_tokens": {"input_tokens": 0, "output_tokens": 0}, - "total_cost": 0.0, + "total_tokens": {"input_tokens": 500, "output_tokens": 100}, + "total_cost": 0.025, } with open(test_file, "w") as f: @@ -102,25 +129,111 @@ def realistic_scores_file(temp_dir): return test_file -# TestGetOverallScores tests -def test_loads_overall_scores(evaluation_scores_file): - """Test that get_overall_scores loads the overall_scores section""" - scores = get_overall_scores(str(evaluation_scores_file)) +# Test _is_metric_value +def test_is_metric_value_with_float(): + """Test that valid floats are recognized as metric values""" + assert _is_metric_value(0.85) is True + assert _is_metric_value(1.0) is True + assert _is_metric_value(0.0) is True + + +def test_is_metric_value_with_int(): + """Test that integers are recognized as metric values""" + assert _is_metric_value(1) is True + assert _is_metric_value(0) is True + + +def test_is_metric_value_with_nan(): + """Test that NaN is not recognized 
as a metric value""" + assert _is_metric_value(float("nan")) is False + assert _is_metric_value(math.nan) is False + + +def test_is_metric_value_with_non_numeric(): + """Test that non-numeric values are not recognized as metric values""" + assert _is_metric_value("string") is False + assert _is_metric_value(["list"]) is False + assert _is_metric_value({"dict": "value"}) is False + assert _is_metric_value(None) is False + + +# Test _get_user_input_hash +def test_get_user_input_hash_returns_12_char_hex(): + """Test that _get_user_input_hash returns a 12-character hex string""" + result = _get_user_input_hash("What is the weather?") + assert len(result) == 12 + assert all(c in "0123456789abcdef" for c in result) + + +def test_get_user_input_hash_is_deterministic(): + """Test that _get_user_input_hash returns the same hash for the same input""" + input_text = "What is the weather in New York?" + assert _get_user_input_hash(input_text) == _get_user_input_hash(input_text) + + +def test_get_user_input_hash_different_for_different_inputs(): + """Test that _get_user_input_hash returns different hashes for different inputs""" + hash1 = _get_user_input_hash("Question 1") + hash2 = _get_user_input_hash("Question 2") + assert hash1 != hash2 + + +# Test _get_user_input_truncated +def test_get_user_input_truncated_short_input(): + """Test that short inputs are returned unchanged""" + short_input = "Short question" + assert _get_user_input_truncated(short_input) == short_input + + +def test_get_user_input_truncated_exact_length(): + """Test that inputs exactly at max_length are returned unchanged""" + exact_input = "a" * 50 + assert _get_user_input_truncated(exact_input) == exact_input + + +def test_get_user_input_truncated_long_input(): + """Test that long inputs are truncated with ellipsis""" + long_input = "a" * 100 + result = _get_user_input_truncated(long_input) + assert len(result) == 53 # 50 chars + "..." 
+ assert result.endswith("...") + - assert scores["faithfulness"] == 0.85 - assert scores["answer_relevancy"] == 0.90 +def test_get_user_input_truncated_custom_length(): + """Test that custom max_length is respected""" + input_text = "This is a longer question" + result = _get_user_input_truncated(input_text, max_length=10) + assert result == "This is a ..." + + +# Test load_evaluation_data +def test_loads_evaluation_data(evaluation_scores_file): + """Test that load_evaluation_data loads all required fields""" + data = load_evaluation_data(str(evaluation_scores_file)) + + assert len(data.individual_results) == 2 + assert data.total_tokens["input_tokens"] == 1000 + assert data.total_tokens["output_tokens"] == 200 + assert data.total_cost == 0.05 def test_file_not_found(temp_dir): """Test behavior when file doesn't exist""" with pytest.raises(FileNotFoundError): - get_overall_scores(str(Path(temp_dir) / "nonexistent.json")) + load_evaluation_data(str(Path(temp_dir) / "nonexistent.json")) # TestCreateAndPushMetrics tests def test_creates_gauges_for_each_metric(monkeypatch): - """Test that a Gauge is created for each metric""" - overall_scores = {"faithfulness": 0.85, "answer_relevancy": 0.90} + """Test that a Gauge is created for each metric plus token/cost gauges""" + evaluation_data = EvaluationData( + individual_results=[ + {"user_input": "Question 1", "faithfulness": 0.85, "answer_relevancy": 0.90, "trace_id": "trace1"}, + {"user_input": "Question 2", "faithfulness": 0.80, "answer_relevancy": 0.95, "trace_id": "trace2"}, + ], + total_tokens={"input_tokens": 1000, "output_tokens": 200}, + total_cost=0.05, + ) # Mock the meter and gauge create_gauge_calls = [] @@ -167,34 +280,51 @@ def mock_exporter_init(endpoint): monkeypatch.setattr("publish.OTLPMetricExporter", mock_exporter_init) create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, 
otlp_endpoint="localhost:4318", ) - # Verify create_gauge was called for each metric - assert len(create_gauge_calls) == 2 + # Verify gauges created: 1 metric gauge + 1 token gauge + 1 cost gauge = 3 + assert len(create_gauge_calls) == 3 # Verify gauge names gauge_names = [call["name"] for call in create_gauge_calls] - assert "ragas_evaluation_faithfulness" in gauge_names - assert "ragas_evaluation_answer_relevancy" in gauge_names + assert "testbench_evaluation_metric" in gauge_names + assert "testbench_evaluation_token_usage" in gauge_names + assert "testbench_evaluation_cost" in gauge_names -def test_sets_gauge_values(monkeypatch): - """Test that gauge values are set correctly""" - overall_scores = {"faithfulness": 0.85} +def test_sets_per_sample_gauge_values(monkeypatch): + """Test that gauge values are set for each sample with all required attributes""" + evaluation_data = EvaluationData( + individual_results=[ + {"user_input": "Question 1", "faithfulness": 0.85, "trace_id": "d4e5f6a7b8c9012345678901234567dd"}, + { + "user_input": "This is a very long question that exceeds fifty characters in length", + "faithfulness": 0.80, + "trace_id": "e5f6a7b8c9d0123456789012345678ee", + }, + ], + total_tokens={"input_tokens": 0, "output_tokens": 0}, + total_cost=0.0, + ) # Mock the meter and gauge set_calls = [] class MockGauge: + def __init__(self, name): + self.name = name + def set(self, value, attributes): - set_calls.append({"value": value, "attributes": attributes}) + set_calls.append({"name": self.name, "value": value, "attributes": attributes}) class MockMeter: def create_gauge(self, name, unit=None, description=None): - return MockGauge() + return MockGauge(name) mock_meter = MockMeter() @@ -229,20 +359,48 @@ def mock_exporter_init(endpoint): monkeypatch.setattr("publish.OTLPMetricExporter", mock_exporter_init) create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + 
execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) - # Verify gauge.set was called with correct value and attributes - assert len(set_calls) == 1 - assert set_calls[0]["value"] == 0.85 - assert set_calls[0]["attributes"] == {"workflow_name": "test-workflow"} + # Filter to faithfulness metric calls only (name attribute = "faithfulness") + faithfulness_calls = [ + c + for c in set_calls + if c["name"] == "testbench_evaluation_metric" and c["attributes"].get("name") == "faithfulness" + ] + assert len(faithfulness_calls) == 2 + + # Verify gauge.set was called with correct values and all required attributes + # First sample: short question + assert faithfulness_calls[0]["value"] == 0.85 + assert faithfulness_calls[0]["attributes"]["workflow_name"] == "test-workflow" + assert faithfulness_calls[0]["attributes"]["execution_id"] == "exec-test-123" + assert faithfulness_calls[0]["attributes"]["execution_number"] == 42 + assert faithfulness_calls[0]["attributes"]["trace_id"] == "d4e5f6a7b8c9012345678901234567dd" + assert faithfulness_calls[0]["attributes"]["user_input_hash"] == _get_user_input_hash("Question 1") + assert faithfulness_calls[0]["attributes"]["user_input_truncated"] == "Question 1" + + # Second sample: long question (should be truncated) + long_question = "This is a very long question that exceeds fifty characters in length" + assert faithfulness_calls[1]["value"] == 0.80 + assert faithfulness_calls[1]["attributes"]["execution_id"] == "exec-test-123" + assert faithfulness_calls[1]["attributes"]["execution_number"] == 42 + assert faithfulness_calls[1]["attributes"]["trace_id"] == "e5f6a7b8c9d0123456789012345678ee" + assert faithfulness_calls[1]["attributes"]["user_input_hash"] == _get_user_input_hash(long_question) + assert faithfulness_calls[1]["attributes"]["user_input_truncated"] == _get_user_input_truncated(long_question) def test_pushes_via_otlp(monkeypatch): """Test that metrics are pushed via OTLP""" - overall_scores = 
{"faithfulness": 0.85} + evaluation_data = EvaluationData( + individual_results=[{"user_input": "Q1", "faithfulness": 0.85, "trace_id": "f6a7b8c9d0e1234567890123456789ff"}], + total_tokens={"input_tokens": 100, "output_tokens": 50}, + total_cost=0.01, + ) # Mock the meter and gauge class MockGauge: @@ -288,8 +446,10 @@ def mock_exporter_init(endpoint): monkeypatch.setattr("publish.OTLPMetricExporter", mock_exporter_init) create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -304,21 +464,14 @@ def mock_exporter_init(endpoint): def test_handles_push_error(monkeypatch): """Test error handling when OTLP export fails""" - overall_scores = {"faithfulness": 0.85} - - # Mock the meter and gauge - class MockGauge: - def set(self, value, attributes): - pass - - class MockMeter: - def create_gauge(self, name, unit=None, description=None): - return MockGauge() - - mock_meter = MockMeter() + evaluation_data = EvaluationData( + individual_results=[{"user_input": "Q1", "faithfulness": 0.85, "trace_id": "a7b8c9d0e1f2345678901234567890aa"}], + total_tokens={"input_tokens": 0, "output_tokens": 0}, + total_cost=0.0, + ) def mock_get_meter(*args, **kwargs): - return mock_meter + return _OtelMockMeter() # Mock the provider to raise an exception on force_flush shutdown_calls = [] @@ -347,8 +500,10 @@ def mock_exporter_init(endpoint): with pytest.raises(Exception, match="Connection refused"): create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -361,11 +516,13 @@ def test_publish_metrics_calls_create_and_push(evaluation_scores_file, monkeypat """Test that publish_metrics calls create_and_push_metrics""" create_push_calls = [] - def mock_create_push(overall_scores, 
workflow_name, otlp_endpoint): + def mock_create_push(evaluation_data, workflow_name, execution_id, execution_number, otlp_endpoint): create_push_calls.append( { - "overall_scores": overall_scores, + "evaluation_data": evaluation_data, "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, "otlp_endpoint": otlp_endpoint, } ) @@ -375,6 +532,8 @@ def mock_create_push(overall_scores, workflow_name, otlp_endpoint): publish_metrics( input_file=str(evaluation_scores_file), workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -382,16 +541,22 @@ def mock_create_push(overall_scores, workflow_name, otlp_endpoint): assert len(create_push_calls) == 1 # Verify parameters - assert create_push_calls[0]["overall_scores"]["faithfulness"] == 0.85 - assert create_push_calls[0]["overall_scores"]["answer_relevancy"] == 0.90 + assert len(create_push_calls[0]["evaluation_data"].individual_results) == 2 assert create_push_calls[0]["workflow_name"] == "test-workflow" + assert create_push_calls[0]["execution_id"] == "exec-test-123" + assert create_push_calls[0]["execution_number"] == 42 assert create_push_calls[0]["otlp_endpoint"] == "localhost:4318" -def test_publish_metrics_with_empty_scores(temp_dir, monkeypatch): - """Test behavior when overall_scores is empty""" - # Create file with empty overall_scores - test_data = {"overall_scores": {}, "individual_results": []} +def test_publish_metrics_with_empty_results(temp_dir, monkeypatch): + """Test behavior when individual_results is empty""" + # Create file with empty individual_results + test_data = { + "overall_scores": {}, + "individual_results": [], + "total_tokens": {"input_tokens": 0, "output_tokens": 0}, + "total_cost": 0.0, + } empty_file = Path(temp_dir) / "empty_scores.json" with open(empty_file, "w") as f: @@ -399,7 +564,7 @@ def test_publish_metrics_with_empty_scores(temp_dir, monkeypatch): create_push_calls = 
[] - def mock_create_push(overall_scores, workflow_name, otlp_endpoint): + def mock_create_push(evaluation_data, workflow_name, execution_id, execution_number, otlp_endpoint): create_push_calls.append(True) monkeypatch.setattr("publish.create_and_push_metrics", mock_create_push) @@ -407,6 +572,8 @@ def mock_create_push(overall_scores, workflow_name, otlp_endpoint): publish_metrics( input_file=str(empty_file), workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -467,11 +634,18 @@ def mock_exporter_init(endpoint): publish_metrics( input_file=str(realistic_scores_file), workflow_name="weather-assistant-test", + execution_id="exec-weather-456", + execution_number=42, otlp_endpoint="localhost:4318", ) # Verify OTLPMetricExporter was called assert len(exporter_calls) == 1 - # Verify 4 metrics were created (faithfulness, answer_relevancy, context_precision, context_recall) - assert len(create_gauge_calls) == 4 + # Verify 3 gauges: 1 metric gauge + 1 token gauge + 1 cost gauge + assert len(create_gauge_calls) == 3 + + gauge_names = [call["name"] for call in create_gauge_calls] + assert "testbench_evaluation_metric" in gauge_names + assert "testbench_evaluation_token_usage" in gauge_names + assert "testbench_evaluation_cost" in gauge_names diff --git a/tests/test_run.py b/tests/test_run.py index bfbbe60..8cacc30 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -111,9 +111,7 @@ def mock_httpx_client(): # Call the function result = await run_agent_experiment.func( - test_row, - agent_url="http://test-agent:8000", - workflow_name="test-workflow" + test_row, agent_url="http://test-agent:8000", workflow_name="test-workflow" ) # Verify result structure @@ -156,9 +154,7 @@ def mock_httpx_client(): # Call the function result = await run_agent_experiment.func( - test_row, - agent_url="http://test-agent:8000", - workflow_name="test-workflow" + test_row, agent_url="http://test-agent:8000", 
workflow_name="test-workflow" ) # Verify error is captured in response