From cb886287dc27b9bcf43aa520f5e2af37e6aae095 Mon Sep 17 00:00:00 2001 From: MiaAppel Date: Tue, 25 Nov 2025 14:36:26 +0100 Subject: [PATCH 1/8] FIX: Port-Forwardings so E2E Test runs locally --- Tiltfile | 1 + tests_e2e/test_e2e.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Tiltfile b/Tiltfile index 20f8284..85524a3 100644 --- a/Tiltfile +++ b/Tiltfile @@ -39,6 +39,7 @@ k8s_yaml(kustomize('deploy/local')) k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000']) k8s_resource('weather-agent', port_forwards='11010:8000', labels=['agents'], resource_deps=['agent-runtime']) k8s_resource('lgtm', port_forwards=['11000:3000']) +k8s_resource('data-server', port_forwards='11020:8000') # Declare Testkube resources k8s_kind( diff --git a/tests_e2e/test_e2e.py b/tests_e2e/test_e2e.py index 34d78a3..1d39d4e 100755 --- a/tests_e2e/test_e2e.py +++ b/tests_e2e/test_e2e.py @@ -2,7 +2,7 @@ End-to-end test that runs all scripts in the correct order: 1. setup.py - Downloads, converts and saves Ragas Dataset to data/datasets/ragas_dataset.jsonl 2. run.py - Runs agent queries on the dataset and saves Ragas Experiment to data/experiments/ragas_experiment.jsonl -3. evaluate.py - Evaluates results using RAGAS metrics and saves result to results/evaluation_scores.json +3. evaluate.py - Evaluates results using RAGAS metrics and saves result to data/results/evaluation_scores.json 4. 
publish.py - Publishes metrics via OpenTelemetry OTLP Usage: @@ -57,7 +57,7 @@ def __init__( # Define expected output files self.dataset_file = Path("./data/datasets/ragas_dataset.jsonl") self.results_file = Path("./data/experiments/ragas_experiment.jsonl") - self.evaluation_file = Path("./results/evaluation_scores.json") + self.evaluation_file = Path("./data/results/evaluation_scores.json") def verify_scripts_exist(self) -> bool: """Verify that all required scripts exist.""" @@ -231,7 +231,7 @@ def test_e2e_pipeline(): """ # Get configuration from environment variables with sensible defaults - dataset_url = os.getenv("E2E_DATASET_URL", "http://localhost:8000/dataset.json") + dataset_url = os.getenv("E2E_DATASET_URL", "http://data-server.data-server:8000/dataset.csv") agent_url = os.getenv("E2E_AGENT_URL", "http://localhost:11010") model = os.getenv("E2E_MODEL", "gemini-2.5-flash-lite") metrics_str = os.getenv("E2E_METRICS", "faithfulness") From b0321902cc931e4272365ca7d34a4256d5105f33 Mon Sep 17 00:00:00 2001 From: MiaAppel Date: Thu, 4 Dec 2025 11:25:51 +0100 Subject: [PATCH 2/8] DOCS: Update documentation --- DetailedUsageAndTroubleshooting.md | 257 ++++++++++++++++++++++ README.md | 334 ++++------------------------- 2 files changed, 297 insertions(+), 294 deletions(-) create mode 100644 DetailedUsageAndTroubleshooting.md diff --git a/DetailedUsageAndTroubleshooting.md b/DetailedUsageAndTroubleshooting.md new file mode 100644 index 0000000..1672c8e --- /dev/null +++ b/DetailedUsageAndTroubleshooting.md @@ -0,0 +1,257 @@ +## Detailed Usage + +### 1. setup.py - Dataset Preparation + +Downloads and converts test datasets to RAGAS-native JSONL format. 
+ +**Syntax:** + +```shell +python3 scripts/setup.py <dataset_url> +``` + +**Arguments:** + +- `dataset_url` (required): URL to dataset file (`.csv`, `.json`, or `.parquet`) + +**Required Dataset Schema:** + +- See [Dataset Requirements](README.md#dataset-requirements) + +**Output:** + +- `data/datasets/ragas_dataset.jsonl` - RAGAS Dataset in JSONL format + +--- + +### 2. run.py - Agent Query Execution + +Executes test queries through an agent using the A2A protocol and collects responses. + +**Syntax:** + +```shell +python3 scripts/run.py <agent_url> +``` + +**Arguments:** + +- `agent_url` (required): URL to the agent's A2A endpoint + +**Input:** + +- `data/datasets/ragas_dataset.jsonl` (loaded automatically) + +**Output:** + +- `data/experiments/ragas_experiment.jsonl` - Agent responses with preserved context + +**Output Schema:** + +```jsonl +{"user_input": "What is X?", "retrieved_contexts": ["Context about X"], "reference": "X is...", "response": "Agent's answer"} +``` + +**Notes:** + +- Uses asynchronous A2A client for efficient communication +- Preserves all original dataset fields +- Automatically handles response streaming + +--- + +### 3. evaluate.py - RAGAS Metric Evaluation + +Evaluates agent responses using configurable RAGAS metrics and calculates costs. + +**Syntax:** + +```shell +python3 scripts/evaluate.py <model> <metric1> [metric2 ...] 
[--cost-per-input COST] [--cost-per-output COST] +``` + +**Arguments:** + +- `model` (required): Model name for evaluation (e.g., `gemini-2.5-flash-lite`, `gpt-4`) +- `metrics` (required): One or more RAGAS metric names +- `--cost-per-input` (optional): Cost per input token (default: 0.000005, i.e., $5 per 1M tokens) +- `--cost-per-output` (optional): Cost per output token (default: 0.000015, i.e., $15 per 1M tokens) + +### **Available Metrics:** + +| Metric | Special required columns | +|-------------------------------------------|--------------------------| +| `faithfulness` | retrieved_contexts | +| `context_precision` | retrieved_contexts | +| `context_recall` | retrieved_contexts | +| `context_entity_recall` | retrieved_contexts | +| `context_utilization` | retrieved_contexts | +| `llm_context_precision_with_reference` | retrieved_contexts | +| `llm_context_precision_without_reference` | retrieved_contexts | +| `faithful_rate` | retrieved_contexts | +| `relevance_rate` | retrieved_contexts | +| `noise_sensitivity` | retrieved_contexts | +| `factual_correctness` | | +| `domain_specific_rubrics` | | +| `nv_accuracy` | | +| `nv_context_relevance` | retrieved_contexts | +| `nv_response_groundedness` | retrieved_contexts | +| `string_present` | | +| `exact_match` | | +| `summary_score` | reference_contexts | +| `llm_sql_equivalence_with_reference` | reference_contexts | + +**Input:** + +- `data/experiments/ragas_experiment.jsonl` (loaded automatically) + +**Examples:** + +```shell +# Single metric +python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness + +# Multiple metrics +python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy context_precision + +# Custom token costs +python3 scripts/evaluate.py gpt-4 faithfulness answer_correctness \ + --cost-per-input 0.00003 \ + --cost-per-output 0.00006 +``` + +**Output:** + +- `results/evaluation_scores.json` - Evaluation results with metrics, token usage, and costs + +**Output Format:** + 
+```json +{ + "overall_scores": { + "faithfulness": 0.95, + "answer_relevancy": 0.98 + }, + "individual_results": [ + { + "user_input": "What is the capital of France?", + "response": "Paris is the capital of France.", + "faithfulness": 0.95, + "answer_relevancy": 0.98 + } + ], + "total_tokens": { + "input_tokens": 1500, + "output_tokens": 500 + }, + "total_cost": 0.015 +} +``` + +**Notes:** + +- Currently only supports **SingleTurnSample** Metrics (see [Available Metrics](#available-metrics)) +- Dynamically discovers available metrics from `ragas.metrics` module +- Invalid metric names will show available options +- Token costs can be customized per model pricing + +--- + +### 4. publish.py - Metrics Publishing + +Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring. + +**Syntax:** + +```shell +python3 scripts/publish.py <workflow_name> [otlp_endpoint] +``` + +**Arguments:** + +- `workflow_name` (required): Name of the test workflow (used as metric label) +- `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`) + +**Input:** + +- `data/results/evaluation_scores.json` (loaded automatically) + +**Published Metrics:** + +Each RAGAS metric is published as a gauge with the workflow name as an attribute: + +``` +ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85 +ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92 +``` + +**Notes:** + +- Sends metrics to `/v1/metrics` endpoint +- Uses resource with `service.name="ragas-evaluation"` +- Forces flush to ensure delivery before exit + + +---- + +## Troubleshooting + +### "Source dataset is missing required columns" + +**Problem**: Dataset doesn't have the required schema. 
+ +**Solution**: + +- Verify your dataset has columns: `user_input`, `retrieved_contexts`, and `reference` +- Check that column names match exactly (case-sensitive) +- Ensure `retrieved_contexts` is formatted as a list (see Dataset Requirements) + +Example fix for CSV: + +```csv +# Wrong (missing columns) +question,context,answer + +# Correct +user_input,retrieved_contexts,reference +``` + +### "No results found in experiment" + +**Problem**: `evaluate.py` can't find experiment results. + +**Solution**: + +- Check if `data/experiments/ragas_experiment.jsonl` exists +- Verify `run.py` completed successfully without errors +- Ensure the agent URL was accessible during execution +- Check file permissions on the `data/` directory + +### CSV List Conversion Issues + +**Problem**: `retrieved_contexts` not parsing correctly from CSV. + +**Solution**: + +- Ensure lists are formatted as Python array strings: `"['item1', 'item2']"` +- Use proper quoting in CSV: wrap the entire array string in double quotes +- Consider using JSON or Parquet format for complex data types + +Example: + +```csv +user_input,retrieved_contexts,reference +"What is X?","['Context about X', 'More context']","X is..." +``` + +### Evaluation Metrics Fail + +**Problem**: Certain metrics fail during evaluation. 
+ +**Solution**: + +- Some metrics require the `reference` field (e.g., `context_precision`, `context_recall`) +- Verify your dataset includes all required fields for the metrics you're using +- Check the RAGAS documentation for metric-specific requirements + +---- \ No newline at end of file diff --git a/README.md b/README.md index e56ddfd..59d1bc7 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,41 @@ # Agentic Layer Test Bench - Automated Agent Evaluation System -An automated evaluation and testing system for AI agents using the **RAGAS** (Retrieval Augmented Generation Assessment) +An automated evaluation and testing system for AI agents based on **Testkube** and using the **RAGAS** framework. This system downloads test datasets, executes queries through agents via the **A2A** protocol, evaluates responses using configurable metrics, and publishes results to **OpenTelemetry** for monitoring. ---- +## Overview + +This project provides a complete pipeline for evaluating AI agent performance: + +- **Cloud-Native**: Easily deployable to your Kubernetes cluster +- **Local Support**: Test and evaluate agents locally +- **Automated Testing**: Run predefined test queries through your agents +- **Multi-Format Support**: Support for datasets in CSV, JSON & Parquet formats +- **Flexible Evaluation**: Evaluate agent replies using Ragas Metrics +- **Observability**: Publish metrics to OpenTelemetry endpoints for monitoring and analysis + +---- + ## Table of Contents - [Architecture](#architecture) - [Prerequisites](#prerequisites) - [Getting Started](#getting-started) + - [Setup with Testkube](#setup-with-testkube) + - [Local Setup](#local-setup) - [Quick Start](#quick-start) - [Detailed Usage](#detailed-usage) - [Dataset Requirements](#dataset-requirements) - [Testing](#testing) - [Development](#development) - [Troubleshooting](#troubleshooting) -- [Project Structure](#project-structure) - [Contributing](#contributing) ---- -## Overview - -This project provides a 
complete pipeline for evaluating AI agent performance: - -- **Automated Testing**: Run predefined test queries through your agents -- **Multi-Format Support**: Accept datasets in CSV, JSON, or Parquet formats -- **Flexible Evaluation**: Configure multiple RAGAS metrics for comprehensive assessment -- **Observability**: Publish metrics to OpenTelemetry endpoints for monitoring and analysis -- **Type-Safe**: Built with type hints and validated with MyPy -- **Limitation**: Currently only support SingleTurnSample Metrics (see [Available Metrics](#available-metrics)) - ----- - ## Architecture ``` @@ -60,7 +61,7 @@ data/experiments/ragas_experiment.jsonl | | | LLM Model v -results/evaluation_scores.json +data/results/evaluation_scores.json | v [4. publish.py] - Publishes to OTLP endpoint @@ -88,13 +89,15 @@ OpenTelemetry Collector ## Getting Started -### With Tilt and Local Kubernetes +Use Tilt to spin up all the required backends: ```shell Start Tilt in the project root to set up the local Kubernetes environment: tilt up ``` +### Setup with Testkube + Run the RAGAS evaluation workflow with minimal setup: ```shell @@ -121,239 +124,38 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ -n testkube ``` -### Install dependencies using UV +### Local Setup + +### Dependencies & Environment Setup ```shell -# Install (dev & prod) dependencies with uv +# Install dependencies with uv uv sync -``` -### Environment Setup - -```shell # Required for evaluation export OPENAI_API_KEY="your-api-key-here" - -# Optional: Configure custom OTLP endpoint -export OTLP_ENDPOINT="http://otlp-collector:4318" ``` -The system automatically creates the required directories (`data/`, `results/`) on first run. - ----- - -## Quick Start - Run the complete evaluation pipeline in 4 steps: ```shell # 1. Download and prepare dataset -python3 scripts/setup.py "https://example.com/dataset.csv" +uv run python3 scripts/setup.py "https://example.com/dataset.csv" # 2. 
Execute queries through your agent -python3 scripts/run.py "http://localhost:8000" +uv run python3 scripts/run.py "http://localhost:8000" # 3. Evaluate responses with RAGAS metrics -python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy +uv run python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy # 4. Publish metrics to OpenTelemetry -python3 scripts/publish.py "my-agent-evaluation" +uv run python3 scripts/publish.py "my-agent-evaluation" ``` ---- ## Detailed Usage - -### 1. setup.py - Dataset Preparation - -Downloads and converts test datasets to RAGAS-native JSONL format. - -**Syntax:** - -```shell -python3 scripts/setup.py -``` - -**Arguments:** - -- `dataset_url` (required): URL to dataset file (`.csv`, `.json`, or `.parquet`) - -**Required Dataset Schema:** - -- See [Dataset Requirements](#dataset-requirements) - -**Output:** - -- `data/datasets/ragas_dataset.jsonl` - RAGAS Dataset in JSONL format - ---- - -### 2. run.py - Agent Query Execution - -Executes test queries through an agent using the A2A protocol and collects responses. - -**Syntax:** - -```shell -python3 scripts/run.py -``` - -**Arguments:** - -- `agent_url` (required): URL to the agent's A2A endpoint - -**Input:** - -- `data/datasets/ragas_dataset.jsonl` (loaded automatically) - -**Output:** - -- `data/experiments/ragas_experiment.jsonl` - Agent responses with preserved context - -**Output Schema:** - -```jsonl -{"user_input": "What is X?", "retrieved_contexts": ["Context about X"], "reference": "X is...", "response": "Agent's answer"} -``` - -**Notes:** - -- Uses asynchronous A2A client for efficient communication -- Preserves all original dataset fields -- Automatically handles response streaming - ---- - -### 3. evaluate.py - RAGAS Metric Evaluation - -Evaluates agent responses using configurable RAGAS metrics and calculates costs. - -**Syntax:** - -```shell -python3 scripts/evaluate.py [metric2 ...] 
[--cost-per-input COST] [--cost-per-output COST] -``` - -**Arguments:** - -- `model` (required): Model name for evaluation (e.g., `gemini-2.5-flash-lite`, `gpt-4`) -- `metrics` (required): One or more RAGAS metric names -- `--cost-per-input` (optional): Cost per input token (default: 0.000005, i.e., $5 per 1M tokens) -- `--cost-per-output` (optional): Cost per output token (default: 0.000015, i.e., $15 per 1M tokens) - -### **Available Metrics:** - -| Metric | Special required columns | -|-------------------------------------------|--------------------------| -| `faithfulness` | retrieved_contexts | -| `context_precision` | retrieved_contexts | -| `context_recall` | retrieved_contexts | -| `context_entity_recall` | retrieved_contexts | -| `context_utilization` | retrieved_contexts | -| `llm_context_precision_with_reference` | retrieved_contexts | -| `llm_context_precision_without_reference` | retrieved_contexts | -| `faithful_rate` | retrieved_contexts | -| `relevance_rate` | retrieved_contexts | -| `noise_sensitivity` | retrieved_contexts | -| `factual_correctness` | | -| `domain_specific_rubrics` | | -| `nv_accuracy` | | -| `nv_context_relevance` | retrieved_contexts | -| `nv_response_groundedness` | retrieved_contexts | -| `string_present` | | -| `exact_match` | | -| `summary_score` | reference_contexts | -| `llm_sql_equivalence_with_reference` | reference_contexts | - -**Input:** - -- `data/experiments/ragas_experiment.jsonl` (loaded automatically) - -**Examples:** - -```shell -# Single metric -python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness - -# Multiple metrics -python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy context_precision - -# Custom token costs -python3 scripts/evaluate.py gpt-4 faithfulness answer_correctness \ - --cost-per-input 0.00003 \ - --cost-per-output 0.00006 -``` - -**Output:** - -- `results/evaluation_scores.json` - Evaluation results with metrics, token usage, and costs - -**Output Format:** - 
-```json -{ - "overall_scores": { - "faithfulness": 0.95, - "answer_relevancy": 0.98 - }, - "individual_results": [ - { - "user_input": "What is the capital of France?", - "response": "Paris is the capital of France.", - "faithfulness": 0.95, - "answer_relevancy": 0.98 - } - ], - "total_tokens": { - "input_tokens": 1500, - "output_tokens": 500 - }, - "total_cost": 0.015 -} -``` - -**Notes:** - -- Currently only support **SingleTurnSample** Metrics (see [Available Metrics](#available-metrics)) -- Dynamically discovers available metrics from `ragas.metrics` module -- Invalid metric names will show available options -- Token costs can be customized per model pricing - ---- - -### 4. publish.py - Metrics Publishing - -Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring. - -**Syntax:** - -```shell -python3 scripts/publish.py [otlp_endpoint] -``` - -**Arguments:** - -- `workflow_name` (required): Name of the test workflow (used as metric label) -- `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`) - -**Input:** - -- `results/evaluation_scores.json` (loaded automatically) - -**Published Metrics:** - -Each RAGAS metric is published as a gauge with the workflow name as an attribute: - -``` -ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85 -ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92 -``` - -**Notes:** - -- Sends metrics to `/v1/metrics` endpoint -- Uses resource with `service.name="ragas-evaluation"` -- Forces flush to ensure delivery before exit +See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md) ---- @@ -420,29 +222,30 @@ uv run poe test ### End-to-End Tests -Run the complete pipeline integration test: +The E2E Test (found at `tests_e2e/test_e2e.py`) runs a complete pipeline integration test - from setup.py to publish.py. 
+The E2E Test can be run in different ways: ```shell +# Using Pytest directly uv run pytest tests_e2e/test_e2e.py -v -``` - -Or using the task runner: -```shell +# Using the task runner uv run poe test_e2e ``` +To use custom endpoints, evaluation models or metrics you can set the following environment variables before running the E2E Test: + **Configuration via Environment Variables:** ```shell -export E2E_DATASET_URL="http://localhost:8000/dataset.json" -export E2E_AGENT_URL="http://localhost:11010" +export E2E_DATASET_URL="http://data-server.data-server:8000/dataset.csv" +export E2E_AGENT_URL="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent export E2E_MODEL="gemini-2.5-flash-lite" export E2E_METRICS="faithfulness,answer_relevancy" export E2E_WORKFLOW_NAME="Test Workflow" export E2E_OTLP_ENDPOINT="localhost:4318" -pytest tests_e2e/test_e2e.py -v +uv run pytest tests_e2e/test_e2e.py -v ``` ---- @@ -481,65 +284,8 @@ uv run poe lint # Auto-fix linting issues ---- -## Troubleshooting - -### "Source dataset is missing required columns" - -**Problem**: Dataset doesn't have the required schema. - -**Solution**: - -- Verify your dataset has columns: `user_input`, `retrieved_contexts`, and `reference` -- Check that column names match exactly (case-sensitive) -- Ensure `retrieved_contexts` is formatted as a list (see Dataset Requirements) - -Example fix for CSV: - -```csv -# Wrong (missing columns) -question,context,answer - -# Correct -user_input,retrieved_contexts,reference -``` - -### "No results found in experiment" - -**Problem**: `evaluate.py` can't find experiment results. - -**Solution**: - -- Check if `data/experiments/ragas_experiment.jsonl` exists -- Verify `run.py` completed successfully without errors -- Ensure the agent URL was accessible during execution -- Check file permissions on the `data/` directory - -### CSV List Conversion Issues - -**Problem**: `retrieved_contexts` not parsing correctly from CSV. 
- -**Solution**: - -- Ensure lists are formatted as Python array strings: `"['item1', 'item2']"` -- Use proper quoting in CSV: wrap the entire array string in double quotes -- Consider using JSON or Parquet format for complex data types - -Example: - -```csv -user_input,retrieved_contexts,reference -"What is X?","['Context about X', 'More context']","X is..." -``` - -### Evaluation Metrics Fail - -**Problem**: Certain metrics fail during evaluation. - -**Solution**: - -- Some metrics require the `reference` field (e.g., `context_precision`, `context_recall`) -- Verify your dataset includes all required fields for the metrics you're using -- Check the RAGAS documentation for metric-specific requirements +## Detailed Usage +See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md) ---- From 48ee528f585700990b3405b4f64e31f350dbe41e Mon Sep 17 00:00:00 2001 From: MiaAppel Date: Thu, 4 Dec 2025 11:36:54 +0100 Subject: [PATCH 3/8] DOCS: Update documentation --- README.md | 49 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 59d1bc7..66415db 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Agentic Layer Test Bench - Automated Agent Evaluation System -An automated evaluation and testing system for AI agents based on **Testkube** and using the **RAGAS** +A **Kubernetes** native, automated evaluation and testing system for AI agents based on **Testkube** and using the **RAGAS** framework. This system downloads test datasets, executes queries through agents via the **A2A** protocol, evaluates responses using configurable metrics, and publishes results to **OpenTelemetry** for monitoring. 
@@ -17,6 +17,29 @@ This project provides a complete pipeline for evaluating AI agent performance: - **Flexible Evaluation**: Evaluate agent replies using Ragas Metrics - **Observability**: Publish metrics to OpenTelemetry endpoints for monitoring and analysis +### Example Output: + +```json +{ + "overall_scores": { + "faithfulness": 0.95, + "answer_relevancy": 0.98 + }, + "individual_results": [ + { + "user_input": "What is the capital of France?", + "response": "Paris is the capital of France.", + "faithfulness": 0.95, + "answer_relevancy": 0.98 + } + ], + "total_tokens": { + "input_tokens": 1500, + "output_tokens": 500 + }, + "total_cost": 0.015 +} +``` ---- ## Table of Contents @@ -26,7 +49,6 @@ This project provides a complete pipeline for evaluating AI agent performance: - [Getting Started](#getting-started) - [Setup with Testkube](#setup-with-testkube) - [Local Setup](#local-setup) -- [Quick Start](#quick-start) - [Detailed Usage](#detailed-usage) - [Dataset Requirements](#dataset-requirements) - [Testing](#testing) @@ -70,17 +92,20 @@ data/results/evaluation_scores.json OpenTelemetry Collector ``` -### Key Design Principles - -- **RAGAS-Native Format**: Uses RAGAS column names (`user_input`, `response`, `retrieved_contexts`, `reference`) - throughout -- **JSONL Backend**: Internal storage uses JSONL for native list support -- **Format-Aware Input**: Intelligent handling of CSV (list conversion), JSON, and Parquet formats - ---- ## Prerequisites +### Setup with Testkube + +- **Testkube CLI** +- **Kubernetes Cluster**: either cloud-deployed or locally (e.g. 
kind) +- **Docker** +- **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation) +- **OTLP Endpoint**: Optional, defaults to `localhost:4318` + +### Local Setup + - **Python 3.13+** - **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation) - **OTLP Endpoint**: Optional, defaults to `localhost:4318` @@ -92,7 +117,7 @@ OpenTelemetry Collector Use Tilt to spin up all the required backends: ```shell -Start Tilt in the project root to set up the local Kubernetes environment: +# Start Tilt in the project root to set up the local Kubernetes environment: tilt up ``` @@ -126,7 +151,7 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ ### Local Setup -### Dependencies & Environment Setup +#### Dependencies & Environment Setup ```shell # Install dependencies with uv @@ -136,7 +161,7 @@ uv sync export OPENAI_API_KEY="your-api-key-here" ``` -Run the complete evaluation pipeline in 4 steps: +#### Run the complete evaluation pipeline in 4 steps: ```shell # 1. Download and prepare dataset From 6d4415bc14d3bc35e9dab6fa037cc5d26ce39cbd Mon Sep 17 00:00:00 2001 From: Nicolai Ommer Date: Thu, 4 Dec 2025 12:54:50 +0100 Subject: [PATCH 4/8] ci: Improve retry mechanism for tilt up --- .github/workflows/ci.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 450ae74..fc0fed4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -116,9 +116,15 @@ jobs: - name: Run Tilt CI run: | # The tilt setup is a bit flaky on a fresh startup, so we retry a few times - for i in {1..3}; do + for i in {1..5}; do tilt ci && break + if [ $i -eq 5 ]; then + echo "Tilt CI failed after 5 attempts, exiting." + exit 1 + fi echo "Tilt CI failed, retrying... 
($i/3)" + # Wait a bit for resource to stabilize / cleanup + sleep 10 done - name: Setup Testkube CLI From e3f2fc022f77ccac02acfc50856fc026f266eaae Mon Sep 17 00:00:00 2001 From: MiaAppel Date: Thu, 4 Dec 2025 14:05:25 +0100 Subject: [PATCH 5/8] FIX: setting OPENAI_API_KEY to placeholder not longer necessary, cleaned up Docs --- README.md | 32 +++++++++++++------------------- scripts/evaluate.py | 5 ++++- tests_e2e/test_e2e.py | 4 ++-- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 66415db..afd77cf 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ This project provides a complete pipeline for evaluating AI agent performance: - [Getting Started](#getting-started) - [Setup with Testkube](#setup-with-testkube) - [Local Setup](#local-setup) -- [Detailed Usage](#detailed-usage) +- [Detailed Usage & Troubleshooting](#detailed-usage--troubleshooting) - [Dataset Requirements](#dataset-requirements) - [Testing](#testing) - [Development](#development) @@ -101,20 +101,22 @@ OpenTelemetry Collector - **Testkube CLI** - **Kubernetes Cluster**: either cloud-deployed or locally (e.g. kind) - **Docker** -- **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation) +- **API Key**: `GOOGLE_API_KEY` environment variable - **OTLP Endpoint**: Optional, defaults to `localhost:4318` ### Local Setup - **Python 3.13+** -- **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation) +- **API Key**: `GOOGLE_API_KEY` environment variable - **OTLP Endpoint**: Optional, defaults to `localhost:4318` ---- ## Getting Started -Use Tilt to spin up all the required backends: +1. Create a `.env` file in the root directory +2. Set the `GOOGLE_API_KEY=` variable in the `.env` +3. 
Use Tilt to spin up all the required backends: ```shell # Start Tilt in the project root to set up the local Kubernetes environment: @@ -141,7 +143,7 @@ Run the RAGAS evaluation workflow with all optional parameters: kubectl testkube run testworkflow ragas-evaluation-workflow \ --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \ - --config metrics="nv_accuracy context_recall" + --config metrics="nv_accuracy context_recall" \ --config workflowName="Testworkflow-Name" \ --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \ --config model="gemini/gemini-2.5-flash" \ @@ -158,14 +160,14 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ uv sync # Required for evaluation -export OPENAI_API_KEY="your-api-key-here" +export OPENAI_API_BASE="http://localhost:11001" ``` #### Run the complete evaluation pipeline in 4 steps: ```shell # 1. Download and prepare dataset -uv run python3 scripts/setup.py "https://example.com/dataset.csv" +uv run python3 scripts/setup.py "http://localhost:11020/dataset.csv" # 2. Execute queries through your agent uv run python3 scripts/run.py "http://localhost:8000" @@ -179,7 +181,8 @@ uv run python3 scripts/publish.py "my-agent-evaluation" ---- -## Detailed Usage +## Detailed Usage & Troubleshooting + See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md) ---- @@ -248,13 +251,9 @@ uv run poe test ### End-to-End Tests The E2E Test (found at `tests_e2e/test_e2e.py`) runs a complete pipeline integration test - from setup.py to publish.py. 
-The E2E Test can be run in different ways: +The E2E Test can be run using the `poe` task runner: ```shell -# Using Pytest directly -uv run pytest tests_e2e/test_e2e.py -v - -# Using the task runner uv run poe test_e2e ``` @@ -264,7 +263,7 @@ To use custom endpoints, evaluation models or metrics you can set the following ```shell export E2E_DATASET_URL="http://data-server.data-server:8000/dataset.csv" -export E2E_AGENT_URL="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent +export E2E_AGENT_URL="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" export E2E_MODEL="gemini-2.5-flash-lite" export E2E_METRICS="faithfulness,answer_relevancy" export E2E_WORKFLOW_NAME="Test Workflow" @@ -309,11 +308,6 @@ uv run poe lint # Auto-fix linting issues ---- -## Detailed Usage -See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md) - ----- - ## Contributing See [Contribution Guide](https://github.com/agentic-layer/testbench?tab=contributing-ov-file) for details on diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 275207d..01477af 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -163,7 +163,10 @@ def main( raise ArgumentError(argument=metrics, message="No metrics were provided as arguments") # Create LLM client using the AI-Gateway - ragas_llm: ChatOpenAI = ChatOpenAI(model=model) + # Setting a placeholder for the api_key since we instantiate a ChatOpenAI object, + # but the AI-Gateway actually uses Gemini under the hood. 
+ # Not setting api_key here results in an OpenAIError + ragas_llm: ChatOpenAI = ChatOpenAI(model=model, api_key="123") llm = LangchainLLMWrapper(ragas_llm) # type: ignore[arg-type] dataset = EvaluationDataset.from_jsonl("data/experiments/ragas_experiment.jsonl") diff --git a/tests_e2e/test_e2e.py b/tests_e2e/test_e2e.py index 1d39d4e..913285c 100755 --- a/tests_e2e/test_e2e.py +++ b/tests_e2e/test_e2e.py @@ -9,7 +9,7 @@ pytest tests/test_e2e.py # With custom configuration via environment variables: - E2E_DATASET_URL="https://example.com/data.csv" \ + E2E_DATASET_URL="http://localhost:11020/dataset.csv" \ E2E_AGENT_URL="http://localhost:8000" \ E2E_MODEL="gemini-flash-latest" \ E2E_METRICS="faithfulness,answer_relevancy" \ @@ -231,7 +231,7 @@ def test_e2e_pipeline(): """ # Get configuration from environment variables with sensible defaults - dataset_url = os.getenv("E2E_DATASET_URL", "http://data-server.data-server:8000/dataset.csv") + dataset_url = os.getenv("E2E_DATASET_URL", "http://localhost:11020/dataset.csv") agent_url = os.getenv("E2E_AGENT_URL", "http://localhost:11010") model = os.getenv("E2E_MODEL", "gemini-2.5-flash-lite") metrics_str = os.getenv("E2E_METRICS", "faithfulness") From 46c1d402e2e17ee42444841bff8c9d84b2e4ed2c Mon Sep 17 00:00:00 2001 From: MiaAppel Date: Thu, 4 Dec 2025 14:15:30 +0100 Subject: [PATCH 6/8] FIX: test_evaluate.py to accept new llm format --- tests/test_evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index b1cd89b..738d670 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -218,7 +218,7 @@ def mock_evaluate(dataset, metrics, llm, token_usage_parser): class MockChatOpenAI: pass - def mock_chat_openai_init(model): + def mock_chat_openai_init(model, api_key): return MockChatOpenAI() class MockLLMWrapper: From b8e31cd49085e5d3e45b396e6abda8119ca49adc Mon Sep 17 00:00:00 2001 From: MiaAppel Date: Thu, 4 Dec 2025 14:18:30 +0100 Subject: 
[PATCH 7/8] FIX: MyPy Type issue --- scripts/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 01477af..c6d32d4 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -10,6 +10,7 @@ import ragas.metrics as metrics_module from langchain_openai import ChatOpenAI +from pydantic import SecretStr from ragas import evaluate from ragas.cost import get_token_usage_for_openai from ragas.dataset_schema import EvaluationDataset, EvaluationResult @@ -166,7 +167,7 @@ def main( # Setting a placeholder for the api_key since we instantiate a ChatOpenAI object, # but the AI-Gateway actually uses Gemini under the hood. # Not setting api_key here results in an OpenAIError - ragas_llm: ChatOpenAI = ChatOpenAI(model=model, api_key="123") + ragas_llm: ChatOpenAI = ChatOpenAI(model=model, api_key=SecretStr("123")) llm = LangchainLLMWrapper(ragas_llm) # type: ignore[arg-type] dataset = EvaluationDataset.from_jsonl("data/experiments/ragas_experiment.jsonl") From 955116178a76c1e4610e2331ba3ce3897bd495cf Mon Sep 17 00:00:00 2001 From: MiaAppel Date: Tue, 9 Dec 2025 08:18:07 +0100 Subject: [PATCH 8/8] DOCS: add small clarification --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index afd77cf..692e088 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,7 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ # Install dependencies with uv uv sync -# Required for evaluation +# Required for evaluation - routes requests through our AI Gateway export OPENAI_API_BASE="http://localhost:11001" ```