From cb886287dc27b9bcf43aa520f5e2af37e6aae095 Mon Sep 17 00:00:00 2001
From: MiaAppel <mia.appel@qaware.de>
Date: Tue, 25 Nov 2025 14:36:26 +0100
Subject: [PATCH 1/8] FIX: Port-Forwardings so E2E Test runs locally

---
 Tiltfile              | 1 +
 tests_e2e/test_e2e.py | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Tiltfile b/Tiltfile
index 20f8284..85524a3 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -39,6 +39,7 @@ k8s_yaml(kustomize('deploy/local'))
 k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000'])
 k8s_resource('weather-agent', port_forwards='11010:8000', labels=['agents'], resource_deps=['agent-runtime'])
 k8s_resource('lgtm', port_forwards=['11000:3000'])
+k8s_resource('data-server', port_forwards='11020:8000')
 
 # Declare Testkube resources
 k8s_kind(
diff --git a/tests_e2e/test_e2e.py b/tests_e2e/test_e2e.py
index 34d78a3..1d39d4e 100755
--- a/tests_e2e/test_e2e.py
+++ b/tests_e2e/test_e2e.py
@@ -2,7 +2,7 @@
 End-to-end test that runs all scripts in the correct order:
 1. setup.py - Downloads, converts and saves Ragas Dataset to data/datasets/ragas_dataset.jsonl
 2. run.py - Runs agent queries on the dataset and saves Ragas Experiment to data/experiments/ragas_experiment.jsonl
-3. evaluate.py - Evaluates results using RAGAS metrics and saves result to results/evaluation_scores.json
+3. evaluate.py - Evaluates results using RAGAS metrics and saves result to data/results/evaluation_scores.json
 4. publish.py - Publishes metrics via OpenTelemetry OTLP
 
 Usage:
@@ -57,7 +57,7 @@ def __init__(
         # Define expected output files
         self.dataset_file = Path("./data/datasets/ragas_dataset.jsonl")
         self.results_file = Path("./data/experiments/ragas_experiment.jsonl")
-        self.evaluation_file = Path("./results/evaluation_scores.json")
+        self.evaluation_file = Path("./data/results/evaluation_scores.json")
 
     def verify_scripts_exist(self) -> bool:
         """Verify that all required scripts exist."""
@@ -231,7 +231,7 @@ def test_e2e_pipeline():
     """
 
     # Get configuration from environment variables with sensible defaults
-    dataset_url = os.getenv("E2E_DATASET_URL", "http://localhost:8000/dataset.json")
+    dataset_url = os.getenv("E2E_DATASET_URL", "http://data-server.data-server:8000/dataset.csv")
     agent_url = os.getenv("E2E_AGENT_URL", "http://localhost:11010")
     model = os.getenv("E2E_MODEL", "gemini-2.5-flash-lite")
     metrics_str = os.getenv("E2E_METRICS", "faithfulness")

From b0321902cc931e4272365ca7d34a4256d5105f33 Mon Sep 17 00:00:00 2001
From: MiaAppel <mia.appel@qaware.de>
Date: Thu, 4 Dec 2025 11:25:51 +0100
Subject: [PATCH 2/8] DOCS: Update documentation

---
 DetailedUsageAndTroubleshooting.md | 257 ++++++++++++++++++++++
 README.md                          | 334 ++++-------------------------
 2 files changed, 297 insertions(+), 294 deletions(-)
 create mode 100644 DetailedUsageAndTroubleshooting.md

diff --git a/DetailedUsageAndTroubleshooting.md b/DetailedUsageAndTroubleshooting.md
new file mode 100644
index 0000000..1672c8e
--- /dev/null
+++ b/DetailedUsageAndTroubleshooting.md
@@ -0,0 +1,257 @@
+## Detailed Usage
+
+### 1. setup.py - Dataset Preparation
+
+Downloads and converts test datasets to RAGAS-native JSONL format.
+
+**Syntax:**
+
+```shell
+python3 scripts/setup.py <dataset_url>
+```
+
+**Arguments:**
+
+- `dataset_url` (required): URL to dataset file (`.csv`, `.json`, or `.parquet`)
+
+**Required Dataset Schema:**
+
+- See [Dataset Requirements](README.md#dataset-requirements)
+
+**Output:**
+
+- `data/datasets/ragas_dataset.jsonl` - RAGAS Dataset in JSONL format
+
+---
+
+### 2. run.py - Agent Query Execution
+
+Executes test queries through an agent using the A2A protocol and collects responses.
+
+**Syntax:**
+
+```shell
+python3 scripts/run.py <agent_url>
+```
+
+**Arguments:**
+
+- `agent_url` (required): URL to the agent's A2A endpoint
+
+**Input:**
+
+- `data/datasets/ragas_dataset.jsonl` (loaded automatically)
+
+**Output:**
+
+- `data/experiments/ragas_experiment.jsonl` - Agent responses with preserved context
+
+**Output Schema:**
+
+```jsonl
+{"user_input": "What is X?", "retrieved_contexts": ["Context about X"], "reference": "X is...", "response": "Agent's answer"}
+```
+
+**Notes:**
+
+- Uses asynchronous A2A client for efficient communication
+- Preserves all original dataset fields
+- Automatically handles response streaming
+
+---
+
+### 3. evaluate.py - RAGAS Metric Evaluation
+
+Evaluates agent responses using configurable RAGAS metrics and calculates costs.
+
+**Syntax:**
+
+```shell
+python3 scripts/evaluate.py <model> <metric1> [metric2 ...] [--cost-per-input COST] [--cost-per-output COST]
+```
+
+**Arguments:**
+
+- `model` (required): Model name for evaluation (e.g., `gemini-2.5-flash-lite`, `gpt-4`)
+- `metrics` (required): One or more RAGAS metric names
+- `--cost-per-input` (optional): Cost per input token (default: 0.000005, i.e., $5 per 1M tokens)
+- `--cost-per-output` (optional): Cost per output token (default: 0.000015, i.e., $15 per 1M tokens)
+
+### **Available Metrics:**
+
+| Metric                                    | Special required columns |
+|-------------------------------------------|--------------------------|
+| `faithfulness`                            | retrieved_contexts       |
+| `context_precision`                       | retrieved_contexts       |
+| `context_recall`                          | retrieved_contexts       |
+| `context_entity_recall`                   | retrieved_contexts       |
+| `context_utilization`                     | retrieved_contexts       |
+| `llm_context_precision_with_reference`    | retrieved_contexts       |
+| `llm_context_precision_without_reference` | retrieved_contexts       |
+| `faithful_rate`                           | retrieved_contexts       |
+| `relevance_rate`                          | retrieved_contexts       |
+| `noise_sensitivity`                       | retrieved_contexts       |
+| `factual_correctness`                     |                          |
+| `domain_specific_rubrics`                 |                          |
+| `nv_accuracy`                             |                          |
+| `nv_context_relevance`                    | retrieved_contexts       |
+| `nv_response_groundedness`                | retrieved_contexts       |
+| `string_present`                          |                          |
+| `exact_match`                             |                          |
+| `summary_score`                           | reference_contexts       |
+| `llm_sql_equivalence_with_reference`      | reference_contexts       |
+
+**Input:**
+
+- `data/experiments/ragas_experiment.jsonl` (loaded automatically)
+
+**Examples:**
+
+```shell
+# Single metric
+python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness
+
+# Multiple metrics
+python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy context_precision
+
+# Custom token costs
+python3 scripts/evaluate.py gpt-4 faithfulness answer_correctness \
+  --cost-per-input 0.00003 \
+  --cost-per-output 0.00006
+```
+
+**Output:**
+
+- `data/results/evaluation_scores.json` - Evaluation results with metrics, token usage, and costs
+
+**Output Format:**
+
+```json
+{
+  "overall_scores": {
+    "faithfulness": 0.95,
+    "answer_relevancy": 0.98
+  },
+  "individual_results": [
+    {
+      "user_input": "What is the capital of France?",
+      "response": "Paris is the capital of France.",
+      "faithfulness": 0.95,
+      "answer_relevancy": 0.98
+    }
+  ],
+  "total_tokens": {
+    "input_tokens": 1500,
+    "output_tokens": 500
+  },
+  "total_cost": 0.015
+}
+```
+
+**Notes:**
+
+- Currently only supports **SingleTurnSample** metrics (see [Available Metrics](#available-metrics))
+- Dynamically discovers available metrics from `ragas.metrics` module
+- Invalid metric names will show available options
+- Token costs can be customized per model pricing
+
+---
+
+### 4. publish.py - Metrics Publishing
+
+Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring.
+
+**Syntax:**
+
+```shell
+python3 scripts/publish.py <workflow_name> [otlp_endpoint]
+```
+
+**Arguments:**
+
+- `workflow_name` (required): Name of the test workflow (used as metric label)
+- `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`)
+
+**Input:**
+
+- `data/results/evaluation_scores.json` (loaded automatically)
+
+**Published Metrics:**
+
+Each RAGAS metric is published as a gauge with the workflow name as an attribute:
+
+```
+ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85
+ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92
+```
+
+**Notes:**
+
+- Sends metrics to `/v1/metrics` endpoint
+- Uses resource with `service.name="ragas-evaluation"`
+- Forces flush to ensure delivery before exit
+
+
+----
+
+## Troubleshooting
+
+### "Source dataset is missing required columns"
+
+**Problem**: Dataset doesn't have the required schema.
+
+**Solution**:
+
+- Verify your dataset has columns: `user_input`, `retrieved_contexts`, and `reference`
+- Check that column names match exactly (case-sensitive)
+- Ensure `retrieved_contexts` is formatted as a list (see Dataset Requirements)
+
+Example fix for CSV:
+
+```csv
+# Wrong (missing columns)
+question,context,answer
+
+# Correct
+user_input,retrieved_contexts,reference
+```
+
+### "No results found in experiment"
+
+**Problem**: `evaluate.py` can't find experiment results.
+
+**Solution**:
+
+- Check if `data/experiments/ragas_experiment.jsonl` exists
+- Verify `run.py` completed successfully without errors
+- Ensure the agent URL was accessible during execution
+- Check file permissions on the `data/` directory
+
+### CSV List Conversion Issues
+
+**Problem**: `retrieved_contexts` not parsing correctly from CSV.
+
+**Solution**:
+
+- Ensure lists are formatted as Python array strings: `"['item1', 'item2']"`
+- Use proper quoting in CSV: wrap the entire array string in double quotes
+- Consider using JSON or Parquet format for complex data types
+
+Example:
+
+```csv
+user_input,retrieved_contexts,reference
+"What is X?","['Context about X', 'More context']","X is..."
+```
+
+### Evaluation Metrics Fail
+
+**Problem**: Certain metrics fail during evaluation.
+
+**Solution**:
+
+- Some metrics require the `reference` field (e.g., `context_precision`, `context_recall`)
+- Verify your dataset includes all required fields for the metrics you're using
+- Check the RAGAS documentation for metric-specific requirements
+
+----
\ No newline at end of file
diff --git a/README.md b/README.md
index e56ddfd..59d1bc7 100644
--- a/README.md
+++ b/README.md
@@ -1,40 +1,41 @@
 # Agentic Layer Test Bench - Automated Agent Evaluation System
 
-An automated evaluation and testing system for AI agents using the **RAGAS** (Retrieval Augmented Generation Assessment)
+An automated evaluation and testing system for AI agents based on **Testkube** and using the **RAGAS**
 framework. This system downloads test datasets, executes queries through agents via the **A2A** protocol, evaluates
 responses using configurable metrics, and publishes results to **OpenTelemetry** for monitoring.
 
 ----
 
+## Overview
+
+This project provides a complete pipeline for evaluating AI agent performance:
+
+- **Cloud-Native**: Easily deployable to your Kubernetes cluster
+- **Local Support**: Test and evaluate agents locally
+- **Automated Testing**: Run predefined test queries through your agents
+- **Multi-Format Support**: Support for datasets in CSV, JSON & Parquet formats
+- **Flexible Evaluation**: Evaluate agent replies using Ragas Metrics
+- **Observability**: Publish metrics to OpenTelemetry endpoints for monitoring and analysis
+
+----
+
 ## Table of Contents
 
 - [Architecture](#architecture)
 - [Prerequisites](#prerequisites)
 - [Getting Started](#getting-started)
+  - [Setup with Testkube](#setup-with-testkube)
+  - [Local Setup](#local-setup)
 - [Quick Start](#quick-start)
 - [Detailed Usage](#detailed-usage)
 - [Dataset Requirements](#dataset-requirements)
 - [Testing](#testing)
 - [Development](#development)
 - [Troubleshooting](#troubleshooting)
-- [Project Structure](#project-structure)
 - [Contributing](#contributing)
 
 ----
 
-## Overview
-
-This project provides a complete pipeline for evaluating AI agent performance:
-
-- **Automated Testing**: Run predefined test queries through your agents
-- **Multi-Format Support**: Accept datasets in CSV, JSON, or Parquet formats
-- **Flexible Evaluation**: Configure multiple RAGAS metrics for comprehensive assessment
-- **Observability**: Publish metrics to OpenTelemetry endpoints for monitoring and analysis
-- **Type-Safe**: Built with type hints and validated with MyPy
-- **Limitation**: Currently only support SingleTurnSample Metrics (see [Available Metrics](#available-metrics))
-
-----
-
 ## Architecture
 
 ```
@@ -60,7 +61,7 @@ data/experiments/ragas_experiment.jsonl
         |              |
         |         LLM Model
         v
-results/evaluation_scores.json
+data/results/evaluation_scores.json
         |
         v
  [4. publish.py] - Publishes to OTLP endpoint
@@ -88,13 +89,15 @@ OpenTelemetry Collector
 
 ## Getting Started
 
-### With Tilt and Local Kubernetes
+Use Tilt to spin up all the required backends:
 
 ```shell
 Start Tilt in the project root to set up the local Kubernetes environment:
 tilt up
 ```
 
+### Setup with Testkube
+
 Run the RAGAS evaluation workflow with minimal setup:
 
 ```shell
@@ -121,239 +124,38 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
     -n testkube
 ```
 
-### Install dependencies using UV
+### Local Setup
+
+### Dependencies & Environment Setup
 
 ```shell
-# Install (dev & prod) dependencies with uv
+# Install dependencies with uv
 uv sync
-```
 
-### Environment Setup
-
-```shell
 # Required for evaluation
 export OPENAI_API_KEY="your-api-key-here"
-
-# Optional: Configure custom OTLP endpoint
-export OTLP_ENDPOINT="http://otlp-collector:4318"
 ```
 
-The system automatically creates the required directories (`data/`, `results/`) on first run.
-
-----
-
-## Quick Start
-
 Run the complete evaluation pipeline in 4 steps:
 
 ```shell
 # 1. Download and prepare dataset
-python3 scripts/setup.py "https://example.com/dataset.csv"
+uv run python3 scripts/setup.py "https://example.com/dataset.csv"
 
 # 2. Execute queries through your agent
-python3 scripts/run.py "http://localhost:8000"
+uv run python3 scripts/run.py "http://localhost:8000"
 
 # 3. Evaluate responses with RAGAS metrics
-python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy
+uv run python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy
 
 # 4. Publish metrics to OpenTelemetry
-python3 scripts/publish.py "my-agent-evaluation"
+uv run python3 scripts/publish.py "my-agent-evaluation"
 ```
 
 ----
 
 ## Detailed Usage
-
-### 1. setup.py - Dataset Preparation
-
-Downloads and converts test datasets to RAGAS-native JSONL format.
-
-**Syntax:**
-
-```shell
-python3 scripts/setup.py <dataset_url>
-```
-
-**Arguments:**
-
-- `dataset_url` (required): URL to dataset file (`.csv`, `.json`, or `.parquet`)
-
-**Required Dataset Schema:**
-
-- See [Dataset Requirements](#dataset-requirements)
-
-**Output:**
-
-- `data/datasets/ragas_dataset.jsonl` - RAGAS Dataset in JSONL format
-
----
-
-### 2. run.py - Agent Query Execution
-
-Executes test queries through an agent using the A2A protocol and collects responses.
-
-**Syntax:**
-
-```shell
-python3 scripts/run.py <agent_url>
-```
-
-**Arguments:**
-
-- `agent_url` (required): URL to the agent's A2A endpoint
-
-**Input:**
-
-- `data/datasets/ragas_dataset.jsonl` (loaded automatically)
-
-**Output:**
-
-- `data/experiments/ragas_experiment.jsonl` - Agent responses with preserved context
-
-**Output Schema:**
-
-```jsonl
-{"user_input": "What is X?", "retrieved_contexts": ["Context about X"], "reference": "X is...", "response": "Agent's answer"}
-```
-
-**Notes:**
-
-- Uses asynchronous A2A client for efficient communication
-- Preserves all original dataset fields
-- Automatically handles response streaming
-
----
-
-### 3. evaluate.py - RAGAS Metric Evaluation
-
-Evaluates agent responses using configurable RAGAS metrics and calculates costs.
-
-**Syntax:**
-
-```shell
-python3 scripts/evaluate.py <model> <metric1> [metric2 ...] [--cost-per-input COST] [--cost-per-output COST]
-```
-
-**Arguments:**
-
-- `model` (required): Model name for evaluation (e.g., `gemini-2.5-flash-lite`, `gpt-4`)
-- `metrics` (required): One or more RAGAS metric names
-- `--cost-per-input` (optional): Cost per input token (default: 0.000005, i.e., $5 per 1M tokens)
-- `--cost-per-output` (optional): Cost per output token (default: 0.000015, i.e., $15 per 1M tokens)
-
-### **Available Metrics:**
-
-| Metric                                    | Special required columns |
-|-------------------------------------------|--------------------------|
-| `faithfulness`                            | retrieved_contexts       |
-| `context_precision`                       | retrieved_contexts       |
-| `context_recall`                          | retrieved_contexts       |
-| `context_entity_recall`                   | retrieved_contexts       |
-| `context_utilization`                     | retrieved_contexts       |
-| `llm_context_precision_with_reference`    | retrieved_contexts       |
-| `llm_context_precision_without_reference` | retrieved_contexts       |
-| `faithful_rate`                           | retrieved_contexts       |
-| `relevance_rate`                          | retrieved_contexts       |
-| `noise_sensitivity`                       | retrieved_contexts       |
-| `factual_correctness`                     |                          |
-| `domain_specific_rubrics`                 |                          |
-| `nv_accuracy`                             |                          |
-| `nv_context_relevance`                    | retrieved_contexts       |
-| `nv_response_groundedness`                | retrieved_contexts       |
-| `string_present`                          |                          |
-| `exact_match`                             |                          |
-| `summary_score`                           | reference_contexts       |
-| `llm_sql_equivalence_with_reference`      | reference_contexts       |
-
-**Input:**
-
-- `data/experiments/ragas_experiment.jsonl` (loaded automatically)
-
-**Examples:**
-
-```shell
-# Single metric
-python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness
-
-# Multiple metrics
-python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy context_precision
-
-# Custom token costs
-python3 scripts/evaluate.py gpt-4 faithfulness answer_correctness \
-  --cost-per-input 0.00003 \
-  --cost-per-output 0.00006
-```
-
-**Output:**
-
-- `results/evaluation_scores.json` - Evaluation results with metrics, token usage, and costs
-
-**Output Format:**
-
-```json
-{
-  "overall_scores": {
-    "faithfulness": 0.95,
-    "answer_relevancy": 0.98
-  },
-  "individual_results": [
-    {
-      "user_input": "What is the capital of France?",
-      "response": "Paris is the capital of France.",
-      "faithfulness": 0.95,
-      "answer_relevancy": 0.98
-    }
-  ],
-  "total_tokens": {
-    "input_tokens": 1500,
-    "output_tokens": 500
-  },
-  "total_cost": 0.015
-}
-```
-
-**Notes:**
-
-- Currently only support **SingleTurnSample** Metrics (see [Available Metrics](#available-metrics))
-- Dynamically discovers available metrics from `ragas.metrics` module
-- Invalid metric names will show available options
-- Token costs can be customized per model pricing
-
----
-
-### 4. publish.py - Metrics Publishing
-
-Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring.
-
-**Syntax:**
-
-```shell
-python3 scripts/publish.py <workflow_name> [otlp_endpoint]
-```
-
-**Arguments:**
-
-- `workflow_name` (required): Name of the test workflow (used as metric label)
-- `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`)
-
-**Input:**
-
-- `results/evaluation_scores.json` (loaded automatically)
-
-**Published Metrics:**
-
-Each RAGAS metric is published as a gauge with the workflow name as an attribute:
-
-```
-ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85
-ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92
-```
-
-**Notes:**
-
-- Sends metrics to `/v1/metrics` endpoint
-- Uses resource with `service.name="ragas-evaluation"`
-- Forces flush to ensure delivery before exit
+See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md)
 
 ----
 
@@ -420,29 +222,30 @@ uv run poe test
 
 ### End-to-End Tests
 
-Run the complete pipeline integration test:
+The E2E Test (found at `tests_e2e/test_e2e.py`) runs a complete pipeline integration test - from setup.py to publish.py.
+The E2E Test can be run in different ways:
 
 ```shell
+# Using Pytest directly
 uv run pytest tests_e2e/test_e2e.py -v
-```
-
-Or using the task runner:
 
-```shell
+# Using the task runner
 uv run poe test_e2e
 ```
 
+To use custom endpoints, evaluation models or metrics you can set the following environment variables before running the E2E Test:
+
 **Configuration via Environment Variables:**
 
 ```shell
-export E2E_DATASET_URL="http://localhost:8000/dataset.json"
-export E2E_AGENT_URL="http://localhost:11010"
+export E2E_DATASET_URL="http://data-server.data-server:8000/dataset.csv"
+export E2E_AGENT_URL="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent
 export E2E_MODEL="gemini-2.5-flash-lite"
 export E2E_METRICS="faithfulness,answer_relevancy"
 export E2E_WORKFLOW_NAME="Test Workflow"
 export E2E_OTLP_ENDPOINT="localhost:4318"
 
-pytest tests_e2e/test_e2e.py -v
+uv run pytest tests_e2e/test_e2e.py -v
 ```
 
 ----
@@ -481,65 +284,8 @@ uv run poe lint          # Auto-fix linting issues
 
 ----
 
-## Troubleshooting
-
-### "Source dataset is missing required columns"
-
-**Problem**: Dataset doesn't have the required schema.
-
-**Solution**:
-
-- Verify your dataset has columns: `user_input`, `retrieved_contexts`, and `reference`
-- Check that column names match exactly (case-sensitive)
-- Ensure `retrieved_contexts` is formatted as a list (see Dataset Requirements)
-
-Example fix for CSV:
-
-```csv
-# Wrong (missing columns)
-question,context,answer
-
-# Correct
-user_input,retrieved_contexts,reference
-```
-
-### "No results found in experiment"
-
-**Problem**: `evaluate.py` can't find experiment results.
-
-**Solution**:
-
-- Check if `data/experiments/ragas_experiment.jsonl` exists
-- Verify `run.py` completed successfully without errors
-- Ensure the agent URL was accessible during execution
-- Check file permissions on the `data/` directory
-
-### CSV List Conversion Issues
-
-**Problem**: `retrieved_contexts` not parsing correctly from CSV.
-
-**Solution**:
-
-- Ensure lists are formatted as Python array strings: `"['item1', 'item2']"`
-- Use proper quoting in CSV: wrap the entire array string in double quotes
-- Consider using JSON or Parquet format for complex data types
-
-Example:
-
-```csv
-user_input,retrieved_contexts,reference
-"What is X?","['Context about X', 'More context']","X is..."
-```
-
-### Evaluation Metrics Fail
-
-**Problem**: Certain metrics fail during evaluation.
-
-**Solution**:
-
-- Some metrics require the `reference` field (e.g., `context_precision`, `context_recall`)
-- Verify your dataset includes all required fields for the metrics you're using
-- Check the RAGAS documentation for metric-specific requirements
+## Detailed Usage
+See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md)
 
 ----
 

From 48ee528f585700990b3405b4f64e31f350dbe41e Mon Sep 17 00:00:00 2001
From: MiaAppel <mia.appel@qaware.de>
Date: Thu, 4 Dec 2025 11:36:54 +0100
Subject: [PATCH 3/8] DOCS: Update documentation

---
 README.md | 49 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 59d1bc7..66415db 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Agentic Layer Test Bench - Automated Agent Evaluation System
 
-An automated evaluation and testing system for AI agents based on **Testkube** and using the **RAGAS**
+A **Kubernetes** native, automated evaluation and testing system for AI agents based on **Testkube** and using the **RAGAS**
 framework. This system downloads test datasets, executes queries through agents via the **A2A** protocol, evaluates
 responses using configurable metrics, and publishes results to **OpenTelemetry** for monitoring.
 
@@ -17,6 +17,29 @@ This project provides a complete pipeline for evaluating AI agent performance:
 - **Flexible Evaluation**: Evaluate agent replies using Ragas Metrics
 - **Observability**: Publish metrics to OpenTelemetry endpoints for monitoring and analysis
 
+### Example Output:
+
+```json
+{
+  "overall_scores": {
+    "faithfulness": 0.95,
+    "answer_relevancy": 0.98
+  },
+  "individual_results": [
+    {
+      "user_input": "What is the capital of France?",
+      "response": "Paris is the capital of France.",
+      "faithfulness": 0.95,
+      "answer_relevancy": 0.98
+    }
+  ],
+  "total_tokens": {
+    "input_tokens": 1500,
+    "output_tokens": 500
+  },
+  "total_cost": 0.015
+}
+```
 ----
 
 ## Table of Contents
@@ -26,7 +49,6 @@ This project provides a complete pipeline for evaluating AI agent performance:
 - [Getting Started](#getting-started)
   - [Setup with Testkube](#setup-with-testkube)
   - [Local Setup](#local-setup)
-- [Quick Start](#quick-start)
 - [Detailed Usage](#detailed-usage)
 - [Dataset Requirements](#dataset-requirements)
 - [Testing](#testing)
@@ -70,17 +92,20 @@ data/results/evaluation_scores.json
 OpenTelemetry Collector
 ```
 
-### Key Design Principles
-
-- **RAGAS-Native Format**: Uses RAGAS column names (`user_input`, `response`, `retrieved_contexts`, `reference`)
-  throughout
-- **JSONL Backend**: Internal storage uses JSONL for native list support
-- **Format-Aware Input**: Intelligent handling of CSV (list conversion), JSON, and Parquet formats
-
 ----
 
 ## Prerequisites
 
+### Setup with Testkube
+
+- **Testkube CLI**
+- **Kubernetes Cluster**: either cloud-deployed or locally (e.g. kind)
+- **Docker**
+- **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation)
+- **OTLP Endpoint**: Optional, defaults to `localhost:4318`
+
+### Local Setup
+
 - **Python 3.13+**
 - **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation)
 - **OTLP Endpoint**: Optional, defaults to `localhost:4318`
@@ -92,7 +117,7 @@ OpenTelemetry Collector
 Use Tilt to spin up all the required backends:
 
 ```shell
-Start Tilt in the project root to set up the local Kubernetes environment:
+# Start Tilt in the project root to set up the local Kubernetes environment:
 tilt up
 ```
 
@@ -126,7 +151,7 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
 
 ### Local Setup
 
-### Dependencies & Environment Setup
+#### Dependencies & Environment Setup
 
 ```shell
 # Install dependencies with uv
@@ -136,7 +161,7 @@ uv sync
 export OPENAI_API_KEY="your-api-key-here"
 ```
 
-Run the complete evaluation pipeline in 4 steps:
+#### Run the complete evaluation pipeline in 4 steps:
 
 ```shell
 # 1. Download and prepare dataset

From 6d4415bc14d3bc35e9dab6fa037cc5d26ce39cbd Mon Sep 17 00:00:00 2001
From: Nicolai Ommer <nicolai.ommer@qaware.de>
Date: Thu, 4 Dec 2025 12:54:50 +0100
Subject: [PATCH 4/8] ci: Improve retry mechanism for tilt up

---
 .github/workflows/ci.yml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 450ae74..fc0fed4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -116,9 +116,15 @@ jobs:
       - name: Run Tilt CI
         run: |
           # The tilt setup is a bit flaky on a fresh startup, so we retry a few times
-          for i in {1..3}; do
+          for i in {1..5}; do
             tilt ci && break
+            if [ $i -eq 5 ]; then
+              echo "Tilt CI failed after 5 attempts, exiting."
+              exit 1
+            fi
-            echo "Tilt CI failed, retrying... ($i/3)"
+            echo "Tilt CI failed, retrying... ($i/5)"
+            # Wait a bit for resources to stabilize / clean up
+            sleep 10
           done
 
       - name: Setup Testkube CLI

From e3f2fc022f77ccac02acfc50856fc026f266eaae Mon Sep 17 00:00:00 2001
From: MiaAppel <mia.appel@qaware.de>
Date: Thu, 4 Dec 2025 14:05:25 +0100
Subject: [PATCH 5/8] FIX: setting OPENAI_API_KEY to placeholder no longer
 necessary, cleaned up Docs

---
 README.md             | 32 +++++++++++++-------------------
 scripts/evaluate.py   |  5 ++++-
 tests_e2e/test_e2e.py |  4 ++--
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 66415db..afd77cf 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ This project provides a complete pipeline for evaluating AI agent performance:
 - [Getting Started](#getting-started)
   - [Setup with Testkube](#setup-with-testkube)
   - [Local Setup](#local-setup)
-- [Detailed Usage](#detailed-usage)
+- [Detailed Usage & Troubleshooting](#detailed-usage--troubleshooting)
 - [Dataset Requirements](#dataset-requirements)
 - [Testing](#testing)
 - [Development](#development)
@@ -101,20 +101,22 @@ OpenTelemetry Collector
 - **Testkube CLI**
 - **Kubernetes Cluster**: either cloud-deployed or locally (e.g. kind)
 - **Docker**
-- **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation)
+- **API Key**: `GOOGLE_API_KEY` environment variable
 - **OTLP Endpoint**: Optional, defaults to `localhost:4318`
 
 ### Local Setup
 
 - **Python 3.13+**
-- **API Key**: `OPENAI_API_KEY` environment variable (required for LLM-based evaluation)
+- **API Key**: `GOOGLE_API_KEY` environment variable
 - **OTLP Endpoint**: Optional, defaults to `localhost:4318`
 
 ----
 
 ## Getting Started
 
-Use Tilt to spin up all the required backends:
+1. Create a `.env` file in the root directory
+2. Set the `GOOGLE_API_KEY` variable in the `.env` file (e.g. `GOOGLE_API_KEY=<your-api-key>`)
+3. Use Tilt to spin up all the required backends:
 
 ```shell
 # Start Tilt in the project root to set up the local Kubernetes environment:
@@ -141,7 +143,7 @@ Run the RAGAS evaluation workflow with all optional parameters:
 kubectl testkube run testworkflow ragas-evaluation-workflow \
     --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \
     --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \
-    --config metrics="nv_accuracy context_recall"
+    --config metrics="nv_accuracy context_recall" \
     --config workflowName="Testworkflow-Name" \
     --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \
     --config model="gemini/gemini-2.5-flash" \
@@ -158,14 +160,14 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
 uv sync
 
 # Required for evaluation
-export OPENAI_API_KEY="your-api-key-here"
+export OPENAI_API_BASE="http://localhost:11001"
 ```
 
 #### Run the complete evaluation pipeline in 4 steps:
 
 ```shell
 # 1. Download and prepare dataset
-uv run python3 scripts/setup.py "https://example.com/dataset.csv"
+uv run python3 scripts/setup.py "http://localhost:11020/dataset.csv"
 
 # 2. Execute queries through your agent
 uv run python3 scripts/run.py "http://localhost:8000"
@@ -179,7 +181,8 @@ uv run python3 scripts/publish.py "my-agent-evaluation"
 
 ----
 
-## Detailed Usage
+## Detailed Usage & Troubleshooting
+
 See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md)
 
 ----
@@ -248,13 +251,9 @@ uv run poe test
 ### End-to-End Tests
 
 The E2E Test (found at `tests_e2e/test_e2e.py`) runs a complete pipeline integration test - from setup.py to publish.py.
-The E2E Test can be run in different ways:
+The E2E Test can be run using the `poe` task runner:
 
 ```shell
-# Using Pytest directly
-uv run pytest tests_e2e/test_e2e.py -v
-
-# Using the task runner
 uv run poe test_e2e
 ```
 
@@ -264,7 +263,7 @@ To use custom endpoints, evaluation models or metrics you can set the following
 
 ```shell
 export E2E_DATASET_URL="http://data-server.data-server:8000/dataset.csv"
-export E2E_AGENT_URL="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent
+export E2E_AGENT_URL="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent"
 export E2E_MODEL="gemini-2.5-flash-lite"
 export E2E_METRICS="faithfulness,answer_relevancy"
 export E2E_WORKFLOW_NAME="Test Workflow"
@@ -309,11 +308,6 @@ uv run poe lint          # Auto-fix linting issues
 
 ----
 
-## Detailed Usage
-See [Detailed Usage & Troubleshooting](DetailedUsageAndTroubleshooting.md)
-
-----
-
 ## Contributing
 
 See [Contribution Guide](https://github.com/agentic-layer/testbench?tab=contributing-ov-file) for details on
diff --git a/scripts/evaluate.py b/scripts/evaluate.py
index 275207d..01477af 100644
--- a/scripts/evaluate.py
+++ b/scripts/evaluate.py
@@ -163,7 +163,10 @@ def main(
         raise ArgumentError(argument=metrics, message="No metrics were provided as arguments")
 
     # Create LLM client using the AI-Gateway
-    ragas_llm: ChatOpenAI = ChatOpenAI(model=model)
+    # Setting a placeholder for the api_key since we instantiate a ChatOpenAI object,
+    # but the AI-Gateway actually uses Gemini under the hood.
+    # Not setting api_key here results in an OpenAIError
+    ragas_llm: ChatOpenAI = ChatOpenAI(model=model, api_key="123")
     llm = LangchainLLMWrapper(ragas_llm)  # type: ignore[arg-type]
 
     dataset = EvaluationDataset.from_jsonl("data/experiments/ragas_experiment.jsonl")
diff --git a/tests_e2e/test_e2e.py b/tests_e2e/test_e2e.py
index 1d39d4e..913285c 100755
--- a/tests_e2e/test_e2e.py
+++ b/tests_e2e/test_e2e.py
@@ -9,7 +9,7 @@
     pytest tests/test_e2e.py
 
     # With custom configuration via environment variables:
-    E2E_DATASET_URL="https://example.com/data.csv" \
+    E2E_DATASET_URL="http://localhost:11020/dataset.csv" \
     E2E_AGENT_URL="http://localhost:8000" \
     E2E_MODEL="gemini-flash-latest" \
     E2E_METRICS="faithfulness,answer_relevancy" \
@@ -231,7 +231,7 @@ def test_e2e_pipeline():
     """
 
     # Get configuration from environment variables with sensible defaults
-    dataset_url = os.getenv("E2E_DATASET_URL", "http://data-server.data-server:8000/dataset.csv")
+    dataset_url = os.getenv("E2E_DATASET_URL", "http://localhost:11020/dataset.csv")
     agent_url = os.getenv("E2E_AGENT_URL", "http://localhost:11010")
     model = os.getenv("E2E_MODEL", "gemini-2.5-flash-lite")
     metrics_str = os.getenv("E2E_METRICS", "faithfulness")

From 46c1d402e2e17ee42444841bff8c9d84b2e4ed2c Mon Sep 17 00:00:00 2001
From: MiaAppel <mia.appel@qaware.de>
Date: Thu, 4 Dec 2025 14:15:30 +0100
Subject: [PATCH 6/8] FIX: test_evaluate.py to accept new llm format

---
 tests/test_evaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index b1cd89b..738d670 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -218,7 +218,7 @@ def mock_evaluate(dataset, metrics, llm, token_usage_parser):
         class MockChatOpenAI:
             pass
 
-        def mock_chat_openai_init(model):
+        def mock_chat_openai_init(model, api_key):
             return MockChatOpenAI()
 
         class MockLLMWrapper:

From b8e31cd49085e5d3e45b396e6abda8119ca49adc Mon Sep 17 00:00:00 2001
From: MiaAppel <mia.appel@qaware.de>
Date: Thu, 4 Dec 2025 14:18:30 +0100
Subject: [PATCH 7/8] FIX: MyPy Type issue

---
 scripts/evaluate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/evaluate.py b/scripts/evaluate.py
index 01477af..c6d32d4 100644
--- a/scripts/evaluate.py
+++ b/scripts/evaluate.py
@@ -10,6 +10,7 @@
 
 import ragas.metrics as metrics_module
 from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
 from ragas import evaluate
 from ragas.cost import get_token_usage_for_openai
 from ragas.dataset_schema import EvaluationDataset, EvaluationResult
@@ -166,7 +167,7 @@ def main(
     # Setting a placeholder for the api_key since we instantiate a ChatOpenAI object,
     # but the AI-Gateway actually uses Gemini under the hood.
     # Not setting api_key here results in an OpenAIError
-    ragas_llm: ChatOpenAI = ChatOpenAI(model=model, api_key="123")
+    ragas_llm: ChatOpenAI = ChatOpenAI(model=model, api_key=SecretStr("123"))
     llm = LangchainLLMWrapper(ragas_llm)  # type: ignore[arg-type]
 
     dataset = EvaluationDataset.from_jsonl("data/experiments/ragas_experiment.jsonl")

From 955116178a76c1e4610e2331ba3ce3897bd495cf Mon Sep 17 00:00:00 2001
From: MiaAppel <mia.appel@qaware.de>
Date: Tue, 9 Dec 2025 08:18:07 +0100
Subject: [PATCH 8/8] DOCS: add small clarification

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index afd77cf..692e088 100644
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \
 # Install dependencies with uv
 uv sync
 
-# Required for evaluation
+# Required for evaluation - routes requests through our AI Gateway
 export OPENAI_API_BASE="http://localhost:11001"
 ```