diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fc0fed4..22cb5bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,8 +122,12 @@ jobs: echo "Tilt CI failed after 5 attempts, exiting." exit 1 fi - echo "Tilt CI failed, retrying... ($i/3)" - # Wait a bit for resource to stabilize / cleanup + echo "Tilt CI failed, retrying... ($i/4)" + + # Clean up stuck Helm releases before retrying + helm rollback testkube -n testkube 2>/dev/null || helm uninstall testkube -n testkube 2>/dev/null || true + + # Wait a bit for resources to stabilize sleep 10 done @@ -136,7 +140,6 @@ jobs: --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \ --config metrics="nv_accuracy context_recall" \ - --config workflowName="Testworkflow-Name" \ --config image="${{ steps.extract-tag.outputs.image-tag }}" \ -n testkube \ --watch diff --git a/CLAUDE.md b/CLAUDE.md index 179c46d..0558890 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -75,7 +75,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://weather-agent.sample-agents:8000" \ --config metrics="nv_accuracy context_recall" \ - --config workflowName="Test-Run" \ -n testkube # Watch workflow execution @@ -155,7 +154,6 @@ Observability Backend (Grafana) - `agentUrl` - A2A endpoint of agent to evaluate - `model` - LLM model for RAGAS evaluation (e.g., `gemini-2.5-flash-lite`) - `metrics` - Space-separated RAGAS metrics (e.g., `faithfulness context_recall`) -- `workflowName` - Label for published metrics - `otlpEndpoint` - OpenTelemetry collector URL (default: `http://lgtm.monitoring:4318`) - `image` - Docker image to use (default: `ghcr.io/agentic-layer/testbench/testworkflows:latest`) diff --git a/DetailedUsageAndTroubleshooting.md b/DetailedUsageAndTroubleshooting.md index 
1672c8e..93580ca 100644 --- a/DetailedUsageAndTroubleshooting.md +++ b/DetailedUsageAndTroubleshooting.md @@ -164,12 +164,14 @@ Publishes evaluation metrics to an OpenTelemetry OTLP endpoint for monitoring. **Syntax:** ```shell -python3 scripts/publish.py <workflow_name> [otlp_endpoint] +python3 scripts/publish.py <workflow_name> <execution_id> <execution_number> [otlp_endpoint] ``` **Arguments:** - `workflow_name` (required): Name of the test workflow (used as metric label) +- `execution_id` (required): Testkube execution ID for this workflow run +- `execution_number` (required): Numeric execution number for this workflow run (used as X-axis in Grafana) - `otlp_endpoint` (optional): OTLP HTTP endpoint URL (default: `localhost:4318`) **Input:** @@ -178,17 +180,34 @@ python3 scripts/publish.py <workflow_name> [otlp_endpoint] **Published Metrics:** -Each RAGAS metric is published as a gauge with the workflow name as an attribute: +Three gauge types are published to the OTLP endpoint: + +| Gauge Name | Description | Attributes | +|------------|-------------|------------| +| `testbench_evaluation_metric` | Per-sample evaluation scores | `name`, `workflow_name`, `execution_id`, `execution_number`, `trace_id`, `user_input_hash`, `user_input_truncated` | +| `testbench_evaluation_token_usage` | Token counts from evaluation | `type` (input_tokens/output_tokens), `workflow_name`, `execution_id`, `execution_number` | +| `testbench_evaluation_cost` | Total evaluation cost in USD | `workflow_name`, `execution_id`, `execution_number` | + +**Attribute Details:** + +- `user_input_hash`: 12-character SHA256 hash of the user input for stable identification across executions +- `user_input_truncated`: First 50 characters of the user input with "..." 
suffix (for display in Grafana legends) + +**Example output:** ``` -ragas_evaluation_faithfulness{workflow_name="weather-assistant-eval"} = 0.85 -ragas_evaluation_answer_relevancy{workflow_name="weather-assistant-eval"} = 0.92 +testbench_evaluation_metric{name="faithfulness", workflow_name="weather-eval", execution_id="exec-123", execution_number=1, trace_id="abc123...", user_input_hash="a1b2c3d4e5f6", user_input_truncated="What is the weather like in New York?"} = 0.85 +testbench_evaluation_metric{name="context_recall", workflow_name="weather-eval", execution_id="exec-123", execution_number=1, trace_id="abc123...", user_input_hash="a1b2c3d4e5f6", user_input_truncated="What is the weather like in New York?"} = 1.0 +testbench_evaluation_token_usage{type="input_tokens", workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 1500 +testbench_evaluation_token_usage{type="output_tokens", workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 500 +testbench_evaluation_cost{workflow_name="weather-eval", execution_id="exec-123", execution_number=1} = 0.015 ``` **Notes:** - Sends metrics to `/v1/metrics` endpoint - Uses resource with `service.name="ragas-evaluation"` +- The `trace_id` attribute links metrics to distributed traces for debugging - Forces flush to ensure delivery before exit @@ -254,4 +273,4 @@ user_input,retrieved_contexts,reference - Verify your dataset includes all required fields for the metrics you're using - Check the RAGAS documentation for metric-specific requirements ----- \ No newline at end of file +---- diff --git a/README.md b/README.md index 46a7b24..d7c947e 100644 --- a/README.md +++ b/README.md @@ -132,8 +132,8 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \ --config metrics="nv_accuracy context_recall" \ - --config 
workflowName="Testworkflow-Name" \ --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \ + --config otlpEndpoint="http://lgtm.monitoring:4318" \ -n testkube ``` @@ -144,7 +144,6 @@ kubectl testkube run testworkflow ragas-evaluation-workflow \ --config datasetUrl="http://data-server.data-server:8000/dataset.csv" \ --config agentUrl="http://agent-gateway-krakend.agent-gateway-krakend:10000/weather-agent" \ --config metrics="nv_accuracy context_recall" \ - --config workflowName="Testworkflow-Name" \ --config image="ghcr.io/agentic-layer/testbench/testworkflows:latest" \ --config model="gemini/gemini-2.5-flash" \ --config otlpEndpoint="http://otlp-endpoint:4093" \ @@ -176,8 +175,8 @@ uv run python3 scripts/run.py "http://localhost:11010" # 3. Evaluate responses with RAGAS metrics uv run python3 scripts/evaluate.py gemini-2.5-flash-lite faithfulness answer_relevancy -# 4. Publish metrics to OpenTelemetry -uv run python3 scripts/publish.py "my-agent-evaluation" +# 4. Publish metrics to OpenTelemetry (workflow_name, execution_id, execution_number) +uv run python3 scripts/publish.py "my-agent-evaluation" "local-exec-001" 1 ``` ---- @@ -277,6 +276,18 @@ uv run pytest tests_e2e/test_e2e.py -v ## Development +### Deployment Structure + +``` +deploy/ + base/ # Shared resources for all environments + templates/ # Testkube TestWorkflowTemplates + grafana-dashboards/ # Dashboard ConfigMaps (auto-discovered via grafana_dashboard label) + local/ # Local Tilt environment (uses LGTM all-in-one) + dev/ # Dev cluster environment (uses Grafana sidecar for dashboard discovery) +``` + + ## Code Quality Standards ### Code Style: diff --git a/Tiltfile b/Tiltfile index c852cb2..7829ff8 100644 --- a/Tiltfile +++ b/Tiltfile @@ -1,7 +1,7 @@ # -*- mode: Python -*- -# Increase Kubernetes upsert timeout for CRD installations -update_settings(max_parallel_updates=10) +# Increase Kubernetes upsert timeout for CRD installations and slow Helm charts (testkube) 
+update_settings(max_parallel_updates=10, k8s_upsert_timeout_secs=600) # Load .env file for environment variables load('ext://dotenv', 'dotenv') @@ -39,7 +39,7 @@ k8s_yaml(kustomize('deploy/local')) k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000']) k8s_resource('weather-agent', port_forwards='11010:8000', labels=['agents'], resource_deps=['agent-runtime']) -k8s_resource('lgtm', port_forwards=['11000:3000']) +k8s_resource('lgtm', port_forwards=['11000:3000', '4318:4318']) k8s_resource('data-server', port_forwards='11020:8000') # Declare Testkube resources diff --git a/deploy/base/grafana-dashboards/evaluation-dashboard.json b/deploy/base/grafana-dashboards/evaluation-dashboard.json new file mode 100644 index 0000000..eee1524 --- /dev/null +++ b/deploy/base/grafana-dashboards/evaluation-dashboard.json @@ -0,0 +1,639 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 4, + "panels": [], + "repeat": "metric", + "title": "Evaluation of $metric", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": false, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg(testbench_evaluation_metric{name=~\"$metric\", workflow_name=\"$workflow\"})", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "B" + } + ], + "title": "Average $metric over time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 50, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "pointShape": "square", + "pointSize": { + "fixed": 7 + }, + "pointStrokeWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "show": "points+lines" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + 
"x": 12, + "y": 1 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "mapping": "auto", + "series": [ + { + "frame": { + "matcher": { + "id": "byIndex", + "options": 0 + } + }, + "x": { + "matcher": { + "id": "byName", + "options": "execution_number" + } + }, + "y": { + "matcher": { + "id": "byName", + "options": "Value" + } + } + } + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg by(execution_number) (testbench_evaluation_metric{workflow_name=\"ragas-evaluation-workflow\", name=~\"$metric\"})", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "A" + } + ], + "title": "Average $metric per test", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "mode": "columns" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "enumConfig": { + "text": [ + "62" + ] + }, + "targetField": "execution_number" + } + ], + "fields": {} + } + }, + { + "id": "merge", + "options": {} + } + ], + "type": "xychart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + 
"spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max by (user_input_hash, user_input_truncated) (\n testbench_evaluation_metric{name=~\"$metric\", workflow_name=\"$workflow\"}\n )", + "hide": false, + "instant": false, + "legendFormat": "Question: \"{{user_input_truncated}}\"", + "range": true, + "refId": "A" + } + ], + "title": "Individual $metric results over time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 50, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "pointShape": "square", + "pointSize": { + "fixed": 7 + }, + "pointStrokeWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "show": "points+lines" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + 
"id": 13, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "mapping": "auto", + "series": [ + { + "x": { + "matcher": { + "id": "byName", + "options": "execution_number (lastNotNull)" + } + }, + "y": { + "matcher": { + "id": "byName", + "options": "Value (lastNotNull)" + } + } + } + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg by(user_input_hash, trace_id, user_input_truncated, execution_number) (testbench_evaluation_metric{workflow_name=\"ragas-evaluation-workflow\", name=~\"$metric\"})", + "hide": false, + "instant": false, + "legendFormat": "Question: \"{{user_input_truncated}}\"", + "range": true, + "refId": "A" + } + ], + "title": "Individual $metric results per test", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "mode": "columns" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "execution_number" + } + ], + "fields": {} + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Time": { + "aggregations": [ + "last" + ], + "operation": "aggregate" + }, + "Value": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "execution_number": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "trace_id": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "user_input_hash": { + "aggregations": [] + }, + "user_input_truncated": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "merge", + "options": {} + }, + { + "id": "seriesToRows", + "options": {} + }, + { + "id": "partitionByValues", + "options": { + "fields": [ + "user_input_truncated" + ], 
+ "keepFields": false, + "naming": { + "asLabels": false + } + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*?) Value \\(lastNotNull\\)", + "renamePattern": "$1" + } + } + ], + "type": "xychart" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "ragas-evaluation-workflow", + "value": "ragas-evaluation-workflow" + }, + "definition": "label_values(workflow_name)", + "description": "name of the TestWorkflow CR", + "label": "Workflow Name", + "name": "workflow", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(workflow_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 5, + "type": "query" + }, + { + "allowCustomValue": false, + "current": { + "text": "All", + "value": "$__all" + }, + "definition": "label_values(testbench_evaluation_metric,name)", + "includeAll": true, + "name": "metric", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(testbench_evaluation_metric,name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "2025-12-19T10:24:55.111Z", + "to": "2025-12-19T15:28:09.705Z" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Workflow Evaluations", + "uid": "g9lx75", + "version": 32 +} diff --git a/deploy/base/grafana-dashboards/kustomization.yaml b/deploy/base/grafana-dashboards/kustomization.yaml new file mode 100644 index 0000000..2a65176 --- /dev/null +++ b/deploy/base/grafana-dashboards/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring +configMapGenerator: + - name: grafana-testkube-dashboard + files: + - evaluation-dashboard.json + - testkube-dashboard.json +generatorOptions: + disableNameSuffixHash: true + labels: + grafana_dashboard: "1" diff --git 
a/deploy/base/grafana-dashboards/testkube-dashboard.json b/deploy/base/grafana-dashboards/testkube-dashboard.json new file mode 100644 index 0000000..85cf3fd --- /dev/null +++ b/deploy/base/grafana-dashboards/testkube-dashboard.json @@ -0,0 +1,316 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.1.2" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": {"type": "prometheus"}, + "description": "Executions distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": 
{ + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{result=\"passed\"}", + "hide": false, + "instant": false, + "interval": "1", + "legendFormat": "{{name}}", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "SUCCESSFUL workflow executions", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus"}, + "description": "Executions distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{result=\"failed\"}", + "hide": false, + "instant": false, + "interval": "1", + "legendFormat": "{{name}}", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "FAILED workflow executions", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus"}, + "description": "Executions 
distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{}", + "format": "table", + "hide": false, + "instant": false, + "interval": "1", + "intervalFactor": 10, + "legendFormat": "{{type}} created", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "Total workflow executions", + "transparent": true, + "type": "stat" + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Testkube", + "uid": "jMtrP3nnz", + "version": 2 +} diff --git a/deploy/base/kustomization.yaml b/deploy/base/kustomization.yaml index 7ddc4f1..4d09cdc 100644 --- a/deploy/base/kustomization.yaml +++ b/deploy/base/kustomization.yaml @@ -2,3 +2,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - templates/ + - grafana-dashboards diff --git a/deploy/base/templates/publish-template.yaml b/deploy/base/templates/publish-template.yaml index 32ea61b..74a4ea5 100644 --- a/deploy/base/templates/publish-template.yaml +++ b/deploy/base/templates/publish-template.yaml @@ -9,9 +9,6 @@ metadata: spec: # Configuration parameters that can be overridden config: - workflowName: - type: string - description: "Name of the test workflow (used as label)" 
otlpEndpoint: type: string description: "URL of the OTLP endpoint" @@ -27,5 +24,7 @@ spec: run: args: - publish.py - - "{{ config.workflowName }}" + - "{{ workflow.name }}" + - "{{ execution.id }}" + - "{{ execution.number }}" - "{{ config.otlpEndpoint }}" diff --git a/deploy/local/kustomization.yaml b/deploy/local/kustomization.yaml index fe7375c..4b95374 100644 --- a/deploy/local/kustomization.yaml +++ b/deploy/local/kustomization.yaml @@ -1,7 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - lgtm.yaml + - lgtm/ - weather-agent.yaml - data-server/ - ../base diff --git a/deploy/local/lgtm/kustomization.yaml b/deploy/local/lgtm/kustomization.yaml new file mode 100644 index 0000000..33c1c50 --- /dev/null +++ b/deploy/local/lgtm/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - lgtm.yaml + - lgtm-otel-scraping-config.yaml + - lgtm-grafana-dashboard-config.yaml diff --git a/deploy/local/lgtm/lgtm-grafana-dashboard-config.yaml b/deploy/local/lgtm/lgtm-grafana-dashboard-config.yaml new file mode 100644 index 0000000..32e05ac --- /dev/null +++ b/deploy/local/lgtm/lgtm-grafana-dashboard-config.yaml @@ -0,0 +1,20 @@ +# Grafana dashboard provisioning config. +# This tells Grafana where to find dashboard JSON files and how to organize them. +# Without this, Grafana won't scan any directories for dashboards. 
+# Reference: https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-provisioning + namespace: monitoring +data: + testbench-dashboards.yaml: | + apiVersion: 1 + providers: + - name: 'testbench-dashboards' + type: file + folder: 'Testbench' + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards/testbench-dashboards diff --git a/deploy/local/lgtm/lgtm-otel-scraping-config.yaml b/deploy/local/lgtm/lgtm-otel-scraping-config.yaml new file mode 100644 index 0000000..cbfe14d --- /dev/null +++ b/deploy/local/lgtm/lgtm-otel-scraping-config.yaml @@ -0,0 +1,79 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: otelcol-config + namespace: monitoring +data: + lgtm-otelcol-config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + cors: + allowed_origins: + - http://* + prometheus/collector: + config: + scrape_configs: + - job_name: "opentelemetry-collector" + scrape_interval: 1s + static_configs: + - targets: ["127.0.0.1:8888"] + prometheus/testkube: + config: + scrape_configs: + - job_name: "testkube-api" + scrape_interval: 15s + static_configs: + - targets: ["testkube-api-server.testkube:8088"] + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + path: "/ready" + + processors: + batch: + + exporters: + otlphttp/metrics: + endpoint: http://127.0.0.1:9090/api/v1/otlp + tls: + insecure: true + otlphttp/traces: + endpoint: http://127.0.0.1:4418 + tls: + insecure: true + otlphttp/logs: + endpoint: http://127.0.0.1:3100/otlp + tls: + insecure: true + otlp/profiles: + endpoint: http://127.0.0.1:4040 + tls: + insecure: true + + service: + extensions: [health_check] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/traces] + #exporters: [otlphttp/traces,debug/traces] + metrics: + receivers: [otlp, prometheus/collector, prometheus/testkube] + 
processors: [batch] + exporters: [otlphttp/metrics] + #exporters: [otlphttp/metrics,debug/metrics] + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/logs] + #exporters: [otlphttp/logs,debug/logs] + profiles: + receivers: [otlp] + exporters: [otlp/profiles] diff --git a/deploy/local/lgtm.yaml b/deploy/local/lgtm/lgtm.yaml similarity index 66% rename from deploy/local/lgtm.yaml rename to deploy/local/lgtm/lgtm.yaml index 7edc3b9..6ab19de 100644 --- a/deploy/local/lgtm.yaml +++ b/deploy/local/lgtm/lgtm.yaml @@ -58,6 +58,9 @@ spec: command: - cat - /tmp/ready + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 30 volumeMounts: - name: tempo-data mountPath: /data/tempo @@ -71,6 +74,15 @@ spec: mountPath: /data/prometheus - name: pyroscope-storage mountPath: /data/pyroscope + - name: grafana-dashboard-provisioning + # Provisioning config that tells Grafana to scan the testbench-dashboards directory + mountPath: /otel-lgtm/grafana/conf/provisioning/dashboards/testbench-dashboards.yaml + subPath: testbench-dashboards.yaml + - name: grafana-dashboards + mountPath: /etc/grafana/provisioning/dashboards/testbench-dashboards + - name: otelcol-config + mountPath: /otel-lgtm/otelcol-config.yaml + subPath: lgtm-otelcol-config.yaml volumes: - name: tempo-data emptyDir: {} @@ -84,3 +96,12 @@ spec: emptyDir: {} - name: pyroscope-storage emptyDir: {} + - name: grafana-dashboards + configMap: + name: grafana-testkube-dashboard + - name: grafana-dashboard-provisioning + configMap: + name: grafana-dashboard-provisioning + - name: otelcol-config + configMap: + name: otelcol-config diff --git a/deploy/local/ragas-evaluation-workflow.yaml b/deploy/local/ragas-evaluation-workflow.yaml index 5462082..d50c34a 100644 --- a/deploy/local/ragas-evaluation-workflow.yaml +++ b/deploy/local/ragas-evaluation-workflow.yaml @@ -49,15 +49,10 @@ spec: default: "nv_accuracy context_recall" # Publishing configuration - workflowName: - type: string - description: 
"Workflow name for metrics" - default: "ragas-test-workflow" - otlpEndpoint: type: string description: "OTLP endpoint URL" - default: "http://lgtm.monitoring:4318" + default: "http://otel-collector.monitoring.svc.cluster.local:4318" # Docker image image: @@ -97,6 +92,5 @@ spec: template: name: ragas-publish-template config: - workflowName: "{{ config.workflowName }}" otlpEndpoint: "{{ config.otlpEndpoint }}" image: "{{ config.image }}" diff --git a/scripts/publish.py b/scripts/publish.py index 599b66b..7f3ae96 100644 --- a/scripts/publish.py +++ b/scripts/publish.py @@ -1,7 +1,11 @@ import argparse +import hashlib import json import logging +import math +from dataclasses import dataclass from logging import Logger +from typing import Any, TypeGuard from opentelemetry import metrics from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter @@ -14,98 +18,193 @@ logger: Logger = logging.getLogger(__name__) -def get_overall_scores(file_path: str) -> dict[str, float]: - """Load the evaluation_scores.json file and return the 'overall_scores' metrics.""" +@dataclass +class EvaluationData: + """Container for all evaluation data to be published as metrics.""" + + individual_results: list[dict[str, Any]] + total_tokens: dict[str, int] + total_cost: float + + +def load_evaluation_data(file_path: str) -> EvaluationData: + """Load the evaluation_scores.json file and return the relevant data for metrics.""" with open(file_path, "r") as file: - return json.load(file).get("overall_scores", {}) + data = json.load(file) + return EvaluationData( + individual_results=data.get("individual_results", []), + total_tokens=data.get("total_tokens", {"input_tokens": 0, "output_tokens": 0}), + total_cost=data.get("total_cost", 0.0), + ) + + +def _is_metric_value(value: Any) -> TypeGuard[int | float]: + """Check if a value is a valid metric score (numeric and not NaN).""" + if not isinstance(value, (int, float)): + return False + if isinstance(value, float) and 
math.isnan(value): + return False + return True + +def _get_user_input_hash(user_input: str) -> str: + """Generate a short hash of the user input for stable identification.""" + return hashlib.sha256(user_input.encode()).hexdigest()[:12] -def create_and_push_metrics(overall_scores: dict[str, float], workflow_name: str, otlp_endpoint: str) -> None: + +def _get_user_input_truncated(user_input: str, max_length: int = 50) -> str: + """Truncate user input text for display in metric labels.""" + if len(user_input) <= max_length: + return user_input + return user_input[:max_length] + "..." + + +def create_and_push_metrics( + evaluation_data: EvaluationData, workflow_name: str, execution_id: str, execution_number: int, otlp_endpoint: str +) -> None: """ - Create OpenTelemetry metrics for each overall score and push via OTLP. + Create OpenTelemetry metrics for evaluation results and push via OTLP. + + Creates per-sample gauges for each metric, plus token usage and cost gauges. Args: - overall_scores: Dictionary of metric names to scores + evaluation_data: Container with individual results, token counts, and cost workflow_name: Name of the test workflow (used as label to distinguish workflows) + execution_id: Testkube execution ID for this workflow run otlp_endpoint: URL of the OTLP endpoint (e.g., 'http://localhost:4318') """ - # Ensure the endpoint has the correct protocol if not otlp_endpoint.startswith("http://") and not otlp_endpoint.startswith("https://"): otlp_endpoint = f"http://{otlp_endpoint}" - # Create OTLP exporter exporter = OTLPMetricExporter(endpoint=f"{otlp_endpoint}/v1/metrics") - - # Create a metric reader that exports immediately - reader = PeriodicExportingMetricReader( - exporter=exporter, - export_interval_millis=1000, # Export every second - ) - - # Create resource with workflow metadata + reader = PeriodicExportingMetricReader(exporter=exporter, export_interval_millis=1000) resource = Resource.create({"service.name": "ragas-evaluation", 
"workflow.name": workflow_name}) - - # Create MeterProvider with the exporter and resource provider = MeterProvider(resource=resource, metric_readers=[reader]) metrics.set_meter_provider(provider) - - # Get a meter meter = metrics.get_meter("ragas.evaluation", "1.0.0") - # Create and record metrics try: logger.info(f"Pushing metrics to OTLP endpoint at {otlp_endpoint}...") - for metric_name, score in overall_scores.items(): - # Create a Gauge - gauge = meter.create_gauge( - name=f"ragas_evaluation_{metric_name}", - description=f"Overall {metric_name} score from RAGAS evaluation", - unit="1", - ) + # Collect metric names from individual results (any numeric field is a metric) + metric_names: set[str] = set() + for result in evaluation_data.individual_results: + for key, value in result.items(): + if _is_metric_value(value): + metric_names.add(key) + + # Single gauge for all evaluation metrics, differentiated by 'name' attribute + metric_gauge = meter.create_gauge( + name="testbench_evaluation_metric", + description="Evaluation metric from RAGAS testbench", + unit="", + ) + + # Set per-sample values for each metric + for metric_name in sorted(metric_names): + for result in evaluation_data.individual_results: + score = result.get(metric_name) + if not _is_metric_value(score): + logger.debug(f"Skipping invalid metric value for {metric_name}: {score}") + continue + trace_id = result.get("trace_id") + if not trace_id: + logger.warning(f"Missing trace_id for sample in execution {execution_id}") + trace_id = "missing-trace-id" + user_input = result.get("user_input", "(user_input missing or invalid)") + attributes = { + "name": metric_name, + "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, + "trace_id": trace_id, + "user_input_hash": _get_user_input_hash(user_input), + "user_input_truncated": _get_user_input_truncated(user_input), + } + metric_gauge.set(score, attributes) + 
logger.info(f"testbench_evaluation_metric{attributes} = {score}") + + # Token usage gauge with 'type' attribute + token_gauge = meter.create_gauge( + name="testbench_evaluation_token_usage", + description="Token usage from RAGAS evaluation", + unit="", + ) + + input_tokens = evaluation_data.total_tokens.get("input_tokens", 0) + token_gauge.set( + input_tokens, + { + "type": "input_tokens", + "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, + }, + ) + logger.info( + f"testbench_evaluation_token_usage{{type=input_tokens, workflow_name={workflow_name}, execution_id={execution_id}, execution_number={execution_number}}} = {input_tokens}" + ) + + output_tokens = evaluation_data.total_tokens.get("output_tokens", 0) + token_gauge.set( + output_tokens, + { + "type": "output_tokens", + "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, + }, + ) + logger.info( + f"testbench_evaluation_token_usage{{type=output_tokens, workflow_name={workflow_name}, execution_id={execution_id}, execution_number={execution_number}}} = {output_tokens}" + ) + + # Total cost gauge + cost_gauge = meter.create_gauge( + name="testbench_evaluation_cost", + description="Total cost of RAGAS evaluation in USD", + unit="", + ) + cost_gauge.set( + evaluation_data.total_cost, + {"workflow_name": workflow_name, "execution_id": execution_id, "execution_number": execution_number}, + ) + logger.info( + f"testbench_evaluation_cost{{workflow_name={workflow_name}, execution_id={execution_id}, execution_number={execution_number}}} = {evaluation_data.total_cost}" + ) - # Set the gauge value with workflow_name as an attribute - gauge.set(score, {"workflow_name": workflow_name}) - logger.info(f"Set metric 'ragas_evaluation_{metric_name}{{workflow_name=\"{workflow_name}\"}}' to {score}") - - # Force flush to ensure metrics are sent provider.force_flush() - - logger.info("✓ Metrics successfully pushed via OTLP") + 
logger.info("Metrics successfully pushed via OTLP") except Exception as e: - logger.error(f"✗ Error pushing metrics via OTLP: {e}") + logger.error(f"Error pushing metrics via OTLP: {e}") raise finally: - # Shutdown the provider provider.shutdown() - logger.info("Published metrics:") - for metric_name, score in overall_scores.items(): - logger.info(f' - ragas_evaluation_{metric_name}{{workflow_name="{workflow_name}"}}: {score}') - -def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> None: +def publish_metrics( + input_file: str, workflow_name: str, execution_id: str, execution_number: int, otlp_endpoint: str +) -> None: """ Publish evaluation metrics via OpenTelemetry OTLP. Args: - input_file: Path to the evaluation scores + input_file: Path to the evaluation scores JSON file workflow_name: Name of the test workflow (e.g., 'weather-assistant-test'). + execution_id: Testkube execution ID for this workflow run. + execution_number: Number of the execution for the current workflow (e.g. 3) otlp_endpoint: URL of the OTLP endpoint (e.g., 'http://localhost:4318'). 
""" + logger.info(f"Loading evaluation data from {input_file}...") + evaluation_data = load_evaluation_data(input_file) - # Load overall scores from the evaluation file - logger.info(f"Loading evaluation scores from {input_file}...") - overall_scores = get_overall_scores(input_file) - - if not overall_scores: - logger.warning("No overall scores found in evaluation_scores.json") + if not evaluation_data.individual_results: + logger.warning("No individual results found in evaluation_scores.json") return - # Create and push OpenTelemetry metrics - logger.info(f"Creating OpenTelemetry metrics for {len(overall_scores)} scores...") - logger.info(f"Workflow: {workflow_name}") - create_and_push_metrics(overall_scores, workflow_name, otlp_endpoint) + logger.info(f"Publishing metrics for {len(evaluation_data.individual_results)} samples...") + logger.info(f"Workflow: {workflow_name}, Execution: {execution_id}") + create_and_push_metrics(evaluation_data, workflow_name, execution_id, execution_number, otlp_endpoint) if __name__ == "__main__": @@ -114,11 +213,12 @@ def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> Args: workflow_name: Name of the test workflow + execution_id: Testkube execution ID for this workflow run otlp_endpoint: (OPTIONAL) URL to the OTLP endpoint (default: localhost:4318) Examples: - python3 scripts/publish.py weather-assistant-test - python3 scripts/publish.py weather-assistant-test http://localhost:4318 + python3 scripts/publish.py weather-assistant-test exec-123 + python3 scripts/publish.py weather-assistant-test exec-123 http://localhost:4318 """ parser = argparse.ArgumentParser(description="Publish RAGAS evaluation metrics via OpenTelemetry OTLP") @@ -126,6 +226,14 @@ def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> "workflow_name", help="Name of the test workflow (e.g., 'weather-assistant-test')", ) + parser.add_argument( + "execution_id", + help="Testkube execution ID for this workflow 
run", + ) + parser.add_argument( + "execution_number", + help="Testkube execution number for this workflow run (for use as a *numeric* identifier in Grafana)", + ) parser.add_argument( "otlp_endpoint", nargs="?", @@ -135,9 +243,10 @@ def publish_metrics(input_file: str, workflow_name: str, otlp_endpoint: str) -> args = parser.parse_args() - # Call 'publish_metrics' with hardcoded input file and specified 'workflow_name' & 'otlp_endpoint' publish_metrics( input_file="data/results/evaluation_scores.json", workflow_name=args.workflow_name, + execution_id=args.execution_id, + execution_number=args.execution_number, otlp_endpoint=args.otlp_endpoint, ) diff --git a/scripts/run.py b/scripts/run.py index b93f592..d1860f4 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -16,11 +16,10 @@ ) from opentelemetry import trace from opentelemetry.trace import Status, StatusCode +from otel_setup import setup_otel from pydantic import BaseModel from ragas import Dataset, experiment -from otel_setup import setup_otel - # Set up module-level logger logging.basicConfig(level=logging.INFO) logger: Logger = logging.getLogger(__name__) diff --git a/tests/test_publish.py b/tests/test_publish.py index 9eab789..9cc5693 100644 --- a/tests/test_publish.py +++ b/tests/test_publish.py @@ -5,6 +5,7 @@ """ import json +import math import shutil import sys import tempfile @@ -14,7 +15,15 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) -from publish import create_and_push_metrics, get_overall_scores, publish_metrics +from publish import ( + EvaluationData, + _get_user_input_hash, + _get_user_input_truncated, + _is_metric_value, + create_and_push_metrics, + load_evaluation_data, + publish_metrics, +) # Mock classes for OpenTelemetry meter provider (used by HTTPXClientInstrumentor) @@ -58,13 +67,28 @@ def temp_dir(): @pytest.fixture def evaluation_scores_file(temp_dir): - """Create a test evaluation scores file""" + """Create a test evaluation scores file with individual 
results""" test_file = Path(temp_dir) / "evaluation_scores.json" test_data = { "overall_scores": {"faithfulness": 0.85, "answer_relevancy": 0.90}, - "individual_results": [], - "total_tokens": {"input_tokens": 0, "output_tokens": 0}, - "total_cost": 0.0, + "individual_results": [ + { + "user_input": "What is the weather?", + "response": "It is sunny.", + "faithfulness": 0.85, + "answer_relevancy": 0.90, + "trace_id": "a1b2c3d4e5f6789012345678901234aa", + }, + { + "user_input": "What is the time?", + "response": "It is noon.", + "faithfulness": 0.80, + "answer_relevancy": 0.95, + "trace_id": "b2c3d4e5f6a7890123456789012345bb", + }, + ], + "total_tokens": {"input_tokens": 1000, "output_tokens": 200}, + "total_cost": 0.05, } with open(test_file, "w") as f: @@ -90,10 +114,13 @@ def realistic_scores_file(temp_dir): "response": "It is sunny.", "faithfulness": 0.85, "answer_relevancy": 0.90, + "context_precision": 0.78, + "context_recall": 0.82, + "trace_id": "c3d4e5f6a7b8901234567890123456cc", } ], - "total_tokens": {"input_tokens": 0, "output_tokens": 0}, - "total_cost": 0.0, + "total_tokens": {"input_tokens": 500, "output_tokens": 100}, + "total_cost": 0.025, } with open(test_file, "w") as f: @@ -102,25 +129,111 @@ def realistic_scores_file(temp_dir): return test_file -# TestGetOverallScores tests -def test_loads_overall_scores(evaluation_scores_file): - """Test that get_overall_scores loads the overall_scores section""" - scores = get_overall_scores(str(evaluation_scores_file)) +# Test _is_metric_value +def test_is_metric_value_with_float(): + """Test that valid floats are recognized as metric values""" + assert _is_metric_value(0.85) is True + assert _is_metric_value(1.0) is True + assert _is_metric_value(0.0) is True + + +def test_is_metric_value_with_int(): + """Test that integers are recognized as metric values""" + assert _is_metric_value(1) is True + assert _is_metric_value(0) is True + + +def test_is_metric_value_with_nan(): + """Test that NaN is not recognized 
as a metric value""" + assert _is_metric_value(float("nan")) is False + assert _is_metric_value(math.nan) is False + + +def test_is_metric_value_with_non_numeric(): + """Test that non-numeric values are not recognized as metric values""" + assert _is_metric_value("string") is False + assert _is_metric_value(["list"]) is False + assert _is_metric_value({"dict": "value"}) is False + assert _is_metric_value(None) is False + + +# Test _get_user_input_hash +def test_get_user_input_hash_returns_12_char_hex(): + """Test that _get_user_input_hash returns a 12-character hex string""" + result = _get_user_input_hash("What is the weather?") + assert len(result) == 12 + assert all(c in "0123456789abcdef" for c in result) + + +def test_get_user_input_hash_is_deterministic(): + """Test that _get_user_input_hash returns the same hash for the same input""" + input_text = "What is the weather in New York?" + assert _get_user_input_hash(input_text) == _get_user_input_hash(input_text) + + +def test_get_user_input_hash_different_for_different_inputs(): + """Test that _get_user_input_hash returns different hashes for different inputs""" + hash1 = _get_user_input_hash("Question 1") + hash2 = _get_user_input_hash("Question 2") + assert hash1 != hash2 + + +# Test _get_user_input_truncated +def test_get_user_input_truncated_short_input(): + """Test that short inputs are returned unchanged""" + short_input = "Short question" + assert _get_user_input_truncated(short_input) == short_input + + +def test_get_user_input_truncated_exact_length(): + """Test that inputs exactly at max_length are returned unchanged""" + exact_input = "a" * 50 + assert _get_user_input_truncated(exact_input) == exact_input + + +def test_get_user_input_truncated_long_input(): + """Test that long inputs are truncated with ellipsis""" + long_input = "a" * 100 + result = _get_user_input_truncated(long_input) + assert len(result) == 53 # 50 chars + "..." 
+ assert result.endswith("...") + - assert scores["faithfulness"] == 0.85 - assert scores["answer_relevancy"] == 0.90 +def test_get_user_input_truncated_custom_length(): + """Test that custom max_length is respected""" + input_text = "This is a longer question" + result = _get_user_input_truncated(input_text, max_length=10) + assert result == "This is a ..." + + +# Test load_evaluation_data +def test_loads_evaluation_data(evaluation_scores_file): + """Test that load_evaluation_data loads all required fields""" + data = load_evaluation_data(str(evaluation_scores_file)) + + assert len(data.individual_results) == 2 + assert data.total_tokens["input_tokens"] == 1000 + assert data.total_tokens["output_tokens"] == 200 + assert data.total_cost == 0.05 def test_file_not_found(temp_dir): """Test behavior when file doesn't exist""" with pytest.raises(FileNotFoundError): - get_overall_scores(str(Path(temp_dir) / "nonexistent.json")) + load_evaluation_data(str(Path(temp_dir) / "nonexistent.json")) # TestCreateAndPushMetrics tests def test_creates_gauges_for_each_metric(monkeypatch): - """Test that a Gauge is created for each metric""" - overall_scores = {"faithfulness": 0.85, "answer_relevancy": 0.90} + """Test that a Gauge is created for each metric plus token/cost gauges""" + evaluation_data = EvaluationData( + individual_results=[ + {"user_input": "Question 1", "faithfulness": 0.85, "answer_relevancy": 0.90, "trace_id": "trace1"}, + {"user_input": "Question 2", "faithfulness": 0.80, "answer_relevancy": 0.95, "trace_id": "trace2"}, + ], + total_tokens={"input_tokens": 1000, "output_tokens": 200}, + total_cost=0.05, + ) # Mock the meter and gauge create_gauge_calls = [] @@ -167,34 +280,51 @@ def mock_exporter_init(endpoint): monkeypatch.setattr("publish.OTLPMetricExporter", mock_exporter_init) create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, 
otlp_endpoint="localhost:4318", ) - # Verify create_gauge was called for each metric - assert len(create_gauge_calls) == 2 + # Verify gauges created: 1 metric gauge + 1 token gauge + 1 cost gauge = 3 + assert len(create_gauge_calls) == 3 # Verify gauge names gauge_names = [call["name"] for call in create_gauge_calls] - assert "ragas_evaluation_faithfulness" in gauge_names - assert "ragas_evaluation_answer_relevancy" in gauge_names + assert "testbench_evaluation_metric" in gauge_names + assert "testbench_evaluation_token_usage" in gauge_names + assert "testbench_evaluation_cost" in gauge_names -def test_sets_gauge_values(monkeypatch): - """Test that gauge values are set correctly""" - overall_scores = {"faithfulness": 0.85} +def test_sets_per_sample_gauge_values(monkeypatch): + """Test that gauge values are set for each sample with all required attributes""" + evaluation_data = EvaluationData( + individual_results=[ + {"user_input": "Question 1", "faithfulness": 0.85, "trace_id": "d4e5f6a7b8c9012345678901234567dd"}, + { + "user_input": "This is a very long question that exceeds fifty characters in length", + "faithfulness": 0.80, + "trace_id": "e5f6a7b8c9d0123456789012345678ee", + }, + ], + total_tokens={"input_tokens": 0, "output_tokens": 0}, + total_cost=0.0, + ) # Mock the meter and gauge set_calls = [] class MockGauge: + def __init__(self, name): + self.name = name + def set(self, value, attributes): - set_calls.append({"value": value, "attributes": attributes}) + set_calls.append({"name": self.name, "value": value, "attributes": attributes}) class MockMeter: def create_gauge(self, name, unit=None, description=None): - return MockGauge() + return MockGauge(name) mock_meter = MockMeter() @@ -229,20 +359,48 @@ def mock_exporter_init(endpoint): monkeypatch.setattr("publish.OTLPMetricExporter", mock_exporter_init) create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + 
execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) - # Verify gauge.set was called with correct value and attributes - assert len(set_calls) == 1 - assert set_calls[0]["value"] == 0.85 - assert set_calls[0]["attributes"] == {"workflow_name": "test-workflow"} + # Filter to faithfulness metric calls only (name attribute = "faithfulness") + faithfulness_calls = [ + c + for c in set_calls + if c["name"] == "testbench_evaluation_metric" and c["attributes"].get("name") == "faithfulness" + ] + assert len(faithfulness_calls) == 2 + + # Verify gauge.set was called with correct values and all required attributes + # First sample: short question + assert faithfulness_calls[0]["value"] == 0.85 + assert faithfulness_calls[0]["attributes"]["workflow_name"] == "test-workflow" + assert faithfulness_calls[0]["attributes"]["execution_id"] == "exec-test-123" + assert faithfulness_calls[0]["attributes"]["execution_number"] == 42 + assert faithfulness_calls[0]["attributes"]["trace_id"] == "d4e5f6a7b8c9012345678901234567dd" + assert faithfulness_calls[0]["attributes"]["user_input_hash"] == _get_user_input_hash("Question 1") + assert faithfulness_calls[0]["attributes"]["user_input_truncated"] == "Question 1" + + # Second sample: long question (should be truncated) + long_question = "This is a very long question that exceeds fifty characters in length" + assert faithfulness_calls[1]["value"] == 0.80 + assert faithfulness_calls[1]["attributes"]["execution_id"] == "exec-test-123" + assert faithfulness_calls[1]["attributes"]["execution_number"] == 42 + assert faithfulness_calls[1]["attributes"]["trace_id"] == "e5f6a7b8c9d0123456789012345678ee" + assert faithfulness_calls[1]["attributes"]["user_input_hash"] == _get_user_input_hash(long_question) + assert faithfulness_calls[1]["attributes"]["user_input_truncated"] == _get_user_input_truncated(long_question) def test_pushes_via_otlp(monkeypatch): """Test that metrics are pushed via OTLP""" - overall_scores = 
{"faithfulness": 0.85} + evaluation_data = EvaluationData( + individual_results=[{"user_input": "Q1", "faithfulness": 0.85, "trace_id": "f6a7b8c9d0e1234567890123456789ff"}], + total_tokens={"input_tokens": 100, "output_tokens": 50}, + total_cost=0.01, + ) # Mock the meter and gauge class MockGauge: @@ -288,8 +446,10 @@ def mock_exporter_init(endpoint): monkeypatch.setattr("publish.OTLPMetricExporter", mock_exporter_init) create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -304,21 +464,14 @@ def mock_exporter_init(endpoint): def test_handles_push_error(monkeypatch): """Test error handling when OTLP export fails""" - overall_scores = {"faithfulness": 0.85} - - # Mock the meter and gauge - class MockGauge: - def set(self, value, attributes): - pass - - class MockMeter: - def create_gauge(self, name, unit=None, description=None): - return MockGauge() - - mock_meter = MockMeter() + evaluation_data = EvaluationData( + individual_results=[{"user_input": "Q1", "faithfulness": 0.85, "trace_id": "a7b8c9d0e1f2345678901234567890aa"}], + total_tokens={"input_tokens": 0, "output_tokens": 0}, + total_cost=0.0, + ) def mock_get_meter(*args, **kwargs): - return mock_meter + return _OtelMockMeter() # Mock the provider to raise an exception on force_flush shutdown_calls = [] @@ -347,8 +500,10 @@ def mock_exporter_init(endpoint): with pytest.raises(Exception, match="Connection refused"): create_and_push_metrics( - overall_scores=overall_scores, + evaluation_data=evaluation_data, workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -361,11 +516,13 @@ def test_publish_metrics_calls_create_and_push(evaluation_scores_file, monkeypat """Test that publish_metrics calls create_and_push_metrics""" create_push_calls = [] - def mock_create_push(overall_scores, 
workflow_name, otlp_endpoint): + def mock_create_push(evaluation_data, workflow_name, execution_id, execution_number, otlp_endpoint): create_push_calls.append( { - "overall_scores": overall_scores, + "evaluation_data": evaluation_data, "workflow_name": workflow_name, + "execution_id": execution_id, + "execution_number": execution_number, "otlp_endpoint": otlp_endpoint, } ) @@ -375,6 +532,8 @@ def mock_create_push(overall_scores, workflow_name, otlp_endpoint): publish_metrics( input_file=str(evaluation_scores_file), workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -382,16 +541,22 @@ def mock_create_push(overall_scores, workflow_name, otlp_endpoint): assert len(create_push_calls) == 1 # Verify parameters - assert create_push_calls[0]["overall_scores"]["faithfulness"] == 0.85 - assert create_push_calls[0]["overall_scores"]["answer_relevancy"] == 0.90 + assert len(create_push_calls[0]["evaluation_data"].individual_results) == 2 assert create_push_calls[0]["workflow_name"] == "test-workflow" + assert create_push_calls[0]["execution_id"] == "exec-test-123" + assert create_push_calls[0]["execution_number"] == 42 assert create_push_calls[0]["otlp_endpoint"] == "localhost:4318" -def test_publish_metrics_with_empty_scores(temp_dir, monkeypatch): - """Test behavior when overall_scores is empty""" - # Create file with empty overall_scores - test_data = {"overall_scores": {}, "individual_results": []} +def test_publish_metrics_with_empty_results(temp_dir, monkeypatch): + """Test behavior when individual_results is empty""" + # Create file with empty individual_results + test_data = { + "overall_scores": {}, + "individual_results": [], + "total_tokens": {"input_tokens": 0, "output_tokens": 0}, + "total_cost": 0.0, + } empty_file = Path(temp_dir) / "empty_scores.json" with open(empty_file, "w") as f: @@ -399,7 +564,7 @@ def test_publish_metrics_with_empty_scores(temp_dir, monkeypatch): create_push_calls = 
[] - def mock_create_push(overall_scores, workflow_name, otlp_endpoint): + def mock_create_push(evaluation_data, workflow_name, execution_id, execution_number, otlp_endpoint): create_push_calls.append(True) monkeypatch.setattr("publish.create_and_push_metrics", mock_create_push) @@ -407,6 +572,8 @@ def mock_create_push(overall_scores, workflow_name, otlp_endpoint): publish_metrics( input_file=str(empty_file), workflow_name="test-workflow", + execution_id="exec-test-123", + execution_number=42, otlp_endpoint="localhost:4318", ) @@ -467,11 +634,18 @@ def mock_exporter_init(endpoint): publish_metrics( input_file=str(realistic_scores_file), workflow_name="weather-assistant-test", + execution_id="exec-weather-456", + execution_number=42, otlp_endpoint="localhost:4318", ) # Verify OTLPMetricExporter was called assert len(exporter_calls) == 1 - # Verify 4 metrics were created (faithfulness, answer_relevancy, context_precision, context_recall) - assert len(create_gauge_calls) == 4 + # Verify 3 gauges: 1 metric gauge + 1 token gauge + 1 cost gauge + assert len(create_gauge_calls) == 3 + + gauge_names = [call["name"] for call in create_gauge_calls] + assert "testbench_evaluation_metric" in gauge_names + assert "testbench_evaluation_token_usage" in gauge_names + assert "testbench_evaluation_cost" in gauge_names diff --git a/tests/test_run.py b/tests/test_run.py index bfbbe60..8cacc30 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -111,9 +111,7 @@ def mock_httpx_client(): # Call the function result = await run_agent_experiment.func( - test_row, - agent_url="http://test-agent:8000", - workflow_name="test-workflow" + test_row, agent_url="http://test-agent:8000", workflow_name="test-workflow" ) # Verify result structure @@ -156,9 +154,7 @@ def mock_httpx_client(): # Call the function result = await run_agent_experiment.func( - test_row, - agent_url="http://test-agent:8000", - workflow_name="test-workflow" + test_row, agent_url="http://test-agent:8000", 
workflow_name="test-workflow" ) # Verify error is captured in response