diff --git a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java
index 74007eb..74b2d1c 100644
--- a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java
+++ b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java
@@ -7,6 +7,8 @@
import dev.braintrust.eval.DatasetCase;
import dev.braintrust.eval.Scorer;
import dev.braintrust.instrumentation.openai.BraintrustOpenAI;
+import java.util.List;
+import java.util.Map;
import java.util.function.Function;
public class ExperimentExample {
@@ -37,7 +39,13 @@ public static void main(String[] args) throws Exception {
// will append new cases to
// the same experiment
.cases(
- DatasetCase.of("strawberry", "fruit"),
+ new DatasetCase<>(
+ "strawberry",
+ "fruit",
+ // custom tags which appear in Braintrust UI
+ List.of("example"),
+ // custom metadata passed to scorers
+ Map.of("calories", 30)),
DatasetCase.of("asparagus", "vegetable"),
DatasetCase.of("apple", "fruit"),
DatasetCase.of("banana", "fruit"))
diff --git a/src/main/java/dev/braintrust/eval/DatasetCase.java b/src/main/java/dev/braintrust/eval/DatasetCase.java
index 5591ba5..a27d920 100644
--- a/src/main/java/dev/braintrust/eval/DatasetCase.java
+++ b/src/main/java/dev/braintrust/eval/DatasetCase.java
@@ -10,14 +10,6 @@ public record DatasetCase(
OUTPUT expected,
@Nonnull List tags,
@Nonnull Map metadata) {
- public DatasetCase {
- if (!metadata.isEmpty()) {
- throw new RuntimeException("TODO: metadata support not yet implemented");
- }
- if (!tags.isEmpty()) {
- throw new RuntimeException("TODO: tags support not yet implemented");
- }
- }
public static DatasetCase of(INPUT input, OUTPUT expected) {
return new DatasetCase<>(input, expected, List.of(), Map.of());
diff --git a/src/main/java/dev/braintrust/eval/Eval.java b/src/main/java/dev/braintrust/eval/Eval.java
index 14f9fc1..b5fb9d3 100644
--- a/src/main/java/dev/braintrust/eval/Eval.java
+++ b/src/main/java/dev/braintrust/eval/Eval.java
@@ -95,6 +95,10 @@ private void evalOne(String experimentId, DatasetCase datasetCase
"braintrust.input_json", json(Map.of("input", datasetCase.input())))
.setAttribute("braintrust.expected", json(datasetCase.expected()))
.startSpan();
+ if (!datasetCase.tags().isEmpty()) {
+ rootSpan.setAttribute(
+ AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
+ }
try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) {
final TaskResult taskResult;
{ // run task
diff --git a/src/test/java/dev/braintrust/eval/EvalTest.java b/src/test/java/dev/braintrust/eval/EvalTest.java
index 6c7af7f..0e242e7 100644
--- a/src/test/java/dev/braintrust/eval/EvalTest.java
+++ b/src/test/java/dev/braintrust/eval/EvalTest.java
@@ -8,6 +8,7 @@
import io.opentelemetry.api.common.AttributeKey;
import io.opentelemetry.api.trace.SpanId;
import io.opentelemetry.sdk.trace.data.SpanData;
+import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import lombok.SneakyThrows;
@@ -99,4 +100,125 @@ public void evalOtelTraceWithProperAttributes() {
     boolean isFruitOrVegetable(String str) {
         return "vegetable".equals(str) || "fruit".equals(str);
     }
+
+    /**
+     * Verifies that per-case tags are exported on each root eval span and that per-case metadata
+     * is readable by scorers through {@code taskResult.datasetCase().metadata()}.
+     */
+    @Test
+    @SneakyThrows
+    public void evalWithTagsAndMetadata() {
+        var experimentName = "unit-test-eval-tags-metadata";
+        // Counts scorer calls so the in-scorer metadata assertions cannot be skipped silently
+        final AtomicInteger scorerInvocations = new AtomicInteger(0);
+
+        var eval =
+                testHarness
+                        .braintrust()
+                        .evalBuilder()
+                        .name(experimentName)
+                        .cases(
+                                new DatasetCase<>(
+                                        "strawberry",
+                                        "fruit",
+                                        List.of("red", "sweet"),
+                                        Map.of("calories", 32, "season", "summer")),
+                                new DatasetCase<>(
+                                        "asparagus",
+                                        "vegetable",
+                                        List.of("green", "savory"),
+                                        Map.of("calories", 20, "season", "spring")))
+                        .taskFunction(food -> "fruit")
+                        .scorers(
+                                new Scorer() {
+                                    @Override
+                                    public String getName() {
+                                        return "fruit_scorer";
+                                    }
+
+                                    @Override
+                                    public List score(
+                                            TaskResult taskResult) {
+                                        scorerInvocations.incrementAndGet();
+
+                                        // Assert metadata is accessible and valid
+                                        var metadata = taskResult.datasetCase().metadata();
+                                        assertNotNull(
+                                                metadata,
+                                                "metadata should be accessible in scorer");
+                                        assertFalse(
+                                                metadata.isEmpty(), "metadata should not be empty");
+                                        assertTrue(
+                                                metadata.containsKey("calories"),
+                                                "metadata should contain calories");
+                                        assertTrue(
+                                                metadata.containsKey("season"),
+                                                "metadata should contain season");
+
+                                        // Verify values per input; reject unknown inputs
+                                        var input = taskResult.datasetCase().input();
+                                        if ("strawberry".equals(input)) {
+                                            assertEquals(
+                                                    32,
+                                                    metadata.get("calories"),
+                                                    "strawberry should have 32 calories");
+                                            assertEquals(
+                                                    "summer",
+                                                    metadata.get("season"),
+                                                    "strawberry should be summer season");
+                                        } else {
+                                            assertEquals(
+                                                    "asparagus",
+                                                    input,
+                                                    "scorer received an unexpected input");
+                                            assertEquals(
+                                                    20,
+                                                    metadata.get("calories"),
+                                                    "asparagus should have 20 calories");
+                                            assertEquals(
+                                                    "spring",
+                                                    metadata.get("season"),
+                                                    "asparagus should be spring season");
+                                        }
+
+                                        var score = "fruit".equals(taskResult.result()) ? 1.0 : 0.0;
+                                        return List.of(new Score("fruit_scorer", score));
+                                    }
+                                })
+                        .build();
+        var result = eval.run();
+        assertEquals(
+                "%s/experiments/%s"
+                        .formatted(testHarness.braintrust().projectUri(), experimentName),
+                result.getExperimentUrl());
+        assertEquals(
+                2,
+                scorerInvocations.get(),
+                "scorer should run once per case so metadata is checked for both");
+        var spans = testHarness.awaitExportedSpans();
+
+        int strawberryRootSpans = 0;
+        int asparagusRootSpans = 0;
+        for (SpanData span : spans) {
+            if (!span.getParentSpanId().equals(SpanId.getInvalid())) {
+                continue;
+            }
+            // Root span: its tags must be exactly the tag pair of one of the two cases
+            var tags = span.getAttributes().get(AttributeKey.stringArrayKey("braintrust.tags"));
+            assertNotNull(tags, "root span should have tags");
+            assertEquals(2, tags.size(), "each case should have 2 tags");
+            if (tags.containsAll(List.of("red", "sweet"))) {
+                strawberryRootSpans++;
+            } else {
+                assertTrue(
+                        tags.containsAll(List.of("green", "savory")),
+                        "tags should match one case exactly, got " + tags);
+                asparagusRootSpans++;
+            }
+        }
+        assertEquals(
+                1, strawberryRootSpans, "strawberry case should export one tagged root span");
+        assertEquals(
+                1, asparagusRootSpans, "asparagus case should export one tagged root span");
+    }
 }