From 931b0920a114314d70a2fff428df2f81af98e601 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Wed, 26 Nov 2025 12:59:40 -0700 Subject: [PATCH] support dataset tags and metadata --- .../examples/ExperimentExample.java | 10 +- .../java/dev/braintrust/eval/DatasetCase.java | 8 -- src/main/java/dev/braintrust/eval/Eval.java | 4 + .../java/dev/braintrust/eval/EvalTest.java | 103 ++++++++++++++++++ 4 files changed, 116 insertions(+), 9 deletions(-) diff --git a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java index 74007eb..74b2d1c 100644 --- a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java +++ b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java @@ -7,6 +7,8 @@ import dev.braintrust.eval.DatasetCase; import dev.braintrust.eval.Scorer; import dev.braintrust.instrumentation.openai.BraintrustOpenAI; +import java.util.List; +import java.util.Map; import java.util.function.Function; public class ExperimentExample { @@ -37,7 +39,13 @@ public static void main(String[] args) throws Exception { // will append new cases to // the same experiment .cases( - DatasetCase.of("strawberry", "fruit"), + new DatasetCase<>( + "strawberry", + "fruit", + // custom tags which appear in Braintrust UI + List.of("example"), + // custom metadata passed to scorers + Map.of("calories", 30)), DatasetCase.of("asparagus", "vegetable"), DatasetCase.of("apple", "fruit"), DatasetCase.of("banana", "fruit")) diff --git a/src/main/java/dev/braintrust/eval/DatasetCase.java b/src/main/java/dev/braintrust/eval/DatasetCase.java index 5591ba5..a27d920 100644 --- a/src/main/java/dev/braintrust/eval/DatasetCase.java +++ b/src/main/java/dev/braintrust/eval/DatasetCase.java @@ -10,14 +10,6 @@ public record DatasetCase( OUTPUT expected, @Nonnull List tags, @Nonnull Map metadata) { - public DatasetCase { - if (!metadata.isEmpty()) { - throw new RuntimeException("TODO: metadata support not yet implemented"); - } - if (!tags.isEmpty()) { - throw new RuntimeException("TODO: tags support not yet implemented"); - } - } public static DatasetCase of(INPUT input, OUTPUT expected) { return new DatasetCase<>(input, expected, List.of(), Map.of()); diff --git a/src/main/java/dev/braintrust/eval/Eval.java b/src/main/java/dev/braintrust/eval/Eval.java index 14f9fc1..b5fb9d3 100644 --- a/src/main/java/dev/braintrust/eval/Eval.java +++ b/src/main/java/dev/braintrust/eval/Eval.java @@ -95,6 +95,10 @@ private void evalOne(String experimentId, DatasetCase datasetCase "braintrust.input_json", json(Map.of("input", datasetCase.input()))) .setAttribute("braintrust.expected", json(datasetCase.expected())) .startSpan(); + if (!datasetCase.tags().isEmpty()) { + rootSpan.setAttribute( + AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); + } try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) { final TaskResult taskResult; { // run task diff --git a/src/test/java/dev/braintrust/eval/EvalTest.java b/src/test/java/dev/braintrust/eval/EvalTest.java index 6c7af7f..0e242e7 100644 --- a/src/test/java/dev/braintrust/eval/EvalTest.java +++ b/src/test/java/dev/braintrust/eval/EvalTest.java @@ -8,6 +8,7 @@ import io.opentelemetry.api.common.AttributeKey; import io.opentelemetry.api.trace.SpanId; import io.opentelemetry.sdk.trace.data.SpanData; +import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import lombok.SneakyThrows; @@ -99,4 +100,106 @@ public void evalOtelTraceWithProperAttributes() { boolean isFruitOrVegetable(String str) { return "vegetable".equals(str) || "fruit".equals(str); } + + @Test + @SneakyThrows + public void evalWithTagsAndMetadata() { + var experimentName = "unit-test-eval-tags-metadata"; + + var eval = + testHarness + .braintrust() + .evalBuilder() + .name(experimentName) + .cases( + new DatasetCase<>( + "strawberry", + "fruit", + List.of("red", "sweet"), + Map.of("calories", 32, "season", "summer")), + new DatasetCase<>( + "asparagus", + "vegetable", + List.of("green", "savory"), + Map.of("calories", 20, "season", "spring"))) + .taskFunction(food -> "fruit") + .scorers( + new Scorer() { + @Override + public String getName() { + return "fruit_scorer"; + } + + @Override + public List score( + TaskResult taskResult) { + // Assert metadata is accessible and valid + var metadata = taskResult.datasetCase().metadata(); + assertNotNull( + metadata, + "metadata should be accessible in scorer"); + assertFalse( + metadata.isEmpty(), "metadata should not be empty"); + assertTrue( + metadata.containsKey("calories"), + "metadata should contain calories"); + assertTrue( + metadata.containsKey("season"), + "metadata should contain season"); + + // Verify specific values based on input + var input = taskResult.datasetCase().input(); + if ("strawberry".equals(input)) { + assertEquals( + 32, + metadata.get("calories"), + "strawberry should have 32 calories"); + assertEquals( + "summer", + metadata.get("season"), + "strawberry should be summer season"); + } else if ("asparagus".equals(input)) { + assertEquals( + 20, + metadata.get("calories"), + "asparagus should have 20 calories"); + assertEquals( + "spring", + metadata.get("season"), + "asparagus should be spring season"); + } + + var score = "fruit".equals(taskResult.result()) ? 1.0 : 0.0; + return List.of(new Score("fruit_scorer", score)); + } + }) + .build(); + var result = eval.run(); + assertEquals( + "%s/experiments/%s" + .formatted(testHarness.braintrust().projectUri(), experimentName), + result.getExperimentUrl()); + var spans = testHarness.awaitExportedSpans(); + + final AtomicInteger numRootSpans = new AtomicInteger(0); + for (SpanData span : spans) { + if (span.getParentSpanId().equals(SpanId.getInvalid())) { + // This is a root span - check for tags and metadata + var tags = span.getAttributes().get(AttributeKey.stringArrayKey("braintrust.tags")); + + assertNotNull(tags, "root span should have tags"); + + assertEquals(2, tags.size(), "each case should have 2 tags"); + assertTrue( + tags.contains("red") || tags.contains("green"), + "tags should contain color"); + assertTrue( + tags.contains("sweet") || tags.contains("savory"), + "tags should contain taste"); + + numRootSpans.incrementAndGet(); + } + } + assertEquals(2, numRootSpans.get(), "both cases should have tags and metadata"); + } }