Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import dev.braintrust.eval.DatasetCase;
import dev.braintrust.eval.Scorer;
import dev.braintrust.instrumentation.openai.BraintrustOpenAI;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

public class ExperimentExample {
Expand Down Expand Up @@ -37,7 +39,13 @@ public static void main(String[] args) throws Exception {
// will append new cases to
// the same experiment
.cases(
DatasetCase.of("strawberry", "fruit"),
new DatasetCase<>(
"strawberry",
"fruit",
// custom tags which appear in Braintrust UI
List.of("example"),
// custom metadata passed to scorers
Map.of("calories", 30)),
DatasetCase.of("asparagus", "vegetable"),
DatasetCase.of("apple", "fruit"),
DatasetCase.of("banana", "fruit"))
Expand Down
8 changes: 0 additions & 8 deletions src/main/java/dev/braintrust/eval/DatasetCase.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,6 @@ public record DatasetCase<INPUT, OUTPUT>(
OUTPUT expected,
@Nonnull List<String> tags,
@Nonnull Map<String, Object> metadata) {
/**
 * Canonical constructor. Accepts any tags and metadata (both may be empty) and stores
 * immutable snapshots of them, so later changes to the caller's collections do not alter
 * this case.
 *
 * @throws NullPointerException if {@code tags} or {@code metadata} is null, or if either
 *     contains a null element, key, or value (the contract of {@code List.copyOf} and
 *     {@code Map.copyOf})
 */
public DatasetCase {
    // Non-empty tags and metadata are valid inputs: callers build cases with both, and the
    // eval run reads tags() when it is non-empty.
    tags = List.copyOf(tags);
    metadata = Map.copyOf(metadata);
}

public static <INPUT, OUTPUT> DatasetCase<INPUT, OUTPUT> of(INPUT input, OUTPUT expected) {
return new DatasetCase<>(input, expected, List.of(), Map.of());
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/dev/braintrust/eval/Eval.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
"braintrust.input_json", json(Map.of("input", datasetCase.input())))
.setAttribute("braintrust.expected", json(datasetCase.expected()))
.startSpan();
if (!datasetCase.tags().isEmpty()) {
rootSpan.setAttribute(
AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
}
try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) {
final TaskResult<INPUT, OUTPUT> taskResult;
{ // run task
Expand Down
103 changes: 103 additions & 0 deletions src/test/java/dev/braintrust/eval/EvalTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import io.opentelemetry.api.common.AttributeKey;
import io.opentelemetry.api.trace.SpanId;
import io.opentelemetry.sdk.trace.data.SpanData;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import lombok.SneakyThrows;
Expand Down Expand Up @@ -99,4 +100,106 @@ public void evalOtelTraceWithProperAttributes() {
/**
 * Reports whether {@code str} is exactly {@code "fruit"} or {@code "vegetable"}.
 *
 * <p>The comparison is case-sensitive, and a null argument yields {@code false}.
 */
boolean isFruitOrVegetable(String str) {
    if ("fruit".equals(str)) {
        return true;
    }
    return "vegetable".equals(str);
}

/**
 * Runs a two-case eval whose cases carry tags and metadata, then checks two things: the
 * metadata is readable from inside the scorer, and each case's tags are exported on its root
 * span under the {@code braintrust.tags} string-array attribute.
 */
@Test
@SneakyThrows
public void evalWithTagsAndMetadata() {
    var experimentName = "unit-test-eval-tags-metadata";
    // Incremented only after every in-scorer assertion has passed, so the count checked after
    // the run proves the metadata assertions actually executed for each case.
    final AtomicInteger verifiedScorerCalls = new AtomicInteger(0);

    var eval =
            testHarness
                    .braintrust()
                    .<String, String>evalBuilder()
                    .name(experimentName)
                    .cases(
                            new DatasetCase<>(
                                    "strawberry",
                                    "fruit",
                                    List.of("red", "sweet"),
                                    Map.of("calories", 32, "season", "summer")),
                            new DatasetCase<>(
                                    "asparagus",
                                    "vegetable",
                                    List.of("green", "savory"),
                                    Map.of("calories", 20, "season", "spring")))
                    .taskFunction(food -> "fruit")
                    .scorers(
                            new Scorer<String, String>() {
                                @Override
                                public String getName() {
                                    return "fruit_scorer";
                                }

                                @Override
                                public List<Score> score(
                                        TaskResult<String, String> taskResult) {
                                    // Assert metadata is accessible and valid
                                    var metadata = taskResult.datasetCase().metadata();
                                    assertNotNull(
                                            metadata,
                                            "metadata should be accessible in scorer");
                                    assertFalse(
                                            metadata.isEmpty(), "metadata should not be empty");
                                    assertTrue(
                                            metadata.containsKey("calories"),
                                            "metadata should contain calories");
                                    assertTrue(
                                            metadata.containsKey("season"),
                                            "metadata should contain season");

                                    // Verify specific values based on input
                                    var input = taskResult.datasetCase().input();
                                    if ("strawberry".equals(input)) {
                                        assertEquals(
                                                32,
                                                metadata.get("calories"),
                                                "strawberry should have 32 calories");
                                        assertEquals(
                                                "summer",
                                                metadata.get("season"),
                                                "strawberry should be summer season");
                                    } else if ("asparagus".equals(input)) {
                                        assertEquals(
                                                20,
                                                metadata.get("calories"),
                                                "asparagus should have 20 calories");
                                        assertEquals(
                                                "spring",
                                                metadata.get("season"),
                                                "asparagus should be spring season");
                                    }

                                    verifiedScorerCalls.incrementAndGet();
                                    var score = "fruit".equals(taskResult.result()) ? 1.0 : 0.0;
                                    return List.of(new Score("fruit_scorer", score));
                                }
                            })
                    .build();
    var result = eval.run();
    assertEquals(
            "%s/experiments/%s"
                    .formatted(testHarness.braintrust().projectUri(), experimentName),
            result.getExperimentUrl());
    assertEquals(
            2,
            verifiedScorerCalls.get(),
            "scorer metadata assertions should have run once per dataset case");
    var spans = testHarness.awaitExportedSpans();

    int strawberryRootSpans = 0;
    int asparagusRootSpans = 0;
    for (SpanData span : spans) {
        if (!span.getParentSpanId().equals(SpanId.getInvalid())) {
            continue;
        }
        // This is a root span - it must carry the tags of exactly one dataset case
        var tags = span.getAttributes().get(AttributeKey.stringArrayKey("braintrust.tags"));

        assertNotNull(tags, "root span should have tags");
        assertEquals(2, tags.size(), "each case should have 2 tags");

        // With the size pinned to 2, containsAll is an order-independent exact match; it
        // rejects mixed lists such as [red, savory].
        boolean strawberryTags = tags.containsAll(List.of("red", "sweet"));
        boolean asparagusTags = tags.containsAll(List.of("green", "savory"));
        assertTrue(
                strawberryTags || asparagusTags,
                "tags should match one dataset case exactly, but were " + tags);

        if (strawberryTags) {
            strawberryRootSpans++;
        } else {
            asparagusRootSpans++;
        }
    }
    assertEquals(1, strawberryRootSpans, "strawberry case should export one tagged root span");
    assertEquals(1, asparagusRootSpans, "asparagus case should export one tagged root span");
}
}