From 67081cefbdb510c482bce5098c8b9b204f90ea27 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Thu, 11 Dec 2025 17:19:34 -0500 Subject: [PATCH 1/2] update scorer convenience function to take expected and result The prior helper function of just taking the result is useless outside of trivial examples --- README.md | 7 ++----- .../braintrust/examples/ExperimentExample.java | 15 ++++++--------- src/main/java/dev/braintrust/eval/Scorer.java | 11 ++++------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 85d3430..0d33aaa 100644 --- a/README.md +++ b/README.md @@ -57,11 +57,8 @@ var eval = braintrust.evalBuilder() .taskFunction(getFoodType) .scorers( Scorer.of( - "fruit_scorer", - result -> "fruit".equals(result) ? 1.0 : 0.0), - Scorer.of( - "vegetable_scorer", - result -> "vegetable".equals(result) ? 1.0 : 0.0)) + "exact_match", + (expected, result) -> expected.equals(result) ? 1.0 : 0.0)) .build(); var result = eval.run(); System.out.println("\n\n" + result.createReportString()); diff --git a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java index 74b2d1c..7c3de22 100644 --- a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java +++ b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java @@ -34,10 +34,8 @@ public static void main(String[] args) throws Exception { var eval = braintrust .evalBuilder() - .name("java-eval-x-" + System.currentTimeMillis()) // NOTE: if you use a - // constant, additional runs - // will append new cases to - // the same experiment + // NOTE: pre-existing experiment names will append results + .name("java-eval-x-" + System.currentTimeMillis()) .cases( new DatasetCase<>( "strawberry", @@ -49,14 +47,13 @@ public static void main(String[] args) throws Exception { DatasetCase.of("asparagus", "vegetable"), DatasetCase.of("apple", "fruit"), DatasetCase.of("banana", "fruit")) + // Or, to fetch a remote dataset: + // .dataset(braintrust.fetchDataset("my-dataset-name")) .taskFunction(getFoodType) .scorers( Scorer.of( - "fruit_scorer", - result -> "fruit".equals(result) ? 1.0 : 0.0), - Scorer.of( - "vegetable_scorer", - result -> "vegetable".equals(result) ? 1.0 : 0.0)) + "exact_match", + (expected, result) -> expected.equals(result) ? 1.0 : 0.0)) .build(); var result = eval.run(); System.out.println("\n\n" + result.createReportString()); diff --git a/src/main/java/dev/braintrust/eval/Scorer.java b/src/main/java/dev/braintrust/eval/Scorer.java index f28d610..2ea8c9a 100644 --- a/src/main/java/dev/braintrust/eval/Scorer.java +++ b/src/main/java/dev/braintrust/eval/Scorer.java @@ -17,7 +17,7 @@ public interface Scorer { List score(TaskResult taskResult); static Scorer of( - String scorerName, Function scorerFn) { + String scorerName, Function, Double> scorerFn) { return new Scorer<>() { @Override public String getName() { @@ -26,15 +26,13 @@ public String getName() { @Override public List score(TaskResult taskResult) { - return List.of(new Score(scorerName, scorerFn.apply(taskResult.result()))); + return List.of(new Score(scorerName, scorerFn.apply(taskResult))); } }; } - /** Deprecated. Use {@link #of(String, Function)} or implement the Scorer interface instead. */ - @Deprecated static Scorer of( - String scorerName, BiFunction, OUTPUT, Double> scorerFn) { + String scorerName, BiFunction scorerFn) { return new Scorer<>() { @Override public String getName() { @@ -47,8 +45,7 @@ public List score(TaskResult taskResult) { new Score( scorerName, scorerFn.apply( - EvalCase.from(taskResult.datasetCase()), - taskResult.result()))); + taskResult.datasetCase().expected(), taskResult.result()))); } }; } From d02b8c0a7860322a6433499ebd917122e291b3f6 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Thu, 11 Dec 2025 18:49:51 -0500 Subject: [PATCH 2/2] add origin info to dataset case --- .../examples/ExperimentExample.java | 2 +- src/main/java/dev/braintrust/Origin.java | 16 +++++ .../eval/DatasetBrainstoreImpl.java | 15 +++- .../java/dev/braintrust/eval/DatasetCase.java | 16 ++++- src/main/java/dev/braintrust/eval/Eval.java | 4 +- .../eval/DatasetBrainstoreImplTest.java | 71 ++++++++++++++++--- .../java/dev/braintrust/eval/EvalTest.java | 65 ++++++++++++++++- 7 files changed, 170 insertions(+), 19 deletions(-) create mode 100644 src/main/java/dev/braintrust/Origin.java diff --git a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java index 7c3de22..f2cd758 100644 --- a/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java +++ b/examples/src/main/java/dev/braintrust/examples/ExperimentExample.java @@ -37,7 +37,7 @@ public static void main(String[] args) throws Exception { // NOTE: pre-existing experiment names will append results .name("java-eval-x-" + System.currentTimeMillis()) .cases( - new DatasetCase<>( + DatasetCase.of( "strawberry", "fruit", // custom tags which appear in Braintrust UI diff --git a/src/main/java/dev/braintrust/Origin.java b/src/main/java/dev/braintrust/Origin.java new file mode 100644 index 0000000..e13f73f --- /dev/null +++ b/src/main/java/dev/braintrust/Origin.java @@ -0,0 +1,16 @@ +package dev.braintrust; + +import com.fasterxml.jackson.annotation.JsonProperty; + +/** Generic pointer to an object in braintrust */ +public record Origin( + /** origin type. e.g. dataset, playground_logs */ + @JsonProperty("object_type") String objectType, + /** id of the object. e.g. dataset id */ + @JsonProperty("object_id") String objectId, + /** id of the specific item within the origin. e.g. dataset row id */ + @JsonProperty("id") String id, + /** origin xact id */ + @JsonProperty("_xact_id") String xactId, + /** creation timestamp of the origin */ + @JsonProperty("created") String createdTimestamp) {} diff --git a/src/main/java/dev/braintrust/eval/DatasetBrainstoreImpl.java b/src/main/java/dev/braintrust/eval/DatasetBrainstoreImpl.java index 606c8c0..ed15eaf 100644 --- a/src/main/java/dev/braintrust/eval/DatasetBrainstoreImpl.java +++ b/src/main/java/dev/braintrust/eval/DatasetBrainstoreImpl.java @@ -91,7 +91,20 @@ public Optional> next() { } DatasetCase datasetCase = - new DatasetCase<>(input, expected, tags, metadata); + new DatasetCase<>( + input, + expected, + tags, + metadata, + Optional.of( + new dev.braintrust.Origin( + "dataset", + Objects.requireNonNull( + (String) event.get("dataset_id")), + Objects.requireNonNull((String) event.get("id")), + Objects.requireNonNull((String) event.get("_xact_id")), + Objects.requireNonNull( + (String) event.get("created"))))); return Optional.of(datasetCase); } diff --git a/src/main/java/dev/braintrust/eval/DatasetCase.java b/src/main/java/dev/braintrust/eval/DatasetCase.java index a27d920..4524b6b 100644 --- a/src/main/java/dev/braintrust/eval/DatasetCase.java +++ b/src/main/java/dev/braintrust/eval/DatasetCase.java @@ -1,7 +1,9 @@ package dev.braintrust.eval; +import dev.braintrust.Origin; import java.util.List; import java.util.Map; +import java.util.Optional; import javax.annotation.Nonnull; /** A single row in a dataset. */ @@ -9,9 +11,19 @@ public record DatasetCase( INPUT input, OUTPUT expected, @Nonnull List tags, - @Nonnull Map metadata) { + @Nonnull Map metadata, + /** origin information. empty for in-memory cases */ + Optional origin) { public static DatasetCase of(INPUT input, OUTPUT expected) { - return new DatasetCase<>(input, expected, List.of(), Map.of()); + return of(input, expected, List.of(), Map.of()); + } + + public static DatasetCase of( + INPUT input, + OUTPUT expected, + @Nonnull List tags, + @Nonnull Map metadata) { + return new DatasetCase<>(input, expected, tags, metadata, Optional.empty()); } } diff --git a/src/main/java/dev/braintrust/eval/Eval.java b/src/main/java/dev/braintrust/eval/Eval.java index d08b4cd..b09c143 100644 --- a/src/main/java/dev/braintrust/eval/Eval.java +++ b/src/main/java/dev/braintrust/eval/Eval.java @@ -76,7 +76,6 @@ public EvalResult run() { @SneakyThrows private void evalOne(String experimentId, DatasetCase datasetCase) { - JSON_MAPPER.writeValueAsString(Map.of("type", "eval")); var rootSpan = tracer.spanBuilder("eval") // TODO: allow names for eval cases .setNoParent() // each eval case is its own trace @@ -87,6 +86,9 @@ private void evalOne(String experimentId, DatasetCase datasetCase "braintrust.input_json", json(Map.of("input", datasetCase.input()))) .setAttribute("braintrust.expected", json(datasetCase.expected())) .startSpan(); + if (datasetCase.origin().isPresent()) { + rootSpan.setAttribute("braintrust.origin", json(datasetCase.origin().get())); + } if (!datasetCase.tags().isEmpty()) { rootSpan.setAttribute( AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); diff --git a/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java b/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java index 6c536c5..54bd08f 100644 --- a/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java +++ b/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java @@ -7,9 +7,7 @@ import com.github.tomakehurst.wiremock.junit5.WireMockExtension; import dev.braintrust.api.BraintrustApiClient; import dev.braintrust.config.BraintrustConfig; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; +import java.util.*; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -52,12 +50,20 @@ void testFetchAll() { { "events": [ { - "id": "event-1", + "object_type": "dataset", + "dataset_id": "test-dataset-123", + "id": "123-1", + "created": "sometimestamp", + "_xact_id": "1", "input": "Question 1", "expected": "Answer 1" }, { - "id": "event-2", + "object_type": "dataset", + "dataset_id": "test-dataset-123", + "id": "123-2", + "_xact_id": "1", + "created": "sometimestamp", "input": "Question 2", "expected": "Answer 2" } @@ -79,7 +85,11 @@ void testFetchAll() { { "events": [ { - "id": "event-3", + "object_type": "dataset", + "dataset_id": "test-dataset-123", + "id": "123-3", + "_xact_id": "1", + "created": "sometimestamp", "input": "Question 3", "expected": "Answer 3" } @@ -97,9 +107,40 @@ void testFetchAll() { // Verify we got all 3 cases assertEquals(3, cases.size()); - assertEquals("Question 1", cases.get(0).input()); + List tags = List.of(); + Map metadata = Map.of(); + assertEquals( + new DatasetCase<>( + "Question 1", + "Answer 1", + tags, + metadata, + Optional.of( + new dev.braintrust.Origin( + "dataset", datasetId, "123-1", "1", "sometimestamp"))), + cases.get(0)); assertEquals("Question 2", cases.get(1).input()); + assertEquals( + new DatasetCase<>( + "Question 2", + "Answer 2", + tags, + metadata, + Optional.of( + new dev.braintrust.Origin( + "dataset", datasetId, "123-2", "1", "sometimestamp"))), + cases.get(1)); assertEquals("Question 3", cases.get(2).input()); + assertEquals( + new DatasetCase<>( + "Question 3", + "Answer 3", + tags, + metadata, + Optional.of( + new dev.braintrust.Origin( + "dataset", datasetId, "123-3", "1", "sometimestamp"))), + cases.get(2)); // Verify the API was called twice (once for each batch) wireMock.verify(2, postRequestedFor(urlEqualTo("/v1/dataset/" + datasetId + "/fetch"))); @@ -155,12 +196,16 @@ void testFetchWithPinnedVersion() { { "objects": [ { + "object_type": "dataset", + "dataset_id": "test-dataset-123", "id": "dataset-789", "project_id": "proj-456", "name": "test-dataset", "description": "Test dataset", - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-15T12:30:00Z" + "_xact_id": "12345", + "input": "test input", + "expected": "test output", + "created": "sometimestamp" } ] } @@ -179,11 +224,15 @@ void testFetchWithPinnedVersion() { { "events": [ { - "id": "event-1", + "object_type": "dataset", + "dataset_id": "test-dataset-123", + "id": "some-row-id", "input": "test input", "expected": "test output", "metadata": {}, - "tags": [] + "tags": [], + "_xact_id": "12346", + "created": "sometimestamp" } ], "cursor": null diff --git a/src/test/java/dev/braintrust/eval/EvalTest.java b/src/test/java/dev/braintrust/eval/EvalTest.java index 0e242e7..fdeef97 100644 --- a/src/test/java/dev/braintrust/eval/EvalTest.java +++ b/src/test/java/dev/braintrust/eval/EvalTest.java @@ -2,6 +2,8 @@ import static org.junit.jupiter.api.Assertions.*; +import com.fasterxml.jackson.databind.ObjectMapper; +import dev.braintrust.Origin; import dev.braintrust.TestHarness; import dev.braintrust.api.BraintrustApiClient; import dev.braintrust.trace.BraintrustTracing; @@ -10,12 +12,15 @@ import io.opentelemetry.sdk.trace.data.SpanData; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; import lombok.SneakyThrows; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; public class EvalTest { + private static final ObjectMapper JSON_MAPPER = + new com.fasterxml.jackson.databind.ObjectMapper(); private TestHarness testHarness; @BeforeEach @@ -112,12 +117,12 @@ public void evalWithTagsAndMetadata() { .evalBuilder() .name(experimentName) .cases( - new DatasetCase<>( + DatasetCase.of( "strawberry", "fruit", List.of("red", "sweet"), Map.of("calories", 32, "season", "summer")), - new DatasetCase<>( + DatasetCase.of( "asparagus", "vegetable", List.of("green", "savory"), @@ -196,10 +201,64 @@ public List score( assertTrue( tags.contains("sweet") || tags.contains("savory"), "tags should contain taste"); - numRootSpans.incrementAndGet(); } } assertEquals(2, numRootSpans.get(), "both cases should have tags and metadata"); } + + @Test + @SneakyThrows + public void evalRootSpanPassesOriginIfPresent() { + var experimentName = "unit-test-eval-origin"; + var testOrigin = new Origin("unit-test", "1234", "5678", "9", "whatever"); + + var eval = + testHarness + .braintrust() + .evalBuilder() + .name(experimentName) + .cases( + DatasetCase.of("no-origin", "whatever"), + new DatasetCase<>( + "has-origin", + "whatever", + List.of(), + Map.of(), + Optional.of(testOrigin))) + .taskFunction(food -> "fruit") + .scorers( + Scorer.of( + "exact_match", + (expected, result) -> expected.equals(result) ? 1.0 : 0.0)) + .build(); + var result = eval.run(); + assertEquals( + "%s/experiments/%s" + .formatted(testHarness.braintrust().projectUri(), experimentName), + result.getExperimentUrl()); + var spans = testHarness.awaitExportedSpans(); + + final AtomicInteger numRootSpans = new AtomicInteger(0); + for (SpanData span : spans) { + if (span.getParentSpanId().equals(SpanId.getInvalid())) { + // This is a root span - check for origin + var inputJson = + span.getAttributes().get(AttributeKey.stringKey("braintrust.input_json")); + assertNotNull(inputJson); + JSON_MAPPER.readValue(inputJson, Map.class); + var input = (String) (JSON_MAPPER.readValue(inputJson, Map.class)).get("input"); + assertNotNull(input); + var origin = span.getAttributes().get(AttributeKey.stringKey("braintrust.origin")); + switch (input) { + case "no-origin" -> assertNull(origin); + case "has-origin" -> + assertEquals(JSON_MAPPER.writeValueAsString(testOrigin), origin); + default -> fail("unexpected input: " + input); + } + numRootSpans.incrementAndGet(); + } + } + assertEquals(2, numRootSpans.get(), "should test for origin presence and absence"); + } }