diff --git a/autoblocks/_impl/testing/v2/api.py b/autoblocks/_impl/testing/v2/api.py
index 27ab8350..5c6e7ab0 100644
--- a/autoblocks/_impl/testing/v2/api.py
+++ b/autoblocks/_impl/testing/v2/api.py
@@ -156,6 +156,7 @@ async def send_create_result(
     evaluator_id_to_result: dict[str, bool],
     evaluator_id_to_reason: dict[str, str],
     evaluator_id_to_score: dict[str, float],
+    evaluator_id_to_metadata: dict[str, dict[str, Any]],
     run_message: Optional[str] = None,
 ) -> Response:
     return await post_to_api(
@@ -175,5 +176,6 @@ async def send_create_result(
             evaluatorIdToResult=evaluator_id_to_result,
             evaluatorIdToReason=evaluator_id_to_reason,
             evaluatorIdToScore=evaluator_id_to_score,
+            evaluatorIdToMetadata=evaluator_id_to_metadata,
         ),
     )
diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py
index 04011a0b..eeb6bc27 100644
--- a/autoblocks/_impl/testing/v2/run_manager.py
+++ b/autoblocks/_impl/testing/v2/run_manager.py
@@ -99,10 +99,11 @@ async def _compute_evaluations(
     @staticmethod
     def _evaluations_to_maps(
         evals: List[EvaluationWithId],
-    ) -> tuple[dict[str, bool], dict[str, str], dict[str, float]]:
+    ) -> tuple[dict[str, bool], dict[str, str], dict[str, float], dict[str, dict[str, Any]]]:
         id_to_result: dict[str, bool] = {}
         id_to_reason: dict[str, str] = {}
         id_to_score: dict[str, float] = {}
+        id_to_metadata: dict[str, dict[str, Any]] = {}

         for e in evals:
             # Reconstruct Evaluation to compute passed()
@@ -137,7 +138,8 @@ def _evaluations_to_maps(
             id_to_reason[e.id] = reason
             id_to_score[e.id] = e.score
-        return id_to_result, id_to_reason, id_to_score
+            id_to_metadata[e.id] = e.metadata or {}
+        return id_to_result, id_to_reason, id_to_score, id_to_metadata

     async def async_add_result(
         self,
@@ -162,7 +164,7 @@ async def async_add_result(
             output=output,
             evaluators=evaluators or [],
         )
-        eval_result_map, eval_reason_map, eval_score_map = self._evaluations_to_maps(evals)
+        eval_result_map, eval_reason_map, eval_score_map, eval_metadata_map = self._evaluations_to_maps(evals)

         # Serialize input/output with standard json to match expected formatting in tests
         input_raw = json.dumps(serialize_test_case(test_case))
@@ -186,6 +188,7 @@ async def async_add_result(
             evaluator_id_to_result=eval_result_map,
             evaluator_id_to_reason=eval_reason_map,
             evaluator_id_to_score=eval_score_map,
+            evaluator_id_to_metadata=eval_metadata_map,
             run_message=self.run_message,
         )
         data = resp.json()
diff --git a/tests/autoblocks/test_run_manager_v2.py b/tests/autoblocks/test_run_manager_v2.py
index 1ba9e311..4e61717c 100644
--- a/tests/autoblocks/test_run_manager_v2.py
+++ b/tests/autoblocks/test_run_manager_v2.py
@@ -94,6 +94,7 @@ def test_full_lifecycle_v2(httpx_mock):
             evaluatorIdToResult={"evaluator-external-id": True},
             evaluatorIdToReason={"evaluator-external-id": "ok"},
             evaluatorIdToScore={"evaluator-external-id": 1},
+            evaluatorIdToMetadata={"evaluator-external-id": {"reason": "ok"}},
         ),
         json=dict(executionId="mock-exec-id"),
     )
@@ -180,6 +181,7 @@ def test_add_result_without_evaluators_sends_empty_maps(httpx_mock):
             evaluatorIdToResult={},
             evaluatorIdToReason={},
             evaluatorIdToScore={},
+            evaluatorIdToMetadata={},
         ),
         json=dict(executionId="exec-1"),
     )
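
Note for reviewers: below is a minimal, self-contained sketch of the metadata plumbing this diff adds. The dataclass is a hypothetical stand-in for the SDK's EvaluationWithId type (the real type carries more fields); only the "e.metadata or {}" fallback and the resulting evaluatorIdToMetadata shape mirror the change itself.

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class EvaluationWithId:  # hypothetical stand-in for the SDK type
    id: str
    score: float
    metadata: Optional[dict[str, Any]] = None


def metadata_map(evals: list[EvaluationWithId]) -> dict[str, dict[str, Any]]:
    # Same fallback as the new id_to_metadata logic: evaluators that return
    # no metadata contribute an empty dict, so the serialized payload always
    # has an evaluatorIdToMetadata entry per evaluator id.
    return {e.id: e.metadata or {} for e in evals}


evals = [EvaluationWithId(id="evaluator-external-id", score=1.0, metadata={"reason": "ok"})]
assert metadata_map(evals) == {"evaluator-external-id": {"reason": "ok"}}
assert metadata_map([EvaluationWithId(id="e2", score=0.0)]) == {"e2": {}}

This is also why the second test expects evaluatorIdToMetadata={} rather than the key being absent: the map is always sent, empty when no evaluators ran.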