From c9646bbeb55e0e38f50c5efc0e345d15dd94fca0 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Mon, 6 Oct 2025 10:43:10 +0200 Subject: [PATCH 01/10] feature: add evaluator metadata support to V2 API for custom data display --- autoblocks/_impl/testing/v2/api.py | 2 ++ autoblocks/_impl/testing/v2/run_manager.py | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/autoblocks/_impl/testing/v2/api.py b/autoblocks/_impl/testing/v2/api.py index 27ab8350..2797760a 100644 --- a/autoblocks/_impl/testing/v2/api.py +++ b/autoblocks/_impl/testing/v2/api.py @@ -156,6 +156,7 @@ async def send_create_result( evaluator_id_to_result: dict[str, bool], evaluator_id_to_reason: dict[str, str], evaluator_id_to_score: dict[str, float], + evaluator_id_to_metadata: dict[str, dict], run_message: Optional[str] = None, ) -> Response: return await post_to_api( @@ -175,5 +176,6 @@ async def send_create_result( evaluatorIdToResult=evaluator_id_to_result, evaluatorIdToReason=evaluator_id_to_reason, evaluatorIdToScore=evaluator_id_to_score, + evaluatorIdToMetadata=evaluator_id_to_metadata, ), ) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index 04011a0b..1f69b4fa 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -99,10 +99,11 @@ async def _compute_evaluations( @staticmethod def _evaluations_to_maps( evals: List[EvaluationWithId], - ) -> tuple[dict[str, bool], dict[str, str], dict[str, float]]: + ) -> tuple[dict[str, bool], dict[str, str], dict[str, float], dict[str, dict]]: id_to_result: dict[str, bool] = {} id_to_reason: dict[str, str] = {} id_to_score: dict[str, float] = {} + id_to_metadata: dict[str, dict] = {} for e in evals: # Reconstruct Evaluation to compute passed() @@ -137,7 +138,8 @@ def _evaluations_to_maps( id_to_reason[e.id] = reason id_to_score[e.id] = e.score - return id_to_result, id_to_reason, id_to_score + id_to_metadata[e.id] = e.metadata or {} 
+ return id_to_result, id_to_reason, id_to_score, id_to_metadata async def async_add_result( self, @@ -162,7 +164,7 @@ async def async_add_result( output=output, evaluators=evaluators or [], ) - eval_result_map, eval_reason_map, eval_score_map = self._evaluations_to_maps(evals) + eval_result_map, eval_reason_map, eval_score_map, eval_metadata_map = self._evaluations_to_maps(evals) # Serialize input/output with standard json to match expected formatting in tests input_raw = json.dumps(serialize_test_case(test_case)) @@ -186,6 +188,7 @@ async def async_add_result( evaluator_id_to_result=eval_result_map, evaluator_id_to_reason=eval_reason_map, evaluator_id_to_score=eval_score_map, + evaluator_id_to_metadata=eval_metadata_map, run_message=self.run_message, ) data = resp.json() From b641144fadbd57f322763ff9aa2538074c9c9f99 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Mon, 6 Oct 2025 10:56:54 +0200 Subject: [PATCH 02/10] fix: Missing type parameters --- autoblocks/_impl/testing/v2/api.py | 2 +- autoblocks/_impl/testing/v2/run_manager.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autoblocks/_impl/testing/v2/api.py b/autoblocks/_impl/testing/v2/api.py index 2797760a..5c6e7ab0 100644 --- a/autoblocks/_impl/testing/v2/api.py +++ b/autoblocks/_impl/testing/v2/api.py @@ -156,7 +156,7 @@ async def send_create_result( evaluator_id_to_result: dict[str, bool], evaluator_id_to_reason: dict[str, str], evaluator_id_to_score: dict[str, float], - evaluator_id_to_metadata: dict[str, dict], + evaluator_id_to_metadata: dict[str, dict[str, Any]], run_message: Optional[str] = None, ) -> Response: return await post_to_api( diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index 1f69b4fa..eeb6bc27 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -99,11 +99,11 @@ async def _compute_evaluations( @staticmethod def _evaluations_to_maps( evals: 
List[EvaluationWithId], - ) -> tuple[dict[str, bool], dict[str, str], dict[str, float], dict[str, dict]]: + ) -> tuple[dict[str, bool], dict[str, str], dict[str, float], dict[str, dict[str, Any]]]: id_to_result: dict[str, bool] = {} id_to_reason: dict[str, str] = {} id_to_score: dict[str, float] = {} - id_to_metadata: dict[str, dict] = {} + id_to_metadata: dict[str, dict[str, Any]] = {} for e in evals: # Reconstruct Evaluation to compute passed() From e7222fdbc1b899b8838ab844dabdbd9c77003090 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Mon, 6 Oct 2025 11:03:07 +0200 Subject: [PATCH 03/10] update: tests to include v2 --- tests/autoblocks/test_run_manager_v2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/autoblocks/test_run_manager_v2.py b/tests/autoblocks/test_run_manager_v2.py index 1ba9e311..4e61717c 100644 --- a/tests/autoblocks/test_run_manager_v2.py +++ b/tests/autoblocks/test_run_manager_v2.py @@ -94,6 +94,7 @@ def test_full_lifecycle_v2(httpx_mock): evaluatorIdToResult={"evaluator-external-id": True}, evaluatorIdToReason={"evaluator-external-id": "ok"}, evaluatorIdToScore={"evaluator-external-id": 1}, + evaluatorIdToMetadata={"evaluator-external-id": {"reason": "ok"}}, ), json=dict(executionId="mock-exec-id"), ) @@ -180,6 +181,7 @@ def test_add_result_without_evaluators_sends_empty_maps(httpx_mock): evaluatorIdToResult={}, evaluatorIdToReason={}, evaluatorIdToScore={}, + evaluatorIdToMetadata={}, ), json=dict(executionId="exec-1"), ) From 8f60f1092d8bd6e3c3b6996acfb01fe9f4d0a898 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Tue, 7 Oct 2025 14:00:24 +0200 Subject: [PATCH 04/10] test --- autoblocks/_impl/testing/v2/run_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index eeb6bc27..f76ccf75 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -263,4 +263,4 @@ def 
create_human_review( rubric_id=rubric_id, ), global_state.event_loop(), - ).result() + ).result() \ No newline at end of file From db787059fd9b3f5f6683b19a6b42329fd3270e24 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Tue, 7 Oct 2025 14:03:52 +0200 Subject: [PATCH 05/10] pre commit fix newline --- autoblocks/_impl/testing/v2/run_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index f76ccf75..544de892 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -263,4 +263,5 @@ def create_human_review( rubric_id=rubric_id, ), global_state.event_loop(), - ).result() \ No newline at end of file + ).result() + \ No newline at end of file From 5c801573b55d025e9bc76c5140a02b00e8c3240b Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Thu, 9 Oct 2025 11:45:33 +0200 Subject: [PATCH 06/10] fix pre-commit checks --- autoblocks/_impl/testing/v2/run_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index 544de892..eeb6bc27 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -264,4 +264,3 @@ def create_human_review( ), global_state.event_loop(), ).result() - \ No newline at end of file From c42985f293ee9ae166d3595e2b42116bef0571c7 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Thu, 9 Oct 2025 11:51:27 +0200 Subject: [PATCH 07/10] test --- autoblocks/_impl/testing/v2/run_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index eeb6bc27..5f0e7b0f 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -264,3 +264,4 @@ def create_human_review( ), global_state.event_loop(), ).result() + From 
e3be8e2d15c2c1ebb8e847d3bf503e255b5c021f Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Thu, 9 Oct 2025 11:57:10 +0200 Subject: [PATCH 08/10] test --- autoblocks/_impl/testing/v2/run_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index 5f0e7b0f..eeb6bc27 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -264,4 +264,3 @@ def create_human_review( ), global_state.event_loop(), ).result() - From 4640ffaa8144b3ed7291ed3ef9185c6ff9312b63 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Thu, 9 Oct 2025 12:00:25 +0200 Subject: [PATCH 09/10] test --- autoblocks/_impl/testing/v2/run_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index eeb6bc27..5f0e7b0f 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -264,3 +264,4 @@ def create_human_review( ), global_state.event_loop(), ).result() + From 5a16e22a8964e6f6b4ae428a448b922560c01f18 Mon Sep 17 00:00:00 2001 From: uros-ivetic Date: Thu, 9 Oct 2025 12:10:51 +0200 Subject: [PATCH 10/10] test --- autoblocks/_impl/testing/v2/run_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autoblocks/_impl/testing/v2/run_manager.py b/autoblocks/_impl/testing/v2/run_manager.py index 5f0e7b0f..eeb6bc27 100644 --- a/autoblocks/_impl/testing/v2/run_manager.py +++ b/autoblocks/_impl/testing/v2/run_manager.py @@ -264,4 +264,3 @@ def create_human_review( ), global_state.event_loop(), ).result() -