From 0f5250b20c141dc0f093a4da25f8ee160e484a32 Mon Sep 17 00:00:00 2001 From: cemde Date: Wed, 18 Mar 2026 18:58:35 +0100 Subject: [PATCH 1/2] added usage logging to result logger --- maseval/core/callbacks/result_logger.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py index 7524de85..354831e9 100644 --- a/maseval/core/callbacks/result_logger.py +++ b/maseval/core/callbacks/result_logger.py @@ -34,6 +34,7 @@ class ResultLogger(BenchmarkCallback, ABC): include_traces: Whether to include execution traces in logged results include_config: Whether to include configuration in logged results include_eval: Whether to include evaluation results in logged results + include_usage: Whether to include API usage data in logged results validate_on_completion: Whether to validate all iterations were logged Example: @@ -62,7 +63,8 @@ def __init__( include_traces: bool = True, include_config: bool = True, include_eval: bool = True, - include_task: bool = False, + include_task: bool = True, + include_usage: bool = True, validate_on_completion: bool = True, ): """Initialize the result logger. @@ -73,6 +75,7 @@ def __init__( include_eval: If True, include evaluation results in logged results include_task: If True, include task data (query, metadata, protocol) in logged results + include_usage: If True, include API usage data in logged results validate_on_completion: If True, validate all iterations were logged at end """ super().__init__() @@ -80,6 +83,7 @@ def __init__( self.include_config = include_config self.include_eval = include_eval self.include_task = include_task + self.include_usage = include_usage self.validate_on_completion = validate_on_completion # Tracking for validation @@ -177,6 +181,9 @@ def _filter_report(self, report: Dict) -> Dict: if self.include_eval and "eval" in report: filtered["eval"] = report["eval"] + if self.include_usage and "usage" in report: + filtered["usage"] = report["usage"] + if self.include_task and "task" in report: filtered["task"] = report["task"] @@ -314,6 +321,7 @@ def __init__( include_config: bool = True, include_eval: bool = True, include_task: bool = False, + include_usage: bool = True, validate_on_completion: bool = True, ): """Initialize the file logger. @@ -332,6 +340,7 @@ def __init__( include_eval: If True, include evaluation results in logged results include_task: If True, include task data (query, metadata, protocol) in logged results + include_usage: If True, include API usage data in logged results validate_on_completion: If True, validate all iterations were logged """ super().__init__( @@ -339,6 +348,7 @@ def __init__( include_config=include_config, include_eval=include_eval, include_task=include_task, + include_usage=include_usage, validate_on_completion=validate_on_completion, ) @@ -530,6 +540,7 @@ def _write_metadata(self) -> None: "include_config": self.include_config, "include_eval": self.include_eval, "include_task": self.include_task, + "include_usage": self.include_usage, "validation_enabled": self.validate_on_completion, } From 900ba6d1ecb61cbd7e585d928bcc0dd190d433a3 Mon Sep 17 00:00:00 2001 From: cemde Date: Wed, 18 Mar 2026 19:03:17 +0100 Subject: [PATCH 2/2] fixed test --- maseval/core/callbacks/result_logger.py | 2 +- .../test_callbacks/test_result_logger.py | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py index 354831e9..1b681e47 100644 --- a/maseval/core/callbacks/result_logger.py +++ b/maseval/core/callbacks/result_logger.py @@ -320,7 +320,7 @@ def __init__( include_traces: bool = True, include_config: bool = True, include_eval: bool = True, - include_task: bool = False, + include_task: bool = True, include_usage: bool = True, validate_on_completion: bool = True, ): diff --git a/tests/test_core/test_callbacks/test_result_logger.py b/tests/test_core/test_callbacks/test_result_logger.py index a6aba5de..b2af93c7 100644 --- a/tests/test_core/test_callbacks/test_result_logger.py +++ b/tests/test_core/test_callbacks/test_result_logger.py @@ -197,8 +197,8 @@ def test_filter_report_includes_task_when_enabled(self): assert filtered["task"]["query"] == "What is 2+2?" assert filtered["task"]["metadata"] == {"difficulty": "easy"} - def test_filter_report_excludes_task_by_default(self): - """Test that task data is excluded from filtered report by default.""" + def test_filter_report_includes_task_by_default(self): + """Test that task data is included in filtered report by default.""" logger = MockResultLogger() report = { @@ -209,6 +209,21 @@ def test_filter_report_excludes_task_by_default(self): filtered = logger._filter_report(report) + assert "task" in filtered + assert filtered["task"]["query"] == "What is 2+2?" + + def test_filter_report_excludes_task_when_disabled(self): + """Test that task data is excluded from filtered report when include_task is False.""" + logger = MockResultLogger(include_task=False) + + report = { + "task_id": "task_0", + "repeat_idx": 0, + "task": {"query": "What is 2+2?", "metadata": {}, "protocol": {}}, + } + + filtered = logger._filter_report(report) + assert "task" not in filtered def test_filter_report_partial_included(self):