diff --git a/pyproject.toml b/pyproject.toml index a65a5cbe7..6016ee691 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.9.8" +version = "2.9.9" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/src/uipath/eval/evaluators/exact_match_evaluator.py b/src/uipath/eval/evaluators/exact_match_evaluator.py index cd422eb23..0f1b3e8e8 100644 --- a/src/uipath/eval/evaluators/exact_match_evaluator.py +++ b/src/uipath/eval/evaluators/exact_match_evaluator.py @@ -56,23 +56,26 @@ async def evaluate( Returns: EvaluationResult: Boolean result indicating exact match (True/False) """ - actual_output = str(self._get_actual_output(agent_execution)) - expected_output = str(self._get_expected_output(evaluation_criteria)) + actual_output = self._get_actual_output(agent_execution) + expected_output = self._get_expected_output(evaluation_criteria) - try: - is_exact_match = float(actual_output) == float(expected_output) - except ValueError: + if isinstance(actual_output, str) or isinstance(expected_output, str): + actual_str = str(actual_output) + expected_str = str(expected_output) if not self.evaluator_config.case_sensitive: - actual_output = actual_output.lower() - expected_output = expected_output.lower() + actual_str = actual_str.lower() + expected_str = expected_str.lower() + is_exact_match = actual_str == expected_str + else: is_exact_match = actual_output == expected_output + if self.evaluator_config.negated: is_exact_match = not is_exact_match validated_justification = self.validate_justification( { - "expected": expected_output, - "actual": actual_output, + "expected": str(expected_output), + "actual": str(actual_output), } ) return NumericEvaluationResult( diff --git a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py index 3a07a239a..c67212548 100644 --- a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py @@ -1,5 +1,6 @@ """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs.""" +import copy import json import logging from abc import abstractmethod @@ -289,8 +290,6 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: ) # Log full request body for debugging - import copy - request_body_for_log = copy.deepcopy(request_data) # Convert tool_choice to dict for logging if "tool_choice" in request_body_for_log: diff --git a/src/uipath/eval/evaluators/output_evaluator.py b/src/uipath/eval/evaluators/output_evaluator.py index 5760e1008..1dde550ac 100644 --- a/src/uipath/eval/evaluators/output_evaluator.py +++ b/src/uipath/eval/evaluators/output_evaluator.py @@ -52,11 +52,25 @@ class BaseOutputEvaluator(BaseEvaluator[T, C, J]): J: The justification type """ + def _normalize_numbers(self, obj: Any) -> Any: + """Recursively normalize int/float to float for consistent numeric comparison. + + Converts all numeric values (int, float) to float in nested structures + (dicts, lists), while preserving booleans and other data types. + """ + if isinstance(obj, dict): + return {k: self._normalize_numbers(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [self._normalize_numbers(v) for v in obj] + if isinstance(obj, (int, float)) and not isinstance(obj, bool): + return float(obj) + return obj + def _get_actual_output(self, agent_execution: AgentExecution) -> Any: """Get the actual output from the agent execution.""" if self.evaluator_config.target_output_key != "*": try: - return resolve_output_path( + result = resolve_output_path( agent_execution.agent_output, self.evaluator_config.target_output_key, ) @@ -67,7 +81,9 @@ def _get_actual_output(self, agent_execution: AgentExecution) -> Any: detail=f"Error: {e}", category=UiPathEvaluationErrorCategory.USER, ) from e - return agent_execution.agent_output + else: + result = agent_execution.agent_output + return self._normalize_numbers(result) def _get_full_expected_output(self, evaluation_criteria: T) -> Any: """Get the full expected output from the evaluation criteria.""" @@ -104,7 +120,7 @@ def _get_expected_output(self, evaluation_criteria: T) -> Any: detail=f"Error: {e}", category=UiPathEvaluationErrorCategory.USER, ) from e - return expected_output + return self._normalize_numbers(expected_output) # NOTE: This evaluator is only used in coded evaluators. diff --git a/tests/evaluators/test_evaluator_methods.py b/tests/evaluators/test_evaluator_methods.py index 3aaa99368..90ea3a675 100644 --- a/tests/evaluators/test_evaluator_methods.py +++ b/tests/evaluators/test_evaluator_methods.py @@ -201,64 +201,81 @@ async def test_exact_match_negated( @pytest.mark.asyncio @pytest.mark.parametrize( - "actual, expected", + "actual_output, expected_output, expected_score", [ - ("1.0", "1"), - ("1", "1.0"), - ("1e0", "1"), - ("1.00", "1.0"), - ("0.5", "0.50"), - ("-3.0", "-3"), + # Scalar int/float normalization — the core fix + (1, 1.0, 1.0), + (1.0, 1, 1.0), + (0, 0.0, 1.0), + (-3, -3.0, 1.0), + (1.5, 1, 0.0), + (2, 3, 0.0), ], ) - async def test_exact_match_numeric_leniency( - self, actual: str, expected: str + async def test_exact_match_numeric_normalization( + self, actual_output: Any, expected_output: Any, expected_score: float ) -> None: - """Test that numerically equal values match regardless of string representation.""" + """Test that int and float scalar values are normalized before comparison.""" execution = AgentExecution( - agent_input={"input": "Test"}, - agent_output={"result": actual}, + agent_input={}, + agent_output={"value": actual_output}, agent_trace=[], ) - config = { - "name": "ExactMatchNumericTest", - "case_sensitive": True, - "target_output_key": "result", - } + config = {"name": "ExactMatchNumericTest", "target_output_key": "value"} evaluator = ExactMatchEvaluator.model_validate( {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) - criteria = OutputEvaluationCriteria(expected_output={"result": expected}) # pyright: ignore[reportCallIssue] + criteria = OutputEvaluationCriteria(expected_output={"value": expected_output}) # pyright: ignore[reportCallIssue] result = await evaluator.evaluate(execution, criteria) assert isinstance(result, NumericEvaluationResult) - assert result.score == 1.0, ( - f"Expected '{actual}' and '{expected}' to be considered equal as numbers" - ) + assert result.score == expected_score @pytest.mark.asyncio - async def test_exact_match_numeric_non_equal(self) -> None: - """Test that numerically different values do not match.""" + @pytest.mark.parametrize( + "actual_output, expected_output, target_key, expected_score", + [ + # Flat dict: int vs float value + ({"v": 1}, {"v": 1.0}, "*", 1.0), + ({"v": 1.0}, {"v": 1}, "*", 1.0), + ({"v": 1.5}, {"v": 1}, "*", 0.0), + # Nested dict + ({"a": {"b": 1}}, {"a": {"b": 1.0}}, "*", 1.0), + ({"a": {"b": 1.5}}, {"a": {"b": 1}}, "*", 0.0), + # List of numbers + ({"vals": [1, 2, 3]}, {"vals": [1.0, 2.0, 3.0]}, "*", 1.0), + ({"vals": [1, 2, 4]}, {"vals": [1.0, 2.0, 3.0]}, "*", 0.0), + # target_output_key resolves to a dict containing int/float + ({"result": {"count": 1}}, {"result": {"count": 1.0}}, "result", 1.0), + # target_output_key resolves to a scalar int/float + ({"result": 1}, {"result": 1.0}, "result", 1.0), + ({"result": 1.5}, {"result": 1}, "result", 0.0), + ], + ) + async def test_exact_match_recursive_normalization( + self, + actual_output: Any, + expected_output: Any, + target_key: str, + expected_score: float, + ) -> None: + """Test that int/float normalization works recursively for dicts, lists, and nested structures.""" execution = AgentExecution( - agent_input={"input": "Test"}, - agent_output={"result": "1.5"}, + agent_input={}, + agent_output=actual_output, agent_trace=[], ) - config = { - "name": "ExactMatchNumericTest", - "case_sensitive": True, - "target_output_key": "result", - } + config = {"name": "ExactMatchRecursiveTest", "target_output_key": target_key} evaluator = ExactMatchEvaluator.model_validate( {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) - criteria = OutputEvaluationCriteria(expected_output={"result": "1"}) # pyright: ignore[reportCallIssue] + criteria = OutputEvaluationCriteria(expected_output=expected_output) # pyright: ignore[reportCallIssue] result = await evaluator.evaluate(execution, criteria) assert isinstance(result, NumericEvaluationResult) - assert result.score == 0.0 + assert result.score == expected_score @pytest.mark.asyncio async def test_exact_match_validate_and_evaluate_criteria( @@ -286,43 +303,64 @@ class TestContainsEvaluator: """Test ContainsEvaluator.evaluate() method.""" @pytest.mark.asyncio + @pytest.mark.parametrize( + "agent_output, search_text, target_key, case_sensitive, negated, expected_score", + [ + # Basic match + ("Test output", "Test output", "*", False, False, 1.0), + # Substring match + ("Hello World", "World", "*", False, False, 1.0), + # No match + ("Hello World", "Goodbye", "*", False, False, 0.0), + # Case-insensitive match (default) + ("Hello World", "hello world", "*", False, False, 1.0), + # Case-sensitive hit + ("Hello World", "Hello", "*", True, False, 1.0), + # Case-sensitive miss + ("Hello World", "hello", "*", True, False, 0.0), + # Negated hit becomes miss + ("Test output", "Test output", "*", False, True, 0.0), + # Negated miss becomes hit + ("Hello World", "Goodbye", "*", False, True, 1.0), + # target_output_key extraction + ("Test output", "Test output", "output", False, False, 1.0), + ], + ) async def test_contains_evaluator( - self, sample_agent_execution: AgentExecution - ) -> None: - """Test contains evaluator.""" - config = { - "name": "ContainsTest", - "target_output_key": "output", - "default_evaluation_criteria": {"search_text": "Test output"}, - } - evaluator = ContainsEvaluator.model_validate( - {"evaluatorConfig": config, "id": str(uuid.uuid4())} - ) - criteria = ContainsEvaluationCriteria(search_text="Test output") - result = await evaluator.evaluate(sample_agent_execution, criteria) - - assert isinstance(result, NumericEvaluationResult) - assert result.score == 1.0 - - @pytest.mark.asyncio - async def test_contains_evaluator_negated( - self, sample_agent_execution: AgentExecution + self, + agent_output: Any, + search_text: str, + target_key: str, + case_sensitive: bool, + negated: bool, + expected_score: float, + sample_agent_execution: AgentExecution, ) -> None: - """Test contains evaluator with negated criteria.""" + """Test ContainsEvaluator across match, no-match, case sensitivity, and negation cases.""" + if target_key == "output": + execution = ( + sample_agent_execution # has agent_output={"output": "Test output"} + ) + else: + execution = AgentExecution( + agent_input={}, + agent_output=agent_output, + agent_trace=[], + ) config = { "name": "ContainsTest", - "negated": True, - "target_output_key": "output", - "default_evaluation_criteria": {"search_text": "Test output"}, + "target_output_key": target_key, + "case_sensitive": case_sensitive, + "negated": negated, } evaluator = ContainsEvaluator.model_validate( {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) - criteria = ContainsEvaluationCriteria(search_text="Test output") - result = await evaluator.evaluate(sample_agent_execution, criteria) + criteria = ContainsEvaluationCriteria(search_text=search_text) + result = await evaluator.evaluate(execution, criteria) assert isinstance(result, NumericEvaluationResult) - assert result.score == 0.0 + assert result.score == expected_score @pytest.mark.asyncio async def test_contains_evaluator_validate_and_evaluate_criteria( @@ -332,7 +370,6 @@ async def test_contains_evaluator_validate_and_evaluate_criteria( config = { "name": "ContainsTest", "target_output_key": "*", - "default_evaluation_criteria": {"search_text": "Test output"}, } evaluator = ContainsEvaluator.model_validate( {"evaluatorConfig": config, "id": str(uuid.uuid4())} @@ -395,6 +432,44 @@ async def test_json_similarity_partial_match(self) -> None: assert isinstance(result, NumericEvaluationResult) assert math.isclose(result.score, 0.666, abs_tol=1e-3) + @pytest.mark.asyncio + @pytest.mark.parametrize( + "actual_output, expected_output, expected_score", + [ + # int/float normalization — identical after normalization + ({"count": 1}, {"count": 1.0}, 1.0), + ({"count": 1.0}, {"count": 1}, 1.0), + # Nested int/float + ({"a": {"b": 1}}, {"a": {"b": 1.0}}, 1.0), + # List of ints vs floats + ({"vals": [1, 2, 3]}, {"vals": [1.0, 2.0, 3.0]}, 1.0), + # Different numeric values — partial score: 1.0 - |expected-actual|/|expected| + ({"count": 1.5}, {"count": 1}, 0.5), + ], + ) + async def test_json_similarity_numeric_normalization( + self, + actual_output: Any, + expected_output: Any, + expected_score: float, + ) -> None: + """Test that int/float normalization is applied before JSON similarity comparison.""" + execution = AgentExecution( + agent_input={}, + agent_output=actual_output, + agent_trace=[], + ) + config = {"name": "JsonSimilarityTest"} + evaluator = JsonSimilarityEvaluator.model_validate( + {"evaluatorConfig": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria(expected_output=expected_output) # pyright: ignore[reportCallIssue] + + result = await evaluator.evaluate(execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == expected_score + @pytest.mark.asyncio async def test_json_similarity_validate_and_evaluate_criteria(self) -> None: """Test JSON similarity using validate_and_evaluate_criteria.""" diff --git a/uv.lock b/uv.lock index c95bb9528..520ffa80c 100644 --- a/uv.lock +++ b/uv.lock @@ -2531,7 +2531,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.9.8" +version = "2.9.9" source = { editable = "." } dependencies = [ { name = "applicationinsights" },