UiPath · andrei-rusu · Feb 25, 2026 · Feb 25, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.9.8"
+version = "2.9.9"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"

diff --git a/src/uipath/eval/evaluators/exact_match_evaluator.py b/src/uipath/eval/evaluators/exact_match_evaluator.py
@@ -56,23 +56,26 @@ async def evaluate(
         Returns:
             EvaluationResult: Boolean result indicating exact match (True/False)
         """
-        actual_output = str(self._get_actual_output(agent_execution))
-        expected_output = str(self._get_expected_output(evaluation_criteria))
+        actual_output = self._get_actual_output(agent_execution)
+        expected_output = self._get_expected_output(evaluation_criteria)
 
-        try:
-            is_exact_match = float(actual_output) == float(expected_output)
-        except ValueError:
+        if isinstance(actual_output, str) or isinstance(expected_output, str):
+            actual_str = str(actual_output)
+            expected_str = str(expected_output)
             if not self.evaluator_config.case_sensitive:
-                actual_output = actual_output.lower()
-                expected_output = expected_output.lower()
+                actual_str = actual_str.lower()
+                expected_str = expected_str.lower()
+            is_exact_match = actual_str == expected_str
+        else:
             is_exact_match = actual_output == expected_output
+
         if self.evaluator_config.negated:
             is_exact_match = not is_exact_match
 
         validated_justification = self.validate_justification(
             {
-                "expected": expected_output,
-                "actual": actual_output,
+                "expected": str(expected_output),
+                "actual": str(actual_output),
             }
         )
         return NumericEvaluationResult(

diff --git a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py
@@ -1,5 +1,6 @@
 """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
 
+import copy
 import json
 import logging
 from abc import abstractmethod
@@ -289,8 +290,6 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
         )
 
         # Log full request body for debugging
-        import copy
-
         request_body_for_log = copy.deepcopy(request_data)
         # Convert tool_choice to dict for logging
         if "tool_choice" in request_body_for_log:

diff --git a/src/uipath/eval/evaluators/output_evaluator.py b/src/uipath/eval/evaluators/output_evaluator.py
@@ -52,11 +52,25 @@ class BaseOutputEvaluator(BaseEvaluator[T, C, J]):
         J: The justification type
     """
 
+    def _normalize_numbers(self, obj: Any) -> Any:
+        """Recursively normalize int/float to float for consistent numeric comparison.
+
+        Walks dicts, lists and tuples (tuples come back as lists). Ints beyond
+        +/-2**53 stay ints, since float() would round or overflow; bools are kept.
+        """
+        if isinstance(obj, dict):
+            return {k: self._normalize_numbers(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._normalize_numbers(v) for v in obj]
+        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
+            return float(obj) if isinstance(obj, float) or abs(obj) <= 2**53 else obj
+        return obj
+
     def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
         """Get the actual output from the agent execution."""
         if self.evaluator_config.target_output_key != "*":
             try:
-                return resolve_output_path(
+                result = resolve_output_path(
                     agent_execution.agent_output,
                     self.evaluator_config.target_output_key,
                 )
@@ -67,7 +81,9 @@ def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
                     detail=f"Error: {e}",
                     category=UiPathEvaluationErrorCategory.USER,
                 ) from e
-        return agent_execution.agent_output
+        else:
+            result = agent_execution.agent_output
+        return self._normalize_numbers(result)
 
     def _get_full_expected_output(self, evaluation_criteria: T) -> Any:
         """Get the full expected output from the evaluation criteria."""
@@ -104,7 +120,7 @@ def _get_expected_output(self, evaluation_criteria: T) -> Any:
                     detail=f"Error: {e}",
                     category=UiPathEvaluationErrorCategory.USER,
                 ) from e
-        return expected_output
+        return self._normalize_numbers(expected_output)
 
 
 # NOTE: This evaluator is only used in coded evaluators.

diff --git a/tests/evaluators/test_evaluator_methods.py b/tests/evaluators/test_evaluator_methods.py
@@ -201,64 +201,81 @@ async def test_exact_match_negated(
 
     @pytest.mark.asyncio
     @pytest.mark.parametrize(
-        "actual, expected",
+        "actual_output, expected_output, expected_score",
         [
-            ("1.0", "1"),
-            ("1", "1.0"),
-            ("1e0", "1"),
-            ("1.00", "1.0"),
-            ("0.5", "0.50"),
-            ("-3.0", "-3"),
+            # Scalar int/float normalization — the core fix
+            (1, 1.0, 1.0),
+            (1.0, 1, 1.0),
+            (0, 0.0, 1.0),
+            (-3, -3.0, 1.0),
+            (1.5, 1, 0.0),
+            (2, 3, 0.0),
         ],
     )
-    async def test_exact_match_numeric_leniency(
-        self, actual: str, expected: str
+    async def test_exact_match_numeric_normalization(
+        self, actual_output: Any, expected_output: Any, expected_score: float
     ) -> None:
-        """Test that numerically equal values match regardless of string representation."""
+        """Test that int and float scalar values are normalized before comparison."""
         execution = AgentExecution(
-            agent_input={"input": "Test"},
-            agent_output={"result": actual},
+            agent_input={},
+            agent_output={"value": actual_output},
             agent_trace=[],
         )
-        config = {
-            "name": "ExactMatchNumericTest",
-            "case_sensitive": True,
-            "target_output_key": "result",
-        }
+        config = {"name": "ExactMatchNumericTest", "target_output_key": "value"}
         evaluator = ExactMatchEvaluator.model_validate(
             {"evaluatorConfig": config, "id": str(uuid.uuid4())}
         )
-        criteria = OutputEvaluationCriteria(expected_output={"result": expected})  # pyright: ignore[reportCallIssue]
+        criteria = OutputEvaluationCriteria(expected_output={"value": expected_output})  # pyright: ignore[reportCallIssue]
 
         result = await evaluator.evaluate(execution, criteria)
 
         assert isinstance(result, NumericEvaluationResult)
-        assert result.score == 1.0, (
-            f"Expected '{actual}' and '{expected}' to be considered equal as numbers"
-        )
+        assert result.score == expected_score
 
     @pytest.mark.asyncio
-    async def test_exact_match_numeric_non_equal(self) -> None:
-        """Test that numerically different values do not match."""
+    @pytest.mark.parametrize(
+        "actual_output, expected_output, target_key, expected_score",
+        [
+            # Flat dict: int vs float value
+            ({"v": 1}, {"v": 1.0}, "*", 1.0),
+            ({"v": 1.0}, {"v": 1}, "*", 1.0),
+            ({"v": 1.5}, {"v": 1}, "*", 0.0),
+            # Nested dict
+            ({"a": {"b": 1}}, {"a": {"b": 1.0}}, "*", 1.0),
+            ({"a": {"b": 1.5}}, {"a": {"b": 1}}, "*", 0.0),
+            # List of numbers
+            ({"vals": [1, 2, 3]}, {"vals": [1.0, 2.0, 3.0]}, "*", 1.0),
+            ({"vals": [1, 2, 4]}, {"vals": [1.0, 2.0, 3.0]}, "*", 0.0),
+            # target_output_key resolves to a dict containing int/float
+            ({"result": {"count": 1}}, {"result": {"count": 1.0}}, "result", 1.0),
+            # target_output_key resolves to a scalar int/float
+            ({"result": 1}, {"result": 1.0}, "result", 1.0),
+            ({"result": 1.5}, {"result": 1}, "result", 0.0),
+        ],
+    )
+    async def test_exact_match_recursive_normalization(
+        self,
+        actual_output: Any,
+        expected_output: Any,
+        target_key: str,
+        expected_score: float,
+    ) -> None:
+        """Test that int/float normalization works recursively for dicts, lists, and nested structures."""
         execution = AgentExecution(
-            agent_input={"input": "Test"},
-            agent_output={"result": "1.5"},
+            agent_input={},
+            agent_output=actual_output,
             agent_trace=[],
         )
-        config = {
-            "name": "ExactMatchNumericTest",
-            "case_sensitive": True,
-            "target_output_key": "result",
-        }
+        config = {"name": "ExactMatchRecursiveTest", "target_output_key": target_key}
         evaluator = ExactMatchEvaluator.model_validate(
             {"evaluatorConfig": config, "id": str(uuid.uuid4())}
         )
-        criteria = OutputEvaluationCriteria(expected_output={"result": "1"})  # pyright: ignore[reportCallIssue]
+        criteria = OutputEvaluationCriteria(expected_output=expected_output)  # pyright: ignore[reportCallIssue]
 
         result = await evaluator.evaluate(execution, criteria)
 
         assert isinstance(result, NumericEvaluationResult)
-        assert result.score == 0.0
+        assert result.score == expected_score
 
     @pytest.mark.asyncio
     async def test_exact_match_validate_and_evaluate_criteria(
@@ -286,43 +303,64 @@ class TestContainsEvaluator:
     """Test ContainsEvaluator.evaluate() method."""
 
     @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "agent_output, search_text, target_key, case_sensitive, negated, expected_score",
+        [
+            # Basic match
+            ("Test output", "Test output", "*", False, False, 1.0),
+            # Substring match
+            ("Hello World", "World", "*", False, False, 1.0),
+            # No match
+            ("Hello World", "Goodbye", "*", False, False, 0.0),
+            # Case-insensitive match (default)
+            ("Hello World", "hello world", "*", False, False, 1.0),
+            # Case-sensitive hit
+            ("Hello World", "Hello", "*", True, False, 1.0),
+            # Case-sensitive miss
+            ("Hello World", "hello", "*", True, False, 0.0),
+            # Negated hit becomes miss
+            ("Test output", "Test output", "*", False, True, 0.0),
+            # Negated miss becomes hit
+            ("Hello World", "Goodbye", "*", False, True, 1.0),
+            # target_output_key extraction
+            ("Test output", "Test output", "output", False, False, 1.0),
+        ],
+    )
     async def test_contains_evaluator(
-        self, sample_agent_execution: AgentExecution
-    ) -> None:
-        """Test contains evaluator."""
-        config = {
-            "name": "ContainsTest",
-            "target_output_key": "output",
-            "default_evaluation_criteria": {"search_text": "Test output"},
-        }
-        evaluator = ContainsEvaluator.model_validate(
-            {"evaluatorConfig": config, "id": str(uuid.uuid4())}
-        )
-        criteria = ContainsEvaluationCriteria(search_text="Test output")
-        result = await evaluator.evaluate(sample_agent_execution, criteria)
-
-        assert isinstance(result, NumericEvaluationResult)
-        assert result.score == 1.0
-
-    @pytest.mark.asyncio
-    async def test_contains_evaluator_negated(
-        self, sample_agent_execution: AgentExecution
+        self,
+        agent_output: Any,
+        search_text: str,
+        target_key: str,
+        case_sensitive: bool,
+        negated: bool,
+        expected_score: float,
+        sample_agent_execution: AgentExecution,
     ) -> None:
-        """Test contains evaluator with negated criteria."""
+        """Test ContainsEvaluator across match, no-match, case sensitivity, and negation cases."""
+        if target_key == "output":
+            execution = (
+                sample_agent_execution  # has agent_output={"output": "Test output"}
+            )
+        else:
+            execution = AgentExecution(
+                agent_input={},
+                agent_output=agent_output,
+                agent_trace=[],
+            )
         config = {
             "name": "ContainsTest",
-            "negated": True,
-            "target_output_key": "output",
-            "default_evaluation_criteria": {"search_text": "Test output"},
+            "target_output_key": target_key,
+            "case_sensitive": case_sensitive,
+            "negated": negated,
         }
         evaluator = ContainsEvaluator.model_validate(
             {"evaluatorConfig": config, "id": str(uuid.uuid4())}
         )
-        criteria = ContainsEvaluationCriteria(search_text="Test output")
-        result = await evaluator.evaluate(sample_agent_execution, criteria)
+        criteria = ContainsEvaluationCriteria(search_text=search_text)
+        result = await evaluator.evaluate(execution, criteria)
 
         assert isinstance(result, NumericEvaluationResult)
-        assert result.score == 0.0
+        assert result.score == expected_score
 
     @pytest.mark.asyncio
     async def test_contains_evaluator_validate_and_evaluate_criteria(
@@ -332,7 +370,6 @@ async def test_contains_evaluator_validate_and_evaluate_criteria(
         config = {
             "name": "ContainsTest",
             "target_output_key": "*",
-            "default_evaluation_criteria": {"search_text": "Test output"},
         }
         evaluator = ContainsEvaluator.model_validate(
             {"evaluatorConfig": config, "id": str(uuid.uuid4())}
@@ -395,6 +432,44 @@ async def test_json_similarity_partial_match(self) -> None:
         assert isinstance(result, NumericEvaluationResult)
         assert math.isclose(result.score, 0.666, abs_tol=1e-3)
 
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "actual_output, expected_output, expected_score",
+        [
+            # int/float normalization — identical after normalization
+            ({"count": 1}, {"count": 1.0}, 1.0),
+            ({"count": 1.0}, {"count": 1}, 1.0),
+            # Nested int/float
+            ({"a": {"b": 1}}, {"a": {"b": 1.0}}, 1.0),
+            # List of ints vs floats
+            ({"vals": [1, 2, 3]}, {"vals": [1.0, 2.0, 3.0]}, 1.0),
+            # Different numeric values — partial score: 1.0 - |expected-actual|/|expected|
+            ({"count": 1.5}, {"count": 1}, 0.5),
+        ],
+    )
+    async def test_json_similarity_numeric_normalization(
+        self,
+        actual_output: Any,
+        expected_output: Any,
+        expected_score: float,
+    ) -> None:
+        """Test that int/float normalization is applied before JSON similarity comparison."""
+        execution = AgentExecution(
+            agent_input={},
+            agent_output=actual_output,
+            agent_trace=[],
+        )
+        config = {"name": "JsonSimilarityTest"}
+        evaluator = JsonSimilarityEvaluator.model_validate(
+            {"evaluatorConfig": config, "id": str(uuid.uuid4())}
+        )
+        criteria = OutputEvaluationCriteria(expected_output=expected_output)  # pyright: ignore[reportCallIssue]
+
+        result = await evaluator.evaluate(execution, criteria)
+        # Similarity scores are computed floats: compare with a tolerance, not ==.
+        assert isinstance(result, NumericEvaluationResult)
+        assert math.isclose(result.score, expected_score, abs_tol=1e-9)
+
     @pytest.mark.asyncio
     async def test_json_similarity_validate_and_evaluate_criteria(self) -> None:
         """Test JSON similarity using validate_and_evaluate_criteria."""

diff --git a/uv.lock b/uv.lock