Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "uipath"
version = "2.9.8"
version = "2.9.9"
description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
Expand Down
21 changes: 12 additions & 9 deletions src/uipath/eval/evaluators/exact_match_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,23 +56,26 @@ async def evaluate(
Returns:
EvaluationResult: Boolean result indicating exact match (True/False)
"""
actual_output = str(self._get_actual_output(agent_execution))
expected_output = str(self._get_expected_output(evaluation_criteria))
actual_output = self._get_actual_output(agent_execution)
expected_output = self._get_expected_output(evaluation_criteria)

try:
is_exact_match = float(actual_output) == float(expected_output)
except ValueError:
if isinstance(actual_output, str) or isinstance(expected_output, str):
actual_str = str(actual_output)
expected_str = str(expected_output)
if not self.evaluator_config.case_sensitive:
actual_output = actual_output.lower()
expected_output = expected_output.lower()
actual_str = actual_str.lower()
expected_str = expected_str.lower()
is_exact_match = actual_str == expected_str
else:
is_exact_match = actual_output == expected_output

if self.evaluator_config.negated:
is_exact_match = not is_exact_match

validated_justification = self.validate_justification(
{
"expected": expected_output,
"actual": actual_output,
"expected": str(expected_output),
"actual": str(actual_output),
}
)
return NumericEvaluationResult(
Expand Down
3 changes: 1 addition & 2 deletions src/uipath/eval/evaluators/llm_as_judge_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""

import copy
import json
import logging
from abc import abstractmethod
Expand Down Expand Up @@ -289,8 +290,6 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
)

# Log full request body for debugging
import copy

request_body_for_log = copy.deepcopy(request_data)
# Convert tool_choice to dict for logging
if "tool_choice" in request_body_for_log:
Expand Down
22 changes: 19 additions & 3 deletions src/uipath/eval/evaluators/output_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,25 @@ class BaseOutputEvaluator(BaseEvaluator[T, C, J]):
J: The justification type
"""

def _normalize_numbers(self, obj: Any) -> Any:
    """Recursively normalize int/float leaves to float for consistent numeric comparison.

    Walks nested dicts, lists and tuples and converts numeric leaves to
    ``float`` so that ``1`` and ``1.0`` compare (and stringify) identically.
    Booleans and all other data types are returned unchanged.

    Args:
        obj: Arbitrary value, possibly a nested dict/list/tuple structure.

    Returns:
        A new structure of the same shape. Dict keys are left untouched,
        tuples come back as lists, and numeric leaves are floats, except
        ints that a float cannot represent exactly (magnitude above 2**53
        with low bits set, or beyond the float range), which stay ints so
        that distinct large integers never collapse to the same value.
    """
    if isinstance(obj, dict):
        return {key: self._normalize_numbers(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [self._normalize_numbers(item) for item in obj]
    # bool is a subclass of int; keep True/False as booleans rather than 1.0/0.0.
    if isinstance(obj, bool):
        return obj
    if isinstance(obj, int):
        try:
            as_float = float(obj)
        except OverflowError:
            # Too large for a float at all: keep the exact integer.
            return obj
        # int == float is compared exactly in Python, so this detects rounding.
        return as_float if as_float == obj else obj
    if isinstance(obj, float):
        return float(obj)
    return obj

def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
"""Get the actual output from the agent execution."""
if self.evaluator_config.target_output_key != "*":
try:
return resolve_output_path(
result = resolve_output_path(
agent_execution.agent_output,
self.evaluator_config.target_output_key,
)
Expand All @@ -67,7 +81,9 @@ def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
detail=f"Error: {e}",
category=UiPathEvaluationErrorCategory.USER,
) from e
return agent_execution.agent_output
else:
result = agent_execution.agent_output
return self._normalize_numbers(result)

def _get_full_expected_output(self, evaluation_criteria: T) -> Any:
"""Get the full expected output from the evaluation criteria."""
Expand Down Expand Up @@ -104,7 +120,7 @@ def _get_expected_output(self, evaluation_criteria: T) -> Any:
detail=f"Error: {e}",
category=UiPathEvaluationErrorCategory.USER,
) from e
return expected_output
return self._normalize_numbers(expected_output)


# NOTE: This evaluator is only used in coded evaluators.
Expand Down
195 changes: 135 additions & 60 deletions tests/evaluators/test_evaluator_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,64 +201,81 @@ async def test_exact_match_negated(

@pytest.mark.asyncio
@pytest.mark.parametrize(
"actual, expected",
"actual_output, expected_output, expected_score",
[
("1.0", "1"),
("1", "1.0"),
("1e0", "1"),
("1.00", "1.0"),
("0.5", "0.50"),
("-3.0", "-3"),
# Scalar int/float normalization — the core fix
(1, 1.0, 1.0),
(1.0, 1, 1.0),
(0, 0.0, 1.0),
(-3, -3.0, 1.0),
(1.5, 1, 0.0),
(2, 3, 0.0),
],
)
async def test_exact_match_numeric_leniency(
self, actual: str, expected: str
async def test_exact_match_numeric_normalization(
self, actual_output: Any, expected_output: Any, expected_score: float
) -> None:
"""Test that numerically equal values match regardless of string representation."""
"""Test that int and float scalar values are normalized before comparison."""
execution = AgentExecution(
agent_input={"input": "Test"},
agent_output={"result": actual},
agent_input={},
agent_output={"value": actual_output},
agent_trace=[],
)
config = {
"name": "ExactMatchNumericTest",
"case_sensitive": True,
"target_output_key": "result",
}
config = {"name": "ExactMatchNumericTest", "target_output_key": "value"}
evaluator = ExactMatchEvaluator.model_validate(
{"evaluatorConfig": config, "id": str(uuid.uuid4())}
)
criteria = OutputEvaluationCriteria(expected_output={"result": expected}) # pyright: ignore[reportCallIssue]
criteria = OutputEvaluationCriteria(expected_output={"value": expected_output}) # pyright: ignore[reportCallIssue]

result = await evaluator.evaluate(execution, criteria)

assert isinstance(result, NumericEvaluationResult)
assert result.score == 1.0, (
f"Expected '{actual}' and '{expected}' to be considered equal as numbers"
)
assert result.score == expected_score

@pytest.mark.asyncio
async def test_exact_match_numeric_non_equal(self) -> None:
"""Test that numerically different values do not match."""
@pytest.mark.parametrize(
"actual_output, expected_output, target_key, expected_score",
[
# Flat dict: int vs float value
({"v": 1}, {"v": 1.0}, "*", 1.0),
({"v": 1.0}, {"v": 1}, "*", 1.0),
({"v": 1.5}, {"v": 1}, "*", 0.0),
# Nested dict
({"a": {"b": 1}}, {"a": {"b": 1.0}}, "*", 1.0),
({"a": {"b": 1.5}}, {"a": {"b": 1}}, "*", 0.0),
# List of numbers
({"vals": [1, 2, 3]}, {"vals": [1.0, 2.0, 3.0]}, "*", 1.0),
({"vals": [1, 2, 4]}, {"vals": [1.0, 2.0, 3.0]}, "*", 0.0),
# target_output_key resolves to a dict containing int/float
({"result": {"count": 1}}, {"result": {"count": 1.0}}, "result", 1.0),
# target_output_key resolves to a scalar int/float
({"result": 1}, {"result": 1.0}, "result", 1.0),
({"result": 1.5}, {"result": 1}, "result", 0.0),
],
)
async def test_exact_match_recursive_normalization(
self,
actual_output: Any,
expected_output: Any,
target_key: str,
expected_score: float,
) -> None:
"""Test that int/float normalization works recursively for dicts, lists, and nested structures."""
execution = AgentExecution(
agent_input={"input": "Test"},
agent_output={"result": "1.5"},
agent_input={},
agent_output=actual_output,
agent_trace=[],
)
config = {
"name": "ExactMatchNumericTest",
"case_sensitive": True,
"target_output_key": "result",
}
config = {"name": "ExactMatchRecursiveTest", "target_output_key": target_key}
evaluator = ExactMatchEvaluator.model_validate(
{"evaluatorConfig": config, "id": str(uuid.uuid4())}
)
criteria = OutputEvaluationCriteria(expected_output={"result": "1"}) # pyright: ignore[reportCallIssue]
criteria = OutputEvaluationCriteria(expected_output=expected_output) # pyright: ignore[reportCallIssue]

result = await evaluator.evaluate(execution, criteria)

assert isinstance(result, NumericEvaluationResult)
assert result.score == 0.0
assert result.score == expected_score

@pytest.mark.asyncio
async def test_exact_match_validate_and_evaluate_criteria(
Expand Down Expand Up @@ -286,43 +303,64 @@ class TestContainsEvaluator:
"""Test ContainsEvaluator.evaluate() method."""

@pytest.mark.asyncio
@pytest.mark.parametrize(
"agent_output, search_text, target_key, case_sensitive, negated, expected_score",
[
# Basic match
("Test output", "Test output", "*", False, False, 1.0),
# Substring match
("Hello World", "World", "*", False, False, 1.0),
# No match
("Hello World", "Goodbye", "*", False, False, 0.0),
# Case-insensitive match (default)
("Hello World", "hello world", "*", False, False, 1.0),
# Case-sensitive hit
("Hello World", "Hello", "*", True, False, 1.0),
# Case-sensitive miss
("Hello World", "hello", "*", True, False, 0.0),
# Negated hit becomes miss
("Test output", "Test output", "*", False, True, 0.0),
# Negated miss becomes hit
("Hello World", "Goodbye", "*", False, True, 1.0),
# target_output_key extraction
("Test output", "Test output", "output", False, False, 1.0),
],
)
async def test_contains_evaluator(
self, sample_agent_execution: AgentExecution
) -> None:
"""Test contains evaluator."""
config = {
"name": "ContainsTest",
"target_output_key": "output",
"default_evaluation_criteria": {"search_text": "Test output"},
}
evaluator = ContainsEvaluator.model_validate(
{"evaluatorConfig": config, "id": str(uuid.uuid4())}
)
criteria = ContainsEvaluationCriteria(search_text="Test output")
result = await evaluator.evaluate(sample_agent_execution, criteria)

assert isinstance(result, NumericEvaluationResult)
assert result.score == 1.0

@pytest.mark.asyncio
async def test_contains_evaluator_negated(
self, sample_agent_execution: AgentExecution
self,
agent_output: Any,
search_text: str,
target_key: str,
case_sensitive: bool,
negated: bool,
expected_score: float,
sample_agent_execution: AgentExecution,
) -> None:
"""Test contains evaluator with negated criteria."""
"""Test ContainsEvaluator across match, no-match, case sensitivity, and negation cases."""
if target_key == "output":
execution = (
sample_agent_execution # has agent_output={"output": "Test output"}
)
else:
execution = AgentExecution(
agent_input={},
agent_output=agent_output,
agent_trace=[],
)
config = {
"name": "ContainsTest",
"negated": True,
"target_output_key": "output",
"default_evaluation_criteria": {"search_text": "Test output"},
"target_output_key": target_key,
"case_sensitive": case_sensitive,
"negated": negated,
}
evaluator = ContainsEvaluator.model_validate(
{"evaluatorConfig": config, "id": str(uuid.uuid4())}
)
criteria = ContainsEvaluationCriteria(search_text="Test output")
result = await evaluator.evaluate(sample_agent_execution, criteria)
criteria = ContainsEvaluationCriteria(search_text=search_text)
result = await evaluator.evaluate(execution, criteria)

assert isinstance(result, NumericEvaluationResult)
assert result.score == 0.0
assert result.score == expected_score

@pytest.mark.asyncio
async def test_contains_evaluator_validate_and_evaluate_criteria(
Expand All @@ -332,7 +370,6 @@ async def test_contains_evaluator_validate_and_evaluate_criteria(
config = {
"name": "ContainsTest",
"target_output_key": "*",
"default_evaluation_criteria": {"search_text": "Test output"},
}
evaluator = ContainsEvaluator.model_validate(
{"evaluatorConfig": config, "id": str(uuid.uuid4())}
Expand Down Expand Up @@ -395,6 +432,44 @@ async def test_json_similarity_partial_match(self) -> None:
assert isinstance(result, NumericEvaluationResult)
assert math.isclose(result.score, 0.666, abs_tol=1e-3)

@pytest.mark.asyncio
@pytest.mark.parametrize(
    "actual_output, expected_output, expected_score",
    [
        # int/float normalization — identical after normalization
        ({"count": 1}, {"count": 1.0}, 1.0),
        ({"count": 1.0}, {"count": 1}, 1.0),
        # Nested int/float
        ({"a": {"b": 1}}, {"a": {"b": 1.0}}, 1.0),
        # List of ints vs floats
        ({"vals": [1, 2, 3]}, {"vals": [1.0, 2.0, 3.0]}, 1.0),
        # Different numeric values — partial score: 1.0 - |expected-actual|/|expected|
        ({"count": 1.5}, {"count": 1}, 0.5),
    ],
)
async def test_json_similarity_numeric_normalization(
    self,
    actual_output: Any,
    expected_output: Any,
    expected_score: float,
) -> None:
    """Test that int/float normalization is applied before JSON similarity comparison.

    Each case feeds the whole agent output and expected output to a
    JsonSimilarityEvaluator built with only a name in its config, then
    checks the resulting numeric score.
    """
    execution = AgentExecution(
        agent_input={},
        agent_output=actual_output,
        agent_trace=[],
    )
    config = {"name": "JsonSimilarityTest"}
    evaluator = JsonSimilarityEvaluator.model_validate(
        {"evaluatorConfig": config, "id": str(uuid.uuid4())}
    )
    criteria = OutputEvaluationCriteria(expected_output=expected_output)  # pyright: ignore[reportCallIssue]

    result = await evaluator.evaluate(execution, criteria)

    assert isinstance(result, NumericEvaluationResult)
    # The score is a computed float; compare with a tolerance, as the
    # neighbouring partial-match test does, instead of exact equality.
    assert math.isclose(result.score, expected_score, abs_tol=1e-9)

@pytest.mark.asyncio
async def test_json_similarity_validate_and_evaluate_criteria(self) -> None:
"""Test JSON similarity using validate_and_evaluate_criteria."""
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.