Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 92 additions & 18 deletions src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,20 +112,14 @@ def find_process_result(step_name: str):
)
)

total_evaluated_fields_count = evaluated_result.confidence.get(
"total_evaluated_fields_count", 0
)
schema_score = (
0
if total_evaluated_fields_count == 0
else round(
(
len(evaluated_result.comparison_result.items)
- evaluated_result.confidence["zero_confidence_fields_count"]
)
/ len(evaluated_result.comparison_result.items),
3,
)
# Compute the aggregate scores. Successful (Completed) processing
# always yields numeric scores: when probabilistic confidence is
# available (logprobs from non-reasoning models / Content Understanding
# signal) we use it; otherwise we fall back to a structural
# completeness score (fraction of expected fields actually filled).
# Failed runs and genuinely empty extractions remain at ``0.0``.
entity_score, schema_score, min_extracted_entity_score = (
self._derive_aggregate_scores(evaluated_result)
)

processed_result = ContentProcess(
Expand All @@ -143,11 +137,9 @@ def find_process_result(step_name: str):
self._current_message_context.data_pipeline.pipeline_status.creation_time,
"%Y-%m-%dT%H:%M:%S.%fZ",
),
entity_score=evaluated_result.confidence["overall_confidence"],
entity_score=entity_score,
schema_score=schema_score,
min_extracted_entity_score=evaluated_result.confidence[
"min_extracted_field_confidence"
],
min_extracted_entity_score=min_extracted_entity_score,
prompt_tokens=evaluated_result.prompt_tokens,
completion_tokens=evaluated_result.completion_tokens,
target_schema=Schema.get_schema(
Expand Down Expand Up @@ -241,3 +233,85 @@ def _summarize_processed_time(self, step_results: list[StepResult]) -> str:
# Format the total elapsed time as a string
formatted_elapsed_time = f"{total_hours:02}:{total_minutes:02}:{total_seconds:02}.{total_milliseconds:03}"
return formatted_elapsed_time

@staticmethod
def _is_filled_value(value: object) -> bool:
    """Return whether an extracted value counts as "actually filled".

    Not filled: ``None``, empty or whitespace-only strings, and dicts /
    lists / tuples / sets that contain no filled member (which includes
    empty containers). Containers are inspected recursively, so a nested
    object holding only nulls is reported as empty.

    Filled: every other value, including ``False`` and ``0``.
    """
    if isinstance(value, str):
        # Whitespace-only text carries no information.
        return bool(value.strip())

    if isinstance(value, dict):
        members = value.values()
    elif isinstance(value, (list, tuple, set)):
        members = value
    else:
        # Scalars (bool, int, float, ...) are filled unless they are None.
        return value is not None

    # A container is filled as soon as one member is filled.
    for member in members:
        if SaveHandler._is_filled_value(member):
            return True
    return False

@staticmethod
def _derive_aggregate_scores(
    evaluated_result: DataExtractionResult,
) -> tuple[float, float, float]:
    """Compute ``(entity_score, schema_score, min_extracted_entity_score)``.

    Score selection order:

    1. **Probabilistic confidence** -- when the evaluate step reported
       ``total_evaluated_fields_count > 0`` and a comparison table exists,
       ``entity_score`` and ``min_extracted_entity_score`` are read from
       ``overall_confidence`` and ``min_extracted_field_confidence``, and
       ``schema_score`` is the fraction of comparison items that are *not*
       counted in ``zero_confidence_fields_count``.

    2. **Structural completeness fallback** -- when no probabilistic
       signal was produced (e.g. reasoning models that don't return
       logprobs, or an image-only flow) but a comparison table exists,
       all three scores are the fraction of comparison items whose
       ``Extracted`` value is actually filled.

    3. **Zero** -- when there are no comparison items at all (failed
       pipeline / genuinely empty result).

    Args:
        evaluated_result: Result of the evaluate step. Only its
            ``confidence`` mapping and ``comparison_result.items`` are
            read; either may be ``None``.

    Returns:
        ``(entity_score, schema_score, min_extracted_entity_score)`` as
        floats. ``schema_score`` is always within ``[0.0, 1.0]``.
    """
    confidence = evaluated_result.confidence or {}
    comparison = evaluated_result.comparison_result
    comparison_items = (comparison.items if comparison is not None else None) or []

    # Path 3: nothing to score on.
    if not comparison_items:
        return (0.0, 0.0, 0.0)

    item_count = len(comparison_items)

    # ``dict.get(key, 0)`` still yields ``None`` when the key is present
    # with a null value, so normalise with ``or`` before comparing.
    total_evaluated_fields_count = (
        confidence.get("total_evaluated_fields_count") or 0
    )

    # Path 1: probabilistic confidence.
    if total_evaluated_fields_count > 0:
        zero_count = confidence.get("zero_confidence_fields_count") or 0
        raw_schema_score = round((item_count - zero_count) / item_count, 3)
        # Clamp so an inconsistent zero-confidence count can never surface
        # as a negative (or >100%) score.
        schema_score = min(1.0, max(0.0, raw_schema_score))
        entity_score = float(confidence.get("overall_confidence") or 0.0)
        min_extracted_entity_score = float(
            confidence.get("min_extracted_field_confidence") or 0.0
        )
        return (entity_score, schema_score, min_extracted_entity_score)

    # Path 2: structural completeness fallback.
    filled = sum(
        1
        for item in comparison_items
        if SaveHandler._is_filled_value(item.Extracted)
    )
    ratio = round(filled / item_count, 3)
    return (ratio, ratio, ratio)
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from azure.identity import (
AzureCliCredential,
AzureDeveloperCliCredential,
DefaultAzureCredential,
ManagedIdentityCredential,
)
Comment thread
Prachig-Microsoft marked this conversation as resolved.
from azure.identity import (
Expand Down
1 change: 0 additions & 1 deletion src/ContentProcessor/src/libs/utils/credential_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from azure.identity import (
AzureCliCredential,
AzureDeveloperCliCredential,
DefaultAzureCredential,
ManagedIdentityCredential,
)
Comment thread
Prachig-Microsoft marked this conversation as resolved.
from azure.identity import (
Expand Down
236 changes: 236 additions & 0 deletions src/ContentProcessor/tests/unit/pipeline/test_save_handler_scores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for ``SaveHandler._derive_aggregate_scores``.

Covers the score-derivation contract:
- probabilistic confidence flows through verbatim when available
- structural completeness fallback fires for Completed runs without logprobs
(e.g. reasoning models / image-only flow) instead of emitting a misleading 0%
- a genuine zero is preserved as ``0.0``
- failed/empty runs return ``0.0``
"""

from __future__ import annotations

from libs.pipeline.handlers.logics.evaluate_handler.comparison import (
ExtractionComparisonData,
ExtractionComparisonItem,
)
from libs.pipeline.handlers.logics.evaluate_handler.model import DataExtractionResult
from libs.pipeline.handlers.save_handler import SaveHandler


def _make_result(
    *,
    items: list[ExtractionComparisonItem],
    confidence: dict,
) -> DataExtractionResult:
    """Build a ``DataExtractionResult`` around the given items and confidence.

    The extracted payload is an empty dict and the token / timing fields
    are zero; only ``items`` and ``confidence`` vary between tests.
    """
    comparison = ExtractionComparisonData(items=items)
    fields = {
        "extracted_result": {},
        "confidence": confidence,
        "comparison_result": comparison,
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "execution_time": 0,
    }
    return DataExtractionResult(**fields)


class TestProbabilisticPath:
    """Scores derived when the evaluate step reports evaluated fields."""

    def test_valid_scores_flow_through(self):
        """A normal evaluate-step result must produce numeric scores."""
        rows = (
            ("a", "x", "90.00%", "True"),
            ("b", "y", "80.00%", "True"),
            ("c", "z", "0.00%", "False"),
        )
        items = [
            ExtractionComparisonItem(
                Field=name, Extracted=value, Confidence=pct, IsAboveThreshold=flag
            )
            for name, value, pct, flag in rows
        ]
        confidence = dict(
            total_evaluated_fields_count=3,
            overall_confidence=0.567,
            min_extracted_field_confidence=0.0,
            zero_confidence_fields_count=1,
        )

        result = _make_result(items=items, confidence=confidence)
        entity, schema, min_score = SaveHandler._derive_aggregate_scores(result)

        # One of the three fields has zero confidence, so schema is 2/3 -> 0.667.
        assert (entity, schema, min_score) == (0.567, round(2 / 3, 3), 0.0)

    def test_all_fields_above_threshold(self):
        """No zero-confidence fields gives a schema score of 1.0."""
        rows = (
            ("a", "x", "95.00%", "True"),
            ("b", "y", "90.00%", "True"),
        )
        items = [
            ExtractionComparisonItem(
                Field=name, Extracted=value, Confidence=pct, IsAboveThreshold=flag
            )
            for name, value, pct, flag in rows
        ]
        confidence = dict(
            total_evaluated_fields_count=2,
            overall_confidence=0.925,
            min_extracted_field_confidence=0.9,
            zero_confidence_fields_count=0,
        )

        result = _make_result(items=items, confidence=confidence)
        entity, schema, min_score = SaveHandler._derive_aggregate_scores(result)

        assert (entity, schema, min_score) == (0.925, 1.0, 0.9)


class TestStructuralFallback:
    """With no probabilistic signal (``total_evaluated_fields_count == 0``)
    but a non-empty comparison table, every score must equal the fraction
    of items whose extracted value is actually filled."""

    def test_all_fields_filled_yields_one(self):
        """Three filled values out of three give 1.0 for every score."""
        extracted_by_field = {"a": "x", "b": "y", "c": 42}
        items = [
            ExtractionComparisonItem(
                Field=name,
                Extracted=value,
                Confidence="0.00%",
                IsAboveThreshold="False",
            )
            for name, value in extracted_by_field.items()
        ]
        # No probabilistic signal: total_evaluated_fields_count == 0
        confidence = dict(
            total_evaluated_fields_count=0,
            overall_confidence=0.0,
            min_extracted_field_confidence=0.0,
            zero_confidence_fields_count=0,
        )

        result = _make_result(items=items, confidence=confidence)
        entity, schema, min_score = SaveHandler._derive_aggregate_scores(result)

        assert (entity, schema, min_score) == (1.0, 1.0, 1.0)

    def test_partial_fill_yields_ratio(self):
        """``None`` and ``""`` do not count as filled: 2 of 4 gives 0.5."""
        extracted_by_field = {"a": "x", "b": None, "c": "", "d": "z"}
        items = [
            ExtractionComparisonItem(
                Field=name,
                Extracted=value,
                Confidence="0.00%",
                IsAboveThreshold="False",
            )
            for name, value in extracted_by_field.items()
        ]
        confidence = dict(total_evaluated_fields_count=0)

        result = _make_result(items=items, confidence=confidence)
        entity, schema, min_score = SaveHandler._derive_aggregate_scores(result)

        assert (entity, schema, min_score) == (0.5, 0.5, 0.5)

    def test_all_fields_empty_yields_zero(self):
        """Genuine-empty extraction: structural fallback collapses to ``0.0``."""
        extracted_by_field = {"a": None, "b": "", "c": " "}
        items = [
            ExtractionComparisonItem(
                Field=name,
                Extracted=value,
                Confidence="0.00%",
                IsAboveThreshold="False",
            )
            for name, value in extracted_by_field.items()
        ]
        confidence = dict(total_evaluated_fields_count=0)

        result = _make_result(items=items, confidence=confidence)
        entity, schema, min_score = SaveHandler._derive_aggregate_scores(result)

        assert (entity, schema, min_score) == (0.0, 0.0, 0.0)


class TestZeroPath:
    """Cases where every derived score must be exactly ``0.0``."""

    def test_no_comparison_items_returns_zero(self):
        """No extraction data at all (failed pipeline) → ``0.0``."""
        confidence = dict(
            total_evaluated_fields_count=0,
            overall_confidence=0.0,
            min_extracted_field_confidence=0.0,
            zero_confidence_fields_count=0,
        )

        result = _make_result(items=[], confidence=confidence)
        entity, schema, min_score = SaveHandler._derive_aggregate_scores(result)

        assert (entity, schema, min_score) == (0.0, 0.0, 0.0)

    def test_genuine_zero_probabilistic_score_preserved(self):
        """A real ``0`` confidence (every field below threshold) must NOT be
        replaced by the structural fallback — it's genuinely 0%."""
        only_item = ExtractionComparisonItem(
            Field="a", Extracted="x", Confidence="0.00%", IsAboveThreshold="False"
        )
        confidence = dict(
            total_evaluated_fields_count=1,
            overall_confidence=0.0,
            min_extracted_field_confidence=0.0,
            zero_confidence_fields_count=1,
        )

        result = _make_result(items=[only_item], confidence=confidence)
        entity, schema, min_score = SaveHandler._derive_aggregate_scores(result)

        # The field is filled, so the structural fallback would have said 1.0.
        assert (entity, schema, min_score) == (0.0, 0.0, 0.0)


class TestIsFilledValue:
    """Coverage for the ``_is_filled_value`` helper used by the structural fallback."""

    def test_none_is_empty(self):
        filled = SaveHandler._is_filled_value(None)
        assert filled is False

    def test_empty_string_is_empty(self):
        for blank in ("", " "):
            assert SaveHandler._is_filled_value(blank) is False

    def test_non_empty_string_is_filled(self):
        filled = SaveHandler._is_filled_value("x")
        assert filled is True

    def test_zero_int_is_filled(self):
        # A literal ``0`` is a valid extracted value (e.g. count fields).
        filled = SaveHandler._is_filled_value(0)
        assert filled is True

    def test_bool_is_filled(self):
        for flag in (False, True):
            assert SaveHandler._is_filled_value(flag) is True

    def test_empty_container_is_empty(self):
        for container in ([], {}):
            assert SaveHandler._is_filled_value(container) is False

    def test_nested_all_null_is_empty(self):
        hollow_values = (
            {"a": None, "b": ""},
            [None, "", {"c": None}],
        )
        for hollow in hollow_values:
            assert SaveHandler._is_filled_value(hollow) is False

    def test_nested_with_value_is_filled(self):
        populated_values = (
            {"a": None, "b": "x"},
            [None, "x"],
        )
        for populated in populated_values:
            assert SaveHandler._is_filled_value(populated) is True
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ class Content_Process(EntityBase):
description="MIME type of the processed content file", default=None
)
entity_score: float = Field(
description="Score indicating the quality of entity extraction from the content",
description="Score indicating the quality of entity extraction from the content. For Completed runs this is either the probabilistic confidence (when logprobs are available) or a structural completeness fallback (fraction of expected fields actually filled). Failed runs and genuinely empty extractions remain at ``0.0``.",
default=0.0,
)
schema_score: float = Field(
description="Score indicating the quality of schema matching for the content",
description="Score indicating the quality of schema matching for the content. For Completed runs this is either the probabilistic above-threshold ratio or a structural completeness fallback. Failed runs remain at ``0.0``.",
default=0.0,
)
status: Optional[str] = Field(
Expand Down
Loading
Loading