microsoft · Avijit-Microsoft · Jun 16, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
@@ -112,20 +112,14 @@ def find_process_result(step_name: str):
             )
         )
 
-        total_evaluated_fields_count = evaluated_result.confidence.get(
-            "total_evaluated_fields_count", 0
-        )
-        schema_score = (
-            0
-            if total_evaluated_fields_count == 0
-            else round(
-                (
-                    len(evaluated_result.comparison_result.items)
-                    - evaluated_result.confidence["zero_confidence_fields_count"]
-                )
-                / len(evaluated_result.comparison_result.items),
-                3,
-            )
+        # Compute the aggregate scores. Successful (Completed) processing
+        # always yields numeric scores: when probabilistic confidence is
+        # available (logprobs from non-reasoning models / Content Understanding
+        # signal) we use it; otherwise we fall back to a structural
+        # completeness score (fraction of expected fields actually filled).
+        # Failed runs and genuinely empty extractions remain at ``0.0``.
+        entity_score, schema_score, min_extracted_entity_score = (
+            self._derive_aggregate_scores(evaluated_result)
         )
 
         processed_result = ContentProcess(
@@ -143,11 +137,9 @@ def find_process_result(step_name: str):
                 self._current_message_context.data_pipeline.pipeline_status.creation_time,
                 "%Y-%m-%dT%H:%M:%S.%fZ",
             ),
-            entity_score=evaluated_result.confidence["overall_confidence"],
+            entity_score=entity_score,
             schema_score=schema_score,
-            min_extracted_entity_score=evaluated_result.confidence[
-                "min_extracted_field_confidence"
-            ],
+            min_extracted_entity_score=min_extracted_entity_score,
             prompt_tokens=evaluated_result.prompt_tokens,
             completion_tokens=evaluated_result.completion_tokens,
             target_schema=Schema.get_schema(
@@ -241,3 +233,85 @@ def _summarize_processed_time(self, step_results: list[StepResult]) -> str:
         # Format the total elapsed time as a string
         formatted_elapsed_time = f"{total_hours:02}:{total_minutes:02}:{total_seconds:02}.{total_milliseconds:03}"
         return formatted_elapsed_time
+
+    @staticmethod
+    def _is_filled_value(value: object) -> bool:
+        """Return whether an extracted value counts as "actually filled".
+
+        ``None``, empty or whitespace-only strings, and empty containers are
+        *not* filled. Dicts and list/tuple/set/frozenset containers are
+        searched recursively, so a nested structure that holds only nulls or
+        blank strings is still reported as empty. Every other value —
+        including ``0`` and ``False`` — is treated as filled.
+
+        Args:
+            value: The extracted value to inspect.
+
+        Returns:
+            ``True`` if the value (or anything nested inside it) carries
+            data, ``False`` otherwise.
+        """
+        if value is None:
+            return False
+        if isinstance(value, str):
+            return bool(value.strip())
+        if isinstance(value, dict):
+            return any(SaveHandler._is_filled_value(v) for v in value.values())
+        # ``frozenset`` is listed alongside the mutable collections so every
+        # built-in container follows the same "empty means not filled" rule.
+        if isinstance(value, (list, tuple, set, frozenset)):
+            return any(SaveHandler._is_filled_value(v) for v in value)
+        # Remaining scalars (numbers, booleans, ...) are legitimate extracted
+        # values even when falsy, e.g. a literal ``0`` or ``False``.
+        return True
+
+    @staticmethod
+    def _derive_aggregate_scores(
+        evaluated_result: DataExtractionResult,
+    ) -> tuple[float, float, float]:
+        """Compute ``(entity_score, schema_score, min_extracted_entity_score)``.
+
+        Score selection order:
+
+        1. **Probabilistic confidence** — when the confidence dict reports
+           ``total_evaluated_fields_count > 0`` and comparison items exist:
+           ``entity_score`` is ``overall_confidence``,
+           ``min_extracted_entity_score`` is
+           ``min_extracted_field_confidence``, and ``schema_score`` is the
+           fraction of comparison items *not* counted in
+           ``zero_confidence_fields_count``, rounded to 3 decimals.
+
+        2. **Structural completeness fallback** — when no evaluated fields
+           are reported (the case this fallback was added for: runs without
+           logprobs, which previously surfaced as a misleading ``0%``) but
+           comparison items exist, all three scores are the fraction of items
+           whose ``Extracted`` value is filled (see ``_is_filled_value``),
+           rounded to 3 decimals.
+
+        3. **Zero** — when there are no comparison items at all, every
+           score is ``0.0``.
+
+        Args:
+            evaluated_result: Result of the evaluate step; its ``confidence``
+                and ``comparison_result`` may each be ``None``.
+
+        Returns:
+            A ``(entity_score, schema_score, min_extracted_entity_score)``
+            tuple of floats.
+        """
+        confidence = evaluated_result.confidence or {}
+        comparison_result = evaluated_result.comparison_result
+        comparison_items = (
+            comparison_result.items if comparison_result is not None else []
+        )
+
+        # Path 3: nothing to score on.
+        if not comparison_items:
+            return (0.0, 0.0, 0.0)
+
+        item_count = len(comparison_items)
+        # ``or 0`` (rather than a ``.get`` default) so a key that is present
+        # but ``None`` behaves like a missing key instead of raising on ``>``.
+        total_evaluated_fields_count = (
+            confidence.get("total_evaluated_fields_count") or 0
+        )
+
+        # Path 1: probabilistic confidence.
+        if total_evaluated_fields_count > 0:
+            zero_count = confidence.get("zero_confidence_fields_count") or 0
+            # Clamp to [0, 1]: the count comes from the confidence dict while
+            # the denominator comes from the comparison table, so a mismatch
+            # between the two must not yield an out-of-range score.
+            schema_score = round(
+                min(1.0, max(0.0, (item_count - zero_count) / item_count)),
+                3,
+            )
+            entity_score = float(confidence.get("overall_confidence") or 0.0)
+            min_extracted_entity_score = float(
+                confidence.get("min_extracted_field_confidence") or 0.0
+            )
+            return (entity_score, schema_score, min_extracted_entity_score)
+
+        # Path 2: structural completeness fallback.
+        filled = sum(
+            1
+            for item in comparison_items
+            if SaveHandler._is_filled_value(item.Extracted)
+        )
+        ratio = round(filled / item_count, 3)
+        return (ratio, ratio, ratio)
@@ -19,7 +19,6 @@
 from azure.identity import (
     AzureCliCredential,
     AzureDeveloperCliCredential,
-    DefaultAzureCredential,
     ManagedIdentityCredential,
 )
 from azure.identity import (

@@ -19,7 +19,6 @@
 from azure.identity import (
     AzureCliCredential,
     AzureDeveloperCliCredential,
-    DefaultAzureCredential,
     ManagedIdentityCredential,
 )
 from azure.identity import (

@@ -0,0 +1,236 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for ``SaveHandler._derive_aggregate_scores``.
+
+Covers the score-derivation contract:
+- probabilistic confidence flows through verbatim when available
+- structural completeness fallback fires for Completed runs without logprobs
+  (e.g. reasoning models / image-only flow) instead of emitting a misleading 0%
+- a genuine zero is preserved as ``0.0``
+- failed/empty runs return ``0.0``
+"""
+
+from __future__ import annotations
+
+from libs.pipeline.handlers.logics.evaluate_handler.comparison import (
+    ExtractionComparisonData,
+    ExtractionComparisonItem,
+)
+from libs.pipeline.handlers.logics.evaluate_handler.model import DataExtractionResult
+from libs.pipeline.handlers.save_handler import SaveHandler
+
+
+def _make_result(
+    *,
+    items: list[ExtractionComparisonItem],
+    confidence: dict,
+) -> DataExtractionResult:
+    """Build a minimal ``DataExtractionResult`` for score-derivation tests.
+
+    Only ``items`` and ``confidence`` vary between tests; the extracted
+    payload is empty and the token/time counters are zero.
+    """
+    comparison = ExtractionComparisonData(items=items)
+    fixed_fields = {
+        "extracted_result": {},
+        "prompt_tokens": 0,
+        "completion_tokens": 0,
+        "execution_time": 0,
+    }
+    return DataExtractionResult(
+        confidence=confidence,
+        comparison_result=comparison,
+        **fixed_fields,
+    )
+
+
+class TestProbabilisticPath:
+    """Scores when the confidence dict reports evaluated fields."""
+
+    def test_valid_scores_flow_through(self):
+        """A normal evaluate-step result must produce numeric scores."""
+        rows = [
+            ("a", "x", "90.00%", "True"),
+            ("b", "y", "80.00%", "True"),
+            ("c", "z", "0.00%", "False"),
+        ]
+        result = _make_result(
+            items=[
+                ExtractionComparisonItem(
+                    Field=field,
+                    Extracted=extracted,
+                    Confidence=pct,
+                    IsAboveThreshold=above,
+                )
+                for field, extracted, pct, above in rows
+            ],
+            confidence={
+                "total_evaluated_fields_count": 3,
+                "overall_confidence": 0.567,
+                "min_extracted_field_confidence": 0.0,
+                "zero_confidence_fields_count": 1,
+            },
+        )
+        entity_score, schema_score, lowest_score = (
+            SaveHandler._derive_aggregate_scores(result)
+        )
+        assert entity_score == 0.567
+        # One of the three fields is reported as zero-confidence → 0.667
+        assert schema_score == round(2 / 3, 3)
+        assert lowest_score == 0.0
+
+    def test_all_fields_above_threshold(self):
+        """With no zero-confidence fields the schema score is a full 1.0."""
+        rows = [
+            ("a", "x", "95.00%"),
+            ("b", "y", "90.00%"),
+        ]
+        result = _make_result(
+            items=[
+                ExtractionComparisonItem(
+                    Field=field,
+                    Extracted=extracted,
+                    Confidence=pct,
+                    IsAboveThreshold="True",
+                )
+                for field, extracted, pct in rows
+            ],
+            confidence={
+                "total_evaluated_fields_count": 2,
+                "overall_confidence": 0.925,
+                "min_extracted_field_confidence": 0.9,
+                "zero_confidence_fields_count": 0,
+            },
+        )
+        entity_score, schema_score, lowest_score = (
+            SaveHandler._derive_aggregate_scores(result)
+        )
+        assert entity_score == 0.925
+        assert schema_score == 1.0
+        assert lowest_score == 0.9
+
+
+class TestStructuralFallback:
+    """Completed runs without a probabilistic signal.
+
+    When logprobs are unavailable (reasoning model / image-only) but
+    extraction succeeded, the Completed file must still get a meaningful
+    numeric score based on schema completeness.
+    """
+
+    def test_all_fields_filled_yields_one(self):
+        """Every extracted value is filled, so all three scores are 1.0."""
+        extracted_by_field = [("a", "x"), ("b", "y"), ("c", 42)]
+        result = _make_result(
+            items=[
+                ExtractionComparisonItem(
+                    Field=field,
+                    Extracted=extracted,
+                    Confidence="0.00%",
+                    IsAboveThreshold="False",
+                )
+                for field, extracted in extracted_by_field
+            ],
+            # No probabilistic signal: total_evaluated_fields_count == 0
+            confidence={
+                "total_evaluated_fields_count": 0,
+                "overall_confidence": 0.0,
+                "min_extracted_field_confidence": 0.0,
+                "zero_confidence_fields_count": 0,
+            },
+        )
+        entity_score, schema_score, lowest_score = (
+            SaveHandler._derive_aggregate_scores(result)
+        )
+        assert entity_score == 1.0
+        assert schema_score == 1.0
+        assert lowest_score == 1.0
+
+    def test_partial_fill_yields_ratio(self):
+        """``None`` and ``""`` count as unfilled; the rest count as filled."""
+        extracted_by_field = [("a", "x"), ("b", None), ("c", ""), ("d", "z")]
+        result = _make_result(
+            items=[
+                ExtractionComparisonItem(
+                    Field=field,
+                    Extracted=extracted,
+                    Confidence="0.00%",
+                    IsAboveThreshold="False",
+                )
+                for field, extracted in extracted_by_field
+            ],
+            confidence={"total_evaluated_fields_count": 0},
+        )
+        entity_score, schema_score, lowest_score = (
+            SaveHandler._derive_aggregate_scores(result)
+        )
+        # 2 of 4 fields actually filled → 0.5
+        assert entity_score == 0.5
+        assert schema_score == 0.5
+        assert lowest_score == 0.5
+
+    def test_all_fields_empty_yields_zero(self):
+        """Genuine-empty extraction: structural fallback collapses to ``0.0``."""
+        extracted_by_field = [("a", None), ("b", ""), ("c", "   ")]
+        result = _make_result(
+            items=[
+                ExtractionComparisonItem(
+                    Field=field,
+                    Extracted=extracted,
+                    Confidence="0.00%",
+                    IsAboveThreshold="False",
+                )
+                for field, extracted in extracted_by_field
+            ],
+            confidence={"total_evaluated_fields_count": 0},
+        )
+        entity_score, schema_score, lowest_score = (
+            SaveHandler._derive_aggregate_scores(result)
+        )
+        assert entity_score == 0.0
+        assert schema_score == 0.0
+        assert lowest_score == 0.0
+
+
+class TestZeroPath:
+    """Cases where every aggregate score must come out as ``0.0``."""
+
+    def test_no_comparison_items_returns_zero(self):
+        """No extraction data at all (failed pipeline) → ``0.0``."""
+        result = _make_result(
+            items=[],
+            confidence={
+                "total_evaluated_fields_count": 0,
+                "overall_confidence": 0.0,
+                "min_extracted_field_confidence": 0.0,
+                "zero_confidence_fields_count": 0,
+            },
+        )
+        entity_score, schema_score, lowest_score = (
+            SaveHandler._derive_aggregate_scores(result)
+        )
+        assert entity_score == 0.0
+        assert schema_score == 0.0
+        assert lowest_score == 0.0
+
+    def test_genuine_zero_probabilistic_score_preserved(self):
+        """A real ``0`` confidence (every field below threshold) must NOT be
+        replaced by the structural fallback — it's genuinely 0%."""
+        only_item = ExtractionComparisonItem(
+            Field="a",
+            Extracted="x",
+            Confidence="0.00%",
+            IsAboveThreshold="False",
+        )
+        result = _make_result(
+            items=[only_item],
+            confidence={
+                "total_evaluated_fields_count": 1,
+                "overall_confidence": 0.0,
+                "min_extracted_field_confidence": 0.0,
+                "zero_confidence_fields_count": 1,
+            },
+        )
+        entity_score, schema_score, lowest_score = (
+            SaveHandler._derive_aggregate_scores(result)
+        )
+        assert entity_score == 0.0
+        assert schema_score == 0.0
+        assert lowest_score == 0.0
+
+
+class TestIsFilledValue:
+    """Coverage for the ``_is_filled_value`` helper used by the structural fallback."""
+
+    def test_none_is_empty(self):
+        """``None`` is never a filled value."""
+        outcome = SaveHandler._is_filled_value(None)
+        assert outcome is False
+
+    def test_empty_string_is_empty(self):
+        """Empty and whitespace-only strings are not filled."""
+        for blank in ("", "   "):
+            assert SaveHandler._is_filled_value(blank) is False
+
+    def test_non_empty_string_is_filled(self):
+        """Any string with visible content is filled."""
+        outcome = SaveHandler._is_filled_value("x")
+        assert outcome is True
+
+    def test_zero_int_is_filled(self):
+        """A literal ``0`` is a valid extracted value (e.g. count fields)."""
+        outcome = SaveHandler._is_filled_value(0)
+        assert outcome is True
+
+    def test_bool_is_filled(self):
+        """Both boolean values count as filled, including ``False``."""
+        for flag in (False, True):
+            assert SaveHandler._is_filled_value(flag) is True
+
+    def test_empty_container_is_empty(self):
+        """Empty lists and dicts are not filled."""
+        for container in ([], {}):
+            assert SaveHandler._is_filled_value(container) is False
+
+    def test_nested_all_null_is_empty(self):
+        """Containers holding only nulls/blanks are not filled at any depth."""
+        hollow_values = (
+            {"a": None, "b": ""},
+            [None, "", {"c": None}],
+        )
+        for hollow in hollow_values:
+            assert SaveHandler._is_filled_value(hollow) is False
+
+    def test_nested_with_value_is_filled(self):
+        """One real value anywhere inside a container makes it filled."""
+        populated_values = (
+            {"a": None, "b": "x"},
+            [None, "x"],
+        )
+        for populated in populated_values:
+            assert SaveHandler._is_filled_value(populated) is True
@@ -54,11 +54,11 @@ class Content_Process(EntityBase):
         description="MIME type of the processed content file", default=None
     )
     entity_score: float = Field(
-        description="Score indicating the quality of entity extraction from the content",
+        description="Score indicating the quality of entity extraction from the content. For Completed runs this is either the probabilistic confidence (when logprobs are available) or a structural completeness fallback (fraction of expected fields actually filled). Failed runs and genuinely empty extractions remain at ``0.0``.",
         default=0.0,
     )
     schema_score: float = Field(
-        description="Score indicating the quality of schema matching for the content",
+        description="Score indicating the quality of schema matching for the content. For Completed runs this is either the probabilistic above-threshold ratio or a structural completeness fallback. Failed runs remain at ``0.0``.",
         default=0.0,
     )
     status: Optional[str] = Field(