"""Response Groundedness metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t
from typing import List

from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.response_groundedness import (
    response_groundedness_judge1_prompt,
    response_groundedness_judge2_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class GroundednessRating(BaseModel):
    """Structured output for groundedness rating."""

    rating: int
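    # Illustration: a structured LLM reply such as {"rating": 2} is coerced by
    # instructor into GroundednessRating(rating=2). Range checking (0, 1, or 2)
    # happens downstream in _get_judge_rating, not in this model.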


class ResponseGroundedness(BaseMetric):
    """
    Modern v2 implementation of response groundedness evaluation.

    Evaluates how well grounded a response is in the retrieved contexts
    using a dual-judge system. The metric averages the ratings from two
    distinct judge prompts to produce a robust evaluation.

    The metric uses NVIDIA's proven dual-judge approach:
    1. Judge 1: Direct groundedness evaluation with structured instructions
    2. Judge 2: Alternative perspective for fairness
    3. Average both judges for the final score

    Rating scale: 0 (not grounded), 1 (partially grounded), 2 (fully grounded)
    Final score: Average of both judges converted to a 0.0-1.0 scale
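
    For example, judge ratings of 2 and 1 map to 1.0 and 0.5 after the
    rating / 2.0 conversion, giving a final score of (1.0 + 0.5) / 2 = 0.75.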

    Usage:
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import ResponseGroundedness
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = ResponseGroundedness(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     response="Einstein was born in Germany in 1879.",
        ...     retrieved_contexts=["Albert Einstein was born in Ulm, Germany on March 14, 1879."]
        ... )
        >>> print(f"Response Groundedness: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for dual-judge evaluation
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
        max_retries: Maximum retry attempts for invalid ratings
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "response_groundedness",
        max_retries: int = 5,
        **kwargs,
    ):
        """
        Initialize ResponseGroundedness metric with required components.

        Args:
            llm: Modern instructor-based LLM for dual-judge evaluation
            name: The metric name
            max_retries: Maximum retry attempts for invalid ratings
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.max_retries = max_retries

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, response: str, retrieved_contexts: List[str]
    ) -> MetricResult:
        """
        Calculate response groundedness score using dual-judge evaluation.

        Args:
            response: The response to evaluate for groundedness
            retrieved_contexts: The retrieved contexts to check groundedness against

        Returns:
            MetricResult with response groundedness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Handle empty-string edge cases the same way the legacy implementation does
        context_str = "\n".join(retrieved_contexts)

        if not response.strip() or not context_str.strip():
            return MetricResult(value=0.0)

        # Get ratings from both judges (converted to the 0.0-1.0 scale
        # inside _get_judge_rating)
        judge1_rating = await self._get_judge_rating(
            response_groundedness_judge1_prompt(response, context_str)
        )
        judge2_rating = await self._get_judge_rating(
            response_groundedness_judge2_prompt(response, context_str)
        )

        # Average the scores (already on the 0.0-1.0 scale, as in legacy)
        score = self._average_scores(judge1_rating, judge2_rating)

        return MetricResult(value=float(score))

    async def _get_judge_rating(self, prompt: str) -> float:
        """Get a rating from one judge via structured output, with legacy-compatible processing."""
        for _ in range(self.max_retries):
            try:
                result = await self.llm.agenerate(prompt, GroundednessRating)
            except Exception:
                continue  # Retry on generation or parsing errors

            # Accept only ratings in the expected range and convert to the
            # legacy 0.0-1.0 scale; otherwise retry.
            if result.rating in (0, 1, 2):
                return result.rating / 2.0

        # All retries exhausted (or max_retries == 0): signal failure with NaN
        return float("nan")

    def _average_scores(self, score1: float, score2: float) -> float:
        """Average two judge scores, handling NaN values. Matches the legacy logic exactly."""
        if score1 >= 0 and score2 >= 0:
            # NaN comparisons evaluate to False, so this branch only runs when
            # both judges returned valid scores.
            return (score1 + score2) / 2.0
        else:
            # Match legacy behavior: fall back to max(). Note that max() with
            # NaN is order-dependent in Python (a NaN first argument
            # propagates), mirroring the legacy implementation.
            return max(score1, score2)
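

# A minimal, illustrative usage sketch, not part of the metric itself: it
# mirrors the docstring example above and assumes an OpenAI API key is
# available in the environment. llm_factory and AsyncOpenAI are the same
# entry points the docstring example uses.
if __name__ == "__main__":
    import asyncio

    from openai import AsyncOpenAI

    from ragas.llms.base import llm_factory

    async def _demo() -> None:
        client = AsyncOpenAI()
        llm = llm_factory("gpt-4o", client=client)
        metric = ResponseGroundedness(llm=llm)
        result = await metric.ascore(
            response="Einstein was born in Germany in 1879.",
            retrieved_contexts=[
                "Albert Einstein was born in Ulm, Germany on March 14, 1879."
            ],
        )
        print(f"Response Groundedness: {result.value}")

    asyncio.run(_demo())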