
Commit aa7b7bb

Response Groundedness (#2403)
Co-authored-by: Ani <5357586+anistark@users.noreply.github.com>
1 parent 023e332 commit aa7b7bb

File tree: 4 files changed, +453 -0 lines changed


src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
 from ragas.metrics.collections._factual_correctness import FactualCorrectness
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
+from ragas.metrics.collections._response_groundedness import ResponseGroundedness
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
 from ragas.metrics.collections._string import (
@@ -44,6 +45,7 @@
     "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
+    "ResponseGroundedness",
     "RougeScore",
     "SemanticSimilarity",
     "StringPresence",
src/ragas/metrics/collections/_response_groundedness.py

Lines changed: 165 additions & 0 deletions

@@ -0,0 +1,165 @@
"""Response Groundedness metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t
from typing import List

from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.response_groundedness import (
    response_groundedness_judge1_prompt,
    response_groundedness_judge2_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class GroundednessRating(BaseModel):
    """Structured output for groundedness rating."""

    rating: int


class ResponseGroundedness(BaseMetric):
    """
    Modern v2 implementation of response groundedness evaluation.

    Evaluates how well grounded a response is in the retrieved contexts
    using a dual-judge system. This metric averages two distinct judge prompts
    to ensure robust evaluation.

    The metric uses NVIDIA's proven dual-judge approach:
    1. Judge 1: Direct groundedness evaluation with structured instructions
    2. Judge 2: Alternative perspective for fairness
    3. Average both judges for final score

    Rating scale: 0 (not grounded), 1 (partially grounded), 2 (fully grounded)
    Final score: Average of both judges converted to 0.0-1.0 scale

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import ResponseGroundedness
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = ResponseGroundedness(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     response="Einstein was born in Germany in 1879.",
        ...     retrieved_contexts=["Albert Einstein was born in Ulm, Germany on March 14, 1879."]
        ... )
        >>> print(f"Response Groundedness: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for dual-judge evaluation
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
        max_retries: Maximum retry attempts for invalid ratings
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "response_groundedness",
        max_retries: int = 5,
        **kwargs,
    ):
        """
        Initialize ResponseGroundedness metric with required components.

        Args:
            llm: Modern instructor-based LLM for dual-judge evaluation
            name: The metric name
            max_retries: Maximum retry attempts for invalid ratings
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.max_retries = max_retries

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, response: str, retrieved_contexts: List[str]
    ) -> MetricResult:
        """
        Calculate response groundedness score using dual-judge evaluation.

        Args:
            response: The response to evaluate for groundedness
            retrieved_contexts: The retrieved contexts to check groundedness against

        Returns:
            MetricResult with response groundedness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Handle edge cases like legacy
        context_str = "\n".join(retrieved_contexts)

        if not response.strip() or not context_str.strip():
            return MetricResult(value=0.0)

        # Get ratings from both judges (already on 0.0-1.0 scale from legacy parsing)
        judge1_rating = await self._get_judge_rating(
            response_groundedness_judge1_prompt(response, context_str)
        )
        judge2_rating = await self._get_judge_rating(
            response_groundedness_judge2_prompt(response, context_str)
        )

        # Average the scores (already on 0.0-1.0 scale like legacy)
        score = self._average_scores(judge1_rating, judge2_rating)

        return MetricResult(value=float(score))

    async def _get_judge_rating(self, prompt: str) -> float:
        """Get rating from judge using structured output with legacy-compatible processing."""
        for retry in range(self.max_retries):
            try:
                result = await self.llm.agenerate(prompt, GroundednessRating)
                rating = result.rating

                # Validate rating is in expected range and convert to 0.0-1.0 scale
                if rating in [0, 1, 2]:
                    return rating / 2.0  # Convert to legacy 0.0-1.0 scale
                else:
                    if retry < self.max_retries - 1:
                        continue  # Retry if invalid rating
                    else:
                        return float("nan")

            except Exception:
                if retry < self.max_retries - 1:
                    continue  # Retry on exception
                else:
                    return float("nan")

        return float("nan")

    def _average_scores(self, score1: float, score2: float) -> float:
        """Average two judge scores, handling NaN values. Matches legacy logic exactly."""
        if score1 >= 0 and score2 >= 0:
            return (score1 + score2) / 2.0
        else:
            # Match legacy behavior: use max() for NaN handling
            return max(score1, score2)
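For reference, the score aggregation above is plain arithmetic: each judge returns 0, 1, or 2, _get_judge_rating divides by 2 to land on the 0.0-1.0 scale, and _average_scores averages the two values, falling back to max() when a judge has exhausted its retries and returned NaN. A standalone sketch of that arithmetic (the helper names here are hypothetical, not part of the library):

import math


def to_unit_scale(rating: int) -> float:
    # Same conversion as _get_judge_rating: 0 -> 0.0, 1 -> 0.5, 2 -> 1.0
    return rating / 2.0


def average_scores(score1: float, score2: float) -> float:
    # Same logic as _average_scores: average when both are valid, else max()
    if score1 >= 0 and score2 >= 0:
        return (score1 + score2) / 2.0
    return max(score1, score2)


# Judge 1 says "fully grounded" (2 -> 1.0), judge 2 says "partially grounded" (1 -> 0.5)
print(average_scores(to_unit_scale(2), to_unit_scale(1)))  # 0.75

# A second judge that failed yields NaN; max() keeps the valid first score here
print(average_scores(to_unit_scale(2), float("nan")))  # 1.0
assert math.isnan(average_scores(float("nan"), float("nan")))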
src/ragas/prompt/metrics/response_groundedness.py

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
"""Response groundedness prompts - V1-identical converted to functions."""


def response_groundedness_judge1_prompt(response: str, context: str) -> str:
    """
    V1-identical response groundedness judge 1 prompt - matches template_groundedness1 exactly.

    Args:
        response: The response/assertion to evaluate for groundedness
        context: The context to evaluate the response against

    Returns:
        V1-identical prompt string for the LLM
    """
    return f"""### Instruction

You are a world class expert designed to evaluate the groundedness of an assertion.
You will be provided with an assertion and a context.
Your task is to determine if the assertion is supported by the context.
Follow the instructions below:
A. If there is no context or no assertion or context is empty or assertion is empty, say 0.
B. If the assertion is not supported by the context, say 0.
C. If the assertion is partially supported by the context, say 1.
D. If the assertion is fully supported by the context, say 2.
You must provide a rating of 0, 1, or 2, nothing else.

### Context:
<{context}>

### Assertion:
<{response}>

Analyzing Context and Response, the Groundedness score is """


def response_groundedness_judge2_prompt(response: str, context: str) -> str:
    """
    V1-identical response groundedness judge 2 prompt - matches template_groundedness2 exactly.

    Args:
        response: The response/assertion to evaluate for groundedness
        context: The context to evaluate the response against

    Returns:
        V1-identical prompt string for the LLM
    """
    return f"""As a specialist in assessing the strength of connections between statements and their given contexts, I will evaluate the level of support an assertion receives from the provided context. Follow these guidelines:

* If the assertion is not supported or context is empty or assertion is empty, assign a score of 0.
* If the assertion is partially supported, assign a score of 1.
* If the assertion is fully supported, assign a score of 2.

I will provide a rating of 0, 1, or 2, without any additional information.

---
**Context:**
[{context}]

**Assertion:**
[{response}]

Do not explain. Based on the provided context and response, the Groundedness score is:"""
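Since both judge prompts are plain f-string builders, the exact text each judge receives can be inspected without calling an LLM. A quick sanity check, reusing the example inputs from the metric docstring (the variable names below are illustrative):

from ragas.prompt.metrics.response_groundedness import (
    response_groundedness_judge1_prompt,
    response_groundedness_judge2_prompt,
)

response = "Einstein was born in Germany in 1879."
context = "Albert Einstein was born in Ulm, Germany on March 14, 1879."

# Judge 1 frames the inputs in <...> under "### Context:" / "### Assertion:" headings
print(response_groundedness_judge1_prompt(response, context))

# Judge 2 frames the inputs in [...] and asks for a bare 0/1/2 rating with no explanation
print(response_groundedness_judge2_prompt(response, context))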
