2 changes: 2 additions & 0 deletions src/ragas/metrics/collections/__init__.py
@@ -15,6 +15,7 @@
from ragas.metrics.collections._factual_correctness import FactualCorrectness
from ragas.metrics.collections._faithfulness import Faithfulness
from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
from ragas.metrics.collections._response_groundedness import ResponseGroundedness
from ragas.metrics.collections._rouge_score import RougeScore
from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
from ragas.metrics.collections._string import (
@@ -44,6 +45,7 @@
"Faithfulness",
"NoiseSensitivity",
"NonLLMStringSimilarity",
"ResponseGroundedness",
"RougeScore",
"SemanticSimilarity",
"StringPresence",
165 changes: 165 additions & 0 deletions src/ragas/metrics/collections/_response_groundedness.py
@@ -0,0 +1,165 @@
"""Response Groundedness metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t
from typing import List

from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.response_groundedness import (
    response_groundedness_judge1_prompt,
    response_groundedness_judge2_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class GroundednessRating(BaseModel):
"""Structured output for groundedness rating."""

rating: int


class ResponseGroundedness(BaseMetric):
"""
Modern v2 implementation of response groundedness evaluation.

Evaluates how well grounded a response is in the retrieved contexts
using a dual-judge system. This metric averages two distinct judge prompts
to ensure robust evaluation.

The metric uses NVIDIA's proven dual-judge approach:
1. Judge 1: Direct groundedness evaluation with structured instructions
2. Judge 2: Alternative perspective for fairness
3. Average both judges for final score

Rating scale: 0 (not grounded), 1 (partially grounded), 2 (fully grounded)
Final score: Average of both judges converted to 0.0-1.0 scale

Usage:
>>> import instructor
>>> from openai import AsyncOpenAI
>>> from ragas.llms.base import llm_factory
>>> from ragas.metrics.collections import ResponseGroundedness
>>>
>>> # Setup dependencies
>>> client = AsyncOpenAI()
>>> llm = llm_factory("gpt-4o", client=client)
>>>
>>> # Create metric instance
>>> metric = ResponseGroundedness(llm=llm)
>>>
>>> # Single evaluation
>>> result = await metric.ascore(
... response="Einstein was born in Germany in 1879.",
... retrieved_contexts=["Albert Einstein was born in Ulm, Germany on March 14, 1879."]
... )
>>> print(f"Response Groundedness: {result.value}")

Attributes:
llm: Modern instructor-based LLM for dual-judge evaluation
name: The metric name
allowed_values: Score range (0.0 to 1.0, higher is better)
max_retries: Maximum retry attempts for invalid ratings
"""

# Type hints for linter (attributes are set in __init__)
llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "response_groundedness",
        max_retries: int = 5,
        **kwargs,
    ):
        """
        Initialize ResponseGroundedness metric with required components.

        Args:
            llm: Modern instructor-based LLM for dual-judge evaluation
            name: The metric name
            max_retries: Maximum retry attempts for invalid ratings
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.max_retries = max_retries

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, response: str, retrieved_contexts: List[str]
    ) -> MetricResult:
        """
        Calculate response groundedness score using dual-judge evaluation.

        Args:
            response: The response to evaluate for groundedness
            retrieved_contexts: The retrieved contexts to check groundedness against

        Returns:
            MetricResult with response groundedness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Handle edge cases like legacy
        context_str = "\n".join(retrieved_contexts)

        if not response.strip() or not context_str.strip():
            return MetricResult(value=0.0)

        # Get ratings from both judges (already on 0.0-1.0 scale from legacy parsing)
        judge1_rating = await self._get_judge_rating(
            response_groundedness_judge1_prompt(response, context_str)
        )
        judge2_rating = await self._get_judge_rating(
            response_groundedness_judge2_prompt(response, context_str)
        )

        # Average the scores (already on 0.0-1.0 scale like legacy)
        score = self._average_scores(judge1_rating, judge2_rating)

        return MetricResult(value=float(score))

    async def _get_judge_rating(self, prompt: str) -> float:
        """Get rating from judge using structured output with legacy-compatible processing."""
        for retry in range(self.max_retries):
            try:
                result = await self.llm.agenerate(prompt, GroundednessRating)
                rating = result.rating

                # Validate rating is in expected range and convert to 0.0-1.0 scale
                if rating in [0, 1, 2]:
                    return rating / 2.0  # Convert to legacy 0.0-1.0 scale
                else:
                    if retry < self.max_retries - 1:
                        continue  # Retry if invalid rating
                    else:
                        return float("nan")

            except Exception:
                if retry < self.max_retries - 1:
                    continue  # Retry on exception
                else:
                    return float("nan")

        return float("nan")

    def _average_scores(self, score1: float, score2: float) -> float:
        """Average two judge scores, handling NaN values. Matches legacy logic exactly."""
        if score1 >= 0 and score2 >= 0:
            return (score1 + score2) / 2.0
        else:
            # Match legacy behavior: use max() for NaN handling
            return max(score1, score2)
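For reference, a minimal standalone sketch of the scoring arithmetic used above, assuming the 0/1/2 judge ratings and the legacy-style NaN fallback described in the class docstring; the helper names here are illustrative only:

def to_unit_scale(rating: int) -> float:
    # A judge rating of 0/1/2 maps to 0.0/0.5/1.0, mirroring rating / 2.0 above
    return rating / 2.0

def combine(score1: float, score2: float) -> float:
    # Average when both judges produced valid scores; otherwise fall back to max(),
    # which is how the legacy NaN handling behaves
    if score1 >= 0 and score2 >= 0:
        return (score1 + score2) / 2.0
    return max(score1, score2)

print(combine(to_unit_scale(2), to_unit_scale(1)))  # 0.75
print(combine(to_unit_scale(1), float("nan")))      # 0.5 -- max() keeps the valid judge's score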
62 changes: 62 additions & 0 deletions src/ragas/prompt/metrics/response_groundedness.py
@@ -0,0 +1,62 @@
"""Response groundedness prompts - V1-identical converted to functions."""


def response_groundedness_judge1_prompt(response: str, context: str) -> str:
"""
V1-identical response groundedness judge 1 prompt - matches template_groundedness1 exactly.

Args:
response: The response/assertion to evaluate for groundedness
context: The context to evaluate the response against

Returns:
V1-identical prompt string for the LLM
"""
return f"""### Instruction

You are a world class expert designed to evaluate the groundedness of an assertion.
You will be provided with an assertion and a context.
Your task is to determine if the assertion is supported by the context.
Follow the instructions below:
A. If there is no context or no assertion or context is empty or assertion is empty, say 0.
B. If the assertion is not supported by the context, say 0.
C. If the assertion is partially supported by the context, say 1.
D. If the assertion is fully supported by the context, say 2.
You must provide a rating of 0, 1, or 2, nothing else.

### Context:
<{context}>

### Assertion:
<{response}>

Analyzing Context and Response, the Groundedness score is """


def response_groundedness_judge2_prompt(response: str, context: str) -> str:
"""
V1-identical response groundedness judge 2 prompt - matches template_groundedness2 exactly.

Args:
response: The response/assertion to evaluate for groundedness
context: The context to evaluate the response against

Returns:
V1-identical prompt string for the LLM
"""
return f"""As a specialist in assessing the strength of connections between statements and their given contexts, I will evaluate the level of support an assertion receives from the provided context. Follow these guidelines:

* If the assertion is not supported or context is empty or assertion is empty, assign a score of 0.
* If the assertion is partially supported, assign a score of 1.
* If the assertion is fully supported, assign a score of 2.

I will provide a rating of 0, 1, or 2, without any additional information.

---
**Context:**
[{context}]

**Assertion:**
[{response}]

Do not explain. Based on the provided context and response, the Groundedness score is:"""
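As a quick illustration, the two prompt builders can be exercised on their own; the sample strings below are illustrative:

from ragas.prompt.metrics.response_groundedness import (
    response_groundedness_judge1_prompt,
    response_groundedness_judge2_prompt,
)

response = "Einstein was born in Germany in 1879."
context = "Albert Einstein was born in Ulm, Germany on March 14, 1879."

# Each builder returns a plain prompt string; the metric sends both to the LLM
# and expects a single 0/1/2 rating back as structured output.
prompt_1 = response_groundedness_judge1_prompt(response, context)
prompt_2 = response_groundedness_judge2_prompt(response, context)

print(prompt_1.splitlines()[0])        # "### Instruction"
print(prompt_2.splitlines()[0][:40])   # start of the judge 2 instructions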