diff --git a/src/ragas/metrics/collections/__init__.py b/src/ragas/metrics/collections/__init__.py
index 1885d8da8..1c0274c7b 100644
--- a/src/ragas/metrics/collections/__init__.py
+++ b/src/ragas/metrics/collections/__init__.py
@@ -15,6 +15,7 @@
 from ragas.metrics.collections._factual_correctness import FactualCorrectness
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
+from ragas.metrics.collections._response_groundedness import ResponseGroundedness
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
 from ragas.metrics.collections._string import (
@@ -44,6 +45,7 @@
     "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
+    "ResponseGroundedness",
     "RougeScore",
     "SemanticSimilarity",
     "StringPresence",
diff --git a/src/ragas/metrics/collections/_response_groundedness.py b/src/ragas/metrics/collections/_response_groundedness.py
new file mode 100644
index 000000000..3cfb8e41b
--- /dev/null
+++ b/src/ragas/metrics/collections/_response_groundedness.py
@@ -0,0 +1,165 @@
+"""Response Groundedness metric v2 - Modern implementation with dual-judge evaluation."""
+
+import typing as t
+from typing import List
+
+from pydantic import BaseModel
+
+from ragas.metrics.collections.base import BaseMetric
+from ragas.metrics.result import MetricResult
+from ragas.prompt.metrics.response_groundedness import (
+    response_groundedness_judge1_prompt,
+    response_groundedness_judge2_prompt,
+)
+
+if t.TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+
+
+class GroundednessRating(BaseModel):
+    """Structured output for groundedness rating."""
+
+    rating: int
+
+
+class ResponseGroundedness(BaseMetric):
+    """
+    Modern v2 implementation of response groundedness evaluation.
+
+    Evaluates how well grounded a response is in the retrieved contexts
+    using a dual-judge system: the scores from two distinct judge prompts
+    are averaged to ensure a robust evaluation.
+
+    The metric uses NVIDIA's proven dual-judge approach:
+    1. Judge 1: Direct groundedness evaluation with structured instructions
+    2. Judge 2: Alternative perspective for fairness
+    3. Average both judges for the final score
+
+    Rating scale: 0 (not grounded), 1 (partially grounded), 2 (fully grounded)
+    Final score: Average of both judges converted to the 0.0-1.0 scale
+
+    Usage:
+        >>> import instructor
+        >>> from openai import AsyncOpenAI
+        >>> from ragas.llms.base import llm_factory
+        >>> from ragas.metrics.collections import ResponseGroundedness
+        >>>
+        >>> # Setup dependencies
+        >>> client = AsyncOpenAI()
+        >>> llm = llm_factory("gpt-4o", client=client)
+        >>>
+        >>> # Create metric instance
+        >>> metric = ResponseGroundedness(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     response="Einstein was born in Germany in 1879.",
+        ...     retrieved_contexts=["Albert Einstein was born in Ulm, Germany on March 14, 1879."]
+        ... )
+        >>> print(f"Response Groundedness: {result.value}")
+
+    Attributes:
+        llm: Modern instructor-based LLM for dual-judge evaluation
+        name: The metric name
+        allowed_values: Score range (0.0 to 1.0, higher is better)
+        max_retries: Maximum retry attempts for invalid ratings
+    """
+
+    # Type hints for linter (attributes are set in __init__)
+    llm: "InstructorBaseRagasLLM"
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        name: str = "response_groundedness",
+        max_retries: int = 5,
+        **kwargs,
+    ):
+        """
+        Initialize ResponseGroundedness metric with required components.
+
+        Args:
+            llm: Modern instructor-based LLM for dual-judge evaluation
+            name: The metric name
+            max_retries: Maximum retry attempts for invalid ratings
+        """
+        # Set attributes explicitly before calling super()
+        self.llm = llm
+        self.max_retries = max_retries
+
+        # Call super() for validation (without passing llm in kwargs)
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self, response: str, retrieved_contexts: List[str]
+    ) -> MetricResult:
+        """
+        Calculate response groundedness score using dual-judge evaluation.
+
+        Args:
+            response: The response to evaluate for groundedness
+            retrieved_contexts: The retrieved contexts to check groundedness against
+
+        Returns:
+            MetricResult with response groundedness score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not response:
+            raise ValueError(
+                "response is missing. Please add response to the test sample."
+            )
+        if not retrieved_contexts:
+            raise ValueError(
+                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
+            )
+
+        # Handle edge cases like legacy
+        context_str = "\n".join(retrieved_contexts)
+
+        if not response.strip() or not context_str.strip():
+            return MetricResult(value=0.0)
+
+        # Get ratings from both judges (already on 0.0-1.0 scale from legacy parsing)
+        judge1_rating = await self._get_judge_rating(
+            response_groundedness_judge1_prompt(response, context_str)
+        )
+        judge2_rating = await self._get_judge_rating(
+            response_groundedness_judge2_prompt(response, context_str)
+        )
+
+        # Average the scores (already on 0.0-1.0 scale like legacy)
+        score = self._average_scores(judge1_rating, judge2_rating)
+
+        return MetricResult(value=float(score))
+
+    async def _get_judge_rating(self, prompt: str) -> float:
+        """Get rating from judge using structured output with legacy-compatible processing."""
+        for retry in range(self.max_retries):
+            try:
+                result = await self.llm.agenerate(prompt, GroundednessRating)
+                rating = result.rating
+
+                # Validate rating is in expected range and convert to 0.0-1.0 scale
+                if rating in [0, 1, 2]:
+                    return rating / 2.0  # Convert to legacy 0.0-1.0 scale
+                else:
+                    if retry < self.max_retries - 1:
+                        continue  # Retry if invalid rating
+                    else:
+                        return float("nan")
+
+            except Exception:
+                if retry < self.max_retries - 1:
+                    continue  # Retry on exception
+                else:
+                    return float("nan")
+
+        return float("nan")
+
+    def _average_scores(self, score1: float, score2: float) -> float:
+        """Average two judge scores, handling NaN values. Matches legacy logic exactly."""
+        if score1 >= 0 and score2 >= 0:
+            return (score1 + score2) / 2.0
+        else:
+            # Match legacy behavior: use max() for NaN handling
+            return max(score1, score2)
diff --git a/src/ragas/prompt/metrics/response_groundedness.py b/src/ragas/prompt/metrics/response_groundedness.py
new file mode 100644
index 000000000..8ae7b7ff7
--- /dev/null
+++ b/src/ragas/prompt/metrics/response_groundedness.py
@@ -0,0 +1,62 @@
+"""Response groundedness prompts - V1-identical converted to functions."""
+
+
+def response_groundedness_judge1_prompt(response: str, context: str) -> str:
+    """
+    V1-identical response groundedness judge 1 prompt - matches template_groundedness1 exactly.
+
+    Args:
+        response: The response/assertion to evaluate for groundedness
+        context: The context to evaluate the response against
+
+    Returns:
+        V1-identical prompt string for the LLM
+    """
+    return f"""### Instruction
+
+You are a world class expert designed to evaluate the groundedness of an assertion.
+You will be provided with an assertion and a context.
+Your task is to determine if the assertion is supported by the context.
+Follow the instructions below:
+A. If there is no context or no assertion or context is empty or assertion is empty, say 0.
+B. If the assertion is not supported by the context, say 0.
+C. If the assertion is partially supported by the context, say 1.
+D. If the assertion is fully supported by the context, say 2.
+You must provide a rating of 0, 1, or 2, nothing else.
+
+### Context:
+<{context}>
+
+### Assertion:
+<{response}>
+
+Analyzing Context and Response, the Groundedness score is """
+
+
+def response_groundedness_judge2_prompt(response: str, context: str) -> str:
+    """
+    V1-identical response groundedness judge 2 prompt - matches template_groundedness2 exactly.
+
+    Args:
+        response: The response/assertion to evaluate for groundedness
+        context: The context to evaluate the response against
+
+    Returns:
+        V1-identical prompt string for the LLM
+    """
+    return f"""As a specialist in assessing the strength of connections between statements and their given contexts, I will evaluate the level of support an assertion receives from the provided context. Follow these guidelines:
+
+* If the assertion is not supported or context is empty or assertion is empty, assign a score of 0.
+* If the assertion is partially supported, assign a score of 1.
+* If the assertion is fully supported, assign a score of 2.
+
+I will provide a rating of 0, 1, or 2, without any additional information.
+
+---
+**Context:**
+[{context}]
+
+**Assertion:**
+[{response}]
+
+Do not explain. Based on the provided context and response, the Groundedness score is:"""
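Note on score combination (commentary, not part of the patch): the metric halves each judge's 0/1/2 rating onto the 0.0-1.0 scale, averages the two scores, and falls back to max() when a judge produced NaN (a failed or invalid call), exactly as in _get_judge_rating and _average_scores above. A minimal standalone sketch of that arithmetic; the helper name combine is chosen only for illustration:

    import math

    def combine(judge1_rating, judge2_rating):
        """Mirror the patch's rating-to-score conversion and averaging."""
        # A None rating stands in for a failed/invalid judge call (NaN in the patch).
        s1 = judge1_rating / 2.0 if judge1_rating is not None else float("nan")
        s2 = judge2_rating / 2.0 if judge2_rating is not None else float("nan")
        if s1 >= 0 and s2 >= 0:
            return (s1 + s2) / 2.0
        # Legacy NaN handling via max(): a NaN from judge 1 propagates, while a
        # NaN from judge 2 is masked by a valid judge 1 score.
        return max(s1, s2)

    print(combine(2, 1))                    # 0.75: fully grounded (1.0) averaged with partial (0.5)
    print(combine(2, None))                 # 1.0: judge 2 failed, judge 1's score is kept
    print(math.isnan(combine(None, None)))  # True: both judges failed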
diff --git a/tests/e2e/metrics_migration/test_response_groundedness_migration.py b/tests/e2e/metrics_migration/test_response_groundedness_migration.py
new file mode 100644
index 000000000..d37e5a96e
--- /dev/null
+++ b/tests/e2e/metrics_migration/test_response_groundedness_migration.py
@@ -0,0 +1,224 @@
+"""E2E tests for ResponseGroundedness metric migration from v1 to v2."""
+
+import numpy as np
+import pytest
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics._nv_metrics import ResponseGroundedness as LegacyResponseGroundedness
+from ragas.metrics.collections import ResponseGroundedness
+
+
+class TestResponseGroundednessE2EMigration:
+    """E2E test compatibility between legacy ResponseGroundedness and new V2 ResponseGroundedness with modern components."""
+
+    @pytest.fixture
+    def sample_data(self):
+        """Real-world test cases for response groundedness evaluation."""
+        return [
+            {
+                "response": "Einstein was born in Germany on March 14, 1879.",
+                "retrieved_contexts": [
+                    "Albert Einstein was born in Ulm, Germany on March 14, 1879."
+                ],
+                "description": "High groundedness - response fully supported by context",
+            },
+            {
+                "response": "Einstein was born in France on March 14, 1879.",
+                "retrieved_contexts": [
+                    "Albert Einstein was born in Ulm, Germany on March 14, 1879."
+                ],
+                "description": "Low groundedness - wrong country not supported by context",
+            },
+            {
+                "response": "Einstein was a physicist.",
+                "retrieved_contexts": [
+                    "Albert Einstein was a German-born theoretical physicist, widely held to be one of the greatest scientists of all time."
+                ],
+                "description": "High groundedness - response supported by context",
+            },
+            {
+                "response": "The capital of France is Paris, and it has a population of over 2 million.",
+                "retrieved_contexts": [
+                    "Paris is the capital and most populous city of France."
+                ],
+                "description": "Partial groundedness - capital correct, population not mentioned",
+            },
+            {
+                "response": "Photosynthesis is the process by which plants convert sunlight into energy.",
+                "retrieved_contexts": [
+                    "Photosynthesis is a biological process where plants use sunlight to create glucose and oxygen."
+                ],
+                "description": "High groundedness - core concept supported",
+            },
+        ]
+
+    @pytest.fixture
+    def test_llm(self):
+        """Create a LangChain LLM for legacy response groundedness evaluation."""
+        try:
+            from langchain_openai import ChatOpenAI
+
+            from ragas.llms import LangchainLLMWrapper
+
+            langchain_llm = ChatOpenAI(model="gpt-4o", temperature=0.01)
+            return LangchainLLMWrapper(langchain_llm)
+        except ImportError as e:
+            pytest.skip(f"LangChain LLM not available: {e}")
+        except Exception as e:
+            pytest.skip(f"Could not create LangChain LLM (API key may be missing): {e}")
+
+    @pytest.fixture
+    def test_modern_llm(self):
+        """Create a modern instructor LLM for v2 implementation."""
+        try:
+            import openai
+
+            from ragas.llms.base import llm_factory
+
+            client = openai.AsyncOpenAI()
+            # Use legacy temperature (0.1) for perfect compatibility
+            return llm_factory("gpt-4o", client=client, temperature=0.1)
+        except ImportError as e:
+            pytest.skip(f"LLM factory not available: {e}")
+        except Exception as e:
+            pytest.skip(f"Could not create modern LLM (API key may be missing): {e}")
+
+    @pytest.mark.asyncio
+    async def test_legacy_response_groundedness_vs_v2_response_groundedness_e2e_compatibility(
+        self, sample_data, test_llm, test_modern_llm
+    ):
+        """E2E test that legacy and v2 implementations produce similar scores."""
+
+        if test_llm is None or test_modern_llm is None:
+            pytest.skip("LLM required for E2E testing")
+
+        for i, data in enumerate(sample_data):
+            print(
+                f"\n🧪 Testing ResponseGroundedness - Case {i + 1}: {data['description']}"
+            )
+            print(f"   Response: {data['response'][:80]}...")
+            print(f"   Contexts: {len(data['retrieved_contexts'])} context(s)")
+
+            # Legacy implementation
+            legacy_groundedness = LegacyResponseGroundedness(llm=test_llm)
+            legacy_sample = SingleTurnSample(
+                response=data["response"],
+                retrieved_contexts=data["retrieved_contexts"],
+            )
+            legacy_score = await legacy_groundedness._single_turn_ascore(
+                legacy_sample, None
+            )
+
+            # V2 implementation
+            v2_groundedness = ResponseGroundedness(llm=test_modern_llm)
+            v2_result = await v2_groundedness.ascore(
+                response=data["response"],
+                retrieved_contexts=data["retrieved_contexts"],
+            )
+
+            score_diff = abs(legacy_score - v2_result.value)
+            print(f"   Legacy: {legacy_score:.6f}")
+            print(f"   V2:     {v2_result.value:.6f}")
+            print(f"   Diff:   {score_diff:.6f}")
+
+            # Ensure implementations give reasonably similar scores
+            # Response groundedness uses dual-judge system with some variation expected
+            assert score_diff < 0.3, (
+                f"Legacy and V2 scores should be similar: Legacy={legacy_score:.6f}, "
+                f"V2={v2_result.value:.6f}, Diff={score_diff:.6f} (tolerance: 0.3)"
+            )
+            print("   ✅ Both implementations give consistent scores")
+
+            # Validate score ranges (both should be 0-1 or NaN)
+            if not np.isnan(legacy_score):
+                assert 0.0 <= legacy_score <= 1.0
+            if not np.isnan(v2_result.value):
+                assert 0.0 <= v2_result.value <= 1.0
+
+    @pytest.mark.asyncio
+    async def test_response_groundedness_edge_cases(self, test_modern_llm):
+        """Test edge cases like empty responses and contexts."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for edge case testing")
+
+        metric = ResponseGroundedness(llm=test_modern_llm)
+
+        # Test empty response
+        with pytest.raises(ValueError, match="response is missing"):
+            await metric.ascore(
+                response="",
+                retrieved_contexts=["Some context about Einstein."],
+            )
+
+        # Test empty contexts
+        with pytest.raises(ValueError, match="retrieved_contexts is missing"):
+            await metric.ascore(
+                response="Einstein was a physicist.",
+                retrieved_contexts=[],
+            )
+
+    @pytest.mark.asyncio
+    async def test_response_groundedness_scoring_behavior(self, test_modern_llm):
+        """Test that response groundedness produces expected score patterns."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for scoring testing")
+
+        metric = ResponseGroundedness(llm=test_modern_llm)
+
+        # High groundedness case
+        high_result = await metric.ascore(
+            response="The capital of France is Paris.",
+            retrieved_contexts=["Paris is the capital and largest city of France."],
+        )
+
+        # Low groundedness case
+        low_result = await metric.ascore(
+            response="The capital of France is London.",
+            retrieved_contexts=["Paris is the capital and largest city of France."],
+        )
+
+        print(f"High groundedness score: {high_result.value:.3f}")
+        print(f"Low groundedness score: {low_result.value:.3f}")
+
+        # Validate ranges
+        assert 0.0 <= high_result.value <= 1.0
+        assert 0.0 <= low_result.value <= 1.0
+
+        # High groundedness should typically score higher than low groundedness
+        # (though exact scores depend on judge behavior)
+
+    @pytest.mark.asyncio
+    async def test_response_groundedness_dual_judge_system(self, test_modern_llm):
+        """Test that the dual-judge system is working with different contexts."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for dual-judge testing")
+
+        metric = ResponseGroundedness(llm=test_modern_llm)
+
+        # Test with multiple contexts that provide different levels of support
+        result = await metric.ascore(
+            response="Einstein developed the theory of relativity and won a Nobel Prize.",
+            retrieved_contexts=[
+                "Albert Einstein developed the theory of relativity.",
+                "Einstein won the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect.",
+            ],
+        )
+
+        print(f"Multi-context groundedness score: {result.value:.3f}")
+
+        # Should be well-grounded since both parts are supported
+        assert 0.0 <= result.value <= 1.0
+
+    def test_response_groundedness_migration_requirements_documented(self):
+        """Test that migration requirements are properly documented."""
+
+        # V2 implementation should not accept legacy components
+        with pytest.raises((TypeError, ValueError, AttributeError)):
+            ResponseGroundedness(llm="invalid_llm_type")  # type: ignore[arg-type]  # Should reject string
+
+        # V2 should only accept InstructorBaseRagasLLM
+        with pytest.raises((TypeError, ValueError, AttributeError)):
+            ResponseGroundedness(llm=None)  # type: ignore[arg-type]  # Should reject None
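For reference, a minimal end-to-end usage sketch that mirrors the class docstring and the e2e tests above. It assumes the patch is applied and that OPENAI_API_KEY is set in the environment:

    import asyncio

    import openai

    from ragas.llms.base import llm_factory
    from ragas.metrics.collections import ResponseGroundedness


    async def main():
        client = openai.AsyncOpenAI()
        llm = llm_factory("gpt-4o", client=client)
        metric = ResponseGroundedness(llm=llm)

        result = await metric.ascore(
            response="Einstein was born in Germany in 1879.",
            retrieved_contexts=[
                "Albert Einstein was born in Ulm, Germany on March 14, 1879."
            ],
        )
        # Dual-judge average on the 0.0-1.0 scale; higher means better grounded.
        print(f"Response Groundedness: {result.value}")


    asyncio.run(main())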