diff --git a/src/ragas/metrics/collections/__init__.py b/src/ragas/metrics/collections/__init__.py
index 1885d8da8..1c0274c7b 100644
--- a/src/ragas/metrics/collections/__init__.py
+++ b/src/ragas/metrics/collections/__init__.py
@@ -15,6 +15,7 @@
 from ragas.metrics.collections._factual_correctness import FactualCorrectness
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
+from ragas.metrics.collections._response_groundedness import ResponseGroundedness
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
 from ragas.metrics.collections._string import (
@@ -44,6 +45,7 @@
     "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
+    "ResponseGroundedness",
     "RougeScore",
     "SemanticSimilarity",
     "StringPresence",
diff --git a/src/ragas/metrics/collections/_response_groundedness.py b/src/ragas/metrics/collections/_response_groundedness.py
new file mode 100644
index 000000000..3cfb8e41b
--- /dev/null
+++ b/src/ragas/metrics/collections/_response_groundedness.py
@@ -0,0 +1,165 @@
+"""Response Groundedness metric v2 - Modern implementation with dual-judge evaluation."""
+
+import typing as t
+from typing import List
+
+from pydantic import BaseModel
+
+from ragas.metrics.collections.base import BaseMetric
+from ragas.metrics.result import MetricResult
+from ragas.prompt.metrics.response_groundedness import (
+    response_groundedness_judge1_prompt,
+    response_groundedness_judge2_prompt,
+)
+
+if t.TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+
+
+class GroundednessRating(BaseModel):
+    """Structured output for groundedness rating."""
+
+    rating: int
+
+
+class ResponseGroundedness(BaseMetric):
+    """
+    Modern v2 implementation of response groundedness evaluation.
+
+    Evaluates how well grounded a response is in the retrieved contexts
+    using a dual-judge system: the scores from two distinct judge prompts
+    are averaged to ensure a robust evaluation.
+
+    The metric uses NVIDIA's proven dual-judge approach:
+    1. Judge 1: Direct groundedness evaluation with structured instructions
+    2. Judge 2: Alternative perspective for fairness
+    3. Average both judges for the final score
+
+    Rating scale: 0 (not grounded), 1 (partially grounded), 2 (fully grounded)
+    Final score: Average of both judges converted to the 0.0-1.0 scale
+
+    Usage:
+        >>> import instructor
+        >>> from openai import AsyncOpenAI
+        >>> from ragas.llms.base import llm_factory
+        >>> from ragas.metrics.collections import ResponseGroundedness
+        >>>
+        >>> # Setup dependencies
+        >>> client = AsyncOpenAI()
+        >>> llm = llm_factory("gpt-4o", client=client)
+        >>>
+        >>> # Create metric instance
+        >>> metric = ResponseGroundedness(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     response="Einstein was born in Germany in 1879.",
+        ...     retrieved_contexts=["Albert Einstein was born in Ulm, Germany on March 14, 1879."]
+        ... )
+        >>> print(f"Response Groundedness: {result.value}")
+
+    Attributes:
+        llm: Modern instructor-based LLM for dual-judge evaluation
+        name: The metric name
+        allowed_values: Score range (0.0 to 1.0, higher is better)
+        max_retries: Maximum retry attempts for invalid ratings
+    """
+
+    # Type hints for linter (attributes are set in __init__)
+    llm: "InstructorBaseRagasLLM"
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        name: str = "response_groundedness",
+        max_retries: int = 5,
+        **kwargs,
+    ):
+        """
+        Initialize ResponseGroundedness metric with required components.
+
+        Args:
+            llm: Modern instructor-based LLM for dual-judge evaluation
+            name: The metric name
+            max_retries: Maximum retry attempts for invalid ratings
+        """
+        # Set attributes explicitly before calling super()
+        self.llm = llm
+        self.max_retries = max_retries
+
+        # Call super() for validation (without passing llm in kwargs)
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self, response: str, retrieved_contexts: List[str]
+    ) -> MetricResult:
+        """
+        Calculate response groundedness score using dual-judge evaluation.
+
+        Args:
+            response: The response to evaluate for groundedness
+            retrieved_contexts: The retrieved contexts to check groundedness against
+
+        Returns:
+            MetricResult with response groundedness score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not response:
+            raise ValueError(
+                "response is missing. Please add response to the test sample."
+            )
+        if not retrieved_contexts:
+            raise ValueError(
+                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
+            )
+
+        # Handle edge cases like legacy
+        context_str = "\n".join(retrieved_contexts)
+
+        if not response.strip() or not context_str.strip():
+            return MetricResult(value=0.0)
+
+        # Get ratings from both judges (already on 0.0-1.0 scale from legacy parsing)
+        judge1_rating = await self._get_judge_rating(
+            response_groundedness_judge1_prompt(response, context_str)
+        )
+        judge2_rating = await self._get_judge_rating(
+            response_groundedness_judge2_prompt(response, context_str)
+        )
+
+        # Average the scores (already on 0.0-1.0 scale like legacy)
+        score = self._average_scores(judge1_rating, judge2_rating)
+
+        return MetricResult(value=float(score))
+
+    async def _get_judge_rating(self, prompt: str) -> float:
+        """Get rating from judge using structured output with legacy-compatible processing."""
+        for retry in range(self.max_retries):
+            try:
+                result = await self.llm.agenerate(prompt, GroundednessRating)
+                rating = result.rating
+
+                # Validate rating is in expected range and convert to 0.0-1.0 scale
+                if rating in [0, 1, 2]:
+                    return rating / 2.0  # Convert to legacy 0.0-1.0 scale
+                else:
+                    if retry < self.max_retries - 1:
+                        continue  # Retry if invalid rating
+                    else:
+                        return float("nan")
+
+            except Exception:
+                if retry < self.max_retries - 1:
+                    continue  # Retry on exception
+                else:
+                    return float("nan")
+
+        return float("nan")
+
+    def _average_scores(self, score1: float, score2: float) -> float:
+        """Average two judge scores, handling NaN values. Matches legacy logic exactly."""
+        if score1 >= 0 and score2 >= 0:
+            return (score1 + score2) / 2.0
+        else:
+            # Match legacy behavior: use max() for NaN handling
+            return max(score1, score2)
diff --git a/src/ragas/prompt/metrics/response_groundedness.py b/src/ragas/prompt/metrics/response_groundedness.py
new file mode 100644
index 000000000..8ae7b7ff7
--- /dev/null
+++ b/src/ragas/prompt/metrics/response_groundedness.py
@@ -0,0 +1,62 @@
+"""Response groundedness prompts - V1-identical converted to functions."""
+
+
+def response_groundedness_judge1_prompt(response: str, context: str) -> str:
+    """
+    V1-identical response groundedness judge 1 prompt - matches template_groundedness1 exactly.
+
+    Args:
+        response: The response/assertion to evaluate for groundedness
+        context: The context to evaluate the response against
+
+    Returns:
+        V1-identical prompt string for the LLM
+    """
+    return f"""### Instruction
+
+You are a world class expert designed to evaluate the groundedness of an assertion.
+You will be provided with an assertion and a context.
+Your task is to determine if the assertion is supported by the context.
+Follow the instructions below:
+A. If there is no context or no assertion or context is empty or assertion is empty, say 0.
+B. If the assertion is not supported by the context, say 0.
+C. If the assertion is partially supported by the context, say 1.
+D. If the assertion is fully supported by the context, say 2.
+You must provide a rating of 0, 1, or 2, nothing else.
+
+### Context:
+<{context}>
+
+### Assertion:
+<{response}>
+
+Analyzing Context and Response, the Groundedness score is """
+
+
+def response_groundedness_judge2_prompt(response: str, context: str) -> str:
+    """
+    V1-identical response groundedness judge 2 prompt - matches template_groundedness2 exactly.
+
+    Args:
+        response: The response/assertion to evaluate for groundedness
+        context: The context to evaluate the response against
+
+    Returns:
+        V1-identical prompt string for the LLM
+    """
+    return f"""As a specialist in assessing the strength of connections between statements and their given contexts, I will evaluate the level of support an assertion receives from the provided context. Follow these guidelines:
+
+* If the assertion is not supported or context is empty or assertion is empty, assign a score of 0.
+* If the assertion is partially supported, assign a score of 1.
+* If the assertion is fully supported, assign a score of 2.
+
+I will provide a rating of 0, 1, or 2, without any additional information.
+
+---
+**Context:**
+[{context}]
+
+**Assertion:**
+[{response}]
+
+Do not explain. Based on the provided context and response, the Groundedness score is:"""
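Note on score combination (commentary, not part of the patch): the metric halves each judge's 0/1/2 rating onto the 0.0-1.0 scale, averages the two scores, and falls back to max() when a judge produced NaN (a failed or invalid call), exactly as in _get_judge_rating and _average_scores above. A minimal standalone sketch of that arithmetic; the helper name combine is chosen only for illustration:

    import math

    def combine(judge1_rating, judge2_rating):
        """Mirror the patch's rating-to-score conversion and averaging."""
        # A None rating stands in for a failed/invalid judge call (NaN in the patch).
        s1 = judge1_rating / 2.0 if judge1_rating is not None else float("nan")
        s2 = judge2_rating / 2.0 if judge2_rating is not None else float("nan")
        if s1 >= 0 and s2 >= 0:
            return (s1 + s2) / 2.0
        # Legacy NaN handling via max(): a NaN from judge 1 propagates, while a
        # NaN from judge 2 is masked by a valid judge 1 score.
        return max(s1, s2)

    print(combine(2, 1))                    # 0.75: fully grounded (1.0) averaged with partial (0.5)
    print(combine(2, None))                 # 1.0: judge 2 failed, judge 1's score is kept
    print(math.isnan(combine(None, None)))  # True: both judges failed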
diff --git a/tests/e2e/metrics_migration/test_response_groundedness_migration.py b/tests/e2e/metrics_migration/test_response_groundedness_migration.py
new file mode 100644
index 000000000..d37e5a96e
--- /dev/null
+++ b/tests/e2e/metrics_migration/test_response_groundedness_migration.py
@@ -0,0 +1,224 @@
+"""E2E tests for ResponseGroundedness metric migration from v1 to v2."""
+
+import numpy as np
+import pytest
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics._nv_metrics import ResponseGroundedness as LegacyResponseGroundedness
+from ragas.metrics.collections import ResponseGroundedness
+
+
+class TestResponseGroundednessE2EMigration:
+    """E2E test compatibility between legacy ResponseGroundedness and new V2 ResponseGroundedness with modern components."""
+
+    @pytest.fixture
+    def sample_data(self):
+        """Real-world test cases for response groundedness evaluation."""
+        return [
+            {
+                "response": "Einstein was born in Germany on March 14, 1879.",
+                "retrieved_contexts": [
+                    "Albert Einstein was born in Ulm, Germany on March 14, 1879."
+                ],
+                "description": "High groundedness - response fully supported by context",
+            },
+            {
+                "response": "Einstein was born in France on March 14, 1879.",
+                "retrieved_contexts": [
+                    "Albert Einstein was born in Ulm, Germany on March 14, 1879."
+                ],
+                "description": "Low groundedness - wrong country not supported by context",
+            },
+            {
+                "response": "Einstein was a physicist.",
+                "retrieved_contexts": [
+                    "Albert Einstein was a German-born theoretical physicist, widely held to be one of the greatest scientists of all time."
+                ],
+                "description": "High groundedness - response supported by context",
+            },
+            {
+                "response": "The capital of France is Paris, and it has a population of over 2 million.",
+                "retrieved_contexts": [
+                    "Paris is the capital and most populous city of France."
+                ],
+                "description": "Partial groundedness - capital correct, population not mentioned",
+            },
+            {
+                "response": "Photosynthesis is the process by which plants convert sunlight into energy.",
+                "retrieved_contexts": [
+                    "Photosynthesis is a biological process where plants use sunlight to create glucose and oxygen."
+                ],
+                "description": "High groundedness - core concept supported",
+            },
+        ]
+
+    @pytest.fixture
+    def test_llm(self):
+        """Create a LangChain LLM for legacy response groundedness evaluation."""
+        try:
+            from langchain_openai import ChatOpenAI
+
+            from ragas.llms import LangchainLLMWrapper
+
+            langchain_llm = ChatOpenAI(model="gpt-4o", temperature=0.01)
+            return LangchainLLMWrapper(langchain_llm)
+        except ImportError as e:
+            pytest.skip(f"LangChain LLM not available: {e}")
+        except Exception as e:
+            pytest.skip(f"Could not create LangChain LLM (API key may be missing): {e}")
+
+    @pytest.fixture
+    def test_modern_llm(self):
+        """Create a modern instructor LLM for v2 implementation."""
+        try:
+            import openai
+
+            from ragas.llms.base import llm_factory
+
+            client = openai.AsyncOpenAI()
+            # Use legacy temperature (0.1) for perfect compatibility
+            return llm_factory("gpt-4o", client=client, temperature=0.1)
+        except ImportError as e:
+            pytest.skip(f"LLM factory not available: {e}")
+        except Exception as e:
+            pytest.skip(f"Could not create modern LLM (API key may be missing): {e}")
+
+    @pytest.mark.asyncio
+    async def test_legacy_response_groundedness_vs_v2_response_groundedness_e2e_compatibility(
+        self, sample_data, test_llm, test_modern_llm
+    ):
+        """E2E test that legacy and v2 implementations produce similar scores."""
+
+        if test_llm is None or test_modern_llm is None:
+            pytest.skip("LLM required for E2E testing")
+
+        for i, data in enumerate(sample_data):
+            print(
+                f"\n🧪 Testing ResponseGroundedness - Case {i + 1}: {data['description']}"
+            )
+            print(f"   Response: {data['response'][:80]}...")
+            print(f"   Contexts: {len(data['retrieved_contexts'])} context(s)")
+
+            # Legacy implementation
+            legacy_groundedness = LegacyResponseGroundedness(llm=test_llm)
+            legacy_sample = SingleTurnSample(
+                response=data["response"],
+                retrieved_contexts=data["retrieved_contexts"],
+            )
+            legacy_score = await legacy_groundedness._single_turn_ascore(
+                legacy_sample, None
+            )
+
+            # V2 implementation
+            v2_groundedness = ResponseGroundedness(llm=test_modern_llm)
+            v2_result = await v2_groundedness.ascore(
+                response=data["response"],
+                retrieved_contexts=data["retrieved_contexts"],
+            )
+
+            score_diff = abs(legacy_score - v2_result.value)
+            print(f"   Legacy: {legacy_score:.6f}")
+            print(f"   V2:     {v2_result.value:.6f}")
+            print(f"   Diff:   {score_diff:.6f}")
+
+            # Ensure implementations give reasonably similar scores
+            # Response groundedness uses dual-judge system with some variation expected
+            assert score_diff < 0.3, (
+                f"Legacy and V2 scores should be similar: Legacy={legacy_score:.6f}, "
+                f"V2={v2_result.value:.6f}, Diff={score_diff:.6f} (tolerance: 0.3)"
+            )
+            print("   ✅ Both implementations give consistent scores")
+
+            # Validate score ranges (both should be 0-1 or NaN)
+            if not np.isnan(legacy_score):
+                assert 0.0 <= legacy_score <= 1.0
+            if not np.isnan(v2_result.value):
+                assert 0.0 <= v2_result.value <= 1.0
+
+    @pytest.mark.asyncio
+    async def test_response_groundedness_edge_cases(self, test_modern_llm):
+        """Test edge cases like empty responses and contexts."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for edge case testing")
+
+        metric = ResponseGroundedness(llm=test_modern_llm)
+
+        # Test empty response
+        with pytest.raises(ValueError, match="response is missing"):
+            await metric.ascore(
+                response="",
+                retrieved_contexts=["Some context about Einstein."],
+            )
+
+        # Test empty contexts
+        with pytest.raises(ValueError, match="retrieved_contexts is missing"):
+            await metric.ascore(
+                response="Einstein was a physicist.",
+                retrieved_contexts=[],
+            )
+
+    @pytest.mark.asyncio
+    async def test_response_groundedness_scoring_behavior(self, test_modern_llm):
+        """Test that response groundedness produces expected score patterns."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for scoring testing")
+
+        metric = ResponseGroundedness(llm=test_modern_llm)
+
+        # High groundedness case
+        high_result = await metric.ascore(
+            response="The capital of France is Paris.",
+            retrieved_contexts=["Paris is the capital and largest city of France."],
+        )
+
+        # Low groundedness case
+        low_result = await metric.ascore(
+            response="The capital of France is London.",
+            retrieved_contexts=["Paris is the capital and largest city of France."],
+        )
+
+        print(f"High groundedness score: {high_result.value:.3f}")
+        print(f"Low groundedness score: {low_result.value:.3f}")
+
+        # Validate ranges
+        assert 0.0 <= high_result.value <= 1.0
+        assert 0.0 <= low_result.value <= 1.0
+
+        # High groundedness should typically score higher than low groundedness
+        # (though exact scores depend on judge behavior)
+
+    @pytest.mark.asyncio
+    async def test_response_groundedness_dual_judge_system(self, test_modern_llm):
+        """Test that the dual-judge system is working with different contexts."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for dual-judge testing")
+
+        metric = ResponseGroundedness(llm=test_modern_llm)
+
+        # Test with multiple contexts that provide different levels of support
+        result = await metric.ascore(
+            response="Einstein developed the theory of relativity and won a Nobel Prize.",
+            retrieved_contexts=[
+                "Albert Einstein developed the theory of relativity.",
+                "Einstein won the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect.",
+            ],
+        )
+
+        print(f"Multi-context groundedness score: {result.value:.3f}")
+
+        # Should be well-grounded since both parts are supported
+        assert 0.0 <= result.value <= 1.0
+
+    def test_response_groundedness_migration_requirements_documented(self):
+        """Test that migration requirements are properly documented."""
+
+        # V2 implementation should not accept legacy components
+        with pytest.raises((TypeError, ValueError, AttributeError)):
+            ResponseGroundedness(llm="invalid_llm_type")  # type: ignore[arg-type]  # Should reject string
+
+        # V2 should only accept InstructorBaseRagasLLM
+        with pytest.raises((TypeError, ValueError, AttributeError)):
+            ResponseGroundedness(llm=None)  # type: ignore[arg-type]  # Should reject None
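For reference, a minimal end-to-end usage sketch that mirrors the class docstring and the e2e tests above. It assumes the patch is applied and that OPENAI_API_KEY is set in the environment:

    import asyncio

    import openai

    from ragas.llms.base import llm_factory
    from ragas.metrics.collections import ResponseGroundedness


    async def main():
        client = openai.AsyncOpenAI()
        llm = llm_factory("gpt-4o", client=client)
        metric = ResponseGroundedness(llm=llm)

        result = await metric.ascore(
            response="Einstein was born in Germany in 1879.",
            retrieved_contexts=[
                "Albert Einstein was born in Ulm, Germany on March 14, 1879."
            ],
        )
        # Dual-judge average on the 0.0-1.0 scale; higher means better grounded.
        print(f"Response Groundedness: {result.value}")


    asyncio.run(main())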