From 4fe745cc56aba0bfb86007596d25aeea4ac49b77 Mon Sep 17 00:00:00 2001
From: A Vertex SDK engineer <vertex-sdk-bot@google.com>
Date: Tue, 31 Mar 2026 16:16:30 -0700
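Subject: [PATCH] feat: Add support for custom result parsing in LLM-based
 evaluation metrics

When an LLM-based metric sets `result_parsing_function`, forward it as
`result_parser_config.custom_code_parser_config.parsing_function` in the
`llm_based_metric_spec` payload built by both `t_metrics` and
`t_metric_for_registry`. The payload is unchanged when the field is unset
or empty.

Update `test_evaluation_metric_resource_name` to create a metric with a
custom `parse_results` function.

PiperOrigin-RevId: 892593906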
---
 .../vertexai/genai/replays/test_evaluate.py   | 70 +++++++++----------
 vertexai/_genai/_transformers.py              | 16 +++++
 2 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/tests/unit/vertexai/genai/replays/test_evaluate.py b/tests/unit/vertexai/genai/replays/test_evaluate.py
index 75e7d32e5e..317e93a33c 100644
--- a/tests/unit/vertexai/genai/replays/test_evaluate.py
+++ b/tests/unit/vertexai/genai/replays/test_evaluate.py
@@ -355,41 +355,41 @@ def test_evaluation_metric_resource_name(client):
     client._api_client._http_options.api_version = "v1beta1"
     tone_check_metric = types.LLMMetric(
         name="tone_check",
-        prompt_template="""
-    # Instruction
-    You are a professional writing evaluator. Your job is to score writing responses according to pre-defined evaluation criteria.
-
-    # Criteria
-    Analyze the tone of the response based on these two criteria:
-    1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.
-    2. Empathy: The response should acknowledge the user's feelings and show understanding.
-
-    # Input
-    Prompt: {agent_data.turns[0].events[0]}
-    Response: {agent_data.turns[0].events[1]}
-
-    # Output Format
-    Respond in a JSON format with the following schema:
-    {
-        "type": "OBJECT",
-        "properties": {
-            "score": {"type": "NUMBER"},
-            "explanation": {"type": "STRING"},
-        },
-        "required": ["score", "explanation"],
-    }
-    Return the JSON format output in a string representation of a Python dictionary directly, without strings like '```json' or '```'.
-
-    The output would include the following fields:
-    score: based on your evaluation, the score should be a number based on the rating rubrics.
-    explanation: your explanation for the score rating, in one line.
-
-    ## Example Output Format:
-    {"score" : -1, "explanation": "Here is the reason that the response is given a score of -1 based on the rating rubric."}
-    {"score" : 3, "explanation": "Here is the reason that the response is given a score of 3 based on the rating rubric."}
-    {"score" : 0, "explanation": "Here is the reason that the response is given a score of 0 based on the rating rubric."}
-    {"score" : 5, "explanation": "Here is the reason that the response is given a score of 5 based on the rating rubric."}
-    """,
+        prompt_template="""Analyze the tone of the response based on these two criteria:\n
+          1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.\n
+          2. Empathy: The response should acknowledge the user's feelings and show understanding.\n\n
+          Prompt: {agent_data.turns[0].events[0]}
+          Response: {agent_data.turns[0].events[1]}
+          Return ONLY a JSON list of objects for these two properties:
+          [{"property": "Professionalism", "verdict": true, "reasoning": "..."},
+          {"property": "Empathy", "verdict": true, "reasoning": "..."}]
+        """,
+        result_parsing_function="""
+import json, re
+def parse_results(responses):
+    text = responses[0]
+    # Use robust regex to find the JSON list block
+    match = re.search("[\\[].*[]]", text, re.DOTALL)
+    if not match: return {"score": 0.0, "explanation": "No valid JSON found"}
+
+    try:
+        data = json.loads(match.group(0))
+        # Calculate an overall score (e.g., average of verdicts)
+        passed_count = sum(1 for r in data if r.get("verdict", False))
+        total_count = len(data)
+        score = passed_count / total_count if total_count > 0 else 0.0
+
+        # Consolidate reasoning into a single explanation string
+        explanation = "\\n".join([f"{r.get('property')}: {r.get('reasoning')}" for r in data])
+
+        # IMPORTANT: Return a dictionary, not a list
+        return {
+            "score": float(score),
+            "explanation": explanation
+        }
+    except Exception as e:
+        return {"score": 0.0, "explanation": f"Parsing failed: {str(e)}"}
+""",
     )
     metric_resource_name = client.evals.create_evaluation_metric(
         metric=tone_check_metric,
diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py
index 65ca401ae3..4b203d447f 100644
--- a/vertexai/_genai/_transformers.py
+++ b/vertexai/_genai/_transformers.py
@@ -119,6 +119,14 @@ def t_metrics(
             if autorater_config:
                 llm_based_spec["judge_autorater_config"] = autorater_config
 
+            result_parsing_function = getv(metric, ["result_parsing_function"])
+            if result_parsing_function:
+                llm_based_spec["result_parser_config"] = {
+                    "custom_code_parser_config": {
+                        "parsing_function": result_parsing_function
+                    }
+                }
+
             metric_payload_item["llm_based_metric_spec"] = llm_based_spec
         elif getattr(metric, "metric_resource_name", None) is not None:
             # Safe pass
@@ -235,6 +243,14 @@ def t_metric_for_registry(
         if autorater_config:
             llm_based_spec["judge_autorater_config"] = autorater_config
 
+        result_parsing_function = getv(metric, ["result_parsing_function"])
+        if result_parsing_function:
+            llm_based_spec["result_parser_config"] = {
+                "custom_code_parser_config": {
+                    "parsing_function": result_parsing_function
+                }
+            }
+
         metric_payload_item["llm_based_metric_spec"] = llm_based_spec
 
     else: