From 4fe745cc56aba0bfb86007596d25aeea4ac49b77 Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Tue, 31 Mar 2026 16:16:30 -0700 Subject: [PATCH] feat: Add support for custom result parsing in LLM-based evaluation metrics PiperOrigin-RevId: 892593906 --- .../vertexai/genai/replays/test_evaluate.py | 70 +++++++++---------- vertexai/_genai/_transformers.py | 16 +++++ 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_evaluate.py b/tests/unit/vertexai/genai/replays/test_evaluate.py index 75e7d32e5e..317e93a33c 100644 --- a/tests/unit/vertexai/genai/replays/test_evaluate.py +++ b/tests/unit/vertexai/genai/replays/test_evaluate.py @@ -355,41 +355,41 @@ def test_evaluation_metric_resource_name(client): client._api_client._http_options.api_version = "v1beta1" tone_check_metric = types.LLMMetric( name="tone_check", - prompt_template=""" - # Instruction - You are a professional writing evaluator. Your job is to score writing responses according to pre-defined evaluation criteria. - - # Criteria - Analyze the tone of the response based on these two criteria: - 1. Professionalism: The response should use appropriate language and maintain a business-like demeanor. - 2. Empathy: The response should acknowledge the user's feelings and show understanding. - - # Input - Prompt: {agent_data.turns[0].events[0]} - Response: {agent_data.turns[0].events[1]} - - # Output Format - Respond in a JSON format with the following schema: - { - "type": "OBJECT", - "properties": { - "score": {"type": "NUMBER"}, - "explanation": {"type": "STRING"}, - }, - "required": ["score", "explanation"], - } - Return the JSON format output in a string representation of a Python dictionary directly, without strings like '```json' or '```'. - - The output would include the following fields: - score: based on your evaluation, the score should be a number based on the rating rubrics. - explanation: your explanation for the score rating, in one line. - - ## Example Output Format: - {"score" : -1, "explanation": "Here is the reason that the response is given a score of -1 based on the rating rubric."} - {"score" : 3, "explanation": "Here is the reason that the response is given a score of 3 based on the rating rubric."} - {"score" : 0, "explanation": "Here is the reason that the response is given a score of 0 based on the rating rubric."} - {"score" : 5, "explanation": "Here is the reason that the response is given a score of 5 based on the rating rubric."} - """, + prompt_template="""Analyze the tone of the response based on these two criteria:\n + 1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.\n + 2. Empathy: The response should acknowledge the user's feelings and show understanding.\n\n + Prompt: {agent_data.turns[0].events[0]} + Response: {agent_data.turns[0].events[1]} + Return ONLY a JSON list of objects for these two properties: + [{"property": "Professionalism", "verdict": true, "reasoning": "..."}, + {"property": "Empathy", "verdict": true, "reasoning": "..."}] + """, + result_parsing_function=""" +import json, re +def parse_results(responses): + text = responses[0] + # Use robust regex to find the JSON list block + match = re.search("[\\[].*[]]", text, re.DOTALL) + if not match: return {"score": 0.0, "explanation": "No valid JSON found"} + + try: + data = json.loads(match.group(0)) + # Calculate an overall score (e.g., average of verdicts) + passed_count = sum(1 for r in data if r.get("verdict", False)) + total_count = len(data) + score = passed_count / total_count if total_count > 0 else 0.0 + + # Consolidate reasoning into a single explanation string + explanation = "\\n".join([f"{r.get('property')}: {r.get('reasoning')}" for r in data]) + + # IMPORTANT: Return a dictionary, not a list + return { + "score": float(score), + "explanation": explanation + } + except Exception as e: + return {"score": 0.0, "explanation": f"Parsing failed: {str(e)}"} +""", ) metric_resource_name = client.evals.create_evaluation_metric( metric=tone_check_metric, diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py index 65ca401ae3..4b203d447f 100644 --- a/vertexai/_genai/_transformers.py +++ b/vertexai/_genai/_transformers.py @@ -119,6 +119,14 @@ def t_metrics( if autorater_config: llm_based_spec["judge_autorater_config"] = autorater_config + result_parsing_function = getv(metric, ["result_parsing_function"]) + if result_parsing_function: + llm_based_spec["result_parser_config"] = { + "custom_code_parser_config": { + "parsing_function": result_parsing_function + } + } + metric_payload_item["llm_based_metric_spec"] = llm_based_spec elif getattr(metric, "metric_resource_name", None) is not None: # Safe pass @@ -235,6 +243,14 @@ def t_metric_for_registry( if autorater_config: llm_based_spec["judge_autorater_config"] = autorater_config + result_parsing_function = getv(metric, ["result_parsing_function"]) + if result_parsing_function: + llm_based_spec["result_parser_config"] = { + "custom_code_parser_config": { + "parsing_function": result_parsing_function + } + } + metric_payload_item["llm_based_metric_spec"] = llm_based_spec else: