Stop sending autorater_config for predefined metrics.

The evaluation service uses its own model configuration for predefined
metrics (PredefinedMetricSpec) and ignores any autorater_config supplied by
the client. This patch, applied identically to the agentplatform and vertexai
copies of the SDK:
  * logs a warning when judge_model, judge_model_generation_config or
    judge_model_sampling_count is set on a predefined metric;
  * no longer builds or forwards autorater_config from PredefinedMetricHandler;
  * documents the limitation on every autorater_config field;
  * adds a replay test covering a predefined metric with judge_model set.

diff --git a/agentplatform/_genai/_evals_metric_handlers.py b/agentplatform/_genai/_evals_metric_handlers.py
index 4571802dbc..bd234c55ca 100644
--- a/agentplatform/_genai/_evals_metric_handlers.py
+++ b/agentplatform/_genai/_evals_metric_handlers.py
@@ -972,6 +972,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
             raise ValueError(
                 f"Metric '{self.metric.name}' is not a supported predefined metric."
             )
+        if (
+            self.metric.judge_model
+            or self.metric.judge_model_generation_config
+            or self.metric.judge_model_sampling_count
+        ):
+            logger.warning(
+                "Autorater config settings (judge_model, "
+                "judge_model_generation_config, judge_model_sampling_count) "
+                "are ignored for predefined metric '%s'.",
+                self.metric.name,
+            )
 
     def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
@@ -1026,11 +1037,9 @@ def _build_request_payload(
             "instance": instance_payload,
         }
 
-        autorater_config = _get_autorater_config(self.metric)
-        if autorater_config:
-            request_payload["autorater_config"] = genai_types.AutoraterConfig(
-                **autorater_config
-            )
+        # Note: autorater_config is intentionally not passed for predefined
+        # metrics. The server uses its own model configuration for predefined
+        # metrics and ignores the autorater_config field.
         return request_payload
 
     @override
@@ -1045,7 +1054,6 @@ def get_metric_result(
                 lambda: self.module._evaluate_instances(
                     metrics=[self.metric],
                     instance=payload.get("instance"),
-                    autorater_config=payload.get("autorater_config"),
                 ),
                 metric_name,
             )
diff --git a/agentplatform/_genai/types/common.py b/agentplatform/_genai/types/common.py
index 44b45f2146..a9738614c4 100644
--- a/agentplatform/_genai/types/common.py
+++ b/agentplatform/_genai/types/common.py
@@ -2435,7 +2435,8 @@ class EvaluationRunConfig(_common.BaseModel):
         default=None, description="""The output config for the evaluation run."""
     )
     autorater_config: Optional[genai_types.AutoraterConfig] = Field(
-        default=None, description="""The autorater config for the evaluation run."""
+        default=None,
+        description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
     )
     prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
         default=None, description="""The prompt template used for inference."""
@@ -2465,7 +2466,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
     """The output config for the evaluation run."""
 
     autorater_config: Optional[genai_types.AutoraterConfigDict]
-    """The autorater config for the evaluation run."""
+    """The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
 
     prompt_template: Optional[EvaluationRunPromptTemplateDict]
     """The prompt template used for inference."""
@@ -4772,7 +4773,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
         default=None, description=""""""
     )
     autorater_config: Optional[genai_types.AutoraterConfig] = Field(
-        default=None, description=""""""
+        default=None,
+        description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
     )
     metrics: Optional[list[Metric]] = Field(
         default=None,
@@ -4823,7 +4825,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
     """"""
 
     autorater_config: Optional[genai_types.AutoraterConfigDict]
-    """"""
+    """Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
 
     metrics: Optional[list[MetricDict]]
     """The metrics used for evaluation.
@@ -19065,7 +19067,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
         default=None, description=""""""
     )
     autorater_config: Optional[genai_types.AutoraterConfig] = Field(
-        default=None, description=""""""
+        default=None,
+        description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
     )
     config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
 
@@ -19083,7 +19086,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
     """"""
 
     autorater_config: Optional[genai_types.AutoraterConfigDict]
-    """"""
+    """Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
 
     config: Optional[EvaluateDatasetConfigDict]
     """"""
diff --git a/tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py b/tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py
index 0efbf06337..cdfa0cb6a5 100644
--- a/tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py
+++ b/tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py
@@ -106,6 +106,39 @@ def test_evaluation_result(client):
     #     assert case_result.response_candidate_results is not None
 
 
+def test_predefined_metric_with_judge_model_ignores_autorater_config(client):
+    """Tests that autorater_config is ignored for predefined metrics in replays."""
+    prompts_df = pd.DataFrame(
+        {
+            "prompt": ["Explain the concept of machine learning in simple terms."],
+            "response": [
+                "Machine learning is a type of artificial intelligence that allows"
+                " computers to learn from data without being explicitly programmed."
+            ],
+        }
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        eval_dataset_df=prompts_df,
+        candidate_name="gemini-2.5-flash",
+    )
+
+    # Set judge_model, which should be ignored for predefined metrics
+    metric = types.Metric(
+        name="safety_v1",
+        judge_model="projects/model-evaluation-dev/locations/us-central1/publishers/google/models/gemini-2.5-flash",
+    )
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=[metric],
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert evaluation_result.summary_metrics[0].metric_name == "safety_v1"
+
+
 def test_multi_turn_predefined_metric(client):
     """Tests that evaluate works with multi-turn predefined metrics."""
     prompts_data = {
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 4571802dbc..d682460243 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -36,7 +36,6 @@
 from . import evals
 from . import types
 
-
 logger = logging.getLogger(__name__)
 _MAX_RETRIES = 5
 # HTTP status codes that are safe to retry with backoff.
@@ -972,6 +971,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
             raise ValueError(
                 f"Metric '{self.metric.name}' is not a supported predefined metric."
             )
+        if (
+            self.metric.judge_model
+            or self.metric.judge_model_generation_config
+            or self.metric.judge_model_sampling_count
+        ):
+            logger.warning(
+                "Autorater config settings (judge_model, "
+                "judge_model_generation_config, judge_model_sampling_count) "
+                "are ignored for predefined metric '%s'.",
+                self.metric.name,
+            )
 
     def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
@@ -1026,11 +1036,9 @@ def _build_request_payload(
             "instance": instance_payload,
         }
 
-        autorater_config = _get_autorater_config(self.metric)
-        if autorater_config:
-            request_payload["autorater_config"] = genai_types.AutoraterConfig(
-                **autorater_config
-            )
+        # Note: autorater_config is intentionally not passed for predefined
+        # metrics. The server uses its own model configuration for predefined
+        # metrics and ignores the autorater_config field.
         return request_payload
 
     @override
@@ -1045,7 +1053,6 @@ def get_metric_result(
                 lambda: self.module._evaluate_instances(
                     metrics=[self.metric],
                     instance=payload.get("instance"),
-                    autorater_config=payload.get("autorater_config"),
                 ),
                 metric_name,
             )
diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py
index 4da972fde6..0579725846 100644
--- a/vertexai/_genai/types/common.py
+++ b/vertexai/_genai/types/common.py
@@ -2420,7 +2420,8 @@ class EvaluationRunConfig(_common.BaseModel):
         default=None, description="""The output config for the evaluation run."""
     )
     autorater_config: Optional[genai_types.AutoraterConfig] = Field(
-        default=None, description="""The autorater config for the evaluation run."""
+        default=None,
+        description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
     )
     prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
         default=None, description="""The prompt template used for inference."""
@@ -2450,7 +2451,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
     """The output config for the evaluation run."""
 
     autorater_config: Optional[genai_types.AutoraterConfigDict]
-    """The autorater config for the evaluation run."""
+    """The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
 
     prompt_template: Optional[EvaluationRunPromptTemplateDict]
     """The prompt template used for inference."""
@@ -4757,7 +4758,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
         default=None, description=""""""
     )
     autorater_config: Optional[genai_types.AutoraterConfig] = Field(
-        default=None, description=""""""
+        default=None,
+        description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
     )
     metrics: Optional[list[Metric]] = Field(
         default=None,
@@ -4808,7 +4810,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
     """"""
 
     autorater_config: Optional[genai_types.AutoraterConfigDict]
-    """"""
+    """Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
 
     metrics: Optional[list[MetricDict]]
     """The metrics used for evaluation.
@@ -19050,7 +19052,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
         default=None, description=""""""
     )
     autorater_config: Optional[genai_types.AutoraterConfig] = Field(
-        default=None, description=""""""
+        default=None,
+        description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
     )
     config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
 
@@ -19068,7 +19071,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
     """"""
 
     autorater_config: Optional[genai_types.AutoraterConfigDict]
-    """"""
+    """Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
 
     config: Optional[EvaluateDatasetConfigDict]
     """"""