Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion sdk/ai/azure-ai-projects/samples/evaluations/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ These samples require additional setup or Azure services:
|--------|-------------|
| [sample_evaluations_graders.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py) | OpenAI graders: label_model, text_similarity, string_check, score_model |
| [sample_evaluations_ai_assisted.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_ai_assisted.py) | AI-assisted and NLP-based evaluators: Similarity, ROUGE, METEOR, GLEU, F1, BLEU |
| [sample_eval_catalog_code_based_evaluators.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_code_based_evaluators.py) | Custom code-based (inline) evaluators |
| [sample_eval_catalog_prompt_based_evaluators.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_prompt_based_evaluators.py) | Custom prompt-based evaluators |
| [sample_custom_eval_upload_simple.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_custom_eval_upload_simple.py) | Custom code-based evaluator with upload — simple example (AnswerLength) |
| [sample_custom_eval_upload_advanced.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_custom_eval_upload_advanced.py) | Custom code-based evaluator with upload — advanced example (FriendlyEvaluator with LLM judge) |

### Agentic Evaluators

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@


class AnswerLengthEvaluator:
    """Code-based evaluator that scores a response by its length.

    Returns the measured length as ``result`` plus a human-readable
    ``reason`` classifying the answer as short (<= 50) or long.
    """

    def __init__(self, **kwargs):
        # Stateless evaluator; **kwargs accepted for interface compatibility
        # with the evaluation harness, which may pass extra configuration.
        pass

    def __call__(self, *args, **kwargs):
        """Evaluate the ``response`` keyword argument.

        :return: A dict with ``result`` (the length value) and ``reason``.
        """
        # Compute once so "result" and "reason" agree on the same value
        # (the pre-merge code called evaluate_answer_length twice / only once).
        # NOTE(review): evaluate_answer_length presumably returns an int
        # character/word count — confirm in the sibling helper module.
        length = evaluate_answer_length(kwargs.get("response"))
        return {
            "result": length,
            "reason": "Short answer" if length <= 50 else "Long answer",
        }


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,62 +12,76 @@
You MUST respond in the following JSON format only:
{
"score": <integer 1-5>,
"label": "<Pass or Fail>",
"reason": "<brief reason for the score>",
"explanation": "<detailed explanation of why the response received this score>"
"explanation": "<detailed explanation of why the response received this score>",
"tone": "<the overall tone detected, e.g. warm, neutral, dismissive>",
"confidence": "<high, medium, or low confidence in the assessment>"
}

A score of 3 or above is considered "Pass", below 3 is "Fail".
"""


def build_evaluation_instructions() -> str:
    """Return the system instructions for the LLM evaluation call.

    :return: The system prompt string for the Responses API.
    """
    return FRIENDLINESS_SYSTEM_PROMPT


def build_evaluation_input(query: str, response: str) -> str:
    """Build the user input for the LLM evaluation call.

    :param query: The original user query.
    :param response: The response to evaluate for friendliness.
    :return: A string prompt for the Responses API.
    """
    return (
        f"Please evaluate the friendliness of the following response.\n\n"
        f"Original query: {query}\n\n"
        f"Response to evaluate: {response}"
    )


def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict:
    """Parse the LLM's JSON response into a structured evaluation result.

    The return dict has the standard top-level keys (score, label, reason,
    threshold) and a ``properties`` dict for any extra output fields the
    evaluator wants to surface. Note: ``passed`` can also be calculated
    and provided based on the score and threshold if needed.

    :param raw_result: The raw string output from the LLM.
    :param threshold: The minimum score to be considered "Pass".
    :return: A dict with score, label, reason, threshold, and properties.
        On a parse failure, a fail-safe dict (score 0, label "Fail") is
        returned instead of raising.
    """
    import json

    # Keys that are promoted to the top level of the result; everything
    # else the LLM returns (explanation, tone, confidence, ...) goes into
    # ``properties``.
    top_level_keys = {"score", "label", "reason"}

    try:
        # Try to extract JSON from the response (handle markdown code blocks)
        text = raw_result.strip()
        if text.startswith("```"):
            # Drop the opening fence (and optional language tag), then the
            # closing fence.
            text = text.split("\n", 1)[1] if "\n" in text else text[3:]
            text = text.rsplit("```", 1)[0]
        result = json.loads(text.strip())
        # Clamp the score to the documented 1-5 range; fall back to the
        # threshold when the key is missing.
        score = max(1, min(5, int(result.get("score", threshold))))

        # Collect any extra fields returned by the LLM into properties
        properties = {k: v for k, v in result.items() if k not in top_level_keys}

        return {
            "score": score,
            # Derive the label from the score rather than trusting the LLM.
            "label": "Pass" if score >= threshold else "Fail",
            "reason": result.get("reason", "No reason provided"),
            "explanation": result.get("explanation", "No explanation provided"),
            "threshold": threshold,
            "properties": properties,  # extra metadata surfaced in the evaluation results
        }
    except (json.JSONDecodeError, ValueError, KeyError):
        # Fail closed: an unparseable judge response is reported as a Fail
        # with the raw output preserved for debugging.
        return {
            "score": 0,
            "label": "Fail",
            "reason": "Could not parse LLM response",
            "explanation": f"Raw LLM output: {raw_result}",
            "threshold": threshold,
        }
Original file line number Diff line number Diff line change
@@ -1,66 +1,42 @@
"""Custom evaluator that uses an LLM to assess the friendliness of a response."""

from openai import AzureOpenAI
from common_util.util import build_evaluation_messages, parse_evaluation_result
from openai import OpenAI
from common_util.util import build_evaluation_instructions, build_evaluation_input, parse_evaluation_result


class FriendlyEvaluator:
    """Evaluates how friendly and approachable a response is using an LLM judge.

    This evaluator sends the query and response to an LLM via the OpenAI Responses
    API, which returns a friendliness score (1-5), a pass/fail label, a reason,
    and a detailed explanation.

    :param api_key: The OpenAI API key.
    :param model_name: The model name to use for evaluation (e.g. "gpt-4o").
    :param threshold: The minimum score (1-5) to be considered "Pass" (default: 3).
    """

    def __init__(self, *, api_key: str, model_name: str, threshold: int = 3, **kwargs):
        # **kwargs accepted so the evaluation harness can pass extra config
        # without breaking construction.
        self.client = OpenAI(api_key=api_key)
        self.model_name = model_name
        self.threshold = threshold

    def __call__(self, *, query: str, response: str, **kwargs) -> dict:
        """Evaluate the friendliness of a response.

        :param query: The original user query.
        :param response: The response to evaluate.
        :return: A dict with score, label, reason, threshold, and properties.
        :raises ValueError: If the model returned no text output.
        """
        # temperature=0.0 keeps the judge as deterministic as the model allows.
        result = self.client.responses.create(
            model=self.model_name,
            instructions=build_evaluation_instructions(),
            input=build_evaluation_input(query, response),
            temperature=0.0,
            max_output_tokens=500,
        )

        raw_result = result.output_text
        if raw_result is None:
            raise ValueError("No content in response")
        return parse_evaluation_result(raw_result, self.threshold)
Loading