Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
bd27c42
firstdraft
jsong468 Dec 4, 2025
b714a65
revert nb
jsong468 Dec 4, 2025
c9678ab
revert nb
jsong468 Dec 4, 2025
15b196b
merge
jsong468 Dec 12, 2025
be97a2c
changes from openai migration and change name to scorermetricsregistry
jsong468 Dec 12, 2025
dd55b9f
refactor
jsong468 Dec 17, 2025
c826319
merge
jsong468 Dec 17, 2025
dbbd7d1
update notebook and printing
jsong468 Dec 17, 2025
4232743
Merge branch 'main' into using_scorer_metrics_1
jsong468 Dec 17, 2025
b38f315
fix tests
jsong468 Dec 17, 2025
2c72396
separate prs
jsong468 Dec 18, 2025
3076945
merge and clean
jsong468 Dec 23, 2025
c5e4543
workflow in place
rlundeen2 Dec 25, 2025
0cd7613
dataset refactor
rlundeen2 Dec 25, 2025
55a0383
moving things
rlundeen2 Dec 25, 2025
e1fca4d
finalizing things
rlundeen2 Dec 25, 2025
7b5a3e4
saving changes
rlundeen2 Dec 26, 2025
5a3796b
generalizing return types
rlundeen2 Dec 26, 2025
be1ea4c
harm category additions
rlundeen2 Dec 26, 2025
0afacd8
moving paths
rlundeen2 Dec 26, 2025
9dd8e63
Piping harm scorers through
rlundeen2 Dec 27, 2025
249d574
updating csv structure
rlundeen2 Dec 27, 2025
24a6517
most things working
rlundeen2 Dec 27, 2025
eab659e
adding scorer printer
rlundeen2 Dec 27, 2025
0c6dac3
fixing tests
rlundeen2 Dec 27, 2025
61a31ec
merging main
rlundeen2 Dec 27, 2025
e9faea3
pre-commit
rlundeen2 Dec 28, 2025
d4be70a
merge main
jsong468 Dec 29, 2025
1bdca43
Merge branch 'scorer_evaluation_refactor' into jsong/scorer_evaluatio…
jsong468 Dec 30, 2025
c452f6e
Merge pull request #2 from rlundeen2/jsong/scorer_evaluation_refactor
jsong468 Dec 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 217 additions & 0 deletions build_scripts/evaluate_scorers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
Manual script for evaluating multiple scorers against human-labeled datasets.
This is a long-running process that should be run occasionally to benchmark
scorer performance. Results are saved to the scorer_evals directory and checked in.
Usage:
python build_scripts/evaluate_scorers.py
"""

import asyncio
import os
import sys
import time

from azure.ai.contentsafety.models import TextCategory
from tqdm import tqdm

from pyrit.common.path import SCORER_EVALS_PATH
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.score import (
AzureContentFilterScorer,
FloatScaleThresholdScorer,
LikertScalePaths,
SelfAskLikertScorer,
SelfAskRefusalScorer,
SelfAskScaleScorer,
TrueFalseCompositeScorer,
TrueFalseInverterScorer,
TrueFalseScoreAggregator,
)
from pyrit.setup import IN_MEMORY, initialize_pyrit_async


async def evaluate_scorers() -> None:
"""
Evaluate multiple scorers against their configured datasets.
This will:
1. Initialize PyRIT with in-memory database
2. Create a shared chat target for consistency
3. Instantiate each scorer with appropriate configuration
4. Run evaluate_async() on each scorer
5. Save results to scorer_evals directory
"""
print("Initializing PyRIT...")
await initialize_pyrit_async(memory_db_type=IN_MEMORY)

# Targets
gpt_4o_target = OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_MODEL"),
)

gpt_4o_unsafe = OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
)

gpt_4o_unsafe_temp9 = OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
temperature=0.9,
)

# Refusal Scorer
refuasal = SelfAskRefusalScorer(chat_target=gpt_4o_target)

# Objective Scorers
refuasal_gpt_4o = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_target))

refusal_gpt_4o_unsafe = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe))

refusal_gpt_4o_unsafe_temp9 = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe_temp9))

acs_t1 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.1)
acs_t5 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.5)
acs_t7 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.7)

acs_with_refusal = TrueFalseCompositeScorer(
aggregator=TrueFalseScoreAggregator.AND,
scorers=[
FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.5),
TrueFalseInverterScorer(
scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe_temp9),
),
],
)

_scale_scorer_gpt_4o = SelfAskScaleScorer(
chat_target=OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_MODEL"),
temperature=0.9,
),
)

scale_gpt_4o_9_threshhold = FloatScaleThresholdScorer(scorer=_scale_scorer_gpt_4o, threshold=0.9)

scale_and_refusal = TrueFalseCompositeScorer(
scorers=[scale_gpt_4o_9_threshhold, refuasal_gpt_4o],
aggregator=TrueFalseScoreAggregator.AND,
)

# Harm Scorers
likert_scorers_gpt_4o = []
for scale in LikertScalePaths:
if scale.evaluation_files is not None:
likert_scorers_gpt_4o.append(
SelfAskLikertScorer(
chat_target=gpt_4o_target,
likert_scale=scale,
)
)

acs_hate = AzureContentFilterScorer(harm_categories=[TextCategory.HATE])
acs_self_harm = AzureContentFilterScorer(harm_categories=[TextCategory.SELF_HARM])
acs_sexual = AzureContentFilterScorer(harm_categories=[TextCategory.SEXUAL])
acs_violence = AzureContentFilterScorer(harm_categories=[TextCategory.VIOLENCE])

# Build list of scorers to evaluate
scorers = [
refuasal,
refuasal_gpt_4o,
refusal_gpt_4o_unsafe,
refusal_gpt_4o_unsafe_temp9,
acs_t1,
acs_t5,
acs_t7,
acs_with_refusal,
scale_gpt_4o_9_threshhold,
scale_and_refusal,
acs_hate,
acs_self_harm,
acs_sexual,
acs_violence,
]

scorers.extend(likert_scorers_gpt_4o)

print(f"\nEvaluating {len(scorers)} scorer(s)...\n")

# Use tqdm for progress tracking across all scorers
scorer_iterator = tqdm(enumerate(scorers, 1), total=len(scorers), desc="Scorers") if tqdm else enumerate(scorers, 1)

# Evaluate each scorer
for i, scorer in scorer_iterator:
scorer_name = scorer.__class__.__name__
print(f"\n[{i}/{len(scorers)}] Evaluating {scorer_name}...")
print(" Status: Starting evaluation (this may take several minutes)...")

start_time = time.time()

try:
# Run evaluation with production settings:
# - num_scorer_trials=3 for variance measurement
# - add_to_evaluation_results=True to save to registry
print(" Status: Running evaluations...")
results = await scorer.evaluate_async(
num_scorer_trials=3,
max_concurrency=10,
)

elapsed_time = time.time() - start_time

# Results are saved to disk by evaluate_async() with add_to_evaluation_results=True
print(" ✓ Evaluation complete and saved!")
print(f" Elapsed time: {elapsed_time:.1f}s")
if results:
print(f" Dataset: {results.dataset_name}")

except Exception as e:
elapsed_time = time.time() - start_time
print(f" ✗ Error evaluating {scorer_name} after {elapsed_time:.1f}s: {e}")
print(" Continuing with next scorer...\n")
import traceback

traceback.print_exc()
continue

print("=" * 60)
print("Evaluation complete!")
print(f"Results saved to: {SCORER_EVALS_PATH}")
print("=" * 60)


if __name__ == "__main__":
print("=" * 60)
print("PyRIT Scorer Evaluation Script")
print("=" * 60)
print("This script will evaluate multiple scorers against human-labeled")
print("datasets. This is a long-running process that may take several")
print("minutes to hours depending on the number of scorers and datasets.")
print()
print("Results will be saved to the registry files in:")
print(f" {SCORER_EVALS_PATH}")
print("=" * 60)
print()

try:
asyncio.run(evaluate_scorers())
except KeyboardInterrupt:
print("\n\nEvaluation interrupted by user.")
sys.exit(1)
except Exception as e:
print(f"\n\nFatal error: {e}")
import traceback

traceback.print_exc()
sys.exit(1)
3 changes: 2 additions & 1 deletion doc/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ chapters:
- file: code/scoring/persuasion_full_conversation_scorer
- file: code/scoring/prompt_shield_scorer
- file: code/scoring/generic_scorers
- file: code/scoring/scorer_evals
- file: code/scoring/8_scorer_metrics
- file: code/scoring/9_printing_scorers
- file: code/memory/0_memory
sections:
- file: code/memory/1_sqlite_memory
Expand Down
14 changes: 14 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,10 @@ API Reference
EmbeddingSupport
EmbeddingUsageInformation
ErrorDataTypeSerializer
get_all_harm_definitions
group_conversation_message_pieces_by_sequence
group_message_pieces_into_conversations
HarmDefinition
Identifier
ImagePathDataTypeSerializer
AllowedCategories
Expand All @@ -338,6 +340,7 @@ API Reference
QuestionAnsweringDataset
QuestionAnsweringEntry
QuestionChoice
ScaleDescription
ScenarioIdentifier
ScenarioResult
Score
Expand Down Expand Up @@ -515,6 +518,7 @@ API Reference

AzureContentFilterScorer
BatchScorer
ConsoleScorerPrinter
ContentClassifierPaths
ConversationScorer
create_conversation_scorer
Expand All @@ -525,13 +529,16 @@ API Reference
FloatScaleScorerByCategory
FloatScaleThresholdScorer
GandalfScorer
get_all_harm_definitions
HarmDefinition
HarmHumanLabeledEntry
HarmScorerEvaluator
HarmScorerMetrics
HumanInTheLoopScorerGradio
HumanLabeledDataset
HumanLabeledEntry
InsecureCodeScorer
LikertScaleEvalFiles
LikertScalePaths
MarkdownInjectionScorer
MetricsType
Expand All @@ -542,11 +549,18 @@ API Reference
PlagiarismScorer
PromptShieldScorer
QuestionAnswerScorer
RegistryUpdateBehavior
Scorer
ScorerEvalDatasetFiles
ScorerEvaluator
ScorerIdentifier
ScorerMetrics
ScorerMetricsWithIdentity
ScorerPrinter
ScorerPromptValidator
ScaleDescription
get_all_harm_metrics
get_all_objective_metrics
SelfAskCategoryScorer
SelfAskGeneralFloatScaleScorer
SelfAskGeneralTrueFalseScorer
Expand Down
2 changes: 1 addition & 1 deletion doc/code/executor/attack/1_prompt_sending_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@

# Set up the Azure Content Filter Scorer and SelfAsk Likert Scorer for bias detection
acf = AzureContentFilterScorer(harm_categories=[TextCategory.HATE]) # Options: HATE, SELF_HARM, SEXUAL, VIOLENCE
bias_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.FAIRNESS_BIAS_SCALE.value, chat_target=target)
bias_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.FAIRNESS_BIAS_SCALE, chat_target=target)

# Anything that isn't a refusal is a "success" for this scenario
objective_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=target))
Expand Down
2 changes: 1 addition & 1 deletion doc/code/executor/attack/many_shot_jailbreak_attack.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
"prompt_target = OpenAIChatTarget()\n",
"\n",
"# Set up the Harm Scorer with the same prompt target to ensure consistency\n",
"harm_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.HARM_SCALE.value, chat_target=prompt_target)\n",
"harm_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.HARM_SCALE, chat_target=prompt_target)\n",
"scoring_config = AttackScoringConfig(\n",
" auxiliary_scorers=[harm_scorer],\n",
")\n",
Expand Down
2 changes: 1 addition & 1 deletion doc/code/executor/attack/many_shot_jailbreak_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
prompt_target = OpenAIChatTarget()

# Set up the Harm Scorer with the same prompt target to ensure consistency
harm_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.HARM_SCALE.value, chat_target=prompt_target)
harm_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.HARM_SCALE, chat_target=prompt_target)
scoring_config = AttackScoringConfig(
auxiliary_scorers=[harm_scorer],
)
Expand Down
2 changes: 1 addition & 1 deletion doc/code/memory/7_azure_sql_memory_attacks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@
"\n",
"# Set up the Azure Content Filter Scorer\n",
"acf = AzureContentFilterScorer(harm_categories=[TextCategory.HATE]) # Options: HATE, SELF_HARM, SEXUAL, VIOLENCE\n",
"bias_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.FAIRNESS_BIAS_SCALE.value, chat_target=target)\n",
"bias_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.FAIRNESS_BIAS_SCALE, chat_target=target)\n",
"\n",
"scoring_config = AttackScoringConfig(\n",
" auxiliary_scorers=[acf, bias_scorer],\n",
Expand Down
2 changes: 1 addition & 1 deletion doc/code/memory/7_azure_sql_memory_attacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@

# Set up the Azure Content Filter Scorer
acf = AzureContentFilterScorer(harm_categories=[TextCategory.HATE]) # Options: HATE, SELF_HARM, SEXUAL, VIOLENCE
bias_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.FAIRNESS_BIAS_SCALE.value, chat_target=target)
bias_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.FAIRNESS_BIAS_SCALE, chat_target=target)

scoring_config = AttackScoringConfig(
auxiliary_scorers=[acf, bias_scorer],
Expand Down
2 changes: 1 addition & 1 deletion doc/code/scenarios/0_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def __init__(
name="My Custom Scenario",
version=self.version,
strategy_class=MyStrategy,
objective_scorer_identifier=objective_scorer.get_identifier() if objective_scorer else None,
objective_scorer=objective_scorer,
scenario_result_id=scenario_result_id,
)

Expand Down
Loading