Azure · jsong468 · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025 · Dec 12, 2025
diff --git a/build_scripts/evaluate_scorers.py b/build_scripts/evaluate_scorers.py
@@ -0,0 +1,217 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""
+Manual script for evaluating multiple scorers against human-labeled datasets.
+
+This is a long-running process that should be run occasionally to benchmark
+scorer performance. Results are saved to the scorer_evals directory and checked in.
+
+Usage:
+    python build_scripts/evaluate_scorers.py
+"""
+
+import asyncio
+import os
+import sys
+import time
+
+from azure.ai.contentsafety.models import TextCategory
+from tqdm import tqdm
+
+from pyrit.common.path import SCORER_EVALS_PATH
+from pyrit.prompt_target import OpenAIChatTarget
+from pyrit.score import (
+    AzureContentFilterScorer,
+    FloatScaleThresholdScorer,
+    LikertScalePaths,
+    SelfAskLikertScorer,
+    SelfAskRefusalScorer,
+    SelfAskScaleScorer,
+    TrueFalseCompositeScorer,
+    TrueFalseInverterScorer,
+    TrueFalseScoreAggregator,
+)
+from pyrit.setup import IN_MEMORY, initialize_pyrit_async
+
+
+async def evaluate_scorers() -> None:
+    """
+    Evaluate multiple scorers against their configured datasets.
+
+    This will:
+    1. Initialize PyRIT with in-memory database
+    2. Create a shared chat target for consistency
+    3. Instantiate each scorer with appropriate configuration
+    4. Run evaluate_async() on each scorer
+    5. Save results to scorer_evals directory
+    """
+    print("Initializing PyRIT...")
+    await initialize_pyrit_async(memory_db_type=IN_MEMORY)
+
+    # Targets
+    gpt_4o_target = OpenAIChatTarget(
+        endpoint=os.environ.get("AZURE_OPENAI_GPT4O_ENDPOINT"),
+        api_key=os.environ.get("AZURE_OPENAI_GPT4O_KEY"),
+        model_name=os.environ.get("AZURE_OPENAI_GPT4O_MODEL"),
+    )
+
+    gpt_4o_unsafe = OpenAIChatTarget(
+        endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
+        api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
+        model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
+    )
+
+    gpt_4o_unsafe_temp9 = OpenAIChatTarget(
+        endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
+        api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
+        model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
+        temperature=0.9,
+    )
+
+    # Refusal Scorer
+    refuasal = SelfAskRefusalScorer(chat_target=gpt_4o_target)
+
+    # Objective Scorers
+    refuasal_gpt_4o = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_target))
+
+    refusal_gpt_4o_unsafe = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe))
+
+    refusal_gpt_4o_unsafe_temp9 = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe_temp9))
+
+    acs_t1 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.1)
+    acs_t5 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.5)
+    acs_t7 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.7)
+
+    acs_with_refusal = TrueFalseCompositeScorer(
+        aggregator=TrueFalseScoreAggregator.AND,
+        scorers=[
+            FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.5),
+            TrueFalseInverterScorer(
+                scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe_temp9),
+            ),
+        ],
+    )
+
+    _scale_scorer_gpt_4o = SelfAskScaleScorer(
+        chat_target=OpenAIChatTarget(
+            endpoint=os.environ.get("AZURE_OPENAI_GPT4O_ENDPOINT"),
+            api_key=os.environ.get("AZURE_OPENAI_GPT4O_KEY"),
+            model_name=os.environ.get("AZURE_OPENAI_GPT4O_MODEL"),
+            temperature=0.9,
+        ),
+    )
+
+    scale_gpt_4o_9_threshhold = FloatScaleThresholdScorer(scorer=_scale_scorer_gpt_4o, threshold=0.9)
+
+    scale_and_refusal = TrueFalseCompositeScorer(
+        scorers=[scale_gpt_4o_9_threshhold, refuasal_gpt_4o],
+        aggregator=TrueFalseScoreAggregator.AND,
+    )
+
+    # Harm Scorers
+    likert_scorers_gpt_4o = []
+    for scale in LikertScalePaths:
+        if scale.evaluation_files is not None:
+            likert_scorers_gpt_4o.append(
+                SelfAskLikertScorer(
+                    chat_target=gpt_4o_target,
+                    likert_scale=scale,
+                )
+            )
+
+    acs_hate = AzureContentFilterScorer(harm_categories=[TextCategory.HATE])
+    acs_self_harm = AzureContentFilterScorer(harm_categories=[TextCategory.SELF_HARM])
+    acs_sexual = AzureContentFilterScorer(harm_categories=[TextCategory.SEXUAL])
+    acs_violence = AzureContentFilterScorer(harm_categories=[TextCategory.VIOLENCE])
+
+    # Build list of scorers to evaluate
+    scorers = [
+        refuasal,
+        refuasal_gpt_4o,
+        refusal_gpt_4o_unsafe,
+        refusal_gpt_4o_unsafe_temp9,
+        acs_t1,
+        acs_t5,
+        acs_t7,
+        acs_with_refusal,
+        scale_gpt_4o_9_threshhold,
+        scale_and_refusal,
+        acs_hate,
+        acs_self_harm,
+        acs_sexual,
+        acs_violence,
+    ]
+
+    scorers.extend(likert_scorers_gpt_4o)
+
+    print(f"\nEvaluating {len(scorers)} scorer(s)...\n")
+
+    # Use tqdm for progress tracking across all scorers
+    scorer_iterator = tqdm(enumerate(scorers, 1), total=len(scorers), desc="Scorers") if tqdm else enumerate(scorers, 1)
+
+    # Evaluate each scorer
+    for i, scorer in scorer_iterator:
+        scorer_name = scorer.__class__.__name__
+        print(f"\n[{i}/{len(scorers)}] Evaluating {scorer_name}...")
+        print("  Status: Starting evaluation (this may take several minutes)...")
+
+        start_time = time.time()
+
+        try:
+            # Run evaluation with production settings:
+            # - num_scorer_trials=3 for variance measurement
+            # - add_to_evaluation_results=True to save to registry
+            print("  Status: Running evaluations...")
+            results = await scorer.evaluate_async(
+                num_scorer_trials=3,
+                max_concurrency=10,
+            )
+
+            elapsed_time = time.time() - start_time
+
+            # Results are saved to disk by evaluate_async() with add_to_evaluation_results=True
+            print("  ✓ Evaluation complete and saved!")
+            print(f"    Elapsed time: {elapsed_time:.1f}s")
+            if results:
+                print(f"    Dataset: {results.dataset_name}")
+
+        except Exception as e:
+            elapsed_time = time.time() - start_time
+            print(f"  ✗ Error evaluating {scorer_name} after {elapsed_time:.1f}s: {e}")
+            print("    Continuing with next scorer...\n")
+            import traceback
+
+            traceback.print_exc()
+            continue
+
+    print("=" * 60)
+    print("Evaluation complete!")
+    print(f"Results saved to: {SCORER_EVALS_PATH}")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    print("=" * 60)
+    print("PyRIT Scorer Evaluation Script")
+    print("=" * 60)
+    print("This script will evaluate multiple scorers against human-labeled")
+    print("datasets. This is a long-running process that may take several")
+    print("minutes to hours depending on the number of scorers and datasets.")
+    print()
+    print("Results will be saved to the registry files in:")
+    print(f"  {SCORER_EVALS_PATH}")
+    print("=" * 60)
+    print()
+
+    try:
+        asyncio.run(evaluate_scorers())
+    except KeyboardInterrupt:
+        print("\n\nEvaluation interrupted by user.")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\nFatal error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
diff --git a/doc/_toc.yml b/doc/_toc.yml
@@ -108,7 +108,8 @@ chapters:
       - file: code/scoring/persuasion_full_conversation_scorer
       - file: code/scoring/prompt_shield_scorer
       - file: code/scoring/generic_scorers
-      - file: code/scoring/scorer_evals
+      - file: code/scoring/8_scorer_metrics
+      - file: code/scoring/9_printing_scorers
     - file: code/memory/0_memory
       sections:
       - file: code/memory/1_sqlite_memory

diff --git a/doc/api.rst b/doc/api.rst
@@ -324,8 +324,10 @@ API Reference
     EmbeddingSupport
     EmbeddingUsageInformation
     ErrorDataTypeSerializer
+    get_all_harm_definitions
     group_conversation_message_pieces_by_sequence
     group_message_pieces_into_conversations
+    HarmDefinition
     Identifier
     ImagePathDataTypeSerializer
     AllowedCategories
@@ -338,6 +340,7 @@ API Reference
     QuestionAnsweringDataset
     QuestionAnsweringEntry
     QuestionChoice
+    ScaleDescription
     ScenarioIdentifier
     ScenarioResult
     Score
@@ -515,6 +518,7 @@ API Reference
 
     AzureContentFilterScorer
     BatchScorer
+    ConsoleScorerPrinter
     ContentClassifierPaths
     ConversationScorer
     create_conversation_scorer
@@ -525,13 +529,16 @@ API Reference
     FloatScaleScorerByCategory
     FloatScaleThresholdScorer
     GandalfScorer
+    get_all_harm_definitions
+    HarmDefinition
     HarmHumanLabeledEntry
     HarmScorerEvaluator
     HarmScorerMetrics
     HumanInTheLoopScorerGradio
     HumanLabeledDataset
     HumanLabeledEntry
     InsecureCodeScorer
+    LikertScaleEvalFiles
     LikertScalePaths
     MarkdownInjectionScorer
     MetricsType
@@ -542,11 +549,18 @@ API Reference
     PlagiarismScorer
     PromptShieldScorer
     QuestionAnswerScorer
+    RegistryUpdateBehavior
     Scorer
+    ScorerEvalDatasetFiles
     ScorerEvaluator
     ScorerIdentifier
     ScorerMetrics
+    ScorerMetricsWithIdentity
+    ScorerPrinter
     ScorerPromptValidator
+    ScaleDescription
+    get_all_harm_metrics
+    get_all_objective_metrics
     SelfAskCategoryScorer
     SelfAskGeneralFloatScaleScorer
     SelfAskGeneralTrueFalseScorer

diff --git a/doc/code/executor/attack/1_prompt_sending_attack.py b/doc/code/executor/attack/1_prompt_sending_attack.py
@@ -193,7 +193,7 @@
 
 # Set up the Azure Content Filter Scorer and SelfAsk Likert Scorer for bias detection
 acf = AzureContentFilterScorer(harm_categories=[TextCategory.HATE])  # Options: HATE, SELF_HARM, SEXUAL, VIOLENCE
-bias_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.FAIRNESS_BIAS_SCALE.value, chat_target=target)
+bias_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.FAIRNESS_BIAS_SCALE, chat_target=target)
 
 # Anything that isn't a refusal is a "success" for this scenario
 objective_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=target))

diff --git a/doc/code/executor/attack/many_shot_jailbreak_attack.ipynb b/doc/code/executor/attack/many_shot_jailbreak_attack.ipynb
@@ -77,7 +77,7 @@
     "prompt_target = OpenAIChatTarget()\n",
     "\n",
     "# Set up the Harm Scorer with the same prompt target to ensure consistency\n",
-    "harm_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.HARM_SCALE.value, chat_target=prompt_target)\n",
+    "harm_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.HARM_SCALE, chat_target=prompt_target)\n",
     "scoring_config = AttackScoringConfig(\n",
     "    auxiliary_scorers=[harm_scorer],\n",
     ")\n",

diff --git a/doc/code/executor/attack/many_shot_jailbreak_attack.py b/doc/code/executor/attack/many_shot_jailbreak_attack.py
@@ -41,7 +41,7 @@
 prompt_target = OpenAIChatTarget()
 
 # Set up the Harm Scorer with the same prompt target to ensure consistency
-harm_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.HARM_SCALE.value, chat_target=prompt_target)
+harm_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.HARM_SCALE, chat_target=prompt_target)
 scoring_config = AttackScoringConfig(
     auxiliary_scorers=[harm_scorer],
 )

diff --git a/doc/code/memory/7_azure_sql_memory_attacks.ipynb b/doc/code/memory/7_azure_sql_memory_attacks.ipynb
@@ -215,7 +215,7 @@
     "\n",
     "# Set up the Azure Content Filter Scorer\n",
     "acf = AzureContentFilterScorer(harm_categories=[TextCategory.HATE])  # Options: HATE, SELF_HARM, SEXUAL, VIOLENCE\n",
-    "bias_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.FAIRNESS_BIAS_SCALE.value, chat_target=target)\n",
+    "bias_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.FAIRNESS_BIAS_SCALE, chat_target=target)\n",
     "\n",
     "scoring_config = AttackScoringConfig(\n",
     "    auxiliary_scorers=[acf, bias_scorer],\n",

diff --git a/doc/code/memory/7_azure_sql_memory_attacks.py b/doc/code/memory/7_azure_sql_memory_attacks.py
@@ -75,7 +75,7 @@
 
 # Set up the Azure Content Filter Scorer
 acf = AzureContentFilterScorer(harm_categories=[TextCategory.HATE])  # Options: HATE, SELF_HARM, SEXUAL, VIOLENCE
-bias_scorer = SelfAskLikertScorer(likert_scale_path=LikertScalePaths.FAIRNESS_BIAS_SCALE.value, chat_target=target)
+bias_scorer = SelfAskLikertScorer(likert_scale=LikertScalePaths.FAIRNESS_BIAS_SCALE, chat_target=target)
 
 scoring_config = AttackScoringConfig(
     auxiliary_scorers=[acf, bias_scorer],

diff --git a/doc/code/scenarios/0_scenarios.py b/doc/code/scenarios/0_scenarios.py
@@ -118,7 +118,7 @@ def __init__(
             name="My Custom Scenario",
             version=self.version,
             strategy_class=MyStrategy,
-            objective_scorer_identifier=objective_scorer.get_identifier() if objective_scorer else None,
+            objective_scorer=objective_scorer,
             scenario_result_id=scenario_result_id,
         )