From 8477420f924ec5d04c7199d4997e9b1c81339b33 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Wed, 6 May 2026 15:17:53 -0700 Subject: [PATCH 1/4] refactor default scenario targets --- .env_example | 5 + pyrit/scenario/core/__init__.py | 5 +- pyrit/scenario/core/scenario.py | 42 +++++++-- .../scenario/core/scenario_target_defaults.py | 91 +++++++++++++++++++ pyrit/scenario/core/scenario_techniques.py | 41 +-------- pyrit/scenario/scenarios/airt/cyber.py | 49 +--------- pyrit/scenario/scenarios/airt/jailbreak.py | 4 +- pyrit/scenario/scenarios/airt/leakage.py | 71 +-------------- pyrit/scenario/scenarios/airt/psychosocial.py | 22 +---- pyrit/scenario/scenarios/airt/scam.py | 66 +------------- .../setup/initializers/components/targets.py | 9 ++ tests/unit/scenario/test_jailbreak.py | 12 ++- tests/unit/scenario/test_rapid_response.py | 2 +- tests/unit/scenario/test_scenario.py | 16 +++- 14 files changed, 183 insertions(+), 252 deletions(-) create mode 100644 pyrit/scenario/core/scenario_target_defaults.py diff --git a/.env_example b/.env_example index fdf9d715e1..b925fb097c 100644 --- a/.env_example +++ b/.env_example @@ -79,6 +79,11 @@ ADVERSARIAL_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" ADVERSARIAL_CHAT_KEY="xxxxx" ADVERSARIAL_CHAT_MODEL="deployment-name" +# Objective Scorer chat target (used in scorers in scenarios) +OBJECTIVE_SCORER_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" +OBJECTIVE_SCORER_CHAT_KEY="xxxxx" +OBJECTIVE_SCORER_CHAT_MODEL="deployment-name" + AZURE_FOUNDRY_DEEPSEEK_ENDPOINT="https://xxxxx.eastus2.models.ai.azure.com" AZURE_FOUNDRY_DEEPSEEK_KEY="xxxxx" AZURE_FOUNDRY_DEEPSEEK_MODEL="" diff --git a/pyrit/scenario/core/__init__.py b/pyrit/scenario/core/__init__.py index c470d31c29..5442c2dd2b 100644 --- a/pyrit/scenario/core/__init__.py +++ b/pyrit/scenario/core/__init__.py @@ -10,9 +10,9 @@ from pyrit.scenario.core.dataset_configuration import EXPLICIT_SEED_GROUPS_KEY, DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target, get_default_scorer_target from pyrit.scenario.core.scenario_techniques import ( SCENARIO_TECHNIQUES, - get_default_adversarial_target, register_scenario_techniques, ) @@ -27,6 +27,7 @@ "Scenario", "ScenarioCompositeStrategy", "ScenarioStrategy", - "get_default_adversarial_target", "register_scenario_techniques", + "get_default_scorer_target", + "get_default_adversarial_target", ] diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 38a450cec4..145abba8ba 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -16,6 +16,7 @@ import uuid from abc import ABC, abstractmethod from collections.abc import Sequence +from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union, cast, get_origin from tqdm.auto import tqdm @@ -27,14 +28,20 @@ from pyrit.memory.memory_models import ScenarioResultEntry from pyrit.models import AttackResult, SeedAttackGroup from pyrit.models.scenario_result import ScenarioIdentifier, ScenarioResult -from pyrit.prompt_target import OpenAIChatTarget, PromptTarget +from pyrit.prompt_target import PromptTarget from pyrit.prompt_target.common.target_requirements import TargetRequirements -from pyrit.registry import ScorerRegistry +from pyrit.registry.object_registries.scorer_registry import ScorerRegistry from 
pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import Scorer, SelfAskRefusalScorer, TrueFalseInverterScorer, TrueFalseScorer +from pyrit.scenario.core.scenario_target_defaults import get_default_scorer_target +from pyrit.score import Scorer, TrueFalseScorer +from pyrit.score.true_false.self_ask_refusal_scorer import SelfAskRefusalScorer +from pyrit.score.true_false.self_ask_true_false_scorer import SelfAskTrueFalseScorer +from pyrit.score.true_false.true_false_composite_scorer import TrueFalseCompositeScorer +from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer +from pyrit.score.true_false.true_false_score_aggregator import TrueFalseScoreAggregator if TYPE_CHECKING: from pyrit.executor.attack.core.attack_config import AttackScoringConfig @@ -107,6 +114,11 @@ class Scenario(ABC): #: what the scenario needs. Validated in ``initialize_async`` once the target is supplied. TARGET_REQUIREMENTS: ClassVar[TargetRequirements] = TargetRequirements() + #: Optional true/false question prompt path for objective scoring. + #: When set, the default objective scorer becomes + #: ``SelfAskTrueFalseScorer(path) AND NOT(SelfAskRefusalScorer)``. + OBJECTIVE_TRUE_FALSE_QUESTION_PATH: ClassVar[Path | None] = None + def __init__( self, *, @@ -310,17 +322,27 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> return technique_name def _get_default_objective_scorer(self) -> TrueFalseScorer: - # Deferred import to avoid circular dependency: + if type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH is not None: + chat_target = get_default_scorer_target() + objective_scorer = SelfAskTrueFalseScorer( + chat_target=chat_target, + true_false_question_path=type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH, + ) + backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + return TrueFalseCompositeScorer( + aggregator=TrueFalseScoreAggregator.AND, + scorers=[objective_scorer, backstop_scorer], + ) + + # Deferred import to avoid circular dependency. from pyrit.setup.initializers.components.scorers import ScorerInitializerTags entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) if entries and isinstance(entries[0].instance, TrueFalseScorer): - scorer = entries[0].instance - logger.info(f"Using registered default objective scorer: {type(scorer).__name__}") - return scorer - scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=OpenAIChatTarget())) - logger.info(f"No registered default objective scorer found, using fallback: {type(scorer).__name__}") - return scorer + return entries[0].instance + + chat_target = get_default_scorer_target() + return TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) def set_params_from_args(self, *, args: dict[str, Any]) -> None: """ diff --git a/pyrit/scenario/core/scenario_target_defaults.py b/pyrit/scenario/core/scenario_target_defaults.py new file mode 100644 index 0000000000..bc6fe084ae --- /dev/null +++ b/pyrit/scenario/core/scenario_target_defaults.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+
+from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget
+from pyrit.prompt_target.common.target_capabilities import CapabilityName
+from pyrit.registry import TargetRegistry
+
+
+def get_default_scorer_target() -> PromptChatTarget:
+    """
+    Resolve the default objective scorer chat target.
+
+    First checks the ``TargetRegistry`` for an ``"objective_scorer_chat"`` entry
+    (populated by ``TargetInitializer`` from ``OBJECTIVE_SCORER_CHAT_*`` env vars).
+    Falls back to a plain ``OpenAIChatTarget``.
+
+    Returns:
+        PromptChatTarget: The resolved objective scorer chat target.
+
+    Raises:
+        ValueError: If the registry entry exists but is not a PromptChatTarget.
+    """
+    return _get_default_chat_target(preferred_target_key="objective_scorer_chat")
+
+
+def get_default_adversarial_target() -> PromptChatTarget:
+    """
+    Resolve the default adversarial chat target.
+
+    First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry
+    (populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars).
+    Falls back to an ``OpenAIChatTarget`` with ``temperature=1.2``.
+
+    Returns:
+        PromptChatTarget: The resolved adversarial chat target.
+
+    Raises:
+        ValueError: If the registered target does not support multi-turn.
+    """
+    return _get_default_chat_target(
+        preferred_target_key="adversarial_chat",
+        required_capabilities={CapabilityName.MULTI_TURN},
+        fallback_temperature=1.2,
+    )
+
+
+def _get_default_chat_target(
+    *,
+    preferred_target_key: str,
+    required_capabilities: set[CapabilityName] | None = None,
+    fallback_temperature: float | None = None,
+) -> PromptChatTarget:
+    """
+    Resolve a chat target from TargetRegistry with configurable fallback behavior.
+
+    Resolution order:
+    1. ``preferred_target_key`` entry from ``TargetRegistry``
+    2. ``OpenAIChatTarget(...)`` with optional temperature
+
+    Args:
+        preferred_target_key (str): TargetRegistry key to resolve first.
+        required_capabilities (set[CapabilityName] | None): Optional capabilities
+            that a resolved target must support.
+        fallback_temperature (float | None): Optional temperature for fallback
+            ``OpenAIChatTarget`` construction.
+
+    Returns:
+        PromptChatTarget: The resolved chat target.
+
+    Raises:
+        ValueError: If the resolved target does not satisfy required capabilities.
+        ValueError: If the registry entry exists but is not a PromptChatTarget.
+ """ + registry = TargetRegistry.get_registry_singleton() + target = registry.get(preferred_target_key) + if target is not None: + # Check required capabilities first (fail fast) + if required_capabilities: + for capability in required_capabilities: + if not target.capabilities.includes(capability=capability): + raise ValueError(f"Registry entry '{preferred_target_key}' must support {capability.value}.") + + # Then check type + if not isinstance(target, PromptChatTarget): + raise ValueError( + f"Registry entry '{preferred_target_key}' must be a PromptChatTarget, but got {type(target).__name__}" + ) + + return target + + return OpenAIChatTarget(temperature=fallback_temperature) diff --git a/pyrit/scenario/core/scenario_techniques.py b/pyrit/scenario/core/scenario_techniques.py index 818ba8a530..46e723e762 100644 --- a/pyrit/scenario/core/scenario_techniques.py +++ b/pyrit/scenario/core/scenario_techniques.py @@ -22,6 +22,7 @@ import inspect import logging from pathlib import Path +from typing import TYPE_CHECKING from pyrit.common.path import EXECUTOR_SEED_PROMPT_PATH from pyrit.executor.attack import ( @@ -34,13 +35,15 @@ ) from pyrit.models import SeedAttackTechniqueGroup, SeedSimulatedConversation from pyrit.models.seeds.seed_simulated_conversation import NextMessageSystemPromptPaths -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget -from pyrit.prompt_target.common.target_capabilities import CapabilityName from pyrit.registry import TargetRegistry from pyrit.registry.object_registries.attack_technique_registry import ( AttackTechniqueRegistry, AttackTechniqueSpec, ) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target + +if TYPE_CHECKING: + from pyrit.prompt_target import PromptChatTarget logger = logging.getLogger(__name__) @@ -99,40 +102,6 @@ ] -# --------------------------------------------------------------------------- -# Default adversarial target -# --------------------------------------------------------------------------- - - -def get_default_adversarial_target() -> PromptChatTarget: - """ - Resolve the default adversarial chat target. - - First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry - (populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars). - Falls back to a plain ``OpenAIChatTarget(temperature=1.2)`` using - ``@apply_defaults`` resolution. - - Returns: - PromptChatTarget: The resolved adversarial chat target. - - Raises: - ValueError: If the registered target does not support multi-turn. - """ - registry = TargetRegistry.get_registry_singleton() - if "adversarial_chat" in registry: - target = registry.get("adversarial_chat") - if target: - if not target.capabilities.includes(capability=CapabilityName.MULTI_TURN): - raise ValueError( - f"Registry entry 'adversarial_chat' must support multi-turn conversations, " - f"but {type(target).__name__} does not." 
- ) - return target - - return OpenAIChatTarget(temperature=1.2) - - # --------------------------------------------------------------------------- # Runtime spec builder # --------------------------------------------------------------------------- diff --git a/pyrit/scenario/scenarios/airt/cyber.py b/pyrit/scenario/scenarios/airt/cyber.py index 7f0250911a..ac67174790 100644 --- a/pyrit/scenario/scenarios/airt/cyber.py +++ b/pyrit/scenario/scenarios/airt/cyber.py @@ -12,26 +12,16 @@ from __future__ import annotations import logging -import os from typing import TYPE_CHECKING, ClassVar -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.common.path import SCORER_SEED_PROMPT_PATH -from pyrit.prompt_target import OpenAIChatTarget from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) if TYPE_CHECKING: from pyrit.scenario.core.scenario_strategy import ScenarioStrategy + from pyrit.score import TrueFalseScorer logger = logging.getLogger(__name__) @@ -74,6 +64,7 @@ class Cyber(Scenario): """ VERSION: int = 2 + OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None @classmethod @@ -138,39 +129,3 @@ def __init__( include_default_baseline=include_baseline, scenario_result_id=scenario_result_id, ) - - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - - The scoring mechanism is as follows: - 1. Detect for the presence of malware using a SelfAskTrueFalseScorer. - 2. Detect for NON-refusal as a backstop. (If the target model refuses to produce malware, we want that - to register as a negative score). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and malware evaluation. - """ - endpoint = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - presence_of_malware = SelfAskTrueFalseScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ), - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml", - ) - - backstop = TrueFalseInverterScorer( - scorer=SelfAskRefusalScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) - ) - ) - - return TrueFalseCompositeScorer( - aggregator=TrueFalseScoreAggregator.AND, scorers=[presence_of_malware, backstop] - ) diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py index 1870fdc0cf..60e7c0146f 100644 --- a/pyrit/scenario/scenarios/airt/jailbreak.py +++ b/pyrit/scenario/scenarios/airt/jailbreak.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import Any, Optional, Union -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.datasets import TextJailBreak from pyrit.executor.attack.core.attack_config import ( @@ -199,9 +198,10 @@ def _create_adversarial_target(self) -> OpenAIChatTarget: OpenAIChatTarget: A fresh adversarial target using an unfiltered endpoint. 
""" endpoint = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") + api_key = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") return OpenAIChatTarget( endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), + api_key=api_key, model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), temperature=1.2, ) diff --git a/pyrit/scenario/scenarios/airt/leakage.py b/pyrit/scenario/scenarios/airt/leakage.py index 801522d82e..687c159809 100644 --- a/pyrit/scenario/scenarios/airt/leakage.py +++ b/pyrit/scenario/scenarios/airt/leakage.py @@ -1,13 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import os from pathlib import Path from typing import Optional from PIL import Image -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.common.path import DATASETS_PATH, SCORER_SEED_PROMPT_PATH from pyrit.executor.attack import ( @@ -22,20 +20,14 @@ from pyrit.models import SeedAttackGroup, SeedObjective from pyrit.prompt_converter import AddImageTextConverter, FirstLetterConverter, PromptConverter from pyrit.prompt_normalizer import PromptConverterConfiguration -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target +from pyrit.score import TrueFalseScorer class LeakageStrategy(ScenarioStrategy): @@ -89,6 +81,7 @@ class Leakage(Scenario): """ VERSION: int = 1 + OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: @@ -155,7 +148,7 @@ def __init__( self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer) # Initialize adversarial chat for multi-turn attacks - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() self._adversarial_config = AttackAdversarialConfig(target=self._adversarial_chat) super().__init__( @@ -168,60 +161,6 @@ def __init__( self._objectives = objectives if objectives else self._get_default_objectives() - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - - The scoring mechanism is as follows: - 1. Detect for objective achieved using a SelfAskTrueFalseScorer with leakage detection. - 2. Detect for NON-refusal as a backstop. (If the target model refuses to leak data, - we want that to register as a negative score). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and leakage evaluation. 
- """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - presence_of_leakage = SelfAskTrueFalseScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ), - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml", - ) - - # Backstop scorer: Ensures the model didn't simply refuse the request. - # Uses SelfAskRefusalScorer inverted - if model refused, this returns False. - # Combined with AND aggregator: response must contain leakage AND not be a refusal. - backstop = TrueFalseInverterScorer( - scorer=SelfAskRefusalScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) - ) - ) - - return TrueFalseCompositeScorer( - aggregator=TrueFalseScoreAggregator.AND, scorers=[presence_of_leakage, backstop] - ) - - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Create and retrieve the default adversarial target. - - Returns: - OpenAIChatTarget: Default adversarial target using an unfiltered endpoint. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _get_default_objectives(self) -> list[str]: """ Get the default seed prompts for leakage tests. diff --git a/pyrit/scenario/scenarios/airt/psychosocial.py b/pyrit/scenario/scenarios/airt/psychosocial.py index 8a0fc924b9..2e156e6f34 100644 --- a/pyrit/scenario/scenarios/airt/psychosocial.py +++ b/pyrit/scenario/scenarios/airt/psychosocial.py @@ -9,7 +9,6 @@ import yaml -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.common.path import DATASETS_PATH from pyrit.executor.attack import ( @@ -37,6 +36,7 @@ from pyrit.scenario.core.scenario_strategy import ( ScenarioStrategy, ) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target from pyrit.score import ( FloatScaleScorer, FloatScaleThresholdScorer, @@ -253,7 +253,7 @@ def __init__( "objectives is deprecated and will be removed in a future version. " "Use dataset_config in initialize_async instead." ) - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() # Merge user-provided configs with defaults (user-provided takes precedence) self._subharm_configs = {**self.DEFAULT_SUBHARM_CONFIGS, **(subharm_configs or {})} @@ -356,21 +356,6 @@ def _filter_by_harm_category( filtered_groups.append(SeedAttackGroup(seeds=filtered_seeds)) return filtered_groups - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Create default adversarial chat target for multi-turn attacks. - - Returns: - OpenAIChatTarget: Default adversarial target, using an unfiltered endpoint. 
- """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=0.7, - ) - def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScorer: """ Create scorer for psychosocial harms evaluation. @@ -404,9 +389,10 @@ def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScore psychosocial_harm_rubric = yaml_data["value"] endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") + api_key = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") azure_openai_chat_target = OpenAIChatTarget( endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), + api_key=api_key, model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), ) diff --git a/pyrit/scenario/scenarios/airt/scam.py b/pyrit/scenario/scenarios/airt/scam.py index e714b73a57..cf693836b8 100644 --- a/pyrit/scenario/scenarios/airt/scam.py +++ b/pyrit/scenario/scenarios/airt/scam.py @@ -2,11 +2,9 @@ # Licensed under the MIT license. import logging -import os from pathlib import Path from typing import TYPE_CHECKING, Any, Optional -from pyrit.auth import get_azure_openai_auth from pyrit.common import Parameter, apply_defaults from pyrit.common.path import ( EXECUTOR_RED_TEAM_PATH, @@ -23,20 +21,14 @@ AttackScoringConfig, ) from pyrit.models import SeedAttackGroup -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target +from pyrit.score import TrueFalseScorer if TYPE_CHECKING: from pyrit.executor.attack.core.attack_strategy import AttackStrategy @@ -91,6 +83,7 @@ class Scam(Scenario): """ VERSION: int = 1 + OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: @@ -172,7 +165,7 @@ def __init__( self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer) # Used for multiturn strategies and RolePlayAttack rephrasing - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() self._adversarial_config = AttackAdversarialConfig(target=self._adversarial_chat) super().__init__( @@ -186,55 +179,6 @@ def __init__( # Will be resolved in _get_atomic_attacks_async self._seed_groups: Optional[list[SeedAttackGroup]] = None - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - The scoring mechanism is as follows: - 1. Detect for objective achieved using a SelfAskTrueFalseScorer with a custom true/false question path. - 2. Detect for NON-refusal as a backstop. 
(If the target model refuses to produce scam materials, we want that - to register as a negative score). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and scam materials evaluation. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - scam_materials = SelfAskTrueFalseScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=0.9, - ), - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml", - ) - - backstop = TrueFalseInverterScorer( - scorer=SelfAskRefusalScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) - ) - ) - - return TrueFalseCompositeScorer(aggregator=TrueFalseScoreAggregator.AND, scorers=[scam_materials, backstop]) - - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Provide an OpenAI target for the role-play rephrasing step. - - Returns: - OpenAIChatTarget: Target that supplies the persuasion script rephrasing. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _resolve_seed_groups(self) -> list[SeedAttackGroup]: """ Resolve seed groups from dataset configuration. diff --git a/pyrit/setup/initializers/components/targets.py b/pyrit/setup/initializers/components/targets.py index 055e5c40d1..3a8049ca3a 100644 --- a/pyrit/setup/initializers/components/targets.py +++ b/pyrit/setup/initializers/components/targets.py @@ -180,6 +180,15 @@ class TargetConfig: temperature=1.2, tags=[TargetInitializerTags.DEFAULT, TargetInitializerTags.ADVERSARIAL], ), + TargetConfig( + registry_name="objective_scorer_chat", + target_class=OpenAIChatTarget, + endpoint_var="OBJECTIVE_SCORER_CHAT_ENDPOINT", + key_var="OBJECTIVE_SCORER_CHAT_KEY", + model_var="OBJECTIVE_SCORER_CHAT_MODEL", + underlying_model_var="OBJECTIVE_SCORER_CHAT_UNDERLYING_MODEL", + tags=[TargetInitializerTags.DEFAULT, TargetInitializerTags.SCORER], + ), TargetConfig( registry_name="azure_foundry_deepseek", target_class=OpenAIChatTarget, diff --git a/tests/unit/scenario/test_jailbreak.py b/tests/unit/scenario/test_jailbreak.py index c873465c6b..1ef7c2090b 100644 --- a/tests/unit/scenario/test_jailbreak.py +++ b/tests/unit/scenario/test_jailbreak.py @@ -15,7 +15,7 @@ from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack from pyrit.identifiers import ComponentIdentifier from pyrit.models import SeedGroup, SeedObjective -from pyrit.prompt_target import OpenAIChatTarget, PromptTarget +from pyrit.prompt_target import PromptTarget from pyrit.scenario.scenarios.airt.jailbreak import Jailbreak, JailbreakStrategy from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer @@ -447,11 +447,13 @@ async def test_no_target_duplication_async( class TestJailbreakAdversarialTarget: """Tests for adversarial target creation and caching.""" - def test_create_adversarial_target_returns_openai_chat_target(self) -> None: - """Test that _create_adversarial_target returns a new OpenAIChatTarget.""" + def test_get_or_create_adversarial_target_returns_prompt_chat_target(self) -> None: + """Test that 
_get_or_create_adversarial_target returns a PromptChatTarget.""" + from pyrit.prompt_target import PromptChatTarget + scenario = Jailbreak() - target = scenario._create_adversarial_target() - assert isinstance(target, OpenAIChatTarget) + target = scenario._get_or_create_adversarial_target() + assert isinstance(target, PromptChatTarget) def test_get_or_create_adversarial_target_reuses_instance(self) -> None: """Test that _get_or_create_adversarial_target returns the same instance on repeated calls.""" diff --git a/tests/unit/scenario/test_rapid_response.py b/tests/unit/scenario/test_rapid_response.py index ddf95df2e6..5b37d95c65 100644 --- a/tests/unit/scenario/test_rapid_response.py +++ b/tests/unit/scenario/test_rapid_response.py @@ -651,7 +651,7 @@ def test_get_default_adversarial_target_capability_check(self): mock_target = MagicMock(spec=PromptTarget) mock_target.capabilities.includes.return_value = False target_registry.register(name="adversarial_chat", instance=mock_target) - with pytest.raises(ValueError, match="must support multi-turn"): + with pytest.raises(ValueError, match="must support"): get_default_adversarial_target() diff --git a/tests/unit/scenario/test_scenario.py b/tests/unit/scenario/test_scenario.py index 2c285dc420..5ca197753f 100644 --- a/tests/unit/scenario/test_scenario.py +++ b/tests/unit/scenario/test_scenario.py @@ -854,12 +854,16 @@ def test_returns_registry_scorer_when_tagged(self, mock_registry_cls) -> None: mock_registry.get_by_tag.return_value = [mock_entry] mock_registry_cls.get_registry_singleton.return_value = mock_registry - result = Scenario._get_default_objective_scorer(MagicMock()) + # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + mock_self = MagicMock() + type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + + result = Scenario._get_default_objective_scorer(mock_self) assert result is mock_scorer - @patch("pyrit.scenario.core.scenario.OpenAIChatTarget") + @patch("pyrit.scenario.core.scenario.get_default_scorer_target") @patch("pyrit.scenario.core.scenario.ScorerRegistry") - def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_oai_target) -> None: + def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_get_scorer_target) -> None: """Test fallback to TrueFalseInverterScorer when no tagged scorer exists.""" from pyrit.score import TrueFalseInverterScorer @@ -867,7 +871,11 @@ def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_oai_ mock_registry.get_by_tag.return_value = [] mock_registry_cls.get_registry_singleton.return_value = mock_registry - result = Scenario._get_default_objective_scorer(MagicMock()) + # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + mock_self = MagicMock() + type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + + result = Scenario._get_default_objective_scorer(mock_self) assert isinstance(result, TrueFalseInverterScorer) From 554554355e74c50b98a37e30370fc5ecebf1c1e7 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Wed, 6 May 2026 16:18:16 -0700 Subject: [PATCH 2/4] rename --- pyrit/scenario/core/scenario.py | 8 +++++--- pyrit/scenario/scenarios/airt/cyber.py | 2 +- pyrit/scenario/scenarios/airt/leakage.py | 2 +- pyrit/scenario/scenarios/airt/scam.py | 2 +- tests/unit/scenario/test_scenario.py | 8 ++++---- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 145abba8ba..92bf290896 100644 --- a/pyrit/scenario/core/scenario.py +++ 
b/pyrit/scenario/core/scenario.py @@ -117,7 +117,7 @@ class Scenario(ABC): #: Optional true/false question prompt path for objective scoring. #: When set, the default objective scorer becomes #: ``SelfAskTrueFalseScorer(path) AND NOT(SelfAskRefusalScorer)``. - OBJECTIVE_TRUE_FALSE_QUESTION_PATH: ClassVar[Path | None] = None + COMPOSITE_SCORER_QUESTIONS_PATH: ClassVar[Path | None] = None def __init__( self, @@ -322,11 +322,13 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> return technique_name def _get_default_objective_scorer(self) -> TrueFalseScorer: - if type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH is not None: + composite_scorer_questions_path = type(self).COMPOSITE_SCORER_QUESTIONS_PATH + + if composite_scorer_questions_path is not None: chat_target = get_default_scorer_target() objective_scorer = SelfAskTrueFalseScorer( chat_target=chat_target, - true_false_question_path=type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH, + true_false_question_path=composite_scorer_questions_path, ) backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) return TrueFalseCompositeScorer( diff --git a/pyrit/scenario/scenarios/airt/cyber.py b/pyrit/scenario/scenarios/airt/cyber.py index ac67174790..49d0445630 100644 --- a/pyrit/scenario/scenarios/airt/cyber.py +++ b/pyrit/scenario/scenarios/airt/cyber.py @@ -64,7 +64,7 @@ class Cyber(Scenario): """ VERSION: int = 2 - OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None @classmethod diff --git a/pyrit/scenario/scenarios/airt/leakage.py b/pyrit/scenario/scenarios/airt/leakage.py index 687c159809..3f58d20622 100644 --- a/pyrit/scenario/scenarios/airt/leakage.py +++ b/pyrit/scenario/scenarios/airt/leakage.py @@ -81,7 +81,7 @@ class Leakage(Scenario): """ VERSION: int = 1 - OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: diff --git a/pyrit/scenario/scenarios/airt/scam.py b/pyrit/scenario/scenarios/airt/scam.py index cf693836b8..da8007ce1e 100644 --- a/pyrit/scenario/scenarios/airt/scam.py +++ b/pyrit/scenario/scenarios/airt/scam.py @@ -83,7 +83,7 @@ class Scam(Scenario): """ VERSION: int = 1 - OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: diff --git a/tests/unit/scenario/test_scenario.py b/tests/unit/scenario/test_scenario.py index 5ca197753f..917ef4922b 100644 --- a/tests/unit/scenario/test_scenario.py +++ b/tests/unit/scenario/test_scenario.py @@ -854,9 +854,9 @@ def test_returns_registry_scorer_when_tagged(self, mock_registry_cls) -> None: mock_registry.get_by_tag.return_value = [mock_entry] mock_registry_cls.get_registry_singleton.return_value = mock_registry - # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None mock_self = MagicMock() - type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None result = 
Scenario._get_default_objective_scorer(mock_self) assert result is mock_scorer @@ -871,9 +871,9 @@ def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_get_ mock_registry.get_by_tag.return_value = [] mock_registry_cls.get_registry_singleton.return_value = mock_registry - # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None mock_self = MagicMock() - type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None result = Scenario._get_default_objective_scorer(mock_self) assert isinstance(result, TrueFalseInverterScorer) From b26f5534a77ca70428d07ad0dbe5f1e7956324c9 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Wed, 6 May 2026 16:27:06 -0700 Subject: [PATCH 3/4] log --- pyrit/scenario/core/scenario.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 92bf290896..f669a0c054 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -331,20 +331,26 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer: true_false_question_path=composite_scorer_questions_path, ) backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) - return TrueFalseCompositeScorer( + scorer = TrueFalseCompositeScorer( aggregator=TrueFalseScoreAggregator.AND, scorers=[objective_scorer, backstop_scorer], ) + logger.info(f"Using composite default objective scorer: {type(scorer).__name__}") + return scorer # Deferred import to avoid circular dependency. from pyrit.setup.initializers.components.scorers import ScorerInitializerTags entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) if entries and isinstance(entries[0].instance, TrueFalseScorer): - return entries[0].instance + scorer = entries[0].instance + logger.info(f"Using registered default objective scorer: {type(scorer).__name__}") + return scorer chat_target = get_default_scorer_target() - return TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + logger.info(f"Using fallback default objective scorer: {type(scorer).__name__}") + return scorer def set_params_from_args(self, *, args: dict[str, Any]) -> None: """ From 6921832008245d1d756c939e20815f1db3652e6e Mon Sep 17 00:00:00 2001 From: behnam Date: Wed, 6 May 2026 22:00:38 -0700 Subject: [PATCH 4/4] more removals --- pyrit/scenario/scenarios/airt/jailbreak.py | 30 +++++-------------- pyrit/scenario/scenarios/airt/psychosocial.py | 15 +++------- .../scenarios/foundry/red_team_agent.py | 15 ++-------- 3 files changed, 13 insertions(+), 47 deletions(-) diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py index 60e7c0146f..69673e766c 100644 --- a/pyrit/scenario/scenarios/airt/jailbreak.py +++ b/pyrit/scenario/scenarios/airt/jailbreak.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import os from pathlib import Path from typing import Any, Optional, Union @@ -19,12 +18,13 @@ from pyrit.models import SeedAttackGroup from pyrit.prompt_converter import TextJailbreakConverter from pyrit.prompt_normalizer import PromptConverterConfiguration -from pyrit.prompt_target import OpenAIChatTarget +from pyrit.prompt_target.common.prompt_chat_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target from pyrit.score import ( TrueFalseScorer, ) @@ -161,7 +161,7 @@ def __init__( self._num_templates = num_templates self._num_attempts = num_attempts - self._adversarial_target: Optional[OpenAIChatTarget] = None + self._adversarial_target: Optional[PromptChatTarget] = None # Note that num_templates and jailbreak_names are mutually exclusive. # If self._num_templates is None, then this returns all discoverable jailbreak templates. @@ -190,34 +190,18 @@ def __init__( # Will be resolved in _get_atomic_attacks_async self._seed_groups: Optional[list[SeedAttackGroup]] = None - def _create_adversarial_target(self) -> OpenAIChatTarget: - """ - Create a new adversarial target instance. - - Returns: - OpenAIChatTarget: A fresh adversarial target using an unfiltered endpoint. - """ - endpoint = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - api_key = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=api_key, - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - - def _get_or_create_adversarial_target(self) -> OpenAIChatTarget: + def _get_or_create_adversarial_target(self) -> PromptChatTarget: """ Return the shared adversarial target, creating it on first access. - Reuses a single OpenAIChatTarget instance across all role-play attacks + Reuses a single PromptChatTarget instance across all role-play attacks to avoid repeated client and TLS setup. Returns: - OpenAIChatTarget: The shared adversarial target. + PromptChatTarget: The shared adversarial target. """ if self._adversarial_target is None: - self._adversarial_target = self._create_adversarial_target() + self._adversarial_target = get_default_adversarial_target() return self._adversarial_target def _resolve_seed_groups(self) -> list[SeedAttackGroup]: diff --git a/pyrit/scenario/scenarios/airt/psychosocial.py b/pyrit/scenario/scenarios/airt/psychosocial.py index 2e156e6f34..308e6db50e 100644 --- a/pyrit/scenario/scenarios/airt/psychosocial.py +++ b/pyrit/scenario/scenarios/airt/psychosocial.py @@ -2,7 +2,6 @@ # Licensed under the MIT license. 
import logging -import os import pathlib from dataclasses import dataclass from typing import Any, Optional, TypeVar @@ -26,7 +25,7 @@ from pyrit.prompt_normalizer.prompt_converter_configuration import ( PromptConverterConfiguration, ) -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.prompt_target.common.target_capabilities import CapabilityName from pyrit.prompt_target.common.target_requirements import TargetRequirements from pyrit.scenario.core.atomic_attack import AtomicAttack @@ -36,7 +35,7 @@ from pyrit.scenario.core.scenario_strategy import ( ScenarioStrategy, ) -from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target, get_default_scorer_target from pyrit.score import ( FloatScaleScorer, FloatScaleThresholdScorer, @@ -388,17 +387,11 @@ def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScore # Extract the 'value' field which contains the actual rubric text psychosocial_harm_rubric = yaml_data["value"] - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - api_key = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") - azure_openai_chat_target = OpenAIChatTarget( - endpoint=endpoint, - api_key=api_key, - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) + scorer_target = get_default_scorer_target() # Create the base crisis scorer psych_scorer = SelfAskGeneralFloatScaleScorer( - chat_target=azure_openai_chat_target, + chat_target=scorer_target, system_prompt_format_string=psychosocial_harm_rubric, rationale_output_key="reasoning", # Match the YAML JSON schema key category="psychosocial_harm", diff --git a/pyrit/scenario/scenarios/foundry/red_team_agent.py b/pyrit/scenario/scenarios/foundry/red_team_agent.py index a875e186a5..6b8f271d9b 100644 --- a/pyrit/scenario/scenarios/foundry/red_team_agent.py +++ b/pyrit/scenario/scenarios/foundry/red_team_agent.py @@ -10,13 +10,11 @@ """ import logging -import os from collections.abc import Sequence from dataclasses import dataclass, field from inspect import signature from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast -from pyrit.auth import get_azure_openai_auth from pyrit.common import REQUIRED_VALUE, apply_defaults from pyrit.datasets import TextJailBreak from pyrit.executor.attack import ( @@ -62,12 +60,12 @@ ) from pyrit.prompt_target import PromptTarget from pyrit.prompt_target.common.prompt_chat_target import PromptChatTarget -from pyrit.prompt_target.openai.openai_chat_target import OpenAIChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target if TYPE_CHECKING: from pyrit.executor.attack.core.attack_strategy import AttackStrategy @@ -270,7 +268,7 @@ def __init__( Raises: ValueError: If attack_strategies is empty or contains unsupported strategies. 
""" - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() if not attack_scoring_config: attack_scoring_config = AttackScoringConfig(objective_scorer=self._get_default_objective_scorer()) self._attack_scoring_config = attack_scoring_config @@ -426,15 +424,6 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: return [self._get_attack_from_strategy(composition) for composition in self._scenario_composites] - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _get_attack_from_strategy(self, composite: FoundryComposite) -> AtomicAttack: """ Get an atomic attack for the specified FoundryComposite.