diff --git a/.env_example b/.env_example index fdf9d715e1..b925fb097c 100644 --- a/.env_example +++ b/.env_example @@ -79,6 +79,11 @@ ADVERSARIAL_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" ADVERSARIAL_CHAT_KEY="xxxxx" ADVERSARIAL_CHAT_MODEL="deployment-name" +# Objective Scorer chat target (used in scorers in scenarios) +OBJECTIVE_SCORER_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" +OBJECTIVE_SCORER_CHAT_KEY="xxxxx" +OBJECTIVE_SCORER_CHAT_MODEL="deployment-name" + AZURE_FOUNDRY_DEEPSEEK_ENDPOINT="https://xxxxx.eastus2.models.ai.azure.com" AZURE_FOUNDRY_DEEPSEEK_KEY="xxxxx" AZURE_FOUNDRY_DEEPSEEK_MODEL="" diff --git a/pyrit/scenario/core/__init__.py b/pyrit/scenario/core/__init__.py index 06304d3715..b1d3247857 100644 --- a/pyrit/scenario/core/__init__.py +++ b/pyrit/scenario/core/__init__.py @@ -10,9 +10,9 @@ from pyrit.scenario.core.dataset_configuration import EXPLICIT_SEED_GROUPS_KEY, DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target, get_default_scorer_target from pyrit.scenario.core.scenario_techniques import ( SCENARIO_TECHNIQUES, - get_default_adversarial_target, register_scenario_techniques, ) @@ -28,6 +28,7 @@ "ScenarioCompositeStrategy", "ScenarioStrategy", "ScorerOverridePolicy", - "get_default_adversarial_target", "register_scenario_techniques", + "get_default_scorer_target", + "get_default_adversarial_target", ] diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 089bed4267..42e8222d0d 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -16,6 +16,7 @@ import uuid from abc import ABC, abstractmethod from collections.abc import Sequence +from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union, cast, get_origin from tqdm.auto 
import tqdm @@ -27,14 +28,20 @@ from pyrit.memory.memory_models import ScenarioResultEntry from pyrit.models import AttackResult, SeedAttackGroup from pyrit.models.scenario_result import ScenarioIdentifier, ScenarioResult -from pyrit.prompt_target import OpenAIChatTarget, PromptTarget +from pyrit.prompt_target import PromptTarget from pyrit.prompt_target.common.target_requirements import TargetRequirements -from pyrit.registry import ScorerRegistry +from pyrit.registry.object_registries.scorer_registry import ScorerRegistry from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import Scorer, SelfAskRefusalScorer, TrueFalseInverterScorer, TrueFalseScorer +from pyrit.scenario.core.scenario_target_defaults import get_default_scorer_target +from pyrit.score import Scorer, TrueFalseScorer +from pyrit.score.true_false.self_ask_refusal_scorer import SelfAskRefusalScorer +from pyrit.score.true_false.self_ask_true_false_scorer import SelfAskTrueFalseScorer +from pyrit.score.true_false.true_false_composite_scorer import TrueFalseCompositeScorer +from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer +from pyrit.score.true_false.true_false_score_aggregator import TrueFalseScoreAggregator if TYPE_CHECKING: from pyrit.executor.attack.core.attack_config import AttackScoringConfig @@ -107,6 +114,11 @@ class Scenario(ABC): #: what the scenario needs. Validated in ``initialize_async`` once the target is supplied. TARGET_REQUIREMENTS: ClassVar[TargetRequirements] = TargetRequirements() + #: Optional true/false question prompt path for objective scoring. + #: When set, the default objective scorer becomes + #: ``SelfAskTrueFalseScorer(path) AND NOT(SelfAskRefusalScorer)``. 
+ COMPOSITE_SCORER_QUESTIONS_PATH: ClassVar[Path | None] = None + def __init__( self, *, @@ -310,7 +322,23 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> return technique_name def _get_default_objective_scorer(self) -> TrueFalseScorer: - # Deferred import to avoid circular dependency: + composite_scorer_questions_path = type(self).COMPOSITE_SCORER_QUESTIONS_PATH + + if composite_scorer_questions_path is not None: + chat_target = get_default_scorer_target() + objective_scorer = SelfAskTrueFalseScorer( + chat_target=chat_target, + true_false_question_path=composite_scorer_questions_path, + ) + backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + scorer = TrueFalseCompositeScorer( + aggregator=TrueFalseScoreAggregator.AND, + scorers=[objective_scorer, backstop_scorer], + ) + logger.info(f"Using composite default objective scorer: {type(scorer).__name__}") + return scorer + + # Deferred import to avoid circular dependency. 
from pyrit.setup.initializers.components.scorers import ScorerInitializerTags entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) @@ -318,8 +346,10 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer: scorer = entries[0].instance logger.info(f"Using registered default objective scorer: {type(scorer).__name__}") return scorer - scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=OpenAIChatTarget())) - logger.info(f"No registered default objective scorer found, using fallback: {type(scorer).__name__}") + + chat_target = get_default_scorer_target() + scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + logger.info(f"Using fallback default objective scorer: {type(scorer).__name__}") return scorer def set_params_from_args(self, *, args: dict[str, Any]) -> None: diff --git a/pyrit/scenario/core/scenario_target_defaults.py b/pyrit/scenario/core/scenario_target_defaults.py new file mode 100644 index 0000000000..bc6fe084ae --- /dev/null +++ b/pyrit/scenario/core/scenario_target_defaults.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target.common.target_capabilities import CapabilityName +from pyrit.registry import TargetRegistry + + +def get_default_scorer_target() -> PromptChatTarget: + """ + Resolve the default objective scorer chat target. + + First checks the ``TargetRegistry`` for an ``"objective_scorer_chat"`` entry + (populated by ``TargetInitializer`` from ``OBJECTIVE_SCORER_CHAT_*`` env vars). + Falls back to a plain ``OpenAIChatTarget``. + + Returns: + PromptChatTarget: The resolved objective scorer chat target. + + Raises: + ValueError: If the registered target is not a ``PromptChatTarget``. 
+ """ + return _get_default_chat_target(preferred_target_key="objective_scorer_chat") + + +def get_default_adversarial_target() -> PromptChatTarget: + """ + Resolve the default adversarial chat target. + + First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry + (populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars). + Falls back to a plain ``OpenAIChatTarget`` with ``temperature=1.2``. + + Returns: + PromptChatTarget: The resolved adversarial chat target. + + Raises: + ValueError: If the registered target does not support multi-turn. + """ + return _get_default_chat_target( + preferred_target_key="adversarial_chat", + required_capabilities={CapabilityName.MULTI_TURN}, + fallback_temperature=1.2, + ) + + +def _get_default_chat_target( + *, + preferred_target_key: str, + required_capabilities: set[CapabilityName] | None = None, + fallback_temperature: float | None = None, +) -> PromptChatTarget: + """ + Resolve a chat target from TargetRegistry with configurable fallback behavior. + + Resolution order: + 1. ``preferred_target_key`` entry from ``TargetRegistry`` + 2. ``OpenAIChatTarget(...)`` with optional temperature + + Args: + preferred_target_key (str): TargetRegistry key to resolve first. + required_capabilities (set[CapabilityName] | None): Optional capabilities + that a resolved target must support. + fallback_temperature (float | None): Optional temperature for fallback + ``OpenAIChatTarget`` construction. + + Returns: + PromptChatTarget: The resolved chat target. + + Raises: + ValueError: If the resolved target does not satisfy required capabilities. + ValueError: If the registry entry exists but is not a PromptChatTarget. 
+ """ + registry = TargetRegistry.get_registry_singleton() + target = registry.get(preferred_target_key) + if target is not None: + # Check required capabilities first (fail fast) + if required_capabilities: + for capability in required_capabilities: + if not target.capabilities.includes(capability=capability): + raise ValueError(f"Registry entry '{preferred_target_key}' must support {capability.value}.") + + # Then check type + if not isinstance(target, PromptChatTarget): + raise ValueError( + f"Registry entry '{preferred_target_key}' must be a PromptChatTarget, but got {type(target).__name__}" + ) + + return target + + return OpenAIChatTarget(temperature=fallback_temperature) diff --git a/pyrit/scenario/core/scenario_techniques.py b/pyrit/scenario/core/scenario_techniques.py index 2e405ce4c8..b07ded53cd 100644 --- a/pyrit/scenario/core/scenario_techniques.py +++ b/pyrit/scenario/core/scenario_techniques.py @@ -22,6 +22,7 @@ import inspect import logging from pathlib import Path +from typing import TYPE_CHECKING from pyrit.common.path import EXECUTOR_SEED_PROMPT_PATH from pyrit.executor.attack import ( @@ -34,13 +35,15 @@ ) from pyrit.models import SeedAttackTechniqueGroup, SeedSimulatedConversation from pyrit.models.seeds.seed_simulated_conversation import NextMessageSystemPromptPaths -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget -from pyrit.prompt_target.common.target_capabilities import CapabilityName from pyrit.registry import TargetRegistry from pyrit.registry.object_registries.attack_technique_registry import ( AttackTechniqueRegistry, AttackTechniqueSpec, ) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target + +if TYPE_CHECKING: + from pyrit.prompt_target import PromptChatTarget logger = logging.getLogger(__name__) @@ -98,40 +101,6 @@ ] -# --------------------------------------------------------------------------- -# Default adversarial target -# 
--------------------------------------------------------------------------- - - -def get_default_adversarial_target() -> PromptChatTarget: - """ - Resolve the default adversarial chat target. - - First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry - (populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars). - Falls back to a plain ``OpenAIChatTarget(temperature=1.2)`` using - ``@apply_defaults`` resolution. - - Returns: - PromptChatTarget: The resolved adversarial chat target. - - Raises: - ValueError: If the registered target does not support multi-turn. - """ - registry = TargetRegistry.get_registry_singleton() - if "adversarial_chat" in registry: - target = registry.get("adversarial_chat") - if target: - if not target.capabilities.includes(capability=CapabilityName.MULTI_TURN): - raise ValueError( - f"Registry entry 'adversarial_chat' must support multi-turn conversations, " - f"but {type(target).__name__} does not." - ) - return target - - return OpenAIChatTarget(temperature=1.2) - - # --------------------------------------------------------------------------- # Runtime spec builder # --------------------------------------------------------------------------- diff --git a/pyrit/scenario/scenarios/airt/cyber.py b/pyrit/scenario/scenarios/airt/cyber.py index 7ff874be1b..9c7abbe022 100644 --- a/pyrit/scenario/scenarios/airt/cyber.py +++ b/pyrit/scenario/scenarios/airt/cyber.py @@ -8,20 +8,12 @@ from pyrit.common import apply_defaults from pyrit.common.path import SCORER_SEED_PROMPT_PATH -from pyrit.prompt_target import OpenAIChatTarget from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) if TYPE_CHECKING: from pyrit.scenario.core.scenario_strategy import ScenarioStrategy + from 
pyrit.score import TrueFalseScorer logger = logging.getLogger(__name__) @@ -64,6 +56,7 @@ class Cyber(Scenario): """ VERSION: int = 2 + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None @classmethod @@ -128,32 +121,3 @@ def __init__( include_default_baseline=include_baseline, scenario_result_id=scenario_result_id, ) - - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - - The scoring mechanism is as follows: - 1. Detect for the presence of malware using a SelfAskTrueFalseScorer. - 2. Detect for NON-refusal as a backstop. (If the target model refuses to produce malware, we want that - to register as a negative score). - - The chat target is resolved from the base class default scorer if available, - otherwise falls back to OpenAIChatTarget(). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and malware evaluation. - """ - base_scorer = super()._get_default_objective_scorer() - chat_target = base_scorer.get_chat_target() or OpenAIChatTarget() - - presence_of_malware = SelfAskTrueFalseScorer( - chat_target=chat_target, - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml", - ) - - backstop = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) - - return TrueFalseCompositeScorer( - aggregator=TrueFalseScoreAggregator.AND, scorers=[presence_of_malware, backstop] - ) diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py index 1870fdc0cf..69673e766c 100644 --- a/pyrit/scenario/scenarios/airt/jailbreak.py +++ b/pyrit/scenario/scenarios/airt/jailbreak.py @@ -1,11 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import os from pathlib import Path from typing import Any, Optional, Union -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.datasets import TextJailBreak from pyrit.executor.attack.core.attack_config import ( @@ -20,12 +18,13 @@ from pyrit.models import SeedAttackGroup from pyrit.prompt_converter import TextJailbreakConverter from pyrit.prompt_normalizer import PromptConverterConfiguration -from pyrit.prompt_target import OpenAIChatTarget +from pyrit.prompt_target.common.prompt_chat_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target from pyrit.score import ( TrueFalseScorer, ) @@ -162,7 +161,7 @@ def __init__( self._num_templates = num_templates self._num_attempts = num_attempts - self._adversarial_target: Optional[OpenAIChatTarget] = None + self._adversarial_target: Optional[PromptChatTarget] = None # Note that num_templates and jailbreak_names are mutually exclusive. # If self._num_templates is None, then this returns all discoverable jailbreak templates. @@ -191,33 +190,18 @@ def __init__( # Will be resolved in _get_atomic_attacks_async self._seed_groups: Optional[list[SeedAttackGroup]] = None - def _create_adversarial_target(self) -> OpenAIChatTarget: - """ - Create a new adversarial target instance. - - Returns: - OpenAIChatTarget: A fresh adversarial target using an unfiltered endpoint. 
- """ - endpoint = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - - def _get_or_create_adversarial_target(self) -> OpenAIChatTarget: + def _get_or_create_adversarial_target(self) -> PromptChatTarget: """ Return the shared adversarial target, creating it on first access. - Reuses a single OpenAIChatTarget instance across all role-play attacks + Reuses a single PromptChatTarget instance across all role-play attacks to avoid repeated client and TLS setup. Returns: - OpenAIChatTarget: The shared adversarial target. + PromptChatTarget: The shared adversarial target. """ if self._adversarial_target is None: - self._adversarial_target = self._create_adversarial_target() + self._adversarial_target = get_default_adversarial_target() return self._adversarial_target def _resolve_seed_groups(self) -> list[SeedAttackGroup]: diff --git a/pyrit/scenario/scenarios/airt/leakage.py b/pyrit/scenario/scenarios/airt/leakage.py index a3a1826aef..8be71be6ef 100644 --- a/pyrit/scenario/scenarios/airt/leakage.py +++ b/pyrit/scenario/scenarios/airt/leakage.py @@ -14,7 +14,6 @@ ) from pyrit.prompt_converter import AddImageTextConverter, FirstLetterConverter from pyrit.prompt_normalizer import PromptConverterConfiguration -from pyrit.prompt_target import OpenAIChatTarget from pyrit.registry.object_registries.attack_technique_registry import ( AttackTechniqueRegistry, AttackTechniqueSpec, @@ -22,18 +21,12 @@ from pyrit.registry.tag_query import TagQuery from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) +from pyrit.scenario.core.scenario_strategy 
import ScenarioStrategy if TYPE_CHECKING: from pyrit.scenario.core.attack_technique_factory import AttackTechniqueFactory from pyrit.scenario.core.scenario_strategy import ScenarioStrategy + from pyrit.score import TrueFalseScorer logger = logging.getLogger(__name__) @@ -104,6 +97,7 @@ class Leakage(Scenario): VERSION: int = 2 _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: @@ -159,34 +153,6 @@ def __init__( scenario_result_id=scenario_result_id, ) - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - - Uses a composite scorer: - 1. SelfAskTrueFalseScorer with leakage detection prompt. - 2. Inverted SelfAskRefusalScorer as backstop (response must not be a refusal). - - The chat target is resolved from the base class default scorer if available, - otherwise falls back to OpenAIChatTarget(). - - Returns: - TrueFalseCompositeScorer: A composite scorer combining leakage detection and refusal backstop. - """ - base_scorer = super()._get_default_objective_scorer() - chat_target = base_scorer.get_chat_target() or OpenAIChatTarget() - - presence_of_leakage = SelfAskTrueFalseScorer( - chat_target=chat_target, - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml", - ) - - backstop = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) - - return TrueFalseCompositeScorer( - aggregator=TrueFalseScoreAggregator.AND, scorers=[presence_of_leakage, backstop] - ) - def _get_attack_technique_factories(self) -> dict[str, AttackTechniqueFactory]: """ Return core + leakage-specific attack technique factories. 
diff --git a/pyrit/scenario/scenarios/airt/psychosocial.py b/pyrit/scenario/scenarios/airt/psychosocial.py index 8a0fc924b9..308e6db50e 100644 --- a/pyrit/scenario/scenarios/airt/psychosocial.py +++ b/pyrit/scenario/scenarios/airt/psychosocial.py @@ -2,14 +2,12 @@ # Licensed under the MIT license. import logging -import os import pathlib from dataclasses import dataclass from typing import Any, Optional, TypeVar import yaml -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.common.path import DATASETS_PATH from pyrit.executor.attack import ( @@ -27,7 +25,7 @@ from pyrit.prompt_normalizer.prompt_converter_configuration import ( PromptConverterConfiguration, ) -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.prompt_target.common.target_capabilities import CapabilityName from pyrit.prompt_target.common.target_requirements import TargetRequirements from pyrit.scenario.core.atomic_attack import AtomicAttack @@ -37,6 +35,7 @@ from pyrit.scenario.core.scenario_strategy import ( ScenarioStrategy, ) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target, get_default_scorer_target from pyrit.score import ( FloatScaleScorer, FloatScaleThresholdScorer, @@ -253,7 +252,7 @@ def __init__( "objectives is deprecated and will be removed in a future version. " "Use dataset_config in initialize_async instead." 
) - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() # Merge user-provided configs with defaults (user-provided takes precedence) self._subharm_configs = {**self.DEFAULT_SUBHARM_CONFIGS, **(subharm_configs or {})} @@ -356,21 +355,6 @@ def _filter_by_harm_category( filtered_groups.append(SeedAttackGroup(seeds=filtered_seeds)) return filtered_groups - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Create default adversarial chat target for multi-turn attacks. - - Returns: - OpenAIChatTarget: Default adversarial target, using an unfiltered endpoint. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=0.7, - ) - def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScorer: """ Create scorer for psychosocial harms evaluation. 
@@ -403,16 +387,11 @@ def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScore # Extract the 'value' field which contains the actual rubric text psychosocial_harm_rubric = yaml_data["value"] - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - azure_openai_chat_target = OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) + scorer_target = get_default_scorer_target() # Create the base crisis scorer psych_scorer = SelfAskGeneralFloatScaleScorer( - chat_target=azure_openai_chat_target, + chat_target=scorer_target, system_prompt_format_string=psychosocial_harm_rubric, rationale_output_key="reasoning", # Match the YAML JSON schema key category="psychosocial_harm", diff --git a/pyrit/scenario/scenarios/airt/scam.py b/pyrit/scenario/scenarios/airt/scam.py index e714b73a57..da8007ce1e 100644 --- a/pyrit/scenario/scenarios/airt/scam.py +++ b/pyrit/scenario/scenarios/airt/scam.py @@ -2,11 +2,9 @@ # Licensed under the MIT license. 
import logging -import os from pathlib import Path from typing import TYPE_CHECKING, Any, Optional -from pyrit.auth import get_azure_openai_auth from pyrit.common import Parameter, apply_defaults from pyrit.common.path import ( EXECUTOR_RED_TEAM_PATH, @@ -23,20 +21,14 @@ AttackScoringConfig, ) from pyrit.models import SeedAttackGroup -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target +from pyrit.score import TrueFalseScorer if TYPE_CHECKING: from pyrit.executor.attack.core.attack_strategy import AttackStrategy @@ -91,6 +83,7 @@ class Scam(Scenario): """ VERSION: int = 1 + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: @@ -172,7 +165,7 @@ def __init__( self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer) # Used for multiturn strategies and RolePlayAttack rephrasing - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() self._adversarial_config = AttackAdversarialConfig(target=self._adversarial_chat) super().__init__( @@ -186,55 +179,6 @@ def __init__( # Will be resolved in _get_atomic_attacks_async self._seed_groups: 
Optional[list[SeedAttackGroup]] = None - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - The scoring mechanism is as follows: - 1. Detect for objective achieved using a SelfAskTrueFalseScorer with a custom true/false question path. - 2. Detect for NON-refusal as a backstop. (If the target model refuses to produce scam materials, we want that - to register as a negative score). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and scam materials evaluation. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - scam_materials = SelfAskTrueFalseScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=0.9, - ), - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml", - ) - - backstop = TrueFalseInverterScorer( - scorer=SelfAskRefusalScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) - ) - ) - - return TrueFalseCompositeScorer(aggregator=TrueFalseScoreAggregator.AND, scorers=[scam_materials, backstop]) - - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Provide an OpenAI target for the role-play rephrasing step. - - Returns: - OpenAIChatTarget: Target that supplies the persuasion script rephrasing. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _resolve_seed_groups(self) -> list[SeedAttackGroup]: """ Resolve seed groups from dataset configuration. 
diff --git a/pyrit/scenario/scenarios/foundry/red_team_agent.py b/pyrit/scenario/scenarios/foundry/red_team_agent.py index a875e186a5..6b8f271d9b 100644 --- a/pyrit/scenario/scenarios/foundry/red_team_agent.py +++ b/pyrit/scenario/scenarios/foundry/red_team_agent.py @@ -10,13 +10,11 @@ """ import logging -import os from collections.abc import Sequence from dataclasses import dataclass, field from inspect import signature from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast -from pyrit.auth import get_azure_openai_auth from pyrit.common import REQUIRED_VALUE, apply_defaults from pyrit.datasets import TextJailBreak from pyrit.executor.attack import ( @@ -62,12 +60,12 @@ ) from pyrit.prompt_target import PromptTarget from pyrit.prompt_target.common.prompt_chat_target import PromptChatTarget -from pyrit.prompt_target.openai.openai_chat_target import OpenAIChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target if TYPE_CHECKING: from pyrit.executor.attack.core.attack_strategy import AttackStrategy @@ -270,7 +268,7 @@ def __init__( Raises: ValueError: If attack_strategies is empty or contains unsupported strategies. 
""" - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() if not attack_scoring_config: attack_scoring_config = AttackScoringConfig(objective_scorer=self._get_default_objective_scorer()) self._attack_scoring_config = attack_scoring_config @@ -426,15 +424,6 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: return [self._get_attack_from_strategy(composition) for composition in self._scenario_composites] - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _get_attack_from_strategy(self, composite: FoundryComposite) -> AtomicAttack: """ Get an atomic attack for the specified FoundryComposite. 
diff --git a/pyrit/setup/initializers/components/targets.py b/pyrit/setup/initializers/components/targets.py index 055e5c40d1..3a8049ca3a 100644 --- a/pyrit/setup/initializers/components/targets.py +++ b/pyrit/setup/initializers/components/targets.py @@ -180,6 +180,15 @@ class TargetConfig: temperature=1.2, tags=[TargetInitializerTags.DEFAULT, TargetInitializerTags.ADVERSARIAL], ), + TargetConfig( + registry_name="objective_scorer_chat", + target_class=OpenAIChatTarget, + endpoint_var="OBJECTIVE_SCORER_CHAT_ENDPOINT", + key_var="OBJECTIVE_SCORER_CHAT_KEY", + model_var="OBJECTIVE_SCORER_CHAT_MODEL", + underlying_model_var="OBJECTIVE_SCORER_CHAT_UNDERLYING_MODEL", + tags=[TargetInitializerTags.DEFAULT, TargetInitializerTags.SCORER], + ), TargetConfig( registry_name="azure_foundry_deepseek", target_class=OpenAIChatTarget, diff --git a/tests/unit/scenario/test_jailbreak.py b/tests/unit/scenario/test_jailbreak.py index c873465c6b..1ef7c2090b 100644 --- a/tests/unit/scenario/test_jailbreak.py +++ b/tests/unit/scenario/test_jailbreak.py @@ -15,7 +15,7 @@ from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack from pyrit.identifiers import ComponentIdentifier from pyrit.models import SeedGroup, SeedObjective -from pyrit.prompt_target import OpenAIChatTarget, PromptTarget +from pyrit.prompt_target import PromptTarget from pyrit.scenario.scenarios.airt.jailbreak import Jailbreak, JailbreakStrategy from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer @@ -447,11 +447,13 @@ async def test_no_target_duplication_async( class TestJailbreakAdversarialTarget: """Tests for adversarial target creation and caching.""" - def test_create_adversarial_target_returns_openai_chat_target(self) -> None: - """Test that _create_adversarial_target returns a new OpenAIChatTarget.""" + def test_get_or_create_adversarial_target_returns_prompt_chat_target(self) -> None: + """Test that _get_or_create_adversarial_target returns a 
PromptChatTarget.""" + from pyrit.prompt_target import PromptChatTarget + scenario = Jailbreak() - target = scenario._create_adversarial_target() - assert isinstance(target, OpenAIChatTarget) + target = scenario._get_or_create_adversarial_target() + assert isinstance(target, PromptChatTarget) def test_get_or_create_adversarial_target_reuses_instance(self) -> None: """Test that _get_or_create_adversarial_target returns the same instance on repeated calls.""" diff --git a/tests/unit/scenario/test_rapid_response.py b/tests/unit/scenario/test_rapid_response.py index ddf95df2e6..5b37d95c65 100644 --- a/tests/unit/scenario/test_rapid_response.py +++ b/tests/unit/scenario/test_rapid_response.py @@ -651,7 +651,7 @@ def test_get_default_adversarial_target_capability_check(self): mock_target = MagicMock(spec=PromptTarget) mock_target.capabilities.includes.return_value = False target_registry.register(name="adversarial_chat", instance=mock_target) - with pytest.raises(ValueError, match="must support multi-turn"): + with pytest.raises(ValueError, match="must support"): get_default_adversarial_target() diff --git a/tests/unit/scenario/test_scenario.py b/tests/unit/scenario/test_scenario.py index bbead38407..7e1b1c2e64 100644 --- a/tests/unit/scenario/test_scenario.py +++ b/tests/unit/scenario/test_scenario.py @@ -854,12 +854,16 @@ def test_returns_registry_scorer_when_tagged(self, mock_registry_cls) -> None: mock_registry.get_by_tag.return_value = [mock_entry] mock_registry_cls.get_registry_singleton.return_value = mock_registry - result = Scenario._get_default_objective_scorer(MagicMock()) + # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None + mock_self = MagicMock() + type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None + + result = Scenario._get_default_objective_scorer(mock_self) assert result is mock_scorer - @patch("pyrit.scenario.core.scenario.OpenAIChatTarget") + @patch("pyrit.scenario.core.scenario.get_default_scorer_target") 
@patch("pyrit.scenario.core.scenario.ScorerRegistry") - def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_oai_target) -> None: + def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_get_scorer_target) -> None: """Test fallback to TrueFalseInverterScorer when no tagged scorer exists.""" from pyrit.score import TrueFalseInverterScorer @@ -867,7 +871,11 @@ def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_oai_ mock_registry.get_by_tag.return_value = [] mock_registry_cls.get_registry_singleton.return_value = mock_registry - result = Scenario._get_default_objective_scorer(MagicMock()) + # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None + mock_self = MagicMock() + type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None + + result = Scenario._get_default_objective_scorer(mock_self) assert isinstance(result, TrueFalseInverterScorer)