diff --git a/pyrit/datasets/jailbreak/text_jailbreak.py b/pyrit/datasets/jailbreak/text_jailbreak.py
index 737c0c938..9e6012b89 100644
--- a/pyrit/datasets/jailbreak/text_jailbreak.py
+++ b/pyrit/datasets/jailbreak/text_jailbreak.py
@@ -104,12 +104,12 @@ def __init__(
         self.template.value = self.template.render_template_value_silent(**kwargs)
 
     @classmethod
-    def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
+    def get_jailbreak_templates(cls, k: Optional[int] = None) -> List[str]:
         """
         Retrieve all jailbreaks from the JAILBREAK_TEMPLATES_PATH.
 
         Args:
-            n (int, optional): Number of jailbreak templates to return. None to get all.
+            k (int, optional): Number of jailbreak templates to return. None to get all.
 
         Returns:
             List[str]: List of jailbreak template file names.
@@ -122,12 +122,12 @@ def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
         if not jailbreak_template_names:
             raise ValueError("No jailbreak templates found in the jailbreak directory")
 
-        if n:
-            if n > len(jailbreak_template_names):
+        if k:
+            if k > len(jailbreak_template_names):
                 raise ValueError(
-                    f"Attempted to pull {n} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
+                    f"Attempted to pull {k} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
                 )
-            jailbreak_template_names = random.choices(jailbreak_template_names, k=n)
+            jailbreak_template_names = random.sample(jailbreak_template_names, k=k)
         return jailbreak_template_names
 
     def get_jailbreak_system_prompt(self) -> str:
diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py
index 0478b4ea9..563325390 100644
--- a/pyrit/scenario/scenarios/airt/jailbreak.py
+++ b/pyrit/scenario/scenarios/airt/jailbreak.py
@@ -3,7 +3,7 @@
 
 import os
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from pyrit.common import apply_defaults
 from pyrit.datasets import TextJailBreak
@@ -11,7 +11,10 @@
     AttackConverterConfig,
     AttackScoringConfig,
 )
+from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
 from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
+from pyrit.executor.attack.single_turn.role_play import RolePlayAttack, RolePlayPaths
+from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack
 from pyrit.models import SeedAttackGroup
 from pyrit.prompt_converter import TextJailbreakConverter
 from pyrit.prompt_normalizer import PromptConverterConfiguration
@@ -19,9 +22,7 @@
 from pyrit.scenario.core.atomic_attack import AtomicAttack
 from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
 from pyrit.scenario.core.scenario import Scenario
-from pyrit.scenario.core.scenario_strategy import (
-    ScenarioStrategy,
-)
+from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
 from pyrit.score import (
     SelfAskRefusalScorer,
     TrueFalseInverterScorer,
@@ -31,13 +32,38 @@
 
 class JailbreakStrategy(ScenarioStrategy):
     """
-    Strategy for single-turn jailbreak attacks.
+    Strategy for jailbreak attacks.
+
+    The SIMPLE strategy just sends the jailbroken prompt and records the response. It is meant to
+    be the obvious way of using this scenario, without additional tweaks or changes to the prompt.
 
-    There is currently only one, running all jailbreaks.
+    COMPLEX strategies layer additional attack techniques (many-shot examples, the skeleton key
+    preamble, or an adversarially generated role-play) on top of the jailbreak template.
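+
+    Example (illustrative sketch; assumes an already-configured objective target):
+        scenario = Jailbreak()
+        await scenario.initialize_async(
+            objective_target=target,
+            scenario_strategies=[JailbreakStrategy.COMPLEX],
+        )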
     """
 
+    # Aggregate members (special markers that expand to strategies with matching tags)
     ALL = ("all", {"all"})
-    PYRIT = ("pyrit", {"pyrit"})
+    SIMPLE = ("simple", {"simple"})
+    COMPLEX = ("complex", {"complex"})
+
+    # Simple strategies
+    PromptSending = ("prompt_sending", {"simple"})
+
+    # Complex strategies
+    ManyShot = ("many_shot", {"complex"})
+    SkeletonKey = ("skeleton", {"complex"})
+    RolePlay = ("role_play", {"complex"})
+
+    @classmethod
+    def get_aggregate_tags(cls) -> set[str]:
+        """
+        Get the set of tags that represent aggregate categories.
+
+        Returns:
+            set[str]: Set of tags that are aggregate markers.
+        """
+        # Include base class aggregates ("all") and add scenario-specific ones
+        return super().get_aggregate_tags() | {"simple", "complex"}
 
 
 class Jailbreak(Scenario):
@@ -67,9 +93,9 @@ def get_default_strategy(cls) -> ScenarioStrategy:
         Get the default strategy used when no strategies are specified.
 
         Returns:
-            ScenarioStrategy: JailbreakStrategy.ALL.
+            ScenarioStrategy: JailbreakStrategy.PromptSending.
         """
-        return JailbreakStrategy.ALL
+        return JailbreakStrategy.PromptSending
 
     @classmethod
     def required_datasets(cls) -> list[str]:
@@ -84,7 +110,7 @@ def default_dataset_config(cls) -> DatasetConfiguration:
 
         Returns:
             DatasetConfiguration: Configuration with airt_harms dataset.
         """
-        return DatasetConfiguration(dataset_names=["airt_harms"], max_dataset_size=4)
+        return DatasetConfiguration(dataset_names=["airt_harms"])
 
     @apply_defaults
     def __init__(
         self,
         *,
         objective_scorer: Optional[TrueFalseScorer] = None,
         include_baseline: bool = False,
         scenario_result_id: Optional[str] = None,
-        n_jailbreaks: Optional[int] = 3,
+        k_jailbreaks: Optional[int] = None,
+        num_tries: int = 1,
+        jailbreak_names: Optional[List[str]] = None,
     ) -> None:
         """
         Initialize the jailbreak scenario.
@@ -104,13 +132,39 @@ def __init__(
             include_baseline (bool): Whether to include a baseline atomic attack that sends all
                 objectives without modifications. Defaults to True.
             scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
-            n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
+            k_jailbreaks (Optional[int]): Choose k random jailbreaks rather than using all of them.
+            num_tries (int): Number of times to try each jailbreak.
+            jailbreak_names (Optional[List[str]]): List of jailbreak template names (from the
+                templates under datasets) to use.
+
+        Raises:
+            ValueError: If both jailbreak_names and k_jailbreaks are provided, as random selection
+                is incompatible with a predetermined list.
+            ValueError: If jailbreak_names contains a jailbreak that isn't among the available
+                templates.
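+
+        Example (illustrative):
+            Jailbreak(k_jailbreaks=3, num_tries=2)    # three random templates, two tries each
+            Jailbreak(jailbreak_names=["dan_1"])      # explicit selection; template name illustrative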
+        """
+        if jailbreak_names and k_jailbreaks:
+            raise ValueError(
+                "Please provide only one of `k_jailbreaks` (random selection) or `jailbreak_names` (specific selection)."
+            )
+
         if not objective_scorer:
             objective_scorer = self._get_default_objective_scorer()
 
         self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)
-        self._n = n_jailbreaks
+        self._k = k_jailbreaks
+        self._n = num_tries
+
+        all_templates = TextJailBreak.get_jailbreak_templates()
+
+        if jailbreak_names:
+            missing = set(jailbreak_names) - set(all_templates)
+            if missing:
+                raise ValueError(f"Could not find jailbreak templates `{missing}`!")
+            self._jailbreaks = jailbreak_names
+        else:
+            self._jailbreaks = TextJailBreak.get_jailbreak_templates(k=self._k)
 
         super().__init__(
             name="Jailbreak",
@@ -146,6 +200,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
         )
         return refusal_scorer
 
+    def _get_default_adversarial_target(self) -> OpenAIChatTarget:
+        """
+        Create and retrieve the default adversarial target.
+
+        Returns:
+            OpenAIChatTarget: Default adversarial target using an unfiltered endpoint.
+        """
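+        # Assumes the AZURE_OPENAI_GPT4O_UNSAFE_* environment variables are set; the raised
+        # temperature (1.2) makes the adversarial generations more varied.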
+        return OpenAIChatTarget(
+            endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
+            api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
+            model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
+            temperature=1.2,
+        )
+
     def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
         """
         Resolve seed groups from dataset configuration.
@@ -161,23 +229,14 @@ def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
 
         return list(seed_groups)
 
-    def _get_all_jailbreak_templates(self) -> List[str]:
-        """
-        Retrieve all available jailbreak templates.
-
-        Returns:
-            List[str]: List of jailbreak template file names.
-        """
-        if not self._n:
-            return TextJailBreak.get_all_jailbreak_templates()
-        else:
-            return TextJailBreak.get_all_jailbreak_templates(n=self._n)
-
-    async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
+    async def _get_atomic_attack_from_strategy_async(
+        self, *, strategy: str, jailbreak_template_name: str
+    ) -> AtomicAttack:
         """
         Create an atomic attack for a specific jailbreak template.
 
         Args:
+            strategy (str): Value of the JailbreakStrategy to use.
             jailbreak_template_name (str): Name of the jailbreak template file.
 
         Returns:
@@ -202,12 +261,33 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na
             request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
         )
 
-        # Create the attack
-        attack = PromptSendingAttack(
-            objective_target=self._objective_target,
-            attack_scoring_config=self._scorer_config,
-            attack_converter_config=converter_config,
-        )
+        attack: Optional[Union[ManyShotJailbreakAttack, PromptSendingAttack, RolePlayAttack, SkeletonKeyAttack]] = None
+        args = {
+            "objective_target": self._objective_target,
+            "attack_scoring_config": self._scorer_config,
+            "attack_converter_config": converter_config,
+        }
+        match strategy:
+            case "many_shot":
+                attack = ManyShotJailbreakAttack(**args)
+            case "prompt_sending":
+                attack = PromptSendingAttack(**args)
+            case "skeleton":
+                attack = SkeletonKeyAttack(**args)
+            case "role_play":
+                args["adversarial_chat"] = self._get_default_adversarial_target()
+                args["role_play_definition_path"] = RolePlayPaths.PERSUASION_SCRIPT.value
+                attack = RolePlayAttack(**args)
+            case _:
+                raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.")
+
+        if not attack:
+            raise ValueError("Attack cannot be None!")
 
         # Extract template name without extension for the atomic attack name
         template_name = Path(jailbreak_template_name).stem
@@ -230,11 +310,16 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
 
         # Retrieve seed prompts based on selected strategies
         self._seed_groups = self._resolve_seed_groups()
 
-        # Get all jailbreak template names
-        jailbreak_template_names = self._get_all_jailbreak_templates()
+        strategies = ScenarioCompositeStrategy.extract_single_strategy_values(
+            composites=self._scenario_composites, strategy_type=JailbreakStrategy
+        )
 
-        for template_name in jailbreak_template_names:
-            atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
-            atomic_attacks.append(atomic_attack)
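+        # One atomic attack is created per (strategy, template, try) combination, so the
+        # total is len(strategies) * len(self._jailbreaks) * num_tries.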
"""Mock constant for n-many attempts per jailbreak.""" + return 2 + + +@pytest.fixture +def mock_random_k() -> int: + """Mock constant for k-many jailbreak templates to be used.""" return 3 @@ -62,8 +78,33 @@ def all_jailbreak_strategy() -> JailbreakStrategy: @pytest.fixture -def pyrit_jailbreak_strategy() -> JailbreakStrategy: - return JailbreakStrategy.PYRIT +def simple_jailbreak_strategy() -> JailbreakStrategy: + return JailbreakStrategy.SIMPLE + + +@pytest.fixture +def complex_jailbreak_strategy() -> JailbreakStrategy: + return JailbreakStrategy.COMPLEX + + +@pytest.fixture +def manyshot_jailbreak_strategy() -> JailbreakStrategy: + return JailbreakStrategy.ManyShot + + +@pytest.fixture +def promptsending_jailbreak_strategy() -> JailbreakStrategy: + return JailbreakStrategy.PromptSending + + +@pytest.fixture +def skeleton_jailbreak_attack() -> JailbreakStrategy: + return JailbreakStrategy.SkeletonKey + + +@pytest.fixture +def roleplay_jailbreak_strategy() -> JailbreakStrategy: + return JailbreakStrategy.RolePlay @pytest.fixture @@ -107,6 +148,24 @@ def test_init_with_custom_scorer(self, mock_objective_scorer, mock_memory_seed_g scenario = Jailbreak(objective_scorer=mock_objective_scorer) assert isinstance(scenario._scorer_config, AttackScoringConfig) + def test_init_with_k_jailbreaks(self, mock_random_k): + """Test initialization with k_jailbreaks provided.""" + with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups): + scenario = Jailbreak(k_jailbreaks=mock_random_k) + assert scenario._k == mock_random_k + + def test_init_with_num_tries(self, mock_random_n): + """Test initialization with n provided.""" + with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups): + scenario = Jailbreak(num_tries=mock_random_n) + assert scenario._n == mock_random_n + + def test_init_raises_exception_when_both_k_and_which_jailbreaks(self, mock_random_k, mock_jailbreaks): + """Test failure on providing mutually exclusive arguments.""" + + with pytest.raises(ValueError): + Jailbreak(k_jailbreaks=mock_random_k, jailbreak_names=mock_jailbreaks) + @pytest.mark.asyncio async def test_init_raises_exception_when_no_datasets_available(self, mock_objective_target, mock_objective_scorer): """Test that initialization raises ValueError when datasets are not available in memory.""" @@ -137,22 +196,99 @@ async def test_attack_generation_for_all( assert all(hasattr(run, "_attack") for run in atomic_attacks) @pytest.mark.asyncio - async def test_attack_generation_for_pyrit( - self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, pyrit_jailbreak_strategy + async def test_attack_generation_for_simple( + self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, simple_jailbreak_strategy ): - """Test that the single turn attack generation works.""" + """Test that the simple attack generation works.""" with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups): - scenario = Jailbreak( - objective_scorer=mock_objective_scorer, + scenario = Jailbreak(objective_scorer=mock_objective_scorer) + + await scenario.initialize_async( + objective_target=mock_objective_target, scenario_strategies=[simple_jailbreak_strategy] + ) + atomic_attacks = await scenario._get_atomic_attacks_async() + for run in atomic_attacks: + assert isinstance(run._attack, PromptSendingAttack) + + @pytest.mark.asyncio + async def test_attack_generation_for_complex( + self, mock_objective_target, mock_objective_scorer, 
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, complex_jailbreak_strategy
+    ):
+        """Test that the complex attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[complex_jailbreak_strategy]
+            )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, (RolePlayAttack, ManyShotJailbreakAttack, SkeletonKeyAttack))
+
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_manyshot(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, manyshot_jailbreak_strategy
+    ):
+        """Test that the manyshot attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[manyshot_jailbreak_strategy]
             )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, ManyShotJailbreakAttack)
+
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_promptsending(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, promptsending_jailbreak_strategy
+    ):
+        """Test that the prompt sending attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
 
             await scenario.initialize_async(
-                objective_target=mock_objective_target, scenario_strategies=[pyrit_jailbreak_strategy]
+                objective_target=mock_objective_target, scenario_strategies=[promptsending_jailbreak_strategy]
             )
             atomic_attacks = await scenario._get_atomic_attacks_async()
             for run in atomic_attacks:
                 assert isinstance(run._attack, PromptSendingAttack)
 
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_skeleton(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, skeleton_jailbreak_strategy
+    ):
+        """Test that the skeleton key attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[skeleton_jailbreak_strategy]
+            )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, SkeletonKeyAttack)
+
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_roleplay(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, roleplay_jailbreak_strategy
+    ):
+        """Test that the roleplaying attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[roleplay_jailbreak_strategy]
+            )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, RolePlayAttack)
+
     @pytest.mark.asyncio
     async def test_attack_runs_include_objectives(
         self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups
@@ -195,17 +331,33 @@ async def test_get_all_jailbreak_templates(
                 objective_scorer=mock_objective_scorer,
             )
             await scenario.initialize_async(objective_target=mock_objective_target)
-            assert len(scenario._get_all_jailbreak_templates()) > 0
+            assert len(scenario._jailbreaks) > 0
 
     @pytest.mark.asyncio
     async def test_get_some_jailbreak_templates(
-        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_n
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_k
    ):
         """Test that random jailbreak template selection works."""
         with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
-            scenario = Jailbreak(objective_scorer=mock_objective_scorer, n_jailbreaks=mock_random_n)
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer, k_jailbreaks=mock_random_k)
             await scenario.initialize_async(objective_target=mock_objective_target)
-            assert len(scenario._get_all_jailbreak_templates()) == 3
+            assert len(scenario._jailbreaks) == mock_random_k
+
+    @pytest.mark.asyncio
+    async def test_custom_num_tries(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_n
+    ):
+        """Test that num_tries runs each jailbreak template n-many times."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            base_scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+            await base_scenario.initialize_async(objective_target=mock_objective_target)
+            atomic_attacks_1 = await base_scenario._get_atomic_attacks_async()
+
+            mult_scenario = Jailbreak(objective_scorer=mock_objective_scorer, num_tries=mock_random_n)
+            await mult_scenario.initialize_async(objective_target=mock_objective_target)
+            atomic_attacks_n = await mult_scenario._get_atomic_attacks_async()
+
+            assert len(atomic_attacks_1) * mock_random_n == len(atomic_attacks_n)
 
 
 @pytest.mark.usefixtures(*FIXTURES)