Changes from all commits
54 commits
022f70a  Scaffolding (Jan 26, 2026)
e85cdb9  Precommit (Jan 26, 2026)
fc260c3  fixtures and basic tests (Jan 27, 2026)
89a8079  basic tests (Jan 27, 2026)
b18f224  basic tests (Jan 27, 2026)
96ddf6c  last test (Jan 28, 2026)
eb4e936  jailbreak format test (Jan 28, 2026)
243ea0a  sample jailbreak prompt (Jan 28, 2026)
946fdde  Merge branch 'main' into jailbreak (ValbuenaVC, Jan 28, 2026)
132caf5  real jailbreaks added (Jan 28, 2026)
c4e625f  Merge branch 'main' into jailbreak (ValbuenaVC, Jan 28, 2026)
79d1a64  Merge branch 'main' into jailbreak (ValbuenaVC, Jan 29, 2026)
cb28fda  changing dataset name (Jan 29, 2026)
f399b6d  moved jailbreak discovery (Jan 29, 2026)
75436ea  changed path resolution (Jan 29, 2026)
c0022f6  minor changes (Jan 29, 2026)
9f579f2  minor bug (Jan 29, 2026)
ccf7025  Merge branch 'main' into jailbreak (ValbuenaVC, Jan 29, 2026)
349cc6b  old dataset name (Jan 30, 2026)
9fa6430  precommit (Jan 30, 2026)
513cbf3  random jailbreak selection (Jan 30, 2026)
b57b35a  error handling (Jan 30, 2026)
999a0c6  error handling docstring (Jan 30, 2026)
f3ec8bb  Merge branch 'Azure:main' into jailbreak2 (ValbuenaVC, Jan 30, 2026)
89fd8bd  scaffolding (Jan 30, 2026)
66650a6  scaffolding for subset (Jan 30, 2026)
fa5b01a  scaffolding (Jan 30, 2026)
44bc05c  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 5, 2026)
db5270c  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 5, 2026)
9d9666f  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 7, 2026)
302101f  subset (Feb 9, 2026)
9c7b757  tweaking (Feb 10, 2026)
737aabe  new strategy template (Feb 10, 2026)
472bd20  types' (Feb 10, 2026)
b07e197  adversarial (Feb 10, 2026)
c31d088  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 10, 2026)
6dcf318  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 11, 2026)
ec9d731  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 11, 2026)
163e582  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 11, 2026)
a503a4b  unit test fixes (Feb 11, 2026)
af32046  Merge branch 'jailbreak2' of https://github.com/ValbuenaVC/PyRIT into… (Feb 11, 2026)
6da95f9  unit test fix (Feb 11, 2026)
73d77a6  mypy (Feb 11, 2026)
827ec0e  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 12, 2026)
8168db8  params (Feb 12, 2026)
5ac7651  tweaks (Feb 12, 2026)
20ef0c3  dataset_size (Feb 12, 2026)
06bb694  k_jailbreak bug (Feb 13, 2026)
03a1e9b  Merge branch 'main' into jailbreak2 (ValbuenaVC, Feb 13, 2026)
6a67ac4  tests (Feb 13, 2026)
4b441d4  new strategies (Feb 14, 2026)
b14f564  adversarial chat (Feb 14, 2026)
07b6142  roleplay path (Feb 14, 2026)
36b6b95  roleplay (Feb 14, 2026)
12 changes: 6 additions & 6 deletions pyrit/datasets/jailbreak/text_jailbreak.py
@@ -104,12 +104,12 @@ def __init__(
self.template.value = self.template.render_template_value_silent(**kwargs)

@classmethod
-    def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
+    def get_jailbreak_templates(cls, k: Optional[int] = None) -> List[str]:
"""
Retrieve all jailbreaks from the JAILBREAK_TEMPLATES_PATH.

Args:
-            n (int, optional): Number of jailbreak templates to return. None to get all.
+            k (int, optional): Number of jailbreak templates to return. None to get all.

Returns:
List[str]: List of jailbreak template file names.
@@ -122,12 +122,12 @@ def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
if not jailbreak_template_names:
raise ValueError("No jailbreak templates found in the jailbreak directory")

-        if n:
-            if n > len(jailbreak_template_names):
+        if k:
+            if k > len(jailbreak_template_names):
raise ValueError(
f"Attempted to pull {n} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
f"Attempted to pull {k} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
)
-            jailbreak_template_names = random.choices(jailbreak_template_names, k=n)
+            jailbreak_template_names = random.choices(jailbreak_template_names, k=k)
return jailbreak_template_names

def get_jailbreak_system_prompt(self) -> str:
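A usage sketch of the renamed classmethod (the error case and import are taken from the hunk above; nothing else is assumed). Note that random.choices samples with replacement, so a k-selection can contain the same template more than once; random.sample would guarantee distinct picks.

    from pyrit.datasets import TextJailBreak

    all_names = TextJailBreak.get_jailbreak_templates()    # every template file name
    three = TextJailBreak.get_jailbreak_templates(k=3)     # 3 names, repeats possible
    TextJailBreak.get_jailbreak_templates(k=10_000)        # ValueError if k exceeds the dataset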
159 changes: 122 additions & 37 deletions pyrit/scenario/scenarios/airt/jailbreak.py
@@ -3,25 +3,26 @@

import os
from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Union

from pyrit.common import apply_defaults
from pyrit.datasets import TextJailBreak
from pyrit.executor.attack.core.attack_config import (
AttackConverterConfig,
AttackScoringConfig,
)
+from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
+from pyrit.executor.attack.single_turn.role_play import RolePlayAttack, RolePlayPaths
+from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack
from pyrit.models import SeedAttackGroup
from pyrit.prompt_converter import TextJailbreakConverter
from pyrit.prompt_normalizer import PromptConverterConfiguration
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.scenario.core.atomic_attack import AtomicAttack
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
from pyrit.scenario.core.scenario import Scenario
-from pyrit.scenario.core.scenario_strategy import (
-    ScenarioStrategy,
-)
+from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
from pyrit.score import (
SelfAskRefusalScorer,
TrueFalseInverterScorer,
@@ -31,13 +32,38 @@

class JailbreakStrategy(ScenarioStrategy):
"""
-    Strategy for single-turn jailbreak attacks.
-
-    There is currently only one, running all jailbreaks.
+    Strategy for jailbreak attacks.
+
+    The SIMPLE strategy just sends the jailbroken prompt and records the response. It is meant to
+    expose an obvious way of using this scenario without worrying about additional tweaks and changes
+    to the prompt.
+
+    COMPLEX strategies
"""

+    # Aggregate members (special markers that expand to strategies with matching tags)
Contributor:
What's the default strategy? Some other scenarios have a get_default_strategy function but I don't see one here. I would recommend to make the default PromptSending because it's the one that makes the most sense for the jailbreaks we have in PyRIT so far.

Contributor Author:
Right now it's JailbreakStrategy.ALL, but I like that idea more, so I'm going to change it to PromptSending. Should be line 84

ALL = ("all", {"all"})
PYRIT = ("pyrit", {"pyrit"})
SIMPLE = ("simple", {"simple"})
COMPLEX = ("complex", {"complex"})

# Simple strategies
PromptSending = ("prompt_sending", {"simple"})

# Complex strategies
ManyShot = ("many_shot", {"complex"})
SkeletonKey = ("skeleton", {"complex"})
RolePlay = ("role_play", {"complex"})

@classmethod
def get_aggregate_tags(cls) -> set[str]:
"""
Get the set of tags that represent aggregate categories.

Returns:
set[str]: Set of tags that are aggregate markers.
"""
# Include base class aggregates ("all") and add scenario-specific ones
return super().get_aggregate_tags() | {"simple", "complex"}
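For illustration, a minimal self-contained sketch of the aggregate-tag idea; this is not PyRIT's actual implementation, just the mechanism the enum above relies on: an aggregate member names a tag, and expanding it selects every concrete member carrying that tag.

    from enum import Enum

    class Strategy(Enum):
        # (value, tags) pairs mirroring a few of the members above
        COMPLEX = ("complex", {"complex"})              # aggregate marker
        PROMPT_SENDING = ("prompt_sending", {"simple"})
        MANY_SHOT = ("many_shot", {"complex"})
        SKELETON = ("skeleton", {"complex"})
        ROLE_PLAY = ("role_play", {"complex"})

    AGGREGATE_TAGS = {"all", "simple", "complex"}       # as returned by get_aggregate_tags

    def expand(member: Strategy) -> list[Strategy]:
        value, _tags = member.value
        if value not in AGGREGATE_TAGS:
            return [member]                             # already a concrete strategy
        return [s for s in Strategy if s.value[0] not in AGGREGATE_TAGS and value in s.value[1]]

    print(expand(Strategy.COMPLEX))                     # MANY_SHOT, SKELETON, ROLE_PLAY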


class Jailbreak(Scenario):
@@ -67,9 +93,9 @@ def get_default_strategy(cls) -> ScenarioStrategy:
Get the default strategy used when no strategies are specified.

Returns:
-            ScenarioStrategy: JailbreakStrategy.ALL.
+            ScenarioStrategy: JailbreakStrategy.PromptSending.
"""
-        return JailbreakStrategy.ALL
+        return JailbreakStrategy.PromptSending

@classmethod
def required_datasets(cls) -> list[str]:
@@ -84,7 +110,7 @@ def default_dataset_config(cls) -> DatasetConfiguration:
Returns:
DatasetConfiguration: Configuration with airt_harms dataset.
"""
-        return DatasetConfiguration(dataset_names=["airt_harms"], max_dataset_size=4)
+        return DatasetConfiguration(dataset_names=["airt_harms"])

@apply_defaults
def __init__(
@@ -93,7 +119,9 @@ def __init__(
objective_scorer: Optional[TrueFalseScorer] = None,
include_baseline: bool = False,
scenario_result_id: Optional[str] = None,
-        n_jailbreaks: Optional[int] = 3,
+        k_jailbreaks: Optional[int] = None,
+        num_tries: int = 1,
+        jailbreak_names: Optional[List[str]] = None,
) -> None:
"""
Initialize the jailbreak scenario.
@@ -104,13 +132,39 @@
include_baseline (bool): Whether to include a baseline atomic attack that sends all
objectives without modifications. Defaults to True.
scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
-            n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
+            k_jailbreaks (Optional[int]): Choose k random jailbreaks rather than using all of them.
+            num_tries (Optional[int]): Number of times to try each jailbreak.
+            jailbreak_names (Optional[List[str]]): List of jailbreak names from the template list under datasets.
+                to use.
+
+        Raises:
+            ValueError: If both jailbreak_names and k_jailbreaks are provided, as random selection
+                is incompatible with a predetermined list.
+            ValueError: If the jailbreak_names list contains a jailbreak that isn't in the listed
+                templates.
+
        """
+        if jailbreak_names and k_jailbreaks:
+            raise ValueError(
+                "Please provide only one of `k_jailbreaks` (random selection) or `jailbreaks` (specific selection)."
+            )

if not objective_scorer:
objective_scorer = self._get_default_objective_scorer()
self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)

-        self._n = n_jailbreaks
+        self._k = k_jailbreaks
+        self._n = num_tries
+
+        all_templates = TextJailBreak.get_jailbreak_templates()
+
+        if jailbreak_names:
+            diff = set(jailbreak_names) - set(all_templates)
+            if len(diff) > 0:
+                raise ValueError(f"Error: could not find templates `{diff}`!")
+            self._jailbreaks = jailbreak_names
+        else:
+            self._jailbreaks = TextJailBreak.get_jailbreak_templates(k=self._k)
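For illustration, a hedged sketch of the constructor contract introduced here. The parameter name objective_target is assumed from self._objective_target, the target construction and template file name are hypothetical, and the import path is taken from the diff header.

    from pyrit.prompt_target import OpenAIChatTarget
    from pyrit.scenario.scenarios.airt.jailbreak import Jailbreak

    target = OpenAIChatTarget()  # hypothetical; configure for your own deployment

    # Random selection: five templates drawn via get_jailbreak_templates(k=5).
    scenario = Jailbreak(objective_target=target, k_jailbreaks=5)

    # Explicit selection: unknown names raise ValueError.
    scenario = Jailbreak(objective_target=target, jailbreak_names=["dan_1.yaml"])  # hypothetical name

    # Mixing the two selection modes raises ValueError up front.
    Jailbreak(objective_target=target, k_jailbreaks=5, jailbreak_names=["dan_1.yaml"])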

super().__init__(
name="Jailbreak",
@@ -146,6 +200,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
)
return refusal_scorer

+    def _get_default_adversarial_target(self) -> OpenAIChatTarget:
+        """
+        Create and retrieve the default adversarial target.
+
+        Returns:
+            OpenAIChatTarget: Default adversarial target using an unfiltered endpoint.
+        """
+        return OpenAIChatTarget(
+            endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
+            api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
+            model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
+            temperature=1.2,
+        )
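The default adversarial target is configured entirely from the environment. A sketch of the three variables it reads (values are placeholders); if any is unset, os.environ.get returns None and the target would presumably fail once a request is made.

    import os

    # Placeholders only; point these at the unfiltered deployment mentioned in the docstring.
    os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"] = "https://<resource>.openai.azure.com/"
    os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"] = "<api-key>"
    os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"] = "gpt-4o"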

def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
"""
Resolve seed groups from dataset configuration.
@@ -161,23 +229,14 @@

return list(seed_groups)

-    def _get_all_jailbreak_templates(self) -> List[str]:
-        """
-        Retrieve all available jailbreak templates.
-
-        Returns:
-            List[str]: List of jailbreak template file names.
-        """
-        if not self._n:
-            return TextJailBreak.get_all_jailbreak_templates()
-        else:
-            return TextJailBreak.get_all_jailbreak_templates(n=self._n)
-
-    async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
+    async def _get_atomic_attack_from_strategy_async(
+        self, *, strategy: str, jailbreak_template_name: str
+    ) -> AtomicAttack:
"""
Create an atomic attack for a specific jailbreak template.

Args:
+            strategy (str): JailbreakStrategy to use.
jailbreak_template_name (str): Name of the jailbreak template file.

Returns:
@@ -202,12 +261,33 @@
request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
)

-        # Create the attack
-        attack = PromptSendingAttack(
-            objective_target=self._objective_target,
-            attack_scoring_config=self._scorer_config,
-            attack_converter_config=converter_config,
-        )
+        attack: Optional[Union[ManyShotJailbreakAttack, PromptSendingAttack, RolePlayAttack, SkeletonKeyAttack]] = None
+        args = {
+            "objective_target": self._objective_target,
+            "attack_scoring_config": self._scorer_config,
+            "attack_converter_config": converter_config,
+        }
+        match strategy:
+            case "many_shot":
+                attack = ManyShotJailbreakAttack(**args)
+            case "prompt_sending":
+                attack = PromptSendingAttack(**args)
+            case "skeleton":
+                attack = SkeletonKeyAttack(**args)
+            case "role_play":
+                args["adversarial_chat"] = OpenAIChatTarget(
+                    endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
+                    api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
+                    model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
+                    temperature=1.2,
+                )
+                args["role_play_definition_path"] = RolePlayPaths.PERSUASION_SCRIPT.value
+                attack = RolePlayAttack(**args)
+            case _:
+                raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.")
+
+        if not attack:
+            raise ValueError(f"Attack cannot be None!")

# Extract template name without extension for the atomic attack name
template_name = Path(jailbreak_template_name).stem
@@ -230,11 +310,16 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
# Retrieve seed prompts based on selected strategies
self._seed_groups = self._resolve_seed_groups()

-        # Get all jailbreak template names
-        jailbreak_template_names = self._get_all_jailbreak_templates()
-
-        for template_name in jailbreak_template_names:
-            atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
-            atomic_attacks.append(atomic_attack)
+        strategies = ScenarioCompositeStrategy.extract_single_strategy_values(
+            composites=self._scenario_composites, strategy_type=JailbreakStrategy
+        )
+
+        for strategy in strategies:
+            for template_name in self._jailbreaks:
+                for _ in range(0, self._n):
+                    atomic_attack = await self._get_atomic_attack_from_strategy_async(
+                        strategy=strategy, jailbreak_template_name=template_name
+                    )
+                    atomic_attacks.append(atomic_attack)

return atomic_attacks
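To make the fan-out of this triple loop concrete: one atomic attack is created per (strategy, template, try), so totals multiply. A small sketch with hypothetical counts:

    strategies = ["prompt_sending", "many_shot"]      # from the composite expansion
    jailbreaks = ["t1.yaml", "t2.yaml", "t3.yaml"]    # hypothetical template names
    num_tries = 2

    total = len(strategies) * len(jailbreaks) * num_tries
    print(total)  # 12 atomic attacks queued for this scenario run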