Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion dreadnode/airt/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dreadnode.airt import attack, search
from dreadnode.airt import attack, compliance, search
from dreadnode.airt.attack import (
Attack,
goat_attack,
Expand All @@ -9,20 +9,37 @@
tap_attack,
zoo_attack,
)
from dreadnode.airt.compliance import (
ATTACK_MAPPINGS,
ATLASTechnique,
NISTAIRMFFunction,
OWASPCategory,
SAIFCategory,
tag_attack,
tag_transform,
)
from dreadnode.airt.target import CustomTarget, LLMTarget, Target

__all__ = [
"ATTACK_MAPPINGS",
"ATLASTechnique",
"Attack",
"CustomTarget",
"LLMTarget",
"NISTAIRMFFunction",
"OWASPCategory",
"SAIFCategory",
"Target",
"attack",
"compliance",
"goat_attack",
"hop_skip_jump_attack",
"nes_attack",
"prompt_attack",
"search",
"simba_attack",
"tag_attack",
"tag_transform",
"tap_attack",
"target",
"zoo_attack",
Expand Down
3 changes: 3 additions & 0 deletions dreadnode/airt/attack/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ class Attack(Study[In, Out]):
tags: list[str] = Config(default_factory=lambda: ["attack"])
"""A list of tags associated with the attack for logging."""

compliance_tags: dict[str, t.Any] = Config(default_factory=dict)
"""Compliance framework tags (OWASP, ATLAS, SAIF, NIST) for this attack."""

hooks: list[EvalHook] = Field(default_factory=list, exclude=True, repr=False)
"""Hooks to run at various points in the attack lifecycle."""

Expand Down
22 changes: 22 additions & 0 deletions dreadnode/airt/attack/crescendo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
import yaml

from dreadnode.airt.attack import Attack
from dreadnode.airt.compliance import (
ATLASTechnique,
NISTAIRMFFunction,
OWASPCategory,
SAIFCategory,
tag_attack,
)
from dreadnode.airt.target.base import Target
from dreadnode.constants import CRESCENDO_VARIANT_1
from dreadnode.data_types.message import Message as DnMessage
Expand All @@ -14,6 +21,20 @@
from dreadnode.transforms.base import Transform
from dreadnode.transforms.refine import adapt_prompt_trials, llm_refine

# Compliance-framework mapping for the Crescendo attack; handed to the Attack
# through its `compliance_tags` argument.
# Only the core jailbreak / prompt-injection techniques are tagged here. More
# specific vulnerability categories (LLM02, LLM07, etc.) are added when
# transforms targeting those categories are used.
COMPLIANCE_TAGS = tag_attack(
    atlas=[ATLASTechnique.PROMPT_INJECTION_DIRECT, ATLASTechnique.LLM_JAILBREAK],
    owasp=OWASPCategory.LLM01_PROMPT_INJECTION,
    saif=SAIFCategory.INPUT_MANIPULATION,
    nist_function=NISTAIRMFFunction.MEASURE,
    nist_subcategory="MS-2.7",
)


def crescendo_attack(
goal: str,
Expand Down Expand Up @@ -179,6 +200,7 @@ async def crescendo_refiner(trials: list[Trial[DnMessage]]) -> DnMessage:
"objective": objective_judge,
},
hooks=hooks or [],
compliance_tags=COMPLIANCE_TAGS,
)

# Add stop condition based on early_stopping_score
Expand Down
23 changes: 23 additions & 0 deletions dreadnode/airt/attack/goat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import typing as t

from dreadnode.airt.attack import Attack
from dreadnode.airt.compliance import (
ATLASTechnique,
NISTAIRMFFunction,
OWASPCategory,
SAIFCategory,
tag_attack,
)
from dreadnode.data_types.message import Message as DnMessage
from dreadnode.meta.context import TrialCandidate
from dreadnode.optimization.search.graph import graph_neighborhood_search
Expand All @@ -18,6 +25,21 @@
from dreadnode.optimization.trial import Trial


# Compliance-framework mapping for the GOAT attack; handed to the Attack
# through its `compliance_tags` argument.
# Only the core jailbreak / prompt-injection techniques are tagged here. More
# specific vulnerability categories (LLM02, LLM07, etc.) are added when
# transforms targeting those categories are used.
COMPLIANCE_TAGS = tag_attack(
    atlas=[ATLASTechnique.PROMPT_INJECTION_DIRECT, ATLASTechnique.LLM_JAILBREAK],
    owasp=OWASPCategory.LLM01_PROMPT_INJECTION,
    saif=SAIFCategory.INPUT_MANIPULATION,
    nist_function=NISTAIRMFFunction.MEASURE,
    nist_subcategory="MS-2.7",
)


def goat_attack(
goal: str,
target: "Target[DnMessage, DnMessage]",
Expand Down Expand Up @@ -121,6 +143,7 @@ async def message_refiner(trials: list["Trial[DnMessage]"]) -> DnMessage:
},
constraints=[topic_constraint],
hooks=hooks or [],
compliance_tags=COMPLIANCE_TAGS,
)

if early_stopping_score is not None:
Expand Down
15 changes: 15 additions & 0 deletions dreadnode/airt/attack/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import rigging as rg

from dreadnode.airt.attack.base import Attack
from dreadnode.airt.compliance import (
    ATLASTechnique,
    NISTAIRMFFunction,
    OWASPCategory,
    SAIFCategory,
    tag_attack,
)
from dreadnode.data_types.message import Message as DnMessage
from dreadnode.meta import TrialCandidate
from dreadnode.optimization.search.graph import beam_search
Expand All @@ -18,6 +19,19 @@
from dreadnode.optimization.trial import Trial


# Compliance-framework mapping for the generic prompt attack; handed to the
# Attack through its `compliance_tags` argument.
# Only the core jailbreak / prompt-injection techniques are tagged here. More
# specific vulnerability categories (LLM02, LLM07, etc.) are added when
# transforms targeting those categories are used.
# The NIST AI RMF function/subcategory mirror the Crescendo, GOAT and TAP
# mappings so that every jailbreak-style attack reports the same set of
# frameworks (ATLAS, OWASP, SAIF, NIST).
COMPLIANCE_TAGS = tag_attack(
    atlas=[
        ATLASTechnique.PROMPT_INJECTION_DIRECT,
        ATLASTechnique.LLM_JAILBREAK,
    ],
    owasp=OWASPCategory.LLM01_PROMPT_INJECTION,
    saif=SAIFCategory.INPUT_MANIPULATION,
    nist_function=NISTAIRMFFunction.MEASURE,
    nist_subcategory="MS-2.7",
)


def prompt_attack(
goal: str,
target: "Target[DnMessage, DnMessage]",
Expand Down Expand Up @@ -117,6 +131,7 @@ async def message_refiner(trials: list["Trial[DnMessage]"]) -> DnMessage:
"prompt_judge": prompt_judge,
},
hooks=hooks or [],
compliance_tags=COMPLIANCE_TAGS,
)

if early_stopping_score is not None:
Expand Down
32 changes: 30 additions & 2 deletions dreadnode/airt/attack/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

from dreadnode.airt.attack import Attack
from dreadnode.airt.attack.prompt import prompt_attack
from dreadnode.airt.compliance import (
ATLASTechnique,
NISTAIRMFFunction,
OWASPCategory,
SAIFCategory,
tag_attack,
)
from dreadnode.data_types.message import Message as DnMessage
from dreadnode.scorers.judge import llm_judge

Expand All @@ -10,6 +17,21 @@
from dreadnode.eval.hooks.base import EvalHook


# Compliance-framework mapping for the TAP attack; assigned to the underlying
# prompt attack's `compliance_tags` inside `tap_attack`.
# Only the core jailbreak / prompt-injection techniques are tagged here. More
# specific vulnerability categories (LLM02, LLM07, etc.) are added when
# transforms targeting those categories are used.
COMPLIANCE_TAGS = tag_attack(
    atlas=[ATLASTechnique.PROMPT_INJECTION_DIRECT, ATLASTechnique.LLM_JAILBREAK],
    owasp=OWASPCategory.LLM01_PROMPT_INJECTION,
    saif=SAIFCategory.INPUT_MANIPULATION,
    nist_function=NISTAIRMFFunction.MEASURE,
    nist_subcategory="MS-2.7",
)


def tap_attack(
goal: str,
target: "Target[DnMessage, DnMessage]",
Expand Down Expand Up @@ -45,7 +67,7 @@ def tap_attack(

topic_constraint = llm_judge(evaluator_model, ON_TOPIC_RUBRIC.format(goal=goal))

return prompt_attack(
base_attack = prompt_attack(
goal,
target,
attacker_model,
Expand All @@ -58,7 +80,13 @@ def tap_attack(
branching_factor=branching_factor,
context_depth=context_depth,
hooks=hooks or [],
).with_(constraints={"on_topic": topic_constraint})
)

# Set compliance tags before cloning
base_attack.compliance_tags = COMPLIANCE_TAGS

# Add constraint and return
return base_attack.with_(constraints={"on_topic": topic_constraint})


REFINE_GUIDANCE = """\
Expand Down
Loading