From 668512dbe7975e7e8712ffedd9d8a0713ca0f0a0 Mon Sep 17 00:00:00 2001 From: jsong468 Date: Mon, 4 May 2026 15:24:04 -0700 Subject: [PATCH 1/5] score blocked content --- pyrit/executor/attack/core/attack_config.py | 6 + pyrit/executor/attack/multi_turn/crescendo.py | 7 +- .../attack/multi_turn/multi_prompt_sending.py | 2 + .../executor/attack/multi_turn/red_teaming.py | 2 + .../attack/multi_turn/tree_of_attacks.py | 13 + .../attack/single_turn/prompt_sending.py | 2 + .../openai/openai_chat_target.py | 20 + .../openai/openai_response_target.py | 28 ++ pyrit/prompt_target/openai/openai_target.py | 30 +- pyrit/score/conversation_scorer.py | 6 +- pyrit/score/scorer.py | 87 ++++- .../float_scale_threshold_scorer.py | 4 + pyrit/score/true_false/true_false_scorer.py | 10 +- .../attack/core/test_attack_config.py | 17 + .../target/test_openai_chat_target.py | 56 +++ .../target/test_openai_response_target.py | 42 +++ tests/unit/score/test_scorer.py | 345 +++++++++++++++++- 17 files changed, 667 insertions(+), 10 deletions(-) diff --git a/pyrit/executor/attack/core/attack_config.py b/pyrit/executor/attack/core/attack_config.py index 7d128ffd79..b11b91aeff 100644 --- a/pyrit/executor/attack/core/attack_config.py +++ b/pyrit/executor/attack/core/attack_config.py @@ -53,6 +53,12 @@ class AttackScoringConfig: # Whether to use scoring results as feedback for iterative attacks use_score_as_feedback: bool = True + # Whether to score blocked responses using partial content from prompt_metadata["partial_content"]. + # When True, blocked responses that contain partial model output (e.g., from Azure Content Safety + # triggering mid-generation) will be evaluated by scorers instead of being skipped or + # auto-classified as failures/refusals. + score_blocked_content: bool = False + def __post_init__(self) -> None: """ Validate configuration values. diff --git a/pyrit/executor/attack/multi_turn/crescendo.py b/pyrit/executor/attack/multi_turn/crescendo.py index fec9e9c856..9f04ab0df0 100644 --- a/pyrit/executor/attack/multi_turn/crescendo.py +++ b/pyrit/executor/attack/multi_turn/crescendo.py @@ -203,6 +203,7 @@ def __init__( self._objective_scorer = objective_scorer self._use_score_as_feedback = attack_scoring_config.use_score_as_feedback self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers + self._score_blocked_content = attack_scoring_config.score_blocked_content # Initialize refusal scorer - use the one from config if provided, otherwise create default self._refusal_scorer = attack_scoring_config.refusal_scorer or SelfAskRefusalScorer( @@ -675,7 +676,10 @@ async def _check_refusal_async(self, context: CrescendoAttackContext, objective: objective=context.objective, ): scores = await self._refusal_scorer.score_async( - message=context.last_response, objective=objective, skip_on_error_result=False + message=context.last_response, + objective=objective, + skip_on_error_result=False, + score_blocked_content=self._score_blocked_content, ) return scores[0] @@ -711,6 +715,7 @@ async def _score_response_async(self, *, context: CrescendoAttackContext) -> Sco role_filter="assistant", objective=context.objective, skip_on_error_result=False, + score_blocked_content=self._score_blocked_content, ) objective_score = scoring_results["objective_scores"] diff --git a/pyrit/executor/attack/multi_turn/multi_prompt_sending.py b/pyrit/executor/attack/multi_turn/multi_prompt_sending.py index 546b6c970b..d429813a51 100644 --- a/pyrit/executor/attack/multi_turn/multi_prompt_sending.py +++ b/pyrit/executor/attack/multi_turn/multi_prompt_sending.py @@ -173,6 +173,7 @@ def __init__( self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers self._objective_scorer = attack_scoring_config.objective_scorer + self._score_blocked_content = attack_scoring_config.score_blocked_content # Initialize prompt normalizer and conversation manager self._prompt_normalizer = prompt_normalizer or PromptNormalizer() @@ -401,6 +402,7 @@ async def _evaluate_response_async(self, *, response: Message, objective: str) - role_filter="assistant", objective=objective, skip_on_error_result=True, + score_blocked_content=self._score_blocked_content, ) objective_scores = scoring_results["objective_scores"] diff --git a/pyrit/executor/attack/multi_turn/red_teaming.py b/pyrit/executor/attack/multi_turn/red_teaming.py index 60a05e388b..e41967067c 100644 --- a/pyrit/executor/attack/multi_turn/red_teaming.py +++ b/pyrit/executor/attack/multi_turn/red_teaming.py @@ -143,6 +143,7 @@ def __init__( self._objective_scorer = attack_scoring_config.objective_scorer self._use_score_as_feedback = attack_scoring_config.use_score_as_feedback + self._score_blocked_content = attack_scoring_config.score_blocked_content # Initialize adversarial configuration self._adversarial_chat = attack_adversarial_config.target @@ -604,6 +605,7 @@ async def _score_response_async(self, *, context: MultiTurnAttackContext[Any]) - message=context.last_response, role_filter="assistant", objective=context.objective, + score_blocked_content=self._score_blocked_content, ) objective_scores = scoring_results diff --git a/pyrit/executor/attack/multi_turn/tree_of_attacks.py b/pyrit/executor/attack/multi_turn/tree_of_attacks.py index 7f0470bba4..ad689829fa 100644 --- a/pyrit/executor/attack/multi_turn/tree_of_attacks.py +++ b/pyrit/executor/attack/multi_turn/tree_of_attacks.py @@ -96,6 +96,7 @@ def __init__( refusal_scorer: Optional[TrueFalseScorer] = None, auxiliary_scorers: Optional[list[Scorer]] = None, use_score_as_feedback: bool = True, + score_blocked_content: bool = False, ) -> None: """ Initialize TAP scoring configuration. @@ -107,6 +108,8 @@ def __init__( refusal_scorer (Optional[TrueFalseScorer]): Optional scorer for detecting refusals. auxiliary_scorers (Optional[List[Scorer]]): Additional scorers for auxiliary metrics. use_score_as_feedback (bool): Whether to use scoring results as feedback. Defaults to True. + score_blocked_content (bool): Whether to score blocked responses using partial content. + Defaults to False. Raises: ValueError: If objective_scorer is not a FloatScaleThresholdScorer or @@ -128,6 +131,7 @@ def __init__( self.refusal_scorer = refusal_scorer self.auxiliary_scorers = auxiliary_scorers or [] self.use_score_as_feedback = use_score_as_feedback + self.score_blocked_content = score_blocked_content @property def threshold(self) -> float: @@ -283,6 +287,7 @@ def __init__( parent_id: Optional[str] = None, prompt_normalizer: Optional[PromptNormalizer] = None, initial_prompt: Optional[Message] = None, + score_blocked_content: bool = False, ) -> None: """ Initialize a tree node. @@ -306,6 +311,8 @@ def __init__( prompt_normalizer (Optional[PromptNormalizer]): Normalizer for handling prompts and responses. initial_prompt (Optional[Message]): Initial message to send for the first turn, bypassing adversarial chat generation. Supports multimodal messages. + score_blocked_content (bool): If True, blocked responses with partial content will be + scored using that content. Defaults to False. """ # Store configuration self._objective_target = objective_target @@ -322,6 +329,7 @@ def __init__( self._attack_id = attack_id self._attack_strategy_name = attack_strategy_name self._memory_labels = memory_labels or {} + self._score_blocked_content = score_blocked_content # Initialize utilities self._memory = CentralMemory.get_memory_instance() @@ -660,6 +668,7 @@ async def _score_response_async(self, *, response: Message, objective: str) -> N role_filter="assistant", objective=objective, skip_on_error_result=False, + score_blocked_content=self._score_blocked_content, ) # Extract objective score @@ -783,6 +792,7 @@ def duplicate(self) -> "_TreeOfAttacksNode": desired_response_prefix=self._desired_response_prefix, parent_id=self.node_id, prompt_normalizer=self._prompt_normalizer, + score_blocked_content=self._score_blocked_content, ) # Duplicate the conversations to preserve history @@ -1382,11 +1392,13 @@ def __init__( refusal_scorer=attack_scoring_config.refusal_scorer, auxiliary_scorers=attack_scoring_config.auxiliary_scorers or None, use_score_as_feedback=attack_scoring_config.use_score_as_feedback, + score_blocked_content=attack_scoring_config.score_blocked_content, ) self._attack_scoring_config = tap_scoring_config self._auxiliary_scorers = tap_scoring_config.auxiliary_scorers self._objective_scorer = tap_scoring_config.objective_scorer + self._score_blocked_content = tap_scoring_config.score_blocked_content # Use the adversarial chat target for scoring, as in CrescendoAttack self._scoring_target = self._adversarial_chat @@ -1890,6 +1902,7 @@ def _create_attack_node( parent_id=parent_id, prompt_normalizer=self._prompt_normalizer, initial_prompt=initial_prompt, + score_blocked_content=self._score_blocked_content, ) # Add the adversarial chat conversation ID to the context's tracking (ensuring uniqueness) diff --git a/pyrit/executor/attack/single_turn/prompt_sending.py b/pyrit/executor/attack/single_turn/prompt_sending.py index 650d86bd04..451d3b96b3 100644 --- a/pyrit/executor/attack/single_turn/prompt_sending.py +++ b/pyrit/executor/attack/single_turn/prompt_sending.py @@ -102,6 +102,7 @@ def __init__( self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers self._objective_scorer = attack_scoring_config.objective_scorer + self._score_blocked_content = attack_scoring_config.score_blocked_content # Skip criteria could be set directly in the injected prompt normalizer self._prompt_normalizer = prompt_normalizer or PromptNormalizer() @@ -363,6 +364,7 @@ async def _evaluate_response_async( role_filter="assistant", objective=objective, skip_on_error_result=True, + score_blocked_content=self._score_blocked_content, ) if not self._objective_scorer: diff --git a/pyrit/prompt_target/openai/openai_chat_target.py b/pyrit/prompt_target/openai/openai_chat_target.py index 6dfb5f391f..eb9061e586 100644 --- a/pyrit/prompt_target/openai/openai_chat_target.py +++ b/pyrit/prompt_target/openai/openai_chat_target.py @@ -288,6 +288,26 @@ def _check_content_filter(self, response: Any) -> bool: pass return False + def _extract_partial_content(self, response: Any) -> Optional[str]: + """ + Extract partial content from a Chat Completions response with finish_reason=content_filter. + + When Azure Content Safety triggers mid-generation, the model may have produced partial + text in ``response.choices[0].message.content`` before being cut off. + + Args: + response: A ChatCompletion object from the OpenAI SDK. + + Returns: + The partial text content, or None if no content was generated. + """ + try: + if response.choices and response.choices[0].message and response.choices[0].message.content: + return response.choices[0].message.content + except (AttributeError, IndexError): + pass + return None + def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]: """ Validate a Chat Completions API response for errors. diff --git a/pyrit/prompt_target/openai/openai_response_target.py b/pyrit/prompt_target/openai/openai_response_target.py index dbe71e5406..b4f51dc9b3 100644 --- a/pyrit/prompt_target/openai/openai_response_target.py +++ b/pyrit/prompt_target/openai/openai_response_target.py @@ -454,6 +454,34 @@ def _check_content_filter(self, response: Any) -> bool: return _is_content_filter_error(response_dict) return False + def _extract_partial_content(self, response: Any) -> Optional[str]: + """ + Extract partial content from a Response API response that was content-filtered. + + The Response API may include partial text in ``response.output`` message sections + even when the response has a content filter error. + + Args: + response: A Response object from the OpenAI SDK. + + Returns: + The partial text content, or None if no content was generated. + """ + try: + if not hasattr(response, "output") or not response.output: + return None + parts: list[str] = [] + for section in response.output: + if getattr(section, "type", None) == MessagePieceType.MESSAGE: + content = getattr(section, "content", None) + if content and len(content) > 0: + text = getattr(content[0], "text", None) + if text: + parts.append(text) + return "\n".join(parts) if parts else None + except (AttributeError, IndexError, TypeError): + return None + def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]: """ Validate a Response API response for errors. diff --git a/pyrit/prompt_target/openai/openai_target.py b/pyrit/prompt_target/openai/openai_target.py index 8058a2b7fd..885f650894 100644 --- a/pyrit/prompt_target/openai/openai_target.py +++ b/pyrit/prompt_target/openai/openai_target.py @@ -559,6 +559,10 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece) """ Handle content filter errors by creating a proper error Message. + If the subclass provides partial content via ``_extract_partial_content``, + it is attached to each response piece as ``prompt_metadata["partial_content"]`` + so that scorers with ``score_blocked_content=True`` can evaluate it. + Args: response: The response object from OpenAI SDK. request: The original request message piece. @@ -567,13 +571,37 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece) Message object with error type indicating content was filtered. """ logger.warning("Output content filtered by content policy.") - return handle_bad_request_exception( + + partial_content = self._extract_partial_content(response) + + error_message = handle_bad_request_exception( response_text=response.model_dump_json(), request=request, error_code=200, is_content_filter=True, ) + if partial_content: + for piece in error_message.message_pieces: + piece.prompt_metadata["partial_content"] = partial_content + + return error_message + + def _extract_partial_content(self, response: Any) -> Optional[str]: + """ + Extract any partial content the model generated before the content filter triggered. + + Override this in subclasses to extract partial content from API-specific response + structures. The base implementation returns None (no partial content). + + Args: + response: The response object from OpenAI SDK. + + Returns: + The partial text content, or None if no content was generated. + """ + return None + def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]: """ Validate the response and return error Message if needed. diff --git a/pyrit/score/conversation_scorer.py b/pyrit/score/conversation_scorer.py index c3bcbf4f87..7908d27404 100644 --- a/pyrit/score/conversation_scorer.py +++ b/pyrit/score/conversation_scorer.py @@ -33,7 +33,9 @@ class ConversationScorer(Scorer, ABC): enforce_all_pieces_valid=True, ) - async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]: + async def _score_async( + self, message: Message, *, objective: Optional[str] = None, score_blocked_content: bool = False + ) -> list[Score]: """ Scores the entire conversation history by concatenating all messages and passing to the wrapped scorer. @@ -41,6 +43,8 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non message (Message): A message from the conversation to be scored. The conversation ID from the first message piece is used to retrieve the full conversation from memory. objective (Optional[str]): Optional objective to evaluate against. + score_blocked_content (bool): If True, blocked pieces with partial content will be + substituted with text copies for scoring. Defaults to False. Returns: list[Score]: List of Score objects from the underlying scorer diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py index 1a011823fd..4288926c39 100644 --- a/pyrit/score/scorer.py +++ b/pyrit/score/scorer.py @@ -163,6 +163,7 @@ async def score_async( role_filter: Optional[ChatMessageRole] = None, skip_on_error_result: bool = False, infer_objective_from_request: bool = False, + score_blocked_content: bool = False, ) -> list[Score]: """ Score the message, add the results to the database, and return a list of Score objects. @@ -177,6 +178,9 @@ async def score_async( skip_on_error_result (bool): If True, skip scoring if the message contains an error. Defaults to False. infer_objective_from_request (bool): If True, infer the objective from the message's previous request when objective is not provided. Defaults to False. + score_blocked_content (bool): If True, blocked responses that contain partial content + (in prompt_metadata["partial_content"]) will be scored using that content instead + of being filtered out or short-circuited. Defaults to False. Returns: list[Score]: A list of Score objects representing the results. @@ -192,8 +196,12 @@ async def score_async( return [] if skip_on_error_result and message.is_error(): - logger.debug("Skipping scoring due to error in message and skip_on_error=True.") - return [] + # When score_blocked_content is enabled and the message has partial content, + # don't skip — let _score_async handle the substitution. + has_partial = any("partial_content" in p.prompt_metadata for p in message.message_pieces if p.is_blocked()) + if not (score_blocked_content and has_partial): + logger.debug("Skipping scoring due to error in message and skip_on_error=True.") + return [] if infer_objective_from_request and (not objective): objective = self._extract_objective_from_response(message) @@ -202,6 +210,7 @@ async def score_async( scores = await self._score_async( message, objective=objective, + score_blocked_content=score_blocked_content, ) except PyritException as e: # Re-raise PyRIT exceptions with enhanced context while preserving type for retry decorators @@ -217,7 +226,9 @@ async def score_async( return scores - async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]: + async def _score_async( + self, message: Message, *, objective: Optional[str] = None, score_blocked_content: bool = False + ) -> list[Score]: """ Score the given request response asynchronously. @@ -225,9 +236,16 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non and returns a flattened list of scores. Subclasses can override this method to implement custom scoring logic (e.g., aggregating scores). + When score_blocked_content is True, blocked pieces with partial content in + prompt_metadata["partial_content"] are substituted with text-type copies + (with response_error="none") so they pass the validator and are scored + by the LLM without triggering blocked short-circuits. + Args: message (Message): The message to score. objective (Optional[str]): The objective to evaluate against. Defaults to None. + score_blocked_content (bool): If True, substitute blocked pieces that have + partial content with text-type copies. Defaults to False. Returns: list[Score]: A list of Score objects. @@ -238,6 +256,20 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non # Score only the supported pieces supported_pieces = self._get_supported_pieces(message) + # When score_blocked_content is enabled, substitute blocked pieces that have partial content. + # Substitutes replace the original blocked piece (if present) or are added if not. + if score_blocked_content: + already_supported_ids = {p.id for p in supported_pieces} + for piece in message.message_pieces: + if piece.is_blocked() and "partial_content" in piece.prompt_metadata: + substitute = self._create_text_piece_from_blocked(piece) + if substitute and self._validator.is_message_piece_supported(message_piece=substitute): + # Replace original blocked piece if it was already in supported_pieces + if piece.id in already_supported_ids: + supported_pieces = [substitute if p.id == piece.id else p for p in supported_pieces] + else: + supported_pieces.append(substitute) + tasks = [self._score_piece_async(message_piece=piece, objective=objective) for piece in supported_pieces] if not tasks: @@ -253,6 +285,44 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Optional[str] = None) -> list[Score]: raise NotImplementedError + @staticmethod + def _create_text_piece_from_blocked(piece: MessagePiece) -> Optional[MessagePiece]: + """ + Create a text-typed copy of a blocked MessagePiece using its partial content. + + The substitute preserves the original piece's id (so scores link back correctly), + sets converted_value to the partial content with converted_value_data_type="text", + and sets response_error="none" so scorer short-circuits (e.g., refusal scorer's + blocked check) do not fire. + + Args: + piece: A blocked MessagePiece with prompt_metadata["partial_content"]. + + Returns: + MessagePiece with text content, or None if partial content is empty. + """ + partial_content = str(piece.prompt_metadata.get("partial_content", "")) + if not partial_content: + return None + + return MessagePiece( + id=piece.id, + role=piece.api_role, + original_value=piece.original_value, + converted_value=partial_content, + original_value_data_type=piece.original_value_data_type, + converted_value_data_type="text", + conversation_id=piece.conversation_id, + sequence=piece.sequence, + labels=piece.labels, + prompt_metadata=piece.prompt_metadata, + converter_identifiers=list(piece.converter_identifiers), # type: ignore[arg-type] + prompt_target_identifier=piece.prompt_target_identifier, + attack_identifier=piece.attack_identifier, + response_error="none", + timestamp=piece.timestamp, + ) + def _get_supported_pieces(self, message: Message) -> list[MessagePiece]: """ Get a list of supported message pieces for this scorer. @@ -713,6 +783,7 @@ async def score_response_async( role_filter: ChatMessageRole = "assistant", objective: Optional[str] = None, skip_on_error_result: bool = True, + score_blocked_content: bool = False, ) -> dict[str, list[Score]]: """ Score a response using an objective scorer and optional auxiliary scorers. @@ -725,6 +796,8 @@ async def score_response_async( Defaults to "assistant" (real responses only, not simulated). objective (Optional[str]): Task/objective for scoring context. Defaults to None. skip_on_error_result (bool): If True, skip scoring pieces that have errors. Defaults to True. + score_blocked_content (bool): If True, blocked responses with partial content will be + scored using that content. Defaults to False. Returns: Dict[str, List[Score]]: Dictionary with keys `auxiliary_scores` and `objective_scores` @@ -747,6 +820,7 @@ async def score_response_async( role_filter=role_filter, objective=objective, skip_on_error_result=skip_on_error_result, + score_blocked_content=score_blocked_content, ) result["auxiliary_scores"] = aux_scores # objective_scores remains empty @@ -760,12 +834,14 @@ async def score_response_async( role_filter=role_filter, objective=objective, skip_on_error_result=skip_on_error_result, + score_blocked_content=score_blocked_content, ) obj_task = objective_scorer.score_async( message=response, objective=objective, skip_on_error_result=skip_on_error_result, role_filter=role_filter, + score_blocked_content=score_blocked_content, ) aux_scores, obj_scores = await asyncio.gather(aux_task, obj_task) result["auxiliary_scores"] = aux_scores @@ -776,6 +852,7 @@ async def score_response_async( objective=objective, skip_on_error_result=skip_on_error_result, role_filter=role_filter, + score_blocked_content=score_blocked_content, ) result["objective_scores"] = obj_scores return result @@ -788,6 +865,7 @@ async def score_response_multiple_scorers_async( role_filter: ChatMessageRole = "assistant", objective: Optional[str] = None, skip_on_error_result: bool = True, + score_blocked_content: bool = False, ) -> list[Score]: """ Score a response using multiple scorers in parallel. @@ -802,6 +880,8 @@ async def score_response_multiple_scorers_async( Defaults to "assistant" (real responses only, not simulated). objective (Optional[str]): Optional objective description for scoring context. skip_on_error_result (bool): If True, skip scoring pieces that have errors (default: True). + score_blocked_content (bool): If True, blocked responses with partial content will be + scored using that content. Defaults to False. Returns: List[Score]: All scores from all scorers @@ -816,6 +896,7 @@ async def score_response_multiple_scorers_async( objective=objective, role_filter=role_filter, skip_on_error_result=skip_on_error_result, + score_blocked_content=score_blocked_content, ) for scorer in scorers ] diff --git a/pyrit/score/true_false/float_scale_threshold_scorer.py b/pyrit/score/true_false/float_scale_threshold_scorer.py index 5d35d52d0d..0271c9ad3d 100644 --- a/pyrit/score/true_false/float_scale_threshold_scorer.py +++ b/pyrit/score/true_false/float_scale_threshold_scorer.py @@ -79,6 +79,7 @@ async def _score_async( *, objective: Optional[str] = None, role_filter: Optional[ChatMessageRole] = None, + score_blocked_content: bool = False, ) -> list[Score]: """ Scores the piece using the underlying float-scale scorer and thresholds the resulting score. @@ -88,6 +89,8 @@ async def _score_async( objective (Optional[str]): The objective to evaluate against (the original attacker model's objective). Defaults to None. role_filter (Optional[ChatMessageRole]): Optional filter for message roles. Defaults to None. + score_blocked_content (bool): If True, blocked pieces with partial content will be + substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A list containing a single true/false Score object based on the threshold comparison. @@ -96,6 +99,7 @@ async def _score_async( message, objective=objective, role_filter=role_filter, + score_blocked_content=score_blocked_content, ) # Aggregator handles 0-many scores and returns exactly one result (or raises if configured) diff --git a/pyrit/score/true_false/true_false_scorer.py b/pyrit/score/true_false/true_false_scorer.py index 6b6e79815e..3017895660 100644 --- a/pyrit/score/true_false/true_false_scorer.py +++ b/pyrit/score/true_false/true_false_scorer.py @@ -104,7 +104,9 @@ def get_scorer_metrics(self) -> Optional["ObjectiveScorerMetrics"]: return find_objective_metrics_by_eval_hash(eval_hash=eval_hash, file_path=result_file) - async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]: + async def _score_async( + self, message: Message, *, objective: Optional[str] = None, score_blocked_content: bool = False + ) -> list[Score]: """ Score the given request response asynchronously. @@ -113,6 +115,8 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non Args: message (Message): The message to score. objective (Optional[str]): The objective to evaluate against. Defaults to None. + score_blocked_content (bool): If True, blocked pieces with partial content will be + substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A list containing a single true/false Score object. @@ -121,7 +125,9 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non ValueError: If no pieces are scored and cannot determine a piece ID for the return score. """ # Get individual scores for all supported pieces using base implementation logic - score_list = await super()._score_async(message, objective=objective) + score_list = await super()._score_async( + message, objective=objective, score_blocked_content=score_blocked_content + ) if not score_list: # If no pieces matched (e.g., due to role filter or if all pieces filtered), return False diff --git a/tests/unit/executor/attack/core/test_attack_config.py b/tests/unit/executor/attack/core/test_attack_config.py index bc3a822f67..8355b16cf6 100644 --- a/tests/unit/executor/attack/core/test_attack_config.py +++ b/tests/unit/executor/attack/core/test_attack_config.py @@ -76,3 +76,20 @@ def test_init_with_use_score_as_feedback_false(self): config = AttackScoringConfig(use_score_as_feedback=False) assert config.use_score_as_feedback is False + + def test_score_blocked_content_default_is_false(self): + """Test that score_blocked_content defaults to False.""" + config = AttackScoringConfig() + assert config.score_blocked_content is False + + def test_score_blocked_content_can_set_to_true(self): + """Test that score_blocked_content can be set to True.""" + config = AttackScoringConfig(score_blocked_content=True) + assert config.score_blocked_content is True + + def test_score_blocked_content_with_valid_scorers(self): + """Test that score_blocked_content works with valid scorers.""" + mock_scorer = MagicMock(spec=TrueFalseScorer) + config = AttackScoringConfig(objective_scorer=mock_scorer, score_blocked_content=True) + assert config.score_blocked_content is True + assert config.objective_scorer is mock_scorer diff --git a/tests/unit/prompt_target/target/test_openai_chat_target.py b/tests/unit/prompt_target/target/test_openai_chat_target.py index 59395a270f..a1796f5304 100644 --- a/tests/unit/prompt_target/target/test_openai_chat_target.py +++ b/tests/unit/prompt_target/target/test_openai_chat_target.py @@ -1596,6 +1596,62 @@ async def test_save_audio_response_async_pcm16_format(patch_central_database): assert result == "/path/to/saved/audio.wav" +# ── _extract_partial_content tests ────────────────────────────────────────── + + +class TestExtractPartialContentChatTarget: + def test_extracts_partial_content_from_content_filter_response(self, target: OpenAIChatTarget): + mock_response = create_mock_completion( + content="Partial harmful content before cutoff", finish_reason="content_filter" + ) + result = target._extract_partial_content(mock_response) + assert result == "Partial harmful content before cutoff" + + def test_returns_none_when_no_content(self, target: OpenAIChatTarget): + mock_response = create_mock_completion(content=None, finish_reason="content_filter") + result = target._extract_partial_content(mock_response) + assert result is None + + def test_returns_none_when_empty_content(self, target: OpenAIChatTarget): + mock_response = create_mock_completion(content="", finish_reason="content_filter") + result = target._extract_partial_content(mock_response) + assert result is None + + def test_returns_none_when_no_choices(self, target: OpenAIChatTarget): + mock_response = MagicMock(spec=ChatCompletion) + mock_response.choices = [] + result = target._extract_partial_content(mock_response) + assert result is None + + +class TestContentFilterPreservesPartialContent: + async def test_200_content_filter_attaches_partial_content_metadata(self, target: OpenAIChatTarget): + """Integration: 200 + content_filter response preserves partial content in metadata.""" + message = Message( + message_pieces=[MessagePiece(role="user", conversation_id="test-convo", original_value="test prompt")] + ) + mock_completion = create_mock_completion(content="Harmful partial content here", finish_reason="content_filter") + target._async_client.chat.completions.create = AsyncMock(return_value=mock_completion) # type: ignore[method-assign] + + response = await target.send_prompt_async(message=message) + + assert response[0].message_pieces[0].response_error == "blocked" + assert response[0].message_pieces[0].prompt_metadata["partial_content"] == "Harmful partial content here" + + async def test_200_content_filter_no_metadata_when_no_content(self, target: OpenAIChatTarget): + """200 + content_filter with no content doesn't attach metadata.""" + message = Message( + message_pieces=[MessagePiece(role="user", conversation_id="test-convo", original_value="test prompt")] + ) + mock_completion = create_mock_completion(content=None, finish_reason="content_filter") + target._async_client.chat.completions.create = AsyncMock(return_value=mock_completion) # type: ignore[method-assign] + + response = await target.send_prompt_async(message=message) + + assert response[0].message_pieces[0].response_error == "blocked" + assert "partial_content" not in response[0].message_pieces[0].prompt_metadata + + async def test_save_audio_response_async_flac_format(patch_central_database): """Test saving audio response with flac format.""" audio_config = OpenAIChatAudioConfig(voice="alloy", audio_format="flac") diff --git a/tests/unit/prompt_target/target/test_openai_response_target.py b/tests/unit/prompt_target/target/test_openai_response_target.py index 95f6e238f8..0dfb02cf18 100644 --- a/tests/unit/prompt_target/target/test_openai_response_target.py +++ b/tests/unit/prompt_target/target/test_openai_response_target.py @@ -1043,6 +1043,48 @@ def test_invalid_top_p_raises(patch_central_database): # Unit tests for override methods +class TestExtractPartialContentResponseTarget: + def test_extracts_text_from_message_sections(self): + from pyrit.prompt_target.openai.openai_response_target import MessagePieceType + + target = OpenAIResponseTarget(model_name="gpt-4", endpoint="https://test.com", api_key="test") + + section = MagicMock() + section.type = MessagePieceType.MESSAGE + content_item = MagicMock() + content_item.text = "Partial response text" + section.content = [content_item] + + mock_response = MagicMock() + mock_response.output = [section] + + result = target._extract_partial_content(mock_response) + assert result == "Partial response text" + + def test_returns_none_when_no_output(self): + target = OpenAIResponseTarget(model_name="gpt-4", endpoint="https://test.com", api_key="test") + + mock_response = MagicMock() + mock_response.output = [] + + result = target._extract_partial_content(mock_response) + assert result is None + + def test_ignores_non_message_sections(self): + from pyrit.prompt_target.openai.openai_response_target import MessagePieceType + + target = OpenAIResponseTarget(model_name="gpt-4", endpoint="https://test.com", api_key="test") + + section = MagicMock() + section.type = MessagePieceType.REASONING + + mock_response = MagicMock() + mock_response.output = [section] + + result = target._extract_partial_content(mock_response) + assert result is None + + def test_check_content_filter_detects_filtered_response(target: OpenAIResponseTarget): """Test _check_content_filter detects content_filter error code.""" mock_response = MagicMock() diff --git a/tests/unit/score/test_scorer.py b/tests/unit/score/test_scorer.py index a35fbe3cb1..8e5335bd52 100644 --- a/tests/unit/score/test_scorer.py +++ b/tests/unit/score/test_scorer.py @@ -587,10 +587,18 @@ async def test_score_response_async_parallel_execution(): assert score1_1 in result["auxiliary_scores"] assert score2_1 in result["auxiliary_scores"] scorer1.score_async.assert_any_call( - message=response, objective="test task", role_filter="assistant", skip_on_error_result=True + message=response, + objective="test task", + role_filter="assistant", + skip_on_error_result=True, + score_blocked_content=False, ) scorer2.score_async.assert_any_call( - message=response, objective="test task", role_filter="assistant", skip_on_error_result=True + message=response, + objective="test task", + role_filter="assistant", + skip_on_error_result=True, + score_blocked_content=False, ) @@ -1465,3 +1473,336 @@ async def test_score_value_with_llm_skips_reasoning_piece(good_json): assert result.raw_score_value == "1" assert result.score_rationale == "Valid response" + + +# ── Helpers for score_blocked_content tests ────────────────────────────────── + + +class _AcceptAllValidator(ScorerPromptValidator): + """Validator that accepts all pieces (like SelfAskRefusalScorer's default).""" + + def validate(self, message: Message, objective: Optional[str] = None) -> None: + pass + + def is_message_piece_supported(self, message_piece: MessagePiece) -> bool: + return True + + +class _TextOnlyValidator(ScorerPromptValidator): + """Validator that only accepts text pieces (like SelfAskTrueFalseScorer's default).""" + + def __init__(self) -> None: + super().__init__(supported_data_types=["text", "image_path"]) + + def validate(self, message: Message, objective: Optional[str] = None) -> None: + pass + + +class _BlockedContentScorer(TrueFalseScorer): + """A mock TrueFalseScorer that records what pieces it was asked to score.""" + + def __init__(self, *, validator: Optional[ScorerPromptValidator] = None) -> None: + super().__init__(validator=validator or _TextOnlyValidator()) + self.scored_pieces: list[MessagePiece] = [] + + def _build_identifier(self) -> ComponentIdentifier: + return self._create_identifier() + + async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Optional[str] = None) -> list[Score]: + self.scored_pieces.append(message_piece) + return [ + Score( + score_value="true", + score_value_description="desc", + score_type="true_false", + score_category=None, + score_metadata=None, + score_rationale="rationale", + scorer_class_identifier=self.get_identifier(), + message_piece_id=str(message_piece.id), + objective=objective, + ) + ] + + +class _MockRefusalScorer(TrueFalseScorer): + """Mimics SelfAskRefusalScorer: accepts all types, short-circuits on blocked.""" + + def __init__(self) -> None: + super().__init__(validator=_AcceptAllValidator()) + self.scored_pieces: list[MessagePiece] = [] + + def _build_identifier(self) -> ComponentIdentifier: + return self._create_identifier() + + async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Optional[str] = None) -> list[Score]: + self.scored_pieces.append(message_piece) + if message_piece.response_error == "blocked": + return [ + Score( + score_value="true", + score_value_description="Refusal detected", + score_type="true_false", + score_category=None, + score_metadata=None, + score_rationale="Content was filtered, constituting a refusal.", + scorer_class_identifier=self.get_identifier(), + message_piece_id=str(message_piece.id), + objective=objective, + ) + ] + return [ + Score( + score_value="false", + score_value_description="Not a refusal", + score_type="true_false", + score_category=None, + score_metadata=None, + score_rationale="The response contains substantive content.", + scorer_class_identifier=self.get_identifier(), + message_piece_id=str(message_piece.id), + objective=objective, + ) + ] + + +def _make_blocked_piece(*, partial_content: Optional[str] = None, conversation_id: str = "test-convo") -> MessagePiece: + """Create a blocked MessagePiece, optionally with partial content metadata.""" + metadata: dict = {} + if partial_content is not None: + metadata["partial_content"] = partial_content + return MessagePiece( + role="assistant", + original_value='{"status_code": 200, "message": "content_filter"}', + converted_value='{"status_code": 200, "message": "content_filter"}', + original_value_data_type="error", + converted_value_data_type="error", + conversation_id=conversation_id, + response_error="blocked", + prompt_metadata=metadata, + ) + + +def _make_normal_piece(*, conversation_id: str = "test-convo") -> MessagePiece: + """Create a normal text MessagePiece.""" + return MessagePiece( + role="assistant", + original_value="Hello, how can I help?", + conversation_id=conversation_id, + ) + + +# ── _create_text_piece_from_blocked tests ──────────────────────────────────── + + +class TestCreateTextPieceFromBlocked: + def test_returns_text_piece_with_partial_content(self): + piece = _make_blocked_piece(partial_content="Harmful partial text here") + substitute = Scorer._create_text_piece_from_blocked(piece) + + assert substitute is not None + assert substitute.converted_value == "Harmful partial text here" + assert substitute.converted_value_data_type == "text" + assert substitute.response_error == "none" + assert substitute.id == piece.id + + def test_preserves_original_value(self): + piece = _make_blocked_piece(partial_content="partial") + substitute = Scorer._create_text_piece_from_blocked(piece) + + assert substitute is not None + assert substitute.original_value == piece.original_value + assert substitute.original_value_data_type == piece.original_value_data_type + + def test_returns_none_when_no_partial_content(self): + piece = _make_blocked_piece() + assert Scorer._create_text_piece_from_blocked(piece) is None + + def test_returns_none_when_empty_partial_content(self): + piece = _make_blocked_piece(partial_content="") + assert Scorer._create_text_piece_from_blocked(piece) is None + + def test_preserves_conversation_id(self): + piece = _make_blocked_piece(partial_content="partial") + substitute = Scorer._create_text_piece_from_blocked(piece) + assert substitute is not None + assert substitute.conversation_id == piece.conversation_id + + def test_response_error_is_none_not_blocked(self): + """Substitute must have response_error='none' so refusal short-circuits don't fire.""" + piece = _make_blocked_piece(partial_content="partial text") + substitute = Scorer._create_text_piece_from_blocked(piece) + assert substitute is not None + assert substitute.response_error == "none" + assert not substitute.is_blocked() + assert not substitute.has_error() + + +# ── score_async with score_blocked_content tests ───────────────────────────── + + +@pytest.mark.usefixtures("patch_central_database") +class TestScoreAsyncWithBlockedContent: + async def test_default_false_skips_blocked_piece_text_only_scorer(self): + """Default behavior: text-only scorer filters out blocked error-type pieces.""" + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + scores = await scorer.score_async(msg, score_blocked_content=False) + + assert len(scores) == 1 + assert scores[0].score_value == "false" + assert len(scorer.scored_pieces) == 0 + + async def test_true_substitutes_blocked_piece_for_text_only_scorer(self): + """With flag on, text-only scorer gets a text substitute and scores it.""" + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + scores = await scorer.score_async(msg, score_blocked_content=True) + + assert len(scores) == 1 + assert scores[0].score_value == "true" + assert len(scorer.scored_pieces) == 1 + assert scorer.scored_pieces[0].converted_value == "harmful text" + assert scorer.scored_pieces[0].converted_value_data_type == "text" + + async def test_refusal_scorer_short_circuits_on_blocked_by_default(self): + """Refusal scorer (accepts all types) sees original blocked piece, returns True.""" + scorer = _MockRefusalScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + scores = await scorer.score_async(msg, score_blocked_content=False) + + assert len(scores) == 1 + assert scores[0].score_value == "true" + assert scorer.scored_pieces[0].response_error == "blocked" + + async def test_refusal_scorer_evaluates_partial_content_when_flag_on(self): + """With flag on, refusal scorer gets substitute (response_error=none), evaluates via LLM path.""" + scorer = _MockRefusalScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + scores = await scorer.score_async(msg, score_blocked_content=True) + + assert len(scores) == 1 + assert scores[0].score_value == "false" + assert scorer.scored_pieces[0].response_error == "none" + assert scorer.scored_pieces[0].converted_value == "harmful text" + + async def test_no_substitute_when_no_partial_content(self): + """400 full block with no partial content: no substitute, same behavior.""" + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece()]) + + scores = await scorer.score_async(msg, score_blocked_content=True) + + assert len(scores) == 1 + assert scores[0].score_value == "false" + assert len(scorer.scored_pieces) == 0 + + async def test_normal_piece_unaffected_by_flag(self): + """Normal text pieces are scored the same regardless of flag.""" + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_normal_piece()]) + + scores_off = await scorer.score_async(msg, score_blocked_content=False) + scorer.scored_pieces.clear() + scores_on = await scorer.score_async(msg, score_blocked_content=True) + + assert scores_off[0].score_value == scores_on[0].score_value + + async def test_mixed_pieces_only_blocked_substituted(self): + """In a multi-piece message, only blocked pieces get substituted.""" + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_normal_piece(), _make_blocked_piece(partial_content="partial harmful")]) + + scores = await scorer.score_async(msg, score_blocked_content=True) + + assert len(scores) == 1 # TrueFalseScorer aggregates + assert len(scorer.scored_pieces) == 2 + assert scorer.scored_pieces[0].converted_value == "Hello, how can I help?" + assert scorer.scored_pieces[1].converted_value == "partial harmful" + assert scorer.scored_pieces[1].response_error == "none" + + +# ── skip_on_error_result interaction tests ─────────────────────────────────── + + +@pytest.mark.usefixtures("patch_central_database") +class TestSkipOnErrorWithBlockedContent: + async def test_skip_on_error_true_without_flag_skips_blocked(self): + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + scores = await scorer.score_async(msg, skip_on_error_result=True, score_blocked_content=False) + assert scores == [] + + async def test_skip_on_error_true_with_flag_does_not_skip_when_partial_content(self): + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + scores = await scorer.score_async(msg, skip_on_error_result=True, score_blocked_content=True) + assert len(scores) == 1 + assert scores[0].score_value == "true" + + async def test_skip_on_error_true_with_flag_still_skips_when_no_partial_content(self): + scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece()]) + + scores = await scorer.score_async(msg, skip_on_error_result=True, score_blocked_content=True) + assert scores == [] + + +# ── score_response_async passthrough tests ─────────────────────────────────── + + +@pytest.mark.usefixtures("patch_central_database") +class TestScoreResponseAsyncBlockedContent: + async def test_score_response_async_passes_flag_to_scorers(self): + obj_scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + result = await Scorer.score_response_async( + response=msg, + objective_scorer=obj_scorer, + objective="test", + skip_on_error_result=False, + score_blocked_content=True, + ) + + assert len(result["objective_scores"]) == 1 + assert result["objective_scores"][0].score_value == "true" + assert obj_scorer.scored_pieces[0].converted_value == "harmful text" + + async def test_score_response_async_default_does_not_substitute(self): + obj_scorer = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + result = await Scorer.score_response_async( + response=msg, + objective_scorer=obj_scorer, + objective="test", + skip_on_error_result=False, + score_blocked_content=False, + ) + + assert result["objective_scores"][0].score_value == "false" + assert len(obj_scorer.scored_pieces) == 0 + + async def test_score_response_multiple_scorers_passes_flag(self): + scorer1 = _BlockedContentScorer() + scorer2 = _BlockedContentScorer() + msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")]) + + scores = await Scorer.score_response_multiple_scorers_async( + response=msg, + scorers=[scorer1, scorer2], + objective="test", + skip_on_error_result=False, + score_blocked_content=True, + ) + + assert len(scores) == 2 + assert len(scorer1.scored_pieces) == 1 + assert len(scorer2.scored_pieces) == 1 From e6fae9269bc384ab035da3ac4553b8721105f9b6 Mon Sep 17 00:00:00 2001 From: jsong468 Date: Mon, 4 May 2026 15:55:45 -0700 Subject: [PATCH 2/5] docstring --- pyrit/score/scorer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py index 4288926c39..17026949bc 100644 --- a/pyrit/score/scorer.py +++ b/pyrit/score/scorer.py @@ -175,7 +175,9 @@ async def score_async( role_filter (Optional[ChatMessageRole]): Only score messages with this exact stored role. Use "assistant" to score only real assistant responses, or "simulated_assistant" to score only simulated responses. Defaults to None (no filtering). - skip_on_error_result (bool): If True, skip scoring if the message contains an error. Defaults to False. + skip_on_error_result (bool): If True, skip scoring if the message contains an error. If True + but score_blocked_content is also True, blocked content will be scored in the case of a + content filter triggered error instead of skipping. Defaults to False. infer_objective_from_request (bool): If True, infer the objective from the message's previous request when objective is not provided. Defaults to False. score_blocked_content (bool): If True, blocked responses that contain partial content From 9a4505acbfac0784e7fef22746d0600bf11a8f86 Mon Sep 17 00:00:00 2001 From: jsong468 Date: Mon, 4 May 2026 16:53:30 -0700 Subject: [PATCH 3/5] fix unit tests --- pyrit/score/true_false/true_false_composite_scorer.py | 10 +++++++++- pyrit/score/true_false/true_false_inverter_scorer.py | 4 ++++ .../executor/attack/single_turn/test_prompt_sending.py | 3 +++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pyrit/score/true_false/true_false_composite_scorer.py b/pyrit/score/true_false/true_false_composite_scorer.py index d40d3874b3..d3d08238c1 100644 --- a/pyrit/score/true_false/true_false_composite_scorer.py +++ b/pyrit/score/true_false/true_false_composite_scorer.py @@ -75,6 +75,7 @@ async def _score_async( *, objective: Optional[str] = None, role_filter: Optional[ChatMessageRole] = None, + score_blocked_content: bool = False, ) -> list[Score]: """ Score a request/response by combining results from all constituent scorers. @@ -83,6 +84,8 @@ async def _score_async( message (Message): The request/response to score. objective (Optional[str]): Scoring objective or context. role_filter (Optional[ChatMessageRole]): Optional filter for message roles. Defaults to None. + score_blocked_content (bool): If True, blocked pieces with partial content will be + substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A single-element list with the aggregated true/false score. @@ -92,7 +95,12 @@ async def _score_async( ValueError: If no scores are generated from the request response pieces. """ tasks = [ - scorer.score_async(message=message, objective=objective, role_filter=role_filter) + scorer.score_async( + message=message, + objective=objective, + role_filter=role_filter, + score_blocked_content=score_blocked_content, + ) for scorer in self._scorers ] diff --git a/pyrit/score/true_false/true_false_inverter_scorer.py b/pyrit/score/true_false/true_false_inverter_scorer.py index 5b3a1404cd..e3d42f4427 100644 --- a/pyrit/score/true_false/true_false_inverter_scorer.py +++ b/pyrit/score/true_false/true_false_inverter_scorer.py @@ -53,6 +53,7 @@ async def _score_async( *, objective: Optional[str] = None, role_filter: Optional[ChatMessageRole] = None, + score_blocked_content: bool = False, ) -> list[Score]: """ Scores the piece using the underlying true-false scorer and returns the inverted score. @@ -62,6 +63,8 @@ async def _score_async( objective (Optional[str]): The objective to evaluate against (the original attacker model's objective). Defaults to None. role_filter (Optional[ChatMessageRole]): Optional filter for message roles. Defaults to None. + score_blocked_content (bool): If True, blocked pieces with partial content will be + substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A list containing a single Score object with the inverted true/false value. @@ -70,6 +73,7 @@ async def _score_async( message, objective=objective, role_filter=role_filter, + score_blocked_content=score_blocked_content, ) # TrueFalseScorers only have a single score diff --git a/tests/unit/executor/attack/single_turn/test_prompt_sending.py b/tests/unit/executor/attack/single_turn/test_prompt_sending.py index 132e98b1a9..00b1d245a0 100644 --- a/tests/unit/executor/attack/single_turn/test_prompt_sending.py +++ b/tests/unit/executor/attack/single_turn/test_prompt_sending.py @@ -458,6 +458,7 @@ async def test_evaluate_response_with_objective_scorer_returns_score( role_filter="assistant", objective="Test objective", skip_on_error_result=True, + score_blocked_content=False, ) async def test_evaluate_response_without_objective_scorer_returns_none(self, mock_target, sample_response): @@ -480,6 +481,7 @@ async def test_evaluate_response_without_objective_scorer_returns_none(self, moc role_filter="assistant", objective="Test objective", skip_on_error_result=True, + score_blocked_content=False, ) async def test_evaluate_response_with_auxiliary_scorers( @@ -522,6 +524,7 @@ async def test_evaluate_response_with_auxiliary_scorers( role_filter="assistant", objective="Test objective", skip_on_error_result=True, + score_blocked_content=False, ) From fc6c7e75f1e4b61825583abd8731dc46dc6547e5 Mon Sep 17 00:00:00 2001 From: jsong468 Date: Tue, 5 May 2026 16:53:20 -0700 Subject: [PATCH 4/5] fix conversation_scorer bug and score_async --- pyrit/score/conversation_scorer.py | 22 ++++-- pyrit/score/scorer.py | 67 ++++++++++++------- .../float_scale_threshold_scorer.py | 4 -- .../true_false/true_false_composite_scorer.py | 10 +-- .../true_false/true_false_inverter_scorer.py | 4 -- pyrit/score/true_false/true_false_scorer.py | 10 +-- 6 files changed, 60 insertions(+), 57 deletions(-) diff --git a/pyrit/score/conversation_scorer.py b/pyrit/score/conversation_scorer.py index 7908d27404..d1dad443bc 100644 --- a/pyrit/score/conversation_scorer.py +++ b/pyrit/score/conversation_scorer.py @@ -33,9 +33,7 @@ class ConversationScorer(Scorer, ABC): enforce_all_pieces_valid=True, ) - async def _score_async( - self, message: Message, *, objective: Optional[str] = None, score_blocked_content: bool = False - ) -> list[Score]: + async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]: """ Scores the entire conversation history by concatenating all messages and passing to the wrapped scorer. @@ -43,8 +41,6 @@ async def _score_async( message (Message): A message from the conversation to be scored. The conversation ID from the first message piece is used to retrieve the full conversation from memory. objective (Optional[str]): Optional objective to evaluate against. - score_blocked_content (bool): If True, blocked pieces with partial content will be - substituted with text copies for scoring. Defaults to False. Returns: list[Score]: List of Score objects from the underlying scorer @@ -67,6 +63,14 @@ async def _score_async( # Build the full conversation text conversation_text = "" + # Check if the caller requested scoring of blocked content by inspecting whether + # the incoming message was substituted by score_async._apply_blocked_content_substitution. + # A substituted piece has partial_content in metadata but response_error="none". + incoming_piece = message.message_pieces[0] + use_partial_content = ( + "partial_content" in incoming_piece.prompt_metadata and incoming_piece.response_error == "none" + ) + # Goes through each message in the conversation and appends user/assistant messages only # Explicitly excludes system, tool, developer messages from being scored/included in conversation history # they are allowed in validation but not included in the scored conversation text @@ -75,7 +79,13 @@ async def _score_async( # Only include user and assistant messages in the conversation text if piece.api_role in ["user", "assistant", "tool"]: role_display = "Assistant (simulated)" if piece.is_simulated else piece.api_role.capitalize() - conversation_text += f"{role_display}: {piece.converted_value}\n" + # For blocked pieces with partial content, use the partial content + # instead of the error JSON when score_blocked_content is enabled + if use_partial_content and piece.is_blocked() and "partial_content" in piece.prompt_metadata: + text = str(piece.prompt_metadata["partial_content"]) + else: + text = piece.converted_value + conversation_text += f"{role_display}: {text}\n" # Create a new message with the concatenated conversation text # Preserve the original message piece metadata diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py index 17026949bc..178556ceaf 100644 --- a/pyrit/score/scorer.py +++ b/pyrit/score/scorer.py @@ -208,11 +208,16 @@ async def score_async( if infer_objective_from_request and (not objective): objective = self._extract_objective_from_response(message) + # When score_blocked_content is enabled, create a modified message where blocked pieces + # with partial content are replaced with text-type substitutes (response_error="none"). + # This is done here (not in _score_async) so that _score_async's signature remains + # (self, message, *, objective=None) — preserving backward compatibility for subclasses. + scoring_message = self._apply_blocked_content_substitution(message) if score_blocked_content else message + try: scores = await self._score_async( - message, + scoring_message, objective=objective, - score_blocked_content=score_blocked_content, ) except PyritException as e: # Re-raise PyRIT exceptions with enhanced context while preserving type for retry decorators @@ -228,9 +233,7 @@ async def score_async( return scores - async def _score_async( - self, message: Message, *, objective: Optional[str] = None, score_blocked_content: bool = False - ) -> list[Score]: + async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]: """ Score the given request response asynchronously. @@ -238,16 +241,9 @@ async def _score_async( and returns a flattened list of scores. Subclasses can override this method to implement custom scoring logic (e.g., aggregating scores). - When score_blocked_content is True, blocked pieces with partial content in - prompt_metadata["partial_content"] are substituted with text-type copies - (with response_error="none") so they pass the validator and are scored - by the LLM without triggering blocked short-circuits. - Args: message (Message): The message to score. objective (Optional[str]): The objective to evaluate against. Defaults to None. - score_blocked_content (bool): If True, substitute blocked pieces that have - partial content with text-type copies. Defaults to False. Returns: list[Score]: A list of Score objects. @@ -258,20 +254,6 @@ async def _score_async( # Score only the supported pieces supported_pieces = self._get_supported_pieces(message) - # When score_blocked_content is enabled, substitute blocked pieces that have partial content. - # Substitutes replace the original blocked piece (if present) or are added if not. - if score_blocked_content: - already_supported_ids = {p.id for p in supported_pieces} - for piece in message.message_pieces: - if piece.is_blocked() and "partial_content" in piece.prompt_metadata: - substitute = self._create_text_piece_from_blocked(piece) - if substitute and self._validator.is_message_piece_supported(message_piece=substitute): - # Replace original blocked piece if it was already in supported_pieces - if piece.id in already_supported_ids: - supported_pieces = [substitute if p.id == piece.id else p for p in supported_pieces] - else: - supported_pieces.append(substitute) - tasks = [self._score_piece_async(message_piece=piece, objective=objective) for piece in supported_pieces] if not tasks: @@ -325,6 +307,39 @@ def _create_text_piece_from_blocked(piece: MessagePiece) -> Optional[MessagePiec timestamp=piece.timestamp, ) + def _apply_blocked_content_substitution(self, message: Message) -> Message: + """ + Create a copy of the message where blocked pieces with partial content are substituted. + + Each blocked piece that has prompt_metadata["partial_content"] is replaced with a + text-typed copy (response_error="none", converted_value=partial_content). Non-blocked + pieces and blocked pieces without partial content are kept as-is. + + This is called in score_async (not _score_async) so that subclass overrides of + _score_async do not need to accept the score_blocked_content parameter. + + Args: + message: The original message potentially containing blocked pieces. + + Returns: + A new Message with substituted pieces, or the original if no substitution was needed. + """ + substituted = False + new_pieces: list[MessagePiece] = [] + for piece in message.message_pieces: + if piece.is_blocked() and "partial_content" in piece.prompt_metadata: + substitute = self._create_text_piece_from_blocked(piece) + if substitute: + new_pieces.append(substitute) + substituted = True + continue + new_pieces.append(piece) + + if not substituted: + return message + + return Message(message_pieces=new_pieces) + def _get_supported_pieces(self, message: Message) -> list[MessagePiece]: """ Get a list of supported message pieces for this scorer. diff --git a/pyrit/score/true_false/float_scale_threshold_scorer.py b/pyrit/score/true_false/float_scale_threshold_scorer.py index 0271c9ad3d..5d35d52d0d 100644 --- a/pyrit/score/true_false/float_scale_threshold_scorer.py +++ b/pyrit/score/true_false/float_scale_threshold_scorer.py @@ -79,7 +79,6 @@ async def _score_async( *, objective: Optional[str] = None, role_filter: Optional[ChatMessageRole] = None, - score_blocked_content: bool = False, ) -> list[Score]: """ Scores the piece using the underlying float-scale scorer and thresholds the resulting score. @@ -89,8 +88,6 @@ async def _score_async( objective (Optional[str]): The objective to evaluate against (the original attacker model's objective). Defaults to None. role_filter (Optional[ChatMessageRole]): Optional filter for message roles. Defaults to None. - score_blocked_content (bool): If True, blocked pieces with partial content will be - substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A list containing a single true/false Score object based on the threshold comparison. @@ -99,7 +96,6 @@ async def _score_async( message, objective=objective, role_filter=role_filter, - score_blocked_content=score_blocked_content, ) # Aggregator handles 0-many scores and returns exactly one result (or raises if configured) diff --git a/pyrit/score/true_false/true_false_composite_scorer.py b/pyrit/score/true_false/true_false_composite_scorer.py index d3d08238c1..d40d3874b3 100644 --- a/pyrit/score/true_false/true_false_composite_scorer.py +++ b/pyrit/score/true_false/true_false_composite_scorer.py @@ -75,7 +75,6 @@ async def _score_async( *, objective: Optional[str] = None, role_filter: Optional[ChatMessageRole] = None, - score_blocked_content: bool = False, ) -> list[Score]: """ Score a request/response by combining results from all constituent scorers. @@ -84,8 +83,6 @@ async def _score_async( message (Message): The request/response to score. objective (Optional[str]): Scoring objective or context. role_filter (Optional[ChatMessageRole]): Optional filter for message roles. Defaults to None. - score_blocked_content (bool): If True, blocked pieces with partial content will be - substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A single-element list with the aggregated true/false score. @@ -95,12 +92,7 @@ async def _score_async( ValueError: If no scores are generated from the request response pieces. """ tasks = [ - scorer.score_async( - message=message, - objective=objective, - role_filter=role_filter, - score_blocked_content=score_blocked_content, - ) + scorer.score_async(message=message, objective=objective, role_filter=role_filter) for scorer in self._scorers ] diff --git a/pyrit/score/true_false/true_false_inverter_scorer.py b/pyrit/score/true_false/true_false_inverter_scorer.py index e3d42f4427..5b3a1404cd 100644 --- a/pyrit/score/true_false/true_false_inverter_scorer.py +++ b/pyrit/score/true_false/true_false_inverter_scorer.py @@ -53,7 +53,6 @@ async def _score_async( *, objective: Optional[str] = None, role_filter: Optional[ChatMessageRole] = None, - score_blocked_content: bool = False, ) -> list[Score]: """ Scores the piece using the underlying true-false scorer and returns the inverted score. @@ -63,8 +62,6 @@ async def _score_async( objective (Optional[str]): The objective to evaluate against (the original attacker model's objective). Defaults to None. role_filter (Optional[ChatMessageRole]): Optional filter for message roles. Defaults to None. - score_blocked_content (bool): If True, blocked pieces with partial content will be - substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A list containing a single Score object with the inverted true/false value. @@ -73,7 +70,6 @@ async def _score_async( message, objective=objective, role_filter=role_filter, - score_blocked_content=score_blocked_content, ) # TrueFalseScorers only have a single score diff --git a/pyrit/score/true_false/true_false_scorer.py b/pyrit/score/true_false/true_false_scorer.py index 3017895660..6b6e79815e 100644 --- a/pyrit/score/true_false/true_false_scorer.py +++ b/pyrit/score/true_false/true_false_scorer.py @@ -104,9 +104,7 @@ def get_scorer_metrics(self) -> Optional["ObjectiveScorerMetrics"]: return find_objective_metrics_by_eval_hash(eval_hash=eval_hash, file_path=result_file) - async def _score_async( - self, message: Message, *, objective: Optional[str] = None, score_blocked_content: bool = False - ) -> list[Score]: + async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]: """ Score the given request response asynchronously. @@ -115,8 +113,6 @@ async def _score_async( Args: message (Message): The message to score. objective (Optional[str]): The objective to evaluate against. Defaults to None. - score_blocked_content (bool): If True, blocked pieces with partial content will be - substituted with text copies for scoring. Defaults to False. Returns: list[Score]: A list containing a single true/false Score object. @@ -125,9 +121,7 @@ async def _score_async( ValueError: If no pieces are scored and cannot determine a piece ID for the return score. """ # Get individual scores for all supported pieces using base implementation logic - score_list = await super()._score_async( - message, objective=objective, score_blocked_content=score_blocked_content - ) + score_list = await super()._score_async(message, objective=objective) if not score_list: # If no pieces matched (e.g., due to role filter or if all pieces filtered), return False From c49debc6edcdf2643a35e52c5c7fc528a3516dce Mon Sep 17 00:00:00 2001 From: jsong468 Date: Tue, 5 May 2026 17:03:58 -0700 Subject: [PATCH 5/5] minor truthiness change --- pyrit/score/scorer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py index 178556ceaf..5be8fae7d0 100644 --- a/pyrit/score/scorer.py +++ b/pyrit/score/scorer.py @@ -200,7 +200,9 @@ async def score_async( if skip_on_error_result and message.is_error(): # When score_blocked_content is enabled and the message has partial content, # don't skip — let _score_async handle the substitution. - has_partial = any("partial_content" in p.prompt_metadata for p in message.message_pieces if p.is_blocked()) + has_partial = any( + p.prompt_metadata.get("partial_content") for p in message.message_pieces if p.is_blocked() + ) if not (score_blocked_content and has_partial): logger.debug("Skipping scoring due to error in message and skip_on_error=True.") return []