diff --git a/pyrit/executor/attack/core/attack_config.py b/pyrit/executor/attack/core/attack_config.py
index 7d128ffd79..b11b91aeff 100644
--- a/pyrit/executor/attack/core/attack_config.py
+++ b/pyrit/executor/attack/core/attack_config.py
@@ -53,6 +53,12 @@ class AttackScoringConfig:
     # Whether to use scoring results as feedback for iterative attacks
     use_score_as_feedback: bool = True
 
+    # Whether to score blocked responses using partial content from prompt_metadata["partial_content"].
+    # When True, blocked responses that contain partial model output (e.g., from Azure Content Safety
+    # triggering mid-generation) will be evaluated by scorers instead of being skipped or
+    # auto-classified as failures/refusals.
+    score_blocked_content: bool = False
+
     def __post_init__(self) -> None:
         """
         Validate configuration values.
diff --git a/pyrit/executor/attack/multi_turn/crescendo.py b/pyrit/executor/attack/multi_turn/crescendo.py
index 4e4fab81ef..0c5821c53b 100644
--- a/pyrit/executor/attack/multi_turn/crescendo.py
+++ b/pyrit/executor/attack/multi_turn/crescendo.py
@@ -203,6 +203,7 @@ def __init__(
         self._objective_scorer = objective_scorer
         self._use_score_as_feedback = attack_scoring_config.use_score_as_feedback
         self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers
+        self._score_blocked_content = attack_scoring_config.score_blocked_content
 
         # Initialize refusal scorer - use the one from config if provided, otherwise create default
         self._refusal_scorer = attack_scoring_config.refusal_scorer or SelfAskRefusalScorer(
@@ -676,7 +677,10 @@ async def _check_refusal_async(self, context: CrescendoAttackContext, objective:
             objective=context.objective,
         ):
             scores = await self._refusal_scorer.score_async(
-                message=context.last_response, objective=objective, skip_on_error_result=False
+                message=context.last_response,
+                objective=objective,
+                skip_on_error_result=False,
+                score_blocked_content=self._score_blocked_content,
             )
             return scores[0]
 
@@ -712,6 +716,7 @@ async def _score_response_async(self, *, context: CrescendoAttackContext) -> Sco
             role_filter="assistant",
             objective=context.objective,
             skip_on_error_result=False,
+            score_blocked_content=self._score_blocked_content,
         )
 
         objective_score = scoring_results["objective_scores"]
diff --git a/pyrit/executor/attack/multi_turn/multi_prompt_sending.py b/pyrit/executor/attack/multi_turn/multi_prompt_sending.py
index 4d7e8fde02..950ce4b93c 100644
--- a/pyrit/executor/attack/multi_turn/multi_prompt_sending.py
+++ b/pyrit/executor/attack/multi_turn/multi_prompt_sending.py
@@ -173,6 +173,7 @@ def __init__(
 
         self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers
         self._objective_scorer = attack_scoring_config.objective_scorer
+        self._score_blocked_content = attack_scoring_config.score_blocked_content
 
         # Initialize prompt normalizer and conversation manager
         self._prompt_normalizer = prompt_normalizer or PromptNormalizer()
@@ -402,6 +403,7 @@ async def _evaluate_response_async(self, *, response: Message, objective: str) -
             role_filter="assistant",
             objective=objective,
             skip_on_error_result=True,
+            score_blocked_content=self._score_blocked_content,
         )
 
         objective_scores = scoring_results["objective_scores"]
diff --git a/pyrit/executor/attack/multi_turn/red_teaming.py b/pyrit/executor/attack/multi_turn/red_teaming.py
index 878cc8e978..f50d729656 100644
--- a/pyrit/executor/attack/multi_turn/red_teaming.py
+++ b/pyrit/executor/attack/multi_turn/red_teaming.py
@@ -143,6 +143,7 @@ def __init__(
         self._objective_scorer = attack_scoring_config.objective_scorer
         self._use_score_as_feedback = attack_scoring_config.use_score_as_feedback
+        self._score_blocked_content = attack_scoring_config.score_blocked_content
 
         # Initialize adversarial configuration
         self._adversarial_chat = attack_adversarial_config.target
@@ -605,6 +606,7 @@ async def _score_response_async(self, *, context: MultiTurnAttackContext[Any]) -
             message=context.last_response,
             role_filter="assistant",
             objective=context.objective,
+            score_blocked_content=self._score_blocked_content,
         )
 
         objective_scores = scoring_results
diff --git a/pyrit/executor/attack/multi_turn/tree_of_attacks.py b/pyrit/executor/attack/multi_turn/tree_of_attacks.py
index e23af1eabf..28b1b91dc2 100644
--- a/pyrit/executor/attack/multi_turn/tree_of_attacks.py
+++ b/pyrit/executor/attack/multi_turn/tree_of_attacks.py
@@ -136,6 +136,7 @@ def __init__(
         refusal_scorer: Optional[TrueFalseScorer] = None,
         auxiliary_scorers: Optional[list[Scorer]] = None,
         use_score_as_feedback: bool = True,
+        score_blocked_content: bool = False,
     ) -> None:
         """
         Initialize TAP scoring configuration.
@@ -147,6 +148,8 @@ def __init__(
             refusal_scorer (Optional[TrueFalseScorer]): Optional scorer for detecting refusals.
             auxiliary_scorers (Optional[List[Scorer]]): Additional scorers for auxiliary metrics.
             use_score_as_feedback (bool): Whether to use scoring results as feedback. Defaults to True.
+            score_blocked_content (bool): Whether to score blocked responses using partial content.
+                Defaults to False.
 
         Raises:
             ValueError: If objective_scorer is not a FloatScaleThresholdScorer or
@@ -168,6 +171,7 @@ def __init__(
         self.refusal_scorer = refusal_scorer
         self.auxiliary_scorers = auxiliary_scorers or []
         self.use_score_as_feedback = use_score_as_feedback
+        self.score_blocked_content = score_blocked_content
 
     @property
     def threshold(self) -> float:
@@ -324,6 +328,7 @@ def __init__(
         prompt_normalizer: Optional[PromptNormalizer] = None,
         initial_prompt: Optional[Message] = None,
         error_score_map: dict[str, float] | None = None,
+        score_blocked_content: bool = False,
     ) -> None:
         """
         Initialize a tree node.
@@ -352,6 +357,8 @@ def __init__(
                 corresponding score is assigned instead of invoking the scorer. This prevents premature
                 branch pruning when targets return blocked/filtered responses. Defaults to
                 {"blocked": 0.0}. Pass an empty dict to disable.
+            score_blocked_content (bool): If True, blocked responses with partial content will be
+                scored using that content. Defaults to False.
""" # Store configuration self._objective_target = objective_target @@ -369,6 +376,7 @@ def __init__( self._attack_strategy_name = attack_strategy_name self._memory_labels = memory_labels or {} self._error_score_map = _validate_error_score_map(error_score_map) + self._score_blocked_content = score_blocked_content # Initialize utilities self._memory = CentralMemory.get_memory_instance() @@ -747,6 +755,7 @@ async def _score_response_async(self, *, response: Message, objective: str) -> N role_filter="assistant", objective=objective, skip_on_error_result=False, + score_blocked_content=self._score_blocked_content, ) # Extract objective score @@ -871,6 +880,7 @@ def duplicate(self) -> "_TreeOfAttacksNode": parent_id=self.node_id, prompt_normalizer=self._prompt_normalizer, error_score_map=self._error_score_map, + score_blocked_content=self._score_blocked_content, ) # Duplicate the conversations to preserve history @@ -1504,11 +1514,13 @@ def __init__( refusal_scorer=attack_scoring_config.refusal_scorer, auxiliary_scorers=attack_scoring_config.auxiliary_scorers or None, use_score_as_feedback=attack_scoring_config.use_score_as_feedback, + score_blocked_content=attack_scoring_config.score_blocked_content, ) self._attack_scoring_config = tap_scoring_config self._auxiliary_scorers = tap_scoring_config.auxiliary_scorers self._objective_scorer = tap_scoring_config.objective_scorer + self._score_blocked_content = tap_scoring_config.score_blocked_content # Use the adversarial chat target for scoring, as in CrescendoAttack self._scoring_target = self._adversarial_chat @@ -2023,6 +2035,7 @@ def _create_attack_node( prompt_normalizer=self._prompt_normalizer, initial_prompt=initial_prompt, error_score_map=self._error_score_map, + score_blocked_content=self._score_blocked_content, ) # Add the adversarial chat conversation ID to the context's tracking (ensuring uniqueness) diff --git a/pyrit/executor/attack/single_turn/prompt_sending.py b/pyrit/executor/attack/single_turn/prompt_sending.py index f1a2d7316e..5c0d2a533d 100644 --- a/pyrit/executor/attack/single_turn/prompt_sending.py +++ b/pyrit/executor/attack/single_turn/prompt_sending.py @@ -102,6 +102,7 @@ def __init__( self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers self._objective_scorer = attack_scoring_config.objective_scorer + self._score_blocked_content = attack_scoring_config.score_blocked_content # Skip criteria could be set directly in the injected prompt normalizer self._prompt_normalizer = prompt_normalizer or PromptNormalizer() @@ -364,6 +365,7 @@ async def _evaluate_response_async( role_filter="assistant", objective=objective, skip_on_error_result=True, + score_blocked_content=self._score_blocked_content, ) if not self._objective_scorer: diff --git a/pyrit/prompt_target/openai/openai_chat_target.py b/pyrit/prompt_target/openai/openai_chat_target.py index 6dfb5f391f..eb9061e586 100644 --- a/pyrit/prompt_target/openai/openai_chat_target.py +++ b/pyrit/prompt_target/openai/openai_chat_target.py @@ -288,6 +288,26 @@ def _check_content_filter(self, response: Any) -> bool: pass return False + def _extract_partial_content(self, response: Any) -> Optional[str]: + """ + Extract partial content from a Chat Completions response with finish_reason=content_filter. + + When Azure Content Safety triggers mid-generation, the model may have produced partial + text in ``response.choices[0].message.content`` before being cut off. + + Args: + response: A ChatCompletion object from the OpenAI SDK. 
+
+        Returns:
+            The partial text content, or None if no content was generated.
+        """
+        try:
+            if response.choices and response.choices[0].message and response.choices[0].message.content:
+                return response.choices[0].message.content
+        except (AttributeError, IndexError):
+            pass
+        return None
+
     def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
         """
         Validate a Chat Completions API response for errors.
diff --git a/pyrit/prompt_target/openai/openai_response_target.py b/pyrit/prompt_target/openai/openai_response_target.py
index dbe71e5406..b4f51dc9b3 100644
--- a/pyrit/prompt_target/openai/openai_response_target.py
+++ b/pyrit/prompt_target/openai/openai_response_target.py
@@ -454,6 +454,34 @@ def _check_content_filter(self, response: Any) -> bool:
             return _is_content_filter_error(response_dict)
         return False
 
+    def _extract_partial_content(self, response: Any) -> Optional[str]:
+        """
+        Extract partial content from a Response API response that was content-filtered.
+
+        The Response API may include partial text in ``response.output`` message sections
+        even when the response has a content filter error.
+
+        Args:
+            response: A Response object from the OpenAI SDK.
+
+        Returns:
+            The partial text content, or None if no content was generated.
+        """
+        try:
+            if not hasattr(response, "output") or not response.output:
+                return None
+            parts: list[str] = []
+            for section in response.output:
+                if getattr(section, "type", None) == MessagePieceType.MESSAGE:
+                    content = getattr(section, "content", None)
+                    if content and len(content) > 0:
+                        text = getattr(content[0], "text", None)
+                        if text:
+                            parts.append(text)
+            return "\n".join(parts) if parts else None
+        except (AttributeError, IndexError, TypeError):
+            return None
+
     def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
         """
         Validate a Response API response for errors.
diff --git a/pyrit/prompt_target/openai/openai_target.py b/pyrit/prompt_target/openai/openai_target.py
index 8058a2b7fd..885f650894 100644
--- a/pyrit/prompt_target/openai/openai_target.py
+++ b/pyrit/prompt_target/openai/openai_target.py
@@ -559,6 +559,10 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece)
         """
         Handle content filter errors by creating a proper error Message.
 
+        If the subclass provides partial content via ``_extract_partial_content``,
+        it is attached to each response piece as ``prompt_metadata["partial_content"]``
+        so that scorers with ``score_blocked_content=True`` can evaluate it.
+
         Args:
             response: The response object from OpenAI SDK.
             request: The original request message piece.
@@ -567,13 +571,37 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece)
             Message object with error type indicating content was filtered.
         """
         logger.warning("Output content filtered by content policy.")
-        return handle_bad_request_exception(
+
+        partial_content = self._extract_partial_content(response)
+
+        error_message = handle_bad_request_exception(
             response_text=response.model_dump_json(),
             request=request,
             error_code=200,
             is_content_filter=True,
         )
 
+        if partial_content:
+            for piece in error_message.message_pieces:
+                piece.prompt_metadata["partial_content"] = partial_content
+
+        return error_message
+
+    def _extract_partial_content(self, response: Any) -> Optional[str]:
+        """
+        Extract any partial content the model generated before the content filter triggered.
+
+        Override this in subclasses to extract partial content from API-specific response
+        structures. The base implementation returns None (no partial content).
+
+        Args:
+            response: The response object from OpenAI SDK.
+
+        Returns:
+            The partial text content, or None if no content was generated.
+        """
+        return None
+
     def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
         """
         Validate the response and return error Message if needed.
diff --git a/pyrit/score/conversation_scorer.py b/pyrit/score/conversation_scorer.py
index c3bcbf4f87..d1dad443bc 100644
--- a/pyrit/score/conversation_scorer.py
+++ b/pyrit/score/conversation_scorer.py
@@ -63,6 +63,14 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non
         # Build the full conversation text
         conversation_text = ""
 
+        # Check if the caller requested scoring of blocked content by inspecting whether the
+        # incoming message was substituted by Scorer.score_async via
+        # _apply_blocked_content_substitution: a substituted piece has partial_content in
+        # metadata but response_error="none".
+        incoming_piece = message.message_pieces[0]
+        use_partial_content = (
+            "partial_content" in incoming_piece.prompt_metadata and incoming_piece.response_error == "none"
+        )
+
         # Goes through each message in the conversation and appends user/assistant messages only
         # Explicitly excludes system, tool, developer messages from being scored/included in conversation history
         # they are allowed in validation but not included in the scored conversation text
@@ -71,7 +79,13 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non
             # Only include user and assistant messages in the conversation text
             if piece.api_role in ["user", "assistant", "tool"]:
                 role_display = "Assistant (simulated)" if piece.is_simulated else piece.api_role.capitalize()
-                conversation_text += f"{role_display}: {piece.converted_value}\n"
+                # For blocked pieces with partial content, use the partial content
+                # instead of the error JSON when score_blocked_content is enabled
+                if use_partial_content and piece.is_blocked() and "partial_content" in piece.prompt_metadata:
+                    text = str(piece.prompt_metadata["partial_content"])
+                else:
+                    text = piece.converted_value
+                conversation_text += f"{role_display}: {text}\n"
 
         # Create a new message with the concatenated conversation text
         # Preserve the original message piece metadata
diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py
index 1a011823fd..5be8fae7d0 100644
--- a/pyrit/score/scorer.py
+++ b/pyrit/score/scorer.py
@@ -163,6 +163,7 @@ async def score_async(
         role_filter: Optional[ChatMessageRole] = None,
         skip_on_error_result: bool = False,
         infer_objective_from_request: bool = False,
+        score_blocked_content: bool = False,
     ) -> list[Score]:
         """
         Score the message, add the results to the database, and return a list of Score objects.
@@ -174,9 +175,14 @@ async def score_async(
             role_filter (Optional[ChatMessageRole]): Only score messages with this exact stored role.
                 Use "assistant" to score only real assistant responses, or "simulated_assistant" to
                 score only simulated responses. Defaults to None (no filtering).
-            skip_on_error_result (bool): If True, skip scoring if the message contains an error. Defaults to False.
+            skip_on_error_result (bool): If True, skip scoring if the message contains an error. When
+                score_blocked_content is also True, blocked responses that carry partial content are
+                scored instead of skipped. Defaults to False.
             infer_objective_from_request (bool): If True, infer the objective from the message's
                 previous request when objective is not provided. Defaults to False.
+            score_blocked_content (bool): If True, blocked responses that contain partial content
+                (in prompt_metadata["partial_content"]) will be scored using that content instead
+                of being filtered out or short-circuited. Defaults to False.
 
         Returns:
             list[Score]: A list of Score objects representing the results.
@@ -192,15 +198,27 @@
             return []
 
         if skip_on_error_result and message.is_error():
-            logger.debug("Skipping scoring due to error in message and skip_on_error=True.")
-            return []
+            # When score_blocked_content is enabled and the message has partial content,
+            # don't skip; the substitution below runs before _score_async is called.
+            has_partial = any(
+                p.prompt_metadata.get("partial_content") for p in message.message_pieces if p.is_blocked()
+            )
+            if not (score_blocked_content and has_partial):
+                logger.debug("Skipping scoring due to error in message and skip_on_error=True.")
+                return []
 
         if infer_objective_from_request and (not objective):
             objective = self._extract_objective_from_response(message)
 
+        # When score_blocked_content is enabled, create a modified message where blocked pieces
+        # with partial content are replaced with text-type substitutes (response_error="none").
+        # This is done here (not in _score_async) so that _score_async's signature remains
+        # (self, message, *, objective=None) — preserving backward compatibility for subclasses.
+        scoring_message = self._apply_blocked_content_substitution(message) if score_blocked_content else message
+
         try:
             scores = await self._score_async(
-                message,
+                scoring_message,
                 objective=objective,
             )
         except PyritException as e:
@@ -253,6 +271,77 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non
     async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Optional[str] = None) -> list[Score]:
         raise NotImplementedError
 
+    @staticmethod
+    def _create_text_piece_from_blocked(piece: MessagePiece) -> Optional[MessagePiece]:
+        """
+        Create a text-typed copy of a blocked MessagePiece using its partial content.
+
+        The substitute preserves the original piece's id (so scores link back correctly),
+        sets converted_value to the partial content with converted_value_data_type="text",
+        and sets response_error="none" so that scorers' blocked-response short-circuits
+        (e.g., the refusal scorer's blocked check) do not fire.
+
+        Args:
+            piece: A blocked MessagePiece with prompt_metadata["partial_content"].
+
+        Returns:
+            MessagePiece with text content, or None if partial content is empty.
+        """
+        partial_content = str(piece.prompt_metadata.get("partial_content", ""))
+        if not partial_content:
+            return None
+
+        return MessagePiece(
+            id=piece.id,
+            role=piece.api_role,
+            original_value=piece.original_value,
+            converted_value=partial_content,
+            original_value_data_type=piece.original_value_data_type,
+            converted_value_data_type="text",
+            conversation_id=piece.conversation_id,
+            sequence=piece.sequence,
+            labels=piece.labels,
+            prompt_metadata=piece.prompt_metadata,
+            converter_identifiers=list(piece.converter_identifiers),  # type: ignore[arg-type]
+            prompt_target_identifier=piece.prompt_target_identifier,
+            attack_identifier=piece.attack_identifier,
+            response_error="none",
+            timestamp=piece.timestamp,
+        )
+
+    def _apply_blocked_content_substitution(self, message: Message) -> Message:
+        """
+        Create a copy of the message where blocked pieces with partial content are substituted.
+
+        Each blocked piece that has prompt_metadata["partial_content"] is replaced with a
+        text-typed copy (response_error="none", converted_value=partial_content). Non-blocked
+        pieces and blocked pieces without partial content are kept as-is.
+
+        This is called in score_async (not _score_async) so that subclass overrides of
+        _score_async do not need to accept the score_blocked_content parameter.
+
+        Args:
+            message: The original message potentially containing blocked pieces.
+
+        Returns:
+            A new Message with substituted pieces, or the original if no substitution was needed.
+        """
+        substituted = False
+        new_pieces: list[MessagePiece] = []
+        for piece in message.message_pieces:
+            if piece.is_blocked() and "partial_content" in piece.prompt_metadata:
+                substitute = self._create_text_piece_from_blocked(piece)
+                if substitute:
+                    new_pieces.append(substitute)
+                    substituted = True
+                    continue
+            new_pieces.append(piece)
+
+        if not substituted:
+            return message
+
+        return Message(message_pieces=new_pieces)
+
     def _get_supported_pieces(self, message: Message) -> list[MessagePiece]:
         """
         Get a list of supported message pieces for this scorer.
@@ -713,6 +802,7 @@ async def score_response_async(
         role_filter: ChatMessageRole = "assistant",
         objective: Optional[str] = None,
         skip_on_error_result: bool = True,
+        score_blocked_content: bool = False,
     ) -> dict[str, list[Score]]:
         """
         Score a response using an objective scorer and optional auxiliary scorers.
@@ -725,6 +815,8 @@ async def score_response_async(
                 Defaults to "assistant" (real responses only, not simulated).
             objective (Optional[str]): Task/objective for scoring context. Defaults to None.
             skip_on_error_result (bool): If True, skip scoring pieces that have errors. Defaults to True.
+            score_blocked_content (bool): If True, blocked responses with partial content will be
+                scored using that content. Defaults to False.
 
         Returns:
             Dict[str, List[Score]]: Dictionary with keys `auxiliary_scores` and `objective_scores`
@@ -747,6 +839,7 @@ async def score_response_async(
                 role_filter=role_filter,
                 objective=objective,
                 skip_on_error_result=skip_on_error_result,
+                score_blocked_content=score_blocked_content,
             )
             result["auxiliary_scores"] = aux_scores
             # objective_scores remains empty
@@ -760,12 +853,14 @@ async def score_response_async(
                 role_filter=role_filter,
                 objective=objective,
                 skip_on_error_result=skip_on_error_result,
+                score_blocked_content=score_blocked_content,
             )
             obj_task = objective_scorer.score_async(
                 message=response,
                 objective=objective,
                 skip_on_error_result=skip_on_error_result,
                 role_filter=role_filter,
+                score_blocked_content=score_blocked_content,
             )
             aux_scores, obj_scores = await asyncio.gather(aux_task, obj_task)
             result["auxiliary_scores"] = aux_scores
@@ -776,6 +871,7 @@ async def score_response_async(
                 objective=objective,
                 skip_on_error_result=skip_on_error_result,
                 role_filter=role_filter,
+                score_blocked_content=score_blocked_content,
             )
             result["objective_scores"] = obj_scores
         return result
@@ -788,6 +884,7 @@ async def score_response_multiple_scorers_async(
         role_filter: ChatMessageRole = "assistant",
         objective: Optional[str] = None,
         skip_on_error_result: bool = True,
+        score_blocked_content: bool = False,
     ) -> list[Score]:
         """
         Score a response using multiple scorers in parallel.
@@ -802,6 +899,8 @@ async def score_response_multiple_scorers_async(
                 Defaults to "assistant" (real responses only, not simulated).
             objective (Optional[str]): Optional objective description for scoring context.
             skip_on_error_result (bool): If True, skip scoring pieces that have errors (default: True).
+            score_blocked_content (bool): If True, blocked responses with partial content will be
+                scored using that content. Defaults to False.
 
         Returns:
             List[Score]: All scores from all scorers
@@ -816,6 +915,7 @@ async def score_response_multiple_scorers_async(
                 objective=objective,
                 role_filter=role_filter,
                 skip_on_error_result=skip_on_error_result,
+                score_blocked_content=score_blocked_content,
             )
             for scorer in scorers
         ]
diff --git a/tests/unit/executor/attack/core/test_attack_config.py b/tests/unit/executor/attack/core/test_attack_config.py
index bc3a822f67..8355b16cf6 100644
--- a/tests/unit/executor/attack/core/test_attack_config.py
+++ b/tests/unit/executor/attack/core/test_attack_config.py
@@ -76,3 +76,20 @@ def test_init_with_use_score_as_feedback_false(self):
         config = AttackScoringConfig(use_score_as_feedback=False)
 
         assert config.use_score_as_feedback is False
+
+    def test_score_blocked_content_default_is_false(self):
+        """Test that score_blocked_content defaults to False."""
+        config = AttackScoringConfig()
+        assert config.score_blocked_content is False
+
+    def test_score_blocked_content_can_set_to_true(self):
+        """Test that score_blocked_content can be set to True."""
+        config = AttackScoringConfig(score_blocked_content=True)
+        assert config.score_blocked_content is True
+
+    def test_score_blocked_content_with_valid_scorers(self):
+        """Test that score_blocked_content works with valid scorers."""
+        mock_scorer = MagicMock(spec=TrueFalseScorer)
+        config = AttackScoringConfig(objective_scorer=mock_scorer, score_blocked_content=True)
+        assert config.score_blocked_content is True
+        assert config.objective_scorer is mock_scorer
diff --git a/tests/unit/executor/attack/single_turn/test_prompt_sending.py b/tests/unit/executor/attack/single_turn/test_prompt_sending.py
index 132e98b1a9..00b1d245a0 100644
--- a/tests/unit/executor/attack/single_turn/test_prompt_sending.py
+++ b/tests/unit/executor/attack/single_turn/test_prompt_sending.py
@@ -458,6 +458,7 @@ async def test_evaluate_response_with_objective_scorer_returns_score(
             role_filter="assistant",
             objective="Test objective",
             skip_on_error_result=True,
+            score_blocked_content=False,
         )
 
     async def test_evaluate_response_without_objective_scorer_returns_none(self, mock_target, sample_response):
@@ -480,6 +481,7 @@ async def test_evaluate_response_without_objective_scorer_returns_none(self, moc
             role_filter="assistant",
             objective="Test objective",
             skip_on_error_result=True,
+            score_blocked_content=False,
        )
 
     async def test_evaluate_response_with_auxiliary_scorers(
@@ -522,6 +524,7 @@ async def test_evaluate_response_with_auxiliary_scorers(
             role_filter="assistant",
             objective="Test objective",
             skip_on_error_result=True,
+            score_blocked_content=False,
         )
 
diff --git a/tests/unit/prompt_target/target/test_openai_chat_target.py b/tests/unit/prompt_target/target/test_openai_chat_target.py
index 59395a270f..a1796f5304 100644
--- a/tests/unit/prompt_target/target/test_openai_chat_target.py
+++ b/tests/unit/prompt_target/target/test_openai_chat_target.py
@@ -1596,6 +1596,62 @@ async def test_save_audio_response_async_pcm16_format(patch_central_database):
     assert result == "/path/to/saved/audio.wav"
 
 
+# ── _extract_partial_content tests ──────────────────────────────────────────
+
+
+class TestExtractPartialContentChatTarget:
+    def test_extracts_partial_content_from_content_filter_response(self, target: OpenAIChatTarget):
+        mock_response = create_mock_completion(
+            content="Partial harmful content before cutoff", finish_reason="content_filter"
+        )
+        result = target._extract_partial_content(mock_response)
+        assert result == "Partial harmful content before cutoff"
+
+    def test_returns_none_when_no_content(self, target: OpenAIChatTarget):
+        mock_response = create_mock_completion(content=None, finish_reason="content_filter")
+        result = target._extract_partial_content(mock_response)
+        assert result is None
+
+    def test_returns_none_when_empty_content(self, target: OpenAIChatTarget):
+        mock_response = create_mock_completion(content="", finish_reason="content_filter")
+        result = target._extract_partial_content(mock_response)
+        assert result is None
+
+    def test_returns_none_when_no_choices(self, target: OpenAIChatTarget):
+        mock_response = MagicMock(spec=ChatCompletion)
+        mock_response.choices = []
+        result = target._extract_partial_content(mock_response)
+        assert result is None
+
+
+class TestContentFilterPreservesPartialContent:
+    async def test_200_content_filter_attaches_partial_content_metadata(self, target: OpenAIChatTarget):
+        """Integration: 200 + content_filter response preserves partial content in metadata."""
+        message = Message(
+            message_pieces=[MessagePiece(role="user", conversation_id="test-convo", original_value="test prompt")]
+        )
+        mock_completion = create_mock_completion(content="Harmful partial content here", finish_reason="content_filter")
+        target._async_client.chat.completions.create = AsyncMock(return_value=mock_completion)  # type: ignore[method-assign]
+
+        response = await target.send_prompt_async(message=message)
+
+        assert response[0].message_pieces[0].response_error == "blocked"
+        assert response[0].message_pieces[0].prompt_metadata["partial_content"] == "Harmful partial content here"
+
+    async def test_200_content_filter_no_metadata_when_no_content(self, target: OpenAIChatTarget):
+        """200 + content_filter with no content doesn't attach metadata."""
+        message = Message(
+            message_pieces=[MessagePiece(role="user", conversation_id="test-convo", original_value="test prompt")]
+        )
+        mock_completion = create_mock_completion(content=None, finish_reason="content_filter")
+        target._async_client.chat.completions.create = AsyncMock(return_value=mock_completion)  # type: ignore[method-assign]
+
+        response = await target.send_prompt_async(message=message)
+
+        assert response[0].message_pieces[0].response_error == "blocked"
+        assert "partial_content" not in response[0].message_pieces[0].prompt_metadata
+
+
 async def test_save_audio_response_async_flac_format(patch_central_database):
     """Test saving audio response with flac format."""
     audio_config = OpenAIChatAudioConfig(voice="alloy", audio_format="flac")
diff --git a/tests/unit/prompt_target/target/test_openai_response_target.py b/tests/unit/prompt_target/target/test_openai_response_target.py
index 95f6e238f8..0dfb02cf18 100644
--- a/tests/unit/prompt_target/target/test_openai_response_target.py
+++ b/tests/unit/prompt_target/target/test_openai_response_target.py
@@ -1043,6 +1043,48 @@ def test_invalid_top_p_raises(patch_central_database):
 
 
 # Unit tests for override methods
+class TestExtractPartialContentResponseTarget:
+    def test_extracts_text_from_message_sections(self):
+        from pyrit.prompt_target.openai.openai_response_target import MessagePieceType
+
+        target = OpenAIResponseTarget(model_name="gpt-4", endpoint="https://test.com", api_key="test")
+
+        section = MagicMock()
+        section.type = MessagePieceType.MESSAGE
+        content_item = MagicMock()
+        content_item.text = "Partial response text"
+        section.content = [content_item]
+
+        mock_response = MagicMock()
+        mock_response.output = [section]
+
+        result = target._extract_partial_content(mock_response)
+        assert result == "Partial response text"
+
+    def test_returns_none_when_no_output(self):
+        target = OpenAIResponseTarget(model_name="gpt-4", endpoint="https://test.com", api_key="test")
+
+        mock_response = MagicMock()
+        mock_response.output = []
+
+        result = target._extract_partial_content(mock_response)
+        assert result is None
+
+    def test_ignores_non_message_sections(self):
+        from pyrit.prompt_target.openai.openai_response_target import MessagePieceType
+
+        target = OpenAIResponseTarget(model_name="gpt-4", endpoint="https://test.com", api_key="test")
+
+        section = MagicMock()
+        section.type = MessagePieceType.REASONING
+
+        mock_response = MagicMock()
+        mock_response.output = [section]
+
+        result = target._extract_partial_content(mock_response)
+        assert result is None
+
+
 def test_check_content_filter_detects_filtered_response(target: OpenAIResponseTarget):
     """Test _check_content_filter detects content_filter error code."""
     mock_response = MagicMock()
diff --git a/tests/unit/score/test_scorer.py b/tests/unit/score/test_scorer.py
index a35fbe3cb1..8e5335bd52 100644
--- a/tests/unit/score/test_scorer.py
+++ b/tests/unit/score/test_scorer.py
@@ -587,10 +587,18 @@ async def test_score_response_async_parallel_execution():
     assert score1_1 in result["auxiliary_scores"]
     assert score2_1 in result["auxiliary_scores"]
     scorer1.score_async.assert_any_call(
-        message=response, objective="test task", role_filter="assistant", skip_on_error_result=True
+        message=response,
+        objective="test task",
+        role_filter="assistant",
+        skip_on_error_result=True,
+        score_blocked_content=False,
     )
     scorer2.score_async.assert_any_call(
-        message=response, objective="test task", role_filter="assistant", skip_on_error_result=True
+        message=response,
+        objective="test task",
+        role_filter="assistant",
+        skip_on_error_result=True,
+        score_blocked_content=False,
     )
 
@@ -1465,3 +1473,336 @@ async def test_score_value_with_llm_skips_reasoning_piece(good_json):
 
     assert result.raw_score_value == "1"
     assert result.score_rationale == "Valid response"
+
+
+# ── Helpers for score_blocked_content tests ──────────────────────────────────
+
+
+class _AcceptAllValidator(ScorerPromptValidator):
+    """Validator that accepts all pieces (like SelfAskRefusalScorer's default)."""
+
+    def validate(self, message: Message, objective: Optional[str] = None) -> None:
+        pass
+
+    def is_message_piece_supported(self, message_piece: MessagePiece) -> bool:
+        return True
+
+
+class _TextOnlyValidator(ScorerPromptValidator):
+    """Validator that only accepts text pieces (like SelfAskTrueFalseScorer's default)."""
+
+    def __init__(self) -> None:
+        super().__init__(supported_data_types=["text", "image_path"])
+
+    def validate(self, message: Message, objective: Optional[str] = None) -> None:
+        pass
+
+
+class _BlockedContentScorer(TrueFalseScorer):
+    """A mock TrueFalseScorer that records what pieces it was asked to score."""
+
+    def __init__(self, *, validator: Optional[ScorerPromptValidator] = None) -> None:
+        super().__init__(validator=validator or _TextOnlyValidator())
+        self.scored_pieces: list[MessagePiece] = []
+
+    def _build_identifier(self) -> ComponentIdentifier:
+        return self._create_identifier()
+
+    async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Optional[str] = None) -> list[Score]:
+        self.scored_pieces.append(message_piece)
+        return [
+            Score(
+                score_value="true",
+                score_value_description="desc",
+                score_type="true_false",
+                score_category=None,
+                score_metadata=None,
+                score_rationale="rationale",
+                scorer_class_identifier=self.get_identifier(),
+                message_piece_id=str(message_piece.id),
+                objective=objective,
+            )
+        ]
+
+
+class _MockRefusalScorer(TrueFalseScorer):
+    """Mimics SelfAskRefusalScorer: accepts all types, short-circuits on blocked."""
+
+    def __init__(self) -> None:
+        super().__init__(validator=_AcceptAllValidator())
+        self.scored_pieces: list[MessagePiece] = []
+
+    def _build_identifier(self) -> ComponentIdentifier:
+        return self._create_identifier()
+
+    async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Optional[str] = None) -> list[Score]:
+        self.scored_pieces.append(message_piece)
+        if message_piece.response_error == "blocked":
+            return [
+                Score(
+                    score_value="true",
+                    score_value_description="Refusal detected",
+                    score_type="true_false",
+                    score_category=None,
+                    score_metadata=None,
+                    score_rationale="Content was filtered, constituting a refusal.",
+                    scorer_class_identifier=self.get_identifier(),
+                    message_piece_id=str(message_piece.id),
+                    objective=objective,
+                )
+            ]
+        return [
+            Score(
+                score_value="false",
+                score_value_description="Not a refusal",
+                score_type="true_false",
+                score_category=None,
+                score_metadata=None,
+                score_rationale="The response contains substantive content.",
+                scorer_class_identifier=self.get_identifier(),
+                message_piece_id=str(message_piece.id),
+                objective=objective,
+            )
+        ]
+
+
+def _make_blocked_piece(*, partial_content: Optional[str] = None, conversation_id: str = "test-convo") -> MessagePiece:
+    """Create a blocked MessagePiece, optionally with partial content metadata."""
+    metadata: dict = {}
+    if partial_content is not None:
+        metadata["partial_content"] = partial_content
+    return MessagePiece(
+        role="assistant",
+        original_value='{"status_code": 200, "message": "content_filter"}',
+        converted_value='{"status_code": 200, "message": "content_filter"}',
+        original_value_data_type="error",
+        converted_value_data_type="error",
+        conversation_id=conversation_id,
+        response_error="blocked",
+        prompt_metadata=metadata,
+    )
+
+
+def _make_normal_piece(*, conversation_id: str = "test-convo") -> MessagePiece:
+    """Create a normal text MessagePiece."""
+    return MessagePiece(
+        role="assistant",
+        original_value="Hello, how can I help?",
+        conversation_id=conversation_id,
+    )
+
+
+# ── _create_text_piece_from_blocked tests ────────────────────────────────────
+
+
+class TestCreateTextPieceFromBlocked:
+    def test_returns_text_piece_with_partial_content(self):
+        piece = _make_blocked_piece(partial_content="Harmful partial text here")
+        substitute = Scorer._create_text_piece_from_blocked(piece)
+
+        assert substitute is not None
+        assert substitute.converted_value == "Harmful partial text here"
+        assert substitute.converted_value_data_type == "text"
+        assert substitute.response_error == "none"
+        assert substitute.id == piece.id
+
+    def test_preserves_original_value(self):
+        piece = _make_blocked_piece(partial_content="partial")
+        substitute = Scorer._create_text_piece_from_blocked(piece)
+
+        assert substitute is not None
+        assert substitute.original_value == piece.original_value
+        assert substitute.original_value_data_type == piece.original_value_data_type
+
+    def test_returns_none_when_no_partial_content(self):
+        piece = _make_blocked_piece()
+        assert Scorer._create_text_piece_from_blocked(piece) is None
+
+    def test_returns_none_when_empty_partial_content(self):
+        piece = _make_blocked_piece(partial_content="")
+        assert Scorer._create_text_piece_from_blocked(piece) is None
+
+    def test_preserves_conversation_id(self):
+        piece = _make_blocked_piece(partial_content="partial")
+        substitute = Scorer._create_text_piece_from_blocked(piece)
+        assert substitute is not None
+        assert substitute.conversation_id == piece.conversation_id
+
+    def test_response_error_is_none_not_blocked(self):
+        """Substitute must have response_error='none' so refusal short-circuits don't fire."""
+        piece = _make_blocked_piece(partial_content="partial text")
+        substitute = Scorer._create_text_piece_from_blocked(piece)
+        assert substitute is not None
+        assert substitute.response_error == "none"
+        assert not substitute.is_blocked()
+        assert not substitute.has_error()
+
+
+# ── score_async with score_blocked_content tests ─────────────────────────────
+
+
+@pytest.mark.usefixtures("patch_central_database")
+class TestScoreAsyncWithBlockedContent:
+    async def test_default_false_skips_blocked_piece_text_only_scorer(self):
+        """Default behavior: text-only scorer filters out blocked error-type pieces."""
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        scores = await scorer.score_async(msg, score_blocked_content=False)
+
+        assert len(scores) == 1
+        assert scores[0].score_value == "false"
+        assert len(scorer.scored_pieces) == 0
+
+    async def test_true_substitutes_blocked_piece_for_text_only_scorer(self):
+        """With flag on, text-only scorer gets a text substitute and scores it."""
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        scores = await scorer.score_async(msg, score_blocked_content=True)
+
+        assert len(scores) == 1
+        assert scores[0].score_value == "true"
+        assert len(scorer.scored_pieces) == 1
+        assert scorer.scored_pieces[0].converted_value == "harmful text"
+        assert scorer.scored_pieces[0].converted_value_data_type == "text"
+
+    async def test_refusal_scorer_short_circuits_on_blocked_by_default(self):
+        """Refusal scorer (accepts all types) sees original blocked piece, returns True."""
+        scorer = _MockRefusalScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        scores = await scorer.score_async(msg, score_blocked_content=False)
+
+        assert len(scores) == 1
+        assert scores[0].score_value == "true"
+        assert scorer.scored_pieces[0].response_error == "blocked"
+
+    async def test_refusal_scorer_evaluates_partial_content_when_flag_on(self):
+        """With flag on, refusal scorer gets substitute (response_error=none), evaluates via LLM path."""
+        scorer = _MockRefusalScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        scores = await scorer.score_async(msg, score_blocked_content=True)
+
+        assert len(scores) == 1
+        assert scores[0].score_value == "false"
+        assert scorer.scored_pieces[0].response_error == "none"
+        assert scorer.scored_pieces[0].converted_value == "harmful text"
+
+    async def test_no_substitute_when_no_partial_content(self):
+        """400 full block with no partial content: no substitute, same behavior."""
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece()])
+
+        scores = await scorer.score_async(msg, score_blocked_content=True)
+
+        assert len(scores) == 1
+        assert scores[0].score_value == "false"
+        assert len(scorer.scored_pieces) == 0
+
+    async def test_normal_piece_unaffected_by_flag(self):
+        """Normal text pieces are scored the same regardless of flag."""
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_normal_piece()])
+
+        scores_off = await scorer.score_async(msg, score_blocked_content=False)
+        scorer.scored_pieces.clear()
+        scores_on = await scorer.score_async(msg, score_blocked_content=True)
+
+        assert scores_off[0].score_value == scores_on[0].score_value
+
+    async def test_mixed_pieces_only_blocked_substituted(self):
+        """In a multi-piece message, only blocked pieces get substituted."""
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_normal_piece(), _make_blocked_piece(partial_content="partial harmful")])
+
+        scores = await scorer.score_async(msg, score_blocked_content=True)
+
+        assert len(scores) == 1  # TrueFalseScorer aggregates
+        assert len(scorer.scored_pieces) == 2
+        assert scorer.scored_pieces[0].converted_value == "Hello, how can I help?"
+        assert scorer.scored_pieces[1].converted_value == "partial harmful"
+        assert scorer.scored_pieces[1].response_error == "none"
+
+
+# ── skip_on_error_result interaction tests ───────────────────────────────────
+
+
+@pytest.mark.usefixtures("patch_central_database")
+class TestSkipOnErrorWithBlockedContent:
+    async def test_skip_on_error_true_without_flag_skips_blocked(self):
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        scores = await scorer.score_async(msg, skip_on_error_result=True, score_blocked_content=False)
+        assert scores == []
+
+    async def test_skip_on_error_true_with_flag_does_not_skip_when_partial_content(self):
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        scores = await scorer.score_async(msg, skip_on_error_result=True, score_blocked_content=True)
+        assert len(scores) == 1
+        assert scores[0].score_value == "true"
+
+    async def test_skip_on_error_true_with_flag_still_skips_when_no_partial_content(self):
+        scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece()])
+
+        scores = await scorer.score_async(msg, skip_on_error_result=True, score_blocked_content=True)
+        assert scores == []
+
+
+# ── score_response_async passthrough tests ───────────────────────────────────
+
+
+@pytest.mark.usefixtures("patch_central_database")
+class TestScoreResponseAsyncBlockedContent:
+    async def test_score_response_async_passes_flag_to_scorers(self):
+        obj_scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        result = await Scorer.score_response_async(
+            response=msg,
+            objective_scorer=obj_scorer,
+            objective="test",
+            skip_on_error_result=False,
+            score_blocked_content=True,
+        )
+
+        assert len(result["objective_scores"]) == 1
+        assert result["objective_scores"][0].score_value == "true"
+        assert obj_scorer.scored_pieces[0].converted_value == "harmful text"
+
+    async def test_score_response_async_default_does_not_substitute(self):
+        obj_scorer = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        result = await Scorer.score_response_async(
+            response=msg,
+            objective_scorer=obj_scorer,
+            objective="test",
+            skip_on_error_result=False,
+            score_blocked_content=False,
+        )
+
+        assert result["objective_scores"][0].score_value == "false"
+        assert len(obj_scorer.scored_pieces) == 0
+
+    async def test_score_response_multiple_scorers_passes_flag(self):
+        scorer1 = _BlockedContentScorer()
+        scorer2 = _BlockedContentScorer()
+        msg = Message(message_pieces=[_make_blocked_piece(partial_content="harmful text")])
+
+        scores = await Scorer.score_response_multiple_scorers_async(
+            response=msg,
+            scorers=[scorer1, scorer2],
+            objective="test",
+            skip_on_error_result=False,
+            score_blocked_content=True,
+        )
+
+        assert len(scores) == 2
+        assert len(scorer1.scored_pieces) == 1
+        assert len(scorer2.scored_pieces) == 1
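
Usage sketch (reviewer note, not part of the patch): how the new flag changes scoring of a single blocked response. The import paths are assumptions based on the modules touched above, and `scorer` stands in for any concrete Scorer; the piece below is shaped like the error pieces `OpenAITarget._handle_content_filter_response` emits.

from pyrit.models import Message, MessagePiece  # import path assumed
from pyrit.score import Scorer  # import path assumed


async def score_partial_output(scorer: Scorer) -> None:
    # A blocked piece as _handle_content_filter_response emits it: response_error="blocked",
    # with the truncated model text attached via prompt_metadata["partial_content"] by the
    # subclass's _extract_partial_content override.
    blocked = MessagePiece(
        role="assistant",
        original_value='{"status_code": 200, "message": "content_filter"}',
        original_value_data_type="error",
        conversation_id="demo-convo",
        response_error="blocked",
        prompt_metadata={"partial_content": "Sure, the first step is..."},
    )
    message = Message(message_pieces=[blocked])

    # With the flag off (the default), error-aware scorers skip or auto-classify this piece.
    # With it on, score_async swaps in a text-typed copy (response_error="none") before
    # _score_async runs, so the partial text itself is evaluated.
    scores = await scorer.score_async(
        message=message,
        objective="<attack objective>",
        score_blocked_content=True,
    )
    print([score.score_value for score in scores])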
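
At the attack level the flag is set once on AttackScoringConfig and, per the executor changes above, forwarded into every score_async / score_response_async call. A minimal sketch; the objective scorer is a hypothetical placeholder built elsewhere:

from pyrit.executor.attack.core.attack_config import AttackScoringConfig

scoring_config = AttackScoringConfig(
    objective_scorer=my_objective_scorer,  # hypothetical scorer instance
    score_blocked_content=True,  # new flag from this patch; defaults to False
)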
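
Targets surface partial content by overriding `OpenAITarget._extract_partial_content`, which returns None in the base class; the Chat Completions and Response API targets above are the two real overrides. A hedged sketch of a custom override, with a hypothetical response field:

from typing import Any, Optional

from pyrit.prompt_target import OpenAIChatTarget  # import path assumed


class MyFilteredTarget(OpenAIChatTarget):
    def _extract_partial_content(self, response: Any) -> Optional[str]:
        # "truncated_text" is a hypothetical attribute used for illustration;
        # a real override reads its API's actual payload shape.
        try:
            return response.truncated_text or None
        except AttributeError:
            return None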