microsoft · jsong468 · May 4, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/pyrit/executor/attack/core/attack_config.py b/pyrit/executor/attack/core/attack_config.py
@@ -53,6 +53,12 @@ class AttackScoringConfig:
     # Whether to use scoring results as feedback for iterative attacks
     use_score_as_feedback: bool = True
 
+    # Whether to score blocked responses using partial content from prompt_metadata["partial_content"].
+    # When True, blocked responses that contain partial model output (e.g., from Azure Content Safety
+    # triggering mid-generation) will be evaluated by scorers instead of being skipped or
+    # auto-classified as failures/refusals.
+    score_blocked_content: bool = False
+
     def __post_init__(self) -> None:
         """
         Validate configuration values.

diff --git a/pyrit/executor/attack/multi_turn/crescendo.py b/pyrit/executor/attack/multi_turn/crescendo.py
@@ -203,6 +203,7 @@ def __init__(
         self._objective_scorer = objective_scorer
         self._use_score_as_feedback = attack_scoring_config.use_score_as_feedback
         self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers
+        self._score_blocked_content = attack_scoring_config.score_blocked_content
 
         # Initialize refusal scorer - use the one from config if provided, otherwise create default
         self._refusal_scorer = attack_scoring_config.refusal_scorer or SelfAskRefusalScorer(
@@ -676,7 +677,10 @@ async def _check_refusal_async(self, context: CrescendoAttackContext, objective:
             objective=context.objective,
         ):
             scores = await self._refusal_scorer.score_async(
-                message=context.last_response, objective=objective, skip_on_error_result=False
+                message=context.last_response,
+                objective=objective,
+                skip_on_error_result=False,
+                score_blocked_content=self._score_blocked_content,
             )
         return scores[0]
 
@@ -712,6 +716,7 @@ async def _score_response_async(self, *, context: CrescendoAttackContext) -> Sco
                 role_filter="assistant",
                 objective=context.objective,
                 skip_on_error_result=False,
+                score_blocked_content=self._score_blocked_content,
             )
 
         objective_score = scoring_results["objective_scores"]

diff --git a/pyrit/executor/attack/multi_turn/multi_prompt_sending.py b/pyrit/executor/attack/multi_turn/multi_prompt_sending.py
@@ -173,6 +173,7 @@ def __init__(
 
         self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers
         self._objective_scorer = attack_scoring_config.objective_scorer
+        self._score_blocked_content = attack_scoring_config.score_blocked_content
 
         # Initialize prompt normalizer and conversation manager
         self._prompt_normalizer = prompt_normalizer or PromptNormalizer()
@@ -402,6 +403,7 @@ async def _evaluate_response_async(self, *, response: Message, objective: str) -
                 role_filter="assistant",
                 objective=objective,
                 skip_on_error_result=True,
+                score_blocked_content=self._score_blocked_content,
             )
 
         objective_scores = scoring_results["objective_scores"]

diff --git a/pyrit/executor/attack/multi_turn/red_teaming.py b/pyrit/executor/attack/multi_turn/red_teaming.py
@@ -143,6 +143,7 @@ def __init__(
 
         self._objective_scorer = attack_scoring_config.objective_scorer
         self._use_score_as_feedback = attack_scoring_config.use_score_as_feedback
+        self._score_blocked_content = attack_scoring_config.score_blocked_content
 
         # Initialize adversarial configuration
         self._adversarial_chat = attack_adversarial_config.target
@@ -605,6 +606,7 @@ async def _score_response_async(self, *, context: MultiTurnAttackContext[Any]) -
                 message=context.last_response,
                 role_filter="assistant",
                 objective=context.objective,
+                score_blocked_content=self._score_blocked_content,
             )
 
         objective_scores = scoring_results

diff --git a/pyrit/executor/attack/multi_turn/tree_of_attacks.py b/pyrit/executor/attack/multi_turn/tree_of_attacks.py
@@ -136,6 +136,7 @@ def __init__(
         refusal_scorer: Optional[TrueFalseScorer] = None,
         auxiliary_scorers: Optional[list[Scorer]] = None,
         use_score_as_feedback: bool = True,
+        score_blocked_content: bool = False,
     ) -> None:
         """
         Initialize TAP scoring configuration.
@@ -147,6 +148,8 @@ def __init__(
             refusal_scorer (Optional[TrueFalseScorer]): Optional scorer for detecting refusals.
             auxiliary_scorers (Optional[List[Scorer]]): Additional scorers for auxiliary metrics.
             use_score_as_feedback (bool): Whether to use scoring results as feedback. Defaults to True.
+            score_blocked_content (bool): Whether to score blocked responses using partial content.
+                Defaults to False.
 
         Raises:
             ValueError: If objective_scorer is not a FloatScaleThresholdScorer or
@@ -168,6 +171,7 @@ def __init__(
         self.refusal_scorer = refusal_scorer
         self.auxiliary_scorers = auxiliary_scorers or []
         self.use_score_as_feedback = use_score_as_feedback
+        self.score_blocked_content = score_blocked_content
 
     @property
     def threshold(self) -> float:
@@ -324,6 +328,7 @@ def __init__(
         prompt_normalizer: Optional[PromptNormalizer] = None,
         initial_prompt: Optional[Message] = None,
         error_score_map: dict[str, float] | None = None,
+        score_blocked_content: bool = False,
     ) -> None:
         """
         Initialize a tree node.
@@ -352,6 +357,8 @@ def __init__(
                 corresponding score is assigned instead of invoking the scorer. This prevents
                 premature branch pruning when targets return blocked/filtered responses.
                 Defaults to {"blocked": 0.0}. Pass an empty dict to disable.
+            score_blocked_content (bool): If True, blocked responses with partial content will be
+                scored using that content. Defaults to False.
         """
         # Store configuration
         self._objective_target = objective_target
@@ -369,6 +376,7 @@ def __init__(
         self._attack_strategy_name = attack_strategy_name
         self._memory_labels = memory_labels or {}
         self._error_score_map = _validate_error_score_map(error_score_map)
+        self._score_blocked_content = score_blocked_content
 
         # Initialize utilities
         self._memory = CentralMemory.get_memory_instance()
@@ -747,6 +755,7 @@ async def _score_response_async(self, *, response: Message, objective: str) -> N
                 role_filter="assistant",
                 objective=objective,
                 skip_on_error_result=False,
+                score_blocked_content=self._score_blocked_content,
             )
 
         # Extract objective score
@@ -871,6 +880,7 @@ def duplicate(self) -> "_TreeOfAttacksNode":
             parent_id=self.node_id,
             prompt_normalizer=self._prompt_normalizer,
             error_score_map=self._error_score_map,
+            score_blocked_content=self._score_blocked_content,
         )
 
         # Duplicate the conversations to preserve history
@@ -1504,11 +1514,13 @@ def __init__(
                 refusal_scorer=attack_scoring_config.refusal_scorer,
                 auxiliary_scorers=attack_scoring_config.auxiliary_scorers or None,
                 use_score_as_feedback=attack_scoring_config.use_score_as_feedback,
+                score_blocked_content=attack_scoring_config.score_blocked_content,
             )
 
         self._attack_scoring_config = tap_scoring_config
         self._auxiliary_scorers = tap_scoring_config.auxiliary_scorers
         self._objective_scorer = tap_scoring_config.objective_scorer
+        self._score_blocked_content = tap_scoring_config.score_blocked_content
 
         # Use the adversarial chat target for scoring, as in CrescendoAttack
         self._scoring_target = self._adversarial_chat
@@ -2023,6 +2035,7 @@ def _create_attack_node(
             prompt_normalizer=self._prompt_normalizer,
             initial_prompt=initial_prompt,
             error_score_map=self._error_score_map,
+            score_blocked_content=self._score_blocked_content,
         )
 
         # Add the adversarial chat conversation ID to the context's tracking (ensuring uniqueness)

diff --git a/pyrit/executor/attack/single_turn/prompt_sending.py b/pyrit/executor/attack/single_turn/prompt_sending.py
@@ -102,6 +102,7 @@ def __init__(
 
         self._auxiliary_scorers = attack_scoring_config.auxiliary_scorers
         self._objective_scorer = attack_scoring_config.objective_scorer
+        self._score_blocked_content = attack_scoring_config.score_blocked_content
 
         # Skip criteria could be set directly in the injected prompt normalizer
         self._prompt_normalizer = prompt_normalizer or PromptNormalizer()
@@ -364,6 +365,7 @@ async def _evaluate_response_async(
                 role_filter="assistant",
                 objective=objective,
                 skip_on_error_result=True,
+                score_blocked_content=self._score_blocked_content,
             )
 
         if not self._objective_scorer:

diff --git a/pyrit/prompt_target/openai/openai_chat_target.py b/pyrit/prompt_target/openai/openai_chat_target.py
@@ -288,6 +288,26 @@ def _check_content_filter(self, response: Any) -> bool:
             pass
         return False
 
+    def _extract_partial_content(self, response: Any) -> Optional[str]:
+        """
+        Return the text a content-filtered Chat Completions response produced before cutoff.
+
+        Only the first choice is inspected: ``response.choices[0].message.content``.
+
+        Args:
+            response: A ChatCompletion object from the OpenAI SDK.
+
+        Returns:
+            The first choice's non-empty message content, or None when there is none.
+        """
+        try:
+            choices = response.choices
+            first_message = choices[0].message if choices else None
+            partial_text = (first_message.content or None) if first_message else None
+        except (AttributeError, IndexError):
+            return None
+        return partial_text
+
     def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
         """
         Validate a Chat Completions API response for errors.

diff --git a/pyrit/prompt_target/openai/openai_response_target.py b/pyrit/prompt_target/openai/openai_response_target.py
@@ -454,6 +454,34 @@ def _check_content_filter(self, response: Any) -> bool:
             return _is_content_filter_error(response_dict)
         return False
 
+    def _extract_partial_content(self, response: Any) -> Optional[str]:
+        """
+        Extract partial content from a Response API response that was content-filtered.
+
+        The Response API may include partial text in ``response.output`` message sections
+        even when the response has a content filter error. Text is gathered from every
+        content part of every message section, not only the first part of each section.
+
+        Args:
+            response: A Response object from the OpenAI SDK.
+
+        Returns:
+            The partial text parts joined by newlines, or None if no text was found.
+        """
+        try:
+            parts: list[str] = []
+            for section in getattr(response, "output", None) or []:
+                if getattr(section, "type", None) != MessagePieceType.MESSAGE:
+                    continue
+                # ``content`` is a list: read text from all parts rather than only content[0].
+                for item in getattr(section, "content", None) or []:
+                    text = getattr(item, "text", None)
+                    if text:
+                        parts.append(text)
+            return "\n".join(parts) if parts else None
+        except (AttributeError, IndexError, TypeError):
+            return None
+
     def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
         """
         Validate a Response API response for errors.

diff --git a/pyrit/prompt_target/openai/openai_target.py b/pyrit/prompt_target/openai/openai_target.py
@@ -559,6 +559,10 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece)
         """
         Handle content filter errors by creating a proper error Message.
 
+        If the subclass provides partial content via ``_extract_partial_content``,
+        it is attached to each response piece as ``prompt_metadata["partial_content"]``
+        so that scorers with ``score_blocked_content=True`` can evaluate it.
+
         Args:
             response: The response object from OpenAI SDK.
             request: The original request message piece.
@@ -567,13 +571,37 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece)
             Message object with error type indicating content was filtered.
         """
         logger.warning("Output content filtered by content policy.")
-        return handle_bad_request_exception(
+
+        partial_content = self._extract_partial_content(response)
+
+        error_message = handle_bad_request_exception(
             response_text=response.model_dump_json(),
             request=request,
             error_code=200,
             is_content_filter=True,
         )
 
+        if partial_content:
+            for piece in error_message.message_pieces:
+                piece.prompt_metadata["partial_content"] = partial_content
+
+        return error_message
+
+    def _extract_partial_content(self, response: Any) -> Optional[str]:
+        """
+        Extract any partial content the model generated before the content filter triggered.
+
+        Called by ``_handle_content_filter_response``, which copies a truthy result into each
+        piece's ``prompt_metadata["partial_content"]``. Override per API; the base returns None.
+
+        Args:
+            response: The response object from OpenAI SDK.
+
+        Returns:
+            The partial text content, or None if no content was generated.
+        """
+        return None
+
     def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
         """
         Validate the response and return error Message if needed.

diff --git a/pyrit/score/conversation_scorer.py b/pyrit/score/conversation_scorer.py
@@ -63,6 +63,14 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non
         # Build the full conversation text
         conversation_text = ""
 
+        # Check if the caller requested scoring of blocked content by inspecting whether
+        # the incoming message was substituted by score_async._apply_blocked_content_substitution.
+        # A substituted piece has partial_content in metadata but response_error="none".
+        incoming_piece = message.message_pieces[0]
+        use_partial_content = (
+            "partial_content" in incoming_piece.prompt_metadata and incoming_piece.response_error == "none"
+        )
+
         # Goes through each message in the conversation and appends user/assistant messages only
         # Explicitly excludes system, tool, developer messages from being scored/included in conversation history
         # they are allowed in validation but not included in the scored conversation text
@@ -71,7 +79,13 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non
                 # Only include user and assistant messages in the conversation text
                 if piece.api_role in ["user", "assistant", "tool"]:
                     role_display = "Assistant (simulated)" if piece.is_simulated else piece.api_role.capitalize()
-                    conversation_text += f"{role_display}: {piece.converted_value}\n"
+                    # For blocked pieces with partial content, use the partial content
+                    # instead of the error JSON when score_blocked_content is enabled
+                    if use_partial_content and piece.is_blocked() and "partial_content" in piece.prompt_metadata:
+                        text = str(piece.prompt_metadata["partial_content"])
+                    else:
+                        text = piece.converted_value
+                    conversation_text += f"{role_display}: {text}\n"
 
         # Create a new message with the concatenated conversation text
         # Preserve the original message piece metadata