
Commit 672dc07

[Attn Masks] Add skip option for non-packed sequences (#42367)
skip option
1 parent 59ed41e commit 672dc07

File tree

2 files changed: +44, -3 lines changed

src/transformers/masking_utils.py

Lines changed: 8 additions & 3 deletions

```diff
@@ -644,7 +644,7 @@ class AttentionMaskInterface(GeneralInterface):
 ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()
 
 
-def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
+def find_packed_sequence_indices(position_ids: torch.Tensor) -> Optional[torch.Tensor]:
     """
     Find the indices of the sequence to which each new query token in the sequence belongs when using packed
     tensor format (i.e. several sequences packed in the same batch dimension).
@@ -656,6 +656,9 @@ def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
     Returns:
         A 2D tensor where each similar integer indicates that the tokens belong to the same sequence. For example, if we
         pack 3 sequences of 2, 3 and 1 tokens respectively along a single batch dim, this will return [[0, 0, 1, 1, 1, 2]].
+
+        If there is only one sequence in each batch item (and we don't compile), then we return `None`, indicating
+        no packed sequences. This is the same as [[0, 0, 0, 0, 0, 0]] for the example above.
     """
     # What separates different sequences is when 2 consecutive position_ids are separated by more than 1. So
     # taking the diff (by prepending the first value - 1 to keep correct indexing) and applying cumsum to the result
@@ -666,8 +669,10 @@ def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
     position_diff = torch.diff(position_ids, prepend=first_dummy_value, dim=-1)
     packed_sequence_mask = (position_diff != 1).cumsum(-1)
 
-    # Here it would be nice to return None if we did not detect packed sequence format, i.e. if `packed_sequence_mask[:, -1] == 0`
-    # but it causes issues with export
+    # Sadly this is dynamic control flow, so we cannot enable this check for anything compile-related
+    if not is_tracing(packed_sequence_mask) and (packed_sequence_mask[:, -1] == 0).all():
+        return None
+
     return packed_sequence_mask
```
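To illustrate the detection logic in the hunk above, here is a minimal, self-contained sketch. The function name and the `tracing` flag are illustrative stand-ins (the real code uses the library's `is_tracing` helper and lives inside `masking_utils.py`), so treat it as a sketch of the technique, not the implementation itself:

```python
from typing import Optional

import torch


def find_packed_sequence_indices_sketch(position_ids: torch.Tensor, tracing: bool = False) -> Optional[torch.Tensor]:
    """Standalone sketch of the packed-sequence detection added in this commit (illustrative only)."""
    # Two consecutive position_ids that differ by more than 1 mark the start of a new packed sequence.
    # Prepending `first value - 1` keeps the result aligned with the original token indexing.
    first_dummy_value = position_ids[:, :1] - 1
    position_diff = torch.diff(position_ids, prepend=first_dummy_value, dim=-1)
    packed_sequence_mask = (position_diff != 1).cumsum(-1)

    # Data-dependent control flow: only allowed when we are not tracing/compiling.
    if not tracing and (packed_sequence_mask[:, -1] == 0).all():
        return None
    return packed_sequence_mask


# Three sequences of lengths 2, 3 and 1 packed along one batch row -> tensor([[0, 0, 1, 1, 1, 2]])
print(find_packed_sequence_indices_sketch(torch.tensor([[0, 1, 0, 1, 2, 0]])))
# A single ordinary sequence -> None, so downstream mask construction can be skipped entirely
print(find_packed_sequence_indices_sketch(torch.arange(6)[None, :]))
```

Returning `None` lets the mask-building path be skipped for the common non-packed case, while the tensor-returning path is kept for anything traced or compiled, where this data-dependent branch is not allowed.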

tests/utils/test_masking_utils.py

Lines changed: 36 additions & 0 deletions

```diff
@@ -153,6 +153,42 @@ def test_find_packed_sequence_indices(self):
         EXPECTED_SEQUENCE_INDICES = torch.tensor([[0, 0, 0, 0, 1, 1, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])
         self.assertTrue((find_packed_sequence_indices(position_ids) == EXPECTED_SEQUENCE_INDICES).all())
 
+    def test_nonpacked_sequence_mask_skip(self):
+        config = LlamaConfig()
+        config._attn_implementation = "sdpa"
+
+        batch_size = 2
+        sequence_length = 10
+        cache_position = torch.arange(sequence_length)
+
+        # Non-packed sequences
+        position_ids = torch.arange(sequence_length)[None, :]
+
+        causal_mask = create_causal_mask(
+            config=config,
+            # we only need batch size, seq_length and dtype here - we don't care about the values of the embeddings
+            input_embeds=torch.empty((batch_size, sequence_length), dtype=torch.float16),
+            attention_mask=None,
+            cache_position=cache_position,
+            past_key_values=None,
+            position_ids=position_ids,
+        )
+        # mask creation for the non-packed sequence should be skipped
+        self.assertTrue(causal_mask is None)
+
+        create_causal_mask_compiled = torch.compile(create_causal_mask, mode="reduce-overhead")
+        causal_mask = create_causal_mask_compiled(
+            config=config,
+            # we only need batch size, seq_length and dtype here - we don't care about the values of the embeddings
+            input_embeds=torch.empty((batch_size, sequence_length), dtype=torch.float16),
+            attention_mask=None,
+            cache_position=cache_position,
+            past_key_values=None,
+            position_ids=position_ids,
+        )
+        # cannot be skipped under compile, should result in a triu mask
+        self.assertTrue(torch.equal(~torch.ones(*causal_mask.shape).triu(diagonal=1).bool(), causal_mask))
+
     def test_chunked_mask_with_left_padding_and_large_prefill(self):
         # Make sure we have an attention_chunk_size in the config
         config = LlamaConfig(attention_chunk_size=3, attn_implementation="sdpa")
```
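Not part of the commit, but as a quick illustration of the `triu` assertion above and of why a `None` mask is safe for a single non-packed sequence: the boolean causal pattern is just the complement of `triu(diagonal=1)`, and PyTorch's `scaled_dot_product_attention` can rebuild the same pattern on its own via `is_causal=True` when no mask is passed. How transformers dispatches this internally is not shown here; this only demonstrates the equivalence:

```python
import torch
import torch.nn.functional as F

seq_len, head_dim = 10, 8

# The boolean causal pattern the compiled-path assertion checks for: True = "may attend".
causal_bool = ~torch.ones(seq_len, seq_len).triu(diagonal=1).bool()
assert torch.equal(causal_bool, torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool)))

# For a single (non-packed) sequence, skipping the explicit mask and letting SDPA apply
# causality itself gives the same attention output as passing the boolean mask.
q = k = v = torch.randn(1, 1, seq_len, head_dim)
out_with_mask = F.scaled_dot_product_attention(q, k, v, attn_mask=causal_bool)
out_is_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)
assert torch.allclose(out_with_mask, out_is_causal, atol=1e-6)
```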
