Commit ff0eaf3

aobo-ymeta-codesync[bot] authored and committed
beautify the handling of mask of None in MMImageMaskInput (#1674)
Summary:
Pull Request resolved: #1674

If `mask` is `None`, the whole `image` is treated as one interpretable feature. A dummy `mask` whose pixels all belong to feature id `0` is initialized, so the downstream code can be simplified by assuming a `mask` is always given.

Reviewed By: craymichael

Differential Revision: D87890030

fbshipit-source-id: 8743f76464433c2bc4c9a8d7b4a193dcded48920
1 parent d8d84e5 commit ff0eaf3
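
A minimal usage sketch of the new behavior, pieced together from the diff below; the constructor keyword names and the need to pass `baselines` explicitly are assumptions, and `processor_fn` is a trivial stand-in:

import PIL.Image
import torch

from captum.attr._utils.interpretable_input import MMImageMaskInput


def processor_fn(img):
    # Stand-in only: a real processor_fn would run the model's preprocessing.
    return {"image": img}


image = PIL.Image.new("RGB", (8, 6))

# With mask=None, the constructor now builds a dummy all-zero mask internally,
# so the whole image is treated as a single interpretable feature (id 0).
inp = MMImageMaskInput(
    processor_fn=processor_fn, image=image, mask=None, baselines=(0, 0, 0)
)
assert inp.n_itp_features == 1
assert inp.mask_id_to_idx == {0: 0}
assert torch.all(inp.mask == 0)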

File tree: 2 files changed (+30, -34 lines)

captum/attr/_utils/interpretable_input.py

Lines changed: 22 additions & 31 deletions
@@ -537,20 +537,20 @@ class MMImageMaskInput(InterpretableInput):
 >>>
 >>> prompt = processor.apply_chat_template(
 >>> messages, add_generation_prompt=True
->>>)
+>>> )
 >>>
 >>> return processor(
 >>> text=prompt,
 >>> images=image,
 >>> return_tensors="pt",
 >>> ).to(model.device)
-
+>>>
 >>> image = Image.open("test.jpg")
-
+>>>
 >>> # Split horizontally: left half = 0, right half = 1
 >>> mask = torch.zeros(image.size[::-1], dtype=torch.int32)
 >>> mask[:, image.size[0] // 2:] = 1
-
+>>>
 >>> image_mask_inp = MMImageMaskInput(
 >>> processor_fn=processor_fn,
 >>> image=image,
@@ -567,7 +567,7 @@ class MMImageMaskInput(InterpretableInput):
 
     processor_fn: Callable[[PIL.Image.Image], Any]
    image: PIL.Image.Image
-    mask: Optional[Tensor]
+    mask: Tensor
     baselines: Tuple[int, int, int]
     n_itp_features: int
     original_model_inputs: Any
@@ -585,22 +585,24 @@ def __init__(
 
         self.processor_fn = processor_fn
         self.image = image
-        self.mask = mask
         self.baselines = baselines
 
+        # Create a dummy mask if None is provided
         if mask is None:
-            self.n_itp_features = 1
-            self.mask_id_to_idx = {}
+            # Create a mask with all zeros (entire image as one segment)
+            image_shape = (image.size[1], image.size[0])  # (height, width)
+            mask = torch.zeros(image_shape, dtype=torch.int32)
         else:
             # Validate that mask size matches image size
             image_shape = (image.size[1], image.size[0])  # (height, width)
             assert (
                 mask.shape == image_shape
             ), f"mask shape {mask.shape} must match image shape {image_shape}"
 
-            mask_ids = torch.unique(mask)
-            self.n_itp_features = len(mask_ids)
-            self.mask_id_to_idx = {int(mid): i for i, mid in enumerate(mask_ids)}
+        self.mask = mask
+        mask_ids = torch.unique(mask)
+        self.n_itp_features = len(mask_ids)
+        self.mask_id_to_idx = {int(mid): i for i, mid in enumerate(mask_ids)}
 
         self.original_model_inputs = processor_fn(image)
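
As a reading aid for the `__init__` hunk above: once a mask tensor always exists, the same three lines derive the feature bookkeeping for both the dummy and the user-provided case. A standalone sketch with made-up shapes and ids:

import torch

mask = torch.zeros((4, 6), dtype=torch.int32)  # dummy mask: one feature, id 0
mask[:, 3:] = 7                                # or a user mask with arbitrary ids

mask_ids = torch.unique(mask)                  # tensor([0, 7])
n_itp_features = len(mask_ids)                 # 2
mask_id_to_idx = {int(mid): i for i, mid in enumerate(mask_ids)}  # {0: 0, 7: 1}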

@@ -613,14 +615,10 @@ def to_model_input(self, perturbed_tensor: Optional[Tensor] = None) -> Any:
 
         img_array = np.array(self.image)
 
-        if self.mask is None:
-            if perturbed_tensor[0][0] == 0:
-                img_array[:, :] = self.baselines
-        else:
-            for mask_id, itp_idx in self.mask_id_to_idx.items():
-                if perturbed_tensor[0][itp_idx] == 0:
-                    mask_positions = self.mask == mask_id
-                    img_array[mask_positions] = self.baselines
+        for mask_id, itp_idx in self.mask_id_to_idx.items():
+            if perturbed_tensor[0][itp_idx] == 0:
+                mask_positions = self.mask == mask_id
+                img_array[mask_positions] = self.baselines
 
         perturbed_image = PIL.Image.fromarray(img_array.astype("uint8"))
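
A standalone sketch of the perturbation step in `to_model_input` above, with made-up shapes, baselines, and a hypothetical `perturbed_tensor`; the real method indexes `np.array(self.image)` with the stored mask:

import numpy as np
import torch

img_array = np.full((4, 6, 3), 255, dtype=np.uint8)  # stand-in for np.array(self.image)
mask = torch.zeros((4, 6), dtype=torch.int32)
mask[:, 3:] = 1                                       # two features: ids 0 and 1
mask_id_to_idx = {0: 0, 1: 1}
baselines = (0, 0, 0)

perturbed_tensor = torch.tensor([[1, 0]])             # keep feature 0, drop feature 1

for mask_id, itp_idx in mask_id_to_idx.items():
    if perturbed_tensor[0][itp_idx] == 0:
        mask_positions = (mask == mask_id).numpy()    # boolean (H, W) selector
        img_array[mask_positions] = baselines         # blank out the dropped feature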

@@ -629,18 +627,11 @@ def to_model_input(self, perturbed_tensor: Optional[Tensor] = None) -> Any:
     def format_attr(self, itp_attr: Tensor) -> Tensor:
         device = itp_attr.device
 
-        if self.mask is None:
-            # When mask is None, treat entire image as one segment
-            # Create a uniform mask of all zeros to broadcast the single attribution
-            img_array = np.array(self.image)
-            image_shape = img_array.shape[:2]  # (height, width)
-            formatted_mask = torch.zeros(image_shape, dtype=torch.long, device=device)
-        else:
-            # Map mask IDs to continuous indices
-            image_shape = self.mask.shape
-            formatted_mask = torch.zeros_like(self.mask, device=device)
-            for mask_id, itp_idx in self.mask_id_to_idx.items():
-                formatted_mask[self.mask == mask_id] = itp_idx
+        # Map mask IDs to continuous indices
+        image_shape = self.mask.shape
+        formatted_mask = torch.zeros_like(self.mask, device=device)
+        for mask_id, itp_idx in self.mask_id_to_idx.items():
+            formatted_mask[self.mask == mask_id] = itp_idx
 
         formatted_attr = _scatter_itp_attr_by_mask(
             itp_attr,
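
Similarly, a standalone sketch of the simplified `format_attr` path: mask ids are remapped to contiguous interpretable-feature indices before the per-feature attributions are scattered back onto pixels. `_scatter_itp_attr_by_mask` is not reproduced here; a plain gather stands in for it:

import torch

mask = torch.tensor([[0, 0, 7], [0, 7, 7]], dtype=torch.int32)
mask_id_to_idx = {0: 0, 7: 1}
itp_attr = torch.tensor([[0.25, -0.75]])       # one attribution per feature

formatted_mask = torch.zeros_like(mask)
for mask_id, itp_idx in mask_id_to_idx.items():
    formatted_mask[mask == mask_id] = itp_idx  # contiguous indices 0..n-1

# Broadcasting the per-feature scores by index yields a per-pixel attribution map.
per_pixel_attr = itp_attr[0][formatted_mask.long()]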

tests/attr/test_interpretable_input.py

Lines changed: 8 additions & 3 deletions
@@ -2,7 +2,7 @@
 
 # pyre-unsafe
 
-from typing import Any, Dict, List, Literal, Optional, overload, Union
+from typing import Dict, List, Literal, Optional, overload, Union
 
 import numpy as np
 import PIL.Image
@@ -260,9 +260,14 @@ def test_init_without_mask(self) -> None:
         )
 
         # Assert: verify n_itp_features is 1 when no mask provided
+        # When mask is None, a dummy mask with all zeros is created
         self.assertEqual(mm_input.n_itp_features, 1)
-        self.assertEqual(len(mm_input.mask_id_to_idx), 0)
-        self.assertIsNone(mm_input.mask)
+        self.assertEqual(mm_input.mask_id_to_idx, {0: 0})
+        self.assertIsNotNone(mm_input.mask)
+        # Verify dummy mask has all zeros
+        self.assertTrue(torch.all(mm_input.mask == 0))
+        # Verify dummy mask shape matches image size (height, width)
+        self.assertEqual(mm_input.mask.shape, (image.size[1], image.size[0]))
 
     def test_init_with_mask(self) -> None:
         # Setup: create test image and mask with 2 segments