Commit 57d4609

fix(UNETR): access Bias term in SAB block (#5149)
Fixes #5148.

### Types of changes

- [x] Non-breaking change (fix or new feature that would not break existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [x] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.

Signed-off-by: a-parida12 <abhijeet.parida@tum.de>
Signed-off-by: monai-bot <monai.miccai2019@gmail.com>
Co-authored-by: monai-bot <monai.miccai2019@gmail.com>
1 parent a1e1af5 commit 57d4609

4 files changed: +14, -5 lines


monai/networks/blocks/selfattention.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -23,12 +23,13 @@ class SABlock(nn.Module):
     An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
     """
 
-    def __init__(self, hidden_size: int, num_heads: int, dropout_rate: float = 0.0) -> None:
+    def __init__(self, hidden_size: int, num_heads: int, dropout_rate: float = 0.0, qkv_bias: bool = False) -> None:
         """
         Args:
             hidden_size: dimension of hidden layer.
             num_heads: number of attention heads.
             dropout_rate: faction of the input units to drop.
+            qkv_bias: bias term for the qkv linear layer.
 
         """
 
@@ -42,7 +43,7 @@ def __init__(self, hidden_size: int, num_heads: int, dropout_rate: float = 0.0)
 
         self.num_heads = num_heads
         self.out_proj = nn.Linear(hidden_size, hidden_size)
-        self.qkv = nn.Linear(hidden_size, hidden_size * 3, bias=False)
+        self.qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias)
         self.input_rearrange = Rearrange("b h (qkv l d) -> qkv b l h d", qkv=3, l=num_heads)
         self.out_rearrange = Rearrange("b h l d -> b l (h d)")
         self.drop_output = nn.Dropout(dropout_rate)
```
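A minimal sketch of what the change enables (the tensor sizes and dropout value below are illustrative assumptions, not part of the diff):

```python
import torch
from monai.networks.blocks.selfattention import SABlock

# Before this commit the qkv projection was hard-coded to bias=False.
block = SABlock(hidden_size=768, num_heads=12, dropout_rate=0.1, qkv_bias=True)
print(block.qkv.bias is not None)  # expected: True

x = torch.randn(2, 196, 768)  # (batch, tokens, hidden_size) -- illustrative shapes
print(block(x).shape)         # expected: torch.Size([2, 196, 768])
```

The default `qkv_bias=False` keeps the previous behaviour for existing code.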

monai/networks/blocks/transformerblock.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -21,13 +21,16 @@ class TransformerBlock(nn.Module):
     An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
     """
 
-    def __init__(self, hidden_size: int, mlp_dim: int, num_heads: int, dropout_rate: float = 0.0) -> None:
+    def __init__(
+        self, hidden_size: int, mlp_dim: int, num_heads: int, dropout_rate: float = 0.0, qkv_bias: bool = False
+    ) -> None:
         """
         Args:
             hidden_size: dimension of hidden layer.
             mlp_dim: dimension of feedforward layer.
             num_heads: number of attention heads.
             dropout_rate: faction of the input units to drop.
+            qkv_bias: apply bias term for the qkv linear layer
 
         """
 
@@ -41,7 +44,7 @@ def __init__(self, hidden_size: int, mlp_dim: int, num_heads: int, dropout_rate:
 
         self.mlp = MLPBlock(hidden_size, mlp_dim, dropout_rate)
         self.norm1 = nn.LayerNorm(hidden_size)
-        self.attn = SABlock(hidden_size, num_heads, dropout_rate)
+        self.attn = SABlock(hidden_size, num_heads, dropout_rate, qkv_bias)
         self.norm2 = nn.LayerNorm(hidden_size)
 
     def forward(self, x):
```
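A hedged sketch of the pass-through at the transformer-block level; `hidden_size`, `mlp_dim`, and the input shape are assumed example values:

```python
import torch
from monai.networks.blocks.transformerblock import TransformerBlock

blk = TransformerBlock(hidden_size=768, mlp_dim=3072, num_heads=12, dropout_rate=0.0, qkv_bias=True)
# The flag is forwarded to the SABlock stored in `attn` (see the diff above).
print(blk.attn.qkv.bias is not None)  # expected: True

x = torch.randn(2, 196, 768)  # illustrative (batch, tokens, hidden_size)
print(blk(x).shape)           # expected: torch.Size([2, 196, 768])
```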

monai/networks/nets/unetr.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -40,6 +40,7 @@ def __init__(
         res_block: bool = True,
         dropout_rate: float = 0.0,
         spatial_dims: int = 3,
+        qkv_bias: bool = False,
     ) -> None:
         """
         Args:
@@ -56,6 +57,7 @@ def __init__(
             res_block: bool argument to determine if residual block is used.
             dropout_rate: faction of the input units to drop.
             spatial_dims: number of spatial dims.
+            qkv_bias: apply the bias term for the qkv linear layer in self attention block
 
         Examples::
 
@@ -96,6 +98,7 @@ def __init__(
             classification=self.classification,
             dropout_rate=dropout_rate,
             spatial_dims=spatial_dims,
+            qkv_bias=qkv_bias,
         )
         self.encoder1 = UnetrBasicBlock(
             spatial_dims=spatial_dims,
```
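A hedged usage sketch at the network level; only `qkv_bias` comes from this diff, while the other constructor arguments (`in_channels`, `out_channels`, `img_size`) and their values are assumed here for illustration:

```python
import torch
from monai.networks.nets import UNETR

# Assumed example values; only qkv_bias is introduced by this commit.
net = UNETR(
    in_channels=1,
    out_channels=2,
    img_size=(96, 96, 96),
    qkv_bias=True,  # defaults to False, so existing models keep the old behaviour
)
with torch.no_grad():
    out = net(torch.randn(1, 1, 96, 96, 96))
print(out.shape)  # expected: torch.Size([1, 2, 96, 96, 96])
```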

monai/networks/nets/vit.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -44,6 +44,7 @@ def __init__(
         dropout_rate: float = 0.0,
         spatial_dims: int = 3,
         post_activation="Tanh",
+        qkv_bias: bool = False,
     ) -> None:
         """
         Args:
@@ -61,6 +62,7 @@ def __init__(
             spatial_dims: number of spatial dimensions.
             post_activation: add a final acivation function to the classification head when `classification` is True.
                 Default to "Tanh" for `nn.Tanh()`. Set to other values to remove this function.
+            qkv_bias: apply bias to the qkv linear layer in self attention block
 
         Examples::
 
@@ -95,7 +97,7 @@ def __init__(
             spatial_dims=spatial_dims,
         )
         self.blocks = nn.ModuleList(
-            [TransformerBlock(hidden_size, mlp_dim, num_heads, dropout_rate) for i in range(num_layers)]
+            [TransformerBlock(hidden_size, mlp_dim, num_heads, dropout_rate, qkv_bias) for i in range(num_layers)]
         )
         self.norm = nn.LayerNorm(hidden_size)
         if self.classification:
```
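A hedged sketch of the ViT entry point; `in_channels`, `img_size`, and `patch_size` are assumed example values, while `qkv_bias` and the `blocks` / `attn` / `qkv` attribute names come from the diffs above:

```python
from monai.networks.nets import ViT

# Assumed example values; only qkv_bias is introduced by this commit.
vit = ViT(
    in_channels=1,
    img_size=(96, 96, 96),
    patch_size=(16, 16, 16),
    qkv_bias=True,
)
# Each TransformerBlock in the nn.ModuleList now passes the flag to its SABlock,
# so every qkv projection should expose a bias parameter.
print(all(blk.attn.qkv.bias is not None for blk in vit.blocks))  # expected: True
```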
