
Commit a39cc43 (parent: ad93347)

Bring EfficientNet and MobileNetV3 up to date with my gen-efficientnet repo

* Split the MobileNetV3 and EfficientNet model files and put the builder and blocks in their own files (they were getting too large)
* Finalize the CondConv EfficientNet variant
* Add the AdvProp weight files and the B8 EfficientNet model
* Refine the feature extraction module for EfficientNet and MobileNetV3
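For orientation, here is a usage sketch (not part of the commit) that exercises the kinds of variants this commit touches through timm.create_model. The specific model names are assumptions based on the repo's naming conventions and may not match the registry exactly.

# Minimal sketch, not from this commit. Model names below are assumptions.
import torch
import timm

cc_model = timm.create_model('efficientnet_cc_b0_4e', pretrained=False)   # CondConv variant (assumed name)
b8_model = timm.create_model('tf_efficientnet_b8_ap', pretrained=False)   # AdvProp-trained B8 (assumed name)

x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    print(cc_model(x).shape)  # expected: torch.Size([1, 1000])
    print(b8_model(x).shape)  # expected: torch.Size([1, 1000])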

File tree: 9 files changed, +1621 / -1291 lines

timm/models/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@
 from .nasnet import *
 from .pnasnet import *
 from .gen_efficientnet import *
+from .mobilenetv3 import *
 from .inception_v3 import *
 from .gluon_resnet import *
 from .gluon_xception import *

timm/models/activations.py
Lines changed: 38 additions & 63 deletions

@@ -7,91 +7,75 @@
 if _USE_MEM_EFFICIENT_ISH:
     # This version reduces memory overhead of Swish during training by
     # recomputing torch.sigmoid(x) in backward instead of saving it.
-    class SwishAutoFn(torch.autograd.Function):
-        """Swish - Described in: https://arxiv.org/abs/1710.05941
-        Memory efficient variant from:
-         https://medium.com/the-artificial-impostor/more-memory-efficient-swish-activation-function-e07c22c12a76
-        """
-        @staticmethod
-        def forward(ctx, x):
-            result = x.mul(torch.sigmoid(x))
-            ctx.save_for_backward(x)
-            return result
+    @torch.jit.script
+    def swish_jit_fwd(x):
+        return x.mul(torch.sigmoid(x))
 
-        @staticmethod
-        def backward(ctx, grad_output):
-            x = ctx.saved_variables[0]
-            sigmoid_x = torch.sigmoid(x)
-            return grad_output.mul(sigmoid_x * (1 + x * (1 - sigmoid_x)))
 
-    def swish(x, inplace=False):
-        # inplace ignored
-        return SwishAutoFn.apply(x)
+    @torch.jit.script
+    def swish_jit_bwd(x, grad_output):
+        x_sigmoid = torch.sigmoid(x)
+        return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
 
 
-    class MishAutoFn(torch.autograd.Function):
-        """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
-        Experimental memory-efficient variant
+    class SwishJitAutoFn(torch.autograd.Function):
+        """ torch.jit.script optimised Swish
+        Inspired by conversation btw Jeremy Howard & Adam Pazske
+        https://twitter.com/jeremyphoward/status/1188251041835315200
         """
 
         @staticmethod
         def forward(ctx, x):
             ctx.save_for_backward(x)
-            y = x.mul(torch.tanh(F.softplus(x)))  # x * tanh(ln(1 + exp(x)))
-            return y
+            return swish_jit_fwd(x)
 
         @staticmethod
         def backward(ctx, grad_output):
-            x = ctx.saved_variables[0]
-            x_sigmoid = torch.sigmoid(x)
-            x_tanh_sp = F.softplus(x).tanh()
-            return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
+            x = ctx.saved_tensors[0]
+            return swish_jit_bwd(x, grad_output)
 
-    def mish(x, inplace=False):
-        # inplace ignored
-        return MishAutoFn.apply(x)
 
+    def swish(x, _inplace=False):
+        return SwishJitAutoFn.apply(x)
 
-    class WishAutoFn(torch.autograd.Function):
-        """Wish: My own mistaken creation while fiddling with Mish. Did well in some experiments.
-        Experimental memory-efficient variant
-        """
 
+    @torch.jit.script
+    def mish_jit_fwd(x):
+        return x.mul(torch.tanh(F.softplus(x)))
+
+
+    @torch.jit.script
+    def mish_jit_bwd(x, grad_output):
+        x_sigmoid = torch.sigmoid(x)
+        x_tanh_sp = F.softplus(x).tanh()
+        return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
+
+
+    class MishJitAutoFn(torch.autograd.Function):
         @staticmethod
         def forward(ctx, x):
             ctx.save_for_backward(x)
-            y = x.mul(torch.tanh(torch.exp(x)))
-            return y
+            return mish_jit_fwd(x)
 
         @staticmethod
         def backward(ctx, grad_output):
-            x = ctx.saved_variables[0]
-            x_exp = x.exp()
-            x_tanh_exp = x_exp.tanh()
-            return grad_output.mul(x_tanh_exp + x * x_exp * (1 - x_tanh_exp * x_tanh_exp))
-
-    def wish(x, inplace=False):
-        # inplace ignored
-        return WishAutoFn.apply(x)
+            x = ctx.saved_tensors[0]
+            return mish_jit_bwd(x, grad_output)
+
+
+    def mish(x, _inplace=False):
+        return MishJitAutoFn.apply(x)
+
 
 else:
     def swish(x, inplace=False):
         """Swish - Described in: https://arxiv.org/abs/1710.05941
         """
         return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
 
 
-    def mish(x, inplace=False):
+    def mish(x, _inplace=False):
         """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
         """
-        inner = F.softplus(x).tanh()
-        return x.mul_(inner) if inplace else x.mul(inner)
-
-
-    def wish(x, inplace=False):
-        """Wish: My own mistaken creation while fiddling with Mish. Did well in some experiments.
-        """
-        inner = x.exp().tanh()
-        return x.mul_(inner) if inplace else x.mul(inner)
+        return x.mul(F.softplus(x).tanh())
 
 
 class Swish(nn.Module):

@@ -112,15 +96,6 @@ def forward(self, x):
         return mish(x, self.inplace)
 
 
-class Wish(nn.Module):
-    def __init__(self, inplace=False):
-        super(Wish, self).__init__()
-        self.inplace = inplace
-
-    def forward(self, x):
-        return wish(x, self.inplace)
-
-
 def sigmoid(x, inplace=False):
     return x.sigmoid_() if inplace else x.sigmoid()
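Below is a minimal check (not part of the commit) that the jit-scripted Swish/Mish path above matches the naive formulations in both value and gradient. It assumes timm.models.activations exposes swish and mish as shown in the diff and that the memory-efficient branch is active; if it is not, the check passes trivially.

# Minimal equivalence check, assuming the module layout shown in the diff above.
import torch
import torch.nn.functional as F
from timm.models.activations import swish, mish

def naive_swish(x):
    return x * torch.sigmoid(x)

def naive_mish(x):
    return x * torch.tanh(F.softplus(x))

x = torch.randn(4, 8, dtype=torch.double, requires_grad=True)
for fast, ref in [(swish, naive_swish), (mish, naive_mish)]:
    y_fast, y_ref = fast(x), ref(x)
    assert torch.allclose(y_fast, y_ref)
    # gradients from the custom backward should match autograd on the naive version
    g_fast, = torch.autograd.grad(y_fast.sum(), x)
    g_ref, = torch.autograd.grad(y_ref.sum(), x)
    assert torch.allclose(g_fast, g_ref)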

timm/models/conv2d_layers.py
Lines changed: 45 additions & 47 deletions

@@ -102,13 +102,14 @@ def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
 
 class MixedConv2d(nn.Module):
     """ Mixed Grouped Convolution
-
     Based on MDConv and GroupedConv in MixNet impl:
       https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
+
+    NOTE: This does not currently work with torch.jit.script
     """
 
     def __init__(self, in_channels, out_channels, kernel_size=3,
-                 stride=1, padding='', dilation=1, mixed_dilated=False, depthwise=False, **kwargs):
+                 stride=1, padding='', dilation=1, depthwise=False, **kwargs):
         super(MixedConv2d, self).__init__()
 
         kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size]

@@ -118,17 +119,13 @@ def __init__(self, in_channels, out_channels, kernel_size=3,
         self.in_channels = sum(in_splits)
         self.out_channels = sum(out_splits)
         for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
-            d = dilation
-            # FIXME make compat with non-square kernel/dilations/strides
-            if stride == 1 and mixed_dilated:
-                d, k = (k - 1) // 2, 3
             conv_groups = out_ch if depthwise else 1
             # use add_module to keep key space clean
             self.add_module(
                 str(idx),
                 create_conv2d_pad(
                     in_ch, out_ch, k, stride=stride,
-                    padding=padding, dilation=d, groups=conv_groups, **kwargs)
+                    padding=padding, dilation=dilation, groups=conv_groups, **kwargs)
             )
         self.splits = in_splits
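For reference, a small usage sketch (not from the commit) of MixedConv2d after this change. The import path mirrors the file above, and the behaviour assumed here is that channels are split across the listed kernel sizes, each split getting its own (optionally depthwise) convolution.

# Minimal sketch assuming the MixedConv2d signature shown in the diff above.
import torch
from timm.models.conv2d_layers import MixedConv2d

# 48 input/output channels split across 3x3, 5x5 and 7x7 depthwise kernels
m = MixedConv2d(48, 48, kernel_size=[3, 5, 7], stride=1, depthwise=True)
x = torch.randn(2, 48, 32, 32)
print(m(x).shape)  # expected: torch.Size([2, 48, 32, 32])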

@@ -154,12 +151,12 @@ def condconv_initializer(weight):
 
 class CondConv2d(nn.Module):
     """ Conditional Convolution
-
     Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
 
     Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
     https://github.com/pytorch/pytorch/issues/17983
     """
+    __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
 
     def __init__(self, in_channels, out_channels, kernel_size=3,
                  stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):

@@ -171,13 +168,10 @@ def __init__(self, in_channels, out_channels, kernel_size=3,
         self.stride = _pair(stride)
         padding_val, is_padding_dynamic = get_padding_value(
             padding, kernel_size, stride=stride, dilation=dilation)
-        self.conv_fn = conv2d_same if is_padding_dynamic else F.conv2d
+        self.dynamic_padding = is_padding_dynamic  # if in forward to work with torchscript
         self.padding = _pair(padding_val)
         self.dilation = _pair(dilation)
-        self.transposed = False
-        self.output_padding = _pair(0)
         self.groups = groups
-        self.padding_mode = 'zero'
         self.num_experts = num_experts
 
         self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size

@@ -186,60 +180,63 @@ def __init__(self, in_channels, out_channels, kernel_size=3,
             weight_num_param *= wd
         self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
 
-        # FIXME I haven't tested bias yet
         if bias:
             self.bias_shape = (self.out_channels,)
-            condconv_bias_shape = (self.num_experts, self.out_channels)
-            self.bias = torch.nn.Parameter(torch.Tensor(condconv_bias_shape))
+            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
         else:
             self.register_parameter('bias', None)
 
         self.reset_parameters()
-        # FIXME once I'm satisfied this works, remove the looping path?
-        self._use_groups = True  # use groups for parallel per-batch-element kernel convolution
 
     def reset_parameters(self):
         init_weight = get_condconv_initializer(
             partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
         init_weight(self.weight)
         if self.bias is not None:
-            # FIXME bias not tested
             fan_in = np.prod(self.weight_shape[1:])
             bound = 1 / math.sqrt(fan_in)
             init_bias = get_condconv_initializer(
                 partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
             init_bias(self.bias)
 
     def forward(self, x, routing_weights):
-        weight = torch.matmul(routing_weights, self.weight)
-        bias = torch.matmul(routing_weights, self.bias) if self.bias is not None else None
         B, C, H, W = x.shape
-        if self._use_groups:
-            new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
-            weight = weight.view(new_weight_shape)
-            # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
-            x = x.view(1, B * C, H, W)
-            out = self.conv_fn(
+        weight = torch.matmul(routing_weights, self.weight)
+        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
+        weight = weight.view(new_weight_shape)
+        bias = None
+        if self.bias is not None:
+            bias = torch.matmul(routing_weights, self.bias)
+            bias = bias.view(B * self.out_channels)
+        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
+        x = x.view(1, B * C, H, W)
+        if self.dynamic_padding:
+            out = conv2d_same(
                 x, weight, bias, stride=self.stride, padding=self.padding,
                 dilation=self.dilation, groups=self.groups * B)
-            out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
         else:
-            x = torch.split(x, 1, 0)
-            weight = torch.split(weight, 1, 0)
-            if self.bias is not None:
-                bias = torch.matmul(routing_weights, self.bias)
-                bias = torch.split(bias, 1, 0)
-            else:
-                bias = [None] * B
-            out = []
-            for xi, wi, bi in zip(x, weight, bias):
-                wi = wi.view(*self.weight_shape)
-                if bi is not None:
-                    bi = bi.view(*self.bias_shape)
-                out.append(self.conv_fn(
-                    xi, wi, bi, stride=self.stride, padding=self.padding,
-                    dilation=self.dilation, groups=self.groups))
-            out = torch.cat(out, 0)
+            out = F.conv2d(
+                x, weight, bias, stride=self.stride, padding=self.padding,
+                dilation=self.dilation, groups=self.groups * B)
+        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
+
+        # Literal port (from TF definition)
+        # x = torch.split(x, 1, 0)
+        # weight = torch.split(weight, 1, 0)
+        # if self.bias is not None:
+        #     bias = torch.matmul(routing_weights, self.bias)
+        #     bias = torch.split(bias, 1, 0)
+        # else:
+        #     bias = [None] * B
+        # out = []
+        # for xi, wi, bi in zip(x, weight, bias):
+        #     wi = wi.view(*self.weight_shape)
+        #     if bi is not None:
+        #         bi = bi.view(*self.bias_shape)
+        #     out.append(self.conv_fn(
+        #         xi, wi, bi, stride=self.stride, padding=self.padding,
+        #         dilation=self.dilation, groups=self.groups))
+        # out = torch.cat(out, 0)
         return out
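CondConv2d's forward now commits to the grouped-convolution trick: the expert-mixed kernels for the whole batch are stacked along the output-channel axis, the batch is folded into the channel dimension, and a single convolution with groups multiplied by the batch size applies a different kernel to each sample. A standalone sketch of that mechanism (plain PyTorch only, not the commit's code):

# Standalone illustration of the per-sample grouped-conv trick used by CondConv2d.
import torch
import torch.nn.functional as F

B, C_in, C_out, H, W, K = 4, 8, 16, 14, 14, 3
x = torch.randn(B, C_in, H, W)
per_sample_weight = torch.randn(B, C_out, C_in, K, K)  # a different kernel per sample

# Fold the batch into channels and run one grouped conv with groups=B
out = F.conv2d(
    x.view(1, B * C_in, H, W),
    per_sample_weight.view(B * C_out, C_in, K, K),
    padding=1, groups=B)
out = out.view(B, C_out, H, W)

# Reference: loop over the batch, one conv per sample
ref = torch.cat([F.conv2d(x[i:i + 1], per_sample_weight[i], padding=1) for i in range(B)])
assert torch.allclose(out, ref, atol=1e-5)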

@@ -250,13 +247,14 @@ def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
         assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
         # We're going to use only lists for defining the MixedConv2d kernel groups,
         # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
-        return MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+        m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
     else:
         depthwise = kwargs.pop('depthwise', False)
         groups = out_chs if depthwise else 1
         if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
-            create_fn = CondConv2d
+            m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
         else:
-            create_fn = create_conv2d_pad
-        return create_fn(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+            m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+    return m
+