@@ -59,16 +59,16 @@ function ChainRulesCore.rrule(cfg::RCR, ::typeof(bias_act!), σ::F, x::AbstractA
5959 end
6060 return Ω, bias_act!_fastback
6161
62- # Slower path: can't overwrite x, but can use derivatives_given_output
63- # This case is WRONG and tests fail, but not sure why
64- elseif isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, T}))
65- Ω2 = fast_act(σ, x).(x) .+ b
66- @show σ b
67- function bias_act!_back2(Δ)
68- dx = only_derivative.(Ω2, σ, x .+ b) .* unthunk(Δ)
69- return (NoTangent(), NoTangent(), dx, biasgrad(dx))
70- end
71- return Ω2, bias_act!_back2
62+ # # Slower path: can't overwrite x, but can use derivatives_given_output
63+ # # This case is WRONG and tests fail, but not sure why
64+ # elseif isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, T}))
65+ # Ω2 = fast_act(σ, x).(x) .+ b
66+ # @show σ b
67+ # function bias_act!_back2(Δ)
68+ # dx = only_derivative.(Ω2, σ, x .+ b) .* unthunk(Δ)
69+ # return (NoTangent(), NoTangent(), dx, biasgrad(dx))
70+ # end
71+ # return Ω2, bias_act!_back2
7272
7373 # Fallback path: let AD handle the broadcast
7474 else
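# A minimal sketch (not this PR's code) of the non-mutating pattern the commented-out
# branch above seems to be aiming for: keep the output Ω, recompute x .+ b in the
# pullback, and take dσ/dx from ChainRulesCore.derivatives_given_output instead of
# letting the AD differentiate the broadcast. Names here (only_deriv,
# bias_act_pullback_sketch) are illustrative, not NNlib API; the bias is assumed to
# broadcast along dim 1. N.b. the commented-out line builds Ω2 as σ.(x) .+ b, i.e.
# adds b after applying σ, which may be why its tests fail.

using ChainRulesCore

only_deriv(y, σ, x) = only(only(ChainRulesCore.derivatives_given_output(y, σ, x)))

function bias_act_pullback_sketch(σ, x::AbstractMatrix, b::AbstractVector)
    Ω = σ.(x .+ b)                                    # forward pass, no mutation of x
    function back(Δ)
        dx = only_deriv.(Ω, σ, x .+ b) .* unthunk(Δ)  # known scalar rule, elementwise
        db = vec(sum(dx; dims = 2))                   # reduce over the batch dimension
        return dx, db   # a real rrule would also prepend NoTangent() for bias_act! and σ
    end
    return Ω, back
end

# e.g. with σ = tanh (whose derivative 1 - Ω^2 only needs the output) this matches
# gradient((x, b) -> sum(tanh.(x .+ b)), x, b) up to floating-point roundoff.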
@@ -96,154 +96,3 @@ function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArr
9696 return x, bias_act!_trivial
9797end
9898
99-
100-
101- # """
102- # add_act(σ, x, y...)
103- # add_act!(σ, x, y, z...)
104-
105- # Equivalent to `σ.(x .+ y .+ z)`. The mutating method `add_act!` overwrites `x` with the result.
106- # """
107- # add_act(σ::Function, x::AbstractArray, yz::AbstractArray...) = σ.(.+(x, yz...)) # fused
108-
109-
110- # function ChainRulesCore.rrule(::typeof(add_act), σ::F, x::AbstractArray{T,N}, yz::AbstractArray...) where {F,T,N}
111- # if isconcretetype(Core.Compiler._return_type(
112- # derivatives_given_output, Tuple{T, F, NotaNumber}))
113-
114- # end
115-
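# A hedged sketch of the two pieces the stub above would need: the fused forward pass
# from the docstring, and essentially the same inferability gate the bias_act! rrule
# uses (the file wraps it in only_derivative), here asking whether dσ/dx is inferrable
# from (output, σ, input) for element type T. add_act_sketch and has_scalar_rule are
# made-up names for illustration, not NNlib API.

using ChainRulesCore

add_act_sketch(σ, x::AbstractArray, yz::AbstractArray...) = σ.(.+(x, yz...))  # one fused broadcast

has_scalar_rule(σ, ::Type{T}) where {T} = isconcretetype(
    Core.Compiler._return_type(ChainRulesCore.derivatives_given_output, Tuple{T, typeof(σ), T}))

# has_scalar_rule(tanh, Float32)      # true: tanh has a registered scalar rule
# has_scalar_rule(x -> x^3, Float32)  # false: no rule, so an rrule would take the fallback path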
116-
117- # bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::Bool) =
118- # # b ? (x .= fast_act(σ, x).(x .+ b)) : (x .= fast_act(σ, x).(x))
119- # (@assert !b "bias=true is not accepted"; (x .= fast_act(σ, x).(x)))
120-
121-
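# Rough usage of the commented-out Bool method above, presumably there so generic code
# can pass bias = false ("no bias"), in which case the call is just an in-place
# activation. NNlib.fast_act is the helper already used in this file (it swaps e.g.
# tanh for tanh_fast); the array sizes below are arbitrary.

using NNlib
x = randn(Float32, 100, 32)
y = copy(x)
y .= NNlib.fast_act(relu, y).(y)   # what bias_act!(relu, x, false) would compute, in place
y ≈ relu.(x)                       # true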
122- # using NNlib, BenchmarkTools
123-
124- #=
125-
126- ## M1 mac, 1.10
127-
128- julia> w, b = rand(Float32, 100, 10000), rand(Float32, 100);
129-
130- julia> @btime bias_act!(relu, $w, $b);
131- min 19.500 μs, mean 21.375 μs (0 allocations)
132-
133- julia> @btime relu.($w .+ $b);
134- min 17.208 μs, mean 62.826 μs (2 allocations, 390.67 KiB)
135-
136- julia> @btime bias_act!(tanh, $w, $b);
137- min 63.792 μs, mean 65.052 μs (0 allocations)
138-
139- julia> @btime tanh_fast.($w .+ $b);
140- min 63.583 μs, mean 102.004 μs (2 allocations, 390.67 KiB)
141-
142- julia> using Zygote
143-
144- julia> @btime gradient((w,b) -> sum(bias_act!(relu, w, b)), $w, $b);
145- min 145.166 μs, mean 150.785 μs (51 allocations, 2.18 KiB)
146-
147- julia> @btime gradient((w,b) -> sum(relu.(w .+ b)), $w, $b);
148- min 165.583 μs, mean 314.267 μs (32 allocations, 1.15 MiB)
149-
150- julia> @btime gradient((w,b) -> sum(bias_act!(tanh, w, b)), $w, $b);
151- min 191.917 μs, mean 195.956 μs (51 allocations, 2.18 KiB)
152-
153- julia> @btime gradient((w,b) -> sum(tanh_fast.(w .+ b)), $w, $b);
154- min 209.458 μs, mean 338.652 μs (32 allocations, 1.15 MiB)
155-
156-
157-
158- ## Cyclops
159-
160- julia> using CUDA # 10x bigger
161-
162- julia> cw, cb = CUDA.rand(Float32, 100, 100_00), CUDA.rand(Float32, 100);
163-
164- julia> @btime CUDA.@sync bias_act!(relu, $cw, $cb);
165- 22.546 μs (27 allocations: 1.45 KiB)
166-
167- julia> @btime CUDA.@sync relu.($cw .+ $cb); # faster, that's odd?
168- 31.282 μs (38 allocations: 1.81 KiB)
169-
170- julia> @btime CUDA.@sync bias_act!(tanh, $cw, $cb);
171- 27.030 μs (27 allocations: 1.45 KiB)
172-
173- julia> @btime CUDA.@sync tanh_fast.($cw .+ $cb);
174- 36.421 μs (38 allocations: 1.81 KiB)
175-
176- julia> using Zygote
177-
178- julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(relu, w, b)), $cw, $cb);
179- 204.507 μs (382 allocations: 18.15 KiB)
180-
181- julia> @btime CUDA.@sync gradient((w,b) -> sum(relu.(w .+ b)), $cw, $cb);
182- 204.458 μs (409 allocations: 19.19 KiB)
183-
184- julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(tanh, w, b)), $cw, $cb);
185- 224.545 μs (382 allocations: 18.15 KiB)
186-
187- julia> @btime CUDA.@sync gradient((w,b) -> sum(tanh_fast.(w .+ b)), $cw, $cb);
188- 204.793 μs (411 allocations: 19.30 KiB)
189-
190-
191- =#
192-
193- #=
194-
195- (jl_fuwIi8) pkg> add https://github.com/mcabbott/NNlib.jl/tree/bias_act_23
196-
197- julia> using NNlib, Zygote, BenchmarkTools
198-
199- julia> w, b, x = rand(Float32, 50, 50), rand(Float32, 50), randn(Float32, 50, 100);
200-
201- julia> @btime bias_act!(relu, $w * $x, $b);
202- min 5.243 μs, mean 8.600 μs (2 allocations, 19.61 KiB)
203-
204- julia> @btime relu.($w * $x .+ $b);
205- min 5.160 μs, mean 10.863 μs (4 allocations, 39.22 KiB)
206-
207- julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b);
208- min 21.042 μs, mean 40.476 μs (43 allocations, 89.83 KiB)
209-
210- julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b);
211- min 21.542 μs, mean 43.947 μs (41 allocations, 128.91 KiB)
212-
213- julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x);
214- min 14.708 μs, mean 26.450 μs (28 allocations, 69.41 KiB)
215-
216- julia> @btime gradient(x -> sum(abs2, x), $x);
217- min 1.938 μs, mean 4.160 μs (2 allocations, 19.61 KiB)
218-
219-
220- # Cyclops
221-
222- julia> @btime bias_act!(relu, $w * $x, $b);
223- 24.786 μs (2 allocations: 19.61 KiB)
224-
225- julia> @btime relu.($w * $x .+ $b);
226- 25.501 μs (4 allocations: 39.22 KiB)
227-
228- julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b);
229- 91.847 μs (43 allocations: 89.83 KiB)
230-
231- julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b);
232- 98.054 μs (41 allocations: 128.91 KiB)
233-
234- julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x);
235- 80.464 μs (28 allocations: 69.41 KiB)
236-
237- julia> @btime gradient(x -> sum(abs2, x), $x);
238- 4.604 μs (2 allocations: 19.61 KiB)
239-
240- julia> @time using CUDA; @time cu(ones(3)) .+ 1;
241-
242- julia> w, b, x = CUDA.rand(Float32, 1000, 1000), CUDA.rand(Float32, 1000), CUDA.rand(Float32, 1000, 1000);
243-
244-
245-
246- =#
247-
248-
249-