|
| 1 | +using NNlib, Zygote, ChainRulesCore, Test |
| 2 | +using Zygote: ForwardDiff |
| 3 | + |
# Resolve every activation name (Symbols listed in NNlib.ACTIVATIONS) to the
# corresponding function object. `getfield` looks the name up directly in the
# NNlib module — unlike the previous `@eval($a)` it does not invoke `eval` at
# runtime and also works for activations that NNlib does not export.
ACTIVATION_FUNCTIONS =
    [getfield(NNlib, a) for a in NNlib.ACTIVATIONS]
| 6 | + |
| 7 | +@testset "bias_act!" begin |
| 8 | + x = randn(3,4) |
| 9 | + b = randn(3) |
| 10 | + @test @inferred(bias_act!(identity, x, false)) === x # pass-through |
| 11 | + @test @inferred(bias_act!(identity, copy(x), b)) ≈ (x .+ b) |
| 12 | + @test @inferred(bias_act!(relu, copy(x), b)) ≈ relu.(x .+ b) |
| 13 | + @test @inferred(bias_act!(tanh, copy(x), b)) ≈ tanh.(x .+ b) |
| 14 | + @test @inferred(bias_act!(tanh, copy(x), false)) ≈ tanh.(x) |
| 15 | + |
| 16 | + # Check that it does overwrite: |
| 17 | + x32 = rand(Float32, 3, 4); x32copy = copy(x32) |
| 18 | + @test @inferred(bias_act!(cbrt, x32, b)) ≈ cbrt.(x32copy .+ b) |
| 19 | + @test x32 ≈ cbrt.(x32copy .+ b) |
| 20 | + |
| 21 | + x32 = rand(Float32, 3, 4); x32copy = copy(x32) # without bias |
| 22 | + @test @inferred(bias_act!(tanh, x32, false)) ≈ tanh.(x32copy) |
| 23 | + @test x32 ≈ tanh.(x32copy) |
| 24 | + |
| 25 | + x32 = rand(Float32, 3, 4); x32copy = copy(x32) # now check gradient rule |
| 26 | + y, back = rrule(Zygote.ZygoteRuleConfig(), bias_act!, relu, x32, b) |
| 27 | + @test y ≈ x32 ≈ relu.(x32copy .+ b) |
| 28 | + |
| 29 | + x32 = rand(Float32, 3, 4); x32copy = copy(x32) # without bias |
| 30 | + y, back = rrule(Zygote.ZygoteRuleConfig(), bias_act!, relu, x32, false) |
| 31 | + @test y ≈ x32 ≈ relu.(x32copy) |
| 32 | + |
| 33 | + # Check that it doesn't try to overwrite non-float arrays: |
| 34 | + xint = rand(-3:3, 3, 4) |
| 35 | + bint = rand(-2:2, 3) |
| 36 | + @test bias_act!(identity, copy(xint), bint) ≈ xint .+ bint |
| 37 | + @test bias_act!(tanh, copy(xint), bint) ≈ tanh.(xint .+ bint) |
| 38 | + @test bias_act!(tanh, copy(xint), false) ≈ tanh.(xint) |
| 39 | + |
| 40 | + # Reject bias===true so that Bool means one thing: |
| 41 | + @test_throws Exception bias_act!(identity, rand(3), true) |
| 42 | + @test_throws Exception bias_act!(cbrt, rand(3), true) |
| 43 | + @test_throws Exception bias_act!(cbrt, rand(1:3, 3), true) |
| 44 | + |
| 45 | + @testset "gradient with $fun" for fun in vcat([identity, tanh, cbrt], |
| 46 | + ACTIVATION_FUNCTIONS, |
| 47 | + [x->x, x -> 1/(x^2+2), x -> leakyrelu(x, 0.33)]) |
| 48 | + # Only some of these go the fast path, `cbrt` is an example of a function NNlib knows nothing about. |
| 49 | + fun == rrelu && continue # this one is randomised! |
| 50 | + fun == hardσ && continue # this one has heisenbugs, not solved by discontinuity-avoidance code below |
| 51 | + |
| 52 | + @test bias_act!(fun, copy(x), b) ≈ fun.(x .+ b) |
| 53 | + @test bias_act!(fun, copy(x), false) ≈ fun.(x) |
| 54 | + |
| 55 | + gx = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), b)), x) |
| 56 | + gxplus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), b)), x .+ eps()) |
| 57 | + gxminus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), b)), x .- eps()) |
| 58 | + if !(gx ≈ gxplus ≈ gxminus) |
| 59 | + @warn "skipping gradient tests due to discontinuity" fun x b |
| 60 | + continue |
| 61 | + end |
| 62 | + @test gx ≈ Zygote.gradient(x -> sum(bias_act!(fun, copy(x), b)), x)[1] |
| 63 | + |
| 64 | + gx2 = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), false)), x) |
| 65 | + gx2plus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), false)), x .- eps()) |
| 66 | + gx2minus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), false)), x .- eps()) |
| 67 | + if !(gx2 ≈ gx2plus ≈ gx2minus) |
| 68 | + @warn "skipping gradient tests due to discontinuity" fun x |
| 69 | + continue |
| 70 | + end |
| 71 | + @test gx2 ≈ Zygote.gradient(x -> sum(bias_act!(fun, copy(x), false)), x)[1] |
| 72 | + |
| 73 | + gb = ForwardDiff.gradient(b -> sum(bias_act!(fun, copy(x), b)), b) |
| 74 | + @test gb ≈ Zygote.gradient(b -> sum(bias_act!(fun, copy(x), b)), b)[1] |
| 75 | + |
| 76 | + @test Zygote.gradient(b -> sum(bias_act!(fun, copy(x), b)), false) == (nothing,) |
| 77 | + @test Zygote.gradient(b -> sum(bias_act!(fun, copy(x), b)), b .> 0) == (nothing,) |
| 78 | + end |
| 79 | + |
| 80 | + @testset "gradient for fast_broadcast!" begin |
| 81 | + # Gradient definition is just to disable mutation inside 2nd order AD |
| 82 | + gx = ForwardDiff.gradient(x -> sum(NNlib._fast_broadcast!(cbrt∘(+), copy(x), b)), x) |
| 83 | + @test gx ≈ Zygote.gradient(x -> sum(NNlib._fast_broadcast!(cbrt∘(+), copy(x), b)), x)[1] |
| 84 | + |
| 85 | + # relu should take the fast path |
| 86 | + g2 = ForwardDiff.gradient(x) do x |
| 87 | + sum(abs2, Zygote.gradient(x -> sum(abs2, bias_act!(relu, copy(x), b)), x)[1]) |
| 88 | + end |
| 89 | + @test_skip gx ≈ Zygote.gradient(x) do x # Here global variable b causes an error |
| 90 | + sum(abs2,Zygote. gradient(x -> sum(abs2, bias_act!(relu, copy(x), b)), x)[1]) |
| 91 | + end |
| 92 | + # Can't differentiate foreigncall expression $(Expr(:foreigncall, :(:jl_eqtable_get), Any, svec(Any, Any, Any), 0, :(:ccall), %5, %3, %4)). |
| 93 | + # [5] (::typeof(∂(accum_global)))(Δ::Nothing) |
| 94 | + @test g2 ≈ Zygote.gradient(x, b) do x, b |
| 95 | + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(relu, copy(x), b)), x, b)[1]) |
| 96 | + end[1] |
| 97 | + |
| 98 | + g3 = ForwardDiff.gradient(x) do x |
| 99 | + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(swish, copy(x), b)), x, b)[1]) |
| 100 | + end |
| 101 | + @test g3 ≈ Zygote.gradient(x, b) do x, b |
| 102 | + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(swish, copy(x), b)), x, b)[1]) |
| 103 | + end[1] |
| 104 | + |
| 105 | + # Anon function sure to take the generic path |
| 106 | + g4 = ForwardDiff.gradient(x) do x |
| 107 | + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(y -> cbrt(y/3), copy(x), b)), x, b)[1]) |
| 108 | + end |
| 109 | + @test g4 ≈ Zygote.gradient(x, b) do x, b |
| 110 | + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(y -> cbrt(y/3), copy(x), b)), x, b)[1]) |
| 111 | + end[1] |
| 112 | + end |
| 113 | +end |
| 114 | + |
0 commit comments