Tweak mandelbrot-fast.jl for performance.

non-Jedi · non-Jedi · commit 0fa8313ead7c · 2019-12-20T17:46:39.000-05:00
Multiple versions with different threading strategies are included because which version is fastest depends on the machine. Depending on the machine, gains can exceed 20% compared to the original mandelbrot-fast.jl.

NOTE: running mandelbrot-fast.v3.jl requires installing https://github.com/mohamed82008/KissThreading.jl

Changes included in every version:
- Remove threading from filling xvals and yvals; the threading overhead is too high for such a simple operation.
- Remove the @simd annotation from mandel_inner; SIMD already occurs at the level of mand8, and @simd doesn't hurt runtime but increases compilation time.
- Only run mandelbrot when !isinteractive() to make development and debugging easier.
- Various tweaks and minor stylistic updates for succinctness and maybe a marginal increase in performance.
diff --git a/mandelbrot/mandelbrot-fast.jl b/mandelbrot/mandelbrot-fast.jl
@@ -1,74 +1,82 @@
 #=
 The Computer Language Benchmarks Game
  https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
  direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
  https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
- modified for Julia 1.0 by Simon Danisch
+
+ modified for Julia 1.0 by Simon Danisch.
+ tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
 =#
+# Eight Float32 zeros: the starting value for every 8-wide tuple in mand8.
 const zerov8 = ntuple(x-> 0f0, 8)
-
-@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci)
-    Zi = 2f0 .* Zr .* Zi .+ ci
-    Zr = Tr .- Ti .+ cr
-    Tr = Zr .* Zr
-    Ti = Zi .* Zi
-    return Zr,Zi,Tr,Ti
-end
+# masks[i] clears the bit that belongs to lane i (lane 1 is the highest bit).
+const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
+               0b11111011, 0b11111101, 0b11111110)
 
 # Calculate mandelbrot set for one Vec8 into one byte
+# `cr` holds eight real parts; `ci` is broadcast across all eight lanes.
+# Iterates z -> z^2 + c at most 50 times, testing for escape after every
+# fifth step. Bit i of the result (counted from the highest bit) stays set
+# only if lane i's final |z|^2 is <= 4.
 Base.@propagate_inbounds function mand8(cr, ci)
-    Zr = zerov8
-    Zi = zerov8
-    Tr = zerov8
-    Ti = zerov8
-    t = zerov8
-    i = 0
+    Zr = Zi = Tr = Ti = t = zerov8
 
-    while i<50
-        for _ in 1:5
-            Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci)
-            i += 1
+    for _=1:10
+        for _=1:5
+            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zr = Tr .- Ti .+ cr
+            Tr = Zr .* Zr
+            Ti = Zi .* Zi
         end
         t = Tr .+ Ti
+        # Every lane has escaped, so every bit would be cleared: stop early.
         all(x-> x > 4f0, t) && (return 0x00)
     end
+
     byte = 0xff
-    t[1] <= 4.0 || (byte &= 0b01111111)
-    t[2] <= 4.0 || (byte &= 0b10111111)
-    t[3] <= 4.0 || (byte &= 0b11011111)
-    t[4] <= 4.0 || (byte &= 0b11101111)
-    t[5] <= 4.0 || (byte &= 0b11110111)
-    t[6] <= 4.0 || (byte &= 0b11111011)
-    t[7] <= 4.0 || (byte &= 0b11111101)
-    t[8] <= 4.0 || (byte &= 0b11111110)
+    for i=1:8
+        t[i] <= 4f0 || (byte &= masks[i])
+    end
     return byte
 end
 
+# Fill row `y` of the packed bitmap `rows`: each group of eight consecutive
+# entries of `xvals` becomes one byte computed by mand8.
+# N must be a multiple of 8, because bounds checks are disabled by @inbounds.
 function mandel_inner(rows, ci, y, N, xvals)
-    @simd for x in 1:8:N
-        @inbounds begin
-            cr = ntuple(i-> xvals[x + (i - 1)], 8)
-            rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
-        end
+    @inbounds for x=1:8:N
+        cr = ntuple(i-> xvals[x + i - 1], 8)
+        rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
     end
 end
 
-function mandelbrot(n = 200)
+# Compute the n-by-n image and write it to `io` as a "P4" header followed by
+# the packed bytes. Throws ArgumentError unless n is a multiple of 8: other
+# sizes would make mandel_inner read past the end of xvals under @inbounds.
+function mandelbrot(io, n = 200)
+    n % 8 == 0 || throw(ArgumentError("n must be a multiple of 8, got $n"))
     inv_ = 2.0 / n
-    N = n
-    xvals = zeros(Float32, n)
-    yvals = zeros(Float32, n)
-    Threads.@threads for i in 0:(N-1)
-        @inbounds xvals[i + 1] = i * inv_ - 1.5
-        @inbounds yvals[i + 1] = i * inv_ - 1.0
+    # Real parts span [-1.5, 0.5); imaginary parts span [-1.0, 1.0).
+    xvals = Vector{Float32}(undef, n)
+    yvals = Vector{Float32}(undef, n)
+    @inbounds for i in 0:(n-1)
+        xvals[i + 1] = i * inv_ - 1.5
+        yvals[i + 1] = i * inv_ - 1.0
     end
-    rows = zeros(UInt8, n*N÷8)
-    Threads.@threads for y in 1:N
+
+    # One bit per pixel, eight pixels per byte.
+    rows = Vector{UInt8}(undef, n^2 ÷ 8)
+    @sync for y=1:n
         @inbounds ci = yvals[y]
-        mandel_inner(rows, ci, y, N, xvals)
+        # This allows dynamic scheduling instead of static scheduling
+        # of Threads.@threads macro. See
+        # https://github.com/JuliaLang/julia/issues/21017 . On some
+        # computers this is faster, on others not.
+        Threads.@spawn mandel_inner(rows, ci, y, n, xvals)
     end
-    write(stdout, "P4\n$n $n\n")
-    write(stdout, rows)
+    write(io, "P4\n$n $n\n")
+    write(io, rows)
 end
 
-mandelbrot(parse(Int, ARGS[1]))
+isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))
diff --git a/mandelbrot/mandelbrot-fast.v2.jl b/mandelbrot/mandelbrot-fast.v2.jl
@@ -0,0 +1,78 @@
+#=
+The Computer Language Benchmarks Game
+ https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+ direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
+ https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
+
+ modified for Julia 1.0 by Simon Danisch.
+ tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
+=#
+# Eight Float32 zeros: the starting value for every 8-wide tuple in mand8.
+const zerov8 = ntuple(x-> 0f0, 8)
+# masks[i] clears the bit that belongs to lane i (lane 1 is the highest bit).
+const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
+               0b11111011, 0b11111101, 0b11111110)
+
+# Calculate mandelbrot set for one Vec8 into one byte
+# `cr` holds eight real parts; `ci` is broadcast across all eight lanes.
+# Iterates z -> z^2 + c at most 50 times, testing for escape after every
+# fifth step. Bit i of the result (counted from the highest bit) stays set
+# only if lane i's final |z|^2 is <= 4.
+Base.@propagate_inbounds function mand8(cr, ci)
+    Zr = Zi = Tr = Ti = t = zerov8
+
+    for _=1:10
+        for _=1:5
+            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zr = Tr .- Ti .+ cr
+            Tr = Zr .* Zr
+            Ti = Zi .* Zi
+        end
+        t = Tr .+ Ti
+        # Every lane has escaped, so every bit would be cleared: stop early.
+        all(x-> x > 4f0, t) && (return 0x00)
+    end
+
+    byte = 0xff
+    for i=1:8
+        t[i] <= 4f0 || (byte &= masks[i])
+    end
+    return byte
+end
+
+# Fill row `y` of the packed bitmap `rows`: each group of eight consecutive
+# entries of `xvals` becomes one byte computed by mand8.
+# N must be a multiple of 8, because bounds checks are disabled by @inbounds.
+function mandel_inner(rows, ci, y, N, xvals)
+    @inbounds for x=1:8:N
+        cr = ntuple(i-> xvals[x + i - 1], 8)
+        rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
+    end
+end
+
+# Compute the n-by-n image and write it to `io` as a "P4" header followed by
+# the packed bytes. Throws ArgumentError unless n is a multiple of 8: other
+# sizes would make mandel_inner read past the end of xvals under @inbounds.
+function mandelbrot(io, n = 200)
+    n % 8 == 0 || throw(ArgumentError("n must be a multiple of 8, got $n"))
+    inv_ = 2.0 / n
+    # Real parts span [-1.5, 0.5); imaginary parts span [-1.0, 1.0).
+    xvals = Vector{Float32}(undef, n)
+    yvals = Vector{Float32}(undef, n)
+    @inbounds for i in 0:(n-1)
+        xvals[i + 1] = i * inv_ - 1.5
+        yvals[i + 1] = i * inv_ - 1.0
+    end
+
+    # One bit per pixel, eight pixels per byte.
+    rows = Vector{UInt8}(undef, n^2 ÷ 8)
+    Threads.@threads for y=1:n
+        @inbounds ci = yvals[y]
+        mandel_inner(rows, ci, y, n, xvals)
+    end
+    write(io, "P4\n$n $n\n")
+    write(io, rows)
+end
+
+isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))
diff --git a/mandelbrot/mandelbrot-fast.v3.jl b/mandelbrot/mandelbrot-fast.v3.jl
@@ -0,0 +1,80 @@
+#=
+The Computer Language Benchmarks Game
+ https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+ direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
+ https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
+
+ modified for Julia 1.0 by Simon Danisch.
+ tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
+=#
+using KissThreading
+
+# Eight Float32 zeros: the starting value for every 8-wide tuple in mand8.
+const zerov8 = ntuple(x-> 0f0, 8)
+# masks[i] clears the bit that belongs to lane i (lane 1 is the highest bit).
+const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
+               0b11111011, 0b11111101, 0b11111110)
+
+# Calculate mandelbrot set for one Vec8 into one byte
+# `cr` holds eight real parts; `ci` is broadcast across all eight lanes.
+# Iterates z -> z^2 + c at most 50 times, testing for escape after every
+# fifth step. Bit i of the result (counted from the highest bit) stays set
+# only if lane i's final |z|^2 is <= 4.
+Base.@propagate_inbounds function mand8(cr, ci)
+    Zr = Zi = Tr = Ti = t = zerov8
+
+    for _=1:10
+        for _=1:5
+            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zr = Tr .- Ti .+ cr
+            Tr = Zr .* Zr
+            Ti = Zi .* Zi
+        end
+        t = Tr .+ Ti
+        # Every lane has escaped, so every bit would be cleared: stop early.
+        all(x-> x > 4f0, t) && (return 0x00)
+    end
+
+    byte = 0xff
+    for i=1:8
+        t[i] <= 4f0 || (byte &= masks[i])
+    end
+    return byte
+end
+
+# Fill row `y` of the packed bitmap `rows`: each group of eight consecutive
+# entries of `xvals` becomes one byte computed by mand8.
+# N must be a multiple of 8, because bounds checks are disabled by @inbounds.
+function mandel_inner(rows, ci, y, N, xvals)
+    @inbounds for x=1:8:N
+        cr = ntuple(i-> xvals[x + i - 1], 8)
+        rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
+    end
+end
+
+# Compute the n-by-n image and write it to `io` as a "P4" header followed by
+# the packed bytes. Throws ArgumentError unless n is a multiple of 8: other
+# sizes would make mandel_inner read past the end of xvals under @inbounds.
+function mandelbrot(io, n = 200)
+    n % 8 == 0 || throw(ArgumentError("n must be a multiple of 8, got $n"))
+    inv_ = 2.0 / n
+    # Real parts span [-1.5, 0.5); imaginary parts span [-1.0, 1.0).
+    xvals = Vector{Float32}(undef, n)
+    yvals = Vector{Float32}(undef, n)
+    @inbounds for i in 0:(n-1)
+        xvals[i + 1] = i * inv_ - 1.5
+        yvals[i + 1] = i * inv_ - 1.0
+    end
+
+    # One bit per pixel, eight pixels per byte.
+    rows = Vector{UInt8}(undef, n^2 ÷ 8)
+    # f returns nothing; the Vector{Nothing} is presumably just tmap!'s output buffer.
+    f(y) = @inbounds mandel_inner(rows, yvals[y], y, n, xvals)
+    tmap!(f, Vector{Nothing}(undef, n), collect(1:n); batch_size=8)
+
+    write(io, "P4\n$n $n\n")
+    write(io, rows)
+end
+
+isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))