Skip to content

Commit 0fa8313

Browse files
committed
Tweak mandelbrot-fast.jl for performance.
Multiple versions with different threading strategies are included because which version is fastest depends on the machine. Depending on the machine, gains can be over 20% compared to the original mandelbrot-fast.jl.

NOTE: running mandelbrot-fast.v3.jl requires installing https://github.com/mohamed82008/KissThreading.jl

Changes included in every version:
- Remove threading from filling xvals and yvals; the threading overhead is too high for such a simple operation.
- Remove the @simd annotation from mandel_inner; SIMD already occurs at the level of mand8, so @simd doesn't hurt runtime but increases compilation time.
- Only run mandelbrot when !isinteractive() to make development and debugging easier.
- Various tweaks and minor stylistic updates for succinctness and maybe a marginal increase in performance.
1 parent 0329d60 commit 0fa8313

File tree

3 files changed

+164
-44
lines changed

3 files changed

+164
-44
lines changed

mandelbrot/mandelbrot-fast.jl

Lines changed: 37 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,67 @@
11
#=
22
The Computer Language Benchmarks Game
33
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
45
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
56
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
6-
modified for Julia 1.0 by Simon Danisch
7+
8+
modified for Julia 1.0 by Simon Danisch.
9+
tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
710
=#
811
const zerov8 = ntuple(x-> 0f0, 8)
9-
10-
@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci)
11-
Zi = 2f0 .* Zr .* Zi .+ ci
12-
Zr = Tr .- Ti .+ cr
13-
Tr = Zr .* Zr
14-
Ti = Zi .* Zi
15-
return Zr,Zi,Tr,Ti
16-
end
12+
const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
13+
0b11111011, 0b11111101, 0b11111110)
1714

1815
# Calculate mandelbrot set for one Vec8 into one byte
1916
Base.@propagate_inbounds function mand8(cr, ci)
20-
Zr = zerov8
21-
Zi = zerov8
22-
Tr = zerov8
23-
Ti = zerov8
24-
t = zerov8
17+
Zr = Zi = Tr = Ti = t = zerov8
2518
i = 0
2619

27-
while i<50
28-
for _ in 1:5
29-
Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci)
30-
i += 1
20+
for _=1:10
21+
for _=1:5
22+
Zi = 2f0 .* Zr .* Zi .+ ci
23+
Zr = Tr .- Ti .+ cr
24+
Tr = Zr .* Zr
25+
Ti = Zi .* Zi
3126
end
3227
t = Tr .+ Ti
3328
all(x-> x > 4f0, t) && (return 0x00)
3429
end
30+
3531
byte = 0xff
36-
t[1] <= 4.0 || (byte &= 0b01111111)
37-
t[2] <= 4.0 || (byte &= 0b10111111)
38-
t[3] <= 4.0 || (byte &= 0b11011111)
39-
t[4] <= 4.0 || (byte &= 0b11101111)
40-
t[5] <= 4.0 || (byte &= 0b11110111)
41-
t[6] <= 4.0 || (byte &= 0b11111011)
42-
t[7] <= 4.0 || (byte &= 0b11111101)
43-
t[8] <= 4.0 || (byte &= 0b11111110)
32+
for i=1:8
33+
t[i] <= 4.0 || (byte &= masks[i])
34+
end
4435
return byte
4536
end
4637

4738
function mandel_inner(rows, ci, y, N, xvals)
48-
@simd for x in 1:8:N
49-
@inbounds begin
50-
cr = ntuple(i-> xvals[x + (i - 1)], 8)
51-
rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
52-
end
39+
@inbounds for x=1:8:N
40+
cr = ntuple(i-> xvals[x + i - 1], 8)
41+
rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
5342
end
5443
end
5544

56-
function mandelbrot(n = 200)
45+
function mandelbrot(io, n = 200)
5746
inv_ = 2.0 / n
58-
N = n
59-
xvals = zeros(Float32, n)
60-
yvals = zeros(Float32, n)
61-
Threads.@threads for i in 0:(N-1)
62-
@inbounds xvals[i + 1] = i * inv_ - 1.5
63-
@inbounds yvals[i + 1] = i * inv_ - 1.0
47+
xvals = Vector{Float32}(undef, n)
48+
yvals = Vector{Float32}(undef, n)
49+
@inbounds for i in 0:(n-1)
50+
xvals[i + 1] = i * inv_ - 1.5
51+
yvals[i + 1] = i * inv_ - 1.0
6452
end
65-
rows = zeros(UInt8, n*N÷8)
66-
Threads.@threads for y in 1:N
53+
54+
rows = Vector{UInt8}(undef, n^2 ÷ 8)
55+
@sync for y=1:n
6756
@inbounds ci = yvals[y]
68-
mandel_inner(rows, ci, y, N, xvals)
57+
# This allows dynamic scheduling instead of static scheduling
58+
# of Threads.@threads macro. See
59+
# https://github.com/JuliaLang/julia/issues/21017 . On some
60+
# computers this is faster, on others not.
61+
Threads.@spawn mandel_inner(rows, ci, y, n, xvals)
6962
end
70-
write(stdout, "P4\n$n $n\n")
71-
write(stdout, rows)
63+
write(io, "P4\n$n $n\n")
64+
write(io, rows)
7265
end
7366

74-
mandelbrot(parse(Int, ARGS[1]))
67+
isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))

mandelbrot/mandelbrot-fast.v2.jl

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#=
2+
The Computer Language Benchmarks Game
3+
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
5+
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
6+
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
7+
8+
modified for Julia 1.0 by Simon Danisch.
9+
tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
10+
=#
11+
const zerov8 = ntuple(x-> 0f0, 8)
12+
const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
13+
0b11111011, 0b11111101, 0b11111110)
14+
15+
# Calculate mandelbrot set for one Vec8 into one byte
16+
"""
    mand8(cr, ci)

Iterate `z -> z^2 + c` for eight points at once and pack the outcome into one byte.

`cr` holds the eight real parts of `c` (indexed `1` through `8`); `ci` is the imaginary
part shared by all eight lanes. Up to 50 iterations are run as 10 rounds of 5 steps, and
the squared magnitude `Tr + Ti` is compared against 4 after each round.

Returns `0x00` as soon as every lane exceeds 4 at a check. Otherwise returns a byte
in which lane `k` keeps its bit set (most significant bit for `k == 1`) when its squared
magnitude after the final round is `<= 4`.
"""
Base.@propagate_inbounds function mand8(cr, ci)
    Zr = Zi = Tr = Ti = t = zerov8

    for _ in 1:10
        # Escape is tested once per 5 steps rather than after every step.
        for _ in 1:5
            # Zi is updated first because it needs Zr from the previous step.
            Zi = 2f0 .* Zr .* Zi .+ ci
            Zr = Tr .- Ti .+ cr
            Tr = Zr .* Zr
            Ti = Zi .* Zi
        end
        t = Tr .+ Ti
        all(x -> x > 4f0, t) && return 0x00
    end

    byte = 0xff
    for k in 1:8
        # masks[k] clears the single bit that belongs to lane k.
        t[k] <= 4f0 || (byte &= masks[k])
    end
    return byte
end
37+
38+
"""
    mandel_inner(rows, ci, y, N, xvals)

Fill the bytes of image row `y` inside `rows`.

Walks `xvals` in groups of eight starting at indices 1, 9, 17, ... up to `N`, passes each
group together with `ci` to `mand8`, and stores the returned byte in `rows`. The bytes of
row `y` begin right after offset `(y - 1) * N ÷ 8`. All indexing happens under
`@inbounds`, so no bounds checks are performed here. Returns `nothing`.
"""
function mandel_inner(rows, ci, y, N, xvals)
    rowstart = (y - 1) * N ÷ 8
    @inbounds for (k, first_x) in enumerate(1:8:N)
        lanes = ntuple(j -> xvals[first_x + j - 1], 8)
        rows[rowstart + k] = mand8(lanes, ci)
    end
    return nothing
end
44+
45+
"""
    mandelbrot(io, n = 200)

Compute an `n`-by-`n` Mandelbrot bitmap and write it to `io` as a `P4` header line
followed by the packed bytes (one bit per pixel, eight pixels per byte).

The real axis samples are `i * 2 / n - 1.5` and the imaginary axis samples are
`i * 2 / n - 1.0` for `i` in `0:n-1`, stored as `Float32`. Rows are computed in parallel
with `Threads.@threads`; each row writes to its own range of `rows`.

Returns the value of the final `write(io, rows)` call.

# Throws
- `ArgumentError` if `n` is negative or not a multiple of 8. `mandel_inner` processes
  pixels in groups of eight under `@inbounds`, so any other size would read and write
  out of bounds.
"""
function mandelbrot(io, n = 200)
    (n >= 0 && n % 8 == 0) ||
        throw(ArgumentError("n must be a non-negative multiple of 8, got $n"))

    inv_ = 2.0 / n
    xvals = Vector{Float32}(undef, n)
    yvals = Vector{Float32}(undef, n)
    # Filled serially: the loop is too cheap to be worth threading.
    @inbounds for i in 0:(n-1)
        xvals[i + 1] = i * inv_ - 1.5
        yvals[i + 1] = i * inv_ - 1.0
    end

    # Every byte is overwritten by mandel_inner, so no zero-initialisation is needed.
    rows = Vector{UInt8}(undef, n^2 ÷ 8)
    Threads.@threads for y in 1:n
        @inbounds ci = yvals[y]
        mandel_inner(rows, ci, y, n, xvals)
    end
    write(io, "P4\n$n $n\n")
    return write(io, rows)
end
62+
63+
isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))

mandelbrot/mandelbrot-fast.v3.jl

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#=
2+
The Computer Language Benchmarks Game
3+
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
5+
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
6+
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
7+
8+
modified for Julia 1.0 by Simon Danisch.
9+
tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
10+
=#
11+
using KissThreading
12+
13+
const zerov8 = ntuple(x-> 0f0, 8)
14+
const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
15+
0b11111011, 0b11111101, 0b11111110)
16+
17+
# Calculate mandelbrot set for one Vec8 into one byte
18+
"""
    mand8(cr, ci)

Iterate `z -> z^2 + c` for eight points at once and pack the outcome into one byte.

`cr` holds the eight real parts of `c` (indexed `1` through `8`); `ci` is the imaginary
part shared by all eight lanes. Up to 50 iterations are run as 10 rounds of 5 steps, and
the squared magnitude `Tr + Ti` is compared against 4 after each round.

Returns `0x00` as soon as every lane exceeds 4 at a check. Otherwise returns a byte
in which lane `k` keeps its bit set (most significant bit for `k == 1`) when its squared
magnitude after the final round is `<= 4`.
"""
Base.@propagate_inbounds function mand8(cr, ci)
    Zr = Zi = Tr = Ti = t = zerov8

    for _ in 1:10
        # Escape is tested once per 5 steps rather than after every step.
        for _ in 1:5
            # Zi is updated first because it needs Zr from the previous step.
            Zi = 2f0 .* Zr .* Zi .+ ci
            Zr = Tr .- Ti .+ cr
            Tr = Zr .* Zr
            Ti = Zi .* Zi
        end
        t = Tr .+ Ti
        all(x -> x > 4f0, t) && return 0x00
    end

    byte = 0xff
    for k in 1:8
        # masks[k] clears the single bit that belongs to lane k.
        t[k] <= 4f0 || (byte &= masks[k])
    end
    return byte
end
39+
40+
"""
    mandel_inner(rows, ci, y, N, xvals)

Fill the bytes of image row `y` inside `rows`.

Walks `xvals` in groups of eight starting at indices 1, 9, 17, ... up to `N`, passes each
group together with `ci` to `mand8`, and stores the returned byte in `rows`. The bytes of
row `y` begin right after offset `(y - 1) * N ÷ 8`. All indexing happens under
`@inbounds`, so no bounds checks are performed here. Returns `nothing`.
"""
function mandel_inner(rows, ci, y, N, xvals)
    rowstart = (y - 1) * N ÷ 8
    @inbounds for (k, first_x) in enumerate(1:8:N)
        lanes = ntuple(j -> xvals[first_x + j - 1], 8)
        rows[rowstart + k] = mand8(lanes, ci)
    end
    return nothing
end
46+
47+
"""
    mandelbrot(io, n = 200)

Compute an `n`-by-`n` Mandelbrot bitmap and write it to `io` as a `P4` header line
followed by the packed bytes (one bit per pixel, eight pixels per byte).

The real axis samples are `i * 2 / n - 1.5` and the imaginary axis samples are
`i * 2 / n - 1.0` for `i` in `0:n-1`, stored as `Float32`. Rows are dispatched through
KissThreading's `tmap!` with `batch_size=8`.

Returns the value of the final `write(io, rows)` call.

# Throws
- `ArgumentError` if `n` is negative or not a multiple of 8. `mandel_inner` processes
  pixels in groups of eight under `@inbounds`, so any other size would read and write
  out of bounds.
"""
function mandelbrot(io, n = 200)
    (n >= 0 && n % 8 == 0) ||
        throw(ArgumentError("n must be a non-negative multiple of 8, got $n"))

    inv_ = 2.0 / n
    xvals = Vector{Float32}(undef, n)
    yvals = Vector{Float32}(undef, n)
    # Filled serially: the loop is too cheap to be worth threading.
    @inbounds for i in 0:(n-1)
        xvals[i + 1] = i * inv_ - 1.5
        yvals[i + 1] = i * inv_ - 1.0
    end

    # Every byte is overwritten by mandel_inner, so no zero-initialisation is needed.
    rows = Vector{UInt8}(undef, n^2 ÷ 8)
    f(y) = @inbounds mandel_inner(rows, yvals[y], y, n, xvals)
    # NOTE(review): tmap! comes from KissThreading; presumably a threaded map! that stores
    # f's (nothing) results in the first array -- confirm against the package docs. The
    # useful output is the side effect on `rows`.
    tmap!(f, Vector{Nothing}(undef, n), collect(1:n); batch_size=8)

    write(io, "P4\n$n $n\n")
    return write(io, rows)
end
63+
64+
isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))

0 commit comments

Comments
 (0)