Skip to content

Commit d1be6d4

Browse files
committed
Tweak mandelbrot-fast.jl for performance.
Multiple versions with different threading strategies are included because which one is fastest depends on the machine (mandelbrot-fast.jl uses Threads.@spawn, mandelbrot-fast.v2.jl uses Threads.@threads, and mandelbrot-fast.v3.jl uses KissThreading's tmap!). Depending on the machine, gains can be over 20% compared to the original mandelbrot-fast.jl.

NOTE: running mandelbrot-fast.v3.jl requires installing https://github.com/mohamed82008/KissThreading.jl

Changes included in every version:
- Remove threading from filling xvals and yvals; the threading overhead is too high for such a simple operation.
- Remove the @simd annotation from mandel_inner; SIMD occurs at the level of mand8, and @simd doesn't hurt runtime but increases compilation time.
- Only run mandelbrot when !isinteractive(), to make development and debugging easier.
- Various tweaks and minor stylistic updates for succinctness and perhaps a marginal increase in performance.
1 parent 0329d60 commit d1be6d4

File tree

3 files changed

+171
-39
lines changed

3 files changed

+171
-39
lines changed

mandelbrot/mandelbrot-fast.jl

Lines changed: 36 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,16 @@
11
#=
22
The Computer Language Benchmarks Game
33
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
45
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
56
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
6-
modified for Julia 1.0 by Simon Danisch
7+
8+
modified for Julia 1.0 by Simon Danisch.
9+
tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
710
=#
811
const zerov8 = ntuple(x-> 0f0, 8)
9-
10-
@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci)
11-
Zi = 2f0 .* Zr .* Zi .+ ci
12-
Zr = Tr .- Ti .+ cr
13-
Tr = Zr .* Zr
14-
Ti = Zi .* Zi
15-
return Zr,Zi,Tr,Ti
16-
end
12+
const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
13+
0b11111011, 0b11111101, 0b11111110)
1714

1815
# Calculate mandelbrot set for one Vec8 into one byte
1916
Base.@propagate_inbounds function mand8(cr, ci)
@@ -24,51 +21,51 @@ Base.@propagate_inbounds function mand8(cr, ci)
2421
t = zerov8
2522
i = 0
2623

27-
while i<50
28-
for _ in 1:5
29-
Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci)
30-
i += 1
24+
for _=1:10
25+
for _=1:5
26+
Zi = 2f0 .* Zr .* Zi .+ ci
27+
Zr = Tr .- Ti .+ cr
28+
Tr = Zr .* Zr
29+
Ti = Zi .* Zi
3130
end
3231
t = Tr .+ Ti
3332
all(x-> x > 4f0, t) && (return 0x00)
3433
end
34+
3535
byte = 0xff
36-
t[1] <= 4.0 || (byte &= 0b01111111)
37-
t[2] <= 4.0 || (byte &= 0b10111111)
38-
t[3] <= 4.0 || (byte &= 0b11011111)
39-
t[4] <= 4.0 || (byte &= 0b11101111)
40-
t[5] <= 4.0 || (byte &= 0b11110111)
41-
t[6] <= 4.0 || (byte &= 0b11111011)
42-
t[7] <= 4.0 || (byte &= 0b11111101)
43-
t[8] <= 4.0 || (byte &= 0b11111110)
36+
for i=1:8
37+
t[i] <= 4.0 || (byte &= masks[i])
38+
end
4439
return byte
4540
end
4641

4742
function mandel_inner(rows, ci, y, N, xvals)
48-
@simd for x in 1:8:N
49-
@inbounds begin
50-
cr = ntuple(i-> xvals[x + (i - 1)], 8)
51-
rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
52-
end
43+
@inbounds for x=1:8:N
44+
cr = ntuple(i-> xvals[x + i - 1], 8)
45+
rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
5346
end
5447
end
5548

56-
function mandelbrot(n = 200)
49+
function mandelbrot(io, n = 200)
5750
inv_ = 2.0 / n
58-
N = n
59-
xvals = zeros(Float32, n)
60-
yvals = zeros(Float32, n)
61-
Threads.@threads for i in 0:(N-1)
62-
@inbounds xvals[i + 1] = i * inv_ - 1.5
63-
@inbounds yvals[i + 1] = i * inv_ - 1.0
51+
xvals = Vector{Float32}(undef, n)
52+
yvals = Vector{Float32}(undef, n)
53+
@inbounds for i in 0:(n-1)
54+
xvals[i + 1] = i * inv_ - 1.5
55+
yvals[i + 1] = i * inv_ - 1.0
6456
end
65-
rows = zeros(UInt8, n*N÷8)
66-
Threads.@threads for y in 1:N
57+
58+
rows = Vector{UInt8}(undef, n^2 ÷ 8)
59+
@sync for y=1:n
6760
@inbounds ci = yvals[y]
68-
mandel_inner(rows, ci, y, N, xvals)
61+
# This allows dynamic scheduling instead of static scheduling
62+
# of Threads.@threads macro. See
63+
# https://github.com/JuliaLang/julia/issues/21017 . On some
64+
# computers this is faster, on others not.
65+
Threads.@spawn mandel_inner(rows, ci, y, n, xvals)
6966
end
70-
write(stdout, "P4\n$n $n\n")
71-
write(stdout, rows)
67+
write(io, "P4\n$n $n\n")
68+
write(io, rows)
7269
end
7370

74-
mandelbrot(parse(Int, ARGS[1]))
71+
isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))

mandelbrot/mandelbrot-fast.v2.jl

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#=
2+
The Computer Language Benchmarks Game
3+
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
5+
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
6+
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
7+
8+
modified for Julia 1.0 by Simon Danisch.
9+
tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
10+
=#
11+
# Eight Float32 zeros: the starting value of every 8-lane tuple in mand8.
const zerov8 = ntuple(x-> 0f0, 8)
12+
# masks[i] has every bit set except bit (8 - i), so `byte &= masks[i]` clears
# the bit belonging to lane i; lane 1 maps to the most significant bit.
const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
13+
0b11111011, 0b11111101, 0b11111110)
14+
15+
# Calculate mandelbrot set for one Vec8 into one byte
16+
# Iterate z -> z^2 + c for eight points at once and pack the result into one
# byte.
#
# cr : 8-tuple with the real part of c for each lane
# ci : imaginary part of c, shared by all eight lanes
#
# Returns a UInt8 in which the bit for lane i (lane 1 = most significant bit)
# is set when |z|^2 <= 4 still holds after 50 iterations.
Base.@propagate_inbounds function mand8(cr, ci)
    Zr = zerov8
    Zi = zerov8
    Tr = zerov8   # Zr .^ 2, carried over to the next step
    Ti = zerov8   # Zi .^ 2, carried over to the next step
    t = zerov8

    # 10 rounds of 5 steps = 50 iterations; escape is only tested once per
    # round so the inner loop stays branch-free.
    for _ in 1:10
        for _ in 1:5
            Zi = 2f0 .* Zr .* Zi .+ ci
            Zr = Tr .- Ti .+ cr
            Tr = Zr .* Zr
            Ti = Zi .* Zi
        end
        t = Tr .+ Ti
        # Every lane has escaped: the whole byte is zero, stop early.
        all(x -> x > 4f0, t) && return 0x00
    end

    byte = 0xff
    for i in 1:8
        # Written as `<= ... ||` so that a lane which overflowed to Inf or NaN
        # fails the comparison and has its bit cleared.
        t[i] <= 4f0 || (byte &= masks[i])
    end
    return byte
end
41+
42+
# Render image row `y` into the packed bitmap `rows`.
#
# rows  : byte vector holding the image, cld(N, 8) bytes per row
# ci    : imaginary coordinate shared by every pixel of the row
# y     : 1-based row number
# N     : image width in pixels
# xvals : real coordinates of the N columns
#
# Widths that are not a multiple of 8 are supported: the last byte of the row
# is computed from the remaining pixels and its unused low bits are zeroed.
function mandel_inner(rows, ci, y, N, xvals)
    stride = cld(N, 8)          # bytes per row, padded to a whole byte
    base = (y - 1) * stride     # offset of this row inside `rows`
    nfull = N ÷ 8               # bytes made of eight real pixels

    @inbounds for x in 1:8:(8 * nfull)
        cr = ntuple(i -> xvals[x + i - 1], 8)
        rows[base + (x - 1) ÷ 8 + 1] = mand8(cr, ci)
    end

    tail = N - 8 * nfull        # pixels left over for a partial last byte
    if tail > 0
        # Pad the lanes past the image edge with the last column so no read
        # goes beyond xvals, then drop the padded lanes from the byte. This
        # path runs once per row, so it stays bounds-checked.
        cr = ntuple(i -> xvals[min(8 * nfull + i, N)], 8)
        rows[base + stride] = mand8(cr, ci) & (0xff << (8 - tail))
    end
    return nothing
end

# Compute an n-by-n Mandelbrot bitmap and write it to `io` as a binary PBM
# (P4) image.
#
# io : destination stream
# n  : image width and height in pixels (default 200)
#
# The real axis is sampled on [-1.5, 0.5) and the imaginary axis on
# [-1.0, 1.0). Returns the value of the final `write` call.
function mandelbrot(io, n = 200)
    inv_ = 2.0 / n
    xvals = Vector{Float32}(undef, n)
    yvals = Vector{Float32}(undef, n)
    @inbounds for i in 0:(n-1)
        xvals[i + 1] = i * inv_ - 1.5
        yvals[i + 1] = i * inv_ - 1.0
    end

    # P4 rows are padded to a whole byte, so each row takes cld(n, 8) bytes.
    rows = Vector{UInt8}(undef, n * cld(n, 8))
    # Each row writes only its own slice of `rows`, so rows can be processed
    # in parallel without synchronization.
    Threads.@threads for y in 1:n
        @inbounds ci = yvals[y]
        mandel_inner(rows, ci, y, n, xvals)
    end
    write(io, "P4\n$n $n\n")
    return write(io, rows)
end
66+
67+
# Script entry point. Skipped in interactive sessions so the file can be
# include()d while developing. Without a size argument the default size of
# mandelbrot is used instead of failing on ARGS[1].
if !isinteractive()
    isempty(ARGS) ? mandelbrot(stdout) : mandelbrot(stdout, parse(Int, ARGS[1]))
end

mandelbrot/mandelbrot-fast.v3.jl

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#=
2+
The Computer Language Benchmarks Game
3+
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
5+
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
6+
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
7+
8+
modified for Julia 1.0 by Simon Danisch.
9+
tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
10+
=#
11+
using KissThreading
12+
13+
# Eight Float32 zeros: the starting value of every 8-lane tuple in mand8.
const zerov8 = ntuple(x-> 0f0, 8)
14+
# masks[i] has every bit set except bit (8 - i), so `byte &= masks[i]` clears
# the bit belonging to lane i; lane 1 maps to the most significant bit.
const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
15+
0b11111011, 0b11111101, 0b11111110)
16+
17+
# Calculate mandelbrot set for one Vec8 into one byte
18+
# Iterate z -> z^2 + c for eight points at once and pack the result into one
# byte.
#
# cr : 8-tuple with the real part of c for each lane
# ci : imaginary part of c, shared by all eight lanes
#
# Returns a UInt8 in which the bit for lane i (lane 1 = most significant bit)
# is set when |z|^2 <= 4 still holds after 50 iterations.
Base.@propagate_inbounds function mand8(cr, ci)
    Zr = zerov8
    Zi = zerov8
    Tr = zerov8   # Zr .^ 2, carried over to the next step
    Ti = zerov8   # Zi .^ 2, carried over to the next step
    t = zerov8

    # 10 rounds of 5 steps = 50 iterations; escape is only tested once per
    # round so the inner loop stays branch-free.
    for _ in 1:10
        for _ in 1:5
            Zi = 2f0 .* Zr .* Zi .+ ci
            Zr = Tr .- Ti .+ cr
            Tr = Zr .* Zr
            Ti = Zi .* Zi
        end
        t = Tr .+ Ti
        # Every lane has escaped: the whole byte is zero, stop early.
        all(x -> x > 4f0, t) && return 0x00
    end

    byte = 0xff
    for i in 1:8
        # Written as `<= ... ||` so that a lane which overflowed to Inf or NaN
        # fails the comparison and has its bit cleared.
        t[i] <= 4f0 || (byte &= masks[i])
    end
    return byte
end
43+
44+
# Render image row `y` into the packed bitmap `rows`.
#
# rows  : byte vector holding the image, cld(N, 8) bytes per row
# ci    : imaginary coordinate shared by every pixel of the row
# y     : 1-based row number
# N     : image width in pixels
# xvals : real coordinates of the N columns
#
# Widths that are not a multiple of 8 are supported: the last byte of the row
# is computed from the remaining pixels and its unused low bits are zeroed.
function mandel_inner(rows, ci, y, N, xvals)
    stride = cld(N, 8)          # bytes per row, padded to a whole byte
    base = (y - 1) * stride     # offset of this row inside `rows`
    nfull = N ÷ 8               # bytes made of eight real pixels

    @inbounds for x in 1:8:(8 * nfull)
        cr = ntuple(i -> xvals[x + i - 1], 8)
        rows[base + (x - 1) ÷ 8 + 1] = mand8(cr, ci)
    end

    tail = N - 8 * nfull        # pixels left over for a partial last byte
    if tail > 0
        # Pad the lanes past the image edge with the last column so no read
        # goes beyond xvals, then drop the padded lanes from the byte. This
        # path runs once per row, so it stays bounds-checked.
        cr = ntuple(i -> xvals[min(8 * nfull + i, N)], 8)
        rows[base + stride] = mand8(cr, ci) & (0xff << (8 - tail))
    end
    return nothing
end

# Compute an n-by-n Mandelbrot bitmap and write it to `io` as a binary PBM
# (P4) image.
#
# io : destination stream
# n  : image width and height in pixels (default 200)
#
# The real axis is sampled on [-1.5, 0.5) and the imaginary axis on
# [-1.0, 1.0). Returns the value of the final `write` call.
function mandelbrot(io, n = 200)
    inv_ = 2.0 / n
    xvals = Vector{Float32}(undef, n)
    yvals = Vector{Float32}(undef, n)
    @inbounds for i in 0:(n-1)
        xvals[i + 1] = i * inv_ - 1.5
        yvals[i + 1] = i * inv_ - 1.0
    end

    # P4 rows are padded to a whole byte, so each row takes cld(n, 8) bytes.
    rows = Vector{UInt8}(undef, n * cld(n, 8))
    # Each row writes only its own slice of `rows`, so rows are independent.
    f(y) = @inbounds mandel_inner(rows, yvals[y], y, n, xvals)
    # NOTE(review): tmap! comes from KissThreading; presumably it applies `f`
    # to every row index on multiple threads in batches of 8 and stores the
    # (Nothing) results in the scratch vector -- confirm against its docs.
    tmap!(f, Vector{Nothing}(undef, n), collect(1:n); batch_size=8)

    write(io, "P4\n$n $n\n")
    return write(io, rows)
end
67+
68+
# Script entry point. Skipped in interactive sessions so the file can be
# include()d while developing. Without a size argument the default size of
# mandelbrot is used instead of failing on ARGS[1].
if !isinteractive()
    isempty(ARGS) ? mandelbrot(stdout) : mandelbrot(stdout, parse(Int, ARGS[1]))
end

0 commit comments

Comments
 (0)