@@ -238,6 +238,46 @@ perf(128, 8, 128, 32)
238238# 3.166 ms (81 allocations: 9.13 MiB)
239239# 16.082 ms (1049 allocations: 20.58 MiB)
240240
241+ # # Threadripper, NNlib v0.8.12
242+ # tullio
243+ # 5.658 ms (77 allocations: 7.25 MiB)
244+ # 22.373 ms (1124 allocations: 16.71 MiB)
245+ # nalib
246+ # 6.187 ms (89 allocations: 7.75 MiB)
247+ # 23.723 ms (604 allocations: 14.70 MiB)
248+ # nnlib
249+ # 6.473 ms (87 allocations: 9.25 MiB)
250+ # 24.966 ms (1055 allocations: 20.71 MiB)
251+ # tullio - gpu
252+ # 145.332 μs (520 allocations: 24.52 KiB)
253+ # 902.020 μs (2221 allocations: 117.19 KiB)
254+ # nalib - gpu
255+ # 162.354 μs (410 allocations: 18.03 KiB)
256+ # 604.111 μs (1263 allocations: 71.78 KiB)
257+ # nnlib - gpu
258+ # 156.383 μs (440 allocations: 20.00 KiB)
259+ # 835.374 μs (1969 allocations: 100.58 KiB)
260+
261+ # # Threadripper, NNlib v0.8.13 (fast_maximum)
262+ # tullio
263+ # 4.599 ms (71 allocations: 7.13 MiB)
264+ # 20.699 ms (1118 allocations: 16.59 MiB)
265+ # nalib
266+ # 5.049 ms (84 allocations: 7.63 MiB)
267+ # 22.252 ms (599 allocations: 14.57 MiB)
268+ # nnlib
269+ # 5.378 ms (81 allocations: 9.13 MiB)
270+ # 23.453 ms (1049 allocations: 20.58 MiB)
271+ # tullio - gpu
272+ # 145.824 μs (520 allocations: 24.52 KiB)
273+ # 915.305 μs (2221 allocations: 117.19 KiB)
274+ # nalib - gpu
275+ # 164.789 μs (410 allocations: 18.03 KiB)
276+ # 610.835 μs (1263 allocations: 71.78 KiB)
277+ # nnlib - gpu
278+ # 157.785 μs (440 allocations: 20.00 KiB)
279+ # 852.087 μs (1969 allocations: 100.58 KiB)
280+
241281
242282# function prof()
243283 # dim, len, batch_size, nheads = 128, 8, 128, 32;
0 commit comments