@@ -258,7 +258,7 @@ let parallel_matrix_multiply pool a b =
   let k_n = Array.length b in
   let res = Array.make_matrix i_n j_n 0 in

-  Task.parallel_for pool ~chunk_size:chunk_size ~start:0 ~finish:(i_n - 1) ~body:(fun i ->
+  Task.parallel_for pool ~start:0 ~finish:(i_n - 1) ~body:(fun i ->
     for j = 0 to j_n - 1 do
       for k = 0 to k_n - 1 do
         res.(i).(j) <- res.(i).(j) + a.(i).(k) * b.(k).(j)
@@ -282,10 +282,12 @@ discussed earlier, `start` and `finish` as the names suggest are the starting
 and ending values of the loop iterations, `body` contains the actual loop body
 to be executed.

-One parameter that doesn't exist in the sequential version is
-the `chunk_size`. Chunk size determines the granularity of tasks when executing
-on multiple cores. The ideal `chunk_size` depends on a combination
-of factors:
+`parallel_for` also has an optional parameter `chunk_size`. It determines the
+granularity of tasks when executing them on multiple domains. If no
+`chunk_size` is given, a default chunk size is computed, which performs well
+in most cases. It is recommended to experiment with different chunk sizes
+only if the default does not work well. The ideal `chunk_size` depends on a
+combination of factors:

 * **Nature of the loop:** There are two things to consider pertaining to the
 loop while deciding on a `chunk_size` to use, the *number of iterations* in the
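
To make the optional parameter concrete, here is a minimal sketch (not part of the diff) that calls `Task.parallel_for` once with the library-chosen default and once with an explicit `~chunk_size`. The pool size, array length and loop bodies are illustrative assumptions; the `Task` calls mirror the ones used in this chapter.

```ocaml
(* A minimal sketch: parallel_for with and without ~chunk_size.
   num_domains, n and the loop bodies are illustrative assumptions. *)
module Task = Domainslib.Task

let num_domains = 4
let n = 1_000_000

let () =
  let pool = Task.setup_pool ~num_domains:(num_domains - 1) in
  let a = Array.make n 0 in
  (* Default chunking: the library picks a chunk size. *)
  Task.parallel_for pool ~start:0 ~finish:(n - 1)
    ~body:(fun i -> a.(i) <- i * i);
  (* Explicit chunking: iterations are handed out in blocks of n/num_domains. *)
  Task.parallel_for pool ~chunk_size:(n / num_domains) ~start:0 ~finish:(n - 1)
    ~body:(fun i -> a.(i) <- a.(i) + 1);
  Task.teardown_pool pool
```
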
@@ -308,16 +310,16 @@ Let us find how the parallel matrix multiplication scales on multiple cores.

 The speedup vs. cores is enumerated below for input matrices of size 1024x1024.

-| Cores | Time(s) | Speedup     |
-|-------|---------|-------------|
-| 1     | 10.153  | 1           |
-| 2     | 5.166   | 1.965350368 |
-| 4     | 2.65    | 3.831320755 |
-| 8     | 1.35    | 7.520740741 |
-| 12    | 0.957   | 10.6091954  |
-| 16    | 0.742   | 13.68328841 |
-| 20    | 0.634   | 16.01419558 |
-| 24    | 0.655   | 15.50076336 |
+| Cores | Time (s) | Speedup     |
+|-------|----------|-------------|
+| 1     | 9.172    | 1           |
+| 2     | 4.692    | 1.954816709 |
+| 4     | 2.293    | 4           |
+| 8     | 1.196    | 7.668896321 |
+| 12    | 0.854    | 10.74004684 |
+| 16    | 0.76     | 12.06842105 |
+| 20    | 0.66     | 13.8969697  |
+| 24    | 0.587    | 15.62521295 |

 ![matrix-graph](images/matrix_multiplication.png)

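
As an aside, a hypothetical driver along the following lines (not shown in the diff) could be used to collect such timings. It reuses `parallel_matrix_multiply` from the hunk above; the pool size, matrix dimension and the use of `Unix.gettimeofday` are assumptions.

```ocaml
(* A hypothetical timing driver for parallel_matrix_multiply (defined above).
   num_domains, n and the use of Unix.gettimeofday are assumptions. *)
module Task = Domainslib.Task

let num_domains = try int_of_string Sys.argv.(1) with _ -> 4
let n = 1024

let () =
  let pool = Task.setup_pool ~num_domains:(num_domains - 1) in
  (* Two random n x n input matrices. *)
  let a = Array.init n (fun _ -> Array.init n (fun _ -> Random.int 100)) in
  let b = Array.init n (fun _ -> Array.init n (fun _ -> Random.int 100)) in
  let t0 = Unix.gettimeofday () in
  let _res = parallel_matrix_multiply pool a b in
  Printf.printf "time: %.3fs\n" (Unix.gettimeofday () -. t0);
  Task.teardown_pool pool
```
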
@@ -703,7 +705,7 @@ let a = Array.create_float n

 let _ =
   let pool = Task.setup_pool ~num_domains:(num_domains - 1) in
-  Task.parallel_for pool ~chunk_size:(n/num_domains) ~start:0
+  Task.parallel_for pool ~start:0
     ~finish:(n - 1) ~body:(fun i -> Array.set a i (Random.float 1000.));
   Task.teardown_pool pool
 ```
@@ -713,7 +715,7 @@ Let us measure how it scales.
 | Cores | Time (s) |
 |-------|----------|
 | 1     | 3.136    |
-| 2     | 7.648    |
+| 2     | 10.19    |
 | 4     | 11.815   |

 While we had expected to see a speedup when executing on multiple cores, what we see
@@ -751,7 +753,7 @@ let arr = Array.create_float n
 let _ =
   let domains = T.setup_pool ~num_domains:(num_domains - 1) in
   let states = Array.init num_domains (fun _ -> Random.State.make_self_init()) in
-  T.parallel_for domains ~chunk_size:(n/num_domains) ~start:0 ~finish:(n-1)
+  T.parallel_for domains ~start:0 ~finish:(n-1)
     ~body:(fun i ->
       let d = (Domain.self() :> int) mod num_domains in
       Array.unsafe_set arr i (Random.State.float states.(d) 100.))