diff --git a/CHANGELOG.md b/CHANGELOG.md index 26911ba..3648e2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,20 +16,30 @@ This project is currently in early [pre-release], and there may be arbitrary bre ### Added -- `ThreadPool::num_workers` method which returns the current number of workers -- `ThreadPool::on_worker` variant of `with_worker` for `Send` closures. -- `ThreadPool::expect_worker` variant of `with_worker` that panics. +- `Worker::spawn_local` for spawning `!Send` work. +- `Worker::broadcast`, `ThreadPool::broadcast`, and `broadcast` for blocking broadcasts. +- `Worker::spawn_broadcast`, `ThreadPool::spawn_broadcast`, and `spawn_broadcast` for non-blocking broadcasts. +- `ThreadPool::num_members` method which returns the current number of member threads. +- `ThreadPool::get_worker` which looks up the worker if it exists. +- `ThreadPool::enroll` which requests membership and blocks until it is granted. +- `ThreadPool::try_enroll` which requests a membership and returns `None` if none are available. ### Changed + +- `Lease` and `StackJob` have been refactored to improve stack utilization. - Work sharing has been rewritten to improve performance. -- Thread pools can now have a max of 32 workers at a time. +- Thread pools can now have a max of 32 members at a time. +- `ThreadPool::with_worker` now waits for a membership to become available. - `spawn`, `Scope::spawn`, and `Worker::spawn` now accept closures and futures. -- `ThreadPool::with_worker` now provides `Option<&Worker>` instead of `&Worker`. -- `claim_lease` now returns `Option` instead of `Lease`. +- `Lease` is now called `Membership`. - `Scope` now has two lifetimes instead of one, and is more flexible. ### Removed + - All versions of `spawn_future` and `spawn_async`; just use `spawn` instead. +- `claim_lease` has been replaced with `try_enroll`. +- `Worker::occupy` has been replaced with `Membership::activate`.
+- Removed the shuttle testing framework (it's incompatible with crossbeam queues). ## [1.0.0-alpha.4] diff --git a/Cargo.lock b/Cargo.lock index 0c5fddb..30e0c8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,12 +55,6 @@ dependencies = [ "serde", ] -[[package]] -name = "assoc" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdc70193dadb9d7287fa4b633f15f90c876915b31f6af17da307fc59c9859a8" - [[package]] name = "async-channel" version = "2.3.1" @@ -162,18 +156,6 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" -[[package]] -name = "bitvec" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] - [[package]] name = "bumpalo" version = "3.18.1" @@ -499,20 +481,12 @@ dependencies = [ "crossbeam-utils", "dashmap", "divan", - "hotclock", + "futures-lite", + "lazy_static", "rayon", - "shuttle", "st3", - "tracing", - "tracing-subscriber", ] -[[package]] -name = "funty" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" - [[package]] name = "futures-core" version = "0.3.31" @@ -527,9 +501,9 @@ checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" -version = "2.6.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5edaec856126859abb19ed65f39e90fea3a9574b9707f13539acf4abf7eb532" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" dependencies = [ "fastrand", "futures-core", @@ -548,32 +522,7 @@ dependencies = [ "libc", "log", "rustversion", - "windows 0.48.0", -] - -[[package]] -name = "generator" 
-version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d18470a76cb7f8ff746cf1f7470914f900252ec36bbc40b569d74b1258446827" -dependencies = [ - "cc", - "cfg-if", - "libc", - "log", - "rustversion", - "windows 0.61.3", -] - -[[package]] -name = "getrandom" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "libc", - "wasi", + "windows", ] [[package]] @@ -627,17 +576,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "hotclock" -version = "0.2.0" -source = "git+https://github.com/spence/hotclock#8cf14ae9d62dba7f7780a3c920ab6208b6568777" - [[package]] name = "is-terminal" version = "0.4.16" @@ -714,7 +652,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5" dependencies = [ "cfg-if", - "generator 0.7.5", + "generator", "scoped-tls", "tracing", "tracing-subscriber", @@ -772,12 +710,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" -[[package]] -name = "owo-colors" -version = "3.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" - [[package]] name = "parking" version = "2.2.1" @@ -794,7 +726,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-link 0.2.1", + "windows-link", ] [[package]] @@ -846,15 +778,6 @@ dependencies = [ "portable-atomic", 
] -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - [[package]] name = "proc-macro2" version = "1.0.95" @@ -873,51 +796,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "radium" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_pcg" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e" -dependencies = [ - "rand_core", -] - [[package]] name = "rayon" version = "1.10.0" @@ -1096,26 +974,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" -[[package]] -name = "shuttle" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e72e65e5ac3437476a310bd73ec924dc75e3055ffa61f376266f80576f3869ff" -dependencies = [ - "assoc", - "bitvec", - "cfg-if", - 
"generator 0.8.5", - "hex", - "owo-colors", - "rand", - "rand_core", - "rand_pcg", - "scoped-tls", - "smallvec", - "tracing", -] - [[package]] name = "slab" version = "0.4.10" @@ -1164,12 +1022,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - [[package]] name = "terminal_size" version = "0.4.2" @@ -1282,12 +1134,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" - [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -1396,114 +1242,12 @@ dependencies = [ "windows-targets 0.48.5", ] -[[package]] -name = "windows" -version = "0.61.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893" -dependencies = [ - "windows-collections", - "windows-core", - "windows-future", - "windows-link 0.1.3", - "windows-numerics", -] - -[[package]] -name = "windows-collections" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8" -dependencies = [ - "windows-core", -] - -[[package]] -name = "windows-core" -version = "0.61.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-link 0.1.3", - "windows-result", - "windows-strings", -] - -[[package]] -name = "windows-future" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e" 
-dependencies = [ - "windows-core", - "windows-link 0.1.3", - "windows-threading", -] - -[[package]] -name = "windows-implement" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "windows-interface" -version = "0.59.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "windows-link" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" - [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-numerics" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1" -dependencies = [ - "windows-core", - "windows-link 0.1.3", -] - -[[package]] -name = "windows-result" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" -dependencies = [ - "windows-link 0.1.3", -] - -[[package]] -name = "windows-strings" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" -dependencies = [ - "windows-link 0.1.3", -] - [[package]] name = "windows-sys" version = "0.42.0" @@ -1559,15 +1303,6 @@ dependencies = [ "windows_x86_64_msvc 0.52.6", ] -[[package]] -name = "windows-threading" -version = "0.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6" -dependencies = [ - "windows-link 0.1.3", -] - [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -1700,15 +1435,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "wyz" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" -dependencies = [ - "tap", -] - [[package]] name = "xshell" version = "0.2.7" @@ -1723,23 +1449,3 @@ name = "xshell-macros" version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32ac00cd3f8ec9c1d33fb3e7958a82df6989c42d747bd326c822b1d625283547" - -[[package]] -name = "zerocopy" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/Cargo.toml b/Cargo.toml index 89cb4d4..dc3b1c6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,10 @@ edition = "2024" license = "MIT OR Apache-2.0" description = "Low-overhead parallel and async work scheduler" repository = "https://github.com/NthTensor/Forte" +rust-version = "1.96.0" + +# Lie to cargo +links = "forte" [workspace] resolver = "2" @@ -13,14 +17,11 @@ members = ["ci"] [dependencies] async-task = "4.7.1" atomic-wait = "1.1.0" +lazy_static = "1.5.0" crossbeam-queue = "0.3.12" crossbeam-utils = "0.8.21" st3 = "0.4" -hotclock = { git = 
"https://github.com/spence/hotclock" } - -shuttle = { version = "0.8.0", optional = true } -tracing = { version = "0.1.41", features = ["release_max_level_off"] } -tracing-subscriber = "0.3.19" +futures-lite = "2.6.1" [dev-dependencies] # Required for comparison testing @@ -33,22 +34,12 @@ dashmap = "6.1.0" # Used for A/B perf testing criterion = { version = "0.5" } -[features] -shuttle = ["dep:shuttle"] - [profile.release] debug = true [profile.bench] opt-level = 3 -# Custom profile for shuttle tests: enable release optimizations so that the shuttle -# tests are less slow, but don't disable debug assertions. -[profile.shuttle] -inherits = "test" -lto = true -opt-level = 3 - [lints.clippy] doc_markdown = "warn" manual_let_else = "warn" diff --git a/benches/bevy_tasks.rs b/benches/bevy_tasks.rs index b730f1f..335a0e9 100644 --- a/benches/bevy_tasks.rs +++ b/benches/bevy_tasks.rs @@ -1,7 +1,8 @@ //! Comparative benchmarks against bevy_tasks struct BevyParChunksMut<'a, T>(core::slice::ChunksMut<'a, T>); -impl<'a, T> bevy_tasks::ParallelIterator> for BevyParChunksMut<'a, T> +impl<'a, T> bevy_tasks::ParallelIterator> + for BevyParChunksMut<'a, T> where T: 'a + Send + Sync, { @@ -12,8 +13,11 @@ where static THREAD_POOL: forte::ThreadPool = forte::ThreadPool::new(); -fn forte_chunks(worker: &forte::Worker, data: &mut [T], func: &F) -where +fn forte_chunks( + worker: &forte::Worker, + data: &mut [T], + func: &F, +) where T: Send + Sync, F: Fn(&mut [T]) + Send + Sync, { @@ -83,7 +87,7 @@ mod overhead { let mut vec: Vec<_> = (0..len).collect(); - THREAD_POOL.expect_worker(|worker| { + THREAD_POOL.with_worker(|worker| { bencher.bench_local(|| { forte_chunks::<64, _, _>(worker, &mut vec, &|c| { c.iter_mut().for_each(work); diff --git a/benches/flat_scope.rs b/benches/flat_scope.rs index 8d48ef0..3e42292 100644 --- a/benches/flat_scope.rs +++ b/benches/flat_scope.rs @@ -6,9 +6,6 @@ use std::hash::Hasher; use criterion::black_box; use divan::Bencher; -use 
tracing_subscriber::fmt; -use tracing_subscriber::layer::SubscriberExt; -use tracing_subscriber::util::SubscriberInitExt; const SIZES: &[usize] = &[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4012, 8196]; @@ -39,7 +36,7 @@ static COMPUTE: forte::ThreadPool = forte::ThreadPool::new(); fn forte(bencher: Bencher, size: usize) { use forte::Worker; - COMPUTE.expect_worker(|worker| { + COMPUTE.with_worker(|worker| { bencher.bench_local(|| { worker.scope(|scope| { for i in 0..size { @@ -78,14 +75,6 @@ fn rayon(bencher: Bencher, size: usize) { } fn main() { - let fmt_layer = fmt::layer() - .without_time() - .with_target(false) - .with_thread_names(true) - .compact(); - - tracing_subscriber::registry().with(fmt_layer).init(); - COMPUTE.resize_to_available(); divan::main(); diff --git a/benches/flood_fill.rs b/benches/flood_fill.rs index 471953d..627ba6b 100644 --- a/benches/flood_fill.rs +++ b/benches/flood_fill.rs @@ -9,9 +9,6 @@ use std::hash::Hasher; use criterion::black_box; use dashmap::DashSet; use divan::Bencher; -use tracing_subscriber::fmt; -use tracing_subscriber::layer::SubscriberExt; -use tracing_subscriber::util::SubscriberInitExt; const SIZES: &[usize] = &[8, 16, 32, 64, 128, 256, 512]; @@ -126,7 +123,7 @@ fn forte(bencher: Bencher, size: usize) { } } - COMPUTE.expect_worker(|worker| { + COMPUTE.with_worker(|worker| { bencher.bench_local(|| { let visited = DashSet::new(); @@ -202,14 +199,6 @@ fn rayon(bencher: Bencher, size: usize) { } fn main() { - let fmt_layer = fmt::layer() - .without_time() - .with_target(false) - .with_thread_names(true) - .compact(); - - tracing_subscriber::registry().with(fmt_layer).init(); - COMPUTE.resize_to_available(); divan::main(); diff --git a/benches/fork_join.rs b/benches/fork_join.rs index 2e55b24..4c54be6 100644 --- a/benches/fork_join.rs +++ b/benches/fork_join.rs @@ -3,10 +3,6 @@ use chili::Scope; use divan::Bencher; use forte::Worker; -use tracing::info; -use tracing_subscriber::fmt; -use 
tracing_subscriber::layer::SubscriberExt; -use tracing_subscriber::util::SubscriberInitExt; // ----------------------------------------------------------------------------- // Workload @@ -32,7 +28,8 @@ impl Node { // Returns an iterator over the number of layers. Also returns the total number // of nodes. const LAYERS: &[usize] = &[ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, // 10, 24, 27, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, // 10, 24, 27, ]; fn nodes() -> impl Iterator { @@ -86,8 +83,7 @@ fn forte(bencher: Bencher, nodes: (usize, usize)) { let tree = Node::tree(nodes.0); - COMPUTE.expect_worker(|worker| { - info!("Staring Benchmark"); + COMPUTE.with_worker(|worker| { bencher.bench_local(move || { assert_eq!(sum(&tree, worker), nodes.1 as u64); }); @@ -105,9 +101,8 @@ fn throughput_forte(bencher: Bencher, nodes: (usize, usize)) { node.val + left + right } - info!("Staring Benchmark"); bencher.bench(|| { - COMPUTE.expect_worker(|worker| { + COMPUTE.with_worker(|worker| { let tree = Node::tree(nodes.0); assert_eq!(sum(&tree, worker), nodes.1 as u64); }); @@ -187,14 +182,6 @@ fn throughput_rayon(bencher: Bencher, nodes: (usize, usize)) { } fn main() { - let fmt_layer = fmt::layer() - .without_time() - .with_target(false) - .with_thread_names(true) - .compact(); - - tracing_subscriber::registry().with(fmt_layer).init(); - COMPUTE.resize_to_available(); divan::main(); diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..959d5a5 --- /dev/null +++ b/build.rs @@ -0,0 +1,6 @@ +//! This build-script is intentionally left blank. It exists so that cargo will +//! let us use the `links` field, which ensures that only one version of forte +//! may be used in a given build. This is important to prevent fighting +//! threadpools.
+ +fn main() {} diff --git a/ci/src/ci.rs b/ci/src/ci.rs index b3c23ba..4d6eea7 100644 --- a/ci/src/ci.rs +++ b/ci/src/ci.rs @@ -72,18 +72,28 @@ impl CI { // Note that we are running the subcommands directly rather than using any aliases let mut cmds = vec![]; // Lint commands - cmds.append(&mut commands::FormatCommand::default().prepare(sh, flags)); - cmds.append(&mut commands::ClippyCommand::default().prepare(sh, flags)); - cmds.append(&mut commands::LintsCommand::default().prepare(sh, flags)); + cmds.append( + &mut commands::FormatCommand::default().prepare(sh, flags), + ); + cmds.append( + &mut commands::ClippyCommand::default().prepare(sh, flags), + ); + cmds.append( + &mut commands::LintsCommand::default().prepare(sh, flags), + ); // Compile commands - cmds.append(&mut commands::CompileCheckCommand::default().prepare(sh, flags)); + cmds.append( + &mut commands::CompileCheckCommand::default() + .prepare(sh, flags), + ); // Documentation commands - cmds.append(&mut commands::DocCheckCommand::default().prepare(sh, flags)); - cmds.append(&mut commands::DocTestCommand::default().prepare(sh, flags)); - // Shuttle commands - cmds.append(&mut commands::ShuttleCheckCommand::default().prepare(sh, flags)); - cmds.append(&mut commands::ShuttleClippyCommand::default().prepare(sh, flags)); - cmds.append(&mut commands::ShuttleTestCommand::default().prepare(sh, flags)); + cmds.append( + &mut commands::DocCheckCommand::default() + .prepare(sh, flags), + ); + cmds.append( + &mut commands::DocTestCommand::default().prepare(sh, flags), + ); cmds } } @@ -105,15 +115,14 @@ enum Commands { Lints(commands::LintsCommand), Clippy(commands::ClippyCommand), Format(commands::FormatCommand), - // Shuttle commands - Shuttle(commands::ShuttleCommand), - ShuttleCheck(commands::ShuttleCheckCommand), - ShuttleClippy(commands::ShuttleClippyCommand), - ShuttleTest(commands::ShuttleTestCommand), } impl Prepare for Commands { - fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec> { + 
fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + flags: Flag, + ) -> Vec> { match self { // Compile commands Commands::Compile(subcommand) => subcommand.prepare(sh, flags), @@ -126,11 +135,6 @@ impl Prepare for Commands { Commands::Lints(subcommand) => subcommand.prepare(sh, flags), Commands::Clippy(subcommand) => subcommand.prepare(sh, flags), Commands::Format(subcommand) => subcommand.prepare(sh, flags), - // Shuttle commands - Commands::Shuttle(subcommand) => subcommand.prepare(sh, flags), - Commands::ShuttleCheck(subcommand) => subcommand.prepare(sh, flags), - Commands::ShuttleClippy(subcommand) => subcommand.prepare(sh, flags), - Commands::ShuttleTest(subcommand) => subcommand.prepare(sh, flags), } } } diff --git a/ci/src/commands/clippy.rs b/ci/src/commands/clippy.rs index eef88a2..f5f2a04 100644 --- a/ci/src/commands/clippy.rs +++ b/ci/src/commands/clippy.rs @@ -11,7 +11,11 @@ use crate::PreparedCommand; pub struct ClippyCommand {} impl Prepare for ClippyCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + _flags: Flag, + ) -> Vec> { vec![PreparedCommand::new::( cmd!(sh, "cargo clippy --workspace -- -Dwarnings"), "Please fix clippy errors in output above.", diff --git a/ci/src/commands/compile.rs b/ci/src/commands/compile.rs index 7d5043c..686bd29 100644 --- a/ci/src/commands/compile.rs +++ b/ci/src/commands/compile.rs @@ -11,7 +11,11 @@ use crate::commands::CompileCheckCommand; pub struct CompileCommand {} impl Prepare for CompileCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + flags: Flag, + ) -> Vec> { let mut commands = vec![]; commands.append(&mut CompileCheckCommand::default().prepare(sh, flags)); commands diff --git a/ci/src/commands/compile_check.rs b/ci/src/commands/compile_check.rs index 3466746..b96cefc 100644 --- a/ci/src/commands/compile_check.rs +++ 
b/ci/src/commands/compile_check.rs @@ -11,7 +11,11 @@ use crate::PreparedCommand; pub struct CompileCheckCommand {} impl Prepare for CompileCheckCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + _flags: Flag, + ) -> Vec> { vec![PreparedCommand::new::( cmd!(sh, "cargo check --workspace"), "Please fix compiler errors in output above.", diff --git a/ci/src/commands/doc.rs b/ci/src/commands/doc.rs index 419bb12..097cbfa 100644 --- a/ci/src/commands/doc.rs +++ b/ci/src/commands/doc.rs @@ -12,7 +12,11 @@ use crate::commands::DocTestCommand; pub struct DocCommand {} impl Prepare for DocCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + flags: Flag, + ) -> Vec> { let mut commands = vec![]; commands.append(&mut DocTestCommand::default().prepare(sh, flags)); commands.append(&mut DocCheckCommand::default().prepare(sh, flags)); diff --git a/ci/src/commands/doc_check.rs b/ci/src/commands/doc_check.rs index f2a7d59..4e862d7 100644 --- a/ci/src/commands/doc_check.rs +++ b/ci/src/commands/doc_check.rs @@ -11,7 +11,11 @@ use crate::PreparedCommand; pub struct DocCheckCommand {} impl Prepare for DocCheckCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + _flags: Flag, + ) -> Vec> { vec![ PreparedCommand::new::( cmd!( diff --git a/ci/src/commands/doc_test.rs b/ci/src/commands/doc_test.rs index e6bdaa5..658f5bb 100644 --- a/ci/src/commands/doc_test.rs +++ b/ci/src/commands/doc_test.rs @@ -11,7 +11,11 @@ use crate::PreparedCommand; pub struct DocTestCommand {} impl Prepare for DocTestCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + flags: Flag, + ) -> Vec> { let no_fail_fast = if flags.contains(Flag::KEEP_GOING) { "--no-fail-fast" } else { diff --git 
a/ci/src/commands/format.rs b/ci/src/commands/format.rs index e36173d..a83e8c4 100644 --- a/ci/src/commands/format.rs +++ b/ci/src/commands/format.rs @@ -11,7 +11,11 @@ use crate::PreparedCommand; pub struct FormatCommand {} impl Prepare for FormatCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + _flags: Flag, + ) -> Vec> { vec![PreparedCommand::new::( cmd!(sh, "cargo fmt --all -- --check"), "Please run 'cargo fmt --all' to format your code.", diff --git a/ci/src/commands/lints.rs b/ci/src/commands/lints.rs index 02e2128..27a2c11 100644 --- a/ci/src/commands/lints.rs +++ b/ci/src/commands/lints.rs @@ -12,7 +12,11 @@ use crate::commands::FormatCommand; pub struct LintsCommand {} impl Prepare for LintsCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec> { + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + flags: Flag, + ) -> Vec> { let mut commands = vec![]; commands.append(&mut FormatCommand::default().prepare(sh, flags)); commands.append(&mut ClippyCommand::default().prepare(sh, flags)); diff --git a/ci/src/commands/mod.rs b/ci/src/commands/mod.rs index b9ca102..e132d34 100644 --- a/ci/src/commands/mod.rs +++ b/ci/src/commands/mod.rs @@ -22,14 +22,3 @@ mod lints; pub use clippy::*; pub use format::*; pub use lints::*; - -// Shuttle test suite commands -mod shuttle; -mod shuttle_check; -mod shuttle_clippy; -mod shuttle_test; - -pub use shuttle::*; -pub use shuttle_check::*; -pub use shuttle_clippy::*; -pub use shuttle_test::*; diff --git a/ci/src/commands/shuttle.rs b/ci/src/commands/shuttle.rs deleted file mode 100644 index 857a7d9..0000000 --- a/ci/src/commands/shuttle.rs +++ /dev/null @@ -1,23 +0,0 @@ -use argh::FromArgs; - -use crate::Flag; -use crate::Prepare; -use crate::PreparedCommand; -use crate::commands::ShuttleCheckCommand; -use crate::commands::ShuttleClippyCommand; -use crate::commands::ShuttleTestCommand; - -/// Alias for running the 
`shuttle-check`, `shuttle-clippy` and `shuttle-test` subcommands. -#[derive(FromArgs, Default)] -#[argh(subcommand, name = "shuttle")] -pub struct ShuttleCommand {} - -impl Prepare for ShuttleCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec> { - let mut commands = vec![]; - commands.append(&mut ShuttleCheckCommand::default().prepare(sh, flags)); - commands.append(&mut ShuttleClippyCommand::default().prepare(sh, flags)); - commands.append(&mut ShuttleTestCommand::default().prepare(sh, flags)); - commands - } -} diff --git a/ci/src/commands/shuttle_check.rs b/ci/src/commands/shuttle_check.rs deleted file mode 100644 index 05f07e0..0000000 --- a/ci/src/commands/shuttle_check.rs +++ /dev/null @@ -1,21 +0,0 @@ -use argh::FromArgs; -use xshell::cmd; - -use crate::Flag; -use crate::Prepare; -use crate::PreparedCommand; - -/// Checks that the loom test suite compiles. -#[derive(FromArgs, Default)] -#[argh(subcommand, name = "loom-check")] -pub struct ShuttleCheckCommand {} - -impl Prepare for ShuttleCheckCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec> { - let command = PreparedCommand::new::( - cmd!(sh, "cargo check --test shuttle --features shuttle"), - "Please fix compiler errors in output above.", - ); - vec![command] - } -} diff --git a/ci/src/commands/shuttle_clippy.rs b/ci/src/commands/shuttle_clippy.rs deleted file mode 100644 index 8c0111d..0000000 --- a/ci/src/commands/shuttle_clippy.rs +++ /dev/null @@ -1,24 +0,0 @@ -use argh::FromArgs; -use xshell::cmd; - -use crate::Flag; -use crate::Prepare; -use crate::PreparedCommand; - -/// Checks for clippy warnings and errors in the loom test suite. 
-#[derive(FromArgs, Default)] -#[argh(subcommand, name = "shuttle-clippy")] -pub struct ShuttleClippyCommand {} - -impl Prepare for ShuttleClippyCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec> { - let command = PreparedCommand::new::( - cmd!( - sh, - "cargo clippy --test shuttle --features shuttle -- -Dwarnings" - ), - "Please fix clippy errors in output above.", - ); - vec![command] - } -} diff --git a/ci/src/commands/shuttle_test.rs b/ci/src/commands/shuttle_test.rs deleted file mode 100644 index cdfc41f..0000000 --- a/ci/src/commands/shuttle_test.rs +++ /dev/null @@ -1,24 +0,0 @@ -use argh::FromArgs; -use xshell::cmd; - -use crate::Flag; -use crate::Prepare; -use crate::PreparedCommand; - -/// Runs the loom concurrency test suite. -#[derive(FromArgs, Default)] -#[argh(subcommand, name = "shuttle-test")] -pub struct ShuttleTestCommand {} - -impl Prepare for ShuttleTestCommand { - fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec> { - let command = PreparedCommand::new::( - cmd!( - sh, - "cargo test --test shuttle --profile shuttle --features shuttle" - ), - "Please fix compiler errors in output above.", - ); - vec![command] - } -} diff --git a/ci/src/prepare.rs b/ci/src/prepare.rs index 923bc33..14b3b84 100644 --- a/ci/src/prepare.rs +++ b/ci/src/prepare.rs @@ -24,7 +24,11 @@ pub trait Prepare { /// } /// } /// ``` - fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec>; + fn prepare<'a>( + &self, + sh: &'a xshell::Shell, + flags: Flag, + ) -> Vec>; } bitflags! { @@ -83,7 +87,11 @@ impl<'a> PreparedCommand<'a> { } /// A builder that adds a new environmental variable to the list. 
- pub fn with_env_var(mut self, key: &'static str, value: &'static str) -> Self { + pub fn with_env_var( + mut self, + key: &'static str, + value: &'static str, + ) -> Self { self.env_vars.push((key, value)); self } diff --git a/rustfmt.toml b/rustfmt.toml index a871dbd..4b863e7 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,6 +1,7 @@ use_field_init_shorthand = true newline_style = "Unix" style_edition = "2024" +max_width = 80 # The following lines may be uncommented on nightly Rust. # Once these features have stabilized, they should be added to the always-enabled options above. diff --git a/src/job.rs b/src/job.rs index edbed59..9ff40e6 100644 --- a/src/job.rs +++ b/src/job.rs @@ -1,62 +1,32 @@ -//! This module defines an executable unit of work called a [`Job`]. Jobs are what -//! get scheduled on the thread pool. There are two core job types: [`StackJob`] -//! and [`HeapJob`]. +//! This module defines an executable unit of work called a "Job". Jobs are +//! what get scheduled on the thread pool. There are two core job types: +//! [`StackJob`] and [`HeapJob`]. There is no unifying `Job` trait. Instead, +//! what makes these both jobs is their ability to yield a [`JobRef`]. //! //! After a job is allocated, we typically refer to it by a [`JobRef`]. Job refs //! are type-erased, and can be sent between threads without moving the //! underlying job. -//! -//! When using a job, one must be extremely careful to ensure that: -//! (a) The job does not outlive anything it closes over. -//! (b) The job remains valid until it is executed for the last time. -//! (c) Each job reference is executed exactly once. 
use alloc::boxed::Box; use alloc::collections::VecDeque; use alloc::vec::Vec; +use core::any::Any; use core::cell::UnsafeCell; use core::mem::ManuallyDrop; -use core::mem::MaybeUninit; use core::ptr::NonNull; -use core::sync::atomic::Ordering; -use core::sync::atomic::fence; -use std::thread::Result as ThreadResult; use crate::latch::Latch; -use crate::platform::AtomicU32; +use crate::platform::*; use crate::thread_pool::Worker; use crate::unwind; // ----------------------------------------------------------------------------- -// Runnable +// JobRef -/// A job is a unit of work that may be executed by a worker thread. The primary -/// purpose of this trait is to make it easy to create a `JobRef`. The `execute` -/// function is designed to interlock with the `JobRef::execute_fn` field. -trait Job { - /// Calling this function runs the job. - /// - /// # Safety - /// - /// Implementors must specify the invariant of the pointer `this` that the - /// caller is expected to uphold. - /// - /// This may be called from a different thread than the one which scheduled - /// the job, so the implementer must ensure the appropriate traits are met, - /// whether `Send`, `Sync`, or both. - /// - /// Calling this is always considered to "complete" the job, so the caller - /// must ensure this is called exactly once. - unsafe fn execute(this: NonNull<()>, worker: &Worker); -} - -// ----------------------------------------------------------------------------- -// Shared JobRef - -/// Effectively a Job trait object. It can be treated as such, even though -/// sometimes a `JobRef` will not point to a type that implements `Job`. +/// A `JobRef` is a specialized v-table, containing a pointer to work that needs to +/// be executed, and a function pointer that is capable of executing it. /// -/// This is analogous to the chili type `JobShared` or the rayon type `JobRef`. +/// It is analogous to the chili type `JobShared` or the rayon type `JobRef`. 
pub struct JobRef { /// A non-null pointer to some type-erased data which can be executed as a /// job by the `execute_fn`. This will usually point to either an instance @@ -74,18 +44,11 @@ impl JobRef { /// /// # Safety /// - /// The caller must ensure that: - /// - /// * `job_pointer` and `execute_fn` are *matched*; the `execute_fn` must be - /// a function that can safely receive `job_pointer` as it's first argument. - /// - /// * `job_pointer` points to an initialized and properly aligned value which - /// is neither moved nor dropped until `execute_fn` is called. - /// - /// * `job_pointer` is "valid" now and until `execute_fn` is called, - /// according to the contract of the specific `execute_fn` being stored. + /// The caller must ensure that `JobRef::execute` will only be called on the + /// returned `JobRef` when it would be sound to call `execute_fn` on + /// `job_pointer`. #[inline(always)] - pub unsafe fn new_raw( + pub unsafe fn new( job_pointer: NonNull<()>, execute_fn: unsafe fn(NonNull<()>, &Worker), ) -> JobRef { @@ -105,27 +68,38 @@ impl JobRef { /// Executes the `JobRef` by passing the execute function on the job pointer. #[inline(always)] pub fn execute(self, worker: &Worker) { - // SAFETY: Calling this function on this pointer is valid due to the - // contract of `JobRef::new_raw`: - // - // * `self.execute_fn` and `self.job_pointer` are "matched": every - // `JobRef` is constructed via `new_raw`, which requires the caller - // to supply a compatible pair. - // - // * `self.job_pointer` is valid at this point: `new_raw` requires the - // pointer to remain valid until `execute_fn` is called, and we are - // calling it now. - // - // * This is called at most once: `execute` consumes `self`, so the - // pointer cannot be used again via this `JobRef`. + // SAFETY: The caller of `JobRef::new` defines the conditions under + // which this call is sound, and must ensure that this will not be + // called unless these conditions are met.
unsafe { (self.execute_fn)(self.job_pointer, worker) } } } -// SAFETY: `JobRef` is a type-erased data pointer + function pointer tuple. The -// data pointer always points to a `Send` value due to the safety requirements -// of `JobRef::new_raw`. Function pointers are always `Send`. Therefore it is -// sound to move a `JobRef` across thread boundaries. +// SAFETY: This is sound, but just barely. +// +// Every `JobRef` contains a function pointer and a data pointer. Function +// pointers are always `Send`, but the data pointer may or may not be valid for +// cross-thread access (the value it points to may or may not be `Sync`). +// +// However, even when this data is not thread-safe, `JobRef` still needs to be +// `Send`. This is because we need to be able to pass pointers to `!Send` job +// data between threads. For example, if we have a thread that owns a `Future` +// that is `!Send`, and we receive a wakeup notification on an IO polling +// thread, the IO thread must send the owning thread a `JobRef` containing a +// pointer to that `!Send` future. +// +// This is only sound because the only method that can actually cause unsound +// cross-thread memory access is `JobRef::execute`. This function is safe, +// because the caller cannot know the soundness requirements of the underlying +// job being pointed to (due to type-erasure). However, `JobRef::new` is +// `unsafe`, and requires the caller to ensure that `execute` will only be +// called if it is correct for the execute function to be called on the +// job_pointer. +// +// Since every `JobRef` must be constructed with a call to `new`, it is not +// possible for _entirely safe_ code to violate the `!Send` condition. It is +// unfortunate that the soundness justification has to be squeezed into a single +// function, but thus are the constraints of type-erasure. 
unsafe impl Send for JobRef {} // ----------------------------------------------------------------------------- @@ -151,7 +125,7 @@ impl JobQueue { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not // returning any references from this API, making this exclusive access - // safe. + // sound. let job_refs = unsafe { &mut *self.job_refs.get() }; job_refs.push_back(job_ref); } @@ -161,7 +135,7 @@ impl JobQueue { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not // returning any references from this API, making this exclusive access - // safe. + // sound. let job_refs = unsafe { &mut *self.job_refs.get() }; job_refs.push_front(job_ref); } @@ -171,7 +145,7 @@ impl JobQueue { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not // returning any references from this API, making this exclusive access - // safe. + // sound. let job_refs = unsafe { &mut *self.job_refs.get() }; job_refs.pop_back() } @@ -181,7 +155,7 @@ impl JobQueue { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not // returning any references from this API, making this exclusive access - // safe. + // sound. let job_refs = unsafe { &mut *self.job_refs.get() }; job_refs.pop_front() } @@ -192,7 +166,7 @@ impl JobQueue { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not // returning any references from this API, making this exclusive access - // safe. + // sound. 
let job_refs = unsafe { &mut *self.job_refs.get() }; if job_refs.back().map(JobRef::id) == Some(id) { let _ = job_refs.pop_back(); @@ -212,7 +186,7 @@ impl JobQueue { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not // returning any references from this API, making this exclusive access - // safe. + // sound. let job_refs = unsafe { &mut *self.job_refs.get() }; let mut len = job_refs.len(); let num_chunks = len / Self::CHUNK_SIZE; @@ -231,7 +205,7 @@ impl JobQueue { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not // returning any references from this API, making this exclusive access - // safe. + // sound. let job_refs = unsafe { &mut *self.job_refs.get() }; job_refs.append(&mut split_refs); } @@ -240,13 +214,24 @@ impl JobQueue { // ----------------------------------------------------------------------------- // Stack allocated work function +/// This union helps us conserve stack space by allowing us to store (a) the +/// function we need to run (b) the output of that function or (c) a captured +/// panic at different times throughout the job's lifecycle. +/// +/// As future work, we may want to consider pushing large values into a +/// heap-allocated pool. +union StackJobData { + func: ManuallyDrop, + output: ManuallyDrop, + error: ManuallyDrop>, +} + /// A [`StackJob`] is a job that's allocated on the stack. /// /// This is analogous to the chili type `JobStack` and the rayon type `StackJob`. pub struct StackJob { - f: UnsafeCell>, completed: Latch, - return_value: UnsafeCell>>, + data: UnsafeCell>, } impl StackJob @@ -256,11 +241,12 @@ where { /// Creates a new `StackJob` owned by the current worker. 
#[inline(always)] - pub fn new(f: F, worker: &Worker) -> StackJob { + pub fn new(func: F, latch: Latch) -> StackJob { StackJob { - f: UnsafeCell::new(ManuallyDrop::new(f)), - completed: worker.new_latch(), - return_value: UnsafeCell::new(MaybeUninit::uninit()), + data: UnsafeCell::new(StackJobData { + func: ManuallyDrop::new(func), + }), + completed: latch, } } @@ -271,390 +257,260 @@ where /// /// The caller must ensure that: /// - /// * The `StackJob` will outlive the `JobRef`. + /// * This is called at most once for each `StackJob`. /// - /// * The `StackJob` will not move for the lifetime of the `JobRef`. + /// * After this call, the `StackJob` will not be moved or dropped until one + /// of these conditions is met: /// - /// * The `StackJob` does not outlive any data it closes over. + /// * (A) A call to `check` on the `StackJob`'s latch returns something other + /// than `Pending`. /// - /// * This function is not called again so long as the `JobRef` lives. + /// * (B) The `JobRef` has been dropped without `execute` being called. #[inline(always)] pub unsafe fn as_job_ref(&self) -> JobRef { let job_pointer = NonNull::from(self).cast(); - // SAFETY: `JobRef::new_raw` requires: + // SAFETY: We must show that `JobRef::execute` will only be called on + // the returned `JobRef` if it is sound to call `execute_fn` on + // `job_pointer`. + // + // Assume that `JobRef::execute` has been called, under the conditions + // defined by the safety comment for this function. Then: // - // * `job_pointer` and `Self::execute` are matched. + // * `this` is an aligned pointer to an initialized `StackJob`, + // which will not be invalidated until a `check` on the latch it + // contains returns something other than `Pending`. // - // Here, `execute` expects a pointer to `Self`, which is what - // `job_pointer` is. + // We created `job_pointer` from a ref to `self`, which is a + // `StackJob`, so it must have pointed to an aligned `StackJob` at + // some point. 
// - // * The pointee is live, not moved, and not dropped until `execute_fn` - // is called. + // If the caller allows the `JobRef` to be executed, they must also + // ensure that the pointer will not be invalidated unless a call to + // `check` on the job's `Latch` has returned something other than + // `Pending`. // - // Here, the caller guarantees the `StackJob` outlives and does not - // move for the lifetime of the `JobRef`. + // * `StackJob::execute` is called at most once on any `StackJob`. // - // * `execute_fn` to be called at most once. + // The caller ensures only one `JobRef` is ever created for this + // `StackJob`. Since `JobRef::execute` consumes that `JobRef`, it + // cannot be called multiple times. // - // Here, `JobRef::execute` consumes the `JobRef`, and only one - // `JobRef` is created per `StackJob`, so it is called exactly once. - unsafe { JobRef::new_raw(job_pointer, Self::execute) } + // + unsafe { JobRef::new(job_pointer, Self::execute) } } - /// Returns a reference to the latch embedded in this stack job. After this - /// latch is set, it becomes safe to call `StackJob::return_value`. + /// Returns a reference to the latch embedded in this stack job. #[inline(always)] pub fn completion_latch(&self) -> &Latch { &self.completed } /// Unwraps the stack job back into a closure. This allows the closure to be - /// executed without indirection in situations where the one still has - /// direct access. - /// - /// # Safety - /// - /// The caller must ensure that either this function or `execute` are called - /// for a given `StackJob` (not both), and that this function must not be - /// called multiple times. - #[inline(always)] - pub unsafe fn unwrap(&mut self) -> F { - let f_mut = self.f.get_mut(); - // SAFETY: `ManuallyDrop` requires us to ensure that it is not used - // again after we `take()` it's contents. - // - // `take()` is called in two places: once here, and once in `execute`. 
- // Since this function is mutually exclusive with `execute`, and is - // called at most once, the `ManuallyDrop` is not used again. - unsafe { ManuallyDrop::take(f_mut) } - } - - /// Unwraps the job into it's return value. + /// executed without indirection in situations where one still has direct + /// access. /// /// # Safety /// /// The caller must ensure that: /// - /// * This is called only after the job's latch is set. + /// * No `JobRef` currently exists for this `StackJob`. + /// + /// * No new `JobRef` will be created for this `StackJob`. /// - /// * That this is called at most once for a given `StackJob`. + /// * If a `JobRef` did exist, it was never executed. #[inline(always)] - pub unsafe fn return_value(&mut self) -> ThreadResult { - // Synchronize with the fence in `StackJob::execute`, establishing a - // happens-after relationship with the following read.. - fence(Ordering::Acquire); - // Get a ref to the result. - let result_ref = self.return_value.get_mut(); - // SAFETY: `assume_init_read` requires: + pub unsafe fn unwrap_func(mut self) -> F { + // SAFETY: For this access to be valid, we must first establish that + // we have exclusive access to `data`. Only three other functions access + // `data`, and none of them can race with this function: // - // * The `MaybeUninit` is fully initialized. + // * Since `JobRef::execute` is not called, `StackJob::execute` is not + // called, and cannot be running. // - // As this function can only be called if the latch has been set, and - // the latch is only set at the end of `StackJob::execute` (after - // `return_value` is written and memory is synchronized via the above - // fence) the memory must be initialized. + // * Since this function and `unwrap_output` both consume the + // `StackJob`, and a `StackJob` cannot be duplicated, `unwrap_output` + // cannot be running now. // - // * That data not be incorrectly duplicated by repeated calls. 
+ // * Since this function and `unwrap_error` both consume the `StackJob`, + // and a `StackJob` cannot be duplicated, `unwrap_error` cannot be + // running now. // - // Data is not duplicated because this function is called at most once. - unsafe { result_ref.assume_init_read() } + // Next, we must establish that it is valid to read from union field + // `func`. Each `StackJob` is constructed using field `func`, and only + // `StackJob::execute` writes to the union after construction. Since + // `StackJob::execute` is not called, it must still be valid to read + // from `func`. + let func_ref = unsafe { &mut self.data.get_mut().func }; + // SAFETY: The `StackJob` is dropped at the end of this block, so `data` + // is never accessed again. + unsafe { ManuallyDrop::take(func_ref) } } -} -impl Job for StackJob -where - F: FnOnce(&Worker) -> T + Send, - T: Send, -{ - /// Executes a `StackJob` from a const pointer. + /// Unwraps the job into its return value. /// /// # Safety /// - /// The caller must ensure that: - /// - /// * `this` is a non-null, properly aligned pointer to a live instance of - /// `StackJob`. - /// - /// * The `StackJob` will not move or be deallocated until the latch it - /// contains is set. - /// - /// * Either this function or `unwrap` are called at most once for a given - /// `StackJob`. + /// This may only be called if a `check` on the enclosed latch has returned + /// `Ok`. #[inline(always)] - unsafe fn execute(this: NonNull<()>, worker: &Worker) { - // SAFETY: The caller ensures `this` can be converted into an immutable - // reference until we set the latch, and the latch has not yet been set. - let this = unsafe { this.cast::().as_ref() }; - // Create an abort guard. If the closure panics, this will convert the - // panic into an abort. Doing so prevents use-after-free for other - // elements of the stack. - let abort_guard = unwind::AbortOnDrop; - // SAFETY: `f` is a `UnsafeCell>`. 
Creating a - // `&mut ManuallyDrop` is only sound so long as no other live - // references exist. - // - // `f` is accessed mutably in two places: once here, and once in - // `unwrap`. Since this function is mutually exclusive with `unwrap`, - // and is called at most once, exclusive access is guaranteed. - let f_ref = unsafe { &mut *this.f.get() }; - // SAFETY: `ManuallyDrop` requires us to ensure that it is not used - // again after we `take()` it's contents. - // - // `take()` is called in two places: once here, and once in `unwrap`. - // Since this function is mutually exclusive with `unwrap`, and is - // called at most once, the `ManuallyDrop` is not used again. - let f = unsafe { ManuallyDrop::take(f_ref) }; - // Run the job. If the job panics, we propagate the panic back to the - // main thread. - let result = unwind::halt_unwinding(|| f(worker)); - // Get the uninitialized memory where we should put the return value. - let return_value = this.return_value.get(); - // SAFETY: Writing to this unsafe cell requires that no other thread - // holds a reference to it's contents. - // - // The `return_value` is only written here and only read within - // `StackJob::return_value`, and then only after the latch has been set. - // The latch has not been set, and this function is called at most once, - // so no concurrent access can occur. - unsafe { (*return_value).write(result) }; - // This synchronizes with the `Acquire` fence within `return_value()`, - // establishing a happens-before relationship that makes the preceding - // `return_value` write visible to the reader. + pub unsafe fn unwrap_output(mut self) -> T { + // Synchronize with the fence in `StackJob::execute`, establishing a + // happens-after relationship with the following read. + fence(Ordering::Acquire); + // SAFETY: For this access to be valid, we must first establish that + // we have exclusive access to `data`. 
Only three other functions access + // `data`, and none of them can race with this function: // - // This is required because latches do not synchronize memory. - fence(Ordering::Release); - // SAFETY: The caller ensures the job is valid until the latch is set. - // Since the latch is a field of the job, the latch must be valid until - // it is set. - unsafe { Latch::set(&this.completed) }; - // Forget the abort guard, re-enabling panics. - core::mem::forget(abort_guard); - } -} - -// ----------------------------------------------------------------------------- -// Stack allocated work function on a non-worker thread - -/// Like [`StackJob`] but allocated on the stack of a non-worker thread. While -/// this job is pending, the owning thread is fully blocked. -#[cfg(not(feature = "shuttle"))] -pub struct ExternalJob { - f: UnsafeCell>, - completed: AtomicU32, - return_value: UnsafeCell>>, -} - -#[cfg(not(feature = "shuttle"))] -impl ExternalJob -where - F: FnOnce(&Worker) -> T + Send, - T: Send, -{ - /// Creates a new `ExternalJob`. - #[inline(always)] - pub fn new(f: F) -> ExternalJob { - ExternalJob { - f: UnsafeCell::new(ManuallyDrop::new(f)), - completed: AtomicU32::new(0), - return_value: UnsafeCell::new(MaybeUninit::uninit()), - } - } - - /// Creates a `JobRef` pointing to this job. The underlying `ExternalJob` is - /// not dropped after the `JobRef` is executed. - /// - /// # Safety - /// - /// The caller must ensure that: - /// - /// * The `ExternalJob` will not move or be deallocated until the `JobRef` - /// is executed. - /// - /// * The `JobRef` does not outlive any data the `ExternalJob` closes over. - /// - /// * This function is not called again so long as the `JobRef` lives. - #[inline(always)] - pub unsafe fn as_job_ref(&self) -> JobRef { - let job_pointer = NonNull::from(self).cast(); - // SAFETY: The `job_pointer` is trivially aligned and non-null, - // because it is derived from a reference. 
- // - // The caller must not allow the `ExternalJob` to move or be deallocated - // until the `JobRef` is executed. This guarantees that `job_pointer` - // remains valid for the lifetime of `JobRef`, satisfying the - // requirements of `JobRef::new_raw`. - // - // The caller guarantees that this function is not called again while - // `JobRef` lives, so `Self::execute` can be called at most once for - // this particular `ExternalJob`. This satisfies the at-most-once - // execution invariant documented on `Job::execute`. - unsafe { JobRef::new_raw(job_pointer, Self::execute) } + // * Since `check` has returned `Ok`, and the latch is only set in + // `StackJob::execute`, `StackJob::execute` must have been called at + // least once. `StackJob::execute` may be called at most once, so it + // cannot be running now. + // + // * Since this function and `unwrap_func` both consume the `StackJob`, + // and a `StackJob` cannot be duplicated, `unwrap_func` cannot be + // running now. + // + // * Since this function and `unwrap_error` both consume the `StackJob`, + // and a `StackJob` cannot be duplicated, `unwrap_error` cannot be + // running now. + // + // Next, we must establish that it is valid to read from union field + // `output`. We know this because `check` returned `Ok`, which means + // `set` was called with a false `error_flag` within + // `StackJob::execute`. This always follows a write to union field + // `output`, after which the union is not written to again. + let output_ref = unsafe { &mut self.data.get_mut().output }; + // SAFETY: The `StackJob` is dropped at the end of this block, so `data` + // is never accessed again. + unsafe { ManuallyDrop::take(output_ref) } } - /// Waits for the `ExternalJob` to be executed and returns the result. + /// Unwraps the job into an error. /// /// # Safety /// - /// This must be called at most once. + /// This may only be called if a `check` on the enclosed latch has returned + /// `Error`. 
#[inline(always)] - pub unsafe fn wait_for_value(&mut self) -> ThreadResult { - // Wait for the complete flag to be set. - loop { - atomic_wait::wait(&self.completed, 0); - if self.completed.load(Ordering::Relaxed) == 1 { - break; - } - } - // Synchronize memory; we do this with a fence, so that we only do a - // relaxed load in the case of a spurious wakeup. + pub unsafe fn unwrap_error(mut self) -> Box { + // Synchronize with the fence in `StackJob::execute`, establishing a + // happens-after relationship with the following read. fence(Ordering::Acquire); - // Get a ref to the result. - let result_ref = self.return_value.get_mut(); - // SAFETY: `assume_init_read` requires: + // SAFETY: For this access to be valid, we must first establish that + // we have exclusive access to `data`. Only three other functions access + // `data`, and none of them can race with this function: // - // * The `MaybeUninit` is fully initialized. + // * Since `check` has returned `Error`, and the latch is only set in + // `StackJob::execute`, `StackJob::execute` must have been called at + // least once. `StackJob::execute` may be called at most once, so it + // cannot be running now. // - // As this can only be called if we have observed that `completed` has - // been set to 1, and that only happens at the end of - // `ExternalJob::execute` (after `return_value` is written and memory - // is synchronized via the above fence) the memory must be initialized. + // * Since this function and `unwrap_func` both consume the `StackJob`, + // and a `StackJob` cannot be duplicated, `unwrap_func` cannot be + // running now. // - // * That data not be incorrectly duplicated by repeated calls. + // * Since this function and `unwrap_output` both consume the `StackJob`, + // and a `StackJob` cannot be duplicated, `unwrap_output` cannot be + // running now. // - // Data is not duplicated because this function is called at most - // once. 
- unsafe { result_ref.assume_init_read() } + // Next, we must establish that it is valid to read from union field + // `error`. We know this because `check` returned `Error`, which means + // `set` was called with a true `error_flag` within `StackJob::execute`. + // This always follows a write to union field `error`, after which the + // union is not written to again. + let error_ref = unsafe { &mut self.data.get_mut().error }; + // SAFETY: The `StackJob` is dropped at the end of this block, so `data` + // is never accessed again. + unsafe { ManuallyDrop::take(error_ref) } } -} -#[cfg(not(feature = "shuttle"))] -impl Job for ExternalJob -where - F: FnOnce(&Worker) -> T + Send, - T: Send, -{ - /// Executes an `ExternalJob` from a const pointer. + /// Executes a `StackJob` from a const pointer. /// /// # Safety /// /// The caller must ensure that: /// - /// * `this` is a non-null, properly aligned pointer to a live instance - /// of `ExternalJob`. - /// - /// * The `ExternalJob` will not move or be deallocated for as long as - /// `completed` remains set to 0. + /// * `this` is an aligned pointer to an initialized `StackJob`, which + /// will not be invalidated until a `check` on the latch it contains + /// returns something other than `Pending`. /// - /// * This function is called at most once for a given `ExternalJob`. + /// * This function is called at most once on any `StackJob`. #[inline(always)] unsafe fn execute(this: NonNull<()>, worker: &Worker) { - // SAFETY: The caller ensures `this` can be converted into an immutable - // reference until we set the `complete` atomic. + // SAFETY: The pointer `this` is non-null, aligned, and the caller + // ensures it points to an initialized `StackJob`. + // + // `StackJobs` are always accessed immutably except for `unwrap_func`, + // `unwrap_output`, and `unwrap_error`. The caller ensures these will not + // race this call, so the pointer is valid for immutable access. 
let this = unsafe { this.cast::().as_ref() }; // Create an abort guard. If the closure panics, this will convert the // panic into an abort. Doing so prevents use-after-free for other // elements of the stack. let abort_guard = unwind::AbortOnDrop; - // SAFETY: `f` is a `UnsafeCell>`. Creating a - // `&mut ManuallyDrop` is only sound so long as no other live - // references exist. - // - // Since this field is never access mutably except for here and this - // function is called at most once, exclusive access is guaranteed. - let f_ref = unsafe { &mut *this.f.get() }; - // SAFETY: `ManuallyDrop` requires us to ensure that it is not used - // again after we `take()` it's contents. - // - // Since it is not used in the remainder of this function, and this - // function is called at most once, it is indeed not used again. - let f = unsafe { ManuallyDrop::take(f_ref) }; - // Run the job. If the job panics, we propagate the panic back to the - // main thread. - let result = unwind::halt_unwinding(|| f(worker)); - // Get the uninitialized memory where we should put the return value. - let return_value = this.return_value.get(); - // SAFETY: Writing to this unsafe cell requires that no other thread - // holds a reference to it's contents. - // - // The `return_value` is only read within `ExternalJob::wait_for_value`, - // and then only after `completed` is set to 1. Since this function is - // called at most once, `completed` must still be set to 0. Therefore no - // concurrent access can occur. - unsafe { (*return_value).write(result) }; - // Set `completed` to 1, allowing reads of the return value. This - // `Release` store synchronizes with the `Acquire` fence in - // `ExternalJob::wait_for_value`, establishing a happens-before - // relationship that makes the preceding `return_value` write visible - // to the waiting reader. - this.completed.store(1, Ordering::Release); - // Notify the waiting thread that the job is complete. 
- atomic_wait::wake_one(&this.completed); + // Run the function and record the result. Produces a boolean flag that + // is true in the event of a panic. + let error_flag = { + // SAFETY: Only `unwrap_func`, `unwrap_output` and `unwrap_error` + // access `data`. Due to their individual safety contracts, they can + // only be called in a way that will not race with this function, so + // we must have unique access. + let data_ref = unsafe { &mut *this.data.get() }; + // SAFETY: Each `StackJob` is constructed using field `func`, and + // this is the only place we write to the union after construction. + // As this function is called at most once, it must still be valid + // to access the union with field `func`. + let func_ref = unsafe { &mut data_ref.func }; + // SAFETY: The `func` field is overwritten by the following match + // block, so it will not be accessed again. + let func = unsafe { ManuallyDrop::take(func_ref) }; + // Run the job. If the job panics, we propagate the panic back to the + // main thread. + let result = unwind::halt_unwinding(|| func(worker)); + // Emit different signals depending on if the function completed + // successfully or panicked. + match result { + Ok(output) => { + data_ref.output = ManuallyDrop::new(output); + false + } + Err(error) => { + data_ref.error = ManuallyDrop::new(error); + true + } + } + }; + // This synchronizes with the `Acquire` fence on the reading side (e.g. + // in `unwrap_error`), establishing a happens-before relationship that + // makes the preceding write to `data` visible to the reader. + // + // This is required because latches do not synchronize memory. + fence(Ordering::Release); + // SAFETY: This casts a reference to a raw pointer, which means the + // pointer must be aligned, non-null, and point to an initialized latch. + // + // We also meet Variant 2 of the `set` safety contract: + // + // * The latch has not been `set` since it was created or last `reset`, + // and calls to `set` do not race. 
+ // + // This is the only place where this latch is set, and the caller + // ensures this is called at most once. Therefore `set` cannot have + // been called already, and there can be no other calls to `set` that + // would race with this one. + // + // * The latch will not be dropped or moved until after `check` returns + // something other than `Pending`. + // + // The caller ensures that this `StackJob` is not dropped until a + // `check` on the latch returns something other than `Pending`, and + // nothing removes the latch from the `StackJob`. + unsafe { Latch::set(&this.completed, error_flag) }; // Forget the abort guard, re-enabling panics. core::mem::forget(abort_guard); } } -#[cfg(feature = "shuttle")] -pub struct ExternalJob { - f: UnsafeCell>, - mutex: shuttle::sync::Mutex>>, - condvar: shuttle::sync::Condvar, -} - -#[cfg(feature = "shuttle")] -impl ExternalJob -where - F: FnOnce(&Worker) -> T + Send, - T: Send, -{ - /// Creates a new `ExternalJob`. - #[inline(always)] - pub fn new(f: F) -> ExternalJob { - ExternalJob { - f: UnsafeCell::new(ManuallyDrop::new(f)), - mutex: shuttle::sync::Mutex::new(None), - condvar: shuttle::sync::Condvar::new(), - } - } - - #[inline(always)] - #[allow(clippy::undocumented_unsafe_blocks)] - pub unsafe fn as_job_ref(&self) -> JobRef { - let job_pointer = NonNull::from(self).cast(); - unsafe { JobRef::new_raw(job_pointer, Self::execute) } - } - - #[inline(always)] - pub unsafe fn wait_for_value(&mut self) -> ThreadResult { - let mut value = self.mutex.lock().unwrap(); - while value.is_none() { - value = self.condvar.wait(value).unwrap(); - } - Option::take(&mut value).unwrap() - } -} - -#[cfg(feature = "shuttle")] -impl Job for ExternalJob -where - F: FnOnce(&Worker) -> T + Send, - T: Send, -{ - #[inline(always)] - #[allow(clippy::undocumented_unsafe_blocks)] - unsafe fn execute(this: NonNull<()>, worker: &Worker) { - let this = unsafe { this.cast::().as_ref() }; - let abort_guard = unwind::AbortOnDrop; - let f_ref = 
unsafe { &mut *this.f.get() }; - let f = unsafe { ManuallyDrop::take(f_ref) }; - let result = unwind::halt_unwinding(|| f(worker)); - let mut value = this.mutex.lock().unwrap(); - *value = Some(result); - this.condvar.notify_one(); - core::mem::forget(abort_guard); - } -} - // ----------------------------------------------------------------------------- // Heap allocated work function @@ -667,7 +523,7 @@ pub struct HeapJob { impl HeapJob where - F: FnOnce(&Worker) + Send, + F: FnOnce(&Worker), { /// Allocates a new `HeapJob` on the heap. #[inline(always)] @@ -676,49 +532,74 @@ where } /// Converts the heap job into an "owning" `JobRef`. The job will be - /// automatically dropped when the `JobRef` is executed. - /// - /// This will leak memory if the `JobRef` is not executed, so the caller - /// must ensure that it is eventually executed (unless the process is - /// exiting). + /// automatically dropped when the `JobRef` is executed (or will leak if it + /// is not executed). /// /// # Safety /// - /// If the `JobRef` is executed, the caller must ensure that it has not - /// outlived the data it closes over. In other words, if the closure - /// references something, that thing must live until the `JobRef` is - /// executed or dropped. + /// The caller must ensure that: + /// + /// * The `JobRef` will not outlive any of the items closed over by the + /// function `f`. + /// + /// * If `f` is `!Send` then `JobRef::execute` is only called on the thread + /// where the `HeapJob` was constructed. #[inline(always)] pub unsafe fn into_job_ref(self: Box) -> JobRef { // SAFETY: Pointers produced by `Box::into_raw` are never null. - let job_pointer = unsafe { NonNull::new_unchecked(Box::into_raw(self)).cast() }; - - // SAFETY: The pointer was created by a call to `Box::into_raw` so it is - // valid to pass in to `Self::execute`. 
+ let job_pointer = + unsafe { NonNull::new_unchecked(Box::into_raw(self)).cast() }; + // SAFETY: The doc-comment for this function defines the conditions + // under which this `JobRef` will be considered "executable". + // + // We must now show that it is sound to call `HeapJob::execute` on + // `job_ref` under these conditions, which in turn requires that: + // + // * `job_pointer` is an aligned pointer to an initialized `Box`. + // + // We created it from a ref to `self`, which is a `Box`, so it + // must be. + // + // * `HeapJob::execute` is called at most once on any `HeapJob`. + // + // `into_job_ref` converts the `HeapJob` into a `JobRef`, and + // `JobRef::execute` consumes the `JobRef` to call `HeapJob::execute`, + // so it can be called at most once. + // + // * This function is only called during the lifetime of the items + // closed over by the function. + // + // The `JobRef` is not allowed to outlive the items closed over by the + // function, so `JobRef::execute` and hence `HeapJob::execute` can + // only be called during that interval. + // + // * Accessing `f` will not violate a `!Send` requirement. // - // Because this function takes ownership of `Self` to produce a - // `JobRef`, `JobRef::execute` takes ownership of the `JobRef` to call - // `Self::execute`, the job_pointer cannot be used after `Self::execute` - // is called. So it is safe for the pointer to become dangling. - unsafe { JobRef::new_raw(job_pointer, Self::execute) } + // This is ensured by the executability condition. + unsafe { JobRef::new(job_pointer, Self::execute) } } -} -impl Job for HeapJob -where - F: FnOnce(&Worker) + Send, -{ /// Executes a `Box`, dropping it when completed. /// /// # Safety /// - /// The caller must ensure that `this` is a pointer, created by calling - /// `Box::into_raw` on a `Box>`. After the call `this` must be - /// treated as dangling. + /// The caller must ensure that: + /// + /// * `this` is an aligned pointer to an initialized `HeapJob`. 
+ /// + /// * This function is called at most once on any `HeapJob`. + /// + /// * Any items the `HeapJob` closes over are still live. + /// + /// * If the `HeapJob` is `!Send` then this is called on the thread where + /// the `HeapJob` was constructed. #[inline(always)] unsafe fn execute(this: NonNull<()>, worker: &Worker) { - // SAFETY: The caller ensures `this` was created by `Box::into_raw` and - // that this is called only once. + // SAFETY: The caller ensures that: + // + // * `this` was created by `Box::into_raw`. + // + // * This function is called at most once. let this = unsafe { Box::from_raw(this.cast::().as_ptr()) }; // Run the job. (this.f)(worker); diff --git a/src/latch.rs b/src/latch.rs index 425a9b8..e03ee26 100644 --- a/src/latch.rs +++ b/src/latch.rs @@ -1,11 +1,8 @@ -//! Forte borrows the *latch* concept from Rayon. +//! Forte borrows the *latch* concept from Rayon. Every forte worker thread has +//! a single binary semaphore, used for parking and unparking the thread. //! -//! Every forte worker thread has a single "sleep controller" that it uses to -//! park and unpark itself. Latches build on this to create a simple boolean -//! switch, which allows the owning thread to sleep until the latch becomes set -//! by another thread. -//! -//! Every latch points at one "sleep controller". +//! Latches build on top of semaphores; allowing workers to wait for specific +//! events, while also allowing wakeups from other sources on the semaphore. use alloc::task::Wake; use core::borrow::Borrow; @@ -17,14 +14,18 @@ use crate::platform::*; /// The default state of a latch is `LOCKED`. When in the locked state, `check` /// returns `false` and `wait` blocks. -const LOCKED: u32 = 0b00; +const LOCKED: u32 = 0b000; + +/// The latch enters the `SIGNAL` state when it is set (with error flag false). +/// When in this state, `check` returns `Status::Ok` and `wait` does not block. +const SIGNAL: u32 = 0b001; -/// The latch enters the `SIGNAL` state when it is set. 
When in this state, -/// `check` returns `true` and `wait` does not block. -const SIGNAL: u32 = 0b01; +/// The latch enters the `ERROR` state when it is set (with error flag true). +/// When in this state, `check` returns `Status::Error` and `wait` does not block. +const ERROR: u32 = 0b010; /// The latch enters the `ASLEEP` state when blocking with `wait`. -const ASLEEP: u32 = 0b10; +const ASLEEP: u32 = 0b100; // ----------------------------------------------------------------------------- // Latch @@ -33,8 +34,8 @@ const ASLEEP: u32 = 0b10; /// occurred. The latch begins as *unset* (In the `LOCKED` state), and can later /// be *set* by any thread (entering the `SIGNAL`) state. /// -/// Each latch is associated with one *owner thread*. This is the thread that -/// may be blocking, waiting for the latch to complete. +/// Each latch is "owned" by a single thread at a time; other threads may set +/// the latch, but only the owning thread may wait on it. /// /// The general idea and spirit for latches (as well as some of the /// documentation) is due to rayon. However the implementation is specific to @@ -49,38 +50,39 @@ pub struct Latch { /// Holds the internal state of the latch. This tracks if the latch has been /// set or not. state: AtomicU32, - /// Tracks the number of sleeping threads in the pool. - sleeping: &'static AtomicU32, - /// The sleep controller for the owning thread. - sleep_controller: &'static SleepController, - /// The seat number that owns this latch - seat_number: usize, + /// The semaphore that this latch will use for signaling. + semaphore: &'static Semaphore, +} + +pub enum Status { + Pending, + Ok, + Error, } impl Latch { - /// Creates a new latch, owned by a specific thread. - pub fn new( - seat_number: usize, - sleeping: &'static AtomicU32, - sleep_controller: &'static SleepController, - ) -> Latch { + /// Creates a new latch backed by the provided semaphore. 
+ pub fn new(semaphore: &'static Semaphore) -> Latch { Latch { state: AtomicU32::new(LOCKED), - sleeping, - sleep_controller, - seat_number, + semaphore, } } /// Checks to see if the latch has been set. Returns true if it has been. #[inline(always)] - pub fn check(&self) -> bool { - self.state.load(Ordering::Relaxed) == SIGNAL + pub fn check(&self) -> Status { + match self.state.load(Ordering::Relaxed) { + SIGNAL => Status::Ok, + ERROR => Status::Error, + _ => Status::Pending, + } } - /// Puts the thread to sleep if the latch has not been set. The thread will - /// be woken when the latch becomes set, but may also wake before then. The - /// caller should always re-check the latch condition after this returns. + /// Checks if the latch has been set, and if not waits for a signal on the + /// semaphore. This does _not_ wait for the latch to actually become set, + /// and may return early. The caller should always re-check the latch + /// condition after this returns. /// /// # Memory Ordering /// @@ -89,7 +91,7 @@ impl Latch { /// The other thread must issue a corresponding `fence(Ordering::Release)` /// call. #[cold] - pub fn wait(&self) { + pub fn wait(&self, seat_bitmask: u32, waiting_bitmask: &'static AtomicU32) { // First, check if the latch has been set. // // In the event of a race with `set`: @@ -97,21 +99,24 @@ impl Latch { // * If this happens before the store, then we will go to sleep. // // * If this happens after the store, then we notice and return. - if self.state.load(Ordering::Relaxed) == SIGNAL { + if self.state.load(Ordering::Relaxed) & (SIGNAL | ERROR) != 0 { return; } - // If it has not been set, go to sleep. + // If it has not been set, wait for a signal on the semaphore. // - // In the event of a race with `set`, the `wake` will always cause this - // to return regardless of memory ordering. 
- self.sleep_controller.sleep(self.seat_number, self.sleeping); + // In the event of a race with `set`, the call to `Semaphore::signal` + // will always end up unblocking this, no matter the memory-ordering. + self.semaphore.wait(seat_bitmask, waiting_bitmask); } - /// Activates the latch, potentially unblocking the owning thread. + /// Sets the latch, and sends a signal over the semaphore. /// /// This takes a raw pointer because the latch may be de-allocated by a /// different thread while this function is executing. /// + /// When `error` is set to `true`, `check` returns `Status::Error` rather + /// than `Status::Ok`. + /// /// # Memory Ordering /// /// This does not synchronize memory. To synchronize memory with the waiting @@ -120,33 +125,61 @@ impl Latch { /// /// # Safety /// - /// The latch pointer must be valid when passed to this function. After this - /// call, the latch pointer may become dangling and must not be dereferenced - /// unless it is known to still be valid. + /// The caller must ensure that: + /// + /// * `latch` is a non-null, aligned pointer to an initialized `Latch`. + /// + /// * Additionally, one of the following condition variants must be met: + /// + /// 1. The latch will not be dropped or moved for the duration of `set`. + /// + /// 2. The latch has not been `set` since it was created or last `reset`, + /// calls to `set` do not race, and the latch will not be dropped or + /// moved until after `check` returns something other than `Pending`. #[inline(always)] - pub unsafe fn set(latch: *const Latch) { - // SAFETY: The caller guarantees the latch remain alive until `set` - // returns. - let latch = unsafe { &*latch }; - let sleep_controller = latch.sleep_controller; - // First we set the state to true. + pub unsafe fn set(latch: *const Latch, error: bool) { + // First we store a reference to the semaphore (which is 'static) so + // that we can access it even if the latch pointer becomes dangling. 
+ // + // SAFETY: The caller guarantees the latch pointer is aligned and non-null. + // + // If Variant 1 is met, the latch cannot be dangling. + // + // If Variant 2 is met, the latch cannot become dangling so long as the + // state is `LOCKED` (because `check` will return `Pending`). Since there + // can have been no previous call to `set` since construction or the + // last `reset`, and there can be no racing calls to `set`, the state + // must be `LOCKED`. Therefore the latch cannot be dangling. + // + // Since this pointer is aligned, non-null, is not dangling, and the + // latch is never accessed mutably, it is valid to access immutably. + let semaphore = unsafe { (*latch).semaphore }; + // Determine the next state for the latch. + let state = if error { ERROR } else { SIGNAL }; + // Next we update the state. // // In the event of a race with `wait`, this may cause `wait` to return. - // Otherwise the other thread will sleep within `wait. - latch.state.store(SIGNAL, Ordering::Relaxed); - // We must try to wake the other thread, just in case it missed the - // notification and went to sleep. This guarantees that the other thread - // will make progress. - sleep_controller.wake(); + // Otherwise the other thread will sleep within `wait`. + // + // SAFETY: The latch is still valid to access immutably, following the + // same logic as above. + // + // NOTE: This store will mean `check` no longer returns `Pending`, + // invalidating the argument in Variant 2. The latch pointer therefore + // may become dangling after this line. + unsafe { (*latch).state.store(state, Ordering::Relaxed) }; + // Finally we try to signal the target thread on its semaphore, just in + // case it missed the notification and is currently waiting. This + // guarantees that the other thread will make progress. + semaphore.signal(); } /// Restores the latch to the default state. 
/// /// # Deadlocks /// - /// This may only be called by the thread that "owns" the latch, and only - /// after it has *observed* the latch entering the `SIGNAL` state, e.g. - /// after either `wait` or `check` has returned `true`. + /// This must only be called by the thread that "owns" the latch, and only + /// after it has *observed* `check` return something other than `Pending`. /// /// Calling `reset` from a different thread or before observing the signal /// is likely to result in deadlocks. @@ -157,30 +190,28 @@ impl Latch { } // ----------------------------------------------------------------------------- -// Sleeper +// Signals -/// Used, in combination with a latch to park and unpark threads. -#[cfg(not(feature = "shuttle"))] -pub struct SleepController { +/// A low-overhead binary semaphore used for task signaling. +pub struct Semaphore { state: AtomicU32, } -#[cfg(not(feature = "shuttle"))] -impl SleepController { - /// Creates a new sleep controller. +impl Semaphore { + /// Creates a new semaphore. pub const fn new() -> Self { - SleepController { + Semaphore { state: AtomicU32::new(LOCKED), } } - /// Attempt to wake the thread to which this belongs. + /// Sends a signal to the semaphore. /// - /// Returns true if this allows the thread to make progress (by waking it up - /// or catching it before it goes to sleep) and false if the thread was - /// running. + /// Returns true if this allows a waiting thread to make progress (by + /// signaling it while it was waiting or catching it before it started + /// waiting) and false if the thread was running. #[inline(always)] - pub fn wake(&self) -> bool { + pub fn signal(&self) -> bool { // Set the state to SIGNAL and read the current state, which must be // either LOCKED, ASLEEP or SIGNAL. let sleep_state = self.state.swap(SIGNAL, Ordering::Relaxed); @@ -202,22 +233,26 @@ impl SleepController { sleep_state == ASLEEP } - /// Attempt to send the thread to sleep. 
This should only be called on a - /// single thread, and we say that this controller "belongs" to that thread. + /// Waits for a signal on the semaphore. /// - /// Returns true if this thread makes a syscall to suspend the thread, and - /// false if the thread was already woken (letting us skip the syscall). + /// Calls to `wait` should be fully ordered. In other words, this method + /// must not be called on the same value by two different threads unless a + /// "happens-before relationship" has been established between the calls via + /// memory synchronization. #[cold] - pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) { - // Set the state to ASLEEP and read the current state, which must be - // either LOCKED or SIGNAL. + pub fn wait(&self, seat_mask: u32, waiting_bitmask: &'static AtomicU32) { + // Set the state to ASLEEP and read the current state. let state = self.state.swap(ASLEEP, Ordering::Relaxed); + // The previous state should not have been ASLEEP, because calls to + // `wait` must be fully ordered, and the state is only set to ASLEEP + // while `wait` is executing. + // // If the state is LOCKED, then we have not yet received a signal, and // we should try to put the thread to sleep. Otherwise we should return // early. if state == LOCKED { // Set the sleeping bit for this worker. - sleeping.fetch_or(1 << seat_number, Ordering::Relaxed); + waiting_bitmask.fetch_or(seat_mask, Ordering::Relaxed); // If we have received a signal since entering the sleep state // (meaning the state is no longer set to ASLEEP) then this will // return immediately. @@ -228,7 +263,7 @@ impl SleepController { // Either way, there is no way we can fail to receive a `wake`. atomic_wait::wait(&self.state, ASLEEP); // Clear the sleeping bit for this worker. - sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed); + waiting_bitmask.fetch_and(!seat_mask, Ordering::Relaxed); } // Set the state back to LOCKED so that we are ready to receive new // signals. 
@@ -236,62 +271,19 @@ impl SleepController { } } -// ----------------------------------------------------------------------------- -// Shuttle sleeper fallback - -/// This is a fallback implementation because the futex api is not available on -/// shuttle. -#[cfg(feature = "shuttle")] -pub struct SleepController { - state: Mutex, - condvar: Condvar, -} - -#[cfg(feature = "shuttle")] -impl SleepController { - pub fn new() -> Self { - SleepController { - state: Mutex::new(LOCKED), - condvar: Condvar::new(), - } - } - - pub fn wake(&self) -> bool { - let state = core::mem::replace(&mut *self.state.lock().unwrap(), SIGNAL); - let asleep = state == ASLEEP; - if asleep { - self.condvar.notify_one(); - } - asleep - } - - pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) { - let mut state = self.state.lock().unwrap(); - if *state == LOCKED { - *state = ASLEEP; - sleeping.fetch_or(1 << seat_number, Ordering::Relaxed); - while *state == ASLEEP { - state = self.condvar.wait(state).unwrap(); - } - sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed); - } - *state = LOCKED; - } -} - // ----------------------------------------------------------------------------- // Async wakers impl Wake for Latch { fn wake(self: Arc) { // SAFETY: The borrowed `Arc` is held for the duration of this call, - // keeping the `Latch` alive. - unsafe { Latch::set(self.borrow()) }; + // keeping the `Latch` alive, and satisfying Variant 1 of `Latch::set`. + unsafe { Latch::set(self.borrow(), false) }; } fn wake_by_ref(self: &Arc) { // SAFETY: The borrowed `Arc` is held for the duration of this call, - // keeping the `Latch` alive. - unsafe { Latch::set(self.borrow()) }; + // keeping the `Latch` alive, and satisfying Variant 1 of `Latch::set`. + unsafe { Latch::set(self.borrow(), false) }; } } diff --git a/src/lib.rs b/src/lib.rs index 7fb6758..6675a7b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,7 +30,7 @@ //! THREAD_POOL.resize_to_available(); //! //! 
// Register this thread as a worker on the pool. -//! THREAD_POOL.expect_worker(|worker| { +//! THREAD_POOL.with_worker(|worker| { //! // Spawn a job onto the pool. The closure also accepts a worker, because the //! // job may be executed on a different thread. This will be the worker for whatever //! // thread it executes on. @@ -130,19 +130,18 @@ //! //! Thread pools are comprised of (and run on) workers, represented as instances //! of the [`Worker`] type. All work done on the pool is done in a "worker -//! context" created by [`Worker::occupy`]. The recommended way to access a -//! worker context for a specific pool is via [`ThreadPool::with_worker`], -//! [`ThreadPool::on_worker`], or [`ThreadPool::expect_worker`]. +//! context" created by [`Membership::activate`]. The recommended way to access a +//! worker context for a specific pool is via [`ThreadPool::with_worker`]. //! //! ``` //! # use forte::ThreadPool; //! # static THREAD_POOL: ThreadPool = ThreadPool::new(); -//! THREAD_POOL.expect_worker(|worker_1| { // <-- Sets up this thread as a worker. -//! THREAD_POOL.expect_worker(|worker_2| { // <-- Returns a reference to the existing worker. +//! THREAD_POOL.with_worker(|worker_1| { // <-- Sets up this thread as a worker. +//! THREAD_POOL.with_worker(|worker_2| { // <-- Returns a reference to the existing worker. //! // These pointers are identical. //! assert!(std::ptr::eq(worker_1, worker_2)); -//! }); // <-- Leaving this scope does nothing. -//! }); // <-- Leaving this scope frees the worker. +//! }); // <-- Leaving this scope does nothing. +//! }); // <-- Leaving this scope frees the worker. //! ``` //! //! Every worker holds a local queue of tasks, as well as metadata that allows @@ -174,40 +173,16 @@ //! //! # Core Operations //! -//! Thread pools support four core operations: +//! Thread pools support five core operations: //! * *Join.* Executes two non-static closures, possibly in parallel, and waits for them to complete. //! 
* *Spawn.* Runs a static closure or future in the background. //! * *Scope.* Runs multiple non-static closures or futures, and waits for them all to complete. //! * *Block on.* Waits for a future to complete (outside of an async context). +//! * *Broadcast.* Runs the same operation across all workers. +//! //! -//! All of these with the exception of *Spawn* are blocking; they have a -//! specific join-point where a thread must wait for all the forks of the -//! parallel operation to complete before proceeding. While it is waiting, -//! threads will attempt to do background work, or help each-other out with -//! their assigned workload. -//! -//! Each operation is available in three different "flavors", depending on the -//! information available at the callsite. -//! -//! | Operation | Headless | Thread pool | Worker | -//! |-----------|----------|-------------|--------| -//! | *Join* | [`join()`] | [`ThreadPool::join()`] | [`Worker::join()`] -//! | *Spawn* | [`spawn()`] | [`ThreadPool::spawn()`] | [`Worker::spawn()`] -//! | *Scope* | [`scope()`] | [`ThreadPool::scope()`] | [`Worker::scope()`] -//! | *Block on* | [`block_on()`] | [`ThreadPool::block_on()`] | [`Worker::block_on()`] -//! -//! * *Headless.* Looks for an existing worker context, and panics if it doesn't find one. -//! * *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one. -//! * *Worker.* Uses the provided worker context. -//! -//! The headless and thread pool flavors are more or less just aliases for the -//! worker flavor. Where possible, the worker flavor should be preferred to the -//! thread pool flavor, and the thread pool flavor should be preferred to the -//! headless flavor. 
#![no_std] -#![cfg_attr(feature = "shuttle", allow(dead_code))] -#![cfg_attr(feature = "shuttle", allow(unused_imports))] // ----------------------------------------------------------------------------- // Boilerplate for building without the standard library @@ -223,6 +198,7 @@ mod job; mod latch; mod scope; mod thread_pool; +mod time; mod unwind; mod util; @@ -239,68 +215,58 @@ pub struct FutureMarker(); // Top-level exports pub use scope::Scope; -pub use scope::ScopedSpawn; +pub use scope::SpawnScoped; +pub use thread_pool::Broadcast; +pub use thread_pool::DEFAULT_POOL; +pub use thread_pool::DefaultThreadPool; +pub use thread_pool::Membership; pub use thread_pool::Spawn; +pub use thread_pool::SpawnLocal; pub use thread_pool::Task; pub use thread_pool::ThreadPool; pub use thread_pool::Worker; pub use thread_pool::Yield; pub use thread_pool::block_on; +pub use thread_pool::broadcast; pub use thread_pool::join; +pub use thread_pool::num_members; pub use thread_pool::scope; pub use thread_pool::spawn; +pub use thread_pool::spawn_broadcast; // ----------------------------------------------------------------------------- // Platform Support -// This crate uses `shuttle` for testing, which requires mocking all of the core -// threading primitives (`Mutex` and the like). -// -// To make things a bit simpler, we re-export all the important types in the -// `primitives` module. - -#[cfg(not(feature = "shuttle"))] +// This exists to make it easy to swap out the basic parallelism primitives. +// Currently there are no alternative implementations, but there may be in +// future. 
mod platform { - // Core exports - - pub use alloc::sync::Arc; pub use core::sync::atomic::AtomicBool; pub use core::sync::atomic::AtomicPtr; pub use core::sync::atomic::AtomicU32; + pub use core::sync::atomic::AtomicUsize; pub use core::sync::atomic::Ordering; + pub use core::sync::atomic::fence; + + pub use alloc::sync::Arc; pub use std::sync::Mutex; - pub use std::sync::OnceLock; pub use std::thread::Builder as ThreadBuilder; pub use std::thread::JoinHandle; - pub use std::thread::available_parallelism; pub use std::thread_local; -} - -#[cfg(feature = "shuttle")] -mod platform { - - // Core exports - pub use std::sync::OnceLock; // shuttle has no OnceLock; std's version is fine here + pub use std::thread::available_parallelism; - pub use shuttle::rand::Rng; - pub use shuttle::rand::thread_rng; - pub use shuttle::sync::Arc; - pub use shuttle::sync::Condvar; - pub use shuttle::sync::Mutex; - pub use shuttle::sync::Weak; - pub use shuttle::sync::atomic::AtomicBool; - pub use shuttle::sync::atomic::AtomicPtr; - pub use shuttle::sync::atomic::AtomicU32; - pub use shuttle::sync::atomic::Ordering; - pub use shuttle::thread::Builder as ThreadBuilder; - pub use shuttle::thread::JoinHandle; - pub use shuttle::thread_local; + use std::sync::LazyLock; + pub struct Lazy(LazyLock); - // Available parallelism + impl Lazy { + pub const fn new(init: fn() -> T) -> Self { + Lazy(LazyLock::new(init)) + } - pub fn available_parallelism() -> std::io::Result> { - panic!("available_parallelism does not work on shuttle"); + pub fn get(&'static self) -> &'static T { + LazyLock::force(&self.0) + } } } diff --git a/src/scope.rs b/src/scope.rs index 03c3bb7..6f41229 100644 --- a/src/scope.rs +++ b/src/scope.rs @@ -11,7 +11,6 @@ use core::mem::ManuallyDrop; use core::pin::Pin; use core::ptr; use core::ptr::NonNull; -use core::sync::atomic::fence; use core::task::Context; use core::task::Poll; use core::task::RawWaker; @@ -28,9 +27,11 @@ use crate::job::HeapJob; use crate::job::JobRef; use 
crate::latch::Latch; use crate::platform::*; +use crate::thread_pool::Broadcast; use crate::thread_pool::Worker; use crate::unwind; use crate::unwind::AbortOnDrop; +use crate::util::IterBits; // ----------------------------------------------------------------------------- // Scope @@ -59,8 +60,10 @@ use crate::unwind::AbortOnDrop; /// /// The `'env: 'scope` bound is part of the definition of the `Scope` type. The /// requirement that scoped work outlive `'scope` is part of the definition of -/// the [`ScopedSpawn`] trait. +/// the [`SpawnScoped`] trait. pub struct Scope<'scope, 'env: 'scope> { + /// The thread-pool this scope is attached to. + thread_pool: &'static ThreadPool, /// Number of active references to the scope (including the owning /// allocation). This is incremented each time a new `ScopePtr` is created, /// and decremented when a `ScopePtr` is dropped or the owning thread is @@ -110,22 +113,15 @@ where F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, { let abort_guard = AbortOnDrop; - // SAFETY: `Scope::new` requires: - // - // 1. The `Scope` is never moved after initialization. - // - // 2. `complete` is called exactly once before the `Scope` is dropped. - // - // The scope is not moved in this function, and since no `&mut Scope` - // reference is allowed to escape, the caller cannot safely cause the scope - // to move either. - // - // `Scope::complete` is called unconditionally on the line below, before - // the implicit drop of `scope`. If the closure `f` panics, it is caught and - // re-emitted after `complete` finishes. In the event of an uncaught panic, - // we cannot ensure `complete` runs properly before the scope is dropped, so - // we force an abort via an `AbortOnDrop` guard. - let scope = unsafe { Scope::new(worker) }; + // Create a new scope object on the stack. 
+ let scope = Scope { + thread_pool: worker.thread_pool(), + count: AtomicU32::new(1), + completed: worker.new_latch(), + panic: AtomicPtr::new(ptr::null_mut()), + _scope: PhantomData, + _env: PhantomData, + }; // Panics that occur within the closure should be caught and propagated once // all spawned work is complete. This is not a safety requirement, it's just // a nicer behavior than aborting. @@ -139,8 +135,8 @@ where // Now that the user has (presumably) spawned some work onto the scope, we // must wait for it to complete. // - // SAFETY: This is called only once, and we provide the same worker used to - // create the scope. + // SAFETY: This is called only once within this function, and then the scope + // is dropped. unsafe { scope.complete(worker) }; // At this point all work on the scope is complete, so it is safe to drop // the scope. This also means we can relinquish our abort guard (returning @@ -153,44 +149,18 @@ where } impl<'scope, 'env> Scope<'scope, 'env> { - /// Creates a new scope - /// - /// # Safety + /// Runs a closure or future sometime before the scope completes. /// - /// The caller must ensure: - /// - /// * The `Scope` is never moved after creation. `ScopePtr::new` captures a - /// raw `*const Scope` pointer, and spawned jobs hold onto these pointers - /// until they complete. Moving the scope would invalidate these pointers - /// and cause UB when any `ScopePtr` is dropped or used for scope access. - /// - /// * `complete` is called exactly once before the `Scope` is dropped, after - /// which no `ScopePtr` may be created for this scope. `complete` blocks - /// until the reference count ticks down to zero, ensuring that the scope - /// outlives all `ScopePtr` references. Failing to call `complete` may - /// result in dangling `ScopePtr` and produce use-after-free. 
- unsafe fn new(worker: &Worker) -> Scope<'scope, 'env> { - Scope { - count: AtomicU32::new(1), - completed: worker.new_latch(), - panic: AtomicPtr::new(ptr::null_mut()), - _scope: PhantomData, - _env: PhantomData, - } - } - - /// Runs a closure or future sometime before the scope completes. Valid - /// inputs to this method are: + /// This is like [`Worker::spawn`], but allows the work to borrow local data. + /// Valid inputs to this method are: /// /// * A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type. /// /// * A `Future` future, with no return type. - /// - /// # Panics - /// - /// If not in a worker, this panics. - pub fn spawn>(&'scope self, scoped_work: S) { - Worker::with_current(|worker| scoped_work.spawn_on(worker.unwrap(), self)); + pub fn spawn>(&'scope self, scoped_work: S) { + self.thread_pool.get_worker(|worker| { + scoped_work.spawn_scoped(self, worker); + }); } /// Runs a closure or future sometime before the scope completes. Valid @@ -201,8 +171,93 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// * A `Future` future, with no return type. /// /// Unlike [`Scope::spawn`], this accepts the current worker as a parameter. - pub fn spawn_on>(&'scope self, worker: &Worker, scoped_work: S) { - scoped_work.spawn_on(worker, self); + pub fn spawn_on>( + &'scope self, + worker: &Worker, + scoped_work: S, + ) { + scoped_work.spawn_scoped(self, Some(worker)); + } + + /// Runs an operation across multiple threads. + /// + /// This is like [`Worker::spawn_broadcast`], but allows the work to borrow + /// local data. + pub fn spawn_broadcast(&'scope self, f: F) + where + F: for<'worker> Fn(Broadcast<'worker>) + Send + Sync + 'scope, + { + // Prevent workers from leaving the pool, and read the membership bitset + // once it's frozen. 
+ let members = self.thread_pool.freeze_membership(); + let participants = members.count_ones() as usize; + + // We are going to spawn a job for every participant, and need to keep + // the scope alive while that completes. For the sake of efficiency, + // we'll increment the counter for all of these jobs in one single + // operation. + self.count.fetch_add(participants as u32, Ordering::Relaxed); // (*) + + // Create a new job for each member + let member_data = self.thread_pool.get_member_data(); + for (i, member_index) in members.iter_bits().enumerate() { + let func = &f; + let scope = self; + let op = move |worker: &Worker| { + // Run the job + let result = unwind::halt_unwinding(|| { + func(Broadcast { + worker, + index: i, + participants, + }); + }); + // If the operation panics on any thread, write the panic out to + // the scope panic slot. + if let Err(err) = result { + scope.store_panic(err); + }; + // SAFETY: This corresponds to one of the increments performed in + // a batch with the `fetch_add` at the start of this function. + // It was incremented `p` times, and this will be called `p` + // times, as the workers complete the `p` broadcast jobs. + unsafe { scope.remove_reference() }; + }; + + let job = HeapJob::new(op); + + // SAFETY: `HeapJob::into_job_ref` requires: + // + // * The `JobRef` will not outlive any of the items closed over by + // the `op`. + // + // The only non-copy captures are `scope: &'scope Scope` and + // `func: &&F + 'scope`, so we must show that the `JobRef` will + // not outlive `'scope`. + // + // This is ensured via the scope's lifetime extension logic: we + // incremented the scope's counter on the line marked with (*), + // and we know the scope will not complete (extending the lifetime + // of `'scope`) until there is a corresponding call to + // `remove_reference()`. + // + // All `n` added references are not removed until the op has run + // `n` times on `n` threads, so the `op` cannot outlive the data + // it borrows. 
+ // + // * If `op` is `!Send` then `JobRef::execute` will only be called + // on this thread. + // + // `op` is unconditionally `Send`. Since `F` and `Scope` are + // `Sync`, the enclosed references `&Scope` and `&&F` are `Send`. + let job_ref = unsafe { job.into_job_ref() }; + member_data.broadcasts[member_index].push(job_ref); + member_data.semaphores[member_index].signal(); + } + + // Once we have finished pushing jobs out to workers who we know are not + // in the middle of resigning, we can allow resignations again. + self.thread_pool.unfreeze_membership(); } /// Adds an additional reference to the scope's reference counter. @@ -211,25 +266,19 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// `Scope::remove_reference`, or the scope will block forever on /// completion. fn add_reference(&self) { - let counter = self.count.fetch_add(1, Ordering::Relaxed); - tracing::trace!("scope reference counter increased to {}", counter + 1); + self.count.fetch_add(1, Ordering::Relaxed); } /// Removes a reference from the scope's reference counter. /// /// # Safety /// - /// The caller must ensure that each call to `remove_reference` corresponds - /// to exactly one prior call to `add_reference` (or the implicit initial - /// count of 1 provided by `Scope::new`, in the case of `Scope::complete`). - /// - /// If `remove_reference` is called without a matching `add_reference`, the - /// scope latch will be set prematurely, potentially allowing the scope to - /// be freed while a `ScopePtr` still holds a pointer to it. Uses of the - /// `ScopePtr` thereafter may produce use-after-free. + /// The caller must be able to point to a corresponding place where the scope + /// counter was incremented by one. This could be through a call to + /// `add_reference`, a direct `fetch_add` on the underlying counter, or the + /// implicit initial increment the scope starts with. 
unsafe fn remove_reference(&self) { let counter = self.count.fetch_sub(1, Ordering::Relaxed); - tracing::trace!("scope reference counter decreased to {}", counter - 1); if counter == 1 { // Alerts the owning thread that the scope has completed. // @@ -242,7 +291,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { // the latch is set, which happens only here, after the count // reaches zero. Therefore, the `completed` field of this `Scope` // must still be a live latch. - unsafe { Latch::set(&self.completed) }; + unsafe { Latch::set(&self.completed, false) }; } } @@ -271,7 +320,12 @@ impl<'scope, 'env> Scope<'scope, 'env> { // `store_panic` handles the synchronization for it's panic data). if self .panic - .compare_exchange(nil, err_ptr, Ordering::Release, Ordering::Relaxed) + .compare_exchange( + nil, + err_ptr, + Ordering::Release, + Ordering::Relaxed, + ) .is_ok() { // Ownership is now transferred into the panic field. @@ -313,8 +367,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// /// # Safety /// - /// This must be called only once. This must be called with a reference to - /// the same worker the scope was created with. + /// The caller must ensure that this is called at most once. unsafe fn complete(&self, worker: &Worker) { // SAFETY: This is explicitly allowed, because every scope starts off // with a counter of 1. Because this is called only once, the following @@ -351,7 +404,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// THREAD_POOL.scope(|scope| { /// scope.spawn(|_| { }); -/// // ^^^^^^^ the trait `ScopedSpawn<'_, _>` is not implemented for closure ... +/// // ^^^^^^^ the trait `SpawnScoped<'_, _>` is not implemented for closure ... /// }); /// ``` /// Try adding a type hint to the closure's parameters, like so: @@ -364,17 +417,27 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// }); /// ``` /// Hopefully rustc will fix this type inference failure eventually. 
-pub trait ScopedSpawn<'scope, M>: Send + 'scope { - /// Spawns the value of self as scoped work on the provided worker. - fn spawn_on<'env>(self, worker: &Worker, scope: &'scope Scope<'scope, 'env>); +pub trait SpawnScoped<'scope, M>: Send + 'scope { + /// Similar to [`spawn`][crate::Worker::spawn] but adds the work to a + /// [`Scope`]. This work will be polled to completion sometime before the + /// scope completes, and may borrow data that outlives the scope. + fn spawn_scoped<'env>( + self, + scope: &'scope Scope<'scope, 'env>, + worker: Option<&Worker>, + ); } -impl<'scope, F> ScopedSpawn<'scope, FnOnceMarker> for F +impl<'scope, F> SpawnScoped<'scope, FnOnceMarker> for F where F: FnOnce(&Worker) + Send + 'scope, { #[inline] - fn spawn_on<'env>(self, worker: &Worker, scope: &'scope Scope<'scope, 'env>) { + fn spawn_scoped<'env>( + self, + scope: &'scope Scope<'scope, 'env>, + worker: Option<&Worker>, + ) { // Create a job to execute the spawned function in the scope. let scope_ptr = ScopePtr::new(scope); let job = HeapJob::new(move |worker| { @@ -393,22 +456,51 @@ where // keep the calling stack frame alive until this job completes, // effectively extending the lifetime of `'scope` for as long as is // necessary. + + // SAFETY: `HeapJob::into_job_ref` requires: + // + // * The `JobRef` will not outlive any of the items closed over by + // the `op`. + // + // The only non-copy captures are `self` and the scope pointer, both + // of which have lifetime `'scope`. So we must show that the `JobRef` + // will not outlive `'scope`. + // + // This is ensured via the scope's lifetime extension logic: the scope + // will not complete so long as the `scope_ptr` is held, extending the + // lifetime of `'scope` until after the job executes and `self` is + // dropped. + // + // * If `op` is `!Send` then `JobRef::execute` will only be called + // on this thread. + // + // `op` is unconditionally `Send`. 
let job_ref = unsafe { job.into_job_ref() }; // Send the job to a queue to be executed. - worker.fifo_queue.push_new(job_ref); + match worker { + Some(worker) => worker.fifo_queue.push_new(job_ref), + None => scope.thread_pool.push_shared_job(job_ref), + } } } -impl<'scope, Fut> ScopedSpawn<'scope, FutureMarker> for Fut +impl<'scope, Fut> SpawnScoped<'scope, FutureMarker> for Fut where Fut: Future + Send + 'scope, { #[inline] - fn spawn_on<'env, 'worker>(self, worker: &'worker Worker, scope: &'scope Scope<'scope, 'env>) { - let poll_job = ScopeFutureJob::new(worker.thread_pool(), scope, self); + fn spawn_scoped<'env>( + self, + scope: &'scope Scope<'scope, 'env>, + worker: Option<&Worker>, + ) { + let poll_job = ScopeFutureJob::new(scope, self); let job_ref = poll_job.into_job_ref(); - worker.fifo_queue.push_new(job_ref); + match worker { + Some(worker) => worker.fifo_queue.push_new(job_ref), + None => scope.thread_pool.push_shared_job(job_ref), + } } } @@ -480,8 +572,6 @@ struct ScopeFutureJob<'scope, 'env, Fut> { /// A scope pointer. This allows the job to interact with the scope, and /// also keeps the scope alive until the job is dropped. scope_ptr: ScopePtr<'scope, 'env>, - /// The thread pool this job is attached to. - thread_pool: &'static ThreadPool, /// The state of the job, which is either READY, WOKEN, or LOCKED. state: AtomicU32, } @@ -501,16 +591,11 @@ where /// Creates a new `ScopedFutureJob` in an `Arc`. The caller is expected to /// immediately call `into_job_ref` and queue it on a worker to be polled. - fn new( - thread_pool: &'static ThreadPool, - scope: &Scope<'scope, 'env>, - future: Fut, - ) -> Arc { + fn new(scope: &Scope<'scope, 'env>, future: Fut) -> Arc { let scope_ptr = ScopePtr::new(scope); Arc::new(Self { future: UnsafeCell::new(future), scope_ptr, - thread_pool, // The job starts in the WOKEN state because we always queue it // after creating it. 
state: AtomicU32::new(WOKEN), @@ -524,38 +609,46 @@ where /// Forgetting this job ref will cause a memory leak. fn into_job_ref(self: Arc) -> JobRef { // SAFETY: Pointers created by `Arc::into_raw` are never null. - let job_pointer = unsafe { NonNull::new_unchecked(Arc::into_raw(self).cast_mut().cast()) }; + let job_pointer = unsafe { + NonNull::new_unchecked(Arc::into_raw(self).cast_mut().cast()) + }; - // SAFETY: `JobRef::new_raw` requires that: + // SAFETY: `JobRef::new` requires us to show that `execute` will only be + // called on the resulting `JobRef` where it is sound to call `poll` on + // `job_pointer`. // - // * `job_pointer` and `Self::poll` be "matched". + // `Poll` has two preconditions: // - // `Self::poll` expects a pointer created by calling `Arc::into_raw` - // on an `Arc`, which is exactly what `job_pointer` is. + // 1. `job_pointer` must have been produced by `Arc::into_raw` on an + // `Arc`. // - // * `job_pointer` points to an initialized and aligned value which is - // neither moved nor dropped until it is executed. + // We produced `job_pointer` this way just above. // - // The Arc reference count must be least 1. `Arc::into_raw` transfers - // ownership of the strong count from `self` into the `JobRef`, and - // that count is only released in `poll`, after the arc produced by - // `Arc::from_raw` is dropped. The data is therefore guaranteed to - // remain live until `poll` is called. + // 2. We must hold ownership of exactly one strong reference count for + // the allocation. // - // * If `poll` has additional safety requirements, `job_pointer` upholds - // them. - // - // In this case, `poll` does not have any additional requirements. - unsafe { JobRef::new_raw(job_pointer, Self::poll) } + // We start with an `Arc`, so we must own a strong reference. + // Calling `Arc::into_raw` transfers the strong count of `self` onto + // `job_pointer` without decrementing it. 
Therefore, when `execute` + // is called, there will still be a strong reference for it to + // consume. + unsafe { JobRef::new(job_pointer, Self::poll) } } - /// This is what happens when the job is executed. It is this function that - /// is in charge of actually polling the future, and it is therefore an - /// extremely hot and performance sensitive function. - fn poll(this: NonNull<()>, worker: &Worker) { + /// Polls the future. + /// + /// # Safety + /// + /// `this` must be a pointer produced by `Arc::into_raw` on an `Arc`. + /// + /// This call takes ownership of exactly one strong reference count for that + /// allocation, consuming it via `Arc::from_raw` internally. The caller must + /// hold "ownership" of one such strong reference. + unsafe fn poll(this: NonNull<()>, worker: &Worker) { // While we still have a raw pointer to the job, create a raw task waker // using our vtable. - let raw_waker = RawWaker::new(this.as_ptr().cast_const(), &Self::VTABLE); + let raw_waker = + RawWaker::new(this.as_ptr().cast_const(), &Self::VTABLE); // Create a new waker from the raw waker. This is *non-owning* and // functions like a `&Arc` rather than an `Arc`. We wrap it @@ -563,8 +656,8 @@ where // through the vtable, which would cause the reference-count to // decrement (incorrectly). // - // SAFETY: The api contract of RawWaker and RawWakerVTable is upheld by - // the `Self::VTABLE` const. + // SAFETY: The API contract of `RawWaker` and `RawWakerVTable` is upheld + // by the `Self::VTABLE` const. // // * The functions are all thread safe. // @@ -605,30 +698,30 @@ where // SAFETY: The following line requires that: // - // 1. No other mutable references to the future exist. + // * No other mutable references to the future exist. // - // 2. The future will not move. + // Access to the future is protected by the `state` field, which acts + // as a mutex. Just above, we executed // - // Access to the future is protected by the `state` field, which acts - // as a mutex. 
Just above, we executed + // state.swap(LOCKED, Ordering::Acquire) // - // state.swap(LOCKED, Ordering::Acquire) + // which transitions us from the `WOKEN` into the `LOCKED` state. Any + // concurrent caller that also tries to execute `poll` will fail this + // swap, and cause an abort. Exclusive access is therefore + // guaranteed. // - // which transitions us from the `WOKEN` into the `LOCKED` state. Any - // concurrent caller that also tries to execute `poll` will fail this - // swap, and cause an abort. Exclusive access is therefore guaranteed. + // In the event that `poll` has been called previously, the `Acquire` + // ordering synchronizes with the call to // - // In the event that `poll` has been called previously, the `Acquire` - // ordering synchronizes with the call to + // fence(Ordering::Release) // - // state.compare_exchange(LOCKED, READY, Ordering::Release, Ordering::Release) + // later in this function. This ensures that we are not racing with + // another mutable reference to the same value. // - // later in this function. This ensures that all writes to the future - // performed by previous invocations are visible to us before we form - // the mutable reference. + // * The future will not move. // - // The future does not move, because it is stored in a field within an - // `Arc`, which has a stable heap-allocated address. + // The future does not move, because it is stored in a field within an + // `Arc`, which has a stable heap-allocated address. let future = unsafe { Pin::new_unchecked(&mut *this.future.get()) }; // Create a new context from the waker, and poll the future. @@ -652,7 +745,12 @@ where // ownership of the future. 
let rescheduled = this .state - .compare_exchange(LOCKED, READY, Ordering::Relaxed, Ordering::Relaxed) + .compare_exchange( + LOCKED, + READY, + Ordering::Relaxed, + Ordering::Relaxed, + ) .is_err(); // Emit a fence here, which synchronizes with the `Acquire` swap // at the start of this function to ensure that the next thread @@ -718,11 +816,11 @@ where if this.state.swap(WOKEN, Ordering::Relaxed) == READY { // Convert the waker into a job ref and queue it. - let thread_pool = this.thread_pool; + let thread_pool = this.scope_ptr.thread_pool(); let job_ref = this.into_job_ref(); - thread_pool.with_worker(|worker| match worker { + thread_pool.get_worker(|worker| match worker { Some(worker) => worker.fifo_queue.push_new(job_ref), - None => thread_pool.queue_shared_job(job_ref), + None => thread_pool.push_shared_job(job_ref), }); } } @@ -739,17 +837,17 @@ where // // SAFETY: This is called on a pointer created by `Arc::into_raw` on an // instance of `Arc`. - let this = unsafe { ManuallyDrop::new(Arc::from_raw(this.cast::())) }; + let this = + unsafe { ManuallyDrop::new(Arc::from_raw(this.cast::())) }; if this.state.swap(WOKEN, Ordering::Relaxed) == READY { // Clone the waker, convert it into a job-ref and queue it. let this = ManuallyDrop::into_inner(this.clone()); - let thread_pool = this.thread_pool; + let thread_pool = this.scope_ptr.thread_pool(); let job_ref = this.into_job_ref(); - - thread_pool.with_worker(|worker| match worker { + thread_pool.get_worker(|worker| match worker { Some(worker) => worker.fifo_queue.push_new(job_ref), - None => thread_pool.queue_shared_job(job_ref), + None => thread_pool.push_shared_job(job_ref), }); } } @@ -780,22 +878,31 @@ mod scope_ptr { use core::any::Any; use super::Scope; + use crate::ThreadPool; /// A reference-counted pointer to a scope. Used to capture a scope pointer /// in jobs without faking a lifetime. Holding a `ScopePtr` keeps the /// reference scope from being deallocated. 
pub struct ScopePtr<'scope, 'env>(*const Scope<'scope, 'env>); - // SAFETY: This is safe because (a) scope-pointer is only used to call - // `add_reference`, `remove_reference`, and `store_panic`, all of which are - // designed to be thread-safe; and (b) the `Scope` cannot be deallocated - // while any `ScopePtr` still points to it (due to reference counting). + // SAFETY: This is sound because: + // + // * `ScopePtr` is only used to call `add_reference`, `remove_reference`, + // and `store_panic`, all of which are designed to be called from multiple + // threads concurrently. + // + // * The `Scope` cannot be deallocated while any `ScopePtr` still points to + // it (due to reference counting), so the raw pointer is always valid. unsafe impl Send for ScopePtr<'_, '_> {} - // SAFETY: This is safe because (a) scope-pointer is only used to call - // `add_reference`, `remove_reference`, and `store_panic`, all of which are - // designed to be thread-safe; and (b) the `Scope` cannot be deallocated - // while any `ScopePtr` still points to it (due to reference counting). + // SAFETY: This is sound because: + // + // * `ScopePtr` is only used to call `add_reference`, `remove_reference`, + // and `store_panic`, all of which are designed to be called from multiple + // threads concurrently. + // + // * The `Scope` cannot be deallocated while any `ScopePtr` still points to + // it (due to reference counting), so the raw pointer is always valid. unsafe impl Sync for ScopePtr<'_, '_> {} impl<'scope, 'env> ScopePtr<'scope, 'env> { @@ -817,6 +924,14 @@ mod scope_ptr { let scope_ref = unsafe { &*self.0 }; scope_ref.store_panic(err); } + + pub fn thread_pool(&self) -> &'static ThreadPool { + // SAFETY: This was created using an immutable scope reference, and + // by the scope rules there can be no mutable references to this + // scope, nor can the scope have been moved or deallocated while the + // scope's counter remains incremented. 
+ unsafe { (&*self.0).thread_pool } + } } impl Drop for ScopePtr<'_, '_> { @@ -839,7 +954,7 @@ mod scope_ptr { // ----------------------------------------------------------------------------- // Tests -#[cfg(all(test, not(feature = "shuttle")))] +#[cfg(test)] mod tests { use core::iter::once; use core::pin::Pin; @@ -989,7 +1104,10 @@ mod tests { impl Future for CountFuture { type Output = (); - fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll { if self.count == 128 { Poll::Ready(()) } else { @@ -1045,7 +1163,7 @@ mod tests { let a = AtomicU8::new(0); let b = AtomicU8::new(0); - THREAD_POOL.on_worker(|worker| { + THREAD_POOL.with_worker(|worker| { scope(|scope| { for _ in 0..NUM_JOBS { scope.spawn_on(worker, |_: &Worker| { @@ -1094,12 +1212,12 @@ mod tests { let mut completed = false; - THREAD_POOL.on_worker(|worker| { + THREAD_POOL.with_worker(|worker| { worker.scope(|scope| { scope.spawn_on(worker, |_: &Worker| { // Creating a new worker instead of reusing the old one is // bad form, but we may as well test it. 
- THREAD_POOL.on_worker(|worker| { + THREAD_POOL.with_worker(|worker| { worker.scope(|scope| { scope.spawn_on(worker, |_: &Worker| { completed = true; @@ -1123,7 +1241,7 @@ mod tests { THREAD_POOL.resize_to_available(); let counter_p = &AtomicUsize::new(0); - THREAD_POOL.on_worker(|worker| { + THREAD_POOL.with_worker(|worker| { worker.scope(|scope| { scope.spawn(move |worker: &Worker| { divide_and_conquer(worker, scope, counter_p, 1024) @@ -1176,7 +1294,7 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - THREAD_POOL.on_worker(|_| { + THREAD_POOL.with_worker(|_| { let mut tree = random_tree(10, 1337); let values: Vec<_> = tree.iter().cloned().collect(); tree.update(|v| *v += 1); @@ -1238,7 +1356,10 @@ mod tests { random_tree_inner(depth, &mut rng) } - fn random_tree_inner(depth: usize, rng: &mut XorShift64Star) -> Tree { + fn random_tree_inner( + depth: usize, + rng: &mut XorShift64Star, + ) -> Tree { let children = if depth == 0 { vec![] } else { @@ -1264,7 +1385,7 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - THREAD_POOL.on_worker(|_| { + THREAD_POOL.with_worker(|_| { let mut max_diff = Mutex::new(0); let bottom_of_stack = 0; scope(|s| the_final_countdown(s, &bottom_of_stack, &max_diff, 5)); @@ -1298,7 +1419,9 @@ mod tests { *data = Ord::max(diff, *data); if n > 0 { - scope.spawn(move |_: &Worker| the_final_countdown(scope, bottom_of_stack, max, n - 1)); + scope.spawn(move |_: &Worker| { + the_final_countdown(scope, bottom_of_stack, max, n - 1) + }); } } @@ -1321,7 +1444,8 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - THREAD_POOL.scope(|scope| scope.spawn(|_: &Worker| panic!("Hello, world!"))); + THREAD_POOL + .scope(|scope| scope.spawn(|_: &Worker| panic!("Hello, world!"))); THREAD_POOL.depopulate(); } @@ -1335,7 +1459,9 @@ mod tests { THREAD_POOL.scope(|scope| { scope.spawn(|_: &Worker| { - 
scope.spawn(|_: &Worker| scope.spawn(|_: &Worker| panic!("Hello, world!"))) + scope.spawn(|_: &Worker| { + scope.spawn(|_: &Worker| panic!("Hello, world!")) + }) }) }); @@ -1351,7 +1477,9 @@ mod tests { THREAD_POOL.scope(|scope_1| { scope_1.spawn(|worker: &Worker| { - worker.scope(|scope_2| scope_2.spawn(|_: &Worker| panic!("Hello, world!"))) + worker.scope(|scope_2| { + scope_2.spawn(|_: &Worker| panic!("Hello, world!")) + }) }) }); @@ -1479,7 +1607,9 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - fn increment<'slice, 'counter>(counters: &'slice [&'counter AtomicUsize]) { + fn increment<'slice, 'counter>( + counters: &'slice [&'counter AtomicUsize], + ) { THREAD_POOL.scope::<'counter>(move |scope| { // We can borrow 'slice here, but the spawns can only borrow 'counter. for &c in counters { diff --git a/src/thread_pool.rs b/src/thread_pool.rs index 69ca80b..e50cfdb 100644 --- a/src/thread_pool.rs +++ b/src/thread_pool.rs @@ -1,6 +1,5 @@ //! This module contains the api and worker logic for the Forte thread pool. -use alloc::boxed::Box; use alloc::format; use alloc::vec::Vec; use core::array; @@ -23,23 +22,22 @@ use crossbeam_utils::CachePadded; use st3::StealError; use st3::lifo::Stealer; use st3::lifo::Worker as Sharer; -use tracing::debug; -use tracing::trace; -use tracing::trace_span; use crate::FnOnceMarker; use crate::FutureMarker; -use crate::job::ExternalJob; use crate::job::HeapJob; use crate::job::JobQueue; use crate::job::JobRef; use crate::job::StackJob; use crate::latch::Latch; -use crate::latch::SleepController; +use crate::latch::Semaphore; +use crate::latch::Status; use crate::platform::*; use crate::scope::Scope; use crate::scope::with_scope; +use crate::time::ticks; use crate::unwind; +use crate::util::IterBits; use crate::util::XorShift64Star; // ----------------------------------------------------------------------------- @@ -69,72 +67,78 @@ use crate::util::XorShift64Star; /// pool). 
All blocking methods (e.g. [`join`] and [`scope`]) work even with /// zero managed workers, but they won't run in parallel. pub struct ThreadPool { - /// A bit-set that tracks which seats are occupied. - occupied: CachePadded, - /// A bit-set that tracks which seats are sleeping. - sleeping: CachePadded, - /// Holds shared data for each thread participating in the pool. - seats: OnceLock>, + /// Shared data for each pool member. + member_data: Lazy, + /// A queue of jobs shared by the whole pool. + shared_queue: SegQueue, + /// Manages how members leave the pool. + /// + /// * The 26 least significant bits count the number of ongoing broadcasts. + /// + /// * The 6 most significant bits count the number of ongoing resignations. + /// + /// Members can only resign when there are no ongoing broadcasts. Broadcasts + /// can only begin after all current resignations complete. + resignations: CachePadded, + /// A bitmask that tracks which members are waiting to resign. + wants_to_resign: CachePadded, + /// A bitmask that tracks which member indices are claimed. + claimed_bitmask: CachePadded, + /// A bitmask that tracks which members are waiting on their semaphore + /// signal. + waiting_bitmask: CachePadded, /// Holds controls for threads spawned and managed by the pool. Initialized - /// on first call to `occupy`, to allow for some non-static constructors. - managed_threads: Mutex, - /// Used to inject external work into the thread pool. This is generally - /// treated as a fallback, for when the thread-pool is at capacity and - /// threads can't register themselves as workers. - shared_jobs: SegQueue, + /// on first call to `activate`, to allow for some non-static constructors. + managed_workers: Mutex>, } /// A public interface that can be temporarily claimed and used by a thread. /// Claiming a seat allows a thread to participate in the thread pool as a /// worker. -pub(crate) struct Seats { +pub struct MemberData { /// The sharing side of each seat's work-stealing queue.
These should only /// ever be accessed by the thread that currently owns the lease for this /// seat (to ensure the `!Sync` bound is respected). sharers: [Sharer; 32], /// The stealing side of each seat's work-stealing queue. stealers: [Stealer; 32], - /// The sleep/wake controller for each seat. - sleep_controllers: [SleepController; 32], + /// A set of queues used for transmitting work that must be executed on a + /// particular worker. Used for broadcasts and cross-thread nonsend worker + /// wakeups. + pub broadcasts: [SegQueue; 32], + /// A binary semaphore for each seat, used for signaling. + pub semaphores: [Semaphore; 32], } -// SAFETY: `stealers` are `Send + Sync` by their own bounds. `workers[i]` is -// only ever accessed by the single thread holding seat `i`'s occupancy lease; -// the `occupied` bitmask in `ThreadPool` enforces that exclusivity. -unsafe impl Sync for Seats {} - -/// A lease represents ownership of one of a "seats" in a thread pool, and -/// allows the owning thread to participate in that pool as a worker. -pub struct Lease { - /// The thread pool against which this lease is held. - thread_pool: &'static ThreadPool, - /// The index of the seat in the data list - seat_number: usize, - /// A reference to the pre-initialized seat data (to avoid repeated hits of - /// the `OnceLock`). - seats: &'static Seats, -} - -impl Drop for Lease { - fn drop(&mut self) { - // Unset the occupied bit for this seat - self.thread_pool - .occupied - .fetch_and(!(1 << self.seat_number), Ordering::Relaxed); +impl MemberData { + fn new() -> MemberData { + let sharers: [Sharer; 32] = + array::from_fn(|_| Sharer::new(Worker::STEAL_QUEUE_CAPACITY)); + let stealers: [Stealer; 32] = + array::from_fn(|i| sharers[i].stealer()); + let broadcasts = array::from_fn(|_| SegQueue::new()); + let semaphores = array::from_fn(|_| Semaphore::new()); + MemberData { + sharers, + stealers, + broadcasts, + semaphores, + } } } -/// Manages threads spawned by the pool. 
-struct ManagedThreads { - /// Stores thread controls for workers spawned by the pool. - workers: Vec, -} +// SAFETY: `Sharer` (aka `st3::Worker`) is `!Sync`. We allow it to be stored in +// this shared structure, but we only allow one thread to access it at a time +// (via the membership claiming logic). This is effectively like sending +// `st3::Worker` ownership between threads (although in practice it always +// occupies the same place on the heap). Luckily for us, it implements `Send`. +unsafe impl Sync for MemberData {} /// Represents a worker thread that is managed by the pool, as opposed to /// external threads which temporarily participate in the pool. struct ManagedWorker { /// The index of this worker in the public worker info list. - seat_number: usize, + member_index: usize, /// Controls used to manage the lifecycle of the worker. control: ThreadControl, } @@ -148,7 +152,7 @@ struct ThreadControl { } // ----------------------------------------------------------------------------- -// Thread pool creation and maintenance +// Thread pool creation and utilities #[allow(clippy::new_without_default)] impl ThreadPool { @@ -156,125 +160,46 @@ impl ThreadPool { pub const fn new() -> ThreadPool { // Create the pool itself. ThreadPool { - seats: OnceLock::new(), - occupied: CachePadded::new(AtomicU32::new(0)), - sleeping: CachePadded::new(AtomicU32::new(0)), - managed_threads: Mutex::new(ManagedThreads { - workers: Vec::new(), - }), - shared_jobs: SegQueue::new(), - } - } - - /// Returns the pre-allocated steal queues, initializing them on the first call.
- fn get_seats(&'static self) -> &'static Seats { - self.seats.get_or_init(|| { - let sharers: [Sharer; 32] = - array::from_fn(|_| Sharer::new(Worker::STEAL_QUEUE_CAPACITY)); - let stealers: [Stealer; 32] = array::from_fn(|i| sharers[i].stealer()); - let sleep_controllers = array::from_fn(|_| SleepController::new()); - Box::new(Seats { - sharers, - stealers, - sleep_controllers, - }) - }) - } - - /// Adds a job ref to the shared queue. - pub fn queue_shared_job(&'static self, job_ref: JobRef) { - self.shared_jobs.push(job_ref); - } - - /// Claims a lease on the thread pool which can be occupied by a worker - /// (using [`Worker::occupy`]), allowing a thread to participate in the pool. - /// - /// Returns `None` if all seats are occupied. - #[cold] - pub fn claim_lease(&'static self) -> Option { - loop { - let occupied = self.occupied.load(Ordering::Relaxed); - if occupied == u32::MAX { - return None; - } - let seat_number = occupied.trailing_ones() as usize; - let mask = 1 << seat_number; - if self.occupied.fetch_or(mask, Ordering::Relaxed) & mask == 0 { - // At this point we have acquired the lease on the seat - return Some(Lease { - thread_pool: self, - seat_number, - seats: self.get_seats(), - }); - } - } - } - - /// Claims up to `n` leases at once in a single atomic transaction. - /// - /// Finds up to `n` free seats, then atomically claims all of them with a - /// single `compare_exchange`. Either every selected seat is claimed together - /// or none are (and the loop retries). Returns between 0 and `n` leases; - /// returns an empty `Vec` when `n` is 0 or the pool is full. - #[cold] - pub fn claim_leases(&'static self, n: usize) -> Vec { - if n == 0 { - return Vec::new(); - } - let seats = self.get_seats(); - loop { - let occupied = self.occupied.load(Ordering::Relaxed); - if occupied == u32::MAX { - return Vec::new(); - } - - // Build a mask of up to `n` free seats by walking the complement. 
- let mut claimed_seats = 0; - let mut free_seats = !occupied; - for _ in 0..n { - if free_seats == 0 { - break; - } - let seat_bit = free_seats & free_seats.wrapping_neg(); // isolate lowest set bit - claimed_seats |= seat_bit; - free_seats &= !seat_bit; - } - - // Attempt to claim all selected seats in one atomic step. - match self.occupied.compare_exchange( - occupied, - occupied | claimed_seats, - Ordering::Relaxed, - Ordering::Relaxed, - ) { - Ok(_) => { - return (0..32) - .filter(|&i| claimed_seats & (1 << i) != 0) - .map(|seat_number| Lease { - thread_pool: self, - seat_number: seat_number as usize, - seats, - }) - .collect(); - } - Err(_) => { - // Another thread modified `occupied`; retry. - } - } + member_data: Lazy::new(MemberData::new), + shared_queue: SegQueue::new(), + resignations: CachePadded::new(AtomicU32::new(0)), + claimed_bitmask: CachePadded::new(AtomicU32::new(0)), + waiting_bitmask: CachePadded::new(AtomicU32::new(0)), + wants_to_resign: CachePadded::new(AtomicU32::new(0)), + managed_workers: Mutex::new(Vec::new()), } } /// Returns an opaque identifier for this thread pool. #[inline(always)] - pub fn id(&self) -> usize { + pub fn id(&'static self) -> usize { // We can rely on `self` not to change since it's a static ref. ptr::from_ref(self) as usize } - /// Returns the number of workers participating in this thread pool. + /// Returns the number of members participating in the pool. #[inline(always)] - pub fn num_workers(&self) -> usize { - self.occupied.load(Ordering::Relaxed).count_ones() as usize + pub fn num_members(&'static self) -> usize { + self.claimed_bitmask.load(Ordering::Relaxed).count_ones() as usize + } + + /// Adds a job to the thread-pool's shared queue. + /// + /// This allows adding work from outside the pool (eg, without a reference + /// to a `Worker`). + /// + /// Note: Workers only take work from this queue as a last resort, after all + /// their other work has been exhausted. 
+ #[inline(always)] + pub fn push_shared_job(&'static self, job_ref: JobRef) { + self.shared_queue.push(job_ref); + // Try to wake up a worker to execute this job. This is relatively cheap + // if no workers are waiting. + let waiting_bitmask = self.waiting_bitmask.load(Ordering::Relaxed); + if waiting_bitmask != 0 { + let i = waiting_bitmask.trailing_zeros() as usize; + self.get_member_data().semaphores[i].signal(); + } } } @@ -293,7 +218,8 @@ impl ThreadPool { /// /// See [`ThreadPool::resize`] for more information about resizing. pub fn resize_to_available(&'static self) -> usize { - let mut available = available_parallelism().map(NonZero::get).unwrap_or(1); + let mut available = + available_parallelism().map(NonZero::get).unwrap_or(1); available = available.saturating_sub(1).max(1); self.resize_to(available) } @@ -321,7 +247,9 @@ impl ThreadPool { /// /// See [`ThreadPool::resize`] for more information about resizing. pub fn shrink(&'static self, terminated_threads: usize) -> usize { - self.resize(|current_size| current_size.saturating_sub(terminated_threads)) + self.resize(|current_size| { + current_size.saturating_sub(terminated_threads) + }) } /// Ensures that there is at least one worker thread attached to the thread @@ -354,86 +282,76 @@ impl ThreadPool { where F: Fn(usize) -> usize, { - debug!("starting thread pool resize"); - // Resizing a pool is a critical section; only one thread can resize the // pool at a time. This is implemented using a mutex on the thread manager. - trace!("locking state"); - let mut managed_threads = self.managed_threads.lock().unwrap(); + let mut managed_workers = self.managed_workers.lock().unwrap(); // Compute the new size of the pool, given the current size. - let current_size = managed_threads.workers.len(); + let current_size = managed_workers.len(); // Calculate the new size of the pool (counting only managed workers). 
let new_size = get_size(current_size); - trace!( - "attempting to resize thread pool from {} to {} thread(s)", - current_size, new_size - ); match new_size.cmp(&current_size) { // The size remained the same - cmp::Ordering::Equal => { - debug!("completed thread pool resize, size unchanged"); - return current_size; - } + cmp::Ordering::Equal => current_size, // The size increased cmp::Ordering::Greater => { // Spawn the new workers. - let leases = self.claim_leases(new_size - current_size); - for lease in leases { - let seat_number = lease.seat_number; - debug!("spawning managed worker for seat number {}", seat_number); + let memberships = self.try_enroll_many(new_size - current_size); + for membership in memberships { + let member_index = membership.member_index; let halt = Arc::new(AtomicBool::new(false)); let worker_halt = halt.clone(); let handle = ThreadBuilder::new() - .name(format!("worker {seat_number}")) + .name(format!("managed worker {member_index}")) .spawn(move || { - managed_worker(lease, worker_halt); + managed_worker(membership, worker_halt); }) .unwrap(); let control = ThreadControl { halt, handle }; - managed_threads.workers.push(ManagedWorker { - seat_number, + managed_workers.push(ManagedWorker { + member_index, control, }); } - drop(managed_threads); + managed_workers.len() } // The size decreased cmp::Ordering::Less => { // Pull the workers we intend to halt out of the thread manager. - let terminating_workers = managed_threads.workers.split_off(new_size); + let terminating_workers = managed_workers.split_off(new_size); // Terminate and wake the workers. - let seats = self.get_seats(); + let member_data = self.get_member_data(); for worker in &terminating_workers { // Tell the worker to halt. worker.control.halt.store(true, Ordering::Relaxed); - // Wake the worker up. - seats.sleep_controllers[worker.seat_number].wake(); + // Signal the worker.
+ member_data.semaphores[worker.member_index].signal(); } - // Drop the lock on the state so as not to block the workers or heartbeat. - drop(managed_threads); + // Drop the lock on the state so as not to block the workers or + // heartbeat. + drop(managed_workers); // Determine our seat index. - let own_seat_number = Worker::map_current(|worker| worker.lease.seat_number); + let own_member_index = + Worker::map_current(|worker| worker.member_index); // Wait for the other workers to fully halt. for worker in terminating_workers { // It's possible we may be trying to terminate ourselves, in // which case we can skip the thread-join. - if Some(worker.seat_number) != own_seat_number { + if Some(worker.member_index) != own_member_index { let _ = worker.control.handle.join(); } } + + new_size } } - - // Return the new size of the thread pool - new_size } } @@ -441,114 +359,55 @@ impl ThreadPool { // Thread pool worker access impl ThreadPool { - /// Runs the closure on a thread-pool worker. - /// - /// If this thread is not a worker, it will try to register itself as one. - /// If the thread pool is full, the closure is sent to another worker as a - /// job, and this thread is parked. - /// - /// If your closure is `!Send`, use [`with_worker`][ThreadPool::with_worker] - /// instead. + /// Returns this thread's worker if it is a member of this thread pool. #[inline(always)] - pub fn on_worker(&'static self, f: F) -> R + pub fn get_worker(&'static self, func: F) -> R where - F: FnOnce(&Worker) -> R + Send, - R: Send, + F: FnOnce(Option<&Worker>) -> R, { - self.with_worker(|worker| match worker { - Some(worker) => f(worker), - None => { - let mut job = ExternalJob::new(f); - // SAFETY: `ExternalJob::as_job_ref` requires: - // - // * The `ExternalJob` must not move or be deallocated until the - // `JobRef` is executed. - // - // * The `JobRef` does not outlive any data the `ExternalJob` closes over. - // - // * `as_job_ref` is not called again while `JobRef` lives. 
- // - // The `ExternalJob` is a stack-allocated variable. After - // calling `as_job_ref`, we never move `job`, and we wait for - // the job to execute by calling `job.wait_for_value`. Only - // after that returns do we allow the `job` to be dropped. This - // also means that any data closed over by the `ExternalJob` - // must outlive the `JobRef`. - // - // Also, `as_job_ref` is plainly called only once. - let job_ref = unsafe { job.as_job_ref() }; - self.queue_shared_job(job_ref); - // SAFETY: `wait_for_value` must be called at most once. This is - // the only call site for this particular `job`, which is a - // stack-local variable. - let result = unsafe { job.wait_for_value() }; - match result { - Ok(value) => value, - Err(error) => unwind::resume_unwinding(error), - } + Worker::with_current(|worker| match worker { + Some(worker) if worker.thread_pool.id() == self.id() => { + func(Some(worker)) } + _ => func(None), }) } - /// Runs the closure on a thread-pool worker. - /// - /// If this thread is not a worker, it will try to register itself as one. - /// If the thread pool is full, this panics. + /// Returns this thread's worker. If not already a member of this thread + /// pool, this thread will try to enroll itself as a member. /// - /// If you don't want to panic, use [`on_worker`][ThreadPool::on_worker] or - /// [`with_worker`][ThreadPool::with_worker] instead. + /// Note: If the thread pool is full (it already has 32 active members) this + /// waits for a vacancy before returning. #[inline(always)] - #[track_caller] - pub fn expect_worker(&'static self, f: F) -> R + pub fn with_worker(&'static self, func: F) -> R where F: FnOnce(&Worker) -> R, { - self.with_worker(|worker| match worker { - Some(worker) => f(worker), - None => panic!("thread pool full; not able to access worker"), - }) - } - - /// Runs the closure on a thread-pool worker. - /// - /// If this thread is currently acting as a worker for the thread-pool, this - /// just looks that worker up. 
If this thread is not registered as a worker, - /// or if the thread's worker is registered with different thread pool, the - /// thread will try to register itself with the correct pool. If the thread - /// pool is full, it passes the closure `None`. - /// - /// The provided closure is never sent to another thread. If your closure is - /// `Send`, consider using [`on_worker`][ThreadPool::on_worker] instead. - #[inline(always)] - pub fn with_worker(&'static self, f: F) -> R - where - F: FnOnce(Option<&Worker>) -> R, - { - Worker::with_current(|worker| match worker { - Some(worker) if worker.lease.thread_pool.id() == self.id() => f(Some(worker)), - _ => self.with_worker_cold(f), + self.get_worker(|worker| match worker { + Some(worker) => func(worker), + None => self.with_worker_cold(func), }) } - /// Tries to register the calling thread on the thread pool, and pass a - /// worker instance to the provided closure. + /// The cold branch of `with_worker`. /// - /// This is the slow fallback for `with_worker` covering "external calls" - /// from outside the pool. Never call this directly. + /// Requests membership in the thread pool, and then activates that + /// membership to get a new local worker handle. If the thread pool is full + /// (there are already 32 members) this blocks. #[cold] - fn with_worker_cold(&'static self, f: F) -> R + fn with_worker_cold(&'static self, func: F) -> R where - F: FnOnce(Option<&Worker>) -> R, + F: FnOnce(&Worker) -> R, { - match self.claim_lease() { - Some(lease) => Worker::occupy(lease, |worker| f(Some(worker))), - None => f(None), - } + let membership = self.enroll(); + membership.activate(func) } } // ----------------------------------------------------------------------------- -// Generalized spawn trait +// Spawn Trait + +pub use async_task::Task; /// A trait for types that can be spawned onto a [`ThreadPool`]. 
/// @@ -585,7 +444,11 @@ pub trait Spawn: Send + 'static { type Output: Send + 'static; /// Spawns work onto the thread pool. - fn spawn(self, thread_pool: &'static ThreadPool, worker: Option<&Worker>) -> Self::Output; + fn spawn( + self, + thread_pool: &'static ThreadPool, + worker: Option<&Worker>, + ) -> Self::Output; } impl Spawn for F @@ -601,60 +464,45 @@ where // Turn the job into an "owning" `JobRef` so it can be queued. // - // SAFETY: All jobs added to the queue are guaranteed to be executed - // eventually, this is one of the core invariants of the thread pool. - // The closure `f` has a static lifetime, meaning it only closes over - // data that lasts for the duration of the program, so it's not possible - // for this job to outlive the data `f` closes over. + // SAFETY: `HeapJob::into_job_ref` has two preconditions: + // + // * The `JobRef` must not outlive any of the items closed over by the + // function `f`. + // + // Since `F: 'static`, the `JobRef` cannot outlive its captured data. + // + // * If `F: !Send` then the `JobRef` must only be executed on this + // thread. + // + // `F` is `Send`, so this does not apply. let job_ref = unsafe { job.into_job_ref() }; // Queue the job for evaluation - if let Some(worker) = worker { - worker.fifo_queue.push_new(job_ref); - } else { - // Push the work into the share queue and wake a worker - thread_pool.shared_jobs.push(job_ref); + match worker { + Some(worker) => worker.fifo_queue.push_new(job_ref), + None => thread_pool.push_shared_job(job_ref), } } } -/// An alias for [`async_task::Task`] that includes a reference to the pool on -/// which the future is executing. -pub type Task = async_task::Task; - -/// Schedules a runnable future as a job. +/// Executes a raw pointer to a runnable. /// -/// Async-task prefers that this is a static function, rather than a closure, -/// which is why this is a separate function that pulls the thread pool from the -/// runnable metadata. 
-fn schedule_runnable(runnable: Runnable<&'static ThreadPool>) { - // Get a ref to the thread pool from the runnable. - let thread_pool = *runnable.metadata(); - - // Temporarily turn the task into a raw pointer so that it can be - // used as a job. We could also use `HeapJob` here, but since - // `Runnable` is heap allocated this would result in a needless - // second allocation. - let job_pointer = runnable.into_raw(); - - // SAFETY: The raw runnable pointer will remain valid until it is - // used by `execute_runnable`, after which it will be dropped. - let job_ref = unsafe { JobRef::new_raw(job_pointer, execute_runnable) }; - - // Send this job off to be executed. - thread_pool.with_worker(|worker| match worker { - Some(worker) => worker.fifo_queue.push_new(job_ref), - None => thread_pool.shared_jobs.push(job_ref), - }); -} - -/// Executes a raw pointer to a runnable future. +/// # Safety +/// +/// The caller must ensure: +/// +/// * `this` was produced by `Runnable::into_raw` and must not have been +/// consumed by a call to `from_raw`. +/// +/// * If the `Runnable` was created for a `!Send` future, this must only be +/// called on the thread where the `Runnable` was created. #[inline(always)] -fn execute_runnable(this: NonNull<()>, _worker: &Worker) { - // SAFETY: This pointer was created by `Runnable::into_raw` in - // `schedule_runnable` with type parameter `&'static ThreadPool`, and - // `from_raw` is called at most once. - let runnable = unsafe { Runnable::<&'static ThreadPool>::from_raw(this) }; +unsafe fn execute_runnable(this: NonNull<()>, _worker: &Worker) { + // SAFETY: This pointer was created by `Runnable::into_raw` in the schedule + // closure. Jobs are executed exactly once, so `from_raw` is called at most + // once on this function (the next call to `schedule` will call `into_raw` + // again to get a "new" raw pointer to the same runnable). + let runnable = unsafe { Runnable::<()>::from_raw(this) }; // Poll the task. 
This will drop the future if the task is // canceled or the future completes. runnable.run(); @@ -668,11 +516,48 @@ where type Output = Task; #[inline] - fn spawn(self, thread_pool: &'static ThreadPool, _worker: Option<&Worker>) -> Task { + fn spawn( + self, + thread_pool: &'static ThreadPool, + _worker: Option<&Worker>, + ) -> Task { + // Creates a schedule function that captures a reference to the + // thread-pool. + let schedule = |runnable: Runnable| { + // Temporarily turn the task into a raw pointer so that it can be + // used as a job. We could also use `HeapJob` here, but since + // `Runnable` is heap allocated this would result in a needless + // second allocation. + let job_pointer = runnable.into_raw(); + + // SAFETY: `JobRef::new` requires us to show that `execute` will + // only be called on the returned `JobRef` when it is sound to call + // `execute_runnable` on `job_pointer`. The two preconditions to + // this are: + // + // * `job_pointer` must come from `Runnable::into_raw`. and must be + // called at most once per `into_raw`. + // + // We produced `job_pointer` this way just above. The call to + // `execute` will consume the `JobRef`, so `execute_runnable` will + // be called at most once. + // + // * If the `Runnable` was created for a `!Send` future, the + // `JobRef` must only be executed on the thread where the + // `Runnable` was created. + // + // The future is required to be `Send`, so this does not apply. + let job_ref = unsafe { JobRef::new(job_pointer, execute_runnable) }; + + // Send this job off to be executed. + thread_pool.get_worker(|worker| match worker { + Some(worker) => worker.fifo_queue.push_new(job_ref), + None => thread_pool.push_shared_job(job_ref), + }); + }; + // Create a runnable and add the thread pool as metadata. 
- let (runnable, task) = async_task::Builder::new() - .metadata(thread_pool) - .spawn(|_| self, schedule_runnable); + let (runnable, task) = async_task::spawn(self, schedule); // Call the schedule function, pushing a `JobRef` for the future onto // the local work queue. If the future doesn't complete, it can be @@ -685,13 +570,150 @@ where // thread/task that woke it. // // This is potentially more efficient than `Runnable::schedule`. - schedule_runnable(runnable); + schedule(runnable); // Return the task. task } } +// ----------------------------------------------------------------------------- +// Local Spawn Trait + +/// A version of the [`Spawn`] trait without the `Send` bound. +/// +/// It is implemented for: +/// +/// * Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + 'static`. +/// +/// * Futures that satisfy `Future + 'static` where `T: 'static`. +pub trait SpawnLocal: 'static { + /// The handled returned when spawning this type. + type Output: 'static; + + /// Spawns work that will run in the background on the current worker + /// thread. + fn spawn_local(self, worker: &Worker) -> Self::Output; +} + +impl SpawnLocal for F +where + F: for<'worker> FnOnce(&'worker Worker) + 'static, +{ + type Output = (); + + #[inline] + fn spawn_local(self, worker: &Worker) { + // Allocate a new job on the heap to store the closure. + let job = HeapJob::new(self); + + // Turn the job into an "owning" `JobRef` so it can be queued. + // + // SAFETY: `HeapJob::into_job_ref` has two preconditions: + // + // * The `JobRef` must not outlive any of the items closed over by the + // function `f`. + // + // Since `F: 'static`, the `JobRef` cannot outlive its captured data. + // + // * If `F: !Send` then the `JobRef` must only be executed on this + // thread. + // + // This `JobRef` is added to the `nonsend_fifo_queue` for this thread. + // No other thread ever pulls from this queue, and work is never + // shared from it, so it cannot be executed on any other thread. 
+ let job_ref = unsafe { job.into_job_ref() }; + + // Push into the non-send queue, which can only be accessed from this + // thread. + worker.nonsend_fifo_queue.push(job_ref); + } +} + +impl SpawnLocal for Fut +where + Fut: Future + 'static, + T: 'static, +{ + type Output = Task; + + #[inline] + fn spawn_local(self, worker: &Worker) -> Task { + // Create a schedule function that will keep a copy of the local fifo + // queue arc and be able to wake the local worker up. + let queue = worker.nonsend_fifo_queue.clone(); + let member_index = worker.member_index; + let member_data = worker.member_data; + let schedule = move |runnable: Runnable| { + // Temporarily turn the task into a raw pointer so that it can be + // used as a job. We could also use `HeapJob` here, but since + // `Runnable` is heap allocated this would result in a needless + // second allocation. + let job_pointer = runnable.into_raw(); + + // SAFETY: `JobRef::new` requires us to show that `execute` will + // only be called on the returned `JobRef` when it is sound to call + // `execute_runnable` on `job_pointer`. The two preconditions to + // this are: + // + // * `job_pointer` must come from `Runnable::into_raw`. and must be + // called at most once per `into_raw`. + // + // We produced `job_pointer` this way just above. The call to + // `execute` will consume the `JobRef`, so `execute_runnable` will + // be called at most once. + // + // * If the `Runnable` was created for a `!Send` future, the + // `JobRef` must only be executed on the thread where the + // `Runnable` was created. + // + // This `JobRef` is added to the `nonsend_fifo_queue` for this + // thread. No other thread ever pulls from this queue, and work is + // never shared from it, so it cannot be executed on any other + // thread. + let job_ref = unsafe { JobRef::new(job_pointer, execute_runnable) }; + + // Send this job to the correct thread to be executed. 
+ queue.push(job_ref); + + // Ensure that the worker is awake to execute this job. + member_data.semaphores[member_index].signal(); + }; + + // Create a runnable and add the thread pool as metadata. + let (runnable, task) = async_task::spawn_local(self, schedule); + + // Call the schedule function, pushing a `JobRef` for the future onto + // the local work queue. If the future doesn't complete, it can be + // woken and scheduled at a later point. + // + // Because we always look up the local worker within the schedule + // function, woken futures will tend to run on the thread that wakes + // them. This is a desirable property, as typically the next thing a + // future is going to do after being woken up is read some data from the + // thread/task that woke it. + runnable.schedule(); + + // Return the task. + task + } +} + +// ----------------------------------------------------------------------------- +// Broadcasts + +/// Context object for [`broadcast`](Worker::broadcast) operations. +pub struct Broadcast<'w> { + /// The worker this part of the broadcast is running on. This will be in + /// `[0, participants)`. + pub worker: &'w Worker, + /// The index of this worker within the broadcast. The return value will be + /// stored at this index within the results vector. + pub index: usize, + /// The number of threads participating in the broadcast. + pub participants: usize, +} + // ----------------------------------------------------------------------------- // Thread pool operations @@ -700,77 +722,435 @@ impl ThreadPool { /// /// See also: [`Worker::spawn`] and [`spawn`]. #[inline(always)] - pub fn spawn>(&'static self, work: S) -> S::Output { - work.spawn(self, None) + pub fn spawn>(&'static self, work: S) -> S::Output { + self.get_worker(|worker| work.spawn(self, worker)) + } + + /// Blocks the thread waiting for a future to complete. + /// + /// See also: [`Worker::block_on`] and [`block_on`]. 
+ #[inline(always)] + pub fn block_on(&'static self, future: F) -> T + where + F: Future + Send, + T: Send, + { + self.get_worker(|worker| match worker { + Some(worker) => worker.block_on(future), + None => futures_lite::future::block_on(future), + }) + } + + /// Executes the two closures, possibly in parallel. + /// + /// See also: [`Worker::join`] and [`join`]. + #[inline(always)] + pub fn join(&'static self, a: A, b: B) -> (RA, RB) + where + A: FnOnce(&Worker) -> RA + Send, + B: FnOnce(&Worker) -> RB + Send, + RA: Send, + RB: Send, + { + self.with_worker(|worker| worker.join(a, b)) + } + + /// Creates a scope onto which non-static work can be spawned. + /// + /// See also: [`Worker::scope`] and [`scope`]. + #[inline(always)] + pub fn scope<'env, F, T>(&'static self, f: F) -> T + where + F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, + { + self.with_worker(|worker| worker.scope(f)) + } + + /// Runs the same closure on several threads, and returns a vector of + /// results. + /// + /// See also: [`Worker::broadcast`] and [`broadcast`]. If you don't care + /// about getting results back, you may want to use + /// [`ThreadPool::spawn_broadcast`] instead. + #[inline(always)] + pub fn broadcast(&'static self, f: F) -> Vec + where + F: for<'w> Fn(Broadcast<'w>) -> T + Sync, + T: Send, + { + self.with_worker(|worker| worker.broadcast(f)) + } + + /// Runs the same closure on sevearl threads, without waiting for them to + /// complete. + /// + /// See also: [`Worker::spawn_broadcast`] and [`spawn_broadcast`]. If you + /// care about getting results back, you may want to use + /// [`ThreadPool::broadcast`] instead. + #[inline(always)] + pub fn spawn_broadcast(&'static self, f: F) + where + F: for<'w> Fn(Broadcast<'w>) + Send + Sync + 'static, + { + self.with_worker(|worker| worker.spawn_broadcast(f)); + } +} + +// ----------------------------------------------------------------------------- +// Worker registration + +/// Represents membership in a thread-pool. 
+/// +/// Provided by [`ThreadPool::enroll`]. +pub struct Membership { + /// The thread pool the worker is registered with. + thread_pool: &'static ThreadPool, + /// Contains the index of a row in the `MembersData` table, if the worker + /// has been granted membership on the thread-pool. + member_index: usize, + /// A reference to the `MemberData` table. + member_data: &'static MemberData, +} + +impl ThreadPool { + /// Returns member data, initializing it on the first call. + pub fn get_member_data(&'static self) -> &'static MemberData { + self.member_data.get() + } + + /// Waits for membership in the thread-pool. + /// + /// If the thread-pool is full (it has 32 members) this blocks. + pub fn enroll(&'static self) -> Membership { + loop { + match self.try_enroll() { + // If we receive a membership, break out of the loop + Some(membership) => return membership, + // If the thread-pool is full, wait for a membership to become + // free + None => atomic_wait::wait(&self.claimed_bitmask, u32::MAX), + } + } + } + + /// Requests membership in a thread-pool. + /// + /// If the thread-pool is full (it has 32 members) this returns `None`. + #[cold] + pub fn try_enroll(&'static self) -> Option { + loop { + let available_bitmask = + !self.claimed_bitmask.load(Ordering::Relaxed); + if available_bitmask == 0 { + return None; + } + let enrolled_index = available_bitmask.trailing_zeros() as usize; // TZCNT + let enrolled_bitmask = 1 << enrolled_index; + if self + .claimed_bitmask + .fetch_or(enrolled_bitmask, Ordering::Relaxed) + & enrolled_bitmask + == 0 + { + return Some(Membership { + thread_pool: self, + member_index: enrolled_index, + member_data: self.get_member_data(), + }); + } + } + } + + /// Creates multiple memberships for the thread pool. 
+ pub fn try_enroll_many(&'static self, n: usize) -> Vec { + if n == 0 { + return Vec::new(); + } + let member_data = self.get_member_data(); + loop { + let claimed_bitmask = self.claimed_bitmask.load(Ordering::Relaxed); + if claimed_bitmask == u32::MAX { + return Vec::new(); + } + + // Build a mask of up to `n` free seats by walking the complement. + let mut enrolled_bitmask = 0; + let mut available_bitmask = !claimed_bitmask; + for _ in 0..n { + if available_bitmask == 0 { + break; + } + // Isolate the lowest available bit and add it to the enrollment + // bits + enrolled_bitmask |= + available_bitmask & available_bitmask.wrapping_neg(); + // Remove that bit from the available bits + available_bitmask &= available_bitmask - 1; + } + + // Attempt to claim all selected seats in one atomic step. + if self + .claimed_bitmask + .compare_exchange( + claimed_bitmask, + claimed_bitmask | enrolled_bitmask, + Ordering::Relaxed, + Ordering::Relaxed, + ) + .is_ok() + { + return (0..32) + .filter(|&i| enrolled_bitmask & (1 << i) != 0) + .map(|seat_number| Membership { + thread_pool: self, + member_index: seat_number as usize, + member_data, + }) + .collect(); + } + } + } + + /// Blocks workers from resigning from the pool. + /// + /// Each call to this function must be paired with exactly one following + /// call to `unfreeze_membership`. + /// + /// Returns a bitset indicating which memberships are claimed. These members + /// are guatenteed to remain in the pool at least until the corresponding + /// call to `unfreeze_membership`. + pub fn freeze_membership(&'static self) -> u32 { + // Increment the freeze counter, which will cause new resignation + // requests to be rejected. + // + // Note, this will break if we exceed 67,108,863 simultaneous + // broadcasts, because we will overflow into the bits for + // ongoing-resignations. I expect us to always run out of memory before + // that happens, so this case is not handled. 
+ let mut resignations = + self.resignations.fetch_add(1, Ordering::Relaxed); + + // Wait for any ongoing resignations to complete. + while (resignations >> 26) != 0 { + atomic_wait::wait(&self.resignations, resignations); + resignations = self.resignations.load(Ordering::Relaxed); + } + + // Synchronizes with the `Release` store done by workers when they + // complete their resignation, ensuring that the following load of + // `claimed_bitmask` will properly reflect any recent resignations. + fence(Ordering::Acquire); + + // Return the frozen membership bitmask + self.claimed_bitmask.load(Ordering::Relaxed) + } + + /// Unblocks workers from resigning from the pool. + /// + /// Each call to this function must be paired with exactly one preceding + /// call to `unfreeze_membership`. + pub fn unfreeze_membership(&'static self) { + // Decrement the freeze counter, to allow new resignations to be + // accepted. + let resignations = self.resignations.fetch_sub(1, Ordering::Acquire); + + // If this was the last active freeze, wake up any threads that want to + // resign. + if resignations == 1 { + let wants_to_resign = self.wants_to_resign.load(Ordering::Relaxed); + for member_index in wants_to_resign.iter_bits() { + self.get_member_data().semaphores[member_index].signal(); + } + } + } +} + +thread_local! { + static WORKER_PTR: Cell<*const Worker> = const { Cell::new(ptr::null()) }; +} + +const REJECTION_MASK: u32 = (1u32 << 26) - 1; + +impl Membership { + /// Returns this worker's index within its thread pool. + /// + /// The index is stable for the lifetime of the membership and is unique + /// among concurrent members of the same pool. + #[inline(always)] + pub fn member_index(&self) -> usize { + self.member_index + } + + /// Temporarily sets the thread's worker. [`Worker::with_current`] always + /// returns a reference to the worker set up by the most recent call to + /// `activate`. + /// + /// Rust's thread locals are fairly costly, so this function is expensive. 
+ /// If you can avoid calling it, do so. + #[inline(always)] + pub fn activate(self, f: F) -> R + where + F: FnOnce(&Worker) -> R, + { + let worker = Worker { + migrated: Cell::new(false), + membership: self, + fifo_queue: JobQueue::new(), + lifo_queue: JobQueue::new(), + nonsend_fifo_queue: Arc::new(SegQueue::new()), + rng: XorShift64Star::new(), + last_promote_tick: Cell::new(0), + _phantom: PhantomData, + }; + + // Swap the local pointer to point to the newly allocated worker. + let outer_ptr = WORKER_PTR.with(|ptr| ptr.replace(&worker)); + + // Run the function within the context created by the worker pointer, + // and pass in a worker reference directly. + let result = f(&worker); + + // Indicate that we want to resign. + worker + .thread_pool + .wants_to_resign + .fetch_or(1 << worker.member_index, Ordering::Relaxed); + + // Wait for all local work to complete, and our resignation to be + // accepted. + loop { + if worker.yield_local() == Yield::Idle { + // Attempt to submit our resignation + let mut resignations = + worker.thread_pool.resignations.load(Ordering::Relaxed); + if resignations & REJECTION_MASK == 0 { + match worker.thread_pool.resignations.compare_exchange( + resignations, + resignations + (1 << 26), + Ordering::Relaxed, + Ordering::Relaxed, + ) { + // Resignation accepted + Ok(_) => break, + // Resignation rejected due to ongoing broadcast + Err(err) if err & REJECTION_MASK != 0 => worker.wait(), + // Resignation conflicted with other worker, try again + Err(err) => resignations = err, + } + } + } + } + + // Indicate we are no longer waiting to resign. + worker + .thread_pool + .wants_to_resign + .fetch_and(!(1 << worker.member_index), Ordering::Relaxed); + + // Drop the worker, which will also free the claimed membership. + let thread_pool = worker.thread_pool; + drop(worker); + + // Complete the resignation + loop { + let resignations = thread_pool.resignations.load(Ordering::Relaxed); + // Try to decrement the resignations count. 
This uses `Release` + // ordering so that the the `claimed_bitmask` store done as part of + // `drop(worker)` appears to any thread waiting for this resignation + // to complete. + if thread_pool + .resignations + .compare_exchange_weak( + resignations, + resignations - (1 << 26), + Ordering::Release, + Ordering::Relaxed, + ) + .is_ok() + { + // If this was the last ongoing resignation, wake up any waiting + // broadcasts. + if (resignations >> 26) - 1 == 0 { + atomic_wait::wake_all(&*thread_pool.resignations); + } + // Exit the CAS loop + break; + } + } + + // Swap back to pointing to the previous value (possibly null). + WORKER_PTR.with(|ptr| ptr.set(outer_ptr)); + + // Return the intermediate values created while running the closure, + // namely the result and any jobs still remaining on the local queue. + result + } + + /// Returns a reference to the push-side `Sharer` queue for this + /// worker's seat. + #[inline(always)] + fn sharing_queue(&self) -> &'static Sharer { + &self.member_data.sharers[self.member_index] } - /// Blocks the thread waiting for a future to complete. - /// - /// See also: [`Worker::block_on`] and [`block_on`]. + /// Returns a reference to a worker's local inbox (where !Send future + /// wakeups and broadcasts are transmitted). #[inline(always)] - pub fn block_on(&'static self, future: F) -> T - where - F: Future + Send, - T: Send, - { - self.on_worker(|worker| worker.block_on(future)) + fn broadcast_queue(&self) -> &'static SegQueue { + &self.member_data.broadcasts[self.member_index] } - /// Executes the two closures, possibly in parallel. - /// - /// See also: [`Worker::join`] and [`join`]. + /// Returns a reference to a worker's local inbox (where !Send future + /// wakeups and broadcasts are transmitted). 
#[inline(always)] - pub fn join(&'static self, a: A, b: B) -> (RA, RB) - where - A: FnOnce(&Worker) -> RA + Send, - B: FnOnce(&Worker) -> RB + Send, - RA: Send, - RB: Send, - { - self.on_worker(|worker| worker.join(a, b)) + fn semaphore(&self) -> &'static Semaphore { + &self.member_data.semaphores[self.member_index] } - /// Creates a scope onto which non-static work can be spawned. - /// - /// For more complete docs, see [`scope`]. If you have a reference to a - /// worker, you should call [`Worker::scope`] instead. - #[inline(always)] - pub fn scope<'env, F, T>(&'static self, f: F) -> T - where - F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T + Send, - T: Send, - { - self.on_worker(|worker| worker.scope(f)) + /// Waits for a signal on this member's semaphore. + fn wait(&self) { + let semaphore = self.semaphore(); + semaphore + .wait(1 << self.member_index, &self.thread_pool.waiting_bitmask); + } +} + +impl Drop for Membership { + fn drop(&mut self) { + // Release the claim on this membership. + self.thread_pool + .claimed_bitmask + .fetch_and(!(1 << self.member_index), Ordering::Relaxed); + // In case another thread is waiting for a membership slot to free + // up, issue a wake on the bitmask. + atomic_wait::wake_one(&*self.thread_pool.claimed_bitmask); } } // ----------------------------------------------------------------------------- // Worker context -thread_local! { - static WORKER_PTR: Cell<*const Worker> = const { Cell::new(ptr::null()) }; -} - /// Represents membership in a thread pool. /// -/// To get access to worker for a given thread pool, users should call -/// [`ThreadPool::with_worker`], [`ThreadPool::on_worker`], [`ThreadPool::expect_worker`] +/// To get access to a worker for a given thread pool, users should call +/// [`ThreadPool::with_worker`]. /// /// Every thread has at most one worker at a time. If a worker has already been /// set up, it may be accessed at any time by calling [`Worker::with_current`]. 
-/// A thread's worker can also be manually overridden by claiming a lease -/// ([`ThreadPool::claim_lease`]) and passing it to [`Worker::occupy`]. The -/// worker returned by `with_current` always represents the lease most recently -/// occupied in the call stack. +/// A thread's worker can also be manually set up by claiming a membership +/// ([`ThreadPool::try_enroll`]) and passing it to [`Membership::activate`]. The +/// worker returned by `with_current` always represents the membership most +/// recently activated in the call stack. /// -/// Every worker belongs to exactly one thread pool, and must hold a "lease" on -/// one of the shared slots within that pool. +/// Every worker belongs to exactly one thread pool, and must hold a membership +/// in one of the shared slots within that pool. /// /// Workers have one core memory-safety guarantee: Any jobs added to the worker /// will eventually be executed. pub struct Worker { - migrated: Cell, - lease: Lease, + /// Registers the worker as belonging to a specific thread pool, and + /// potentially also grants "membership" on that thread-pool. + membership: Membership, /// A sequence of jobs waiting to be executed. Newer jobs are executed /// before older ones, allowing efficient depth-first execution. During /// promotion, the oldest job is shared. Populated by `join()`. @@ -784,13 +1164,40 @@ pub struct Worker { /// /// Jobs in this queue are executed only when the lifo queue is empty. pub(crate) fifo_queue: JobQueue, + /// A sequence of `!Sendf` jobs waitging to be executed. Older jobs are + /// executed before newer ones. + /// + /// This queue does not participate in promotion. This is a `SeqQueue` so + /// that a `Future` that is `!Send` and has been spawned onto this thread + /// can be woken on another thread (the other thread then sends this thread + /// a job that polls the future). + nonsend_fifo_queue: Arc>, + /// A local psudorandom number-generator. 
Used to spread out + /// worker-to-worker operations evenly across the pool. rng: XorShift64Star, + /// The CPU tick when work was last promoted from local to shared. This has + /// no absolute relation to time. last_promote_tick: Cell, - // Make non-send. + /// Set to true when executing a job that came from a different thread. + migrated: Cell, + // Make non-send. A `Worker` represents the local state of a particular + // thread, so must be `!Send` and `!Sync`. It is already `!Sync` because of + // `Cell`. _phantom: PhantomData<*const ()>, } -/// Describes the outcome of a call to [`Worker::yield_now`] or [`Worker::yield_local`]. +use core::ops::Deref; + +impl Deref for Worker { + type Target = Membership; + + fn deref(&self) -> &Membership { + &self.membership + } +} + +/// Describes the outcome of a call to [`Worker::yield_now`] or +/// [`Worker::yield_local`]. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Yield { /// Indicates that a job was executed. @@ -802,64 +1209,6 @@ pub enum Yield { } impl Worker { - /// Temporarily sets the thread's worker. [`Worker::with_current`] always - /// returns a reference to the worker set up by the most recent call to - /// `occupy`. - /// - /// Rust's thread locals are fairly costly, so this function is expensive. - /// If you can avoid calling it, do so. - #[inline(always)] - pub fn occupy(lease: Lease, f: F) -> R - where - F: FnOnce(&Worker) -> R, - { - trace!("occupying lease"); - - let span = trace_span!("occupy", seat_number = lease.seat_number); - let _enter = span.enter(); - - // Create a new worker to occupy the lease. Note: It's potentially a - // problem that the same thread can occupy multiple workers on the same - // thread. We may eventually need to design something to prevent this. 
- let worker = Worker { - migrated: Cell::new(false), - lease, - fifo_queue: JobQueue::new(), - lifo_queue: JobQueue::new(), - rng: XorShift64Star::new(), - last_promote_tick: Cell::new(0), - _phantom: PhantomData, - }; - - // Swap the local pointer to point to the newly allocated worker. - let outer_ptr = WORKER_PTR.with(|ptr| ptr.replace(&worker)); - - // Run the function within the context created by the worker pointer, - // and pass in a worker reference directly. - let result = f(&worker); - - // Finish executing local work before shutting down. - while let Some(job_ref) = worker.find_local_work() { - worker.execute(job_ref, false); - } - - // Swap back to pointing to the previous value (possibly null). - WORKER_PTR.with(|ptr| ptr.set(outer_ptr)); - - trace!("vacating lease"); - - // Return the intermediate values created while running the closure, - // namely the result and any jobs still remaining on the local queue. - result - } - - /// Returns a reference to the push-side `Sharer` queue for this - /// worker's seat. - #[inline(always)] - fn sharer(&self) -> &Sharer { - &self.lease.seats.sharers[self.lease.seat_number] - } - /// Calls the provided closure on the thread's worker instance, if it has /// one. If this thread is not registered as a worker, the closure is not /// called. @@ -871,18 +1220,18 @@ impl Worker { let worker_ptr = WORKER_PTR.with(Cell::get); if !worker_ptr.is_null() { // SAFETY: `WORKER_PTR` is a thread-local `Cell` holding a raw - // pointer to a `Worker`. It is only written to by `Worker::occupy`, - // which stores the address of a `Worker` allocated within it's own - // stack frame. Before it returns, `occupy` restores the previous - // value of `WORKER_PTR`, so that it is always either null or points - // to a live, immovable `Worker` on the current thread's call stack - // (but is never left dangling). + // pointer to a `Worker`. 
It is only written to by + // `Membership::activate`, which stores the address of a `Worker` + // allocated within it's own stack frame. Before it returns, + // `activate` restores the previous value of `WORKER_PTR`, so that + // it is always either null or points to a live, immovable `Worker` + // on the current thread's call stack (but is never left dangling). // // If the pointer is non-null, it is therefore valid to dereference // as a shared reference. Forming a `'static` reference is avoided // by passing the value into a closure, which bounds the reference's // lifetime to the closure body and prevents callers from retaining - // it past the point where `occupy` returns and the `Worker` is + // it past the point where `activate` returns and the `Worker` is // freed. Some(f(unsafe { &*worker_ptr })) } else { @@ -900,35 +1249,30 @@ impl Worker { { let worker_ptr = WORKER_PTR.with(Cell::get); if !worker_ptr.is_null() { - // SAFETY: The `WORKER` static is only set by `occupy`, and it's - // always set to a stack-allocated `Worker` which is never moved and - // is only accessed through shared references. Therefore, if the - // pointer is non-null, it must be safe to dereference. + // SAFETY: `WORKER_PTR` is a thread-local `Cell` holding a raw + // pointer to a `Worker`. It is only written to by + // `Membership::activate`, which stores the address of a `Worker` + // allocated within its own stack frame. Before it returns, + // `activate` restores the previous value of `WORKER_PTR`, so that + // it is always either null or points to a live, immovable `Worker` + // on the current thread's call stack (but is never left dangling). // - // This creates a reference with an unbounded lifetime. To avoid - // turning it into a `'static`, we pass it in to a closure. This - // restricts its lifetime to the closure body, and prevents callers - // from keeping around references to Workers that will be - // deallocated when `occupy` returns. 
+ // If the pointer is non-null, it is therefore sound to dereference + // as a shared reference. Forming a `'static` reference is avoided + // by passing the value into a closure, which bounds the reference's + // lifetime to the closure body and prevents callers from retaining + // it past the point where `activate` returns and the `Worker` is + // freed. f(Some(unsafe { &*worker_ptr })) } else { f(None) } } - /// Returns this worker's seat index within the pool (0–31). - /// - /// Seat numbers may be re-used by different workers at different times, and - /// may not be contiguous or ordered. - #[inline(always)] - pub fn seat_number(&self) -> usize { - self.lease.seat_number - } - /// Returns the thread pool this worker belongs to. #[inline(always)] pub fn thread_pool(&self) -> &'static ThreadPool { - self.lease.thread_pool + self.thread_pool } /// Capacity of the per-worker work-stealing queue. This is the maximum @@ -945,8 +1289,10 @@ impl Worker { // Promotions are fairly costly, so we limit their frequency using the // cpu's instruction counter. Promote is called at a high frequency, and // actually doing the promotion is probably a cold path. - let current_tick = hotclock::Instant::now().as_raw(); - if current_tick.wrapping_sub(self.last_promote_tick.get()) >= Self::PROMOTE_TICK_INTERVAL { + let current_tick = ticks(); + if current_tick.wrapping_sub(self.last_promote_tick.get()) + >= Self::PROMOTE_TICK_INTERVAL + { // This should ideally become a conditional jump. self.promote_cold(current_tick); } @@ -959,8 +1305,9 @@ impl Worker { self.last_promote_tick.set(current_tick); // Early out if it seems like all workers are already awake. - let sleeping = self.lease.thread_pool.sleeping.load(Ordering::Relaxed); - if sleeping == 0 { + let waiting_bitmask = + self.thread_pool().waiting_bitmask.load(Ordering::Relaxed); + if waiting_bitmask == 0 { return; } cold_path(); @@ -972,7 +1319,7 @@ impl Worker { // (and therefore theoretically "large") tasks shared first. 
if let Some(job_ref) = self.lifo_queue.pop_oldest() { // Push into our own steal queue so siblings can steal it. - if let Err(job_ref) = self.sharer().push(job_ref) { + if let Err(job_ref) = self.sharing_queue().push(job_ref) { // If the queue is full, that indicates that the pool is // probably under high-load and we should continue local-first // operation. @@ -999,7 +1346,7 @@ impl Worker { // value, and so trivially outlives the newly created `JobRef`. let batch_job_ref = unsafe { batch_job.into_job_ref() }; // Push the batch job into the steal queue so siblings can steal it. - if let Err(job_ref) = self.sharer().push(batch_job_ref) { + if let Err(job_ref) = self.sharing_queue().push(batch_job_ref) { // If the queue is full, that indicates that the pool is // probably under high-load and we should continue local-first // operation. @@ -1014,20 +1361,21 @@ impl Worker { // If we added work to the steal queue, wake a random sibling to steal // it from us, while we do other work. if shared_job { - self.wake_random(sleeping); + self.signal_random(waiting_bitmask); } } /// Tries to wake a random sleeping worker. Expects to be given a bitset of /// sleeping workers. #[inline(always)] - fn wake_random(&self, sleeping: u32) { + fn signal_random(&self, sleeping: u32) { let offset = self.rng.next_usize(32) as u32; let mut randomized_sleeping = sleeping.rotate_right(offset); while randomized_sleeping != 0 { let index = (randomized_sleeping.trailing_zeros() + offset) % 32; randomized_sleeping &= randomized_sleeping - 1; // Clear the lowest bit - let woken = self.lease.seats.sleep_controllers[index as usize].wake(); + let woken = + self.membership.member_data.semaphores[index as usize].signal(); if woken { return; } @@ -1037,11 +1385,7 @@ impl Worker { /// Create a new latch owned by the worker. 
#[inline(always)] pub fn new_latch(&self) -> Latch { - Latch::new( - self.lease.seat_number, - &self.lease.thread_pool.sleeping, - &self.lease.seats.sleep_controllers[self.lease.seat_number], - ) + Latch::new(self.semaphore()) } /// Runs jobs until the provided latch is set. @@ -1049,33 +1393,60 @@ impl Worker { /// The thread may go to sleep if it runs out of work to do, but will wake /// when the latch is set or more work becomes available. #[inline(always)] - pub fn wait_for(&self, latch: &Latch) { - while !latch.check() { - if self.yield_now() == Yield::Idle { - latch.wait(); + pub fn wait_for(&self, latch: &Latch) -> bool { + loop { + match latch.check() { + Status::Pending => { + if self.yield_now() == Yield::Idle { + let member_bitmask = 1 << self.membership.member_index; + let waiting_bitmask = + &self.membership.thread_pool.waiting_bitmask; + latch.wait(member_bitmask, waiting_bitmask); + } + } + Status::Ok => return false, + Status::Error => return true, } } } - /// Finds a job to work on. This function is entirely local, and does no - /// synchronization with the queue. + /// Finds a job to work on. This function is almost entirely local, but does + /// a small amount of synchronization to allow for !Send futures that must + /// be polled on this thread but are woken on a different thread. + /// + /// Work is prioritized as follows: + /// 1. Pull from the LIFO queue (`join` calls) + /// 2. Pull from the !Send FIFO queue (`spawn_local` calls) + /// 3. Pull from the regular FIFO queue (`spawn` calls) #[inline(always)] fn find_local_work(&self) -> Option { - self.lifo_queue - .pop_newest() + (self.lifo_queue.pop_newest()) + .or_else(|| self.nonsend_fifo_queue.pop()) .or_else(|| self.fifo_queue.pop_oldest()) } - /// Finds a job to work on. This tries - /// [`find_local_work`][Worker::find_local_work] first, then falls back to - /// pulling shared work from the thread pool. + /// Finds a job to work on. + /// + /// Work is prioritized as follows: + /// 1. 
Pull from the LIFO queue (`join` calls) + /// 2. Pull from the !Send FIFO queue (`spawn_local` calls) + /// 3. Pull from the regular FIFO queue (`spawn` calls) + /// 4. Pull from the broadcast queue (`broadcast` calls) + /// 5. Reclaim work shared by this worker + /// 6. Steal work shared from other workers + /// 7. Read from the global queue (external calls) + /// + /// If work is found in the last two cases, it is treated as having been + /// "migrated" to this thread. #[inline(always)] fn find_work(&self) -> Option<(JobRef, bool)> { - self.find_local_work() - .map(|job| (job, false)) - .or_else(|| self.sharer().pop().map(|job| (job, false))) + (self.find_local_work().map(|job| (job, false))) + .or_else(|| self.broadcast_queue().pop().map(|job| (job, false))) + .or_else(|| self.sharing_queue().pop().map(|job| (job, false))) .or_else(|| self.steal_from_siblings().map(|job| (job, true))) - .or_else(|| self.claim_shared_job().map(|job| (job, true))) + .or_else(|| { + self.thread_pool.shared_queue.pop().map(|job| (job, true)) + }) } /// Attempts to steal a job from another worker's work-stealing queue. @@ -1084,13 +1455,16 @@ impl Worker { /// the same victim. Because stealers are pre-allocated and permanent, no /// lock or atomic load is needed to access them. fn steal_from_siblings(&self) -> Option { - let stealers = &self.lease.seats.stealers; - let occupied = self.lease.thread_pool.occupied.load(Ordering::Relaxed); - let my_seat = self.lease.seat_number as u32; + let my_member_index = self.membership.member_index; + let my_sharer = self.sharing_queue(); + let stealers = &self.membership.member_data.stealers; + let claimed_bitmask = + self.thread_pool().claimed_bitmask.load(Ordering::Relaxed); - // Randomise the starting position so all workers get a fair shot as victims. + // Randomize the starting position so all workers get a fair shot as victims. 
let offset = self.rng.next_usize(32) as u32; - let mut bits = (occupied & !(1u32 << my_seat)).rotate_right(offset); + let mut bits = + (claimed_bitmask & !(1u32 << my_member_index)).rotate_right(offset); while bits != 0 { let shifted_idx = bits.trailing_zeros(); @@ -1101,7 +1475,7 @@ impl Worker { // `steal_and_pop` returns one job directly and moves up to half the // remaining items into our steal queue for later use. loop { - match stealer.steal_and_pop(self.sharer(), |n| n / 2) { + match stealer.steal_and_pop(my_sharer, |n| n / 2) { Ok((job, _)) => return Some(job), Err(StealError::Busy) => {} // transient; retry Err(StealError::Empty) => break, @@ -1111,12 +1485,6 @@ impl Worker { None } - /// Claims a job from the global injector queue. - #[inline(always)] - fn claim_shared_job(&self) -> Option { - self.lease.thread_pool.shared_jobs.pop() - } - /// Cooperatively yields execution to the thread pool, allowing it to execute /// some work. /// @@ -1189,12 +1557,13 @@ impl Worker { // Worker operations impl Worker { - /// Spawns work (a closure or future) onto the thread pool. Just like a - /// standard thread, this work executes concurrently (and potentially in - /// parallel) to the place where it is spawned. It is not tied to the - /// current stack frame, and hence it cannot hold any references other than - /// those with `'static` lifetime. If you want to spawn a task that - /// references stack data, use the [`scope`], [`ThreadPool::scope`] or + /// Runs work (a closure or future) in the background. + /// + /// Just like a standard thread, this work executes concurrently (and + /// potentially in parallel) to the place where it is spawned. It is not + /// tied to the current stack frame, and hence it cannot hold any references + /// other than those with `'static` lifetime. If you want to spawn a task + /// that references stack data, use the [`scope`], [`ThreadPool::scope`] or /// [`Worker::scope`] functions. 
/// /// Since tasks spawned with this function cannot hold references into the @@ -1204,17 +1573,46 @@ impl Worker { /// /// If you do not have access to a [`Worker`], you may call /// [`ThreadPool::spawn`] or simply [`spawn`]. + /// + /// # Panics + /// + /// The panic behavior depends on the type of work being spawned: + /// + /// * If a closure panics, it will be caught and ignored. + /// + /// * If a future panics, the [`Task`] will panic when awaited. + /// #[inline] pub fn spawn>(&self, work: S) -> S::Output { - work.spawn(self.lease.thread_pool, Some(self)) + work.spawn(self.thread_pool, Some(self)) + } + + /// Runs work (a closure or future) in the background of this thread. + /// + /// This is quite similar to [`spawn`](Worker::spawn), except that the work + /// may be `!Send` and will only run on the current thread. If your work is + /// `Send`, consider using [`spawn`](Worker::spawn) instead. + /// + /// # Panics + /// + /// The panic behavior depends on the type of work being spawned: + /// + /// * If a closure panics, it will be caught and ignored. + /// + /// * If a future panics, the [`Task`] will panic when awaited. + /// + #[inline] + pub fn spawn_local>(&self, work: S) -> S::Output { + work.spawn_local(self) } - /// Polls a future to completion, then returns the outcome. This function - /// will prioritize polling the future as soon as it becomes available, and - /// while the future is not available it will try to do other meaningful - /// work from the thread-pool. If the thread pool runs out of work, the - /// thread is suspended until the future completes or more background work - /// becomes available. + /// Polls a future to completion, then returns the outcome. + /// + /// This function will prioritize polling the future as soon as it becomes + /// available, and while the future is not available it will try to do other + /// meaningful work from the thread-pool. 
If the thread pool runs out of + /// work, the thread is suspended until the future completes or more + /// background work becomes available. /// /// # Async & Concurrency /// @@ -1242,7 +1640,7 @@ impl Worker { /// /// # Panics /// - /// If the future panics, this immediately panics. + /// If the future panics, this panics. #[inline(always)] pub fn block_on(&self, future: F) -> T where @@ -1300,7 +1698,7 @@ impl Worker { /// # THREAD_POOL.resize_to_available(); /// /// let mut v = vec![5, 1, 8, 22, 0, 44]; - /// THREAD_POOL.on_worker(|worker| quick_sort(worker, &mut v)); + /// THREAD_POOL.with_worker(|worker| quick_sort(worker, &mut v)); /// assert_eq!(v, vec![0, 1, 5, 8, 22, 44]); /// /// fn quick_sort(worker: &Worker, v: &mut [T]) { @@ -1339,7 +1737,7 @@ impl Worker { /// # THREAD_POOL.resize_to_available(); /// /// let tree = gen_tree(8); - /// let result = THREAD_POOL.on_worker(|worker| sum(worker, &tree)); + /// let result = THREAD_POOL.with_worker(|worker| sum(worker, &tree)); /// assert_eq!(result, 255); /// /// struct Node { @@ -1407,15 +1805,34 @@ impl Worker { // Allocate a job to run the closure `a` on the stack. It is vital to // the correctness of this function that this stack-job never move until // it is freed. - let mut stack_job = StackJob::new(a, self); - - // SAFETY: The `StackJob` is allocated on the stack just above, is never - // moved, and so will live for the entirety of this function in the same - // memory location. If closure `a` closes over data, that must be valid - // for the lifetime of this function as well. The `JobRef` cannot - // outlive either, because it is guaranteed to be executed before the - // function returns. We also clearly never create more than one `JobRef` - // using the `stack_job`. + let stack_job = StackJob::new(a, self.new_latch()); + + // SAFETY: We are only allowed to create a `JobRef` to this `StackJob` + // if we can show that... + // + // * `as_job_ref` is called at most once for this `StackJob`. 
+ // + // The `StackJob` is only accessible in this function (it is + // created here, dropped here, and no direct references escape + // this scope), and within this function we only call `as_job_ref` + // once. + // + // * The `StackJob` will not be moved or dropped until either: + // + // A. A call to `check` on the enclosed `Latch` returns something + // other than `Pending`. + // + // B. The `JobRef` is dropped without `execute` being called. + // + // If `recover_newest` returns `true`, then the `JobRef` must have + // been dropped without `execute` being called (satisfying B). + // + // If `recover_newest` returns `false`, then we call `wait_for`, which + // will not allow the function to progress until `check` returns + // something other than `Pending` (satisfying A). + // + // In either case, we cannot move or drop the `StackJob` until we pass + // the branch marked with "(*)". We clearly do not. let job_ref = unsafe { stack_job.as_job_ref() }; // Store the id of the `JobRef` for later, when we will need it to @@ -1438,23 +1855,35 @@ impl Worker { // Attempt to recover the job from the queue. It should still be there // if we didn't share it. if self.lifo_queue.recover_newest(job_ref_id) { - // SAFETY: Because the ids match, the JobRef we just popped from - // the queue must point to `stack_job`, implying that - // `stack_job` cannot have been executed yet. - let a = unsafe { stack_job.unwrap() }; + // (*) + // SAFETY: Because the ids match, the JobRef we just popped from the + // queue must point to `stack_job`, implying that `stack_job` cannot + // have been executed yet, and `JobRef::execute` will never be + // called. + let a = unsafe { stack_job.unwrap_func() }; // Execute the closure directly and return the results. This is // allows the compiler to inline and optimize `a`. result_a = unwind::halt_unwinding(|| a(self)); } else { // Wait for the job to complete. 
- self.wait_for(stack_job.completion_latch()); - // SAFETY: The job must be complete, because we just waited on the latch. - result_a = unsafe { stack_job.return_value() }; + if self.wait_for(stack_job.completion_latch()) { + // SAFETY: Since `wait_for` returned `true`, a `check` must have + // returned `Error`. + let error = unsafe { stack_job.unwrap_error() }; + result_a = Err(error); + } else { + // SAFETY: Since `wait_for` returned `false`, a `check` must have + // returned `Ok`. + let output = unsafe { stack_job.unwrap_output() }; + result_a = Ok(output); + } } // Resume unwinding if either job panicked. match (result_a, result_b) { - (Err(error), _) | (_, Err(error)) => unwind::resume_unwinding(error), + (Err(error), _) | (_, Err(error)) => { + unwind::resume_unwinding(error) + } (Ok(value_a), Ok(value_b)) => (value_a, value_b), } } @@ -1477,7 +1906,7 @@ impl Worker { /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); - /// # THREAD_POOL.expect_worker(|worker| { + /// # THREAD_POOL.with_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1504,7 +1933,7 @@ impl Worker { /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); - /// # THREAD_POOL.expect_worker(|worker| { + /// # THREAD_POOL.with_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1520,17 +1949,17 @@ impl Worker { /// # }); /// ``` /// - /// While this works, it could be a problem if we want to use `ok` elsewhere. - /// There are two choices. We can keep the closure as a `move` closure, but - /// instead of referencing the variable `ok`, we create a shadowed variable that - /// is a borrow of `ok` and capture *that*: + /// While this works, it could be a problem if we want to use `ok` + /// elsewhere. There are two choices. 
We can keep the closure as a `move` + /// closure, but instead of referencing the variable `ok`, we create a + /// shadowed variable that is a borrow of `ok` and capture *that*: /// /// ```rust /// # use forte::ThreadPool; /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); - /// # THREAD_POOL.expect_worker(|worker| { + /// # THREAD_POOL.with_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1549,15 +1978,15 @@ impl Worker { /// # }); /// ``` /// - /// Another option is not to use the `move` keyword but instead to take ownership - /// of individual variables: + /// Another option is not to use the `move` keyword but instead to take + /// ownership of individual variables: /// /// ```rust /// # use forte::ThreadPool; /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); - /// # THREAD_POOL.expect_worker(|worker| { + /// # THREAD_POOL.with_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1576,15 +2005,15 @@ impl Worker { /// /// # Referencing the scope /// - /// The scope passed into the closure is not allowed to leak out of this call. - /// In other words, this will fail to compile: + /// The scope passed into the closure is not allowed to leak out of this + /// call. 
In other words, this will fail to compile: /// /// ```compile_fail /// # use forte::ThreadPool; /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); - /// # THREAD_POOL.expect_worker(|worker| { + /// # THREAD_POOL.with_worker(|worker| { /// let mut leak = None; /// forte::scope(|scope| { /// leak = Some(scope); // <-- ERROR: scope would be leaked here @@ -1601,7 +2030,7 @@ impl Worker { /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); - /// # THREAD_POOL.expect_worker(|worker| { + /// # THREAD_POOL.with_worker(|worker| { /// let mut counter = 0; /// let counter_ref = &mut counter; /// forte::scope(|scope| { @@ -1669,79 +2098,264 @@ impl Worker { { with_scope(self, f) } + + /// Runs the same operation across multiple threads, and returns a vector of + /// results. + /// + /// Each worker receives a [`Broadcast`] struct, telling it how many threads + /// are participating in the broadcast, and its index among those + /// participants. The operation may return a different result on each + /// thread, and those results are collected into a vector, ordered according + /// to broadcast index. If you don't care about getting results back, + /// consider using [`spawn_broadcast`](Worker::spawn_broadcast) instead. + /// + /// A worker runs a pending broadcast after it has finished its local work + /// queues, but before it attempts to steal work from other threads. Forte will + /// generally try to run the broadcast on as many threads as possible, but + /// is not guaranteed to actually use all of them. + /// + /// While a broadcast is running, workers are temporarily forbidden from + /// leaving the pool. + /// + /// # Panics + /// + /// If the operation panics on one or more threads, exactly one panic will + /// be propagated, only after all threads have completed (or themselves + /// panicked).
+ #[inline(always)] + pub fn broadcast(&self, f: F) -> Vec + where + F: for<'w> Fn(Broadcast<'w>) -> T + Sync, + T: Send, + { + // Prevent workers from leaving the pool, and read the membership bitset + // once it's frozen. + let members = self.thread_pool.freeze_membership(); + let participants = members.count_ones() as usize; + + // Create a new stack job for every member. + let jobs: Vec<_> = members + .iter_bits() + .enumerate() + .map(|(i, member_index)| { + let func = &f; + let op = move |worker: &Worker| { + func(Broadcast { + worker, + index: i, + participants, + }) + }; + (member_index, StackJob::new(op, self.new_latch())) + }) + .collect(); + + // Send the broadcast to each member, and wake them up. + for (member_index, job) in &jobs { + // SAFETY: We are only allowed to create a `JobRef` for this + // `StackJob` if we can show that... + // + // * `as_job_ref` is called at most once for this `StackJob`. + // + // The `StackJob` is only accessible in this function (it is + // created here, dropped here, and no direct references escape + // this scope), and within this function we only call `as_job_ref` + // once. + // + // * The `StackJob` will not be moved or dropped until a call to + // `check` on the enclosed `Latch` returns something other than + // `Pending`. + // + // We call `wait_for` on each job's latch (marked with a *). This + // does not allow the function to progress while `check` returns + // `Pending`. No `StackJob` is moved or dropped until after this + // function has been called on every `StackJob` and returned. + let job_ref = unsafe { job.as_job_ref() }; + self.member_data.broadcasts[*member_index].push(job_ref); + self.member_data.semaphores[*member_index].signal(); + } + + // Wait for each job to finish. + let error_flags: Vec<_> = jobs + .iter() + .map(|(_, job)| self.wait_for(job.completion_latch())) // (*) + .collect(); + + // Allow workers to leave the pool again. 
+ self.thread_pool.unfreeze_membership(); + + // Collect and return results or propagate panics. + jobs.into_iter() + .zip(error_flags) + .map(|((_, job), error_flag)| { + if error_flag { + // SAFETY: If `error_flag` is `true` then `check` has + // returned `Error`. + let error = unsafe { job.unwrap_error() }; + unwind::resume_unwinding(error); + } else { + // SAFETY: If `error_flag` is `false` then `check` has + // returned `Ok`. + unsafe { job.unwrap_output() } + } + }) + .collect() + } + + /// Runs the same operation across multiple threads, without waiting for + /// results. + /// + /// Like [`broadcast`](Worker::broadcast), except it does not allow the + /// operation to return a result, and does not wait for the operation to + /// complete before continuing. + /// + /// The operation is moved into shared, reference-counted storage, so it + /// remains valid for jobs that run after this function has returned. + /// + /// # Panics + /// + /// Panics are not propagated. + #[inline(always)] + pub fn spawn_broadcast(&self, f: F) + where + F: for<'w> Fn(Broadcast<'w>) + Send + Sync + 'static, + { + // Prevent workers from leaving the pool, and read the membership bitset + // once it's frozen. + let members = self.thread_pool.freeze_membership(); + let participants = members.count_ones() as usize; + + // Prevent a deadlock if there are no workers. This should be + // impossible, but we will be defensive. + if participants == 0 { + cold_path(); + self.thread_pool.unfreeze_membership(); + return; + } + + // Move the operation into shared, reference-counted storage. Every job + // owns its own handle, so the operation stays alive until the last job + // has run, even though this function returns before that happens. + let f = Arc::new(f); + + // Send the broadcast to each member, and wake them up. + for (i, member_index) in members.iter_bits().enumerate() { + let func = Arc::clone(&f); + let op = move |worker: &Worker| { + // Run the job + (*func)(Broadcast { + worker, + index: i, + participants, + }); + }; + + let job = HeapJob::new(op); + + // SAFETY: `HeapJob::into_job_ref` has two preconditions: + // + // * The `JobRef` must not outlive any of the items closed over by + // the function `f`. + // + // The job owns its `Arc` handle to `f` (it borrows nothing from + // this stack frame), and `F: 'static`, so the `JobRef` cannot + // outlive its captured data. + // + // * If `F: !Send` then the `JobRef` must only be executed on this + // thread.
+ // + // `F: Send + Sync`, so the `Arc` handle (and therefore `op`) is + // `Send`, and this does not apply. + let job_ref = unsafe { job.into_job_ref() }; + self.member_data.broadcasts[member_index].push(job_ref); + self.member_data.semaphores[member_index].signal(); + } + + // Once we have finished pushing jobs out to workers who we know are not + // in the middle of resigning, we can allow resignations again. + self.thread_pool.unfreeze_membership(); + } } // ----------------------------------------------------------------------------- // Implicit worker registration api +/// A [`ThreadPool`] wrapper, used by the [`DEFAULT_POOL`]. +/// +/// This dereferences to a [`ThreadPool`]. The first time it is dereferenced, it +/// resizes itself to fill all available cores. +pub struct DefaultThreadPool { + thread_pool: &'static ThreadPool, + initialized: AtomicU32, +} + +impl Deref for DefaultThreadPool { + type Target = ThreadPool; + + fn deref(&self) -> &'static ThreadPool { + if self.initialized.swap(1, Ordering::Relaxed) == 0 { + self.thread_pool.resize_to_available(); + }; + self.thread_pool + } +} + +static DEFAULT_POOL_INNER: ThreadPool = ThreadPool::new(); + +/// The default thread pool. +/// +/// Unless you set up your own thread pool, this is where your operations run. +/// The first time this is dereferenced, it resizes itself to fill all available +/// cores. +pub static DEFAULT_POOL: DefaultThreadPool = DefaultThreadPool { + thread_pool: &DEFAULT_POOL_INNER, + initialized: AtomicU32::new(0), +}; + /// Runs the provided closure in the background. /// /// When executed on a thread that is currently registered as a worker (i.e. the -/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) -/// this is able to look up that registration and find the worker and +/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or +/// similar) this is able to look up that registration and find the worker and /// thread-pool implicitly.
/// +/// If not called within a thread pool, this uses the [`DEFAULT_POOL`]. +/// /// If you have a reference to a [`Worker`], it's better to use [`Worker::spawn`] /// instead. If you don't have a worker, but know which thread pool you want to /// use, [`ThreadPool::spawn`] is more appropriate. -/// -///
-///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
-///
-/// 
pub fn spawn>(work: S) -> S::Output { - Worker::with_current(|worker| { - worker - .expect("attempt to call `forte::spawn` from outside a thread pool") - .spawn(work) + Worker::with_current(|worker| match worker { + Some(worker) => worker.spawn(work), + None => DEFAULT_POOL.spawn(work), }) } /// Waits for a future to complete. /// /// When executed on a thread that is currently registered as a worker (i.e. the -/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) -/// this is able to look up that registration and find the worker and +/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or +/// similar) this is able to look up that registration and find the worker and /// thread-pool implicitly. /// +/// If not called within a thread pool, this uses the [`DEFAULT_POOL`]. +/// /// If you have a reference to a [`Worker`], it's better to use /// [`Worker::block_on`] instead. If you don't have a worker, but know which /// thread pool you want to use, [`ThreadPool::block_on`] is more appropriate. -/// -///
-///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
-///
-/// 
pub fn block_on(future: F) -> T where F: Future + Send, T: Send, { - Worker::with_current(|worker| { - worker - .expect("attempt to call `forte::block_on` from outside a thread pool") - .block_on(future) + Worker::with_current(|worker| match worker { + Some(worker) => worker.block_on(future), + None => DEFAULT_POOL.block_on(future), }) } /// Executes the two closures, possibly in parallel. /// /// When executed on a thread that is currently registered as a worker (i.e. the -/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) -/// this is able to look up that registration and find the worker and +/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or +/// similar) this is able to look up that registration and find the worker and /// thread-pool implicitly. /// +/// If not called within a thread pool, this uses the [`DEFAULT_POOL`]. +/// /// If you have a reference to a [`Worker`], it's better to use [`Worker::join`] /// instead. If you don't have a worker, but know which thread pool you want to /// use, [`ThreadPool::join`] is more appropriate. -/// -///
-///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
-///
-/// 
pub fn join(a: A, b: B) -> (RA, RB) where A: FnOnce(&Worker) -> RA + Send, @@ -1749,37 +2363,93 @@ where RA: Send, RB: Send, { - Worker::with_current(|worker| { - worker - .expect("attempt to call `forte::join` from outside a thread pool") - .join(a, b) + Worker::with_current(|worker| match worker { + Some(worker) => worker.join(a, b), + None => DEFAULT_POOL.join(a, b), }) } /// Creates a new scope for spawning non-static work. /// /// When executed on a thread that is currently registered as a worker (i.e. the -/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) -/// this is able to look up that registration and find the worker and +/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or +/// similar) this is able to look up that registration and find the worker and /// thread-pool implicitly. /// +/// If not called within a thread pool, this uses the [`DEFAULT_POOL`]. +/// /// If you have a reference to a [`Worker`], it's better to use /// [`Worker::scope`] instead. If you don't have a worker, but know which thread /// pool you want to use, [`ThreadPool::scope`] is more appropriate. +pub fn scope<'env, F, T>(f: F) -> T +where + F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, +{ + Worker::with_current(|worker| match worker { + Some(worker) => worker.scope(f), + None => DEFAULT_POOL.scope(f), + }) +} + +/// Runs an operation on multiple threads and returns a vector of results. /// -///
+/// When executed on a thread that is currently registered as a worker (i.e. the
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
+/// thread-pool implicitly.
 ///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
 ///
-/// 
-pub fn scope<'env, F, T>(f: F) -> T +/// If you have a reference to a [`Worker`], it's better to use +/// [`Worker::broadcast`] instead. If you don't have a worker, but know which +/// thread pool you want to use, [`ThreadPool::broadcast`] is more +/// appropriate. +pub fn broadcast(f: F) -> Vec where - F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, + F: for<'w> Fn(Broadcast<'w>) -> T + Sync, + T: Send, { - Worker::with_current(|worker| { - worker - .expect("attempt to call `forte::scope` from outside a thread pool") - .scope(f) + Worker::with_current(|worker| match worker { + Some(worker) => worker.broadcast(f), + None => DEFAULT_POOL.broadcast(f), + }) +} + +/// Runs an operation on multiple threads without waiting for results. +/// +/// When executed on a thread that is currently registered as a worker (i.e. the +/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or +/// similar) this is able to look up that registration and find the worker and +/// thread-pool implicitly. +/// +/// If not called within a thread pool, this uses the [`DEFAULT_POOL`]. +/// +/// If you have a reference to a [`Worker`], it's better to use +/// [`Worker::spawn_broadcast`] instead. If you don't have a worker, but know +/// which thread pool you want to use, [`ThreadPool::spawn_broadcast`] is more +/// appropriate. +pub fn spawn_broadcast(f: F) +where + F: for<'w> Fn(Broadcast<'w>) + Send + Sync + 'static, +{ + Worker::with_current(|worker| match worker { + Some(worker) => worker.spawn_broadcast(f), + None => DEFAULT_POOL.spawn_broadcast(f), + }); +} + +/// Returns the number of members participating in a thread-pool. +/// +/// When executed on a thread that is currently registered as a worker (i.e. the +/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or +/// similar) this is able to look up that registration and find the worker and +/// thread-pool implicitly.
+/// +/// If not called within a thread pool, this uses the [`DEFAULT_POOL`]. +pub fn num_members() -> usize { + Worker::with_current(|worker| match worker { + Some(worker) => worker.thread_pool().num_members(), + None => DEFAULT_POOL.num_members(), }) } @@ -1790,35 +2460,25 @@ where /// Operating on the principle that you should finish what you start before /// starting something new, workers will first execute their queue, then execute /// shared jobs, then pull new jobs from the injector. -fn managed_worker(lease: Lease, halt: Arc<AtomicBool>) { - trace!("starting managed worker"); - +fn managed_worker(membership: Membership, halt: Arc<AtomicBool>) { // Register as the indicated worker, and work until we are told to halt. - Worker::occupy(lease, |worker| { + membership.activate(|worker| { while !halt.load(Ordering::Relaxed) { - #[cfg(feature = "shuttle")] - shuttle::hint::spin_loop(); - - if let Some((job, migrated)) = worker.find_work() { - worker.execute(job, migrated); - } else { - worker.lease.seats.sleep_controllers[worker.lease.seat_number] - .sleep(worker.lease.seat_number, &worker.lease.thread_pool.sleeping); + if worker.yield_now() == Yield::Idle { + worker.wait(); } } }); - - trace!("exiting managed worker"); } // ----------------------------------------------------------------------------- // Tests -#[cfg(all(test, not(feature = "shuttle")))] +#[cfg(test)] mod tests { - use alloc::vec; use std::sync::mpsc::channel; + use std::vec; use super::*; @@ -1848,6 +2508,18 @@ mod tests { THREAD_POOL.depopulate(); } + #[test] + fn spawn_cancel_safety() { + static THREAD_POOL: ThreadPool = ThreadPool::new(); + THREAD_POOL.resize_to_available(); + + THREAD_POOL.with_worker(|worker| { + let _ = worker.spawn(core::future::pending::<()>()); + }); + + THREAD_POOL.depopulate(); + } + #[test] fn join_basic() { static THREAD_POOL: ThreadPool = ThreadPool::new(); @@ -1864,8 +2536,7 @@ mod tests { } #[test] - #[cfg(not(miri))] // This is too much for miri to handle - fn join_long() { + fn 
join_deep() { fn increment(worker: &Worker, slice: &mut [u32]) { match slice.len() { 0 => (), @@ -1873,7 +2544,10 @@ mod tests { _ => { let (head, tail) = slice.split_at_mut(1); - worker.join(|_| head[0] += 1, |worker| increment(worker, tail)); + worker.join( + |_| head[0] += 1, + |worker| increment(worker, tail), + ); } } } @@ -1881,16 +2555,15 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - let mut vals = [0; 1_024]; - THREAD_POOL.on_worker(|worker| increment(worker, &mut vals)); - assert_eq!(vals, [1; 1_024]); + let mut vals = [0; 800]; + THREAD_POOL.with_worker(|worker| increment(worker, &mut vals)); + assert_eq!(vals, [1; 800]); THREAD_POOL.depopulate(); } #[test] - #[cfg(not(miri))] // This is too much for miri to handle - fn join_very_long() { + fn join_wide() { fn increment(worker: &Worker, slice: &mut [u32]) { match slice.len() { 0 => (), @@ -1910,9 +2583,9 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - let mut vals = vec![0; 512 * 512]; - THREAD_POOL.on_worker(|worker| increment(worker, &mut vals)); - assert_eq!(vals, vec![1; 512 * 512]); + let mut vals = vec![0; 65_536]; + THREAD_POOL.with_worker(|worker| increment(worker, &mut vals)); + assert_eq!(vals, vec![1; 65_536]); THREAD_POOL.depopulate(); } diff --git a/src/time.rs b/src/time.rs new file mode 100644 index 0000000..451d5e7 --- /dev/null +++ b/src/time.rs @@ -0,0 +1,47 @@ +//! Architecture-specific timing functions, taken from +//! + +/// Read from the `cntvct_el0` register on Arm `AArch64`. +#[cfg(target_arch = "aarch64")] +#[inline(always)] +pub fn ticks() -> u64 { + use core::arch::asm; + let cnt: u64; + // SAFETY: `mrs cntvct_el0` only reads the architectural virtual counter + // register and does not touch memory or the stack. 
+ unsafe { + asm!( + "mrs {}, cntvct_el0", + out(reg) cnt, + options(nostack, nomem, preserves_flags) + ); + } + cnt +} + +/// Read from rdtime on RISC-V +#[cfg(target_arch = "riscv64")] +#[inline(always)] +pub fn ticks() -> u64 { + use core::arch::asm; + let cnt: u64; + // SAFETY: `rdtime` reads a timer CSR into a general-purpose register and does not access + // Rust memory. + unsafe { + asm!( + "rdtime {}", + out(reg) cnt, + options(nostack, nomem, preserves_flags) + ); + } + cnt +} + +/// Read the time-stamp counter (`rdtsc`) on x86 and x86_64 +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn ticks() -> u64 { + // SAFETY: `_rdtsc` emits the CPU counter read instruction and has no Rust memory safety + // preconditions. + unsafe { core::arch::x86_64::_rdtsc() } +} diff --git a/src/util.rs b/src/util.rs index 661a430..62352a3 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,19 +1,17 @@ use core::cell::Cell; use core::hash::Hasher; -use core::sync::atomic::AtomicUsize; -use core::sync::atomic::Ordering; use std::hash::DefaultHasher; +use crate::platform::*; + /// [xorshift*] is a fast pseudorandom number generator which will /// even tolerate weak seeding, as long as it's not zero. /// /// [xorshift*]: https://en.wikipedia.org/wiki/Xorshift#xorshift* -#[cfg(not(feature = "shuttle"))] pub struct XorShift64Star { state: Cell<u64>, } -#[cfg(not(feature = "shuttle"))] impl XorShift64Star { pub fn new() -> Self { // Any non-zero seed will do -- this uses the hash of a global counter. 
@@ -53,19 +51,37 @@ impl XorShift64Star { } } -#[cfg(feature = "shuttle")] -pub struct XorShift64Star; +pub trait IterBits { + fn iter_bits(self) -> BitIter; +} -#[cfg(feature = "shuttle")] -impl XorShift64Star { - pub fn new() -> Self { - Self +impl IterBits for u32 { + fn iter_bits(self) -> BitIter { + BitIter { bitset: self } } +} - pub fn next_usize(&self, n: usize) -> usize { - use shuttle::rand::Rng; - use shuttle::rand::thread_rng; +pub struct BitIter { + bitset: u32, +} + +impl Iterator for BitIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + if self.bitset == 0 { + None + } else { + let i = self.bitset.trailing_zeros(); // TZCNT + self.bitset &= self.bitset - 1; // BLSR + Some(i as usize) + } + } - thread_rng().gen_range(0..n) + fn size_hint(&self) -> (usize, Option<usize>) { + let populated = self.bitset.count_ones(); // POPCNT + (populated as usize, Some(populated as usize)) } } + +impl ExactSizeIterator for BitIter {} diff --git a/tests/general.rs b/tests/general.rs deleted file mode 100644 index 305e4b1..0000000 --- a/tests/general.rs +++ /dev/null @@ -1,3 +0,0 @@ -//! General integration tests - -#![cfg(not(feature = "shuttle"))] diff --git a/tests/miri.rs b/tests/miri.rs deleted file mode 100644 index c9d27be..0000000 --- a/tests/miri.rs +++ /dev/null @@ -1,52 +0,0 @@ -//! Tests specifically for miri - -#![cfg(miri)] - -use forte::prelude::*; -use tracing::info; - -/// A node in a binary tree. -struct Node { - val: u64, - left: Option<Box<Node>>, - right: Option<Box<Node>>, -} - -impl Node { - // Constructs a new binary tree with the given number of layers. 
- pub fn tree(layers: usize) -> Self { - Self { - val: 1, - left: (layers != 1).then(|| Box::new(Self::tree(layers - 1))), - right: (layers != 1).then(|| Box::new(Self::tree(layers - 1))), - } - } -} - -#[test] -fn fork_join() { - let layers = 10; - let target = (1 << layers) - 1; - - static COMPUTE: ThreadPool = ThreadPool::new(); - - fn sum(node: &Node, worker: &Worker) -> u64 { - let (left, right) = worker.join( - |w| node.left.as_deref().map(|n| sum(n, w)).unwrap_or_default(), - |w| node.right.as_deref().map(|n| sum(n, w)).unwrap_or_default(), - ); - - node.val + left + right - } - - let tree = Node::tree(layers); - - COMPUTE.with_worker(|worker| { - let worker = worker.unwrap(); - COMPUTE.resize_to_available(); - info!("Work beginning"); - assert_eq!(sum(&tree, worker), target); - info!("Work completed"); - COMPUTE.depopulate(); - }); -} diff --git a/tests/shuttle.rs b/tests/shuttle.rs deleted file mode 100644 index 776f3be..0000000 --- a/tests/shuttle.rs +++ /dev/null @@ -1,213 +0,0 @@ -//! Tests using the Shuttle testing framework. - -#![cfg(feature = "shuttle")] -#![allow(unused_imports)] - -use core::pin::Pin; -use core::task::Context; -use core::task::Poll; - -use forte::ThreadPool; -use forte::Worker; -use shuttle::hint::black_box; -use shuttle::sync::atomic::AtomicBool; -use shuttle::sync::atomic::AtomicUsize; -use shuttle::sync::atomic::Ordering; -use tracing::Level; -use tracing_subscriber::fmt::Subscriber; - -// ----------------------------------------------------------------------------- -// Infrastructure - -/// Provides access to a thread pool which can be treated as static for the -/// purposes of testing. -fn with_thread_pool<F>(f: F) -> impl Fn() + 'static -where - F: Fn(&'static ThreadPool) + 'static, -{ - move || { - let thread_pool = Box::new(ThreadPool::new()); - let thread_pool_ptr = Box::into_raw(thread_pool); - - // SAFETY: This thread pool is never dropped. 
- let thread_pool_ref = unsafe { &*thread_pool_ptr }; - f(thread_pool_ref); - } -} - -// ----------------------------------------------------------------------------- -// Pool resizing - -/// Tests for concurrency issues within the `with_thread_pool` helper function. -/// This spins up a thread pool with a single thread, then spins it back down. -#[test] -pub fn shuttle_populate_depopulate() { - let test = with_thread_pool(|pool| { - pool.populate(); - pool.depopulate(); - }); - - shuttle::check_pct(test, 100_000, 100_000); -} - -// ----------------------------------------------------------------------------- -// Core API - -/// Tests spawning a worker on a pool of size one. -#[test] -pub fn shuttle_spawn_closure() { - let test = with_thread_pool(|pool| { - pool.resize_to(1); - pool.spawn(|_: &Worker| {}); - pool.depopulate(); - }); - - shuttle::check_pct(test, 100_000, 100_000); -} - -#[derive(Default)] -struct CountFuture { - count: usize, -} - -impl Future for CountFuture { - type Output = (); - - fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> { - if self.count == 128 { - Poll::Ready(()) - } else { - self.count += 1; - cx.waker().wake_by_ref(); - Poll::Pending - } - } -} - -/// Tests spawning a nontrivial future on a pool of size one. -#[test] -pub fn shuttle_spawn_future() { - let test = with_thread_pool(|pool| { - pool.resize_to(1); - let task = pool.spawn(CountFuture::default()); - assert!(task.is_finished()); - pool.depopulate(); - }); - - shuttle::check_pct(test, 100_000, 100_000); -} - -/// Tests a two-level join operation on a pool of size one. 
-#[test] -pub fn join_4_on_1() { - let test = with_thread_pool(|pool| { - pool.resize_to(1); - - let counter = AtomicUsize::new(0); - pool.join( - |worker| { - worker.join( - |_| counter.fetch_add(1, Ordering::Relaxed), - |_| counter.fetch_add(1, Ordering::Relaxed), - ) - }, - |worker| { - worker.join( - |_| counter.fetch_add(1, Ordering::Relaxed), - |_| counter.fetch_add(1, Ordering::Relaxed), - ) - }, - ); - assert_eq!(counter.load(Ordering::Relaxed), 4); - - pool.depopulate(); - }); - - shuttle::check_pct(test, 100_000, 100_000); -} - -/// Tests a two-level join operation on a pool of size two. -#[test] -pub fn join_4_on_2() { - let test = with_thread_pool(|pool| { - pool.resize_to(2); - - let counter = AtomicUsize::new(0); - pool.join( - |worker| { - worker.join( - |_| counter.fetch_add(1, Ordering::Relaxed), - |_| counter.fetch_add(1, Ordering::Relaxed), - ) - }, - |worker| { - worker.join( - |_| counter.fetch_add(1, Ordering::Relaxed), - |_| counter.fetch_add(1, Ordering::Relaxed), - ) - }, - ); - assert_eq!(counter.load(Ordering::Relaxed), 4); - - pool.depopulate(); - }); - - shuttle::check_pct(test, 100_000, 100_000); -} - -/// Tests a two-level join operation on a pool of size three. -#[test] -pub fn join_4_on_3() { - let test = with_thread_pool(|pool| { - pool.resize_to(3); - - let counter = AtomicUsize::new(0); - pool.join( - |worker| { - worker.join( - |_| counter.fetch_add(1, Ordering::Relaxed), - |_| counter.fetch_add(1, Ordering::Relaxed), - ) - }, - |worker| { - worker.join( - |_| counter.fetch_add(1, Ordering::Relaxed), - |_| counter.fetch_add(1, Ordering::Relaxed), - ) - }, - ); - assert_eq!(counter.load(Ordering::Relaxed), 4); - - pool.depopulate(); - }); - - shuttle::check_pct(test, 100_000, 100_000); -} - -/// Tests a moderately deep join operation on a large pool. 
-#[test] -pub fn join_long() { - let test = with_thread_pool(|pool| { - pool.resize_to(8); - - fn increment(worker: &Worker, slice: &mut [u32]) { - match slice.len() { - 0 => (), - 1 => slice[0] += 1, - _ => { - let (head, tail) = slice.split_at_mut(1); - - worker.join(|_| head[0] += 1, |worker| increment(worker, tail)); - } - } - } - - let mut vals = [0; 10]; - pool.expect_worker(|worker| increment(worker, &mut vals)); - assert_eq!(vals, [1; 10]); - - pool.depopulate(); - }); - - shuttle::check_pct(test, 100_000, 100_000); -}