diff --git a/CHANGELOG.md b/CHANGELOG.md
index 26911ba..3648e2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,20 +16,30 @@ This project is currently in early [pre-release], and there may be arbitrary bre
 
 ### Added 
 
-- `ThreadPool::num_workers` method which returns the current number of workers
-- `ThreadPool::on_worker` variant of `with_worker` for `Send` closures.
-- `ThreadPool::expect_worker` variant of `with_worker` that panics.
+- `Worker::spawn_local` for spawning `!Send` work.
+- `Worker::broadcast`, `ThreadPool::broadcast`, and `broadcast` for blocking broadcasts.
+- `ThreadPool::spawn_broadcast` and `spawn_broadcast` for non-blocking broadcasts.
+- `ThreadPool::num_members` method which returns the current number of member threads.
+- `ThreadPool::get_worker` which looks up the worker if it exists.
+- `ThreadPool::enroll` which requests membership and blocks until it is granted.
+- `ThreadPool::try_enroll` which requests membership and returns `None` if none is available.
 
 ### Changed
+
+- `Lease` and `StackJob` have been refactored to improve stack utilization.
 - Work sharing has been rewritten to improve performance.
-- Thread pools can now have a max of 32 workers at a time.
+- Thread pools can now have a max of 32 members at a time.
+- `ThreadPool::with_worker` now waits for a membership to become available.
 - `spawn`, `Scope::spawn`, and `Worker::spawn` now accept closures and futures.
-- `ThreadPool::with_worker` now provides `Option<&Worker>` instead of `&Worker`.
-- `claim_lease` now returns `Option<Lease>` instead of `Lease`.
+- `Lease` is now called `Membership`.
 - `Scope` now has two lifetimes instead of one, and is more flexible.
 
 ### Removed
+
 - All versions of `spawn_future` and `spawn_async`; just use `spawn` instead.
+- `claim_lease` has been replaced with `try_enroll`.
+- `Worker::occupy` has been replaced with `Membership::activate`.
+- Removed the shuttle testing framework (it's incompatible with crossbeam queues).
 
 ## [1.0.0-alpha.4]
 
diff --git a/Cargo.lock b/Cargo.lock
index 0c5fddb..30e0c8c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -55,12 +55,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "assoc"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfdc70193dadb9d7287fa4b633f15f90c876915b31f6af17da307fc59c9859a8"
-
 [[package]]
 name = "async-channel"
 version = "2.3.1"
@@ -162,18 +156,6 @@ version = "2.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967"
 
-[[package]]
-name = "bitvec"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
-dependencies = [
- "funty",
- "radium",
- "tap",
- "wyz",
-]
-
 [[package]]
 name = "bumpalo"
 version = "3.18.1"
@@ -499,20 +481,12 @@ dependencies = [
  "crossbeam-utils",
  "dashmap",
  "divan",
- "hotclock",
+ "futures-lite",
+ "lazy_static",
  "rayon",
- "shuttle",
  "st3",
- "tracing",
- "tracing-subscriber",
 ]
 
-[[package]]
-name = "funty"
-version = "2.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
-
 [[package]]
 name = "futures-core"
 version = "0.3.31"
@@ -527,9 +501,9 @@ checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
 
 [[package]]
 name = "futures-lite"
-version = "2.6.0"
+version = "2.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f5edaec856126859abb19ed65f39e90fea3a9574b9707f13539acf4abf7eb532"
+checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad"
 dependencies = [
  "fastrand",
  "futures-core",
@@ -548,32 +522,7 @@ dependencies = [
  "libc",
  "log",
  "rustversion",
- "windows 0.48.0",
-]
-
-[[package]]
-name = "generator"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d18470a76cb7f8ff746cf1f7470914f900252ec36bbc40b569d74b1258446827"
-dependencies = [
- "cc",
- "cfg-if",
- "libc",
- "log",
- "rustversion",
- "windows 0.61.3",
-]
-
-[[package]]
-name = "getrandom"
-version = "0.2.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
-dependencies = [
- "cfg-if",
- "libc",
- "wasi",
+ "windows",
 ]
 
 [[package]]
@@ -627,17 +576,6 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
 
-[[package]]
-name = "hex"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
-
-[[package]]
-name = "hotclock"
-version = "0.2.0"
-source = "git+https://github.com/spence/hotclock#8cf14ae9d62dba7f7780a3c920ab6208b6568777"
-
 [[package]]
 name = "is-terminal"
 version = "0.4.16"
@@ -714,7 +652,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5"
 dependencies = [
  "cfg-if",
- "generator 0.7.5",
+ "generator",
  "scoped-tls",
  "tracing",
  "tracing-subscriber",
@@ -772,12 +710,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
-[[package]]
-name = "owo-colors"
-version = "3.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f"
-
 [[package]]
 name = "parking"
 version = "2.2.1"
@@ -794,7 +726,7 @@ dependencies = [
  "libc",
  "redox_syscall",
  "smallvec",
- "windows-link 0.2.1",
+ "windows-link",
 ]
 
 [[package]]
@@ -846,15 +778,6 @@ dependencies = [
  "portable-atomic",
 ]
 
-[[package]]
-name = "ppv-lite86"
-version = "0.2.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
-dependencies = [
- "zerocopy",
-]
-
 [[package]]
 name = "proc-macro2"
 version = "1.0.95"
@@ -873,51 +796,6 @@ dependencies = [
  "proc-macro2",
 ]
 
-[[package]]
-name = "radium"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
-
-[[package]]
-name = "rand"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
-dependencies = [
- "libc",
- "rand_chacha",
- "rand_core",
-]
-
-[[package]]
-name = "rand_chacha"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
-dependencies = [
- "ppv-lite86",
- "rand_core",
-]
-
-[[package]]
-name = "rand_core"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
-dependencies = [
- "getrandom",
-]
-
-[[package]]
-name = "rand_pcg"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e"
-dependencies = [
- "rand_core",
-]
-
 [[package]]
 name = "rayon"
 version = "1.10.0"
@@ -1096,26 +974,6 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
-[[package]]
-name = "shuttle"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e72e65e5ac3437476a310bd73ec924dc75e3055ffa61f376266f80576f3869ff"
-dependencies = [
- "assoc",
- "bitvec",
- "cfg-if",
- "generator 0.8.5",
- "hex",
- "owo-colors",
- "rand",
- "rand_core",
- "rand_pcg",
- "scoped-tls",
- "smallvec",
- "tracing",
-]
-
 [[package]]
 name = "slab"
 version = "0.4.10"
@@ -1164,12 +1022,6 @@ dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "tap"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
-
 [[package]]
 name = "terminal_size"
 version = "0.4.2"
@@ -1282,12 +1134,6 @@ dependencies = [
  "winapi-util",
 ]
 
-[[package]]
-name = "wasi"
-version = "0.11.1+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
-
 [[package]]
 name = "wasm-bindgen"
 version = "0.2.100"
@@ -1396,114 +1242,12 @@ dependencies = [
  "windows-targets 0.48.5",
 ]
 
-[[package]]
-name = "windows"
-version = "0.61.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893"
-dependencies = [
- "windows-collections",
- "windows-core",
- "windows-future",
- "windows-link 0.1.3",
- "windows-numerics",
-]
-
-[[package]]
-name = "windows-collections"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8"
-dependencies = [
- "windows-core",
-]
-
-[[package]]
-name = "windows-core"
-version = "0.61.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
-dependencies = [
- "windows-implement",
- "windows-interface",
- "windows-link 0.1.3",
- "windows-result",
- "windows-strings",
-]
-
-[[package]]
-name = "windows-future"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e"
-dependencies = [
- "windows-core",
- "windows-link 0.1.3",
- "windows-threading",
-]
-
-[[package]]
-name = "windows-implement"
-version = "0.60.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "windows-interface"
-version = "0.59.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "windows-link"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
-
 [[package]]
 name = "windows-link"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
 
-[[package]]
-name = "windows-numerics"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1"
-dependencies = [
- "windows-core",
- "windows-link 0.1.3",
-]
-
-[[package]]
-name = "windows-result"
-version = "0.3.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
-dependencies = [
- "windows-link 0.1.3",
-]
-
-[[package]]
-name = "windows-strings"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
-dependencies = [
- "windows-link 0.1.3",
-]
-
 [[package]]
 name = "windows-sys"
 version = "0.42.0"
@@ -1559,15 +1303,6 @@ dependencies = [
  "windows_x86_64_msvc 0.52.6",
 ]
 
-[[package]]
-name = "windows-threading"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6"
-dependencies = [
- "windows-link 0.1.3",
-]
-
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.42.2"
@@ -1700,15 +1435,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
-[[package]]
-name = "wyz"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
-dependencies = [
- "tap",
-]
-
 [[package]]
 name = "xshell"
 version = "0.2.7"
@@ -1723,23 +1449,3 @@ name = "xshell-macros"
 version = "0.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32ac00cd3f8ec9c1d33fb3e7958a82df6989c42d747bd326c822b1d625283547"
-
-[[package]]
-name = "zerocopy"
-version = "0.8.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
-dependencies = [
- "zerocopy-derive",
-]
-
-[[package]]
-name = "zerocopy-derive"
-version = "0.8.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
diff --git a/Cargo.toml b/Cargo.toml
index 89cb4d4..dc3b1c6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,10 @@ edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "Low-overhead parallel and async work scheduler"
 repository = "https://github.com/NthTensor/Forte"
+rust-version = "1.96.0"
+
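+# Lie to cargo: forte links no native library, but setting `links` makes cargo reject builds that contain more than one version of forte (see build.rs).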
+links = "forte"
 
 [workspace]
 resolver = "2"
@@ -13,14 +17,11 @@ members = ["ci"]
 [dependencies]
 async-task = "4.7.1"
 atomic-wait = "1.1.0"
+lazy_static = "1.5.0"
 crossbeam-queue = "0.3.12"
 crossbeam-utils = "0.8.21"
 st3 = "0.4"
-hotclock = { git = "https://github.com/spence/hotclock" }
-
-shuttle = { version = "0.8.0", optional = true }
-tracing = { version = "0.1.41", features = ["release_max_level_off"] }
-tracing-subscriber = "0.3.19"
+futures-lite = "2.6.1"
 
 [dev-dependencies]
 # Required for comparison testing
@@ -33,22 +34,12 @@ dashmap = "6.1.0"
 # Used for A/B perf testing
 criterion = { version = "0.5" }
 
-[features]
-shuttle = ["dep:shuttle"]
-
 [profile.release]
 debug = true
 
 [profile.bench]
 opt-level = 3
 
-# Custom profile for shuttle tests: enable release optimizations so that the shuttle
-# tests are less slow, but don't disable debug assertions.
-[profile.shuttle]
-inherits = "test"
-lto = true
-opt-level = 3
-
 [lints.clippy]
 doc_markdown = "warn"
 manual_let_else = "warn"
diff --git a/benches/bevy_tasks.rs b/benches/bevy_tasks.rs
index b730f1f..335a0e9 100644
--- a/benches/bevy_tasks.rs
+++ b/benches/bevy_tasks.rs
@@ -1,7 +1,8 @@
 //! Comparative benchmarks against bevy_tasks
 
 struct BevyParChunksMut<'a, T>(core::slice::ChunksMut<'a, T>);
-impl<'a, T> bevy_tasks::ParallelIterator<core::slice::IterMut<'a, T>> for BevyParChunksMut<'a, T>
+impl<'a, T> bevy_tasks::ParallelIterator<core::slice::IterMut<'a, T>>
+    for BevyParChunksMut<'a, T>
 where
     T: 'a + Send + Sync,
 {
@@ -12,8 +13,11 @@ where
 
 static THREAD_POOL: forte::ThreadPool = forte::ThreadPool::new();
 
-fn forte_chunks<const CHUNK_SIZE: usize, T, F>(worker: &forte::Worker, data: &mut [T], func: &F)
-where
+fn forte_chunks<const CHUNK_SIZE: usize, T, F>(
+    worker: &forte::Worker,
+    data: &mut [T],
+    func: &F,
+) where
     T: Send + Sync,
     F: Fn(&mut [T]) + Send + Sync,
 {
@@ -83,7 +87,7 @@ mod overhead {
 
         let mut vec: Vec<_> = (0..len).collect();
 
-        THREAD_POOL.expect_worker(|worker| {
+        THREAD_POOL.with_worker(|worker| {
             bencher.bench_local(|| {
                 forte_chunks::<64, _, _>(worker, &mut vec, &|c| {
                     c.iter_mut().for_each(work);
diff --git a/benches/flat_scope.rs b/benches/flat_scope.rs
index 8d48ef0..3e42292 100644
--- a/benches/flat_scope.rs
+++ b/benches/flat_scope.rs
@@ -6,9 +6,6 @@ use std::hash::Hasher;
 
 use criterion::black_box;
 use divan::Bencher;
-use tracing_subscriber::fmt;
-use tracing_subscriber::layer::SubscriberExt;
-use tracing_subscriber::util::SubscriberInitExt;
 
 const SIZES: &[usize] = &[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4012, 8196];
 
@@ -39,7 +36,7 @@ static COMPUTE: forte::ThreadPool = forte::ThreadPool::new();
 fn forte(bencher: Bencher, size: usize) {
     use forte::Worker;
 
-    COMPUTE.expect_worker(|worker| {
+    COMPUTE.with_worker(|worker| {
         bencher.bench_local(|| {
             worker.scope(|scope| {
                 for i in 0..size {
@@ -78,14 +75,6 @@ fn rayon(bencher: Bencher, size: usize) {
 }
 
 fn main() {
-    let fmt_layer = fmt::layer()
-        .without_time()
-        .with_target(false)
-        .with_thread_names(true)
-        .compact();
-
-    tracing_subscriber::registry().with(fmt_layer).init();
-
     COMPUTE.resize_to_available();
 
     divan::main();
diff --git a/benches/flood_fill.rs b/benches/flood_fill.rs
index 471953d..627ba6b 100644
--- a/benches/flood_fill.rs
+++ b/benches/flood_fill.rs
@@ -9,9 +9,6 @@ use std::hash::Hasher;
 use criterion::black_box;
 use dashmap::DashSet;
 use divan::Bencher;
-use tracing_subscriber::fmt;
-use tracing_subscriber::layer::SubscriberExt;
-use tracing_subscriber::util::SubscriberInitExt;
 
 const SIZES: &[usize] = &[8, 16, 32, 64, 128, 256, 512];
 
@@ -126,7 +123,7 @@ fn forte(bencher: Bencher, size: usize) {
         }
     }
 
-    COMPUTE.expect_worker(|worker| {
+    COMPUTE.with_worker(|worker| {
         bencher.bench_local(|| {
             let visited = DashSet::new();
 
@@ -202,14 +199,6 @@ fn rayon(bencher: Bencher, size: usize) {
 }
 
 fn main() {
-    let fmt_layer = fmt::layer()
-        .without_time()
-        .with_target(false)
-        .with_thread_names(true)
-        .compact();
-
-    tracing_subscriber::registry().with(fmt_layer).init();
-
     COMPUTE.resize_to_available();
 
     divan::main();
diff --git a/benches/fork_join.rs b/benches/fork_join.rs
index 2e55b24..4c54be6 100644
--- a/benches/fork_join.rs
+++ b/benches/fork_join.rs
@@ -3,10 +3,6 @@
 use chili::Scope;
 use divan::Bencher;
 use forte::Worker;
-use tracing::info;
-use tracing_subscriber::fmt;
-use tracing_subscriber::layer::SubscriberExt;
-use tracing_subscriber::util::SubscriberInitExt;
 
 // -----------------------------------------------------------------------------
 // Workload
@@ -32,7 +28,8 @@ impl Node {
 // Returns an iterator over the number of layers. Also returns the total number
 // of nodes.
 const LAYERS: &[usize] = &[
-    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, // 10, 24, 27,
+    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, // 10, 24, 27,
 ];
 
 fn nodes() -> impl Iterator<Item = (usize, usize)> {
@@ -86,8 +83,7 @@ fn forte(bencher: Bencher, nodes: (usize, usize)) {
 
     let tree = Node::tree(nodes.0);
 
-    COMPUTE.expect_worker(|worker| {
-        info!("Staring Benchmark");
+    COMPUTE.with_worker(|worker| {
         bencher.bench_local(move || {
             assert_eq!(sum(&tree, worker), nodes.1 as u64);
         });
@@ -105,9 +101,8 @@ fn throughput_forte(bencher: Bencher, nodes: (usize, usize)) {
         node.val + left + right
     }
 
-    info!("Staring Benchmark");
     bencher.bench(|| {
-        COMPUTE.expect_worker(|worker| {
+        COMPUTE.with_worker(|worker| {
             let tree = Node::tree(nodes.0);
             assert_eq!(sum(&tree, worker), nodes.1 as u64);
         });
@@ -187,14 +182,6 @@ fn throughput_rayon(bencher: Bencher, nodes: (usize, usize)) {
 }
 
 fn main() {
-    let fmt_layer = fmt::layer()
-        .without_time()
-        .with_target(false)
-        .with_thread_names(true)
-        .compact();
-
-    tracing_subscriber::registry().with(fmt_layer).init();
-
     COMPUTE.resize_to_available();
 
     divan::main();
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..959d5a5
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,6 @@
+//! This build script is intentionally left blank. It exists so that cargo will
+//! let us use the `links` field, which ensures that only one version of forte
+//! may be used in a given build. This is important to prevent multiple
+//! thread pools from fighting each other.
+
+fn main() {}
diff --git a/ci/src/ci.rs b/ci/src/ci.rs
index b3c23ba..4d6eea7 100644
--- a/ci/src/ci.rs
+++ b/ci/src/ci.rs
@@ -72,18 +72,28 @@ impl CI {
                 // Note that we are running the subcommands directly rather than using any aliases
                 let mut cmds = vec![];
                 // Lint commands
-                cmds.append(&mut commands::FormatCommand::default().prepare(sh, flags));
-                cmds.append(&mut commands::ClippyCommand::default().prepare(sh, flags));
-                cmds.append(&mut commands::LintsCommand::default().prepare(sh, flags));
+                cmds.append(
+                    &mut commands::FormatCommand::default().prepare(sh, flags),
+                );
+                cmds.append(
+                    &mut commands::ClippyCommand::default().prepare(sh, flags),
+                );
+                cmds.append(
+                    &mut commands::LintsCommand::default().prepare(sh, flags),
+                );
                 // Compile commands
-                cmds.append(&mut commands::CompileCheckCommand::default().prepare(sh, flags));
+                cmds.append(
+                    &mut commands::CompileCheckCommand::default()
+                        .prepare(sh, flags),
+                );
                 // Documentation commands
-                cmds.append(&mut commands::DocCheckCommand::default().prepare(sh, flags));
-                cmds.append(&mut commands::DocTestCommand::default().prepare(sh, flags));
-                // Shuttle commands
-                cmds.append(&mut commands::ShuttleCheckCommand::default().prepare(sh, flags));
-                cmds.append(&mut commands::ShuttleClippyCommand::default().prepare(sh, flags));
-                cmds.append(&mut commands::ShuttleTestCommand::default().prepare(sh, flags));
+                cmds.append(
+                    &mut commands::DocCheckCommand::default()
+                        .prepare(sh, flags),
+                );
+                cmds.append(
+                    &mut commands::DocTestCommand::default().prepare(sh, flags),
+                );
                 cmds
             }
         }
@@ -105,15 +115,14 @@ enum Commands {
     Lints(commands::LintsCommand),
     Clippy(commands::ClippyCommand),
     Format(commands::FormatCommand),
-    // Shuttle commands
-    Shuttle(commands::ShuttleCommand),
-    ShuttleCheck(commands::ShuttleCheckCommand),
-    ShuttleClippy(commands::ShuttleClippyCommand),
-    ShuttleTest(commands::ShuttleTestCommand),
 }
 
 impl Prepare for Commands {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         match self {
             // Compile commands
             Commands::Compile(subcommand) => subcommand.prepare(sh, flags),
@@ -126,11 +135,6 @@ impl Prepare for Commands {
             Commands::Lints(subcommand) => subcommand.prepare(sh, flags),
             Commands::Clippy(subcommand) => subcommand.prepare(sh, flags),
             Commands::Format(subcommand) => subcommand.prepare(sh, flags),
-            // Shuttle commands
-            Commands::Shuttle(subcommand) => subcommand.prepare(sh, flags),
-            Commands::ShuttleCheck(subcommand) => subcommand.prepare(sh, flags),
-            Commands::ShuttleClippy(subcommand) => subcommand.prepare(sh, flags),
-            Commands::ShuttleTest(subcommand) => subcommand.prepare(sh, flags),
         }
     }
 }
diff --git a/ci/src/commands/clippy.rs b/ci/src/commands/clippy.rs
index eef88a2..f5f2a04 100644
--- a/ci/src/commands/clippy.rs
+++ b/ci/src/commands/clippy.rs
@@ -11,7 +11,11 @@ use crate::PreparedCommand;
 pub struct ClippyCommand {}
 
 impl Prepare for ClippyCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        _flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         vec![PreparedCommand::new::<Self>(
             cmd!(sh, "cargo clippy --workspace -- -Dwarnings"),
             "Please fix clippy errors in output above.",
diff --git a/ci/src/commands/compile.rs b/ci/src/commands/compile.rs
index 7d5043c..686bd29 100644
--- a/ci/src/commands/compile.rs
+++ b/ci/src/commands/compile.rs
@@ -11,7 +11,11 @@ use crate::commands::CompileCheckCommand;
 pub struct CompileCommand {}
 
 impl Prepare for CompileCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         let mut commands = vec![];
         commands.append(&mut CompileCheckCommand::default().prepare(sh, flags));
         commands
diff --git a/ci/src/commands/compile_check.rs b/ci/src/commands/compile_check.rs
index 3466746..b96cefc 100644
--- a/ci/src/commands/compile_check.rs
+++ b/ci/src/commands/compile_check.rs
@@ -11,7 +11,11 @@ use crate::PreparedCommand;
 pub struct CompileCheckCommand {}
 
 impl Prepare for CompileCheckCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        _flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         vec![PreparedCommand::new::<Self>(
             cmd!(sh, "cargo check --workspace"),
             "Please fix compiler errors in output above.",
diff --git a/ci/src/commands/doc.rs b/ci/src/commands/doc.rs
index 419bb12..097cbfa 100644
--- a/ci/src/commands/doc.rs
+++ b/ci/src/commands/doc.rs
@@ -12,7 +12,11 @@ use crate::commands::DocTestCommand;
 pub struct DocCommand {}
 
 impl Prepare for DocCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         let mut commands = vec![];
         commands.append(&mut DocTestCommand::default().prepare(sh, flags));
         commands.append(&mut DocCheckCommand::default().prepare(sh, flags));
diff --git a/ci/src/commands/doc_check.rs b/ci/src/commands/doc_check.rs
index f2a7d59..4e862d7 100644
--- a/ci/src/commands/doc_check.rs
+++ b/ci/src/commands/doc_check.rs
@@ -11,7 +11,11 @@ use crate::PreparedCommand;
 pub struct DocCheckCommand {}
 
 impl Prepare for DocCheckCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        _flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         vec![
             PreparedCommand::new::<Self>(
                 cmd!(
diff --git a/ci/src/commands/doc_test.rs b/ci/src/commands/doc_test.rs
index e6bdaa5..658f5bb 100644
--- a/ci/src/commands/doc_test.rs
+++ b/ci/src/commands/doc_test.rs
@@ -11,7 +11,11 @@ use crate::PreparedCommand;
 pub struct DocTestCommand {}
 
 impl Prepare for DocTestCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         let no_fail_fast = if flags.contains(Flag::KEEP_GOING) {
             "--no-fail-fast"
         } else {
diff --git a/ci/src/commands/format.rs b/ci/src/commands/format.rs
index e36173d..a83e8c4 100644
--- a/ci/src/commands/format.rs
+++ b/ci/src/commands/format.rs
@@ -11,7 +11,11 @@ use crate::PreparedCommand;
 pub struct FormatCommand {}
 
 impl Prepare for FormatCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        _flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         vec![PreparedCommand::new::<Self>(
             cmd!(sh, "cargo fmt --all -- --check"),
             "Please run 'cargo fmt --all' to format your code.",
diff --git a/ci/src/commands/lints.rs b/ci/src/commands/lints.rs
index 02e2128..27a2c11 100644
--- a/ci/src/commands/lints.rs
+++ b/ci/src/commands/lints.rs
@@ -12,7 +12,11 @@ use crate::commands::FormatCommand;
 pub struct LintsCommand {}
 
 impl Prepare for LintsCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec<PreparedCommand<'a>> {
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        flags: Flag,
+    ) -> Vec<PreparedCommand<'a>> {
         let mut commands = vec![];
         commands.append(&mut FormatCommand::default().prepare(sh, flags));
         commands.append(&mut ClippyCommand::default().prepare(sh, flags));
diff --git a/ci/src/commands/mod.rs b/ci/src/commands/mod.rs
index b9ca102..e132d34 100644
--- a/ci/src/commands/mod.rs
+++ b/ci/src/commands/mod.rs
@@ -22,14 +22,3 @@ mod lints;
 pub use clippy::*;
 pub use format::*;
 pub use lints::*;
-
-// Shuttle test suite commands
-mod shuttle;
-mod shuttle_check;
-mod shuttle_clippy;
-mod shuttle_test;
-
-pub use shuttle::*;
-pub use shuttle_check::*;
-pub use shuttle_clippy::*;
-pub use shuttle_test::*;
diff --git a/ci/src/commands/shuttle.rs b/ci/src/commands/shuttle.rs
deleted file mode 100644
index 857a7d9..0000000
--- a/ci/src/commands/shuttle.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-use argh::FromArgs;
-
-use crate::Flag;
-use crate::Prepare;
-use crate::PreparedCommand;
-use crate::commands::ShuttleCheckCommand;
-use crate::commands::ShuttleClippyCommand;
-use crate::commands::ShuttleTestCommand;
-
-/// Alias for running the `shuttle-check`, `shuttle-clippy` and `shuttle-test` subcommands.
-#[derive(FromArgs, Default)]
-#[argh(subcommand, name = "shuttle")]
-pub struct ShuttleCommand {}
-
-impl Prepare for ShuttleCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec<PreparedCommand<'a>> {
-        let mut commands = vec![];
-        commands.append(&mut ShuttleCheckCommand::default().prepare(sh, flags));
-        commands.append(&mut ShuttleClippyCommand::default().prepare(sh, flags));
-        commands.append(&mut ShuttleTestCommand::default().prepare(sh, flags));
-        commands
-    }
-}
diff --git a/ci/src/commands/shuttle_check.rs b/ci/src/commands/shuttle_check.rs
deleted file mode 100644
index 05f07e0..0000000
--- a/ci/src/commands/shuttle_check.rs
+++ /dev/null
@@ -1,21 +0,0 @@
-use argh::FromArgs;
-use xshell::cmd;
-
-use crate::Flag;
-use crate::Prepare;
-use crate::PreparedCommand;
-
-/// Checks that the loom test suite compiles.
-#[derive(FromArgs, Default)]
-#[argh(subcommand, name = "loom-check")]
-pub struct ShuttleCheckCommand {}
-
-impl Prepare for ShuttleCheckCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec<PreparedCommand<'a>> {
-        let command = PreparedCommand::new::<Self>(
-            cmd!(sh, "cargo check --test shuttle --features shuttle"),
-            "Please fix compiler errors in output above.",
-        );
-        vec![command]
-    }
-}
diff --git a/ci/src/commands/shuttle_clippy.rs b/ci/src/commands/shuttle_clippy.rs
deleted file mode 100644
index 8c0111d..0000000
--- a/ci/src/commands/shuttle_clippy.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-use argh::FromArgs;
-use xshell::cmd;
-
-use crate::Flag;
-use crate::Prepare;
-use crate::PreparedCommand;
-
-/// Checks for clippy warnings and errors in the loom test suite.
-#[derive(FromArgs, Default)]
-#[argh(subcommand, name = "shuttle-clippy")]
-pub struct ShuttleClippyCommand {}
-
-impl Prepare for ShuttleClippyCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec<PreparedCommand<'a>> {
-        let command = PreparedCommand::new::<Self>(
-            cmd!(
-                sh,
-                "cargo clippy --test shuttle --features shuttle -- -Dwarnings"
-            ),
-            "Please fix clippy errors in output above.",
-        );
-        vec![command]
-    }
-}
diff --git a/ci/src/commands/shuttle_test.rs b/ci/src/commands/shuttle_test.rs
deleted file mode 100644
index cdfc41f..0000000
--- a/ci/src/commands/shuttle_test.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-use argh::FromArgs;
-use xshell::cmd;
-
-use crate::Flag;
-use crate::Prepare;
-use crate::PreparedCommand;
-
-/// Runs the loom concurrency test suite.
-#[derive(FromArgs, Default)]
-#[argh(subcommand, name = "shuttle-test")]
-pub struct ShuttleTestCommand {}
-
-impl Prepare for ShuttleTestCommand {
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, _flags: Flag) -> Vec<PreparedCommand<'a>> {
-        let command = PreparedCommand::new::<Self>(
-            cmd!(
-                sh,
-                "cargo test --test shuttle --profile shuttle --features shuttle"
-            ),
-            "Please fix compiler errors in output above.",
-        );
-        vec![command]
-    }
-}
diff --git a/ci/src/prepare.rs b/ci/src/prepare.rs
index 923bc33..14b3b84 100644
--- a/ci/src/prepare.rs
+++ b/ci/src/prepare.rs
@@ -24,7 +24,11 @@ pub trait Prepare {
     ///     }
     /// }
     /// ```
-    fn prepare<'a>(&self, sh: &'a xshell::Shell, flags: Flag) -> Vec<PreparedCommand<'a>>;
+    fn prepare<'a>(
+        &self,
+        sh: &'a xshell::Shell,
+        flags: Flag,
+    ) -> Vec<PreparedCommand<'a>>;
 }
 
 bitflags! {
@@ -83,7 +87,11 @@ impl<'a> PreparedCommand<'a> {
     }
 
     /// A builder that adds a new environmental variable to the list.
-    pub fn with_env_var(mut self, key: &'static str, value: &'static str) -> Self {
+    pub fn with_env_var(
+        mut self,
+        key: &'static str,
+        value: &'static str,
+    ) -> Self {
         self.env_vars.push((key, value));
         self
     }
diff --git a/rustfmt.toml b/rustfmt.toml
index a871dbd..4b863e7 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1,6 +1,7 @@
 use_field_init_shorthand = true
 newline_style = "Unix"
 style_edition = "2024"
+max_width = 80
 
 # The following lines may be uncommented on nightly Rust.
 # Once these features have stabilized, they should be added to the always-enabled options above.
diff --git a/src/job.rs b/src/job.rs
index edbed59..9ff40e6 100644
--- a/src/job.rs
+++ b/src/job.rs
@@ -1,62 +1,32 @@
-//! This module defines an executable unit of work called a [`Job`]. Jobs are what
-//! get scheduled on the thread pool. There are two core job types: [`StackJob`]
-//! and [`HeapJob`].
+//! This module defines an executable unit of work called a "Job". Jobs are
+//! what get scheduled on the thread pool. There are two core job types:
+//! [`StackJob`] and [`HeapJob`]. There is no unifying `Job` trait. Instead,
+//! what makes these both jobs is their ability to yield a [`JobRef`].
 //!
 //! After a job is allocated, we typically refer to it by a [`JobRef`]. Job refs
 //! are type-erased, and can be sent between threads without moving the
 //! underlying job.
-//!
-//! When using a job, one must be extremely careful to ensure that:
-//! (a) The job does not outlive anything it closes over.
-//! (b) The job remains valid until it is executed for the last time.
-//! (c) Each job reference is executed exactly once.
 
 use alloc::boxed::Box;
 use alloc::collections::VecDeque;
 use alloc::vec::Vec;
+use core::any::Any;
 use core::cell::UnsafeCell;
 use core::mem::ManuallyDrop;
-use core::mem::MaybeUninit;
 use core::ptr::NonNull;
-use core::sync::atomic::Ordering;
-use core::sync::atomic::fence;
-use std::thread::Result as ThreadResult;
 
 use crate::latch::Latch;
-use crate::platform::AtomicU32;
+use crate::platform::*;
 use crate::thread_pool::Worker;
 use crate::unwind;
 
 // -----------------------------------------------------------------------------
-// Runnable
+// JobRef
 
-/// A job is a unit of work that may be executed by a worker thread. The primary
-/// purpose of this trait is to make it easy to create a `JobRef`. The `execute`
-/// function is designed to interlock with the `JobRef::execute_fn` field.
-trait Job {
-    /// Calling this function runs the job.
-    ///
-    /// # Safety
-    ///
-    /// Implementors must specify the invariant of the pointer `this` that the
-    /// caller is expected to uphold.
-    ///
-    /// This may be called from a different thread than the one which scheduled
-    /// the job, so the implementer must ensure the appropriate traits are met,
-    /// whether `Send`, `Sync`, or both.
-    ///
-    /// Calling this is always considered to "complete" the job, so the caller
-    /// must ensure this is called exactly once.
-    unsafe fn execute(this: NonNull<()>, worker: &Worker);
-}
-
-// -----------------------------------------------------------------------------
-// Shared JobRef
-
-/// Effectively a Job trait object. It can be treated as such, even though
-/// sometimes a `JobRef` will not point to a type that implements `Job`.
+/// A `JobRef` is a specialized v-table, containing a pointer to work that needs to
+/// be executed, and a function pointer that is capable of executing it.
 ///
-/// This is analogous to the chili type `JobShared` or the rayon type `JobRef`.
+/// It is analogous to the chili type `JobShared` or the rayon type `JobRef`.
 pub struct JobRef {
     /// A non-null pointer to some type-erased data which can be executed as a
     /// job by the `execute_fn`. This will usually point to either an instance
@@ -74,18 +44,11 @@ impl JobRef {
     ///
     /// # Safety
     ///
-    /// The caller must ensure that:
-    ///
-    /// * `job_pointer` and `execute_fn` are *matched*; the `execute_fn` must be
-    ///   a function that can safely receive `job_pointer` as it's first argument.
-    ///
-    /// * `job_pointer` points to an initialized and properly aligned value which
-    ///   is neither moved nor dropped until `execute_fn` is called.
-    ///
-    /// * `job_pointer` is "valid" now and until `execute_fn` is called,
-    ///   according to the contract of the specific `execute_fn` being stored.
+    /// The caller must ensure that `JobRef::execute` will only be called on the
+    /// returned `JobRef` when it would be sound to call `execute_fn` on
+    /// `job_pointer`.
     #[inline(always)]
-    pub unsafe fn new_raw(
+    pub unsafe fn new(
         job_pointer: NonNull<()>,
         execute_fn: unsafe fn(NonNull<()>, &Worker),
     ) -> JobRef {
@@ -105,27 +68,38 @@ impl JobRef {
     /// Executes the `JobRef` by passing the execute function on the job pointer.
     #[inline(always)]
     pub fn execute(self, worker: &Worker) {
-        // SAFETY: Calling this function on this pointer is valid due to the
-        // contract of `JobRef::new_raw`:
-        //
-        // * `self.execute_fn` and `self.job_pointer` are "matched": every
-        //   `JobRef` is constructed via `new_raw`, which requires the caller
-        //   to supply a compatible pair.
-        //
-        // * `self.job_pointer` is valid at this point: `new_raw` requires the
-        //   pointer to remain valid until `execute_fn` is called, and we are
-        //   calling it now.
-        //
-        // * This is called at most once: `execute` consumes `self`, so the
-        //   pointer cannot be used again via this `JobRef`.
+        // SAFETY: The caller of `JobRef::new` defines the conditions under
+        // which this call is sound, and must ensure that this will not be
+        // called unless these conditions are met.
         unsafe { (self.execute_fn)(self.job_pointer, worker) }
     }
 }
 
-// SAFETY: `JobRef` is a type-erased data pointer + function pointer tuple. The
-// data pointer always points to a `Send` value due to the safety requirements
-// of `JobRef::new_raw`. Function pointers are always `Send`. Therefore it is
-// sound to move a `JobRef` across thread boundaries.
+// SAFETY: This is sound, but just barely.
+//
+// Every `JobRef` contains a function pointer and a data pointer. Function
+// pointers are always `Send`, but the data pointer may or may not be valid for
+// cross-thread access (the value it points to may or may not be `Sync`).
+//
+// However, even when this data is not thread-safe, `JobRef` still needs to be
+// `Send`. This is because we need to be able to pass pointers to `!Send` job
+// data between threads. For example, if we have a thread that owns a `Future`
+// that is `!Send`, and we receive a wakeup notification on an IO polling
+// thread, the IO thread must send the owning thread a `JobRef` containing a
+// pointer to that `!Send` future.
+//
+// This is only sound because the only method that can actually cause unsound
+// cross-thread memory access is `JobRef::execute`. This function is safe,
+// because the caller cannot know the soundness requirements of the underlying
+// job being pointed to (due to type-erasure). However, `JobRef::new` is
+// `unsafe`, and requires the caller to ensure that `execute` will only be
+// called if it is correct for the execute function to be called on the
+// job_pointer.
+//
+// Since every `JobRef` must be constructed with a call to `new`, it is not
+// possible for _entirely safe_ code to violate the `!Send` condition. It is
+// unfortunate that the soundness justification has to be squeezed into a single
+// function, but such are the constraints of type-erasure.
 unsafe impl Send for JobRef {}
 
 // -----------------------------------------------------------------------------
@@ -151,7 +125,7 @@ impl JobQueue {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
         // returning any references from this API, making this exclusive access
-        // safe.
+        // sound.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         job_refs.push_back(job_ref);
     }
@@ -161,7 +135,7 @@ impl JobQueue {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
         // returning any references from this API, making this exclusive access
-        // safe.
+        // sound.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         job_refs.push_front(job_ref);
     }
@@ -171,7 +145,7 @@ impl JobQueue {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
         // returning any references from this API, making this exclusive access
-        // safe.
+        // sound.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         job_refs.pop_back()
     }
@@ -181,7 +155,7 @@ impl JobQueue {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
         // returning any references from this API, making this exclusive access
-        // safe.
+        // sound.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         job_refs.pop_front()
     }
@@ -192,7 +166,7 @@ impl JobQueue {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
         // returning any references from this API, making this exclusive access
-        // safe.
+        // sound.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         if job_refs.back().map(JobRef::id) == Some(id) {
             let _ = job_refs.pop_back();
@@ -212,7 +186,7 @@ impl JobQueue {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
         // returning any references from this API, making this exclusive access
-        // safe.
+        // sound.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         let mut len = job_refs.len();
         let num_chunks = len / Self::CHUNK_SIZE;
@@ -231,7 +205,7 @@ impl JobQueue {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
         // returning any references from this API, making this exclusive access
-        // safe.
+        // sound.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         job_refs.append(&mut split_refs);
     }
@@ -240,13 +214,24 @@ impl JobQueue {
 // -----------------------------------------------------------------------------
 // Stack allocated work function
 
+/// This union helps us conserve stack space by allowing us to store (a) the
+/// function we need to run, (b) the output of that function, or (c) a captured
+/// panic at different times throughout the job's lifecycle.
+///
+/// As future work, we may want to consider pushing large values into a
+/// heap-allocated pool.
+union StackJobData<F, T> {
+    func: ManuallyDrop<F>,
+    output: ManuallyDrop<T>,
+    error: ManuallyDrop<Box<dyn Any + Send>>,
+}
+
 /// A [`StackJob`] is a job that's allocated on the stack.
 ///
 /// This is analogous to the chili type `JobStack` and the rayon type `StackJob`.
 pub struct StackJob<F, T> {
-    f: UnsafeCell<ManuallyDrop<F>>,
     completed: Latch,
-    return_value: UnsafeCell<MaybeUninit<ThreadResult<T>>>,
+    data: UnsafeCell<StackJobData<F, T>>,
 }
 
 impl<F, T> StackJob<F, T>
@@ -256,11 +241,12 @@ where
 {
-    /// Creates a new `StackJob` owned by the current worker.
+    /// Creates a new `StackJob` whose completion is tracked by `latch`.
     #[inline(always)]
-    pub fn new(f: F, worker: &Worker) -> StackJob<F, T> {
+    pub fn new(func: F, latch: Latch) -> StackJob<F, T> {
         StackJob {
-            f: UnsafeCell::new(ManuallyDrop::new(f)),
-            completed: worker.new_latch(),
-            return_value: UnsafeCell::new(MaybeUninit::uninit()),
+            data: UnsafeCell::new(StackJobData {
+                func: ManuallyDrop::new(func),
+            }),
+            completed: latch,
         }
     }
 
@@ -271,390 +257,260 @@ where
     ///
     /// The caller must ensure that:
     ///
-    /// * The `StackJob` will outlive the `JobRef`.
+    /// * This is called at most once for each `StackJob`.
     ///
-    /// * The `StackJob` will not move for the lifetime of the `JobRef`.
+    /// * After this call, the `StackJob` will not be moved or dropped until one
+    ///   of these conditions is met:
     ///
-    /// * The `StackJob` does not outlive any data it closes over.
+    ///   * (A) A call to `check` on the `StackJob`'s latch returns something other
+    ///     than `Pending`.
     ///
-    /// * This function is not called again so long as the `JobRef` lives.
+    ///   * (B) The `JobRef` has been dropped without `execute` being called.
     #[inline(always)]
     pub unsafe fn as_job_ref(&self) -> JobRef {
         let job_pointer = NonNull::from(self).cast();
-        // SAFETY: `JobRef::new_raw` requires:
+        // SAFETY: We must show that `JobRef::execute` will only be called on
+        // the returned `JobRef` if it is sound to call `execute_fn` on
+        // `job_pointer`.
+        //
+        // Assume that `JobRef::execute` has been called, under the conditions
+        // defined by the safety comment for this function. Then:
         //
-        // * `job_pointer` and `Self::execute` are matched.
+        // * `this` is an aligned pointer to an initialized `StackJob<F, T>`,
+        //   which will not be invalidated until a `check` on the latch it
+        //   contains returns something other than `Pending`.
         //
-        //   Here, `execute` expects a pointer to `Self`, which is what
-        //   `job_pointer` is.
+        //   We created `job_pointer` from a ref to `self`, which is a
+        //   `StackJob`, so it must have pointed to an aligned `StackJob` at
+        //   some point.
         //
-        // * The pointee is live, not moved, and not dropped until `execute_fn`
-        //   is called.
+        //   If the caller allows the `JobRef` to be executed, they must also
+        //   ensure that the pointer will not be invalidated unless a call to
+        //   `check` on the job's `Latch` has returned something other than
+        //   `Pending`.
         //
-        //   Here, the caller guarantees the `StackJob` outlives and does not
-        //   move for the lifetime of the `JobRef`.
+        // * `StackJob::execute` is called at most once on any `StackJob`.
         //
-        // * `execute_fn` to be called at most once.
+        //   The caller ensures only one `JobRef` is ever created for this
+        //   `StackJob`. Since `JobRef::execute` consumes that `JobRef`, it
+        //   cannot be called multiple times.
         //
-        //   Here, `JobRef::execute` consumes the `JobRef`, and only one
-        //   `JobRef` is created per `StackJob`, so it is called exactly once.
-        unsafe { JobRef::new_raw(job_pointer, Self::execute) }
+        // Both preconditions of `StackJob::execute` therefore hold.
+        unsafe { JobRef::new(job_pointer, Self::execute) }
     }
 
-    /// Returns a reference to the latch embedded in this stack job. After this
-    /// latch is set, it becomes safe to call `StackJob::return_value`.
+    /// Returns a reference to the latch embedded in this stack job.
     #[inline(always)]
     pub fn completion_latch(&self) -> &Latch {
         &self.completed
     }
 
     /// Unwraps the stack job back into a closure. This allows the closure to be
-    /// executed without indirection in situations where the one still has
-    /// direct access.
-    ///
-    /// # Safety
-    ///
-    /// The caller must ensure that either this function or `execute` are called
-    /// for a given `StackJob` (not both), and that this function must not be
-    /// called multiple times.
-    #[inline(always)]
-    pub unsafe fn unwrap(&mut self) -> F {
-        let f_mut = self.f.get_mut();
-        // SAFETY: `ManuallyDrop` requires us to ensure that it is not used
-        // again after we `take()` it's contents.
-        //
-        // `take()` is called in two places: once here, and once in `execute`.
-        // Since this function is mutually exclusive with `execute`, and is
-        // called at most once, the `ManuallyDrop<F>` is not used again.
-        unsafe { ManuallyDrop::take(f_mut) }
-    }
-
-    /// Unwraps the job into it's return value.
+    /// executed without indirection in situations where one still has direct
+    /// access.
     ///
     /// # Safety
     ///
     /// The caller must ensure that:
     ///
-    /// * This is called only after the job's latch is set.
+    /// * No `JobRef` currently exists for this `StackJob`.
+    ///
+    /// * No new `JobRef` will be created for this `StackJob`.
     ///
-    /// * That this is called at most once for a given `StackJob`.
+    /// * If a `JobRef` did exist, it was never executed.
     #[inline(always)]
-    pub unsafe fn return_value(&mut self) -> ThreadResult<T> {
-        // Synchronize with the fence in `StackJob::execute`, establishing a
-        // happens-after relationship with the following read..
-        fence(Ordering::Acquire);
-        // Get a ref to the result.
-        let result_ref = self.return_value.get_mut();
-        // SAFETY: `assume_init_read` requires:
+    pub unsafe fn unwrap_func(mut self) -> F {
+        // SAFETY: For this access to be valid, we must first establish that
+        // we have exclusive access to `data`. Only three other functions access
+        // `data`, and none of them can race with this function:
         //
-        // * The `MaybeUninit` is fully initialized.
+        // * Since `JobRef::execute` is not called, `StackJob::execute` is not
+        //   called, and cannot be running.
         //
-        //   As this function can only be called if the latch has been set, and
-        //   the latch is only set at the end of `StackJob::execute` (after
-        //   `return_value` is written and memory is synchronized via the above
-        //   fence) the memory must be initialized.
+        // * Since this function and `unwrap_output` both consume the
+        //   `StackJob`, and a `StackJob` cannot be duplicated, `unwrap_output`
+        //   cannot be running now.
         //
-        // * That data not be incorrectly duplicated by repeated calls.
+        // * Since this function and `unwrap_error` both consume the `StackJob`,
+        //   and a `StackJob` cannot be duplicated, `unwrap_error` cannot be
+        //   running now.
         //
-        //   Data is not duplicated because this function is called at most once.
-        unsafe { result_ref.assume_init_read() }
+        // Next, we must establish that it is valid to read from union field
+        // `func`. Each `StackJob` is constructed using field `func`, and only
+        // `StackJob::execute` writes to the union after construction. Since
+        // `StackJob::execute` is not called, it must still be valid to read
+        // from `func`.
+        let func_ref = unsafe { &mut self.data.get_mut().func };
+        // SAFETY: The `StackJob` is dropped at the end of this block, so `data`
+        // is never accessed again.
+        unsafe { ManuallyDrop::take(func_ref) }
     }
-}
 
-impl<F, T> Job for StackJob<F, T>
-where
-    F: FnOnce(&Worker) -> T + Send,
-    T: Send,
-{
-    /// Executes a `StackJob` from a const pointer.
+    /// Unwraps the job into its return value.
     ///
     /// # Safety
     ///
-    /// The caller must ensure that:
-    ///
-    /// * `this` is a non-null, properly aligned pointer to a live instance of
-    ///   `StackJob<F, T>`.
-    ///
-    /// * The `StackJob` will not move or be deallocated until the latch it
-    ///   contains is set.
-    ///
-    /// * Either this function or `unwrap` are called at most once for a given
-    ///   `StackJob`.
+    /// This may only be called if a `check` on the enclosed latch has returned
+    /// `Ok`.
     #[inline(always)]
-    unsafe fn execute(this: NonNull<()>, worker: &Worker) {
-        // SAFETY: The caller ensures `this` can be converted into an immutable
-        // reference until we set the latch, and the latch has not yet been set.
-        let this = unsafe { this.cast::<Self>().as_ref() };
-        // Create an abort guard. If the closure panics, this will convert the
-        // panic into an abort. Doing so prevents use-after-free for other
-        // elements of the stack.
-        let abort_guard = unwind::AbortOnDrop;
-        // SAFETY: `f` is a `UnsafeCell<ManuallyDrop<F>>`. Creating a
-        // `&mut ManuallyDrop<F>` is only sound so long as no other live
-        // references exist.
-        //
-        // `f` is accessed mutably in two places: once here, and once in
-        // `unwrap`. Since this function is mutually exclusive with `unwrap`,
-        // and is called at most once, exclusive access is guaranteed.
-        let f_ref = unsafe { &mut *this.f.get() };
-        // SAFETY: `ManuallyDrop` requires us to ensure that it is not used
-        // again after we `take()` it's contents.
-        //
-        // `take()` is called in two places: once here, and once in `unwrap`.
-        // Since this function is mutually exclusive with `unwrap`, and is
-        // called at most once, the `ManuallyDrop<F>` is not used again.
-        let f = unsafe { ManuallyDrop::take(f_ref) };
-        // Run the job. If the job panics, we propagate the panic back to the
-        // main thread.
-        let result = unwind::halt_unwinding(|| f(worker));
-        // Get the uninitialized memory where we should put the return value.
-        let return_value = this.return_value.get();
-        // SAFETY: Writing to this unsafe cell requires that no other thread
-        // holds a reference to it's contents.
-        //
-        // The `return_value` is only written here and only read within
-        // `StackJob::return_value`, and then only after the latch has been set.
-        // The latch has not been set, and this function is called at most once,
-        // so no concurrent access can occur.
-        unsafe { (*return_value).write(result) };
-        // This synchronizes with the `Acquire` fence within `return_value()`,
-        // establishing a happens-before relationship that makes the preceding
-        // `return_value` write visible to the reader.
+    pub unsafe fn unwrap_output(mut self) -> T {
+        // Synchronize with the fence in `StackJob::execute`, establishing a
+        // happens-after relationship with the following read.
+        fence(Ordering::Acquire);
+        // SAFETY: For this access to be valid, we must first establish that
+        // we have exclusive access to `data`. Only three other functions access
+        // `data`, and none of them can race with this function:
         //
-        // This is required because latches do not synchronize memory.
-        fence(Ordering::Release);
-        // SAFETY: The caller ensures the job is valid until the latch is set.
-        // Since the latch is a field of the job, the latch must be valid until
-        // it is set.
-        unsafe { Latch::set(&this.completed) };
-        // Forget the abort guard, re-enabling panics.
-        core::mem::forget(abort_guard);
-    }
-}
-
-// -----------------------------------------------------------------------------
-// Stack allocated work function on a non-worker thread
-
-/// Like [`StackJob`] but allocated on the stack of a non-worker thread. While
-/// this job is pending, the owning thread is fully blocked.
-#[cfg(not(feature = "shuttle"))]
-pub struct ExternalJob<F, T> {
-    f: UnsafeCell<ManuallyDrop<F>>,
-    completed: AtomicU32,
-    return_value: UnsafeCell<MaybeUninit<ThreadResult<T>>>,
-}
-
-#[cfg(not(feature = "shuttle"))]
-impl<F, T> ExternalJob<F, T>
-where
-    F: FnOnce(&Worker) -> T + Send,
-    T: Send,
-{
-    /// Creates a new `ExternalJob`.
-    #[inline(always)]
-    pub fn new(f: F) -> ExternalJob<F, T> {
-        ExternalJob {
-            f: UnsafeCell::new(ManuallyDrop::new(f)),
-            completed: AtomicU32::new(0),
-            return_value: UnsafeCell::new(MaybeUninit::uninit()),
-        }
-    }
-
-    /// Creates a `JobRef` pointing to this job. The underlying `ExternalJob` is
-    /// not dropped after the `JobRef` is executed.
-    ///
-    /// # Safety
-    ///
-    /// The caller must ensure that:
-    ///
-    /// * The `ExternalJob` will not move or be deallocated until the `JobRef`
-    ///   is executed.
-    ///
-    /// * The `JobRef` does not outlive any data the `ExternalJob` closes over.
-    ///
-    /// * This function is not called again so long as the `JobRef` lives.
-    #[inline(always)]
-    pub unsafe fn as_job_ref(&self) -> JobRef {
-        let job_pointer = NonNull::from(self).cast();
-        // SAFETY: The `job_pointer` is trivially aligned and non-null,
-        // because it is derived from a reference.
-        //
-        // The caller must not allow the `ExternalJob` to move or be deallocated
-        // until the `JobRef` is executed. This guarantees that `job_pointer`
-        // remains valid for the lifetime of `JobRef`, satisfying the
-        // requirements of `JobRef::new_raw`.
-        //
-        // The caller guarantees that this function is not called again while
-        // `JobRef` lives, so `Self::execute` can be called at most once for
-        // this particular `ExternalJob`. This satisfies the at-most-once
-        // execution invariant documented on `Job::execute`.
-        unsafe { JobRef::new_raw(job_pointer, Self::execute) }
+        // * Since `check` has returned `Ok`, and the latch is only set in
+        //   `StackJob::execute`, `StackJob::execute` must have been called at
+        //   least once. `StackJob::execute` may be called at most once, so it
+        //   cannot be running now.
+        //
+        // * Since this function and `unwrap_func` both consume the `StackJob`,
+        //   and a `StackJob` cannot be duplicated, `unwrap_func` cannot be
+        //   running now.
+        //
+        // * Since this function and `unwrap_error` both consume the `StackJob`,
+        //   and a `StackJob` cannot be duplicated, `unwrap_error` cannot be
+        //   running now.
+        //
+        // Next, we must establish that it is valid to read from union field
+        // `output`. We know this because `check` returned `Ok`, which means
+        // `set` was called with a false `error_flag` within
+        // `StackJob::execute`. This always follows a write to union field
+        // `output`, after which the union is not written to again.
+        let output_ref = unsafe { &mut self.data.get_mut().output };
+        // SAFETY: The `StackJob` is dropped at the end of this block, so `data`
+        // is never accessed again.
+        unsafe { ManuallyDrop::take(output_ref) }
     }
 
-    /// Waits for the `ExternalJob` to be executed and returns the result.
+    /// Unwraps the job into an error.
     ///
     /// # Safety
     ///
-    /// This must be called at most once.
+    /// This may only be called if a `check` on the enclosed latch has returned
+    /// `Error`.
     #[inline(always)]
-    pub unsafe fn wait_for_value(&mut self) -> ThreadResult<T> {
-        // Wait for the complete flag to be set.
-        loop {
-            atomic_wait::wait(&self.completed, 0);
-            if self.completed.load(Ordering::Relaxed) == 1 {
-                break;
-            }
-        }
-        // Synchronize memory; we do this with a fence, so that we only do a
-        // relaxed load in the case of a spurious wakeup.
+    pub unsafe fn unwrap_error(mut self) -> Box<dyn Any + Send> {
+        // Synchronize with the fence in `StackJob::execute`, establishing a
+        // happens-after relationship with the following read.
         fence(Ordering::Acquire);
-        // Get a ref to the result.
-        let result_ref = self.return_value.get_mut();
-        // SAFETY: `assume_init_read` requires:
+        // SAFETY: For this access to be valid, we must first establish that
+        // we have exclusive access to `data`. Only three other functions access
+        // `data`, and none of them can race with this function:
         //
-        // * The `MaybeUninit` is fully initialized.
+        // * Since `check` has returned `Error`, and the latch is only set in
+        //   `StackJob::execute`, `StackJob::execute` must have been called at
+        //   least once. `StackJob::execute` may be called at most once, so it
+        //   cannot be running now.
         //
-        //   As this can only be called if we have observed that `completed` has
-        //   been set to 1, and that only happens at the end of
-        //   `ExternalJob::execute` (after `return_value` is written and memory
-        //   is synchronized via the above fence) the memory must be initialized.
+        // * Since this function and `unwrap_func` both consume the `StackJob`,
+        //   and a `StackJob` cannot be duplicated, `unwrap_func` cannot be
+        //   running now.
         //
-        // * That data not be incorrectly duplicated by repeated calls.
+        // * Since this function and `unwrap_output` both consume the `StackJob`,
+        //   and a `StackJob` cannot be duplicated, `unwrap_output` cannot be
+        //   running now.
         //
-        //   Data is not duplicated because this function is called at most
-        //   once.
-        unsafe { result_ref.assume_init_read() }
+        // Next, we must establish that it is valid to read from union field
+        // `error`. We know this because `check` returned `Error`, which means
+        // `set` was called with a true `error_flag` within `StackJob::execute`.
+        // This always follows a write to union field `error`, after which the
+        // union is not written to again.
+        let error_ref = unsafe { &mut self.data.get_mut().error };
+        // SAFETY: The `StackJob` is dropped at the end of this block, so `data`
+        // is never accessed again.
+        unsafe { ManuallyDrop::take(error_ref) }
     }
-}
 
-#[cfg(not(feature = "shuttle"))]
-impl<F, T> Job for ExternalJob<F, T>
-where
-    F: FnOnce(&Worker) -> T + Send,
-    T: Send,
-{
-    /// Executes an `ExternalJob` from a const pointer.
+    /// Executes a `StackJob` from a const pointer.
     ///
     /// # Safety
     ///
     /// The caller must ensure that:
     ///
-    /// * `this` is a non-null, properly aligned pointer to a live instance
-    ///   of `ExternalJob<F, T>`.
-    ///
-    /// * The `ExternalJob` will not move or be deallocated for as long as
-    ///   `completed` remains set to 0.
+    /// * `this` is an aligned pointer to an initialized `StackJob<F, T>`, which
+    ///   will not be invalidated until a `check` on the latch it contains
+    ///   returns something other than `Pending`.
     ///
-    /// * This function is called at most once for a given `ExternalJob`.
+    /// * This function is called at most once on any `StackJob`.
     #[inline(always)]
     unsafe fn execute(this: NonNull<()>, worker: &Worker) {
-        // SAFETY: The caller ensures `this` can be converted into an immutable
-        // reference until we set the `complete` atomic.
+        // SAFETY: The pointer `this` is non-null, aligned, and the caller
+        // ensures it points to an initialized `StackJob`.
+        //
+        // `StackJobs` are always accessed immutably except for `unwrap_func`,
+        // `unwrap_output`, and `unwrap_error`. The caller ensures these will not
+        // race this call, so the pointer is valid for immutable access.
         let this = unsafe { this.cast::<Self>().as_ref() };
         // Create an abort guard. If the closure panics, this will convert the
         // panic into an abort. Doing so prevents use-after-free for other
         // elements of the stack.
         let abort_guard = unwind::AbortOnDrop;
-        // SAFETY: `f` is a `UnsafeCell<ManuallyDrop<F>>`. Creating a
-        // `&mut ManuallyDrop<F>` is only sound so long as no other live
-        // references exist.
-        //
-        // Since this field is never access mutably except for here and this
-        // function is called at most once, exclusive access is guaranteed.
-        let f_ref = unsafe { &mut *this.f.get() };
-        // SAFETY: `ManuallyDrop` requires us to ensure that it is not used
-        // again after we `take()` it's contents.
-        //
-        // Since it is not used in the remainder of this function, and this
-        // function is called at most once, it is indeed not used again.
-        let f = unsafe { ManuallyDrop::take(f_ref) };
-        // Run the job. If the job panics, we propagate the panic back to the
-        // main thread.
-        let result = unwind::halt_unwinding(|| f(worker));
-        // Get the uninitialized memory where we should put the return value.
-        let return_value = this.return_value.get();
-        // SAFETY: Writing to this unsafe cell requires that no other thread
-        // holds a reference to it's contents.
-        //
-        // The `return_value` is only read within `ExternalJob::wait_for_value`,
-        // and then only after `completed` is set to 1. Since this function is
-        // called at most once, `completed` must still be set to 0. Therefore no
-        // concurrent access can occur.
-        unsafe { (*return_value).write(result) };
-        // Set `completed` to 1, allowing reads of the return value. This
-        // `Release` store synchronizes with the `Acquire` fence in
-        // `ExternalJob::wait_for_value`, establishing a happens-before
-        // relationship that makes the preceding `return_value` write visible
-        // to the waiting reader.
-        this.completed.store(1, Ordering::Release);
-        // Notify the waiting thread that the job is complete.
-        atomic_wait::wake_one(&this.completed);
+        // Run the function and record the result. Produces a boolean flag that
+        // is true in the event of a panic.
+        let error_flag = {
+            // SAFETY: Only `unwrap_func`, `unwrap_output` and `unwrap_error`
+            // access `data`. Due to their individual safety contracts, they can
+            // only be called in a way that will not race with this function, so
+            // we must have unique access.
+            let data_ref = unsafe { &mut *this.data.get() };
+            // SAFETY: Each `StackJob` is constructed using field `func`, and
+            // this is the only place we write to the union after construction.
+            // As this function is called at most once, it must still be valid
+            // to access the union with field `func`.
+            let func_ref = unsafe { &mut data_ref.func };
+            // SAFETY: The `func` field is overwritten by the following match
+            // block, so it will not be accessed again.
+            let func = unsafe { ManuallyDrop::take(func_ref) };
+            // Run the job. If the job panics, we propagate the panic back to the
+            // main thread.
+            let result = unwind::halt_unwinding(|| func(worker));
+            // Emit different signals depending on if the function completed
+            // successfully or panicked.
+            match result {
+                Ok(output) => {
+                    data_ref.output = ManuallyDrop::new(output);
+                    false
+                }
+                Err(error) => {
+                    data_ref.error = ManuallyDrop::new(error);
+                    true
+                }
+            }
+        };
+        // This synchronizes with the `Acquire` fence in the reading method
+        // (e.g. `unwrap_error`), establishing a happens-before relationship
+        // that makes the preceding write to `data` visible to the reader.
+        //
+        // This is required because latches do not synchronize memory.
+        fence(Ordering::Release);
+        // SAFETY: This casts a reference to a raw pointer, which means the
+        // pointer must be aligned, non-null, and point to an initialized latch.
+        //
+        // We also meet Variant 2 of the `set` safety contract:
+        //
+        // * The latch has not been `set` since it was created or last `reset`,
+        //   and calls to `set` do not race.
+        //
+        //   This is the only place where this latch is set, and the caller
+        //   ensures this is called at most once. Therefore `set` cannot have
+        //   been called already, and there can be no other calls to `set` that
+        //   would race with this one.
+        //
+        // * The latch will not be dropped or moved until after `check` returns
+        //   something other than `Pending`.
+        //
+        //   The caller ensures that this `StackJob` is not dropped until a
+        //   `check` on the latch returns something other than `Pending`, and
+        //   nothing removes the latch from the `StackJob`.
+        unsafe { Latch::set(&this.completed, error_flag) };
         // Forget the abort guard, re-enabling panics.
         core::mem::forget(abort_guard);
     }
 }
 
-#[cfg(feature = "shuttle")]
-pub struct ExternalJob<F, T> {
-    f: UnsafeCell<ManuallyDrop<F>>,
-    mutex: shuttle::sync::Mutex<Option<ThreadResult<T>>>,
-    condvar: shuttle::sync::Condvar,
-}
-
-#[cfg(feature = "shuttle")]
-impl<F, T> ExternalJob<F, T>
-where
-    F: FnOnce(&Worker) -> T + Send,
-    T: Send,
-{
-    /// Creates a new `ExternalJob`.
-    #[inline(always)]
-    pub fn new(f: F) -> ExternalJob<F, T> {
-        ExternalJob {
-            f: UnsafeCell::new(ManuallyDrop::new(f)),
-            mutex: shuttle::sync::Mutex::new(None),
-            condvar: shuttle::sync::Condvar::new(),
-        }
-    }
-
-    #[inline(always)]
-    #[allow(clippy::undocumented_unsafe_blocks)]
-    pub unsafe fn as_job_ref(&self) -> JobRef {
-        let job_pointer = NonNull::from(self).cast();
-        unsafe { JobRef::new_raw(job_pointer, Self::execute) }
-    }
-
-    #[inline(always)]
-    pub unsafe fn wait_for_value(&mut self) -> ThreadResult<T> {
-        let mut value = self.mutex.lock().unwrap();
-        while value.is_none() {
-            value = self.condvar.wait(value).unwrap();
-        }
-        Option::take(&mut value).unwrap()
-    }
-}
-
-#[cfg(feature = "shuttle")]
-impl<F, T> Job for ExternalJob<F, T>
-where
-    F: FnOnce(&Worker) -> T + Send,
-    T: Send,
-{
-    #[inline(always)]
-    #[allow(clippy::undocumented_unsafe_blocks)]
-    unsafe fn execute(this: NonNull<()>, worker: &Worker) {
-        let this = unsafe { this.cast::<Self>().as_ref() };
-        let abort_guard = unwind::AbortOnDrop;
-        let f_ref = unsafe { &mut *this.f.get() };
-        let f = unsafe { ManuallyDrop::take(f_ref) };
-        let result = unwind::halt_unwinding(|| f(worker));
-        let mut value = this.mutex.lock().unwrap();
-        *value = Some(result);
-        this.condvar.notify_one();
-        core::mem::forget(abort_guard);
-    }
-}
-
 // -----------------------------------------------------------------------------
 // Heap allocated work function
 
@@ -667,7 +523,7 @@ pub struct HeapJob<F> {
 
 impl<F> HeapJob<F>
 where
-    F: FnOnce(&Worker) + Send,
+    F: FnOnce(&Worker),
 {
     /// Allocates a new `HeapJob` on the heap.
     #[inline(always)]
@@ -676,49 +532,74 @@ where
     }
 
     /// Converts the heap job into an "owning" `JobRef`. The job will be
-    /// automatically dropped when the `JobRef` is executed.
-    ///
-    /// This will leak memory if the `JobRef` is not executed, so the caller
-    /// must ensure that it is eventually executed (unless the process is
-    /// exiting).
+    /// automatically dropped when the `JobRef` is executed (or will leak if it
+    /// is not executed).
     ///
     /// # Safety
     ///
-    /// If the `JobRef` is executed, the caller must ensure that it has not
-    /// outlived the data it closes over. In other words, if the closure
-    /// references something, that thing must live until the `JobRef` is
-    /// executed or dropped.
+    /// The caller must ensure that:
+    ///
+    /// * The `JobRef` will not outlive any of the items closed over by the
+    ///   function `f`.
+    ///
+    /// * If `f` is `!Send` then `JobRef::execute` is only called on the thread
+    ///   where the `HeapJob` was constructed.
     #[inline(always)]
     pub unsafe fn into_job_ref(self: Box<Self>) -> JobRef {
         // SAFETY: Pointers produced by `Box::into_raw` are never null.
-        let job_pointer = unsafe { NonNull::new_unchecked(Box::into_raw(self)).cast() };
-
-        // SAFETY: The pointer was created by a call to `Box::into_raw` so it is
-        // valid to pass in to `Self::execute`.
+        let job_pointer =
+            unsafe { NonNull::new_unchecked(Box::into_raw(self)).cast() };
+        // SAFETY: The doc-comment for this function defines the conditions
+        // under which this `JobRef` will be considered "executable".
+        //
+        // We must now show that it is sound to call `HeapJob::execute` on
+        // `job_ref` under these conditions, which in turn requires that:
+        //
+        // * `job_pointer` is an aligned pointer to an initialized `HeapJob`.
+        //
+        //   We created it by calling `Box::into_raw` on `self`, which is a
+        //   `Box<HeapJob>`, so it must be.
+        //
+        // * `HeapJob::execute` is called at most once on any `HeapJob`.
+        //
+        //   `into_job_ref` converts the `HeapJob` into a `JobRef`, and
+        //   `JobRef::execute` consumes the `JobRef` to call `HeapJob::execute`,
+        //   so it can be called at most once.
+        //
+        // * This function is only called during the lifetime of the items
+        //   closed over by the function.
+        //
+        //   The `JobRef` is not allowed to outlive the items closed over by the
+        //   function, so `JobRef::execute` and hence `HeapJob::execute` can
+        //   only be called during that interval.
+        //
+        // * Accessing `f` will not violate a `!Send` requirement.
         //
-        // Because this function takes ownership of `Self` to produce a
-        // `JobRef`, `JobRef::execute` takes ownership of the `JobRef` to call
-        // `Self::execute`, the job_pointer cannot be used after `Self::execute`
-        // is called. So it is safe for the pointer to become dangling.
-        unsafe { JobRef::new_raw(job_pointer, Self::execute) }
+        //   This is ensured by the executability condition.
+        unsafe { JobRef::new(job_pointer, Self::execute) }
     }
-}
 
-impl<F> Job for HeapJob<F>
-where
-    F: FnOnce(&Worker) + Send,
-{
     /// Executes a `Box<HeapJob>`, dropping it when completed.
     ///
     /// # Safety
     ///
-    /// The caller must ensure that `this` is a pointer, created by calling
-    /// `Box::into_raw` on a `Box<HeapJob<F>>`. After the call `this` must be
-    /// treated as dangling.
+    /// The caller must ensure that:
+    ///
+    /// * `this` was produced by calling `Box::into_raw` on a `Box<HeapJob<F>>`.
+    ///
+    /// * This function is called at most once on any `HeapJob`.
+    ///
+    /// * Any items the `HeapJob` closes over are still live.
+    ///
+    /// * If the `HeapJob` is `!Send` then this is called on the thread where
+    ///   the `HeapJob` was constructed.
     #[inline(always)]
     unsafe fn execute(this: NonNull<()>, worker: &Worker) {
-        // SAFETY: The caller ensures `this` was created by `Box::into_raw` and
-        // that this is called only once.
+        // SAFETY: The caller ensures that:
+        //
+        // * `this` was created by `Box::into_raw`.
+        //
+        // * This function is called at most once.
         let this = unsafe { Box::from_raw(this.cast::<Self>().as_ptr()) };
         // Run the job.
         (this.f)(worker);
diff --git a/src/latch.rs b/src/latch.rs
index 425a9b8..e03ee26 100644
--- a/src/latch.rs
+++ b/src/latch.rs
@@ -1,11 +1,8 @@
-//! Forte borrows the *latch* concept from Rayon.
+//! Forte borrows the *latch* concept from Rayon. Every forte worker thread has
+//! a single binary semaphore, used for parking and unparking the thread.
 //!
-//! Every forte worker thread has a single "sleep controller" that it uses to
-//! park and unpark itself. Latches build on this to create a simple boolean
-//! switch, which allows the owning thread to sleep until the latch becomes set
-//! by another thread.
-//!
-//! Every latch points at one "sleep controller".
+//! Latches build on top of semaphores, allowing workers to wait for specific
+//! events while still permitting wakeups from other sources on the semaphore.
 
 use alloc::task::Wake;
 use core::borrow::Borrow;
@@ -17,14 +14,18 @@ use crate::platform::*;
 
 /// The default state of a latch is `LOCKED`. When in the locked state, `check`
 /// returns `false` and `wait` blocks.
-const LOCKED: u32 = 0b00;
+const LOCKED: u32 = 0b000;
+
+/// The latch enters the `SIGNAL` state when it is set (with error flag false).
+/// When in this state, `check` returns `Status::Ok` and `wait` does not block.
+const SIGNAL: u32 = 0b001;
 
-/// The latch enters the `SIGNAL` state when it is set. When in this state,
-/// `check` returns `true` and `wait` does not block.
-const SIGNAL: u32 = 0b01;
+/// The latch enters the `ERROR` state when it is set (with error flag true).
+/// When in this state, `check` returns `Status::Error` and `wait` does not block.
+const ERROR: u32 = 0b010;
 
 /// The latch enters the `ASLEEP` state when blocking with `wait`.
-const ASLEEP: u32 = 0b10;
+const ASLEEP: u32 = 0b100;
 
 // -----------------------------------------------------------------------------
 // Latch
@@ -33,8 +34,8 @@ const ASLEEP: u32 = 0b10;
 /// occurred. The latch begins as *unset* (In the `LOCKED` state), and can later
 /// be *set* by any thread (entering the `SIGNAL`) state.
 ///
-/// Each latch is associated with one *owner thread*. This is the thread that
-/// may be blocking, waiting for the latch to complete.
+/// Each latch is "owned" by a single thread at a time; other threads may set
+/// the latch, but only the owning thread may wait on it.
 ///
 /// The general idea and spirit for latches (as well as some of the
 /// documentation) is due to rayon. However the implementation is specific to
@@ -49,38 +50,39 @@ pub struct Latch {
     /// Holds the internal state of the latch. This tracks if the latch has been
     /// set or not.
     state: AtomicU32,
-    /// Tracks the number of sleeping threads in the pool.
-    sleeping: &'static AtomicU32,
-    /// The sleep controller for the owning thread.
-    sleep_controller: &'static SleepController,
-    /// The seat number that owns this latch
-    seat_number: usize,
+    /// The semaphore that this latch will use for signaling.
+    semaphore: &'static Semaphore,
+}
+
+pub enum Status {
+    Pending,
+    Ok,
+    Error,
 }
 
 impl Latch {
-    /// Creates a new latch, owned by a specific thread.
-    pub fn new(
-        seat_number: usize,
-        sleeping: &'static AtomicU32,
-        sleep_controller: &'static SleepController,
-    ) -> Latch {
+    /// Creates a new latch backed by the provided semaphore.
+    pub fn new(semaphore: &'static Semaphore) -> Latch {
         Latch {
             state: AtomicU32::new(LOCKED),
-            sleeping,
-            sleep_controller,
-            seat_number,
+            semaphore,
         }
     }
 
     /// Checks to see if the latch has been set. Returns true if it has been.
     #[inline(always)]
-    pub fn check(&self) -> bool {
-        self.state.load(Ordering::Relaxed) == SIGNAL
+    pub fn check(&self) -> Status {
+        match self.state.load(Ordering::Relaxed) {
+            SIGNAL => Status::Ok,
+            ERROR => Status::Error,
+            _ => Status::Pending,
+        }
     }
 
-    /// Puts the thread to sleep if the latch has not been set. The thread will
-    /// be woken when the latch becomes set, but may also wake before then. The
-    /// caller should always re-check the latch condition after this returns.
+    /// Checks if the latch has been set, and if not waits for a signal on the
+    /// semaphore. This does _not_ wait for the latch to actually become set,
+    /// and may return early. The caller should always re-check the latch
+    /// condition after this returns.
     ///
     /// # Memory Ordering
     ///
@@ -89,7 +91,7 @@ impl Latch {
     /// The other thread must issue a corresponding `fence(Ordering::Release)`
     /// call.
     #[cold]
-    pub fn wait(&self) {
+    pub fn wait(&self, seat_bitmask: u32, waiting_bitmask: &'static AtomicU32) {
         // First, check if the latch has been set.
         //
         // In the event of a race with `set`:
@@ -97,21 +99,24 @@ impl Latch {
         // * If this happens before the store, then we will go to sleep.
         //
         // * If this happens after the store, then we notice and return.
-        if self.state.load(Ordering::Relaxed) == SIGNAL {
+        if self.state.load(Ordering::Relaxed) & (SIGNAL | ERROR) != 0 {
             return;
         }
-        // If it has not been set, go to sleep.
+        // If it has not been set, wait for a signal on the semaphore.
         //
-        // In the event of a race with `set`, the `wake` will always cause this
-        // to return regardless of memory ordering.
-        self.sleep_controller.sleep(self.seat_number, self.sleeping);
+        // In the event of a race with `set`, the call to `Semaphore::signal`
+        // will always end up unblocking this, no matter the memory-ordering.
+        self.semaphore.wait(seat_bitmask, waiting_bitmask);
     }
 
-    /// Activates the latch, potentially unblocking the owning thread.
+    /// Sets the latch, and sends a signal over the semaphore.
     ///
     /// This takes a raw pointer because the latch may be de-allocated by a
     /// different thread while this function is executing.
     ///
+    /// When `error` is set to `true`, `check` returns `Status::Error` rather
+    /// than `Status::Ok`.
+    ///
     /// # Memory Ordering
     ///
     /// This does not synchronize memory. To synchronize memory with the waiting
@@ -120,33 +125,61 @@ impl Latch {
     ///
     /// # Safety
     ///
-    /// The latch pointer must be valid when passed to this function. After this
-    /// call, the latch pointer may become dangling and must not be dereferenced
-    /// unless it is known to still be valid.
+    /// The caller must ensure that:
+    ///
+    /// * `latch` is a non-null, aligned pointer to an initialized `Latch`.
+    ///
+    /// * Additionally, one of the following condition variants must be met:
+    ///
+    ///   1. The latch will not be dropped or moved for the duration of `set`.
+    ///
+    ///   2. The latch has not been `set` since it was created or last `reset`,
+    ///      calls to `set` do not race, and the latch will not be dropped or
+    ///      moved until after `check` returns something other than `Pending`.
     #[inline(always)]
-    pub unsafe fn set(latch: *const Latch) {
-        // SAFETY: The caller guarantees the latch remain alive until `set`
-        // returns.
-        let latch = unsafe { &*latch };
-        let sleep_controller = latch.sleep_controller;
-        // First we set the state to true.
+    pub unsafe fn set(latch: *const Latch, error: bool) {
+        // First we store a reference to the semaphore (which is 'static) so
+        // that we can access it even if the latch pointer becomes dangling.
+        //
+        // SAFETY: The caller guarantees the latch pointer is aligned and non-null.
+        //
+        // If Variant 1 is met, the latch cannot be dangling.
+        //
+        // If Variant 2 is met, the latch cannot become dangling so long as the
+        // state is `LOCKED` (because `check` will return `Pending`). Since there
+        // can have been no previous call to `set` since construction or the
+        // last `reset`, and there can be no racing calls to `set`, the state
+        // must be `LOCKED`. Therefore the latch cannot be dangling.
+        //
+        // Since this pointer is aligned, non-null, is not dangling, and the
+        // latch is never accessed mutably, it is valid to access immutably.
+        let semaphore = unsafe { (*latch).semaphore };
+        // Determine the next state for the latch.
+        let state = if error { ERROR } else { SIGNAL };
+        // Next we update the state.
         //
         // In the event of a race with `wait`, this may cause `wait` to return.
-        // Otherwise the other thread will sleep within `wait.
-        latch.state.store(SIGNAL, Ordering::Relaxed);
-        // We must try to wake the other thread, just in case it missed the
-        // notification and went to sleep. This guarantees that the other thread
-        // will make progress.
-        sleep_controller.wake();
+        // Otherwise the other thread will sleep within `wait`.
+        //
+        // SAFETY: The latch is still valid to access immutably, following the
+        // same logic as above.
+        //
+        // NOTE: This store will mean `check` no longer returns `Pending`,
+        // invalidating the argument in Variant 2. The latch pointer therefore
+        // may become dangling after this line.
+        unsafe { (*latch).state.store(state, Ordering::Relaxed) };
+        // Finally we try to signal the target thread on its semaphore, just in
+        // case it missed the notification and is currently waiting. This
+        // guarantees that the other thread will make progress.
+        semaphore.signal();
     }
 
     /// Restores the latch to the default state.
     ///
     /// # Deadlocks
     ///
-    /// This may only be called by the thread that "owns" the latch, and only
-    /// after it has *observed* the latch entering the `SIGNAL` state, e.g.
-    /// after either `wait` or `check` has returned `true`.
+    /// This must only be called by the thread that "owns" the latch, and only
+    /// after it has *observed* `check` return something other than `Pending`.
     ///
     /// Calling `reset` from a different thread or before observing the signal
     /// is likely to result in deadlocks.
@@ -157,30 +190,28 @@ impl Latch {
 }
 
 // -----------------------------------------------------------------------------
-// Sleeper
+// Signals
 
-/// Used, in combination with a latch to park and unpark threads.
-#[cfg(not(feature = "shuttle"))]
-pub struct SleepController {
+/// A low-overhead binary semaphore used for task signaling.
+pub struct Semaphore {
     state: AtomicU32,
 }
 
-#[cfg(not(feature = "shuttle"))]
-impl SleepController {
-    /// Creates a new sleep controller.
+impl Semaphore {
+    /// Creates a new semaphore in the `LOCKED` state.
     pub const fn new() -> Self {
-        SleepController {
+        Semaphore {
             state: AtomicU32::new(LOCKED),
         }
     }
 
-    /// Attempt to wake the thread to which this belongs.
+    /// Sends a signal to the semaphore.
     ///
-    /// Returns true if this allows the thread to make progress (by waking it up
-    /// or catching it before it goes to sleep) and false if the thread was
-    /// running.
+    /// Returns true if this allows a waiting thread to make progress (by
+    /// signaling it while it was waiting or catching it before it started
+    /// waiting) and false if the thread was running.
     #[inline(always)]
-    pub fn wake(&self) -> bool {
+    pub fn signal(&self) -> bool {
         // Set the state to SIGNAL and read the current state, which must be
         // either LOCKED, ASLEEP or SIGNAL.
         let sleep_state = self.state.swap(SIGNAL, Ordering::Relaxed);
@@ -202,22 +233,26 @@ impl SleepController {
         sleep_state == ASLEEP
     }
 
-    /// Attempt to send the thread to sleep. This should only be called on a
-    /// single thread, and we say that this controller "belongs" to that thread.
+    /// Waits for a signal on the semaphore.
     ///
-    /// Returns true if this thread makes a syscall to suspend the thread, and
-    /// false if the thread was already woken (letting us skip the syscall).
+    /// Calls to `wait` should be fully ordered. In other words, this method
+    /// must not be called on the same value by two different threads unless a
+    /// "happens-before relationship" has been established between the calls via
+    /// memory synchronization.
     #[cold]
-    pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) {
-        // Set the state to ASLEEP and read the current state, which must be
-        // either LOCKED or SIGNAL.
+    pub fn wait(&self, seat_mask: u32, waiting_bitmask: &'static AtomicU32) {
+        // Set the state to ASLEEP and read the current state.
         let state = self.state.swap(ASLEEP, Ordering::Relaxed);
+        // The previous state should not have been ASLEEP, because calls to
+        // `wait` must be fully ordered, and the state is only set to ASLEEP
+        // while `wait` is executing.
+        //
         // If the state is LOCKED, then we have not yet received a signal, and
         // we should try to put the thread to sleep. Otherwise we should return
         // early.
         if state == LOCKED {
             // Set the sleeping bit for this worker.
-            sleeping.fetch_or(1 << seat_number, Ordering::Relaxed);
+            waiting_bitmask.fetch_or(seat_mask, Ordering::Relaxed);
             // If we have received a signal since entering the sleep state
             // (meaning the state is no longer set to ASLEEP) then this will
             // return immediately.
@@ -228,7 +263,7 @@ impl SleepController {
             // Either way, there is no way we can fail to receive a `wake`.
             atomic_wait::wait(&self.state, ASLEEP);
             // Clear the sleeping bit for this worker.
-            sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed);
+            waiting_bitmask.fetch_and(!seat_mask, Ordering::Relaxed);
         }
         // Set the state back to LOCKED so that we are ready to receive new
         // signals.
@@ -236,62 +271,19 @@ impl SleepController {
     }
 }
 
-// -----------------------------------------------------------------------------
-// Shuttle sleeper fallback
-
-/// This is a fallback implementation because the futex api is not available on
-/// shuttle.
-#[cfg(feature = "shuttle")]
-pub struct SleepController {
-    state: Mutex<u32>,
-    condvar: Condvar,
-}
-
-#[cfg(feature = "shuttle")]
-impl SleepController {
-    pub fn new() -> Self {
-        SleepController {
-            state: Mutex::new(LOCKED),
-            condvar: Condvar::new(),
-        }
-    }
-
-    pub fn wake(&self) -> bool {
-        let state = core::mem::replace(&mut *self.state.lock().unwrap(), SIGNAL);
-        let asleep = state == ASLEEP;
-        if asleep {
-            self.condvar.notify_one();
-        }
-        asleep
-    }
-
-    pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) {
-        let mut state = self.state.lock().unwrap();
-        if *state == LOCKED {
-            *state = ASLEEP;
-            sleeping.fetch_or(1 << seat_number, Ordering::Relaxed);
-            while *state == ASLEEP {
-                state = self.condvar.wait(state).unwrap();
-            }
-            sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed);
-        }
-        *state = LOCKED;
-    }
-}
-
 // -----------------------------------------------------------------------------
 // Async wakers
 
 impl Wake for Latch {
     fn wake(self: Arc<Self>) {
         // SAFETY: The borrowed `Arc` is held for the duration of this call,
-        // keeping the `Latch` alive.
-        unsafe { Latch::set(self.borrow()) };
+        // keeping the `Latch` alive, and satisfying Variant 1 of `Latch::set`.
+        unsafe { Latch::set(self.borrow(), false) };
     }
 
     fn wake_by_ref(self: &Arc<Self>) {
         // SAFETY: The borrowed `Arc` is held for the duration of this call,
-        // keeping the `Latch` alive.
-        unsafe { Latch::set(self.borrow()) };
+        // keeping the `Latch` alive, and satisfying Variant 1 of `Latch::set`.
+        unsafe { Latch::set(self.borrow(), false) };
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 7fb6758..6675a7b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -30,7 +30,7 @@
 //!     THREAD_POOL.resize_to_available();
 //!
 //!     // Register this thread as a worker on the pool.
-//!     THREAD_POOL.expect_worker(|worker| {
+//!     THREAD_POOL.with_worker(|worker| {
 //!         // Spawn a job onto the pool. The closure also accepts a worker, because the
 //!         // job may be executed on a different thread. This will be the worker for whatever
 //!         // thread it executes on.
@@ -130,19 +130,18 @@
 //!
 //! Thread pools are comprised of (and run on) workers, represented as instances
 //! of the [`Worker`] type. All work done on the pool is done in a "worker
-//! context" created by [`Worker::occupy`]. The recommended way to access a
-//! worker context for a specific pool is via [`ThreadPool::with_worker`],
-//! [`ThreadPool::on_worker`], or [`ThreadPool::expect_worker`].
+//! context" created by [`Membership::activate`]. The recommended way to access a
+//! worker context for a specific pool is via [`ThreadPool::with_worker`].
 //!
 //! ```
 //! # use forte::ThreadPool;
 //! # static THREAD_POOL: ThreadPool = ThreadPool::new();
-//! THREAD_POOL.expect_worker(|worker_1| {     // <-- Sets up this thread as a worker.
-//!     THREAD_POOL.expect_worker(|worker_2| { // <-- Returns a reference to the existing worker.
+//! THREAD_POOL.with_worker(|worker_1| {     // <-- Sets up this thread as a worker.
+//!     THREAD_POOL.with_worker(|worker_2| { // <-- Returns a reference to the existing worker.
 //!         // These pointers are identical.
 //!         assert!(std::ptr::eq(worker_1, worker_2));
-//!     });                                    // <-- Leaving this scope does nothing.
-//! });                                        // <-- Leaving this scope frees the worker.
+//!     });                                  // <-- Leaving this scope does nothing.
+//! });                                      // <-- Leaving this scope frees the worker.
 //! ```
 //!
 //! Every worker holds a local queue of tasks, as well as metadata that allows
@@ -174,40 +173,16 @@
 //!
 //! # Core Operations
 //!
-//! Thread pools support four core operations:
+//! Thread pools support five core operations:
 //! * *Join.* Executes two non-static closures, possibly in parallel, and waits for them to complete.
 //! * *Spawn.* Runs a static closure or future in the background.
 //! * *Scope.* Runs multiple non-static closures or futures, and waits for them all to complete.
 //! * *Block on.* Waits for a future to complete (outside of an async context).
+//! * *Broadcast.* Runs the same operation across all workers.
+//!
 //!
-//! All of these with the exception of *Spawn* are blocking; they have a
-//! specific join-point where a thread must wait for all the forks of the
-//! parallel operation to complete before proceeding. While it is waiting,
-//! threads will attempt to do background work, or help each-other out with
-//! their assigned workload.
-//!
-//! Each operation is available in three different "flavors", depending on the
-//! information available at the callsite.
-//!
-//! | Operation | Headless | Thread pool | Worker |
-//! |-----------|----------|-------------|--------|
-//! | *Join*     | [`join()`] | [`ThreadPool::join()`] | [`Worker::join()`]
-//! | *Spawn*    | [`spawn()`] | [`ThreadPool::spawn()`] | [`Worker::spawn()`]
-//! | *Scope*    | [`scope()`] | [`ThreadPool::scope()`] | [`Worker::scope()`]
-//! | *Block on* | [`block_on()`] | [`ThreadPool::block_on()`] | [`Worker::block_on()`]
-//!
-//! * *Headless.* Looks for an existing worker context, and panics if it doesn't find one.
-//! * *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one.
-//! * *Worker.* Uses the provided worker context.
-//!
-//! The headless and thread pool flavors are more or less just aliases for the
-//! worker flavor. Where possible, the worker flavor should be preferred to the
-//! thread pool flavor, and the thread pool flavor should be preferred to the
-//! headless flavor.
 
 #![no_std]
-#![cfg_attr(feature = "shuttle", allow(dead_code))]
-#![cfg_attr(feature = "shuttle", allow(unused_imports))]
 
 // -----------------------------------------------------------------------------
 // Boilerplate for building without the standard library
@@ -223,6 +198,7 @@ mod job;
 mod latch;
 mod scope;
 mod thread_pool;
+mod time;
 mod unwind;
 mod util;
 
@@ -239,68 +215,58 @@ pub struct FutureMarker();
 // Top-level exports
 
 pub use scope::Scope;
-pub use scope::ScopedSpawn;
+pub use scope::SpawnScoped;
+pub use thread_pool::Broadcast;
+pub use thread_pool::DEFAULT_POOL;
+pub use thread_pool::DefaultThreadPool;
+pub use thread_pool::Membership;
 pub use thread_pool::Spawn;
+pub use thread_pool::SpawnLocal;
 pub use thread_pool::Task;
 pub use thread_pool::ThreadPool;
 pub use thread_pool::Worker;
 pub use thread_pool::Yield;
 pub use thread_pool::block_on;
+pub use thread_pool::broadcast;
 pub use thread_pool::join;
+pub use thread_pool::num_members;
 pub use thread_pool::scope;
 pub use thread_pool::spawn;
+pub use thread_pool::spawn_broadcast;
 
 // -----------------------------------------------------------------------------
 // Platform Support
 
-// This crate uses `shuttle` for testing, which requires mocking all of the core
-// threading primitives (`Mutex` and the like).
-//
-// To make things a bit simpler, we re-export all the important types in the
-// `primitives` module.
-
-#[cfg(not(feature = "shuttle"))]
+// This exists to make it easy to swap out the basic parallelism primitives.
+// Currently there are no alternative implementations, but there may be in
+// future.
 mod platform {
 
-    // Core exports
-
-    pub use alloc::sync::Arc;
     pub use core::sync::atomic::AtomicBool;
     pub use core::sync::atomic::AtomicPtr;
     pub use core::sync::atomic::AtomicU32;
+    pub use core::sync::atomic::AtomicUsize;
     pub use core::sync::atomic::Ordering;
+    pub use core::sync::atomic::fence;
+
+    pub use alloc::sync::Arc;
     pub use std::sync::Mutex;
-    pub use std::sync::OnceLock;
     pub use std::thread::Builder as ThreadBuilder;
     pub use std::thread::JoinHandle;
-    pub use std::thread::available_parallelism;
     pub use std::thread_local;
-}
-
-#[cfg(feature = "shuttle")]
-mod platform {
-
-    // Core exports
 
-    pub use std::sync::OnceLock; // shuttle has no OnceLock; std's version is fine here
+    pub use std::thread::available_parallelism;
 
-    pub use shuttle::rand::Rng;
-    pub use shuttle::rand::thread_rng;
-    pub use shuttle::sync::Arc;
-    pub use shuttle::sync::Condvar;
-    pub use shuttle::sync::Mutex;
-    pub use shuttle::sync::Weak;
-    pub use shuttle::sync::atomic::AtomicBool;
-    pub use shuttle::sync::atomic::AtomicPtr;
-    pub use shuttle::sync::atomic::AtomicU32;
-    pub use shuttle::sync::atomic::Ordering;
-    pub use shuttle::thread::Builder as ThreadBuilder;
-    pub use shuttle::thread::JoinHandle;
-    pub use shuttle::thread_local;
+    use std::sync::LazyLock;
+    pub struct Lazy<T>(LazyLock<T>);
 
-    // Available parallelism
+    impl<T> Lazy<T> {
+        pub const fn new(init: fn() -> T) -> Self {
+            Lazy(LazyLock::new(init))
+        }
 
-    pub fn available_parallelism() -> std::io::Result<core::num::NonZero<usize>> {
-        panic!("available_parallelism does not work on shuttle");
+        pub fn get(&'static self) -> &'static T {
+            LazyLock::force(&self.0)
+        }
     }
 }
diff --git a/src/scope.rs b/src/scope.rs
index 03c3bb7..6f41229 100644
--- a/src/scope.rs
+++ b/src/scope.rs
@@ -11,7 +11,6 @@ use core::mem::ManuallyDrop;
 use core::pin::Pin;
 use core::ptr;
 use core::ptr::NonNull;
-use core::sync::atomic::fence;
 use core::task::Context;
 use core::task::Poll;
 use core::task::RawWaker;
@@ -28,9 +27,11 @@ use crate::job::HeapJob;
 use crate::job::JobRef;
 use crate::latch::Latch;
 use crate::platform::*;
+use crate::thread_pool::Broadcast;
 use crate::thread_pool::Worker;
 use crate::unwind;
 use crate::unwind::AbortOnDrop;
+use crate::util::IterBits;
 
 // -----------------------------------------------------------------------------
 // Scope
@@ -59,8 +60,10 @@ use crate::unwind::AbortOnDrop;
 ///
 /// The `'env: 'scope` bound is part of the definition of the `Scope` type. The
 /// requirement that scoped work outlive `'scope` is part of the definition of
-/// the [`ScopedSpawn`] trait.
+/// the [`SpawnScoped`] trait.
 pub struct Scope<'scope, 'env: 'scope> {
+    /// The thread-pool this scope is attached to.
+    thread_pool: &'static ThreadPool,
     /// Number of active references to the scope (including the owning
     /// allocation). This is incremented each time a new `ScopePtr` is created,
     /// and decremented when a `ScopePtr` is dropped or the owning thread is
@@ -110,22 +113,15 @@ where
     F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,
 {
     let abort_guard = AbortOnDrop;
-    // SAFETY: `Scope::new` requires:
-    //
-    // 1. The `Scope` is never moved after initialization.
-    //
-    // 2. `complete` is called exactly once before the `Scope` is dropped.
-    //
-    // The scope is not moved in this function, and since no `&mut Scope`
-    // reference is allowed to escape, the caller cannot safely cause the scope
-    // to move either.
-    //
-    // `Scope::complete` is called unconditionally on the line below, before
-    // the implicit drop of `scope`. If the closure `f` panics, it is caught and
-    // re-emitted after `complete` finishes. In the event of an uncaught panic,
-    // we cannot ensure `complete` runs properly before the scope is dropped, so
-    // we force an abort via an `AbortOnDrop` guard.
-    let scope = unsafe { Scope::new(worker) };
+    // Create a new scope object on the stack.
+    let scope = Scope {
+        thread_pool: worker.thread_pool(),
+        count: AtomicU32::new(1),
+        completed: worker.new_latch(),
+        panic: AtomicPtr::new(ptr::null_mut()),
+        _scope: PhantomData,
+        _env: PhantomData,
+    };
     // Panics that occur within the closure should be caught and propagated once
     // all spawned work is complete. This is not a safety requirement, it's just
     // a nicer behavior than aborting.
@@ -139,8 +135,8 @@ where
     // Now that the user has (presumably) spawned some work onto the scope, we
     // must wait for it to complete.
     //
-    // SAFETY: This is called only once, and we provide the same worker used to
-    // create the scope.
+    // SAFETY: This is called only once within this function, and then the scope
+    // is dropped.
     unsafe { scope.complete(worker) };
     // At this point all work on the scope is complete, so it is safe to drop
     // the scope. This also means we can relinquish our abort guard (returning
@@ -153,44 +149,18 @@ where
 }
 
 impl<'scope, 'env> Scope<'scope, 'env> {
-    /// Creates a new scope
-    ///
-    /// # Safety
+    /// Runs a closure or future sometime before the scope completes.
     ///
-    /// The caller must ensure:
-    ///
-    /// * The `Scope` is never moved after creation. `ScopePtr::new` captures a
-    ///   raw `*const Scope` pointer, and spawned jobs hold onto these pointers
-    ///   until they complete. Moving the scope would invalidate these pointers
-    ///   and cause UB when any `ScopePtr` is dropped or used for scope access.
-    ///
-    /// * `complete` is called exactly once before the `Scope` is dropped, after
-    ///   which no `ScopePtr` may be created for this scope. `complete` blocks
-    ///   until the reference count ticks down to zero, ensuring that the scope
-    ///   outlives all `ScopePtr` references. Failing to call `complete` may
-    ///   result in dangling `ScopePtr` and produce use-after-free.
-    unsafe fn new(worker: &Worker) -> Scope<'scope, 'env> {
-        Scope {
-            count: AtomicU32::new(1),
-            completed: worker.new_latch(),
-            panic: AtomicPtr::new(ptr::null_mut()),
-            _scope: PhantomData,
-            _env: PhantomData,
-        }
-    }
-
-    /// Runs a closure or future sometime before the scope completes. Valid
-    /// inputs to this method are:
+    /// This is like [`Worker::spawn`], but allows the work to borrow local
+    /// data. Valid inputs to this method are:
     ///
     /// * A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type.
     ///
     /// * A `Future<Output = ()>` future, with no return type.
-    ///
-    /// # Panics
-    ///
-    /// If not in a worker, this panics.
-    pub fn spawn<M, S: ScopedSpawn<'scope, M>>(&'scope self, scoped_work: S) {
-        Worker::with_current(|worker| scoped_work.spawn_on(worker.unwrap(), self));
+    pub fn spawn<M, S: SpawnScoped<'scope, M>>(&'scope self, scoped_work: S) {
+        self.thread_pool.get_worker(|worker| {
+            scoped_work.spawn_scoped(self, worker);
+        });
     }
 
     /// Runs a closure or future sometime before the scope completes. Valid
@@ -201,8 +171,93 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     /// * A `Future<Output = ()>` future, with no return type.
     ///
     /// Unlike [`Scope::spawn`], this accepts the current worker as a parameter.
-    pub fn spawn_on<M, S: ScopedSpawn<'scope, M>>(&'scope self, worker: &Worker, scoped_work: S) {
-        scoped_work.spawn_on(worker, self);
+    pub fn spawn_on<M, S: SpawnScoped<'scope, M>>(
+        &'scope self,
+        worker: &Worker,
+        scoped_work: S,
+    ) {
+        scoped_work.spawn_scoped(self, Some(worker));
+    }
+
+    /// Runs an operation across multiple threads.
+    ///
+    /// This is like [`Worker::spawn_broadcast`], but allows the work to borrow
+    /// local data.
+    pub fn spawn_broadcast<F>(&'scope self, f: F)
+    where
+        F: for<'worker> Fn(Broadcast<'worker>) + Send + Sync + 'scope,
+    {
+        // Prevent workers from leaving the pool, and read the membership bitset
+        // once it's frozen.
+        let members = self.thread_pool.freeze_membership();
+        let participants = members.count_ones() as usize;
+
+        // We are going to spawn a job for every participant, and need to keep
+        // the scope alive while that completes. For the sake of efficiency,
+        // we'll increment the counter for all of these jobs in one single
+        // operation.
+        self.count.fetch_add(participants as u32, Ordering::Relaxed); // (*)
+
+        // Create a new job for each member
+        let member_data = self.thread_pool.get_member_data();
+        for (i, member_index) in members.iter_bits().enumerate() {
+            let func = &f;
+            let scope = self;
+            let op = move |worker: &Worker| {
+                // Run the job
+                let result = unwind::halt_unwinding(|| {
+                    func(Broadcast {
+                        worker,
+                        index: i,
+                        participants,
+                    });
+                });
+                // If the operation panics on any thread, write the panic out to
+                // the scope panic slot.
+                if let Err(err) = result {
+                    scope.store_panic(err);
+                };
+                // SAFETY: This corresponds to one of the increments performed in
+                // a batch with the `fetch_add` at the start of this function.
+                // It was incremented `p` times, and this will be called `p`
+                // times, as the workers complete the `p` broadcast jobs.
+                unsafe { scope.remove_reference() };
+            };
+
+            let job = HeapJob::new(op);
+
+            // SAFETY: `HeapJob::into_job_ref` requires:
+            //
+            // * The `JobRef` will not outlive any of the items closed over by
+            //   the `op`.
+            //
+            //   The only non-copy captures are `scope: &'scope Scope` and
+            //   `func: &&F + 'scope`, so we must show that the `JobRef` will
+            //   not outlive `'scope`.
+            //
+            //   This is ensured via the scope's lifetime extension logic: we
+            //   incremented the scope's counter on the line marked with (*),
+            //   and we know the scope will not complete (extending the lifetime
+            //   of `'scope`) until there is a corresponding call to
+            //   `remove_reference()`.
+            //
+            //   All `n` added references are not removed until the op has run
+            //   `n` times on `n` threads, so the `op` cannot outlive the data
+            //   it borrows.
+            //
+            // * If `op` is `!Send` then `JobRef::execute` will only be called
+            //   on this thread.
+            //
+            //   `op` is unconditionally `Send`. Since `F` and `Scope` are
+            //   `Sync`, the enclosed references `&Scope` and `&&F` are `Send`.
+            let job_ref = unsafe { job.into_job_ref() };
+            member_data.broadcasts[member_index].push(job_ref);
+            member_data.semaphores[member_index].signal();
+        }
+
+        // Once we have finished pushing jobs out to workers who we know are not
+        // in the middle of resigning, we can allow resignations again.
+        self.thread_pool.unfreeze_membership();
     }
 
     /// Adds an additional reference to the scope's reference counter.
@@ -211,25 +266,19 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     /// `Scope::remove_reference`, or the scope will block forever on
     /// completion.
     fn add_reference(&self) {
-        let counter = self.count.fetch_add(1, Ordering::Relaxed);
-        tracing::trace!("scope reference counter increased to {}", counter + 1);
+        self.count.fetch_add(1, Ordering::Relaxed);
     }
 
     /// Removes a reference from the scope's reference counter.
     ///
     /// # Safety
     ///
-    /// The caller must ensure that each call to `remove_reference` corresponds
-    /// to exactly one prior call to `add_reference` (or the implicit initial
-    /// count of 1 provided by `Scope::new`, in the case of `Scope::complete`).
-    ///
-    /// If `remove_reference` is called without a matching `add_reference`, the
-    /// scope latch will be set prematurely, potentially allowing the scope to
-    /// be freed while a `ScopePtr` still holds a pointer to it. Uses of the
-    /// `ScopePtr` thereafter may produce use-after-free.
+    /// The caller must be able to point to a corresponding place where the scope
+    /// counter was incremented by one. This could be through a call to
+    /// `add_reference`, a direct `fetch_add` on the underlying counter, or the
+    /// implicit initial increment the scope starts with.
     unsafe fn remove_reference(&self) {
         let counter = self.count.fetch_sub(1, Ordering::Relaxed);
-        tracing::trace!("scope reference counter decreased to {}", counter - 1);
         if counter == 1 {
             // Alerts the owning thread that the scope has completed.
             //
@@ -242,7 +291,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
             // the latch is set, which happens only here, after the count
             // reaches zero. Therefore, the `completed` field of this `Scope`
             // must still be a live latch.
-            unsafe { Latch::set(&self.completed) };
+            unsafe { Latch::set(&self.completed, false) };
         }
     }
 
@@ -271,7 +320,12 @@ impl<'scope, 'env> Scope<'scope, 'env> {
             // `store_panic` handles the synchronization for it's panic data).
             if self
                 .panic
-                .compare_exchange(nil, err_ptr, Ordering::Release, Ordering::Relaxed)
+                .compare_exchange(
+                    nil,
+                    err_ptr,
+                    Ordering::Release,
+                    Ordering::Relaxed,
+                )
                 .is_ok()
             {
                 // Ownership is now transferred into the panic field.
@@ -313,8 +367,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     ///
     /// # Safety
     ///
-    /// This must be called only once. This must be called with a reference to
-    /// the same worker the scope was created with.
+    /// The caller must ensure that this is called at most once.
     unsafe fn complete(&self, worker: &Worker) {
         // SAFETY: This is explicitly allowed, because every scope starts off
         // with a counter of 1. Because this is called only once, the following
@@ -351,7 +404,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// THREAD_POOL.scope(|scope| {
 ///     scope.spawn(|_| { });
-/// //              ^^^^^^^ the trait `ScopedSpawn<'_, _>` is not implemented for closure ...
+/// //              ^^^^^^^ the trait `SpawnScoped<'_, _>` is not implemented for closure ...
 /// });
 /// ```
 /// Try adding a type hint to the closure's parameters, like so:
@@ -364,17 +417,27 @@ impl<'scope, 'env> Scope<'scope, 'env> {
 /// });
 /// ```
 /// Hopefully rustc will fix this type inference failure eventually.
-pub trait ScopedSpawn<'scope, M>: Send + 'scope {
-    /// Spawns the value of self as scoped work on the provided worker.
-    fn spawn_on<'env>(self, worker: &Worker, scope: &'scope Scope<'scope, 'env>);
+pub trait SpawnScoped<'scope, M>: Send + 'scope {
+    /// Similar to [`spawn`][crate::Worker::spawn] but adds the work to a
+    /// [`Scope`]. This work will be polled to completion sometime before the
+    /// scope completes, and may borrow data that outlives the scope.
+    fn spawn_scoped<'env>(
+        self,
+        scope: &'scope Scope<'scope, 'env>,
+        worker: Option<&Worker>,
+    );
 }
 
-impl<'scope, F> ScopedSpawn<'scope, FnOnceMarker> for F
+impl<'scope, F> SpawnScoped<'scope, FnOnceMarker> for F
 where
     F: FnOnce(&Worker) + Send + 'scope,
 {
     #[inline]
-    fn spawn_on<'env>(self, worker: &Worker, scope: &'scope Scope<'scope, 'env>) {
+    fn spawn_scoped<'env>(
+        self,
+        scope: &'scope Scope<'scope, 'env>,
+        worker: Option<&Worker>,
+    ) {
         // Create a job to execute the spawned function in the scope.
         let scope_ptr = ScopePtr::new(scope);
         let job = HeapJob::new(move |worker| {
@@ -393,22 +456,51 @@ where
         // keep the calling stack frame alive until this job completes,
         // effectively extending the lifetime of `'scope` for as long as is
         // necessary.
+
+        // SAFETY: `HeapJob::into_job_ref` requires:
+        //
+        // * The `JobRef` will not outlive any of the items closed over by
+        //   the `op`.
+        //
+        //   The only non-copy captures are `self` and the scope pointer, both
+        //   of which have lifetime `'scope`. So we must show that the `JobRef`
+        //   will not outlive `'scope`.
+        //
+        //   This is ensured via the scope's lifetime extension logic: the scope
+        //   will not complete so long as the `scope_ptr` is held, extending the
+        //   lifetime of `'scope` until after the job executes and `self` is
+        //   dropped.
+        //
+        // * If `op` is `!Send` then `JobRef::execute` will only be called
+        //   on this thread.
+        //
+        //   `op` is unconditionally `Send`.
         let job_ref = unsafe { job.into_job_ref() };
 
         // Send the job to a queue to be executed.
-        worker.fifo_queue.push_new(job_ref);
+        match worker {
+            Some(worker) => worker.fifo_queue.push_new(job_ref),
+            None => scope.thread_pool.push_shared_job(job_ref),
+        }
     }
 }
 
-impl<'scope, Fut> ScopedSpawn<'scope, FutureMarker> for Fut
+impl<'scope, Fut> SpawnScoped<'scope, FutureMarker> for Fut
 where
     Fut: Future<Output = ()> + Send + 'scope,
 {
     #[inline]
-    fn spawn_on<'env, 'worker>(self, worker: &'worker Worker, scope: &'scope Scope<'scope, 'env>) {
-        let poll_job = ScopeFutureJob::new(worker.thread_pool(), scope, self);
+    fn spawn_scoped<'env>(
+        self,
+        scope: &'scope Scope<'scope, 'env>,
+        worker: Option<&Worker>,
+    ) {
+        let poll_job = ScopeFutureJob::new(scope, self);
         let job_ref = poll_job.into_job_ref();
-        worker.fifo_queue.push_new(job_ref);
+        match worker {
+            Some(worker) => worker.fifo_queue.push_new(job_ref),
+            None => scope.thread_pool.push_shared_job(job_ref),
+        }
     }
 }
 
@@ -480,8 +572,6 @@ struct ScopeFutureJob<'scope, 'env, Fut> {
     /// A scope pointer. This allows the job to interact with the scope, and
     /// also keeps the scope alive until the job is dropped.
     scope_ptr: ScopePtr<'scope, 'env>,
-    /// The thread pool this job is attached to.
-    thread_pool: &'static ThreadPool,
     /// The state of the job, which is either READY, WOKEN, or LOCKED.
     state: AtomicU32,
 }
@@ -501,16 +591,11 @@ where
 
     /// Creates a new `ScopedFutureJob` in an `Arc`. The caller is expected to
     /// immediately call `into_job_ref` and queue it on a worker to be polled.
-    fn new(
-        thread_pool: &'static ThreadPool,
-        scope: &Scope<'scope, 'env>,
-        future: Fut,
-    ) -> Arc<Self> {
+    fn new(scope: &Scope<'scope, 'env>, future: Fut) -> Arc<Self> {
         let scope_ptr = ScopePtr::new(scope);
         Arc::new(Self {
             future: UnsafeCell::new(future),
             scope_ptr,
-            thread_pool,
             // The job starts in the WOKEN state because we always queue it
             // after creating it.
             state: AtomicU32::new(WOKEN),
@@ -524,38 +609,46 @@ where
     /// Forgetting this job ref will cause a memory leak.
     fn into_job_ref(self: Arc<Self>) -> JobRef {
         // SAFETY: Pointers created by `Arc::into_raw` are never null.
-        let job_pointer = unsafe { NonNull::new_unchecked(Arc::into_raw(self).cast_mut().cast()) };
+        let job_pointer = unsafe {
+            NonNull::new_unchecked(Arc::into_raw(self).cast_mut().cast())
+        };
 
-        // SAFETY: `JobRef::new_raw` requires that:
+        // SAFETY: `JobRef::new` requires us to show that `execute` will only be
+        // called on the resulting `JobRef` where it is sound to call `poll` on
+        // `job_pointer`.
         //
-        // * `job_pointer` and `Self::poll` be "matched".
+        // `Self::poll` has two preconditions:
         //
-        //   `Self::poll` expects a pointer created by calling `Arc::into_raw`
-        //   on an `Arc<Self>`, which is exactly what `job_pointer` is.
+        // 1. `job_pointer` must have been produced by `Arc::into_raw` on an
+        //    `Arc<Self>`.
         //
-        // * `job_pointer` points to an initialized and aligned value which is
-        //   neither moved nor dropped until it is executed.
+        //    We produced `job_pointer` this way just above.
         //
-        //   The Arc reference count must be least 1. `Arc::into_raw` transfers
-        //   ownership of the strong count from `self` into the `JobRef`, and
-        //   that count is only released in `poll`, after the arc produced by
-        //   `Arc::from_raw` is dropped. The data is therefore guaranteed to
-        //   remain live until `poll` is called.
+        // 2. We must hold ownership of exactly one strong reference count for
+        //    the allocation.
         //
-        // * If `poll` has additional safety requirements, `job_pointer` upholds
-        //   them.
-        //
-        //   In this case, `poll` does not have any additional requirements.
-        unsafe { JobRef::new_raw(job_pointer, Self::poll) }
+        //    We start with an `Arc<Self>`, so we must own a strong reference.
+        //    Calling `Arc::into_raw` transfers the strong count of `self` onto
+        //    `job_pointer` without decrementing it. Therefore, when `execute`
+        //    is called, there will still be a strong reference for it to
+        //    consume.
+        unsafe { JobRef::new(job_pointer, Self::poll) }
     }
 
-    /// This is what happens when the job is executed. It is this function that
-    /// is in charge of actually polling the future, and it is therefore an
-    /// extremely hot and performance sensitive function.
-    fn poll(this: NonNull<()>, worker: &Worker) {
+    /// Polls the future.
+    ///
+    /// # Safety
+    ///
+    /// `this` must be a pointer produced by `Arc::into_raw` on an `Arc<Self>`.
+    ///
+    /// This call takes ownership of exactly one strong reference count for that
+    /// allocation, consuming it via `Arc::from_raw` internally. The caller must
+    /// hold "ownership" of one such strong reference.
+    unsafe fn poll(this: NonNull<()>, worker: &Worker) {
         // While we still have a raw pointer to the job, create a raw task waker
         // using our vtable.
-        let raw_waker = RawWaker::new(this.as_ptr().cast_const(), &Self::VTABLE);
+        let raw_waker =
+            RawWaker::new(this.as_ptr().cast_const(), &Self::VTABLE);
 
         // Create a new waker from the raw waker. This is *non-owning* and
         // functions like a `&Arc<Self>` rather than an `Arc<Self>`. We wrap it
@@ -563,8 +656,8 @@ where
         // through the vtable, which would cause the reference-count to
         // decrement (incorrectly).
         //
-        // SAFETY: The api contract of RawWaker and RawWakerVTable is upheld by
-        // the `Self::VTABLE` const.
+        // SAFETY: The API contract of `RawWaker` and `RawWakerVTable` is upheld
+        // by the `Self::VTABLE` const.
         //
         // * The functions are all thread safe.
         //
@@ -605,30 +698,30 @@ where
 
         // SAFETY: The following line requires that:
         //
-        // 1. No other mutable references to the future exist.
+        // * No other mutable references to the future exist.
         //
-        // 2. The future will not move.
+        //   Access to the future is protected by the `state` field, which acts
+        //   as a mutex. Just above, we executed
         //
-        // Access to the future is protected by the `state` field, which acts
-        // as a mutex. Just above, we executed
+        //       state.swap(LOCKED, Ordering::Acquire)
         //
-        //     state.swap(LOCKED, Ordering::Acquire)
+        //   which transitions us from the `WOKEN` into the `LOCKED` state. Any
+        //   concurrent caller that also tries to execute `poll` will fail this
+        //   swap, and cause an abort. Exclusive access is therefore
+        //   guaranteed.
         //
-        // which transitions us from the `WOKEN` into the `LOCKED` state. Any
-        // concurrent caller that also tries to execute `poll` will fail this
-        // swap, and cause an abort. Exclusive access is therefore guaranteed.
+        //   In the event that `poll` has been called previously, the `Acquire`
+        //   ordering synchronizes with the call to
         //
-        // In the event that `poll` has been called previously, the `Acquire`
-        // ordering synchronizes with the call to
+        //       fence(Ordering::Release)
         //
-        //     state.compare_exchange(LOCKED, READY, Ordering::Release, Ordering::Release)
+        //   later in this function. This ensures that we are not racing with
+        //   another mutable reference to the same value.
         //
-        // later in this function. This ensures that all writes to the future
-        // performed by previous invocations are visible to us before we form
-        // the mutable reference.
+        // * The future will not move.
         //
-        // The future does not move, because it is stored in a field within an
-        // `Arc`, which has a stable heap-allocated address.
+        //   The future does not move, because it is stored in a field within an
+        //   `Arc`, which has a stable heap-allocated address.
         let future = unsafe { Pin::new_unchecked(&mut *this.future.get()) };
 
         // Create a new context from the waker, and poll the future.
@@ -652,7 +745,12 @@ where
                 // ownership of the future.
                 let rescheduled = this
                     .state
-                    .compare_exchange(LOCKED, READY, Ordering::Relaxed, Ordering::Relaxed)
+                    .compare_exchange(
+                        LOCKED,
+                        READY,
+                        Ordering::Relaxed,
+                        Ordering::Relaxed,
+                    )
                     .is_err();
                 // Emit a fence here, which synchronizes with the `Acquire` swap
                 // at the start of this function to ensure that the next thread
@@ -718,11 +816,11 @@ where
 
         if this.state.swap(WOKEN, Ordering::Relaxed) == READY {
             // Convert the waker into a job ref and queue it.
-            let thread_pool = this.thread_pool;
+            let thread_pool = this.scope_ptr.thread_pool();
             let job_ref = this.into_job_ref();
-            thread_pool.with_worker(|worker| match worker {
+            thread_pool.get_worker(|worker| match worker {
                 Some(worker) => worker.fifo_queue.push_new(job_ref),
-                None => thread_pool.queue_shared_job(job_ref),
+                None => thread_pool.push_shared_job(job_ref),
             });
         }
     }
@@ -739,17 +837,17 @@ where
         //
         // SAFETY: This is called on a pointer created by `Arc::into_raw` on an
         // instance of `Arc<Self>`.
-        let this = unsafe { ManuallyDrop::new(Arc::from_raw(this.cast::<Self>())) };
+        let this =
+            unsafe { ManuallyDrop::new(Arc::from_raw(this.cast::<Self>())) };
 
         if this.state.swap(WOKEN, Ordering::Relaxed) == READY {
             // Clone the waker, convert it into a job-ref and queue it.
             let this = ManuallyDrop::into_inner(this.clone());
-            let thread_pool = this.thread_pool;
+            let thread_pool = this.scope_ptr.thread_pool();
             let job_ref = this.into_job_ref();
-
-            thread_pool.with_worker(|worker| match worker {
+            thread_pool.get_worker(|worker| match worker {
                 Some(worker) => worker.fifo_queue.push_new(job_ref),
-                None => thread_pool.queue_shared_job(job_ref),
+                None => thread_pool.push_shared_job(job_ref),
             });
         }
     }
@@ -780,22 +878,31 @@ mod scope_ptr {
     use core::any::Any;
 
     use super::Scope;
+    use crate::ThreadPool;
 
     /// A reference-counted pointer to a scope. Used to capture a scope pointer
     /// in jobs without faking a lifetime. Holding a `ScopePtr` keeps the
     /// reference scope from being deallocated.
     pub struct ScopePtr<'scope, 'env>(*const Scope<'scope, 'env>);
 
-    // SAFETY: This is safe because (a) scope-pointer is only used to call
-    // `add_reference`, `remove_reference`, and `store_panic`, all of which are
-    // designed to be thread-safe; and (b) the `Scope` cannot be deallocated
-    // while any `ScopePtr` still points to it (due to reference counting).
+    // SAFETY: This is sound because:
+    //
+    // * `ScopePtr` is only used to call `add_reference`, `remove_reference`,
+    //   `store_panic`, and `thread_pool`, all of which are designed to be
+    //   called from multiple threads concurrently.
+    //
+    // * The `Scope` cannot be deallocated while any `ScopePtr` still points to
+    //   it (due to reference counting), so the raw pointer is always valid.
     unsafe impl Send for ScopePtr<'_, '_> {}
 
-    // SAFETY: This is safe because (a) scope-pointer is only used to call
-    // `add_reference`, `remove_reference`, and `store_panic`, all of which are
-    // designed to be thread-safe; and (b) the `Scope` cannot be deallocated
-    // while any `ScopePtr` still points to it (due to reference counting).
+    // SAFETY: This is sound because:
+    //
+    // * `ScopePtr` is only used to call `add_reference`, `remove_reference`,
+    //   `store_panic`, and `thread_pool`, all of which are designed to be
+    //   called from multiple threads concurrently.
+    //
+    // * The `Scope` cannot be deallocated while any `ScopePtr` still points to
+    //   it (due to reference counting), so the raw pointer is always valid.
     unsafe impl Sync for ScopePtr<'_, '_> {}
 
     impl<'scope, 'env> ScopePtr<'scope, 'env> {
@@ -817,6 +924,14 @@ mod scope_ptr {
             let scope_ref = unsafe { &*self.0 };
             scope_ref.store_panic(err);
         }
+
+        pub fn thread_pool(&self) -> &'static ThreadPool {
+            // SAFETY: This was created using an immutable scope reference, and
+            // by the scope rules there can be no mutable references to this
+            // scope, nor can the scope have been moved or deallocated while the
+            // scope's counter remains incremented.
+            unsafe { (&*self.0).thread_pool }
+        }
     }
 
     impl Drop for ScopePtr<'_, '_> {
@@ -839,7 +954,7 @@ mod scope_ptr {
 // -----------------------------------------------------------------------------
 // Tests
 
-#[cfg(all(test, not(feature = "shuttle")))]
+#[cfg(test)]
 mod tests {
     use core::iter::once;
     use core::pin::Pin;
@@ -989,7 +1104,10 @@ mod tests {
     impl Future for CountFuture {
         type Output = ();
 
-        fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        fn poll(
+            mut self: Pin<&mut Self>,
+            cx: &mut Context<'_>,
+        ) -> Poll<Self::Output> {
             if self.count == 128 {
                 Poll::Ready(())
             } else {
@@ -1045,7 +1163,7 @@ mod tests {
         let a = AtomicU8::new(0);
         let b = AtomicU8::new(0);
 
-        THREAD_POOL.on_worker(|worker| {
+        THREAD_POOL.with_worker(|worker| {
             scope(|scope| {
                 for _ in 0..NUM_JOBS {
                     scope.spawn_on(worker, |_: &Worker| {
@@ -1094,12 +1212,12 @@ mod tests {
 
         let mut completed = false;
 
-        THREAD_POOL.on_worker(|worker| {
+        THREAD_POOL.with_worker(|worker| {
             worker.scope(|scope| {
                 scope.spawn_on(worker, |_: &Worker| {
                     // Creating a new worker instead of reusing the old one is
                     // bad form, but we may as well test it.
-                    THREAD_POOL.on_worker(|worker| {
+                    THREAD_POOL.with_worker(|worker| {
                         worker.scope(|scope| {
                             scope.spawn_on(worker, |_: &Worker| {
                                 completed = true;
@@ -1123,7 +1241,7 @@ mod tests {
         THREAD_POOL.resize_to_available();
 
         let counter_p = &AtomicUsize::new(0);
-        THREAD_POOL.on_worker(|worker| {
+        THREAD_POOL.with_worker(|worker| {
             worker.scope(|scope| {
                 scope.spawn(move |worker: &Worker| {
                     divide_and_conquer(worker, scope, counter_p, 1024)
@@ -1176,7 +1294,7 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        THREAD_POOL.on_worker(|_| {
+        THREAD_POOL.with_worker(|_| {
             let mut tree = random_tree(10, 1337);
             let values: Vec<_> = tree.iter().cloned().collect();
             tree.update(|v| *v += 1);
@@ -1238,7 +1356,10 @@ mod tests {
         random_tree_inner(depth, &mut rng)
     }
 
-    fn random_tree_inner(depth: usize, rng: &mut XorShift64Star) -> Tree<usize> {
+    fn random_tree_inner(
+        depth: usize,
+        rng: &mut XorShift64Star,
+    ) -> Tree<usize> {
         let children = if depth == 0 {
             vec![]
         } else {
@@ -1264,7 +1385,7 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        THREAD_POOL.on_worker(|_| {
+        THREAD_POOL.with_worker(|_| {
             let mut max_diff = Mutex::new(0);
             let bottom_of_stack = 0;
             scope(|s| the_final_countdown(s, &bottom_of_stack, &max_diff, 5));
@@ -1298,7 +1419,9 @@ mod tests {
         *data = Ord::max(diff, *data);
 
         if n > 0 {
-            scope.spawn(move |_: &Worker| the_final_countdown(scope, bottom_of_stack, max, n - 1));
+            scope.spawn(move |_: &Worker| {
+                the_final_countdown(scope, bottom_of_stack, max, n - 1)
+            });
         }
     }
 
@@ -1321,7 +1444,8 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        THREAD_POOL.scope(|scope| scope.spawn(|_: &Worker| panic!("Hello, world!")));
+        THREAD_POOL
+            .scope(|scope| scope.spawn(|_: &Worker| panic!("Hello, world!")));
 
         THREAD_POOL.depopulate();
     }
@@ -1335,7 +1459,9 @@ mod tests {
 
         THREAD_POOL.scope(|scope| {
             scope.spawn(|_: &Worker| {
-                scope.spawn(|_: &Worker| scope.spawn(|_: &Worker| panic!("Hello, world!")))
+                scope.spawn(|_: &Worker| {
+                    scope.spawn(|_: &Worker| panic!("Hello, world!"))
+                })
             })
         });
 
@@ -1351,7 +1477,9 @@ mod tests {
 
         THREAD_POOL.scope(|scope_1| {
             scope_1.spawn(|worker: &Worker| {
-                worker.scope(|scope_2| scope_2.spawn(|_: &Worker| panic!("Hello, world!")))
+                worker.scope(|scope_2| {
+                    scope_2.spawn(|_: &Worker| panic!("Hello, world!"))
+                })
             })
         });
 
@@ -1479,7 +1607,9 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        fn increment<'slice, 'counter>(counters: &'slice [&'counter AtomicUsize]) {
+        fn increment<'slice, 'counter>(
+            counters: &'slice [&'counter AtomicUsize],
+        ) {
             THREAD_POOL.scope::<'counter>(move |scope| {
                 // We can borrow 'slice here, but the spawns can only borrow 'counter.
                 for &c in counters {
diff --git a/src/thread_pool.rs b/src/thread_pool.rs
index 69ca80b..e50cfdb 100644
--- a/src/thread_pool.rs
+++ b/src/thread_pool.rs
@@ -1,6 +1,5 @@
 //! This module contains the api and worker logic for the Forte thread pool.
 
-use alloc::boxed::Box;
 use alloc::format;
 use alloc::vec::Vec;
 use core::array;
@@ -23,23 +22,22 @@ use crossbeam_utils::CachePadded;
 use st3::StealError;
 use st3::lifo::Stealer;
 use st3::lifo::Worker as Sharer;
-use tracing::debug;
-use tracing::trace;
-use tracing::trace_span;
 
 use crate::FnOnceMarker;
 use crate::FutureMarker;
-use crate::job::ExternalJob;
 use crate::job::HeapJob;
 use crate::job::JobQueue;
 use crate::job::JobRef;
 use crate::job::StackJob;
 use crate::latch::Latch;
-use crate::latch::SleepController;
+use crate::latch::Semaphore;
+use crate::latch::Status;
 use crate::platform::*;
 use crate::scope::Scope;
 use crate::scope::with_scope;
+use crate::time::ticks;
 use crate::unwind;
+use crate::util::IterBits;
 use crate::util::XorShift64Star;
 
 // -----------------------------------------------------------------------------
@@ -69,72 +67,78 @@ use crate::util::XorShift64Star;
 /// pool). All blocking methods (e.g. [`join`] and [`scope`]) work even with
 /// zero managed workers, but they won't run in parallel.
 pub struct ThreadPool {
-    /// A bit-set that tracks which seats are occupied.
-    occupied: CachePadded<AtomicU32>,
-    /// A bit-set that tracks which seats are sleeping.
-    sleeping: CachePadded<AtomicU32>,
-    /// Holds shared data for each thread participating in the pool.
-    seats: OnceLock<Box<Seats>>,
+    /// Shared data for each pool member.
+    member_data: Lazy<MemberData>,
+    /// A queue of jobs injected from outside the pool (see `push_shared_job`).
+    shared_queue: SegQueue<JobRef>,
+    /// Manages how members leave the pool.
+    ///
+    /// * The 26 least significant bits count the number of ongoing broadcasts.
+    ///
+    /// * The 6 most significant bits count the number of ongoing resignations.
+    ///
+    /// Members can only resign when there are no ongoing broadcasts. Broadcasts
+    /// can only begin after all current resignations complete.
+    resignations: CachePadded<AtomicU32>,
+    /// A bitmask that tracks which members are waiting to resign.
+    wants_to_resign: CachePadded<AtomicU32>,
+    /// A bitmask that tracks which member indices are claimed.
+    claimed_bitmask: CachePadded<AtomicU32>,
+    /// A bitmask that tracks which members are waiting on their semaphore
+    /// signal.
+    waiting_bitmask: CachePadded<AtomicU32>,
     /// Holds controls for threads spawned and managed by the pool. Initialized
-    /// on first call to `occupy`, to allow for some non-static constructors.
-    managed_threads: Mutex<ManagedThreads>,
-    /// Used to inject external work into the thread pool. This is generally
-    /// treated as a fallback, for when the thread-pool is at capacity and
-    /// threads can't register themselves as workers.
-    shared_jobs: SegQueue<JobRef>,
+    /// on first call to `activate`, to allow for some non-static constructors.
+    managed_workers: Mutex<Vec<ManagedWorker>>,
 }
 
 /// A public interface that can be temporarily claimed and used by a thread.
 /// Claiming a seat allows a thread to participate in the thread pool as a
 /// worker.
-pub(crate) struct Seats {
+pub struct MemberData {
     /// The sharing side of each seat's work-stealing queue. These should only
     /// ever be accessed by the thread that currently owns the lease for this
     /// seat (to ensure the `!Sync` bound is respected).
     sharers: [Sharer<JobRef>; 32],
     /// The stealing side of each seat's work-stealing queue.
     stealers: [Stealer<JobRef>; 32],
-    /// The sleep/wake controller for each seat.
-    sleep_controllers: [SleepController; 32],
+    /// A set of queues used for transmitting work that must be executed on a
+    /// particular worker. Used for broadcasts and cross-thread nonsend worker
+    /// wakeups.
+    pub broadcasts: [SegQueue<JobRef>; 32],
+    /// A binary semaphore for each seat, used for signaling.
+    pub semaphores: [Semaphore; 32],
 }
 
-// SAFETY: `stealers` are `Send + Sync` by their own bounds. `workers[i]` is
-// only ever accessed by the single thread holding seat `i`'s occupancy lease;
-// the `occupied` bitmask in `ThreadPool` enforces that exclusivity.
-unsafe impl Sync for Seats {}
-
-/// A lease represents ownership of one of a "seats" in a thread pool, and
-/// allows the owning thread to participate in that pool as a worker.
-pub struct Lease {
-    /// The thread pool against which this lease is held.
-    thread_pool: &'static ThreadPool,
-    /// The index of the seat in the data list
-    seat_number: usize,
-    /// A reference to the pre-initialized seat data (to avoid repeated hits of
-    /// the `OnceLock`).
-    seats: &'static Seats,
-}
-
-impl Drop for Lease {
-    fn drop(&mut self) {
-        // Unset the occupied bit for this seat
-        self.thread_pool
-            .occupied
-            .fetch_and(!(1 << self.seat_number), Ordering::Relaxed);
+impl MemberData {
+    fn new() -> MemberData {
+        let sharers: [Sharer<JobRef>; 32] =
+            array::from_fn(|_| Sharer::new(Worker::STEAL_QUEUE_CAPACITY));
+        let stealers: [Stealer<JobRef>; 32] =
+            array::from_fn(|i| sharers[i].stealer());
+        let broadcasts = array::from_fn(|_| SegQueue::new());
+        let semaphores = array::from_fn(|_| Semaphore::new());
+        MemberData {
+            sharers,
+            stealers,
+            broadcasts,
+            semaphores,
+        }
     }
 }
 
-/// Manages threads spawned by the pool.
-struct ManagedThreads {
-    /// Stores thread controls for workers spawned by the pool.
-    workers: Vec<ManagedWorker>,
-}
+// SAFETY: `Sharer` (aka `st3::Worker`) is `!Sync`. We allow it to be stored in
+// this shared structure, but we only allow one thread to access it at a time
+// (via the membership claiming logic). This is effectively like sending
+// `st3::Worker` ownership between threads (although in practice it always
+// occupies the same place on the heap). Luckily for us, it implements `Send`.
+unsafe impl Sync for MemberData {}
 
 /// Represents a worker thread that is managed by the pool, as opposed to
 /// external threads which temporarily participate in the pool.
 struct ManagedWorker {
     /// The index of this worker in the public worker info list.
-    seat_number: usize,
+    member_index: usize,
     /// Controls used to manage the lifecycle of the worker.
     control: ThreadControl,
 }
@@ -148,7 +152,7 @@ struct ThreadControl {
 }
 
 // -----------------------------------------------------------------------------
-// Thread pool creation and maintenance
+// Thread pool creation and utilities
 
 #[allow(clippy::new_without_default)]
 impl ThreadPool {
@@ -156,125 +160,46 @@ impl ThreadPool {
     pub const fn new() -> ThreadPool {
         // Create the pool itself.
         ThreadPool {
-            seats: OnceLock::new(),
-            occupied: CachePadded::new(AtomicU32::new(0)),
-            sleeping: CachePadded::new(AtomicU32::new(0)),
-            managed_threads: Mutex::new(ManagedThreads {
-                workers: Vec::new(),
-            }),
-            shared_jobs: SegQueue::new(),
-        }
-    }
-
-    /// Returns the pre-allocated steal queues, initializing them on the first call.
-    fn get_seats(&'static self) -> &'static Seats {
-        self.seats.get_or_init(|| {
-            let sharers: [Sharer<JobRef>; 32] =
-                array::from_fn(|_| Sharer::new(Worker::STEAL_QUEUE_CAPACITY));
-            let stealers: [Stealer<JobRef>; 32] = array::from_fn(|i| sharers[i].stealer());
-            let sleep_controllers = array::from_fn(|_| SleepController::new());
-            Box::new(Seats {
-                sharers,
-                stealers,
-                sleep_controllers,
-            })
-        })
-    }
-
-    /// Adds a job ref to the shared queue.
-    pub fn queue_shared_job(&'static self, job_ref: JobRef) {
-        self.shared_jobs.push(job_ref);
-    }
-
-    /// Claims a lease on the thread pool which can be occupied by a worker
-    /// (using [`Worker::occupy`]), allowing a thread to participate in the pool.
-    ///
-    /// Returns `None` if all seats are occupied.
-    #[cold]
-    pub fn claim_lease(&'static self) -> Option<Lease> {
-        loop {
-            let occupied = self.occupied.load(Ordering::Relaxed);
-            if occupied == u32::MAX {
-                return None;
-            }
-            let seat_number = occupied.trailing_ones() as usize;
-            let mask = 1 << seat_number;
-            if self.occupied.fetch_or(mask, Ordering::Relaxed) & mask == 0 {
-                // At this point we have acquired the lease on the seat
-                return Some(Lease {
-                    thread_pool: self,
-                    seat_number,
-                    seats: self.get_seats(),
-                });
-            }
-        }
-    }
-
-    /// Claims up to `n` leases at once in a single atomic transaction.
-    ///
-    /// Finds up to `n` free seats, then atomically claims all of them with a
-    /// single `compare_exchange`. Either every selected seat is claimed together
-    /// or none are (and the loop retries). Returns between 0 and `n` leases;
-    /// returns an empty `Vec` when `n` is 0 or the pool is full.
-    #[cold]
-    pub fn claim_leases(&'static self, n: usize) -> Vec<Lease> {
-        if n == 0 {
-            return Vec::new();
-        }
-        let seats = self.get_seats();
-        loop {
-            let occupied = self.occupied.load(Ordering::Relaxed);
-            if occupied == u32::MAX {
-                return Vec::new();
-            }
-
-            // Build a mask of up to `n` free seats by walking the complement.
-            let mut claimed_seats = 0;
-            let mut free_seats = !occupied;
-            for _ in 0..n {
-                if free_seats == 0 {
-                    break;
-                }
-                let seat_bit = free_seats & free_seats.wrapping_neg(); // isolate lowest set bit
-                claimed_seats |= seat_bit;
-                free_seats &= !seat_bit;
-            }
-
-            // Attempt to claim all selected seats in one atomic step.
-            match self.occupied.compare_exchange(
-                occupied,
-                occupied | claimed_seats,
-                Ordering::Relaxed,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => {
-                    return (0..32)
-                        .filter(|&i| claimed_seats & (1 << i) != 0)
-                        .map(|seat_number| Lease {
-                            thread_pool: self,
-                            seat_number: seat_number as usize,
-                            seats,
-                        })
-                        .collect();
-                }
-                Err(_) => {
-                    // Another thread modified `occupied`; retry.
-                }
-            }
+            member_data: Lazy::new(MemberData::new),
+            shared_queue: SegQueue::new(),
+            resignations: CachePadded::new(AtomicU32::new(0)),
+            claimed_bitmask: CachePadded::new(AtomicU32::new(0)),
+            waiting_bitmask: CachePadded::new(AtomicU32::new(0)),
+            wants_to_resign: CachePadded::new(AtomicU32::new(0)),
+            managed_workers: Mutex::new(Vec::new()),
         }
     }
 
     /// Returns an opaque identifier for this thread pool.
     #[inline(always)]
-    pub fn id(&self) -> usize {
+    pub fn id(&'static self) -> usize {
         // We can rely on `self` not to change since it's a static ref.
         ptr::from_ref(self) as usize
     }
 
-    /// Returns the number of workers participating in this thread pool.
+    /// Returns the number of members participating in the pool.
     #[inline(always)]
-    pub fn num_workers(&self) -> usize {
-        self.occupied.load(Ordering::Relaxed).count_ones() as usize
+    pub fn num_members(&'static self) -> usize {
+        self.claimed_bitmask.load(Ordering::Relaxed).count_ones() as usize
+    }
+
+    /// Adds a job to the thread-pool's shared queue.
+    ///
+    /// This allows adding work from outside the pool (e.g., without a
+    /// reference to a `Worker`).
+    ///
+    /// Note: Workers only take work from this queue as a last resort, after all
+    /// their other work has been exhausted.
+    #[inline(always)]
+    pub fn push_shared_job(&'static self, job_ref: JobRef) {
+        self.shared_queue.push(job_ref);
+        // Try to wake up a worker to execute this job. This is relatively cheap
+        // if no workers are waiting.
+        let waiting_bitmask = self.waiting_bitmask.load(Ordering::Relaxed);
+        if waiting_bitmask != 0 {
+            let i = waiting_bitmask.trailing_zeros() as usize;
+            self.get_member_data().semaphores[i].signal();
+        }
     }
 }
 
@@ -293,7 +218,8 @@ impl ThreadPool {
     ///
     /// See [`ThreadPool::resize`] for more information about resizing.
     pub fn resize_to_available(&'static self) -> usize {
-        let mut available = available_parallelism().map(NonZero::get).unwrap_or(1);
+        let mut available =
+            available_parallelism().map(NonZero::get).unwrap_or(1);
         available = available.saturating_sub(1).max(1);
         self.resize_to(available)
     }
@@ -321,7 +247,9 @@ impl ThreadPool {
     ///
     /// See [`ThreadPool::resize`] for more information about resizing.
     pub fn shrink(&'static self, terminated_threads: usize) -> usize {
-        self.resize(|current_size| current_size.saturating_sub(terminated_threads))
+        self.resize(|current_size| {
+            current_size.saturating_sub(terminated_threads)
+        })
     }
 
     /// Ensures that there is at least one worker thread attached to the thread
@@ -354,86 +282,76 @@ impl ThreadPool {
     where
         F: Fn(usize) -> usize,
     {
-        debug!("starting thread pool resize");
-
         // Resizing a pool is a critical section; only one thread can resize the
         // pool at a time. This is implemented using a mutex on the thread manager.
-        trace!("locking state");
-        let mut managed_threads = self.managed_threads.lock().unwrap();
+        let mut managed_workers = self.managed_workers.lock().unwrap();
 
         // Compute the new size of the pool, given the current size.
-        let current_size = managed_threads.workers.len();
+        let current_size = managed_workers.len();
 
         // Calculate the new size of the pool (counting only managed workers).
         let new_size = get_size(current_size);
 
-        trace!(
-            "attempting to resize thread pool from {} to {} thread(s)",
-            current_size, new_size
-        );
         match new_size.cmp(&current_size) {
             // The size remained the same
-            cmp::Ordering::Equal => {
-                debug!("completed thread pool resize, size unchanged");
-                return current_size;
-            }
+            cmp::Ordering::Equal => current_size,
             // The size increased
             cmp::Ordering::Greater => {
                 // Spawn the new workers.
-                let leases = self.claim_leases(new_size - current_size);
-                for lease in leases {
-                    let seat_number = lease.seat_number;
-                    debug!("spawning managed worker for seat number {}", seat_number);
+                let memberships = self.try_enroll_many(new_size - current_size);
+                for membership in memberships {
+                    let member_index = membership.member_index;
                     let halt = Arc::new(AtomicBool::new(false));
                     let worker_halt = halt.clone();
                     let handle = ThreadBuilder::new()
-                        .name(format!("worker {seat_number}"))
+                        .name(format!("managed worker {member_index}"))
                         .spawn(move || {
-                            managed_worker(lease, worker_halt);
+                            managed_worker(membership, worker_halt);
                         })
                         .unwrap();
                     let control = ThreadControl { halt, handle };
-                    managed_threads.workers.push(ManagedWorker {
-                        seat_number,
+                    managed_workers.push(ManagedWorker {
+                        member_index,
                         control,
                     });
                 }
 
-                drop(managed_threads);
+                managed_workers.len()
             }
             // The size decreased
             cmp::Ordering::Less => {
                 // Pull the workers we intend to halt out of the thread manager.
-                let terminating_workers = managed_threads.workers.split_off(new_size);
+                let terminating_workers = managed_workers.split_off(new_size);
 
                 // Terminate and wake the workers.
-                let seats = self.get_seats();
+                let member_data = self.get_member_data();
                 for worker in &terminating_workers {
                     // Tell the worker to halt.
                     worker.control.halt.store(true, Ordering::Relaxed);
-                    // Wake the worker up.
-                    seats.sleep_controllers[worker.seat_number].wake();
+                    // Signal the worker.
+                    member_data.semaphores[worker.member_index].signal();
                 }
 
-                // Drop the lock on the state so as not to block the workers or heartbeat.
-                drop(managed_threads);
+                // Drop the lock on the state so as not to block the workers or
+                // heartbeat.
+                drop(managed_workers);
 
                 // Determine our seat index.
-                let own_seat_number = Worker::map_current(|worker| worker.lease.seat_number);
+                let own_member_index =
+                    Worker::map_current(|worker| worker.member_index);
 
                 // Wait for the other workers to fully halt.
                 for worker in terminating_workers {
                     // It's possible we may be trying to terminate ourselves, in
                     // which case we can skip the thread-join.
-                    if Some(worker.seat_number) != own_seat_number {
+                    if Some(worker.member_index) != own_member_index {
                         let _ = worker.control.handle.join();
                     }
                 }
+
+                new_size
             }
         }
-
-        // Return the new size of the thread pool
-        new_size
     }
 }
 
@@ -441,114 +359,55 @@ impl ThreadPool {
 // Thread pool worker access
 
 impl ThreadPool {
-    /// Runs the closure on a thread-pool worker.
-    ///
-    /// If this thread is not a worker, it will try to register itself as one.
-    /// If the thread pool is full, the closure is sent to another worker as a
-    /// job, and this thread is parked.
-    ///
-    /// If your closure is `!Send`, use [`with_worker`][ThreadPool::with_worker]
-    /// instead.
+    /// Returns this thread's worker if it is a member of this thread pool.
     #[inline(always)]
-    pub fn on_worker<F, R>(&'static self, f: F) -> R
+    pub fn get_worker<F, R>(&'static self, func: F) -> R
     where
-        F: FnOnce(&Worker) -> R + Send,
-        R: Send,
+        F: FnOnce(Option<&Worker>) -> R,
     {
-        self.with_worker(|worker| match worker {
-            Some(worker) => f(worker),
-            None => {
-                let mut job = ExternalJob::new(f);
-                // SAFETY: `ExternalJob::as_job_ref` requires:
-                //
-                // * The `ExternalJob` must not move or be deallocated until the
-                //   `JobRef` is executed.
-                //
-                // * The `JobRef` does not outlive any data the `ExternalJob` closes over.
-                //
-                // * `as_job_ref` is not called again while `JobRef` lives.
-                //
-                // The `ExternalJob` is a stack-allocated variable. After
-                // calling `as_job_ref`, we never move `job`, and we wait for
-                // the job to execute by calling `job.wait_for_value`. Only
-                // after that returns do we allow the `job` to be dropped. This
-                // also means that any data closed over by the `ExternalJob`
-                // must outlive the `JobRef`.
-                //
-                // Also, `as_job_ref` is plainly called only once.
-                let job_ref = unsafe { job.as_job_ref() };
-                self.queue_shared_job(job_ref);
-                // SAFETY: `wait_for_value` must be called at most once. This is
-                // the only call site for this particular `job`, which is a
-                // stack-local variable.
-                let result = unsafe { job.wait_for_value() };
-                match result {
-                    Ok(value) => value,
-                    Err(error) => unwind::resume_unwinding(error),
-                }
+        Worker::with_current(|worker| match worker {
+            Some(worker) if worker.thread_pool.id() == self.id() => {
+                func(Some(worker))
             }
+            _ => func(None),
         })
     }
 
-    /// Runs the closure on a thread-pool worker.
-    ///
-    /// If this thread is not a worker, it will try to register itself as one.
-    /// If the thread pool is full, this panics.
+    /// Returns this thread's worker. If not already a member of this thread
+    /// pool, this thread will try to enroll itself as a member.
     ///
-    /// If you don't want to panic, use [`on_worker`][ThreadPool::on_worker] or
-    /// [`with_worker`][ThreadPool::with_worker] instead.
+    /// Note: If the thread pool is full (it already has 32 active members) this
+    /// waits for a vacancy before returning.
     #[inline(always)]
-    #[track_caller]
-    pub fn expect_worker<F, R>(&'static self, f: F) -> R
+    pub fn with_worker<F, R>(&'static self, func: F) -> R
     where
         F: FnOnce(&Worker) -> R,
     {
-        self.with_worker(|worker| match worker {
-            Some(worker) => f(worker),
-            None => panic!("thread pool full; not able to access worker"),
-        })
-    }
-
-    /// Runs the closure on a thread-pool worker.
-    ///
-    /// If this thread is currently acting as a worker for the thread-pool, this
-    /// just looks that worker up. If this thread is not registered as a worker,
-    /// or if the thread's worker is registered with different thread pool, the
-    /// thread will try to register itself with the correct pool. If the thread
-    /// pool is full, it passes the closure `None`.
-    ///
-    /// The provided closure is never sent to another thread. If your closure is
-    /// `Send`, consider using [`on_worker`][ThreadPool::on_worker] instead.
-    #[inline(always)]
-    pub fn with_worker<F, R>(&'static self, f: F) -> R
-    where
-        F: FnOnce(Option<&Worker>) -> R,
-    {
-        Worker::with_current(|worker| match worker {
-            Some(worker) if worker.lease.thread_pool.id() == self.id() => f(Some(worker)),
-            _ => self.with_worker_cold(f),
+        self.get_worker(|worker| match worker {
+            Some(worker) => func(worker),
+            None => self.with_worker_cold(func),
         })
     }
 
-    /// Tries to register the calling thread on the thread pool, and pass a
-    /// worker instance to the provided closure.
+    /// The cold branch of `with_worker`.
     ///
-    /// This is the slow fallback for `with_worker` covering "external calls"
-    /// from outside the pool. Never call this directly.
+    /// Requests membership in the thread pool, and then activates that
+    /// membership to get a new local worker handle. If the thread pool is full
+    /// (there are already 32 members) this blocks.
     #[cold]
-    fn with_worker_cold<F, R>(&'static self, f: F) -> R
+    fn with_worker_cold<F, R>(&'static self, func: F) -> R
     where
-        F: FnOnce(Option<&Worker>) -> R,
+        F: FnOnce(&Worker) -> R,
     {
-        match self.claim_lease() {
-            Some(lease) => Worker::occupy(lease, |worker| f(Some(worker))),
-            None => f(None),
-        }
+        let membership = self.enroll();
+        membership.activate(func)
     }
 }
 
 // -----------------------------------------------------------------------------
-// Generalized spawn trait
+// Spawn Trait
+
+pub use async_task::Task;
 
 /// A trait for types that can be spawned onto a [`ThreadPool`].
 ///
@@ -585,7 +444,11 @@ pub trait Spawn<M>: Send + 'static {
     type Output: Send + 'static;
 
     /// Spawns work onto the thread pool.
-    fn spawn(self, thread_pool: &'static ThreadPool, worker: Option<&Worker>) -> Self::Output;
+    fn spawn(
+        self,
+        thread_pool: &'static ThreadPool,
+        worker: Option<&Worker>,
+    ) -> Self::Output;
 }
 
 impl<F> Spawn<FnOnceMarker> for F
@@ -601,60 +464,45 @@ where
 
         // Turn the job into an "owning" `JobRef` so it can be queued.
         //
-        // SAFETY: All jobs added to the queue are guaranteed to be executed
-        // eventually, this is one of the core invariants of the thread pool.
-        // The closure `f` has a static lifetime, meaning it only closes over
-        // data that lasts for the duration of the program, so it's not possible
-        // for this job to outlive the data `f` closes over.
+        // SAFETY: `HeapJob::into_job_ref` has two preconditions:
+        //
+        // * The `JobRef` must not outlive any of the items closed over by the
+        //   function `f`.
+        //
+        //   Since `F: 'static`, the `JobRef` cannot outlive its captured data.
+        //
+        // * If `F: !Send` then the `JobRef` must only be executed on this
+        //   thread.
+        //
+        //   `F` is `Send`, so this does not apply.
         let job_ref = unsafe { job.into_job_ref() };
 
         // Queue the job for evaluation
-        if let Some(worker) = worker {
-            worker.fifo_queue.push_new(job_ref);
-        } else {
-            // Push the work into the share queue and wake a worker
-            thread_pool.shared_jobs.push(job_ref);
+        match worker {
+            Some(worker) => worker.fifo_queue.push_new(job_ref),
+            None => thread_pool.push_shared_job(job_ref),
         }
     }
 }
 
-/// An alias for [`async_task::Task`] that includes a reference to the pool on
-/// which the future is executing.
-pub type Task<T> = async_task::Task<T, &'static ThreadPool>;
-
-/// Schedules a runnable future as a job.
+/// Executes a raw pointer to a runnable.
 ///
-/// Async-task prefers that this is a static function, rather than a closure,
-/// which is why this is a separate function that pulls the thread pool from the
-/// runnable metadata.
-fn schedule_runnable(runnable: Runnable<&'static ThreadPool>) {
-    // Get a ref to the thread pool from the runnable.
-    let thread_pool = *runnable.metadata();
-
-    // Temporarily turn the task into a raw pointer so that it can be
-    // used as a job. We could also use `HeapJob` here, but since
-    // `Runnable` is heap allocated this would result in a needless
-    // second allocation.
-    let job_pointer = runnable.into_raw();
-
-    // SAFETY: The raw runnable pointer will remain valid until it is
-    // used by `execute_runnable`, after which it will be dropped.
-    let job_ref = unsafe { JobRef::new_raw(job_pointer, execute_runnable) };
-
-    // Send this job off to be executed.
-    thread_pool.with_worker(|worker| match worker {
-        Some(worker) => worker.fifo_queue.push_new(job_ref),
-        None => thread_pool.shared_jobs.push(job_ref),
-    });
-}
-
-/// Executes a raw pointer to a runnable future.
+/// # Safety
+///
+/// The caller must ensure:
+///
+/// * `this` was produced by `Runnable::into_raw` and must not have been
+///   consumed by a call to `from_raw`.
+///
+/// * If the `Runnable` was created for a `!Send` future, this must only be
+///   called on the thread where the `Runnable` was created.
 #[inline(always)]
-fn execute_runnable(this: NonNull<()>, _worker: &Worker) {
-    // SAFETY: This pointer was created by `Runnable::into_raw` in
-    // `schedule_runnable` with type parameter `&'static ThreadPool`, and
-    // `from_raw` is called at most once.
-    let runnable = unsafe { Runnable::<&'static ThreadPool>::from_raw(this) };
+unsafe fn execute_runnable(this: NonNull<()>, _worker: &Worker) {
+    // SAFETY: This pointer was created by `Runnable::into_raw` in the schedule
+    // closure. Jobs are executed exactly once, so `from_raw` is called at most
+    // once on this pointer (the next call to `schedule` will call `into_raw`
+    // again to get a "new" raw pointer to the same runnable).
+    let runnable = unsafe { Runnable::<()>::from_raw(this) };
     // Poll the task. This will drop the future if the task is
     // canceled or the future completes.
     runnable.run();
@@ -668,11 +516,48 @@ where
     type Output = Task<T>;
 
     #[inline]
-    fn spawn(self, thread_pool: &'static ThreadPool, _worker: Option<&Worker>) -> Task<T> {
+    fn spawn(
+        self,
+        thread_pool: &'static ThreadPool,
+        _worker: Option<&Worker>,
+    ) -> Task<T> {
+        // Creates a schedule function that captures a reference to the
+        // thread-pool.
+        let schedule = |runnable: Runnable| {
+            // Temporarily turn the task into a raw pointer so that it can be
+            // used as a job. We could also use `HeapJob` here, but since
+            // `Runnable` is heap allocated this would result in a needless
+            // second allocation.
+            let job_pointer = runnable.into_raw();
+
+            // SAFETY: `JobRef::new` requires us to show that `execute` will
+            // only be called on the returned `JobRef` when it is sound to call
+            // `execute_runnable` on `job_pointer`. The two preconditions to
+            // this are:
+            //
+            // * `job_pointer` must come from `Runnable::into_raw`, and must be
+            //   consumed at most once per `into_raw`.
+            //
+            //   We produced `job_pointer` this way just above. The call to
+            //   `execute` will consume the `JobRef`, so `execute_runnable` will
+            //   be called at most once.
+            //
+            // * If the `Runnable` was created for a `!Send` future, the
+            //   `JobRef` must only be executed on the thread where the
+            //   `Runnable` was created.
+            //
+            //   The future is required to be `Send`, so this does not apply.
+            let job_ref = unsafe { JobRef::new(job_pointer, execute_runnable) };
+
+            // Send this job off to be executed.
+            thread_pool.get_worker(|worker| match worker {
+                Some(worker) => worker.fifo_queue.push_new(job_ref),
+                None => thread_pool.push_shared_job(job_ref),
+            });
+        };
+
         // Create a runnable and add the thread pool as metadata.
-        let (runnable, task) = async_task::Builder::new()
-            .metadata(thread_pool)
-            .spawn(|_| self, schedule_runnable);
+        let (runnable, task) = async_task::spawn(self, schedule);
 
         // Call the schedule function, pushing a `JobRef` for the future onto
         // the local work queue. If the future doesn't complete, it can be
@@ -685,13 +570,150 @@ where
         // thread/task that woke it.
         //
         // This is potentially more efficient than `Runnable::schedule`.
-        schedule_runnable(runnable);
+        schedule(runnable);
 
         // Return the task.
         task
     }
 }
 
+// -----------------------------------------------------------------------------
+// Local Spawn Trait
+
+/// A version of the [`Spawn`] trait without the `Send` bound.
+///
+/// It is implemented for:
+///
+/// * Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + 'static`.
+///
+/// * Futures that satisfy `Future<Output = T> + 'static` where `T: 'static`.
+pub trait SpawnLocal<M>: 'static {
+    /// The handle returned when spawning this type.
+    type Output: 'static;
+
+    /// Spawns work that will run in the background on the current worker
+    /// thread.
+    fn spawn_local(self, worker: &Worker) -> Self::Output;
+}
+
+impl<F> SpawnLocal<FnOnceMarker> for F
+where
+    F: for<'worker> FnOnce(&'worker Worker) + 'static,
+{
+    type Output = ();
+
+    #[inline]
+    fn spawn_local(self, worker: &Worker) {
+        // Allocate a new job on the heap to store the closure.
+        let job = HeapJob::new(self);
+
+        // Turn the job into an "owning" `JobRef` so it can be queued.
+        //
+        // SAFETY: `HeapJob::into_job_ref` has two preconditions:
+        //
+        // * The `JobRef` must not outlive any of the items closed over by the
+        //   function `f`.
+        //
+        //   Since `F: 'static`, the `JobRef` cannot outlive its captured data.
+        //
+        // * If `F: !Send` then the `JobRef` must only be executed on this
+        //   thread.
+        //
+        //   This `JobRef` is added to the `nonsend_fifo_queue` for this thread.
+        //   No other thread ever pulls from this queue, and work is never
+        //   shared from it, so it cannot be executed on any other thread.
+        let job_ref = unsafe { job.into_job_ref() };
+
+        // Push into the non-send queue, which can only be accessed from this
+        // thread.
+        worker.nonsend_fifo_queue.push(job_ref);
+    }
+}
+
+impl<Fut, T> SpawnLocal<FutureMarker> for Fut
+where
+    Fut: Future<Output = T> + 'static,
+    T: 'static,
+{
+    type Output = Task<T>;
+
+    #[inline]
+    fn spawn_local(self, worker: &Worker) -> Task<T> {
+        // Create a schedule function that will keep a copy of the local fifo
+        // queue arc and be able to wake the local worker up.
+        let queue = worker.nonsend_fifo_queue.clone();
+        let member_index = worker.member_index;
+        let member_data = worker.member_data;
+        let schedule = move |runnable: Runnable| {
+            // Temporarily turn the task into a raw pointer so that it can be
+            // used as a job. We could also use `HeapJob` here, but since
+            // `Runnable` is heap allocated this would result in a needless
+            // second allocation.
+            let job_pointer = runnable.into_raw();
+
+            // SAFETY: `JobRef::new` requires us to show that `execute` will
+            // only be called on the returned `JobRef` when it is sound to call
+            // `execute_runnable` on `job_pointer`. The two preconditions to
+            // this are:
+            //
+            // * `job_pointer` must come from `Runnable::into_raw`, and must be
+            //   consumed at most once per `into_raw`.
+            //
+            //   We produced `job_pointer` this way just above. The call to
+            //   `execute` will consume the `JobRef`, so `execute_runnable` will
+            //   be called at most once.
+            //
+            // * If the `Runnable` was created for a `!Send` future, the
+            //   `JobRef` must only be executed on the thread where the
+            //   `Runnable` was created.
+            //
+            //   This `JobRef` is added to the `nonsend_fifo_queue` for this
+            //   thread. No other thread ever pulls from this queue, and work is
+            //   never shared from it, so it cannot be executed on any other
+            //   thread.
+            let job_ref = unsafe { JobRef::new(job_pointer, execute_runnable) };
+
+            // Send this job to the correct thread to be executed.
+            queue.push(job_ref);
+
+            // Ensure that the worker is awake to execute this job.
+            member_data.semaphores[member_index].signal();
+        };
+
+        // Create a runnable and a task handle for the local future.
+        let (runnable, task) = async_task::spawn_local(self, schedule);
+
+        // Call the schedule function, pushing a `JobRef` for the future onto
+        // the local work queue. If the future doesn't complete, it can be
+        // woken and scheduled at a later point.
+        //
+        // Because the schedule function captures this worker's non-send
+        // queue, a woken future is always pushed back onto the queue of the
+        // worker that spawned it, regardless of which thread wakes it. The
+        // semaphore signal in the schedule function then ensures that worker
+        // is awake to run the job.
+        runnable.schedule();
+
+        // Return the task.
+        task
+    }
+}
+
+// -----------------------------------------------------------------------------
+// Broadcasts
+
+/// Context object for [`broadcast`](Worker::broadcast) operations.
+pub struct Broadcast<'w> {
+    /// The worker this part of the broadcast is running on. Its `index` (below)
+    /// will be in `[0, participants)`.
+    pub worker: &'w Worker,
+    /// The index of this worker within the broadcast. The return value will be
+    /// stored at this index within the results vector.
+    pub index: usize,
+    /// The number of threads participating in the broadcast.
+    pub participants: usize,
+}
+
 // -----------------------------------------------------------------------------
 // Thread pool operations
 
@@ -700,77 +722,435 @@ impl ThreadPool {
     ///
     /// See also: [`Worker::spawn`] and [`spawn`].
     #[inline(always)]
-    pub fn spawn<M, S: Spawn<M>>(&'static self, work: S) -> S::Output {
-        work.spawn(self, None)
+    pub fn spawn<M, S: Spawn<M>>(&'static self, work: S) -> S::Output {
+        self.get_worker(|worker| work.spawn(self, worker))
+    }
+
+    /// Blocks the thread waiting for a future to complete.
+    ///
+    /// See also: [`Worker::block_on`] and [`block_on`].
+    #[inline(always)]
+    pub fn block_on<F, T>(&'static self, future: F) -> T
+    where
+        F: Future<Output = T> + Send,
+        T: Send,
+    {
+        self.get_worker(|worker| match worker {
+            Some(worker) => worker.block_on(future),
+            None => futures_lite::future::block_on(future),
+        })
+    }
+
+    /// Executes the two closures, possibly in parallel.
+    ///
+    /// See also: [`Worker::join`] and [`join`].
+    #[inline(always)]
+    pub fn join<A, B, RA, RB>(&'static self, a: A, b: B) -> (RA, RB)
+    where
+        A: FnOnce(&Worker) -> RA + Send,
+        B: FnOnce(&Worker) -> RB + Send,
+        RA: Send,
+        RB: Send,
+    {
+        self.with_worker(|worker| worker.join(a, b))
+    }
+
+    /// Creates a scope onto which non-static work can be spawned.
+    ///
+    /// See also: [`Worker::scope`] and [`scope`].
+    #[inline(always)]
+    pub fn scope<'env, F, T>(&'static self, f: F) -> T
+    where
+        F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,
+    {
+        self.with_worker(|worker| worker.scope(f))
+    }
+
+    /// Runs the same closure on several threads, and returns a vector of
+    /// results.
+    ///
+    /// See also: [`Worker::broadcast`] and [`broadcast`]. If you don't care
+    /// about getting results back, you may want to use
+    /// [`ThreadPool::spawn_broadcast`] instead.
+    #[inline(always)]
+    pub fn broadcast<F, T>(&'static self, f: F) -> Vec<T>
+    where
+        F: for<'w> Fn(Broadcast<'w>) -> T + Sync,
+        T: Send,
+    {
+        self.with_worker(|worker| worker.broadcast(f))
+    }
+
+    /// Runs the same closure on several threads, without waiting for them to
+    /// complete.
+    ///
+    /// See also: [`Worker::spawn_broadcast`] and [`spawn_broadcast`]. If you
+    /// care about getting results back, you may want to use
+    /// [`ThreadPool::broadcast`] instead.
+    #[inline(always)]
+    pub fn spawn_broadcast<F>(&'static self, f: F)
+    where
+        F: for<'w> Fn(Broadcast<'w>) + Send + Sync + 'static,
+    {
+        self.with_worker(|worker| worker.spawn_broadcast(f));
+    }
+}
+
+// -----------------------------------------------------------------------------
+// Worker registration
+
+/// Represents membership in a thread-pool.
+///
+/// Provided by [`ThreadPool::enroll`].
+pub struct Membership {
+    /// The thread pool the worker is registered with.
+    thread_pool: &'static ThreadPool,
+    /// Contains the index of a row in the `MemberData` table, if the worker
+    /// has been granted membership on the thread-pool.
+    member_index: usize,
+    /// A reference to the `MemberData` table.
+    member_data: &'static MemberData,
+}
+
+impl ThreadPool {
+    /// Returns member data, initializing it on the first call.
+    pub fn get_member_data(&'static self) -> &'static MemberData {
+        self.member_data.get()
+    }
+
+    /// Waits for membership in the thread-pool.
+    ///
+    /// If the thread-pool is full (it has 32 members) this blocks.
+    pub fn enroll(&'static self) -> Membership {
+        loop {
+            match self.try_enroll() {
+                // If we receive a membership, break out of the loop
+                Some(membership) => return membership,
+                // If the thread-pool is full, wait for a membership to become
+                // free
+                None => atomic_wait::wait(&self.claimed_bitmask, u32::MAX),
+            }
+        }
+    }
+
+    /// Requests membership in a thread-pool.
+    ///
+    /// If the thread-pool is full (it has 32 members) this returns `None`.
+    #[cold]
+    pub fn try_enroll(&'static self) -> Option<Membership> {
+        loop {
+            let available_bitmask =
+                !self.claimed_bitmask.load(Ordering::Relaxed);
+            if available_bitmask == 0 {
+                return None;
+            }
+            let enrolled_index = available_bitmask.trailing_zeros() as usize; // TZCNT
+            let enrolled_bitmask = 1 << enrolled_index;
+            if self
+                .claimed_bitmask
+                .fetch_or(enrolled_bitmask, Ordering::Relaxed)
+                & enrolled_bitmask
+                == 0
+            {
+                return Some(Membership {
+                    thread_pool: self,
+                    member_index: enrolled_index,
+                    member_data: self.get_member_data(),
+                });
+            }
+        }
+    }
+
+    /// Creates multiple memberships for the thread pool.
+    pub fn try_enroll_many(&'static self, n: usize) -> Vec<Membership> {
+        if n == 0 {
+            return Vec::new();
+        }
+        let member_data = self.get_member_data();
+        loop {
+            let claimed_bitmask = self.claimed_bitmask.load(Ordering::Relaxed);
+            if claimed_bitmask == u32::MAX {
+                return Vec::new();
+            }
+
+            // Build a mask of up to `n` free seats by walking the complement.
+            let mut enrolled_bitmask = 0;
+            let mut available_bitmask = !claimed_bitmask;
+            for _ in 0..n {
+                if available_bitmask == 0 {
+                    break;
+                }
+                // Isolate the lowest available bit and add it to the enrollment
+                // bits
+                enrolled_bitmask |=
+                    available_bitmask & available_bitmask.wrapping_neg();
+                // Remove that bit from the available bits
+                available_bitmask &= available_bitmask - 1;
+            }
+
+            // Attempt to claim all selected seats in one atomic step.
+            if self
+                .claimed_bitmask
+                .compare_exchange(
+                    claimed_bitmask,
+                    claimed_bitmask | enrolled_bitmask,
+                    Ordering::Relaxed,
+                    Ordering::Relaxed,
+                )
+                .is_ok()
+            {
+                return (0..32)
+                    .filter(|&i| enrolled_bitmask & (1 << i) != 0)
+                    .map(|seat_number| Membership {
+                        thread_pool: self,
+                        member_index: seat_number as usize,
+                        member_data,
+                    })
+                    .collect();
+            }
+        }
+    }
+
+    /// Blocks workers from resigning from the pool.
+    ///
+    /// Each call to this function must be paired with exactly one following
+    /// call to `unfreeze_membership`.
+    ///
+    /// Returns a bitset indicating which memberships are claimed. These members
+    /// are guaranteed to remain in the pool at least until the corresponding
+    /// call to `unfreeze_membership`.
+    pub fn freeze_membership(&'static self) -> u32 {
+        // Increment the freeze counter, which will cause new resignation
+        // requests to be rejected.
+        //
+        // Note, this will break if we exceed 67,108,863 simultaneous
+        // broadcasts, because we will overflow into the bits for
+        // ongoing-resignations. I expect us to always run out of memory before
+        // that happens, so this case is not handled.
+        let mut resignations =
+            self.resignations.fetch_add(1, Ordering::Relaxed);
+
+        // Wait for any ongoing resignations to complete.
+        while (resignations >> 26) != 0 {
+            atomic_wait::wait(&self.resignations, resignations);
+            resignations = self.resignations.load(Ordering::Relaxed);
+        }
+
+        // Synchronizes with the `Release` store done by workers when they
+        // complete their resignation, ensuring that the following load of
+        // `claimed_bitmask` will properly reflect any recent resignations.
+        fence(Ordering::Acquire);
+
+        // Return the frozen membership bitmask
+        self.claimed_bitmask.load(Ordering::Relaxed)
+    }
+
+    /// Unblocks workers from resigning from the pool.
+    ///
+    /// Each call to this function must be paired with exactly one preceding
+    /// call to `freeze_membership`.
+    pub fn unfreeze_membership(&'static self) {
+        // Decrement the freeze counter, to allow new resignations to be
+        // accepted.
+        let resignations = self.resignations.fetch_sub(1, Ordering::Acquire);
+
+        // If this was the last active freeze, wake up any threads that want to
+        // resign.
+        if resignations == 1 {
+            let wants_to_resign = self.wants_to_resign.load(Ordering::Relaxed);
+            for member_index in wants_to_resign.iter_bits() {
+                self.get_member_data().semaphores[member_index].signal();
+            }
+        }
+    }
+}
+
+thread_local! {
+    static WORKER_PTR: Cell<*const Worker> = const { Cell::new(ptr::null()) };
+}
+
+const REJECTION_MASK: u32 = (1u32 << 26) - 1;
+
+impl Membership {
+    /// Returns this worker's index within its thread pool.
+    ///
+    /// The index is stable for the lifetime of the membership and is unique
+    /// among concurrent members of the same pool.
+    #[inline(always)]
+    pub fn member_index(&self) -> usize {
+        self.member_index
+    }
+
+    /// Temporarily sets the thread's worker. [`Worker::with_current`] always
+    /// returns a reference to the worker set up by the most recent call to
+    /// `activate`.
+    ///
+    /// Rust's thread locals are fairly costly, so this function is expensive.
+    /// If you can avoid calling it, do so.
+    #[inline(always)]
+    pub fn activate<F, R>(self, f: F) -> R
+    where
+        F: FnOnce(&Worker) -> R,
+    {
+        let worker = Worker {
+            migrated: Cell::new(false),
+            membership: self,
+            fifo_queue: JobQueue::new(),
+            lifo_queue: JobQueue::new(),
+            nonsend_fifo_queue: Arc::new(SegQueue::new()),
+            rng: XorShift64Star::new(),
+            last_promote_tick: Cell::new(0),
+            _phantom: PhantomData,
+        };
+
+        // Swap the local pointer to point to the newly allocated worker.
+        let outer_ptr = WORKER_PTR.with(|ptr| ptr.replace(&worker));
+
+        // Run the function within the context created by the worker pointer,
+        // and pass in a worker reference directly.
+        let result = f(&worker);
+
+        // Indicate that we want to resign.
+        worker
+            .thread_pool
+            .wants_to_resign
+            .fetch_or(1 << worker.member_index, Ordering::Relaxed);
+
+        // Wait for all local work to complete, and our resignation to be
+        // accepted.
+        loop {
+            if worker.yield_local() == Yield::Idle {
+                // Attempt to submit our resignation
+                let mut resignations =
+                    worker.thread_pool.resignations.load(Ordering::Relaxed);
+                if resignations & REJECTION_MASK == 0 {
+                    match worker.thread_pool.resignations.compare_exchange(
+                        resignations,
+                        resignations + (1 << 26),
+                        Ordering::Relaxed,
+                        Ordering::Relaxed,
+                    ) {
+                        // Resignation accepted
+                        Ok(_) => break,
+                        // Resignation rejected due to ongoing broadcast
+                        Err(err) if err & REJECTION_MASK != 0 => worker.wait(),
+                        // Resignation conflicted with other worker, try again
+                        Err(err) => resignations = err,
+                    }
+                }
+            }
+        }
+
+        // Indicate we are no longer waiting to resign.
+        worker
+            .thread_pool
+            .wants_to_resign
+            .fetch_and(!(1 << worker.member_index), Ordering::Relaxed);
+
+        // Drop the worker, which will also free the claimed membership.
+        let thread_pool = worker.thread_pool;
+        drop(worker);
+
+        // Complete the resignation
+        loop {
+            let resignations = thread_pool.resignations.load(Ordering::Relaxed);
+            // Try to decrement the resignations count. This uses `Release`
+            // ordering so that the `claimed_bitmask` store done as part of
+            // `drop(worker)` is visible to any thread waiting for this
+            // resignation to complete.
+            if thread_pool
+                .resignations
+                .compare_exchange_weak(
+                    resignations,
+                    resignations - (1 << 26),
+                    Ordering::Release,
+                    Ordering::Relaxed,
+                )
+                .is_ok()
+            {
+                // If this was the last ongoing resignation, wake up any waiting
+                // broadcasts.
+                if (resignations >> 26) - 1 == 0 {
+                    atomic_wait::wake_all(&*thread_pool.resignations);
+                }
+                // Exit the CAS loop
+                break;
+            }
+        }
+
+        // Swap back to pointing to the previous value (possibly null).
+        WORKER_PTR.with(|ptr| ptr.set(outer_ptr));
+
+        // Return the value produced by the closure. Local work was already
+        // drained by the resignation loop above, so only the result remains.
+        result
+    }
+
+    /// Returns a reference to the push-side `Sharer` queue for this
+    /// worker's seat.
+    #[inline(always)]
+    fn sharing_queue(&self) -> &'static Sharer<JobRef> {
+        &self.member_data.sharers[self.member_index]
     }
 
-    /// Blocks the thread waiting for a future to complete.
-    ///
-    /// See also: [`Worker::block_on`] and [`block_on`].
+    /// Returns a reference to a worker's local inbox (where !Send future
+    /// wakeups and broadcasts are transmitted).
     #[inline(always)]
-    pub fn block_on<F, T>(&'static self, future: F) -> T
-    where
-        F: Future<Output = T> + Send,
-        T: Send,
-    {
-        self.on_worker(|worker| worker.block_on(future))
+    fn broadcast_queue(&self) -> &'static SegQueue<JobRef> {
+        &self.member_data.broadcasts[self.member_index]
     }
 
-    /// Executes the two closures, possibly in parallel.
-    ///
-    /// See also: [`Worker::join`] and [`join`].
+    /// Returns a reference to the semaphore for this worker's member slot
+    /// (the one `wait` blocks on).
     #[inline(always)]
-    pub fn join<A, B, RA, RB>(&'static self, a: A, b: B) -> (RA, RB)
-    where
-        A: FnOnce(&Worker) -> RA + Send,
-        B: FnOnce(&Worker) -> RB + Send,
-        RA: Send,
-        RB: Send,
-    {
-        self.on_worker(|worker| worker.join(a, b))
+    fn semaphore(&self) -> &'static Semaphore {
+        &self.member_data.semaphores[self.member_index]
     }
 
-    /// Creates a scope onto which non-static work can be spawned.
-    ///
-    /// For more complete docs, see [`scope`]. If you have a reference to a
-    /// worker, you should call [`Worker::scope`] instead.
-    #[inline(always)]
-    pub fn scope<'env, F, T>(&'static self, f: F) -> T
-    where
-        F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T + Send,
-        T: Send,
-    {
-        self.on_worker(|worker| worker.scope(f))
+    /// Waits for a signal on this member's semaphore.
+    fn wait(&self) {
+        let semaphore = self.semaphore();
+        semaphore
+            .wait(1 << self.member_index, &self.thread_pool.waiting_bitmask);
+    }
+}
+
+impl Drop for Membership {
+    fn drop(&mut self) {
+        // Release the claim on this membership.
+        self.thread_pool
+            .claimed_bitmask
+            .fetch_and(!(1 << self.member_index), Ordering::Relaxed);
+        // In case another thread is waiting for a membership slot to free
+        // up, issue a wake on the bitmask.
+        atomic_wait::wake_one(&*self.thread_pool.claimed_bitmask);
     }
 }
 
 // -----------------------------------------------------------------------------
 // Worker context
 
-thread_local! {
-    static WORKER_PTR: Cell<*const Worker> = const { Cell::new(ptr::null()) };
-}
-
 /// Represents membership in a thread pool.
 ///
-/// To get access to worker for a given thread pool, users should call
-/// [`ThreadPool::with_worker`], [`ThreadPool::on_worker`], [`ThreadPool::expect_worker`]
+/// To get access to a worker for a given thread pool, users should call
+/// [`ThreadPool::with_worker`].
 ///
 /// Every thread has at most one worker at a time. If a worker has already been
 /// set up, it may be accessed at any time by calling [`Worker::with_current`].
-/// A thread's worker can also be manually overridden by claiming a lease
-/// ([`ThreadPool::claim_lease`]) and passing it to [`Worker::occupy`]. The
-/// worker returned by `with_current` always represents the lease most recently
-/// occupied in the call stack.
+/// A thread's worker can also be manually set up by claiming a membership
+/// ([`ThreadPool::try_enroll`]) and passing it to [`Membership::activate`]. The
+/// worker returned by `with_current` always represents the membership most
+/// recently activated in the call stack.
 ///
-/// Every worker belongs to exactly one thread pool, and must hold a "lease" on
-/// one of the shared slots within that pool.
+/// Every worker belongs to exactly one thread pool, and must hold a membership
+/// in one of the shared slots within that pool.
 ///
 /// Workers have one core memory-safety guarantee: Any jobs added to the worker
 /// will eventually be executed.
 pub struct Worker {
-    migrated: Cell<bool>,
-    lease: Lease,
+    /// Registers the worker as belonging to a specific thread pool, and
+    /// potentially also grants "membership" on that thread-pool.
+    membership: Membership,
     /// A sequence of jobs waiting to be executed. Newer jobs are executed
     /// before older ones, allowing efficient depth-first execution. During
     /// promotion, the oldest job is shared. Populated by `join()`.
@@ -784,13 +1164,40 @@ pub struct Worker {
     ///
     /// Jobs in this queue are executed only when the lifo queue is empty.
     pub(crate) fifo_queue: JobQueue,
+    /// A sequence of `!Send` jobs waiting to be executed. Older jobs are
+    /// executed before newer ones.
+    ///
+    /// This queue does not participate in promotion. This is a `SegQueue` so
+    /// that a `Future` that is `!Send` and has been spawned onto this thread
+    /// can be woken on another thread (the other thread then sends this thread
+    /// a job that polls the future).
+    nonsend_fifo_queue: Arc<SegQueue<JobRef>>,
+    /// A local pseudorandom number generator. Used to spread out
+    /// worker-to-worker operations evenly across the pool.
     rng: XorShift64Star,
+    /// The CPU tick when work was last promoted from local to shared. This has
+    /// no absolute relation to time.
     last_promote_tick: Cell<u64>,
-    // Make non-send.
+    /// Set to true when executing a job that came from a different thread.
+    migrated: Cell<bool>,
+    // Make non-send. A `Worker` represents the local state of a particular
+    // thread, so must be `!Send` and `!Sync`. It is already `!Sync` because of
+    // `Cell`.
     _phantom: PhantomData<*const ()>,
 }
 
-/// Describes the outcome of a call to [`Worker::yield_now`] or [`Worker::yield_local`].
+use core::ops::Deref;
+
+impl Deref for Worker {
+    type Target = Membership;
+
+    fn deref(&self) -> &Membership {
+        &self.membership
+    }
+}
+
+/// Describes the outcome of a call to [`Worker::yield_now`] or
+/// [`Worker::yield_local`].
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum Yield {
     /// Indicates that a job was executed.
@@ -802,64 +1209,6 @@ pub enum Yield {
 }
 
 impl Worker {
-    /// Temporarily sets the thread's worker. [`Worker::with_current`] always
-    /// returns a reference to the worker set up by the most recent call to
-    /// `occupy`.
-    ///
-    /// Rust's thread locals are fairly costly, so this function is expensive.
-    /// If you can avoid calling it, do so.
-    #[inline(always)]
-    pub fn occupy<F, R>(lease: Lease, f: F) -> R
-    where
-        F: FnOnce(&Worker) -> R,
-    {
-        trace!("occupying lease");
-
-        let span = trace_span!("occupy", seat_number = lease.seat_number);
-        let _enter = span.enter();
-
-        // Create a new worker to occupy the lease. Note: It's potentially a
-        // problem that the same thread can occupy multiple workers on the same
-        // thread. We may eventually need to design something to prevent this.
-        let worker = Worker {
-            migrated: Cell::new(false),
-            lease,
-            fifo_queue: JobQueue::new(),
-            lifo_queue: JobQueue::new(),
-            rng: XorShift64Star::new(),
-            last_promote_tick: Cell::new(0),
-            _phantom: PhantomData,
-        };
-
-        // Swap the local pointer to point to the newly allocated worker.
-        let outer_ptr = WORKER_PTR.with(|ptr| ptr.replace(&worker));
-
-        // Run the function within the context created by the worker pointer,
-        // and pass in a worker reference directly.
-        let result = f(&worker);
-
-        // Finish executing local work before shutting down.
-        while let Some(job_ref) = worker.find_local_work() {
-            worker.execute(job_ref, false);
-        }
-
-        // Swap back to pointing to the previous value (possibly null).
-        WORKER_PTR.with(|ptr| ptr.set(outer_ptr));
-
-        trace!("vacating lease");
-
-        // Return the intermediate values created while running the closure,
-        // namely the result and any jobs still remaining on the local queue.
-        result
-    }
-
-    /// Returns a reference to the push-side `Sharer` queue for this
-    /// worker's seat.
-    #[inline(always)]
-    fn sharer(&self) -> &Sharer<JobRef> {
-        &self.lease.seats.sharers[self.lease.seat_number]
-    }
-
     /// Calls the provided closure on the thread's worker instance, if it has
     /// one. If this thread is not registered as a worker, the closure is not
     /// called.
@@ -871,18 +1220,18 @@ impl Worker {
         let worker_ptr = WORKER_PTR.with(Cell::get);
         if !worker_ptr.is_null() {
             // SAFETY: `WORKER_PTR` is a thread-local `Cell` holding a raw
-            // pointer to a `Worker`. It is only written to by `Worker::occupy`,
-            // which stores the address of a `Worker` allocated within it's own
-            // stack frame. Before it returns, `occupy` restores the previous
-            // value of `WORKER_PTR`, so that it is always either null or points
-            // to a live, immovable `Worker` on the current thread's call stack
-            // (but is never left dangling).
+            // pointer to a `Worker`. It is only written to by
+            // `Membership::activate`, which stores the address of a `Worker`
+            // allocated within its own stack frame. Before it returns,
+            // `activate` restores the previous value of `WORKER_PTR`, so that
+            // it is always either null or points to a live, immovable `Worker`
+            // on the current thread's call stack (but is never left dangling).
             //
             // If the pointer is non-null, it is therefore valid to dereference
             // as a shared reference. Forming a `'static` reference is avoided
             // by passing the value into a closure, which bounds the reference's
             // lifetime to the closure body and prevents callers from retaining
-            // it past the point where `occupy` returns and the `Worker` is
+            // it past the point where `activate` returns and the `Worker` is
             // freed.
             Some(f(unsafe { &*worker_ptr }))
         } else {
@@ -900,35 +1249,30 @@ impl Worker {
     {
         let worker_ptr = WORKER_PTR.with(Cell::get);
         if !worker_ptr.is_null() {
-            // SAFETY: The `WORKER` static is only set by `occupy`, and it's
-            // always set to a stack-allocated `Worker` which is never moved and
-            // is only accessed through shared references. Therefore, if the
-            // pointer is non-null, it must be safe to dereference.
+            // SAFETY: `WORKER_PTR` is a thread-local `Cell` holding a raw
+            // pointer to a `Worker`. It is only written to by
+            // `Membership::activate`, which stores the address of a `Worker`
+            // allocated within its own stack frame. Before it returns,
+            // `activate` restores the previous value of `WORKER_PTR`, so that
+            // it is always either null or points to a live, immovable `Worker`
+            // on the current thread's call stack (but is never left dangling).
             //
-            // This creates a reference with an unbounded lifetime. To avoid
-            // turning it into a `'static`, we pass it in to a closure. This
-            // restricts its lifetime to the closure body, and prevents callers
-            // from keeping around references to Workers that will be
-            // deallocated when `occupy` returns.
+            // If the pointer is non-null, it is therefore sound to dereference
+            // as a shared reference. Forming a `'static` reference is avoided
+            // by passing the value into a closure, which bounds the reference's
+            // lifetime to the closure body and prevents callers from retaining
+            // it past the point where `activate` returns and the `Worker` is
+            // freed.
             f(Some(unsafe { &*worker_ptr }))
         } else {
             f(None)
         }
     }
 
-    /// Returns this worker's seat index within the pool (0–31).
-    ///
-    /// Seat numbers may be re-used by different workers at different times, and
-    /// may not be contiguous or ordered.
-    #[inline(always)]
-    pub fn seat_number(&self) -> usize {
-        self.lease.seat_number
-    }
-
     /// Returns the thread pool this worker belongs to.
     #[inline(always)]
     pub fn thread_pool(&self) -> &'static ThreadPool {
-        self.lease.thread_pool
+        self.thread_pool
     }
 
     /// Capacity of the per-worker work-stealing queue. This is the maximum
@@ -945,8 +1289,10 @@ impl Worker {
         // Promotions are fairly costly, so we limit their frequency using the
         // cpu's instruction counter. Promote is called at a high frequency, and
         // actually doing the promotion is probably a cold path.
-        let current_tick = hotclock::Instant::now().as_raw();
-        if current_tick.wrapping_sub(self.last_promote_tick.get()) >= Self::PROMOTE_TICK_INTERVAL {
+        let current_tick = ticks();
+        if current_tick.wrapping_sub(self.last_promote_tick.get())
+            >= Self::PROMOTE_TICK_INTERVAL
+        {
             // This should ideally become a conditional jump.
             self.promote_cold(current_tick);
         }
@@ -959,8 +1305,9 @@ impl Worker {
         self.last_promote_tick.set(current_tick);
 
         // Early out if it seems like all workers are already awake.
-        let sleeping = self.lease.thread_pool.sleeping.load(Ordering::Relaxed);
-        if sleeping == 0 {
+        let waiting_bitmask =
+            self.thread_pool().waiting_bitmask.load(Ordering::Relaxed);
+        if waiting_bitmask == 0 {
             return;
         }
         cold_path();
@@ -972,7 +1319,7 @@ impl Worker {
         // (and therefore theoretically "large") tasks shared first.
         if let Some(job_ref) = self.lifo_queue.pop_oldest() {
             // Push into our own steal queue so siblings can steal it.
-            if let Err(job_ref) = self.sharer().push(job_ref) {
+            if let Err(job_ref) = self.sharing_queue().push(job_ref) {
                 // If the queue is full, that indicates that the pool is
                 // probably under high-load and we should continue local-first
                 // operation.
@@ -999,7 +1346,7 @@ impl Worker {
             // value, and so trivially outlives the newly created `JobRef`.
             let batch_job_ref = unsafe { batch_job.into_job_ref() };
             // Push the batch job into the steal queue so siblings can steal it.
-            if let Err(job_ref) = self.sharer().push(batch_job_ref) {
+            if let Err(job_ref) = self.sharing_queue().push(batch_job_ref) {
                 // If the queue is full, that indicates that the pool is
                 // probably under high-load and we should continue local-first
                 // operation.
@@ -1014,20 +1361,21 @@ impl Worker {
         // If we added work to the steal queue, wake a random sibling to steal
         // it from us, while we do other work.
         if shared_job {
-            self.wake_random(sleeping);
+            self.signal_random(waiting_bitmask);
         }
     }
 
     /// Tries to wake a random sleeping worker. Expects to be given a bitset of
     /// sleeping workers.
     #[inline(always)]
-    fn wake_random(&self, sleeping: u32) {
+    fn signal_random(&self, sleeping: u32) {
         let offset = self.rng.next_usize(32) as u32;
         let mut randomized_sleeping = sleeping.rotate_right(offset);
         while randomized_sleeping != 0 {
             let index = (randomized_sleeping.trailing_zeros() + offset) % 32;
             randomized_sleeping &= randomized_sleeping - 1; // Clear the lowest bit
-            let woken = self.lease.seats.sleep_controllers[index as usize].wake();
+            let woken =
+                self.membership.member_data.semaphores[index as usize].signal();
             if woken {
                 return;
             }
@@ -1037,11 +1385,7 @@ impl Worker {
     /// Create a new latch owned by the worker.
     #[inline(always)]
     pub fn new_latch(&self) -> Latch {
-        Latch::new(
-            self.lease.seat_number,
-            &self.lease.thread_pool.sleeping,
-            &self.lease.seats.sleep_controllers[self.lease.seat_number],
-        )
+        Latch::new(self.semaphore())
     }
 
     /// Runs jobs until the provided latch is set.
@@ -1049,33 +1393,60 @@ impl Worker {
     /// The thread may go to sleep if it runs out of work to do, but will wake
     /// when the latch is set or more work becomes available.
     #[inline(always)]
-    pub fn wait_for(&self, latch: &Latch) {
-        while !latch.check() {
-            if self.yield_now() == Yield::Idle {
-                latch.wait();
+    pub fn wait_for(&self, latch: &Latch) -> bool {
+        loop {
+            match latch.check() {
+                Status::Pending => {
+                    if self.yield_now() == Yield::Idle {
+                        let member_bitmask = 1 << self.membership.member_index;
+                        let waiting_bitmask =
+                            &self.membership.thread_pool.waiting_bitmask;
+                        latch.wait(member_bitmask, waiting_bitmask);
+                    }
+                }
+                Status::Ok => return false,
+                Status::Error => return true,
             }
         }
     }
 
-    /// Finds a job to work on. This function is entirely local, and does no
-    /// synchronization with the queue.
+    /// Finds a job to work on. This function is almost entirely local, but does
+    /// a small amount of synchronization to allow for !Send futures that must
+    /// be polled on this thread but are woken on a different thread.
+    ///
+    /// Work is prioritized as follows:
+    /// 1. Pull from the LIFO queue (`join` calls)
+    /// 2. Pull from the !Send FIFO queue (`spawn_local` calls)
+    /// 3. Pull from the regular FIFO queue (`spawn` calls)
     #[inline(always)]
     fn find_local_work(&self) -> Option<JobRef> {
-        self.lifo_queue
-            .pop_newest()
+        (self.lifo_queue.pop_newest())
+            .or_else(|| self.nonsend_fifo_queue.pop())
             .or_else(|| self.fifo_queue.pop_oldest())
     }
 
-    /// Finds a job to work on. This tries
-    /// [`find_local_work`][Worker::find_local_work] first, then falls back to
-    /// pulling shared work from the thread pool.
+    /// Finds a job to work on.
+    ///
+    /// Work is prioritized as follows:
+    /// 1. Pull from the LIFO queue (`join` calls)
+    /// 2. Pull from the !Send FIFO queue (`spawn_local` calls)
+    /// 3. Pull from the regular FIFO queue (`spawn` calls)
+    /// 4. Pull from the broadcast queue (`broadcast` calls)
+    /// 5. Reclaim work shared by this worker
+    /// 6. Steal work shared from other workers
+    /// 7. Read from the global queue (external calls)
+    ///
+    /// If work is found in the last two cases, it is treated as having been
+    /// "migrated" to this thread.
     #[inline(always)]
     fn find_work(&self) -> Option<(JobRef, bool)> {
-        self.find_local_work()
-            .map(|job| (job, false))
-            .or_else(|| self.sharer().pop().map(|job| (job, false)))
+        (self.find_local_work().map(|job| (job, false)))
+            .or_else(|| self.broadcast_queue().pop().map(|job| (job, false)))
+            .or_else(|| self.sharing_queue().pop().map(|job| (job, false)))
             .or_else(|| self.steal_from_siblings().map(|job| (job, true)))
-            .or_else(|| self.claim_shared_job().map(|job| (job, true)))
+            .or_else(|| {
+                self.thread_pool.shared_queue.pop().map(|job| (job, true))
+            })
     }
 
     /// Attempts to steal a job from another worker's work-stealing queue.
@@ -1084,13 +1455,16 @@ impl Worker {
     /// the same victim. Because stealers are pre-allocated and permanent, no
     /// lock or atomic load is needed to access them.
     fn steal_from_siblings(&self) -> Option<JobRef> {
-        let stealers = &self.lease.seats.stealers;
-        let occupied = self.lease.thread_pool.occupied.load(Ordering::Relaxed);
-        let my_seat = self.lease.seat_number as u32;
+        let my_member_index = self.membership.member_index;
+        let my_sharer = self.sharing_queue();
+        let stealers = &self.membership.member_data.stealers;
+        let claimed_bitmask =
+            self.thread_pool().claimed_bitmask.load(Ordering::Relaxed);
 
-        // Randomise the starting position so all workers get a fair shot as victims.
+        // Randomize the starting position so all workers get a fair shot as victims.
         let offset = self.rng.next_usize(32) as u32;
-        let mut bits = (occupied & !(1u32 << my_seat)).rotate_right(offset);
+        let mut bits =
+            (claimed_bitmask & !(1u32 << my_member_index)).rotate_right(offset);
 
         while bits != 0 {
             let shifted_idx = bits.trailing_zeros();
@@ -1101,7 +1475,7 @@ impl Worker {
             // `steal_and_pop` returns one job directly and moves up to half the
             // remaining items into our steal queue for later use.
             loop {
-                match stealer.steal_and_pop(self.sharer(), |n| n / 2) {
+                match stealer.steal_and_pop(my_sharer, |n| n / 2) {
                     Ok((job, _)) => return Some(job),
                     Err(StealError::Busy) => {} // transient; retry
                     Err(StealError::Empty) => break,
@@ -1111,12 +1485,6 @@ impl Worker {
         None
     }
 
-    /// Claims a job from the global injector queue.
-    #[inline(always)]
-    fn claim_shared_job(&self) -> Option<JobRef> {
-        self.lease.thread_pool.shared_jobs.pop()
-    }
-
     /// Cooperatively yields execution to the thread pool, allowing it to execute
     /// some work.
     ///
@@ -1189,12 +1557,13 @@ impl Worker {
 // Worker operations
 
 impl Worker {
-    /// Spawns work (a closure or future) onto the thread pool. Just like a
-    /// standard thread, this work executes concurrently (and potentially in
-    /// parallel) to the place where it is spawned. It is not tied to the
-    /// current stack frame, and hence it cannot hold any references other than
-    /// those with `'static` lifetime. If you want to spawn a task that
-    /// references stack data, use the [`scope`], [`ThreadPool::scope`] or
+    /// Runs work (a closure or future) in the background.
+    ///
+    /// Just like a standard thread, this work executes concurrently (and
+    /// potentially in parallel) to the place where it is spawned. It is not
+    /// tied to the current stack frame, and hence it cannot hold any references
+    /// other than those with `'static` lifetime. If you want to spawn a task
+    /// that references stack data, use the [`scope`], [`ThreadPool::scope`] or
     /// [`Worker::scope`] functions.
     ///
     /// Since tasks spawned with this function cannot hold references into the
@@ -1204,17 +1573,46 @@ impl Worker {
     ///
     /// If you do not have access to a [`Worker`], you may call
     /// [`ThreadPool::spawn`] or simply [`spawn`].
+    ///
+    /// # Panics
+    ///
+    /// The panic behavior depends on the type of work being spawned:
+    ///
+    /// * If a closure panics, it will be caught and ignored.
+    ///
+    /// * If a future panics, the [`Task`] will panic when awaited.
+    ///
     #[inline]
     pub fn spawn<M, S: Spawn<M>>(&self, work: S) -> S::Output {
-        work.spawn(self.lease.thread_pool, Some(self))
+        work.spawn(self.thread_pool, Some(self))
+    }
+
+    /// Runs work (a closure or future) in the background of this thread.
+    ///
+    /// This is quite similar to [`spawn`](Worker::spawn), except that the work
+    /// may be `!Send` and will only run on the current thread. If your work is
+    /// `Send`, consider using [`spawn`](Worker::spawn) instead.
+    ///
+    /// # Panics
+    ///
+    /// The panic behavior depends on the type of work being spawned:
+    ///
+    /// * If a closure panics, it will be caught and ignored.
+    ///
+    /// * If a future panics, the [`Task`] will panic when awaited.
+    ///
+    #[inline]
+    pub fn spawn_local<M, S: SpawnLocal<M>>(&self, work: S) -> S::Output {
+        work.spawn_local(self)
     }
 
-    /// Polls a future to completion, then returns the outcome. This function
-    /// will prioritize polling the future as soon as it becomes available, and
-    /// while the future is not available it will try to do other meaningful
-    /// work from the thread-pool. If the thread pool runs out of work, the
-    /// thread is suspended until the future completes or more background work
-    /// becomes available.
+    /// Polls a future to completion, then returns the outcome.
+    ///
+    /// This function will prioritize polling the future as soon as it becomes
+    /// available, and while the future is not available it will try to do other
+    /// meaningful work from the thread-pool. If the thread pool runs out of
+    /// work, the thread is suspended until the future completes or more
+    /// background work becomes available.
     ///
     /// # Async & Concurrency
     ///
@@ -1242,7 +1640,7 @@ impl Worker {
     ///
     /// # Panics
     ///
-    /// If the future panics, this immediately panics.
+    /// If the future panics, this panics.
     #[inline(always)]
     pub fn block_on<F, T>(&self, future: F) -> T
     where
@@ -1300,7 +1698,7 @@ impl Worker {
     /// # THREAD_POOL.resize_to_available();
     ///
     /// let mut v = vec![5, 1, 8, 22, 0, 44];
-    /// THREAD_POOL.on_worker(|worker| quick_sort(worker, &mut v));
+    /// THREAD_POOL.with_worker(|worker| quick_sort(worker, &mut v));
     /// assert_eq!(v, vec![0, 1, 5, 8, 22, 44]);
     ///
     /// fn quick_sort<T: PartialOrd + Send>(worker: &Worker, v: &mut [T]) {
@@ -1339,7 +1737,7 @@ impl Worker {
     /// # THREAD_POOL.resize_to_available();
     ///
     /// let tree = gen_tree(8);
-    /// let result = THREAD_POOL.on_worker(|worker| sum(worker, &tree));
+    /// let result = THREAD_POOL.with_worker(|worker| sum(worker, &tree));
     /// assert_eq!(result, 255);
     ///
     /// struct Node {
@@ -1407,15 +1805,34 @@ impl Worker {
         // Allocate a job to run the closure `a` on the stack. It is vital to
         // the correctness of this function that this stack-job never move until
         // it is freed.
-        let mut stack_job = StackJob::new(a, self);
-
-        // SAFETY: The `StackJob` is allocated on the stack just above, is never
-        // moved, and so will live for the entirety of this function in the same
-        // memory location. If closure `a` closes over data, that must be valid
-        // for the lifetime of this function as well. The `JobRef` cannot
-        // outlive either, because it is guaranteed to be executed before the
-        // function returns. We also clearly never create more than one `JobRef`
-        // using the `stack_job`.
+        let stack_job = StackJob::new(a, self.new_latch());
+
+        // SAFETY: We are only allowed to create a `JobRef` to this `StackJob`
+        // if we can show that...
+        //
+        // * `as_job_ref` is called at most once for this `StackJob`.
+        //
+        //   The `StackJob` is only accessible in this function (it is
+        //   created here, dropped here, and no direct references escape
+        //   this scope), and within this function we only call `as_job_ref`
+        //   once.
+        //
+        // * The `StackJob` will not be moved or dropped until either:
+        //
+        //   A. A call to `check` on the enclosed `Latch` returns something
+        //      other than `Pending`.
+        //
+        //   B. The `JobRef` is dropped without `execute` being called.
+        //
+        //   If `recover_newest` returns `true`, then the `JobRef` must have
+        //   been dropped without `execute` being called (satisfying B).
+        //
+        //   If `recover_newest` returns `false`, then we call `wait_for`, which
+        //   will not allow the function to progress until `check` returns
+        //   something other than `Pending` (satisfying A).
+        //
+        //   In either case, we cannot move or drop the `StackJob` until we pass
+        //   the branch marked with "(*)". We clearly do not.
         let job_ref = unsafe { stack_job.as_job_ref() };
 
         // Store the id of the `JobRef` for later, when we will need it to
@@ -1438,23 +1855,35 @@ impl Worker {
         // Attempt to recover the job from the queue. It should still be there
         // if we didn't share it.
         if self.lifo_queue.recover_newest(job_ref_id) {
-            // SAFETY: Because the ids match, the JobRef we just popped from
-            // the queue must point to `stack_job`, implying that
-            // `stack_job` cannot have been executed yet.
-            let a = unsafe { stack_job.unwrap() };
+            // (*)
+            // SAFETY: Because the ids match, the JobRef we just popped from the
+            // queue must point to `stack_job`, implying that `stack_job` cannot
+            // have been executed yet, and `JobRef::execute` will never be
+            // called.
+            let a = unsafe { stack_job.unwrap_func() };
             // Execute the closure directly and return the results. This is
             // allows the compiler to inline and optimize `a`.
             result_a = unwind::halt_unwinding(|| a(self));
         } else {
             // Wait for the job to complete.
-            self.wait_for(stack_job.completion_latch());
-            // SAFETY: The job must be complete, because we just waited on the latch.
-            result_a = unsafe { stack_job.return_value() };
+            if self.wait_for(stack_job.completion_latch()) {
+                // SAFETY: Since `wait_for` returned `true`, a `check` must have
+                // returned `Error`.
+                let error = unsafe { stack_job.unwrap_error() };
+                result_a = Err(error);
+            } else {
+                // SAFETY: Since `wait_for` returned `false`, a `check` must have
+                // returned `Ok`.
+                let output = unsafe { stack_job.unwrap_output() };
+                result_a = Ok(output);
+            }
         }
 
         // Resume unwinding if either job panicked.
         match (result_a, result_b) {
-            (Err(error), _) | (_, Err(error)) => unwind::resume_unwinding(error),
+            (Err(error), _) | (_, Err(error)) => {
+                unwind::resume_unwinding(error)
+            }
             (Ok(value_a), Ok(value_b)) => (value_a, value_b),
         }
     }
@@ -1477,7 +1906,7 @@ impl Worker {
     /// # use forte::Worker;
     /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
     /// # THREAD_POOL.populate();
-    /// # THREAD_POOL.expect_worker(|worker| {
+    /// # THREAD_POOL.with_worker(|worker| {
     /// let ok: Vec<i32> = vec![1, 2, 3];
     /// forte::scope(|scope| {
     ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1504,7 +1933,7 @@ impl Worker {
     /// # use forte::Worker;
     /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
     /// # THREAD_POOL.populate();
-    /// # THREAD_POOL.expect_worker(|worker| {
+    /// # THREAD_POOL.with_worker(|worker| {
     /// let ok: Vec<i32> = vec![1, 2, 3];
     /// forte::scope(|scope| {
     ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1520,17 +1949,17 @@ impl Worker {
     /// # });
     /// ```
     ///
-    /// While this works, it could be a problem if we want to use `ok` elsewhere.
-    /// There are two choices. We can keep the closure as a `move` closure, but
-    /// instead of referencing the variable `ok`, we create a shadowed variable that
-    /// is a borrow of `ok` and capture *that*:
+    /// While this works, it could be a problem if we want to use `ok`
+    /// elsewhere. There are two choices. We can keep the closure as a `move`
+    /// closure, but instead of referencing the variable `ok`, we create a
+    /// shadowed variable that is a borrow of `ok` and capture *that*:
     ///
     /// ```rust
     /// # use forte::ThreadPool;
     /// # use forte::Worker;
     /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
     /// # THREAD_POOL.populate();
-    /// # THREAD_POOL.expect_worker(|worker| {
+    /// # THREAD_POOL.with_worker(|worker| {
     /// let ok: Vec<i32> = vec![1, 2, 3];
     /// forte::scope(|scope| {
     ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1549,15 +1978,15 @@ impl Worker {
     /// # });
     /// ```
     ///
-    /// Another option is not to use the `move` keyword but instead to take ownership
-    /// of individual variables:
+    /// Another option is not to use the `move` keyword but instead to take
+    /// ownership of individual variables:
     ///
     /// ```rust
     /// # use forte::ThreadPool;
     /// # use forte::Worker;
     /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
     /// # THREAD_POOL.populate();
-    /// # THREAD_POOL.expect_worker(|worker| {
+    /// # THREAD_POOL.with_worker(|worker| {
     /// let ok: Vec<i32> = vec![1, 2, 3];
     /// forte::scope(|scope| {
     ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1576,15 +2005,15 @@ impl Worker {
     ///
     /// # Referencing the scope
     ///
-    /// The scope passed into the closure is not allowed to leak out of this call.
-    /// In other words, this will fail to compile:
+    /// The scope passed into the closure is not allowed to leak out of this
+    /// call. In other words, this will fail to compile:
     ///
     /// ```compile_fail
     /// # use forte::ThreadPool;
     /// # use forte::Worker;
     /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
     /// # THREAD_POOL.populate();
-    /// # THREAD_POOL.expect_worker(|worker| {
+    /// # THREAD_POOL.with_worker(|worker| {
     /// let mut leak = None;
     /// forte::scope(|scope| {
     ///     leak = Some(scope); // <-- ERROR: scope would be leaked here
@@ -1601,7 +2030,7 @@ impl Worker {
     /// # use forte::Worker;
     /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
     /// # THREAD_POOL.populate();
-    /// # THREAD_POOL.expect_worker(|worker| {
+    /// # THREAD_POOL.with_worker(|worker| {
     /// let mut counter = 0;
     /// let counter_ref = &mut counter;
     /// forte::scope(|scope| {
@@ -1669,79 +2098,264 @@ impl Worker {
     {
         with_scope(self, f)
     }
+
+    /// Runs the same operation across multiple threads, and returns a vector of
+    /// results.
+    ///
+    /// Each worker receives a [`Broadcast`] struct, telling it how many threads
+    /// are participating in the broadcast, and its index among those
+    /// participants. The operation may return a different result on each
+    /// thread, and those results are collected into a vector, ordered according
+    /// to broadcast index. If you don't care about getting results back,
+    /// consider using [`spawn_broadcast`](Worker::spawn_broadcast) instead.
+    ///
+    /// Broadcasts execute after they have completed their local work queues,
+    /// but before they attempt to steal work from other threads. Forte will
+    /// generally try to run the broadcast on as many threads as possible, but
+    /// is not guaranteed to actually use all of them.
+    ///
+    /// While a broadcast is running, workers are temporarily forbidden from
+    /// leaving the pool.
+    ///
+    /// # Panics
+    ///
+    /// If the operation panics on one or more threads, exactly one panic will
+    /// be propagated, only after all threads have completed (or themselves
+    /// panicked).
+    #[inline(always)]
+    pub fn broadcast<F, T>(&self, f: F) -> Vec<T>
+    where
+        F: for<'w> Fn(Broadcast<'w>) -> T + Sync,
+        T: Send,
+    {
+        // Prevent workers from leaving the pool, and read the membership bitset
+        // once it's frozen.
+        let members = self.thread_pool.freeze_membership();
+        let participants = members.count_ones() as usize;
+
+        // Create a new stack job for every member.
+        let jobs: Vec<_> = members
+            .iter_bits()
+            .enumerate()
+            .map(|(i, member_index)| {
+                let func = &f;
+                let op = move |worker: &Worker| {
+                    func(Broadcast {
+                        worker,
+                        index: i,
+                        participants,
+                    })
+                };
+                (member_index, StackJob::new(op, self.new_latch()))
+            })
+            .collect();
+
+        // Send the broadcast to each member, and wake them up.
+        for (member_index, job) in &jobs {
+            // SAFETY: We are only allowed to create a `JobRef` for this
+            // `StackJob` if we can show that...
+            //
+            // * `as_job_ref` is called at most once for this `StackJob`.
+            //
+            //   The `StackJob` is only accessible in this function (it is
+            //   created here, dropped here, and no direct references escape
+            //   this scope), and within this function we only call `as_job_ref`
+            //   once.
+            //
+            // * The `StackJob` will not be moved or dropped until a call to
+            //   `check` on the enclosed `Latch` returns something other than
+            //   `Pending`.
+            //
+            //   We call `wait_for` on each job's latch (marked with a *). This
+            //   does not allow the function to progress while `check` returns
+            //   `Pending`. No `StackJob` is moved or dropped until after this
+            //   function has been called on every `StackJob` and returned.
+            let job_ref = unsafe { job.as_job_ref() };
+            self.member_data.broadcasts[*member_index].push(job_ref);
+            self.member_data.semaphores[*member_index].signal();
+        }
+
+        // Wait for each job to finish.
+        let error_flags: Vec<_> = jobs
+            .iter()
+            .map(|(_, job)| self.wait_for(job.completion_latch())) // (*)
+            .collect();
+
+        // Allow workers to leave the pool again.
+        self.thread_pool.unfreeze_membership();
+
+        // Collect and return results or propagate panics.
+        jobs.into_iter()
+            .zip(error_flags)
+            .map(|((_, job), error_flag)| {
+                if error_flag {
+                    // SAFETY: If `error_flag` is `true` then `check` has
+                    // returned `Error`.
+                    let error = unsafe { job.unwrap_error() };
+                    unwind::resume_unwinding(error);
+                } else {
+                    // SAFETY: If `error_flag` is `false` then `check` has
+                    // returned `Ok`.
+                    unsafe { job.unwrap_output() }
+                }
+            })
+            .collect()
+    }
+
+    /// Runs the same operation across multiple threads, without waiting for
+    /// results.
+    ///
+    /// Like [`broadcast`](Worker::broadcast), except it does not allow the
+    /// operation to return a result, and does not wait for the operation to
+    /// complete before continuing.
+    ///
+    /// # Panics
+    ///
+    /// Panics are not propagated.
+    #[inline(always)]
+    pub fn spawn_broadcast<F>(&self, f: F)
+    where
+        F: for<'w> Fn(Broadcast<'w>) + Send + Sync + 'static,
+    {
+        // Prevent workers from leaving the pool, and read the membership bitset
+        // once it's frozen.
+        let members = self.thread_pool.freeze_membership();
+        let participants = members.count_ones() as usize;
+
+        // Prevent a deadlock if there are no workers. This should be
+        // impossible, but we will be defensive.
+        if participants == 0 {
+            cold_path();
+            self.thread_pool.unfreeze_membership();
+            return;
+        }
+
+        // Send the broadcast to each member, and wake them up. The jobs may
+        // run after this function returns, so they share ownership of the
+        // operation through an `Arc` instead of borrowing it from this frame.
+        let f = Arc::new(f);
+        for (i, member_index) in members.iter_bits().enumerate() {
+            let func = Arc::clone(&f);
+            let op = move |worker: &Worker| {
+                func(Broadcast {
+                    worker,
+                    index: i,
+                    participants,
+                });
+            };
+            let job = HeapJob::new(op);
+
+            // SAFETY: `HeapJob::into_job_ref` has two preconditions:
+            //
+            // * The `JobRef` must not outlive any of the items closed over by
+            //   the function `f`.
+            //
+            //   `op` owns an `Arc<F>` with `F: 'static`; it borrows nothing.
+            //
+            // * If `F: !Send` then the `JobRef` must only be executed on this
+            //   thread.
+            //
+            //   `Arc<F>` is `Send` because `F: Send + Sync`, so `op` is `Send`.
+            let job_ref = unsafe { job.into_job_ref() };
+            self.member_data.broadcasts[member_index].push(job_ref);
+            self.member_data.semaphores[member_index].signal();
+        }
+
+        // Once we have finished pushing jobs out to workers who we know are not
+        // in the middle of resigning, we can allow resignations again.
+        self.thread_pool.unfreeze_membership();
+    }
 }
 
 // -----------------------------------------------------------------------------
 // Implicit worker registration api
 
+/// A [`ThreadPool`] wrapper, used by the [`DEFAULT_POOL`].
+///
+/// This dereferences to a [`ThreadPool`]. The first time it is dereferenced, it
+/// resizes itself to fill all available cores.
+pub struct DefaultThreadPool {
+    thread_pool: &'static ThreadPool,
+    initialized: AtomicU32,
+}
+
+impl Deref for DefaultThreadPool {
+    type Target = ThreadPool;
+
+    fn deref(&self) -> &'static ThreadPool {
+        if self.initialized.swap(1, Ordering::Relaxed) == 0 {
+            self.thread_pool.resize_to_available();
+        };
+        self.thread_pool
+    }
+}
+
+static DEFAULT_POOL_INNER: ThreadPool = ThreadPool::new();
+
+/// The default thread pool.
+///
+/// Unless you set up your own thread pool, this is where your operations run.
+/// The first time this is dereferenced, it resizes itself to fill all available
+/// cores.
+pub static DEFAULT_POOL: DefaultThreadPool = DefaultThreadPool {
+    thread_pool: &DEFAULT_POOL_INNER,
+    initialized: AtomicU32::new(0),
+};
+
 /// Runs the provided closure in the background.
 ///
 /// When executed on a thread that is currently registered as a worker (i.e. the
-/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar)
-/// this is able to look up that registration and find the worker and
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
 /// thread-pool implicitly.
 ///
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
+///
 /// If you have a reference to a [`Worker`], it's better to use [`Worker::spawn`]
 /// instead. If you don't have a worker, but know which thread pool you want to
 /// use, [`ThreadPool::spawn`] is more appropriate.
-///
-/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
-///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
-///
-/// </pre></div>
 pub fn spawn<M, S: Spawn<M>>(work: S) -> S::Output {
-    Worker::with_current(|worker| {
-        worker
-            .expect("attempt to call `forte::spawn` from outside a thread pool")
-            .spawn(work)
+    Worker::with_current(|worker| match worker {
+        Some(worker) => worker.spawn(work),
+        None => DEFAULT_POOL.spawn(work),
     })
 }
 
 /// Waits for a future to complete.
 ///
 /// When executed on a thread that is currently registered as a worker (i.e. the
-/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar)
-/// this is able to look up that registration and find the worker and
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
 /// thread-pool implicitly.
 ///
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
+///
 /// If you have a reference to a [`Worker`], it's better to use
 /// [`Worker::block_on`] instead. If you don't have a worker, but know which
 /// thread pool you want to use, [`ThreadPool::block_on`] is more appropriate.
-///
-/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
-///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
-///
-/// </pre></div>
 pub fn block_on<F, T>(future: F) -> T
 where
     F: Future<Output = T> + Send,
     T: Send,
 {
-    Worker::with_current(|worker| {
-        worker
-            .expect("attempt to call `forte::block_on` from outside a thread pool")
-            .block_on(future)
+    Worker::with_current(|worker| match worker {
+        Some(worker) => worker.block_on(future),
+        None => DEFAULT_POOL.block_on(future),
     })
 }
 
 /// Executes the two closures, possibly in parallel.
 ///
 /// When executed on a thread that is currently registered as a worker (i.e. the
-/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar)
-/// this is able to look up that registration and find the worker and
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
 /// thread-pool implicitly.
 ///
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
+///
 /// If you have a reference to a [`Worker`], it's better to use [`Worker::join`]
 /// instead. If you don't have a worker, but know which thread pool you want to
 /// use, [`ThreadPool::join`] is more appropriate.
-///
-/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
-///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
-///
-/// </pre></div>
 pub fn join<A, B, RA, RB>(a: A, b: B) -> (RA, RB)
 where
     A: FnOnce(&Worker) -> RA + Send,
@@ -1749,37 +2363,93 @@ where
     RA: Send,
     RB: Send,
 {
-    Worker::with_current(|worker| {
-        worker
-            .expect("attempt to call `forte::join` from outside a thread pool")
-            .join(a, b)
+    Worker::with_current(|worker| match worker {
+        Some(worker) => worker.join(a, b),
+        None => DEFAULT_POOL.join(a, b),
     })
 }
 
 /// Creates a new scope for spawning non-static work.
 ///
 /// When executed on a thread that is currently registered as a worker (i.e. the
-/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar)
-/// this is able to look up that registration and find the worker and
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
 /// thread-pool implicitly.
 ///
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
+///
 /// If you have a reference to a [`Worker`], it's better to use
 /// [`Worker::scope`] instead. If you don't have a worker, but know which thread
 /// pool you want to use, [`ThreadPool::scope`] is more appropriate.
+pub fn scope<'env, F, T>(f: F) -> T
+where
+    F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,
+{
+    Worker::with_current(|worker| match worker {
+        Some(worker) => worker.scope(f),
+        None => DEFAULT_POOL.scope(f),
+    })
+}
+
+/// Runs an operation on multiple threads and returns a vector of results.
 ///
-/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
+/// When executed on a thread that is currently registered as a worker (i.e. the
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
+/// thread-pool implicitly.
 ///
-/// **Warning:** This function panics if the current thread is not registered as a worker.
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
 ///
-/// </pre></div>
-pub fn scope<'env, F, T>(f: F) -> T
+/// If you have a reference to a [`Worker`], it's better to use
+/// [`Worker::broadcast`] instead. If you don't have a worker, but know which
+/// thread pool you want to use, [`ThreadPool::broadcast`] is more
+/// appropriate.
+pub fn broadcast<F, T>(f: F) -> Vec<T>
 where
-    F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,
+    F: for<'w> Fn(Broadcast<'w>) -> T + Sync,
+    T: Send,
 {
-    Worker::with_current(|worker| {
-        worker
-            .expect("attempt to call `forte::scope` from outside a thread pool")
-            .scope(f)
+    Worker::with_current(|worker| match worker {
+        Some(worker) => worker.broadcast(f),
+        None => DEFAULT_POOL.broadcast(f),
+    })
+}
+
+/// Runs an operation on multiple threads without waiting for results.
+///
+/// When executed on a thread that is currently registered as a worker (i.e. the
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
+/// thread-pool implicitly.
+///
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
+///
+/// If you have a reference to a [`Worker`], it's better to use
+/// [`Worker::spawn_broadcast`] instead. If you don't have a worker, but know
+/// which thread pool you want to use, [`ThreadPool::spawn_broadcast`] is more
+/// appropriate.
+pub fn spawn_broadcast<F>(f: F)
+where
+    F: for<'w> Fn(Broadcast<'w>) + Send + Sync + 'static,
+{
+    Worker::with_current(|worker| match worker {
+        Some(worker) => worker.spawn_broadcast(f),
+        None => DEFAULT_POOL.spawn_broadcast(f),
+    });
+}
+
+/// Returns the number of members participating in a thread-pool.
+///
+/// When executed on a thread that is currently registered as a worker (i.e. the
+/// closure inside [`Membership::activate`], [`ThreadPool::with_worker`], or
+/// similar) this is able to look up that registration and find the worker and
+/// thread-pool implicitly.
+///
+/// If not called within a thread pool, this uses the [`DEFAULT_POOL`].
+pub fn num_members() -> usize {
+    Worker::with_current(|worker| match worker {
+        Some(worker) => worker.thread_pool().num_members(),
+        None => DEFAULT_POOL.num_members(),
     })
 }
 
@@ -1790,35 +2460,25 @@ where
 /// Operating on the principle that you should finish what you start before
 /// starting something new, workers will first execute their queue, then execute
 /// shared jobs, then pull new jobs from the injector.
-fn managed_worker(lease: Lease, halt: Arc<AtomicBool>) {
-    trace!("starting managed worker");
-
+fn managed_worker(membership: Membership, halt: Arc<AtomicBool>) {
     // Register as the indicated worker, and work until we are told to halt.
-    Worker::occupy(lease, |worker| {
+    membership.activate(|worker| {
         while !halt.load(Ordering::Relaxed) {
-            #[cfg(feature = "shuttle")]
-            shuttle::hint::spin_loop();
-
-            if let Some((job, migrated)) = worker.find_work() {
-                worker.execute(job, migrated);
-            } else {
-                worker.lease.seats.sleep_controllers[worker.lease.seat_number]
-                    .sleep(worker.lease.seat_number, &worker.lease.thread_pool.sleeping);
+            if worker.yield_now() == Yield::Idle {
+                worker.wait();
             }
         }
     });
-
-    trace!("exiting managed worker");
 }
 
 // -----------------------------------------------------------------------------
 // Tests
 
-#[cfg(all(test, not(feature = "shuttle")))]
+#[cfg(test)]
 mod tests {
 
-    use alloc::vec;
     use std::sync::mpsc::channel;
+    use std::vec;
 
     use super::*;
 
@@ -1848,6 +2508,18 @@ mod tests {
         THREAD_POOL.depopulate();
     }
 
+    #[test]
+    fn spawn_cancel_safety() {
+        static THREAD_POOL: ThreadPool = ThreadPool::new();
+        THREAD_POOL.resize_to_available();
+
+        THREAD_POOL.with_worker(|worker| {
+            let _ = worker.spawn(core::future::pending::<()>());
+        });
+
+        THREAD_POOL.depopulate();
+    }
+
     #[test]
     fn join_basic() {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
@@ -1864,8 +2536,7 @@ mod tests {
     }
 
     #[test]
-    #[cfg(not(miri))] // This is too much for miri to handle
-    fn join_long() {
+    fn join_deep() {
         fn increment(worker: &Worker, slice: &mut [u32]) {
             match slice.len() {
                 0 => (),
@@ -1873,7 +2544,10 @@ mod tests {
                 _ => {
                     let (head, tail) = slice.split_at_mut(1);
 
-                    worker.join(|_| head[0] += 1, |worker| increment(worker, tail));
+                    worker.join(
+                        |_| head[0] += 1,
+                        |worker| increment(worker, tail),
+                    );
                 }
             }
         }
@@ -1881,16 +2555,15 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        let mut vals = [0; 1_024];
-        THREAD_POOL.on_worker(|worker| increment(worker, &mut vals));
-        assert_eq!(vals, [1; 1_024]);
+        let mut vals = [0; 800];
+        THREAD_POOL.with_worker(|worker| increment(worker, &mut vals));
+        assert_eq!(vals, [1; 800]);
 
         THREAD_POOL.depopulate();
     }
 
     #[test]
-    #[cfg(not(miri))] // This is too much for miri to handle
-    fn join_very_long() {
+    fn join_wide() {
         fn increment(worker: &Worker, slice: &mut [u32]) {
             match slice.len() {
                 0 => (),
@@ -1910,9 +2583,9 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        let mut vals = vec![0; 512 * 512];
-        THREAD_POOL.on_worker(|worker| increment(worker, &mut vals));
-        assert_eq!(vals, vec![1; 512 * 512]);
+        let mut vals = vec![0; 65_536];
+        THREAD_POOL.with_worker(|worker| increment(worker, &mut vals));
+        assert_eq!(vals, vec![1; 65_536]);
 
         THREAD_POOL.depopulate();
     }
diff --git a/src/time.rs b/src/time.rs
new file mode 100644
index 0000000..451d5e7
--- /dev/null
+++ b/src/time.rs
@@ -0,0 +1,54 @@
+//! Architecture-specific timing functions, taken from
+//! <https://github.com/spence/tach>
+
+/// Read from the `cntvct_el0` register on Arm `AArch64`.
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub fn ticks() -> u64 {
+    use core::arch::asm;
+    let cnt: u64;
+    // SAFETY: `mrs cntvct_el0` only reads the architectural virtual counter
+    // register and does not touch memory or the stack.
+    unsafe {
+        asm!(
+            "mrs {}, cntvct_el0",
+            out(reg) cnt,
+            options(nostack, nomem, preserves_flags)
+        );
+    }
+    cnt
+}
+
+/// Read from rdtime on RISC-V
+#[cfg(target_arch = "riscv64")]
+#[inline(always)]
+pub fn ticks() -> u64 {
+    use core::arch::asm;
+    let cnt: u64;
+    // SAFETY: `rdtime` reads a timer CSR into a general-purpose register and does not access
+    // Rust memory.
+    unsafe {
+        asm!(
+            "rdtime {}",
+            out(reg) cnt,
+            options(nostack, nomem, preserves_flags)
+        );
+    }
+    cnt
+}
+
+/// Read the time-stamp counter (`rdtsc`) on x86 and x86-64.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[inline(always)]
+pub fn ticks() -> u64 {
+    // `_rdtsc` lives in a different `core::arch` module on each target, and
+    // each module only exists when compiling for that target.
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::_rdtsc;
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::_rdtsc;
+
+    // SAFETY: `_rdtsc` emits the CPU counter read instruction and has no Rust memory safety
+    // preconditions.
+    unsafe { _rdtsc() }
+}
diff --git a/src/util.rs b/src/util.rs
index 661a430..62352a3 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,19 +1,17 @@
 use core::cell::Cell;
 use core::hash::Hasher;
-use core::sync::atomic::AtomicUsize;
-use core::sync::atomic::Ordering;
 use std::hash::DefaultHasher;
 
+use crate::platform::*;
+
 /// [xorshift*] is a fast pseudorandom number generator which will
 /// even tolerate weak seeding, as long as it's not zero.
 ///
 /// [xorshift*]: https://en.wikipedia.org/wiki/Xorshift#xorshift*
-#[cfg(not(feature = "shuttle"))]
 pub struct XorShift64Star {
     state: Cell<u64>,
 }
 
-#[cfg(not(feature = "shuttle"))]
 impl XorShift64Star {
     pub fn new() -> Self {
         // Any non-zero seed will do -- this uses the hash of a global counter.
@@ -53,19 +51,37 @@ impl XorShift64Star {
     }
 }
 
-#[cfg(feature = "shuttle")]
-pub struct XorShift64Star;
+pub trait IterBits {
+    fn iter_bits(self) -> BitIter;
+}
 
-#[cfg(feature = "shuttle")]
-impl XorShift64Star {
-    pub fn new() -> Self {
-        Self
+impl IterBits for u32 {
+    fn iter_bits(self) -> BitIter {
+        BitIter { bitset: self }
     }
+}
 
-    pub fn next_usize(&self, n: usize) -> usize {
-        use shuttle::rand::Rng;
-        use shuttle::rand::thread_rng;
+pub struct BitIter {
+    bitset: u32,
+}
+
+impl Iterator for BitIter {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<usize> {
+        if self.bitset == 0 {
+            None
+        } else {
+            let i = self.bitset.trailing_zeros(); // TZCNT
+            self.bitset &= self.bitset - 1; // BLSR
+            Some(i as usize)
+        }
+    }
 
-        thread_rng().gen_range(0..n)
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let populated = self.bitset.count_ones(); // POPCNT
+        (populated as usize, Some(populated as usize))
     }
 }
+
+impl ExactSizeIterator for BitIter {}
diff --git a/tests/general.rs b/tests/general.rs
deleted file mode 100644
index 305e4b1..0000000
--- a/tests/general.rs
+++ /dev/null
@@ -1,3 +0,0 @@
-//! General integration tests
-
-#![cfg(not(feature = "shuttle"))]
diff --git a/tests/miri.rs b/tests/miri.rs
deleted file mode 100644
index c9d27be..0000000
--- a/tests/miri.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-//! Tests specifically for miri
-
-#![cfg(miri)]
-
-use forte::prelude::*;
-use tracing::info;
-
-/// A node in a binary tree.
-struct Node {
-    val: u64,
-    left: Option<Box<Node>>,
-    right: Option<Box<Node>>,
-}
-
-impl Node {
-    // Constructs a new binary tree with the given number of layers.
-    pub fn tree(layers: usize) -> Self {
-        Self {
-            val: 1,
-            left: (layers != 1).then(|| Box::new(Self::tree(layers - 1))),
-            right: (layers != 1).then(|| Box::new(Self::tree(layers - 1))),
-        }
-    }
-}
-
-#[test]
-fn fork_join() {
-    let layers = 10;
-    let target = (1 << layers) - 1;
-
-    static COMPUTE: ThreadPool = ThreadPool::new();
-
-    fn sum(node: &Node, worker: &Worker) -> u64 {
-        let (left, right) = worker.join(
-            |w| node.left.as_deref().map(|n| sum(n, w)).unwrap_or_default(),
-            |w| node.right.as_deref().map(|n| sum(n, w)).unwrap_or_default(),
-        );
-
-        node.val + left + right
-    }
-
-    let tree = Node::tree(layers);
-
-    COMPUTE.with_worker(|worker| {
-        let worker = worker.unwrap();
-        COMPUTE.resize_to_available();
-        info!("Work beginning");
-        assert_eq!(sum(&tree, worker), target);
-        info!("Work completed");
-        COMPUTE.depopulate();
-    });
-}
diff --git a/tests/shuttle.rs b/tests/shuttle.rs
deleted file mode 100644
index 776f3be..0000000
--- a/tests/shuttle.rs
+++ /dev/null
@@ -1,213 +0,0 @@
-//! Tests using the Shuttle testing framework.
-
-#![cfg(feature = "shuttle")]
-#![allow(unused_imports)]
-
-use core::pin::Pin;
-use core::task::Context;
-use core::task::Poll;
-
-use forte::ThreadPool;
-use forte::Worker;
-use shuttle::hint::black_box;
-use shuttle::sync::atomic::AtomicBool;
-use shuttle::sync::atomic::AtomicUsize;
-use shuttle::sync::atomic::Ordering;
-use tracing::Level;
-use tracing_subscriber::fmt::Subscriber;
-
-// -----------------------------------------------------------------------------
-// Infrastructure
-
-/// Provides access to a thread pool which can be treated as static for the
-/// purposes of testing.
-fn with_thread_pool<F>(f: F) -> impl Fn() + 'static
-where
-    F: Fn(&'static ThreadPool) + 'static,
-{
-    move || {
-        let thread_pool = Box::new(ThreadPool::new());
-        let thread_pool_ptr = Box::into_raw(thread_pool);
-
-        // SAFETY: This thread pool is never dropped.
-        let thread_pool_ref = unsafe { &*thread_pool_ptr };
-        f(thread_pool_ref);
-    }
-}
-
-// -----------------------------------------------------------------------------
-// Pool resizing
-
-/// Tests for concurrency issues within the `with_thread_pool` helper function.
-/// This spins up a thread pool with a single thread, then spins it back down.
-#[test]
-pub fn shuttle_populate_depopulate() {
-    let test = with_thread_pool(|pool| {
-        pool.populate();
-        pool.depopulate();
-    });
-
-    shuttle::check_pct(test, 100_000, 100_000);
-}
-
-// -----------------------------------------------------------------------------
-// Core API
-
-/// Tests spawning a worker on a pool of size one.
-#[test]
-pub fn shuttle_spawn_closure() {
-    let test = with_thread_pool(|pool| {
-        pool.resize_to(1);
-        pool.spawn(|_: &Worker| {});
-        pool.depopulate();
-    });
-
-    shuttle::check_pct(test, 100_000, 100_000);
-}
-
-#[derive(Default)]
-struct CountFuture {
-    count: usize,
-}
-
-impl Future for CountFuture {
-    type Output = ();
-
-    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-        if self.count == 128 {
-            Poll::Ready(())
-        } else {
-            self.count += 1;
-            cx.waker().wake_by_ref();
-            Poll::Pending
-        }
-    }
-}
-
-/// Tests spawning a nontrivial future on a pool of size one.
-#[test]
-pub fn shuttle_spawn_future() {
-    let test = with_thread_pool(|pool| {
-        pool.resize_to(1);
-        let task = pool.spawn(CountFuture::default());
-        assert!(task.is_finished());
-        pool.depopulate();
-    });
-
-    shuttle::check_pct(test, 100_000, 100_000);
-}
-
-/// Tests a two-level join operation on a pool of size one.
-#[test]
-pub fn join_4_on_1() {
-    let test = with_thread_pool(|pool| {
-        pool.resize_to(1);
-
-        let counter = AtomicUsize::new(0);
-        pool.join(
-            |worker| {
-                worker.join(
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                )
-            },
-            |worker| {
-                worker.join(
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                )
-            },
-        );
-        assert_eq!(counter.load(Ordering::Relaxed), 4);
-
-        pool.depopulate();
-    });
-
-    shuttle::check_pct(test, 100_000, 100_000);
-}
-
-/// Tests a two-level join operation on a pool of size two.
-#[test]
-pub fn join_4_on_2() {
-    let test = with_thread_pool(|pool| {
-        pool.resize_to(2);
-
-        let counter = AtomicUsize::new(0);
-        pool.join(
-            |worker| {
-                worker.join(
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                )
-            },
-            |worker| {
-                worker.join(
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                )
-            },
-        );
-        assert_eq!(counter.load(Ordering::Relaxed), 4);
-
-        pool.depopulate();
-    });
-
-    shuttle::check_pct(test, 100_000, 100_000);
-}
-
-/// Tests a two-level join operation on a pool of size three.
-#[test]
-pub fn join_4_on_3() {
-    let test = with_thread_pool(|pool| {
-        pool.resize_to(3);
-
-        let counter = AtomicUsize::new(0);
-        pool.join(
-            |worker| {
-                worker.join(
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                )
-            },
-            |worker| {
-                worker.join(
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                    |_| counter.fetch_add(1, Ordering::Relaxed),
-                )
-            },
-        );
-        assert_eq!(counter.load(Ordering::Relaxed), 4);
-
-        pool.depopulate();
-    });
-
-    shuttle::check_pct(test, 100_000, 100_000);
-}
-
-/// Tests a moderately deep join operation on a large pool.
-#[test]
-pub fn join_long() {
-    let test = with_thread_pool(|pool| {
-        pool.resize_to(8);
-
-        fn increment(worker: &Worker, slice: &mut [u32]) {
-            match slice.len() {
-                0 => (),
-                1 => slice[0] += 1,
-                _ => {
-                    let (head, tail) = slice.split_at_mut(1);
-
-                    worker.join(|_| head[0] += 1, |worker| increment(worker, tail));
-                }
-            }
-        }
-
-        let mut vals = [0; 10];
-        pool.expect_worker(|worker| increment(worker, &mut vals));
-        assert_eq!(vals, [1; 10]);
-
-        pool.depopulate();
-    });
-
-    shuttle::check_pct(test, 100_000, 100_000);
-}