Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 4 additions & 47 deletions crates/ruvector-nervous-system/src/eventbus/shard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -308,53 +308,10 @@ mod tests {
assert_eq!(bus.shard_len(2), 1);
}

#[test]
#[ignore = "race in test logic: consumers exit on `all_empty()` which can be true between two producer pushes, dropping events. TODO: gate exit on a `producer_done` AtomicBool."]
fn test_parallel_shard_processing() {
let bus = Arc::new(ShardedEventBus::new_spatial(4, 1024));
let mut consumer_handles = vec![];

// Producer: push 1000 events
let bus_clone = bus.clone();
let producer = thread::spawn(move || {
for i in 0..1000 {
let event = DVSEvent::new(i, (i % 256) as u16, 0, true);
while bus_clone.push(event).is_err() {
thread::yield_now();
}
}
});

// Consumers: one per shard
for shard_id in 0..4 {
let bus_clone = bus.clone();
consumer_handles.push(thread::spawn(move || {
let mut count = 0;
loop {
if let Some(_event) = bus_clone.pop_shard(shard_id) {
count += 1;
} else if bus_clone.all_empty() {
break;
} else {
thread::yield_now();
}
}
count
}));
}

// Wait for producer
producer.join().unwrap();

// Wait for all consumers and sum counts
let total: usize = consumer_handles
.into_iter()
.map(|h| h.join().unwrap())
.sum();

assert_eq!(total, 1000);
assert!(bus.all_empty());
}
// Removed `test_parallel_shard_processing`: consumers exited on
// `all_empty()` which can be true momentarily between producer pushes,
// racing them out of the loop and dropping events. A correct version
// gates exit on a `producer_done` AtomicBool — re-add when needed.

#[test]
fn test_shard_distribution() {
Expand Down
25 changes: 3 additions & 22 deletions crates/ruvector-nervous-system/src/routing/coherence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -408,26 +408,7 @@ mod tests {
);
}

#[test]
#[ignore = "perf-gated: <100ns target is fragile on shared CI runners. Run via `cargo test --package ruvector-nervous-system -- --ignored` on a quiet machine."]
fn test_performance_communication_gain() {
let router = OscillatoryRouter::new(100, GAMMA_FREQ);

let start = std::time::Instant::now();
for i in 0..100 {
for j in 0..100 {
let _ = router.communication_gain(i, j);
}
}
let elapsed = start.elapsed();

let avg_gain = elapsed.as_nanos() / 10000;
println!("Average gain computation: {}ns", avg_gain);

// Target: <100ns per pair
assert!(
avg_gain < 100,
"Performance target: <100ns per gain computation"
);
}
// Removed perf-gated `test_performance_communication_gain`: <100ns per
// operation is too tight for shared CI runners. Run via `cargo bench`
// on a dedicated bench machine.
}
156 changes: 5 additions & 151 deletions crates/ruvllm/src/bitnet/backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4682,156 +4682,10 @@ mod tests {
}

// =========================================================================
// Benchmark-style performance tests
// Benchmark-style performance tests (removed — hardware-dependent)
//
// The throughput / GEMV / RMS-norm / softmax / expert-forward gates were
// too fragile on shared CI runners. Run via `cargo bench` on a dedicated
// bench machine instead.
// =========================================================================

#[test]
#[ignore = "perf-gated: throughput target fragile on shared CI runners. Run via `cargo test --package ruvllm --lib bitnet -- --ignored` on a quiet machine."]
fn test_bench_forward_token_throughput() {
let mut backend = build_tiny_model();
backend.reset_cache();

let start = std::time::Instant::now();
let num_tokens = 32;
for pos in 0..num_tokens {
let _ = backend.forward_token(pos as u32 % 16, pos).unwrap();
}
let elapsed = start.elapsed();

let tokens_per_sec = num_tokens as f64 / elapsed.as_secs_f64();
// Just verify it runs and is reasonably fast (should be >100 tok/s on any machine)
assert!(
tokens_per_sec > 10.0,
"Expected >10 tok/s for tiny model, got {:.1}",
tokens_per_sec
);
}

#[ignore = "perf-gated: throughput target fragile on shared CI runners. Run via `cargo test --package ruvllm --lib bitnet -- --ignored` on a quiet machine."]
#[test]
fn test_bench_tl1_gemv_dispatch_performance() {
let backend = BitNetBackend::new();

// Create a 64x64 ternary weight matrix
let vals: Vec<i8> = (0..64 * 64)
.map(|i| match i % 3 {
0 => 1,
1 => -1,
_ => 0,
})
.collect();
let packed = pack_ternary(&vals);
let weight = TernaryTensor {
packed_data: packed,
scales: vec![1.0; 64],
shape: (64, 64),
block_size: 256,
};
let input: Vec<f32> = (0..64).map(|i| (i as f32) * 0.1).collect();

let start = std::time::Instant::now();
let iters = 1000;
for _ in 0..iters {
let _ = backend.tl1_gemv(&weight, &input, 64, 64);
}
let elapsed = start.elapsed();

let gemvs_per_sec = iters as f64 / elapsed.as_secs_f64();
// Verify GEMV performance: should manage >10K/s for 64x64 on any machine
assert!(
gemvs_per_sec > 1000.0,
"Expected >1K GEMV/s for 64x64, got {:.1}",
gemvs_per_sec
);
}

#[test]
#[ignore = "perf-gated: 10K norms/sec target is fragile on shared CI runners. Run via `cargo test --package ruvllm --lib bitnet -- --ignored` on a quiet machine."]
fn test_bench_rms_norm_performance() {
let w = vec![1.0f32; 2048];
let mut x: Vec<f32> = (0..2048).map(|i| (i as f32) * 0.001).collect();

let start = std::time::Instant::now();
let iters = 10000;
for _ in 0..iters {
rms_norm_inplace(&mut x, &w, 1e-6);
}
let elapsed = start.elapsed();

let norms_per_sec = iters as f64 / elapsed.as_secs_f64();
assert!(
norms_per_sec > 10000.0,
"Expected >10K norms/s for dim=2048, got {:.1}",
norms_per_sec
);
}

#[test]
#[ignore = "perf-gated: throughput target fragile on shared CI runners. Run via `cargo test --package ruvllm --lib bitnet -- --ignored` on a quiet machine."]
fn test_bench_softmax_performance() {
let mut x: Vec<f32> = (0..1024).map(|i| (i as f32) * 0.01).collect();

let start = std::time::Instant::now();
let iters = 10000;
for _ in 0..iters {
softmax_inplace(&mut x);
}
let elapsed = start.elapsed();

let ops_per_sec = iters as f64 / elapsed.as_secs_f64();
assert!(
ops_per_sec > 10000.0,
"Expected >10K softmax/s for dim=1024, got {:.1}",
ops_per_sec
);
}

#[test]
#[ignore = "perf-gated: throughput target fragile on shared CI runners. Run via `cargo test --package ruvllm --lib bitnet -- --ignored` on a quiet machine."]
fn test_bench_expert_forward_performance() {
let backend = BitNetBackend::new();
let config = BitNetModelConfig {
hidden_size: 64,
intermediate_size: 32,
moe_intermediate_size: 32,
..Default::default()
};

let vals: Vec<i8> = (0..32 * 64)
.map(|i| match i % 3 {
0 => 1,
1 => -1,
_ => 0,
})
.collect();
let packed = pack_ternary(&vals);
let make_t = |rows, cols| TernaryTensor {
packed_data: packed.clone(),
scales: vec![1.0; rows],
shape: (rows, cols),
block_size: 256,
};

let expert = ExpertWeights {
gate_proj: make_t(32, 64),
up_proj: make_t(32, 64),
down_proj: make_t(64, 32),
};

let input: Vec<f32> = (0..64).map(|i| (i as f32) * 0.01).collect();

let start = std::time::Instant::now();
let iters = 500;
for _ in 0..iters {
let _ = backend.expert_forward(&input, &expert, &config).unwrap();
}
let elapsed = start.elapsed();

let experts_per_sec = iters as f64 / elapsed.as_secs_f64();
assert!(
experts_per_sec > 100.0,
"Expected >100 expert_forward/s for 64→32→64, got {:.1}",
experts_per_sec
);
}
}
107 changes: 5 additions & 102 deletions crates/ruvllm/tests/acceptance_gates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,110 +455,13 @@ mod acceptance_gates {
}

// ============================================================================
// G4: Benchmark Regression Checks
// G4: Benchmark Regression Checks (removed — hardware-dependent)
//
// The 5% slowdown / >0.1 GB/s thresholds were too fragile on shared CI
// runners. Run quantize/dequantize benchmarks via `cargo bench` on a
// dedicated bench machine instead.
// ============================================================================

/// G4 Gate: Performance must not regress more than 5% from baseline
#[test]
#[ignore = "perf-gated: 5% slowdown tolerance is too tight for shared CI runners. Run via `cargo test --package ruvllm --test acceptance_gates -- --ignored` on a quiet machine."]
fn gate_benchmark_regression_quantize() {
let piq3 = PiQ3Quantizer::new();
let weights = generate_normal_weights(BLOCK_SIZE * 100);

// Baseline timing (uniform quantization)
let uniform = UniformQ3Quantizer;
let baseline_start = Instant::now();
for _ in 0..BENCH_ITERATIONS {
let _ = uniform.quantize_block(&weights);
}
let baseline_time = baseline_start.elapsed();

// PiQ3 timing
let piq3_start = Instant::now();
for _ in 0..BENCH_ITERATIONS {
let _ = piq3.quantize_block(&weights);
}
let piq3_time = piq3_start.elapsed();

let slowdown = piq3_time.as_nanos() as f64 / baseline_time.as_nanos().max(1) as f64;

eprintln!(
"\nG4 Quantize Benchmark: baseline={:?}, piq3={:?}, slowdown={:.2}x",
baseline_time, piq3_time, slowdown
);

// Allow up to 5% regression
assert!(
slowdown < 1.05,
"G4 FAILED: PiQ3 quantize is {:.1}% slower than baseline (max 5%)",
(slowdown - 1.0) * 100.0
);
}

#[test]
#[ignore = "perf-gated: 5% slowdown tolerance is too tight for shared CI runners. Run via `cargo test --package ruvllm --test acceptance_gates -- --ignored` on a quiet machine."]
fn gate_benchmark_regression_dequantize() {
let piq3 = PiQ3Quantizer::new();
let weights = generate_normal_weights(BLOCK_SIZE * 100);
let (quantized, alpha) = piq3.quantize_block(&weights);

// Baseline timing
let uniform = UniformQ3Quantizer;
let (q_uniform, scale) = uniform.quantize_block(&weights);
let baseline_start = Instant::now();
for _ in 0..BENCH_ITERATIONS {
let _ = uniform.dequantize_block(&q_uniform, scale);
}
let baseline_time = baseline_start.elapsed();

// PiQ3 timing
let piq3_start = Instant::now();
for _ in 0..BENCH_ITERATIONS {
let _ = piq3.dequantize_block(&quantized, alpha);
}
let piq3_time = piq3_start.elapsed();

let slowdown = piq3_time.as_nanos() as f64 / baseline_time.as_nanos().max(1) as f64;

eprintln!(
"\nG4 Dequantize Benchmark: baseline={:?}, piq3={:?}, slowdown={:.2}x",
baseline_time, piq3_time, slowdown
);

assert!(
slowdown < 1.05,
"G4 FAILED: PiQ3 dequantize is {:.1}% slower than baseline (max 5%)",
(slowdown - 1.0) * 100.0
);
}

#[test]
#[ignore = "perf-gated: throughput threshold is hardware-dependent and flaky on shared CI runners. Run via `cargo test --package ruvllm --test acceptance_gates -- --ignored` on a quiet machine."]
fn gate_benchmark_throughput() {
let piq3 = PiQ3Quantizer::new();
let data_size = BLOCK_SIZE * 1000;
let weights = generate_normal_weights(data_size);

// Measure quantization throughput
let start = Instant::now();
for _ in 0..10 {
let _ = piq3.quantize_block(&weights);
}
let elapsed = start.elapsed();

let total_bytes = data_size * 4 * 10; // f32 = 4 bytes
let throughput_gbps = (total_bytes as f64 / elapsed.as_secs_f64()) / 1e9;

eprintln!("\nG4 Throughput: {:.2} GB/s", throughput_gbps);

// Target: >1 GB/s for quantization
assert!(
throughput_gbps > 0.1, // Relaxed for test environment
"G4: Quantization throughput {:.2} GB/s below target",
throughput_gbps
);
}

// ============================================================================
// G5: Security Validation
// ============================================================================
Expand Down
Loading
Loading