From a0785b29c641dfb30f81bfbeedc211a2eb50ef90 Mon Sep 17 00:00:00 2001 From: MANDY Alimaa Date: Sat, 13 Dec 2025 22:52:07 -0600 Subject: [PATCH] use amazon book dataset for benchmarks --- Cargo.toml | 2 + benches/query_benchmarks.rs | 107 ++++++++++++++++++++++++++++++------ src/data_gen.rs | 30 ++++++++++ 3 files changed, 121 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cce4200..508f7a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,8 @@ grafite = "0.2.0" [dev-dependencies] divan = "0.1" +reqwest = { version = "0.12", features = ["blocking"] } +zstd = "0.13" [[bench]] name = "query_benchmarks" diff --git a/benches/query_benchmarks.rs b/benches/query_benchmarks.rs index a0bf5fe..c97d8cd 100644 --- a/benches/query_benchmarks.rs +++ b/benches/query_benchmarks.rs @@ -1,12 +1,15 @@ use divan::{black_box, Bencher}; use range_filters::{ bloom_filter::BloomFilter, - data_gen::generate_smooth_u64, + data_gen::load_amazon_dataset, diva::Diva, grafite_filter::GrafiteFilter, Key, }; use rand::Rng; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; +use std::path::Path; fn main() { divan::main(); @@ -15,6 +18,74 @@ fn main() { // const SIZES: &[usize] = &[10_000, 100_000, 1_000_000, 10_000_000]; const SIZES: &[usize] = &[10_000, 100_000, 1_000_000]; +// Amazon dataset paths and URL +const AMAZON_DATASET_URL: &str = "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/SVN8PI"; +const AMAZON_DATASET_COMPRESSED: &str = "amazon_dataset.tab"; +const AMAZON_DATASET_DECOMPRESSED: &str = "amazon_dataset_decompressed.tab"; + +/// Download the compressed Amazon dataset +fn download_amazon_dataset() -> std::io::Result<()> { + println!("Downloading Amazon dataset from {}", AMAZON_DATASET_URL); + + let response = reqwest::blocking::get(AMAZON_DATASET_URL) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + + let bytes = response.bytes() + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + + let mut file = File::create(AMAZON_DATASET_COMPRESSED)?; + file.write_all(&bytes)?; + + println!("Downloaded {} ({} bytes)", AMAZON_DATASET_COMPRESSED, bytes.len()); + Ok(()) +} + +/// Decompress the Amazon dataset using zstd +fn decompress_amazon_dataset() -> std::io::Result<()> { + println!("Decompressing {} to {}", AMAZON_DATASET_COMPRESSED, AMAZON_DATASET_DECOMPRESSED); + + let input_file = File::open(AMAZON_DATASET_COMPRESSED)?; + let output_file = File::create(AMAZON_DATASET_DECOMPRESSED)?; + + let mut decoder = zstd::stream::read::Decoder::new(BufReader::new(input_file))?; + let mut writer = BufWriter::new(output_file); + + std::io::copy(&mut decoder, &mut writer)?; + writer.flush()?; + + println!("Decompressed successfully"); + Ok(()) +} + +/// Ensure Amazon dataset is available (download and decompress if needed) +fn ensure_amazon_dataset() -> std::io::Result<()> { + // Check if decompressed file already exists + if Path::new(AMAZON_DATASET_DECOMPRESSED).exists() { + return Ok(()); + } + + // Check if compressed file exists + if !Path::new(AMAZON_DATASET_COMPRESSED).exists() { + download_amazon_dataset()?; + } + + // Decompress + decompress_amazon_dataset()?; + + Ok(()) +} + +// Helper function to load keys from Amazon dataset +fn load_keys(size: usize) -> Vec { + // Ensure dataset is available + ensure_amazon_dataset() + .expect("Failed to download/decompress Amazon dataset"); + + // Load the dataset + load_amazon_dataset(AMAZON_DATASET_DECOMPRESSED, Some(size)) + .expect("panic: could not load amazon dataset") +} + // generate query ranges for benchmarking fn generate_query_ranges(keys: &[Key], percent: f64, num_queries: usize) -> Vec<(Key, Key)> { let mut rng = rand::thread_rng(); @@ -40,7 +111,7 @@ fn generate_query_ranges(keys: &[Key], percent: f64, num_queries: usize) -> Vec< #[divan::bench(args = SIZES)] fn diva_construction(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); bencher.bench_local(|| { black_box(Diva::new_with_keys( @@ -53,7 +124,7 @@ fn diva_construction(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn diva_point_query(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let diva = Diva::new_with_keys(&keys, 1024, 0.01); // generate query keys (mix of existing and non-existing) @@ -79,7 +150,7 @@ fn diva_point_query(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn diva_range_query_small(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let diva = Diva::new_with_keys(&keys, 1024, 0.01); let query_ranges = generate_query_ranges(&keys, 0.01, 1000); @@ -93,7 +164,7 @@ fn diva_range_query_small(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn diva_range_query_medium(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let diva = Diva::new_with_keys(&keys, 1024, 0.01); let query_ranges = generate_query_ranges(&keys, 0.07, 1000); @@ -107,7 +178,7 @@ fn diva_range_query_medium(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn diva_range_query_large(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let diva = Diva::new_with_keys(&keys, 1024, 0.01); let query_ranges = generate_query_ranges(&keys, 0.4, 1000); @@ -121,7 +192,7 @@ fn diva_range_query_large(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn diva_insert(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let target_size = 1024; bencher @@ -159,7 +230,7 @@ fn diva_insert(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn diva_delete_infix(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let target_size = 1024; let mut sorted_keys = keys.clone(); @@ -192,7 +263,7 @@ fn diva_delete_infix(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn bloom_construction(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); bencher.bench_local(|| { black_box(BloomFilter::new_with_keys( @@ -204,7 +275,7 @@ fn bloom_construction(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn bloom_point_query(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let bloom = BloomFilter::new_with_keys(&keys, 0.01); let mut rng = rand::thread_rng(); @@ -229,7 +300,7 @@ fn bloom_point_query(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn bloom_range_query_small(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let bloom = BloomFilter::new_with_keys(&keys, 0.01); let query_ranges = generate_query_ranges(&keys, 0.01, 1000); @@ -243,7 +314,7 @@ fn bloom_range_query_small(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn bloom_range_query_medium(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let bloom = BloomFilter::new_with_keys(&keys, 0.01); let query_ranges = generate_query_ranges(&keys, 0.07, 1000); @@ -257,7 +328,7 @@ fn bloom_range_query_medium(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn bloom_range_query_large(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let bloom = BloomFilter::new_with_keys(&keys, 0.01); let query_ranges = generate_query_ranges(&keys, 0.4, 1000); @@ -275,7 +346,7 @@ fn bloom_range_query_large(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn grafite_construction(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); bencher.bench_local(|| { black_box(GrafiteFilter::new_with_keys( @@ -287,7 +358,7 @@ fn grafite_construction(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn grafite_point_query(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let grafite = GrafiteFilter::new_with_keys(&keys, 0.01); let mut rng = rand::thread_rng(); @@ -312,7 +383,7 @@ fn grafite_point_query(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn grafite_range_query_small(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let grafite = GrafiteFilter::new_with_keys(&keys, 0.01); let query_ranges = generate_query_ranges(&keys, 0.01, 1000); @@ -326,7 +397,7 @@ fn grafite_range_query_small(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn grafite_range_query_medium(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let grafite = GrafiteFilter::new_with_keys(&keys, 0.01); let query_ranges = generate_query_ranges(&keys, 0.07, 1000); @@ -340,7 +411,7 @@ fn grafite_range_query_medium(bencher: Bencher, size: usize) { #[divan::bench(args = SIZES)] fn grafite_range_query_large(bencher: Bencher, size: usize) { - let keys = generate_smooth_u64(Some(size)); + let keys = load_keys(size); let grafite = GrafiteFilter::new_with_keys(&keys, 0.01); let query_ranges = generate_query_ranges(&keys, 0.4, 1000); diff --git a/src/data_gen.rs b/src/data_gen.rs index e59877d..82cbfaf 100644 --- a/src/data_gen.rs +++ b/src/data_gen.rs @@ -1,6 +1,8 @@ use rand::Rng; use rand::thread_rng; use rand_distr::{Distribution, Normal, Uniform}; +use std::fs::File; +use std::io::{BufReader, Read}; // default = 64k keys const DEFAULT_COUNT: usize = 1 << 16; @@ -136,6 +138,34 @@ pub fn generate_smooth_u8(count: Option) -> Vec { generate_normal_u8(count, mean, std_dev) } +/// Load Amazon dataset from binary file +/// Format: Each entry is 8 bytes (u64 little-endian) +pub fn load_amazon_dataset(path: &str, count: Option) -> std::io::Result> { + let file = File::open(path)?; + let mut reader = BufReader::new(file); + let mut buffer = [0u8; 8]; + let mut keys = Vec::new(); + + let max_count = count.unwrap_or(usize::MAX); + + while keys.len() < max_count { + match reader.read_exact(&mut buffer) { + Ok(_) => { + let key = u64::from_le_bytes(buffer); + keys.push(key); + } + Err(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e), + } + } + + // Sort and deduplicate + keys.sort_unstable(); + keys.dedup(); + + Ok(keys) +} + #[cfg(test)] mod tests { use super::*;