Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ grafite = "0.2.0"

[dev-dependencies]
divan = "0.1"
reqwest = { version = "0.12", features = ["blocking"] }
zstd = "0.13"

[[bench]]
name = "query_benchmarks"
Expand Down
107 changes: 89 additions & 18 deletions benches/query_benchmarks.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
use divan::{black_box, Bencher};
use range_filters::{
bloom_filter::BloomFilter,
data_gen::generate_smooth_u64,
data_gen::load_amazon_dataset,
diva::Diva,
grafite_filter::GrafiteFilter,
Key,
};
use rand::Rng;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::Path;

fn main() {
divan::main();
Expand All @@ -15,6 +18,74 @@ fn main() {
// const SIZES: &[usize] = &[10_000, 100_000, 1_000_000, 10_000_000];
const SIZES: &[usize] = &[10_000, 100_000, 1_000_000];

// Amazon dataset paths and URL
const AMAZON_DATASET_URL: &str = "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/SVN8PI";
const AMAZON_DATASET_COMPRESSED: &str = "amazon_dataset.tab";
const AMAZON_DATASET_DECOMPRESSED: &str = "amazon_dataset_decompressed.tab";

/// Download the zstd-compressed Amazon dataset to `AMAZON_DATASET_COMPRESSED`.
///
/// # Errors
/// Returns an `std::io::Error` when the request cannot be made, when the
/// server answers with a non-success HTTP status, or when the file cannot
/// be written. Checking the status matters: without it, a 404/500 error
/// page would be silently saved as the dataset file and decompression
/// would fail later with a confusing zstd error.
fn download_amazon_dataset() -> std::io::Result<()> {
    println!("Downloading Amazon dataset from {}", AMAZON_DATASET_URL);

    let response = reqwest::blocking::get(AMAZON_DATASET_URL)
        // Reject 4xx/5xx responses instead of writing the error page to disk.
        .and_then(|r| r.error_for_status())
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;

    let bytes = response
        .bytes()
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;

    let mut file = File::create(AMAZON_DATASET_COMPRESSED)?;
    file.write_all(&bytes)?;

    println!("Downloaded {} ({} bytes)", AMAZON_DATASET_COMPRESSED, bytes.len());
    Ok(())
}

/// Decompress the on-disk zstd archive `AMAZON_DATASET_COMPRESSED` into
/// `AMAZON_DATASET_DECOMPRESSED`, streaming through buffered I/O so the
/// whole dataset is never held in memory at once.
fn decompress_amazon_dataset() -> std::io::Result<()> {
    println!("Decompressing {} to {}", AMAZON_DATASET_COMPRESSED, AMAZON_DATASET_DECOMPRESSED);

    // Buffered reader feeding the zstd streaming decoder.
    let reader = BufReader::new(File::open(AMAZON_DATASET_COMPRESSED)?);
    let mut decoder = zstd::stream::read::Decoder::new(reader)?;

    // Buffered writer for the decompressed output; flushed explicitly so
    // write errors surface here rather than being swallowed on drop.
    let mut writer = BufWriter::new(File::create(AMAZON_DATASET_DECOMPRESSED)?);
    std::io::copy(&mut decoder, &mut writer)?;
    writer.flush()?;

    println!("Decompressed successfully");
    Ok(())
}

/// Make sure the decompressed Amazon dataset exists on disk, downloading
/// the compressed archive and/or decompressing it only when needed.
fn ensure_amazon_dataset() -> std::io::Result<()> {
    // Fast path: decompressed file already present, nothing to do.
    if Path::new(AMAZON_DATASET_DECOMPRESSED).exists() {
        return Ok(());
    }

    // Fetch the compressed archive first if it is missing.
    if !Path::new(AMAZON_DATASET_COMPRESSED).exists() {
        download_amazon_dataset()?;
    }

    // Produce the decompressed file from the archive.
    decompress_amazon_dataset()
}

/// Load up to `size` keys from the Amazon dataset, fetching and
/// decompressing it on demand the first time it is needed.
fn load_keys(size: usize) -> Vec<Key> {
    ensure_amazon_dataset().expect("Failed to download/decompress Amazon dataset");

    load_amazon_dataset(AMAZON_DATASET_DECOMPRESSED, Some(size))
        .expect("panic: could not load amazon dataset")
}

// generate query ranges for benchmarking
fn generate_query_ranges(keys: &[Key], percent: f64, num_queries: usize) -> Vec<(Key, Key)> {
let mut rng = rand::thread_rng();
Expand All @@ -40,7 +111,7 @@ fn generate_query_ranges(keys: &[Key], percent: f64, num_queries: usize) -> Vec<

#[divan::bench(args = SIZES)]
fn diva_construction(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);

bencher.bench_local(|| {
black_box(Diva::new_with_keys(
Expand All @@ -53,7 +124,7 @@ fn diva_construction(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn diva_point_query(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let diva = Diva::new_with_keys(&keys, 1024, 0.01);

// generate query keys (mix of existing and non-existing)
Expand All @@ -79,7 +150,7 @@ fn diva_point_query(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn diva_range_query_small(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let diva = Diva::new_with_keys(&keys, 1024, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.01, 1000);

Expand All @@ -93,7 +164,7 @@ fn diva_range_query_small(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn diva_range_query_medium(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let diva = Diva::new_with_keys(&keys, 1024, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.07, 1000);

Expand All @@ -107,7 +178,7 @@ fn diva_range_query_medium(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn diva_range_query_large(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let diva = Diva::new_with_keys(&keys, 1024, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.4, 1000);

Expand All @@ -121,7 +192,7 @@ fn diva_range_query_large(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn diva_insert(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let target_size = 1024;

bencher
Expand Down Expand Up @@ -159,7 +230,7 @@ fn diva_insert(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn diva_delete_infix(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let target_size = 1024;

let mut sorted_keys = keys.clone();
Expand Down Expand Up @@ -192,7 +263,7 @@ fn diva_delete_infix(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn bloom_construction(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);

bencher.bench_local(|| {
black_box(BloomFilter::new_with_keys(
Expand All @@ -204,7 +275,7 @@ fn bloom_construction(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn bloom_point_query(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let bloom = BloomFilter::new_with_keys(&keys, 0.01);

let mut rng = rand::thread_rng();
Expand All @@ -229,7 +300,7 @@ fn bloom_point_query(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn bloom_range_query_small(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let bloom = BloomFilter::new_with_keys(&keys, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.01, 1000);

Expand All @@ -243,7 +314,7 @@ fn bloom_range_query_small(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn bloom_range_query_medium(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let bloom = BloomFilter::new_with_keys(&keys, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.07, 1000);

Expand All @@ -257,7 +328,7 @@ fn bloom_range_query_medium(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn bloom_range_query_large(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let bloom = BloomFilter::new_with_keys(&keys, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.4, 1000);

Expand All @@ -275,7 +346,7 @@ fn bloom_range_query_large(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn grafite_construction(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);

bencher.bench_local(|| {
black_box(GrafiteFilter::new_with_keys(
Expand All @@ -287,7 +358,7 @@ fn grafite_construction(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn grafite_point_query(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let grafite = GrafiteFilter::new_with_keys(&keys, 0.01);

let mut rng = rand::thread_rng();
Expand All @@ -312,7 +383,7 @@ fn grafite_point_query(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn grafite_range_query_small(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let grafite = GrafiteFilter::new_with_keys(&keys, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.01, 1000);

Expand All @@ -326,7 +397,7 @@ fn grafite_range_query_small(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn grafite_range_query_medium(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let grafite = GrafiteFilter::new_with_keys(&keys, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.07, 1000);

Expand All @@ -340,7 +411,7 @@ fn grafite_range_query_medium(bencher: Bencher, size: usize) {

#[divan::bench(args = SIZES)]
fn grafite_range_query_large(bencher: Bencher, size: usize) {
let keys = generate_smooth_u64(Some(size));
let keys = load_keys(size);
let grafite = GrafiteFilter::new_with_keys(&keys, 0.01);
let query_ranges = generate_query_ranges(&keys, 0.4, 1000);

Expand Down
30 changes: 30 additions & 0 deletions src/data_gen.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use rand::Rng;
use rand::thread_rng;
use rand_distr::{Distribution, Normal, Uniform};
use std::fs::File;
use std::io::{BufReader, Read};

// default = 64k keys
const DEFAULT_COUNT: usize = 1 << 16;
Expand Down Expand Up @@ -136,6 +138,34 @@ pub fn generate_smooth_u8(count: Option<usize>) -> Vec<u8> {
generate_normal_u8(count, mean, std_dev)
}

/// Load the Amazon dataset from a binary file.
///
/// Format: each entry is 8 bytes (u64, little-endian). At most `count`
/// entries are read from the front of the file (all entries when `count`
/// is `None`); a trailing partial entry is ignored. The returned keys are
/// sorted and deduplicated, so the result may be shorter than the number
/// of entries read.
///
/// # Errors
/// Returns an `std::io::Error` if the file cannot be opened, its metadata
/// cannot be read, or a read fails mid-stream.
pub fn load_amazon_dataset(path: &str, count: Option<usize>) -> std::io::Result<Vec<u64>> {
    let file = File::open(path)?;
    let max_count = count.unwrap_or(usize::MAX);

    // Preallocate from the file size so the Vec never has to regrow while
    // reading (each entry is exactly 8 bytes).
    let entries_in_file = (file.metadata()?.len() / 8) as usize;
    let mut keys = Vec::with_capacity(entries_in_file.min(max_count));

    let mut reader = BufReader::new(file);
    let mut buffer = [0u8; 8];

    while keys.len() < max_count {
        match reader.read_exact(&mut buffer) {
            Ok(()) => keys.push(u64::from_le_bytes(buffer)),
            // Clean EOF (or a trailing partial entry) simply ends the read.
            Err(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
            Err(e) => return Err(e),
        }
    }

    // Sort and deduplicate so downstream filters see unique, ordered keys.
    keys.sort_unstable();
    keys.dedup();

    Ok(keys)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down