From 176d4270b1bbec4281b4974a01f3f72f56ef74fa Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 14:41:08 -0400 Subject: [PATCH 01/11] Changes as suggested by clippy --- benches/benchmarks.rs | 2 +- examples/tree01.rs | 2 +- examples/tree02.rs | 16 ++++++++-------- src/rolling_hash.rs | 2 +- src/separator.rs | 6 ++++++ 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index a4c8981..ab05678 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -2,7 +2,7 @@ extern crate cdc; extern crate criterion; use cdc::{Rabin64, RollingHash64}; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{criterion_group, criterion_main, Criterion}; pub fn slide_benchmarks(c: &mut Criterion) { for i in [1_000, 10_000, 100_000] { diff --git a/examples/tree01.rs b/examples/tree01.rs index defb5bb..37e560f 100644 --- a/examples/tree01.rs +++ b/examples/tree01.rs @@ -16,7 +16,7 @@ fn get_new_hash_id() -> IntHash { fn my_new_node(level: usize, children: &Vec) -> Node { Node { hash: get_new_hash_id(), - level: level, + level, children: children.clone(), } } diff --git a/examples/tree02.rs b/examples/tree02.rs index f382232..f10fbfd 100644 --- a/examples/tree02.rs +++ b/examples/tree02.rs @@ -19,8 +19,8 @@ pub struct DigestReader { impl DigestReader { pub fn new(inner: R, digest: digest::Context) -> DigestReader { DigestReader { - inner: inner, - digest: digest, + inner, + digest, } } } @@ -48,11 +48,11 @@ fn new_hash_node(level: usize, children: &Vec) -> Node { ctx.update(child); } let digest = ctx.finish(); - let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone(); + let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8]; Node { - hash: hash, - level: level, + hash, + level, children: children.clone(), } } @@ -79,14 +79,14 @@ fn chunk_file(path: &String) -> io::Result<()> { digest_reader.digest.update(&[0u8]); // To mark that it is a chunk, not a node. io::copy(&mut digest_reader, &mut io::sink()).unwrap(); let digest = digest_reader.digest.finish(); - let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone(); + let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8]; // Calculates the level of the separators. let level = HashToLevel::custom_new(13, 3).to_level(chunk.separator_hash); HashedChunk { - hash: hash, - level: level, + hash, + level, } }); diff --git a/src/rolling_hash.rs b/src/rolling_hash.rs index ca5fa08..84129ac 100644 --- a/src/rolling_hash.rs +++ b/src/rolling_hash.rs @@ -83,7 +83,7 @@ impl Rabin64 { for v in bytes { self.hash <<= 8; self.hash |= *v as Polynom64; - self.hash = self.hash.modulo(&mod_polynom); + self.hash = self.hash.modulo(mod_polynom); } } } diff --git a/src/separator.rs b/src/separator.rs index e6f9a97..5c5ecf6 100644 --- a/src/separator.rs +++ b/src/separator.rs @@ -113,6 +113,12 @@ impl HashToLevel { } } +impl Default for HashToLevel { + fn default() -> Self { + Self::new() + } +} + #[cfg(test)] mod tests { use super::*; From 6d496fa8dd62010e3060b360b39fe5b3f36c4691 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 14:41:43 -0400 Subject: [PATCH 02/11] Prep for moving to new edition --- src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 8c376b3..a22a5ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,8 +4,8 @@ mod rolling_hash; mod separator; mod tree; -pub use chunk::{Chunk, ChunkIter}; -pub use polynom::{Polynom, Polynom64}; -pub use rolling_hash::{Rabin64, RollingHash64}; -pub use separator::{HashToLevel, Separator, SeparatorIter}; -pub use tree::{HashedChunk, Node, NodeIter}; +pub use crate::chunk::{Chunk, ChunkIter}; +pub use crate::polynom::{Polynom, Polynom64}; +pub use crate::rolling_hash::{Rabin64, RollingHash64}; +pub use crate::separator::{HashToLevel, Separator, SeparatorIter}; +pub use crate::tree::{HashedChunk, Node, NodeIter}; From ef5dfaac2a2c75326af502db47779b0cce1eff49 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 14:42:36 -0400 Subject: [PATCH 03/11] Move to new edition --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 309eed4..f097104 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.1" description = "A library for performing Content-Defined Chunking (CDC) on data streams." readme = "README.md" license = "MIT" +edition = "2018" authors = ["Vincent Cantin "] homepage = "https://github.com/green-coder/cdc" From 075681bca002f28bbe994f5efa80da715ed30f36 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 14:43:14 -0400 Subject: [PATCH 04/11] No need for extern crate anymore --- benches/benchmarks.rs | 4 ++-- examples/chunk.rs | 2 +- examples/separator.rs | 2 +- examples/tree01.rs | 2 +- examples/tree02.rs | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index ab05678..ea25989 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -1,5 +1,5 @@ -extern crate cdc; -extern crate criterion; + + use cdc::{Rabin64, RollingHash64}; use criterion::{criterion_group, criterion_main, Criterion}; diff --git a/examples/chunk.rs b/examples/chunk.rs index 41edfef..904854a 100644 --- a/examples/chunk.rs +++ b/examples/chunk.rs @@ -1,4 +1,4 @@ -extern crate cdc; + use std::cmp::{max, min}; use std::fs::File; diff --git a/examples/separator.rs b/examples/separator.rs index 8d6b54b..87ea759 100644 --- a/examples/separator.rs +++ b/examples/separator.rs @@ -1,4 +1,4 @@ -extern crate cdc; + use std::fs::File; use std::io; diff --git a/examples/tree01.rs b/examples/tree01.rs index 37e560f..88651cf 100644 --- a/examples/tree01.rs +++ b/examples/tree01.rs @@ -1,4 +1,4 @@ -extern crate cdc; + use cdc::*; diff --git a/examples/tree02.rs b/examples/tree02.rs index f10fbfd..36e2c3d 100644 --- a/examples/tree02.rs +++ b/examples/tree02.rs @@ -1,5 +1,5 @@ -extern crate cdc; -extern crate ring; + + #[macro_use] extern crate arrayref; From 5cced81ca17cf08c7bc3982d51cb807e17f9e666 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 14:44:03 -0400 Subject: [PATCH 05/11] Move to edition 2021 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f097104..46c38a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.1" description = "A library for performing Content-Defined Chunking (CDC) on data streams." readme = "README.md" license = "MIT" -edition = "2018" +edition = "2021" authors = ["Vincent Cantin "] homepage = "https://github.com/green-coder/cdc" From 7dcdeaa1002dd302025d4f108a34f94adc8dc83e Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 14:57:57 -0400 Subject: [PATCH 06/11] Cargo fmt --- benches/benchmarks.rs | 3 --- examples/chunk.rs | 2 -- examples/separator.rs | 2 -- examples/tree01.rs | 2 -- examples/tree02.rs | 13 ++----------- 5 files changed, 2 insertions(+), 20 deletions(-) diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index ea25989..8e2cd73 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -1,6 +1,3 @@ - - - use cdc::{Rabin64, RollingHash64}; use criterion::{criterion_group, criterion_main, Criterion}; diff --git a/examples/chunk.rs b/examples/chunk.rs index 904854a..69aa083 100644 --- a/examples/chunk.rs +++ b/examples/chunk.rs @@ -1,5 +1,3 @@ - - use std::cmp::{max, min}; use std::fs::File; use std::io; diff --git a/examples/separator.rs b/examples/separator.rs index 87ea759..38803aa 100644 --- a/examples/separator.rs +++ b/examples/separator.rs @@ -1,5 +1,3 @@ - - use std::fs::File; use std::io; use std::io::prelude::*; diff --git a/examples/tree01.rs b/examples/tree01.rs index 88651cf..1cdbfbe 100644 --- a/examples/tree01.rs +++ b/examples/tree01.rs @@ -1,5 +1,3 @@ - - use cdc::*; type IntHash = u32; diff --git a/examples/tree02.rs b/examples/tree02.rs index 36e2c3d..7c056a9 100644 --- a/examples/tree02.rs +++ b/examples/tree02.rs @@ -1,6 +1,3 @@ - - - #[macro_use] extern crate arrayref; @@ -18,10 +15,7 @@ pub struct DigestReader { impl DigestReader { pub fn new(inner: R, digest: digest::Context) -> DigestReader { - DigestReader { - inner, - digest, - } + DigestReader { inner, digest } } } @@ -84,10 +78,7 @@ fn chunk_file(path: &String) -> io::Result<()> { // Calculates the level of the separators. let level = HashToLevel::custom_new(13, 3).to_level(chunk.separator_hash); - HashedChunk { - hash, - level, - } + HashedChunk { hash, level } }); // Builds a tree of hash nodes. From 0882c95292ca97bf32ba17423667a5bffffca913 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 15:06:52 -0400 Subject: [PATCH 07/11] Minor misc changes --- src/rolling_hash.rs | 82 ++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/src/rolling_hash.rs b/src/rolling_hash.rs index 84129ac..4c80299 100644 --- a/src/rolling_hash.rs +++ b/src/rolling_hash.rs @@ -2,12 +2,23 @@ use super::{Polynom, Polynom64}; pub trait RollingHash64 { fn reset(&mut self); - fn prefill_window(&mut self, iter: &mut I) -> usize + + /// Attempt to fills the window - 1 byte. + fn prefill_window(&mut self, iter: I) -> usize where I: Iterator; - fn reset_and_prefill_window(&mut self, iter: &mut I) -> usize + + /// Combine a reset, and prefill_window + /// + /// This should have the same effect as calling reset() and prefill_window(), + /// but an implementation may be able to do so more efficiently. + fn reset_and_prefill_window(&mut self, iter: I) -> usize where - I: Iterator; + I: Iterator, + { + self.reset(); + self.prefill_window(iter) + } fn slide(&mut self, byte: &u8); fn get_hash(&self) -> &Polynom64; } @@ -32,7 +43,7 @@ pub struct Rabin64 { pub const MOD_POLYNOM: Polynom64 = 0x3DA3358B4DC173; impl Rabin64 { - pub fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] { + fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] { let mut out_table = [0; 256]; for (b, elem) in out_table.iter_mut().enumerate() { let mut hash = (b as Polynom64).modulo(mod_polynom); @@ -46,7 +57,7 @@ impl Rabin64 { out_table } - pub fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] { + fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] { let mut mod_table = [0; 256]; let k = mod_polynom.degree(); for (b, elem) in mod_table.iter_mut().enumerate() { @@ -57,11 +68,13 @@ impl Rabin64 { mod_table } - pub fn new(window_size_nb_bits: u32) -> Rabin64 { + pub fn new(window_size_nb_bits: u32) -> Self { Self::new_with_polynom(window_size_nb_bits, &MOD_POLYNOM) } - pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Rabin64 { + pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Self { + // We don't really want to allocate 4 GiB of memory for the window. + assert!(window_size_nb_bits < 32); let window_size = 1 << window_size_nb_bits; let window_data = vec![0; window_size]; @@ -90,8 +103,7 @@ impl Rabin64 { impl RollingHash64 for Rabin64 { fn reset(&mut self) { - self.window_data.clear(); - self.window_data.resize(self.window_size, 0); + self.window_data.fill(0); self.window_index = 0; self.hash = 0; @@ -99,52 +111,40 @@ impl RollingHash64 for Rabin64 { // self.slide(1); } - // Attempt to fills the window - 1 byte. - fn prefill_window(&mut self, iter: &mut I) -> usize + fn prefill_window(&mut self, iter: I) -> usize where I: Iterator, { let mut nb_bytes_read = 0; - for _ in 0..self.window_size - 1 { - match iter.next() { - Some(b) => { - self.slide(&b); - nb_bytes_read += 1; - } - None => break, - } + for byte in iter.take(self.window_size - 1) { + self.slide(&byte); + nb_bytes_read += 1; } nb_bytes_read } - // Combines a reset with a prefill in an optimized way. - fn reset_and_prefill_window(&mut self, iter: &mut I) -> usize + fn reset_and_prefill_window(&mut self, iter: I) -> usize where I: Iterator, { self.hash = 0; let mut nb_bytes_read = 0; - for _ in 0..self.window_size - 1 { - match iter.next() { - Some(b) => { - // Take the old value out of the window and the hash. - // ... let's suppose that the buffer contains zeroes, do nothing. - - // Put the new value in the window and in the hash. - self.window_data[self.window_index] = b; - let mod_index = (self.hash >> self.polynom_shift) & 255; - self.hash <<= 8; - self.hash |= b as Polynom64; - self.hash ^= self.mod_table[mod_index as usize]; - - // Move the windowIndex to the next position. - self.window_index = (self.window_index + 1) & self.window_size_mask; - - nb_bytes_read += 1; - } - None => break, - } + for b in iter.take(self.window_size - 1) { + // Take the old value out of the window and the hash. + // ... let's suppose that the buffer contains zeroes, do nothing. + + // Put the new value in the window and in the hash. + self.window_data[self.window_index] = b; + let mod_index = (self.hash >> self.polynom_shift) & 255; + self.hash <<= 8; + self.hash |= b as Polynom64; + self.hash ^= self.mod_table[mod_index as usize]; + + // Move the windowIndex to the next position. + self.window_index = (self.window_index + 1) & self.window_size_mask; + + nb_bytes_read += 1; } // Because we didn't overwrite that element in the loop above. From 5306bce182de43869fa7bc4151996fb211102159 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 16:08:29 -0400 Subject: [PATCH 08/11] Group benchmarks --- benches/benchmarks.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index 8e2cd73..6947652 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -1,16 +1,21 @@ use cdc::{Rabin64, RollingHash64}; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId, black_box, Throughput}; pub fn slide_benchmarks(c: &mut Criterion) { - for i in [1_000, 10_000, 100_000] { - c.bench_function(&format!("slide {}x", i), |b| { - let data: u8 = 16; //arbitrary value - b.iter(|| { - let mut rabin = Rabin64::new(5); - for _ in 0..i { - rabin.slide(&data) - } - }) + let mut group = c.benchmark_group("slide"); + let data = 16; + for size in [1_000, 10_000, 100_000] { + group.throughput(Throughput::Bytes(size)); + group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| { + b.iter_batched( + || Rabin64::new(5), + |mut rabin| { + for _ in 0..size { + rabin.slide(black_box(&data)); + } + }, + criterion::BatchSize::SmallInput, + ); }); } } From 773a4ae0102602f67db463adec408f1985b5f257 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 16:10:51 -0400 Subject: [PATCH 09/11] Fix comments at the top of tree.rs --- src/tree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree.rs b/src/tree.rs index 8db9006..e98c351 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -1,5 +1,5 @@ -/// Example of type to use with the generic structures below. -//pub type Hash256 = [u8; 256/8]; +// Example of type to use with the generic structures below. +// pub type Hash256 = [u8; 256/8]; #[derive(Debug)] pub struct HashedChunk { From 5d85ad4ba6e4feab66f1d91e2427f22534a30195 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Thu, 2 Jun 2022 16:16:21 -0400 Subject: [PATCH 10/11] Remove unsafe in example --- examples/tree01.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/examples/tree01.rs b/examples/tree01.rs index 1cdbfbe..22ab50a 100644 --- a/examples/tree01.rs +++ b/examples/tree01.rs @@ -1,14 +1,11 @@ +use std::sync::atomic::{AtomicU32, Ordering}; use cdc::*; type IntHash = u32; -static mut HASH_ID: IntHash = 0; +static HASH_ID: AtomicU32 = AtomicU32::new(0); fn get_new_hash_id() -> IntHash { - unsafe { - let id = HASH_ID; - HASH_ID += 1; - id - } + HASH_ID.fetch_add(1, Ordering::Relaxed) } fn my_new_node(level: usize, children: &Vec) -> Node { @@ -27,9 +24,7 @@ fn main() { level: *level, }); - unsafe { - HASH_ID = levels.len() as IntHash; - } + HASH_ID.store(levels.len() as _, Ordering::Relaxed); for node in NodeIter::new(hashed_chunk_it, my_new_node, 0) { println!("{:?}", node); From c17568969a180541a805c12f474512a138d42972 Mon Sep 17 00:00:00 2001 From: Zachary Dremann Date: Tue, 5 Jul 2022 21:29:35 -0400 Subject: [PATCH 11/11] Add tests for creating a new Rabin64 --- benches/benchmarks.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index 6947652..4b05622 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -1,7 +1,7 @@ use cdc::{Rabin64, RollingHash64}; -use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId, black_box, Throughput}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -pub fn slide_benchmarks(c: &mut Criterion) { +fn slide_benchmarks(c: &mut Criterion) { let mut group = c.benchmark_group("slide"); let data = 16; for size in [1_000, 10_000, 100_000] { @@ -20,5 +20,15 @@ pub fn slide_benchmarks(c: &mut Criterion) { } } -criterion_group!(benches, slide_benchmarks); +fn create_benchmarks(c: &mut Criterion) { + c.bench_function("new", |b| { + b.iter(|| Rabin64::new(5)); + }); + + c.bench_function("with_polynom", |b| { + b.iter(|| Rabin64::new_with_polynom(5, &0x3847fe406c36e1)); + }); +} + +criterion_group!(benches, slide_benchmarks, create_benchmarks); criterion_main!(benches);