green-coder · Dr-Emann · Jun 2, 2022 · Jun 2, 2022 · Jun 2, 2022 · Jun 2, 2022
diff --git a/Cargo.toml b/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.1.1"
 description = "A library for performing Content-Defined Chunking (CDC) on data streams."
 readme = "README.md"
 license = "MIT"
+edition = "2021"
 
 authors = ["Vincent Cantin <vincent@404.taipei>"]
 homepage = "https://github.com/green-coder/cdc"

diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
@@ -1,22 +1,34 @@
-extern crate cdc;
-extern crate criterion;
-
 use cdc::{Rabin64, RollingHash64};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 
-pub fn slide_benchmarks(c: &mut Criterion) {
-    for i in [1_000, 10_000, 100_000] {
-        c.bench_function(&format!("slide {}x", i), |b| {
-            let data: u8 = 16; //arbitrary value
-            b.iter(|| {
-                let mut rabin = Rabin64::new(5);
-                for _ in 0..i {
-                    rabin.slide(&data)
-                }
-            })
+fn slide_benchmarks(c: &mut Criterion) {
+    let mut group = c.benchmark_group("slide");
+    let data = 16;
+    for size in [1_000, 10_000, 100_000] {
+        group.throughput(Throughput::Bytes(size));
+        group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| {
+            b.iter_batched(
+                || Rabin64::new(5),
+                |mut rabin| {
+                    for _ in 0..size {
+                        rabin.slide(black_box(&data));
+                    }
+                },
+                criterion::BatchSize::SmallInput,
+            );
         });
     }
 }
 
-criterion_group!(benches, slide_benchmarks);
+fn create_benchmarks(c: &mut Criterion) {
+    c.bench_function("new", |b| {
+        b.iter(|| Rabin64::new(5));
+    });
+
+    c.bench_function("with_polynom", |b| {
+        b.iter(|| Rabin64::new_with_polynom(5, &0x3847fe406c36e1));
+    });
+}
+
+criterion_group!(benches, slide_benchmarks, create_benchmarks);
 criterion_main!(benches);
diff --git a/examples/chunk.rs b/examples/chunk.rs
@@ -1,5 +1,3 @@
-extern crate cdc;
-
 use std::cmp::{max, min};
 use std::fs::File;
 use std::io;

diff --git a/examples/separator.rs b/examples/separator.rs
@@ -1,5 +1,3 @@
-extern crate cdc;
-
 use std::fs::File;
 use std::io;
 use std::io::prelude::*;

diff --git a/examples/tree01.rs b/examples/tree01.rs
@@ -1,22 +1,17 @@
-extern crate cdc;
-
+use std::sync::atomic::{AtomicU32, Ordering};
 use cdc::*;
 
 type IntHash = u32;
 
-static mut HASH_ID: IntHash = 0;
+static HASH_ID: AtomicU32 = AtomicU32::new(0);
 fn get_new_hash_id() -> IntHash {
-    unsafe {
-        let id = HASH_ID;
-        HASH_ID += 1;
-        id
-    }
+    HASH_ID.fetch_add(1, Ordering::Relaxed)
 }
 
 fn my_new_node(level: usize, children: &Vec<IntHash>) -> Node<IntHash> {
     Node {
         hash: get_new_hash_id(),
-        level: level,
+        level,
         children: children.clone(),
     }
 }
@@ -29,9 +24,7 @@ fn main() {
         level: *level,
     });
 
-    unsafe {
-        HASH_ID = levels.len() as IntHash;
-    }
+    HASH_ID.store(levels.len() as _, Ordering::Relaxed);
 
     for node in NodeIter::new(hashed_chunk_it, my_new_node, 0) {
         println!("{:?}", node);

diff --git a/examples/tree02.rs b/examples/tree02.rs
@@ -1,6 +1,3 @@
-extern crate cdc;
-extern crate ring;
-
 #[macro_use]
 extern crate arrayref;
 
@@ -18,10 +15,7 @@ pub struct DigestReader<R> {
 
 impl<R: Read> DigestReader<R> {
     pub fn new(inner: R, digest: digest::Context) -> DigestReader<R> {
-        DigestReader {
-            inner: inner,
-            digest: digest,
-        }
+        DigestReader { inner, digest }
     }
 }
 
@@ -48,11 +42,11 @@ fn new_hash_node(level: usize, children: &Vec<Hash256>) -> Node<Hash256> {
         ctx.update(child);
     }
     let digest = ctx.finish();
-    let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone();
+    let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8];
 
     Node {
-        hash: hash,
-        level: level,
+        hash,
+        level,
         children: children.clone(),
     }
 }
@@ -79,15 +73,12 @@ fn chunk_file(path: &String) -> io::Result<()> {
         digest_reader.digest.update(&[0u8]); // To mark that it is a chunk, not a node.
         io::copy(&mut digest_reader, &mut io::sink()).unwrap();
         let digest = digest_reader.digest.finish();
-        let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone();
+        let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8];
 
         // Calculates the level of the separators.
         let level = HashToLevel::custom_new(13, 3).to_level(chunk.separator_hash);
 
-        HashedChunk {
-            hash: hash,
-            level: level,
-        }
+        HashedChunk { hash, level }
     });
 
     // Builds a tree of hash nodes.

diff --git a/src/lib.rs b/src/lib.rs
@@ -4,8 +4,8 @@ mod rolling_hash;
 mod separator;
 mod tree;
 
-pub use chunk::{Chunk, ChunkIter};
-pub use polynom::{Polynom, Polynom64};
-pub use rolling_hash::{Rabin64, RollingHash64};
-pub use separator::{HashToLevel, Separator, SeparatorIter};
-pub use tree::{HashedChunk, Node, NodeIter};
+pub use crate::chunk::{Chunk, ChunkIter};
+pub use crate::polynom::{Polynom, Polynom64};
+pub use crate::rolling_hash::{Rabin64, RollingHash64};
+pub use crate::separator::{HashToLevel, Separator, SeparatorIter};
+pub use crate::tree::{HashedChunk, Node, NodeIter};
diff --git a/src/rolling_hash.rs b/src/rolling_hash.rs
@@ -2,12 +2,23 @@ use super::{Polynom, Polynom64};
 
 pub trait RollingHash64 {
     fn reset(&mut self);
-    fn prefill_window<I>(&mut self, iter: &mut I) -> usize
+
+    /// Attempts to fill the window minus one byte; returns the number of bytes read.
+    fn prefill_window<I>(&mut self, iter: I) -> usize
     where
         I: Iterator<Item = u8>;
-    fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
+
+    /// Combines `reset()` and `prefill_window()` into a single call.
+    ///
+    /// This should have the same effect as calling reset() and prefill_window(),
+    /// but an implementation may be able to do so more efficiently.
+    fn reset_and_prefill_window<I>(&mut self, iter: I) -> usize
     where
-        I: Iterator<Item = u8>;
+        I: Iterator<Item = u8>,
+    {
+        self.reset();
+        self.prefill_window(iter)
+    }
     fn slide(&mut self, byte: &u8);
     fn get_hash(&self) -> &Polynom64;
 }
@@ -32,7 +43,7 @@ pub struct Rabin64 {
 pub const MOD_POLYNOM: Polynom64 = 0x3DA3358B4DC173;
 
 impl Rabin64 {
-    pub fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] {
+    fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] {
         let mut out_table = [0; 256];
         for (b, elem) in out_table.iter_mut().enumerate() {
             let mut hash = (b as Polynom64).modulo(mod_polynom);
@@ -46,7 +57,7 @@ impl Rabin64 {
         out_table
     }
 
-    pub fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] {
+    fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] {
         let mut mod_table = [0; 256];
         let k = mod_polynom.degree();
         for (b, elem) in mod_table.iter_mut().enumerate() {
@@ -57,11 +68,13 @@ impl Rabin64 {
         mod_table
     }
 
-    pub fn new(window_size_nb_bits: u32) -> Rabin64 {
+    pub fn new(window_size_nb_bits: u32) -> Self {
         Self::new_with_polynom(window_size_nb_bits, &MOD_POLYNOM)
     }
 
-    pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Rabin64 {
+    pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Self {
+        // Keep the window below 2^32 bytes: we don't want to allocate 4 GiB or more for it.
+        assert!(window_size_nb_bits < 32);
         let window_size = 1 << window_size_nb_bits;
 
         let window_data = vec![0; window_size];
@@ -83,68 +96,55 @@ impl Rabin64 {
         for v in bytes {
             self.hash <<= 8;
             self.hash |= *v as Polynom64;
-            self.hash = self.hash.modulo(&mod_polynom);
+            self.hash = self.hash.modulo(mod_polynom);
         }
     }
 }
 
 impl RollingHash64 for Rabin64 {
     fn reset(&mut self) {
-        self.window_data.clear();
-        self.window_data.resize(self.window_size, 0);
+        self.window_data.fill(0);
         self.window_index = 0;
         self.hash = 0;
 
         // Not needed.
         // self.slide(1);
     }
 
-    // Attempt to fills the window - 1 byte.
-    fn prefill_window<I>(&mut self, iter: &mut I) -> usize
+    fn prefill_window<I>(&mut self, iter: I) -> usize
     where
         I: Iterator<Item = u8>,
     {
         let mut nb_bytes_read = 0;
-        for _ in 0..self.window_size - 1 {
-            match iter.next() {
-                Some(b) => {
-                    self.slide(&b);
-                    nb_bytes_read += 1;
-                }
-                None => break,
-            }
+        for byte in iter.take(self.window_size - 1) {
+            self.slide(&byte);
+            nb_bytes_read += 1;
         }
 
         nb_bytes_read
     }
 
-    // Combines a reset with a prefill in an optimized way.
-    fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
+    fn reset_and_prefill_window<I>(&mut self, iter: I) -> usize
     where
         I: Iterator<Item = u8>,
     {
         self.hash = 0;
         let mut nb_bytes_read = 0;
-        for _ in 0..self.window_size - 1 {
-            match iter.next() {
-                Some(b) => {
-                    // Take the old value out of the window and the hash.
-                    // ... let's suppose that the buffer contains zeroes, do nothing.
-
-                    // Put the new value in the window and in the hash.
-                    self.window_data[self.window_index] = b;
-                    let mod_index = (self.hash >> self.polynom_shift) & 255;
-                    self.hash <<= 8;
-                    self.hash |= b as Polynom64;
-                    self.hash ^= self.mod_table[mod_index as usize];
-
-                    // Move the windowIndex to the next position.
-                    self.window_index = (self.window_index + 1) & self.window_size_mask;
-
-                    nb_bytes_read += 1;
-                }
-                None => break,
-            }
+        for b in iter.take(self.window_size - 1) {
+            // Take the old value out of the window and the hash.
+            // ... let's suppose that the buffer contains zeroes, do nothing.
+
+            // Put the new value in the window and in the hash.
+            self.window_data[self.window_index] = b;
+            let mod_index = (self.hash >> self.polynom_shift) & 255;
+            self.hash <<= 8;
+            self.hash |= b as Polynom64;
+            self.hash ^= self.mod_table[mod_index as usize];
+
+            // Move the windowIndex to the next position.
+            self.window_index = (self.window_index + 1) & self.window_size_mask;
+
+            nb_bytes_read += 1;
         }
 
         // Because we didn't overwrite that element in the loop above.

diff --git a/src/separator.rs b/src/separator.rs
@@ -113,6 +113,12 @@ impl HashToLevel {
     }
 }
 
+impl Default for HashToLevel {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

diff --git a/src/tree.rs b/src/tree.rs
@@ -1,5 +1,5 @@
-/// Example of type to use with the generic structures below.
-//pub type Hash256 = [u8; 256/8];
+// Example of type to use with the generic structures below.
+// pub type Hash256 = [u8; 256/8];
 
 #[derive(Debug)]
 pub struct HashedChunk<H> {