Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.1.1"
description = "A library for performing Content-Defined Chunking (CDC) on data streams."
readme = "README.md"
license = "MIT"
edition = "2021"

authors = ["Vincent Cantin <vincent@404.taipei>"]
homepage = "https://github.com/green-coder/cdc"
Expand Down
42 changes: 27 additions & 15 deletions benches/benchmarks.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,34 @@
extern crate cdc;
extern crate criterion;

use cdc::{Rabin64, RollingHash64};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};

pub fn slide_benchmarks(c: &mut Criterion) {
for i in [1_000, 10_000, 100_000] {
c.bench_function(&format!("slide {}x", i), |b| {
let data: u8 = 16; //arbitrary value
b.iter(|| {
let mut rabin = Rabin64::new(5);
for _ in 0..i {
rabin.slide(&data)
}
})
fn slide_benchmarks(c: &mut Criterion) {
let mut group = c.benchmark_group("slide");
let data = 16;
for size in [1_000, 10_000, 100_000] {
group.throughput(Throughput::Bytes(size));
group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| {
b.iter_batched(
|| Rabin64::new(5),
|mut rabin| {
for _ in 0..size {
rabin.slide(black_box(&data));
}
},
criterion::BatchSize::SmallInput,
);
});
}
}

criterion_group!(benches, slide_benchmarks);
/// Benchmarks construction of a `Rabin64` hasher through both of its constructors.
///
/// Registers two benchmarks, `"new"` and `"with_polynom"`, each building a hasher
/// with a window-size exponent of 5 (i.e. a window of `1 << 5` bytes).
fn create_benchmarks(c: &mut Criterion) {
    // Constructor that relies on the crate's default modulus polynomial.
    c.bench_function("new", |bencher| bencher.iter(|| Rabin64::new(5)));

    // Constructor that takes a caller-supplied modulus polynomial.
    c.bench_function("with_polynom", |bencher| {
        bencher.iter(|| Rabin64::new_with_polynom(5, &0x3847fe406c36e1))
    });
}

criterion_group!(benches, slide_benchmarks, create_benchmarks);
criterion_main!(benches);
2 changes: 0 additions & 2 deletions examples/chunk.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
extern crate cdc;

use std::cmp::{max, min};
use std::fs::File;
use std::io;
Expand Down
2 changes: 0 additions & 2 deletions examples/separator.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
extern crate cdc;

use std::fs::File;
use std::io;
use std::io::prelude::*;
Expand Down
17 changes: 5 additions & 12 deletions examples/tree01.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
extern crate cdc;

use std::sync::atomic::{AtomicU32, Ordering};
use cdc::*;

type IntHash = u32;

static mut HASH_ID: IntHash = 0;
static HASH_ID: AtomicU32 = AtomicU32::new(0);
fn get_new_hash_id() -> IntHash {
unsafe {
let id = HASH_ID;
HASH_ID += 1;
id
}
HASH_ID.fetch_add(1, Ordering::Relaxed)
}

fn my_new_node(level: usize, children: &Vec<IntHash>) -> Node<IntHash> {
Node {
hash: get_new_hash_id(),
level: level,
level,
children: children.clone(),
}
}
Expand All @@ -29,9 +24,7 @@ fn main() {
level: *level,
});

unsafe {
HASH_ID = levels.len() as IntHash;
}
HASH_ID.store(levels.len() as _, Ordering::Relaxed);

for node in NodeIter::new(hashed_chunk_it, my_new_node, 0) {
println!("{:?}", node);
Expand Down
21 changes: 6 additions & 15 deletions examples/tree02.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
extern crate cdc;
extern crate ring;

#[macro_use]
extern crate arrayref;

Expand All @@ -18,10 +15,7 @@ pub struct DigestReader<R> {

impl<R: Read> DigestReader<R> {
pub fn new(inner: R, digest: digest::Context) -> DigestReader<R> {
DigestReader {
inner: inner,
digest: digest,
}
DigestReader { inner, digest }
}
}

Expand All @@ -48,11 +42,11 @@ fn new_hash_node(level: usize, children: &Vec<Hash256>) -> Node<Hash256> {
ctx.update(child);
}
let digest = ctx.finish();
let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone();
let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8];

Node {
hash: hash,
level: level,
hash,
level,
children: children.clone(),
}
}
Expand All @@ -79,15 +73,12 @@ fn chunk_file(path: &String) -> io::Result<()> {
digest_reader.digest.update(&[0u8]); // To mark that it is a chunk, not a node.
io::copy(&mut digest_reader, &mut io::sink()).unwrap();
let digest = digest_reader.digest.finish();
let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone();
let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8];

// Calculates the level of the separators.
let level = HashToLevel::custom_new(13, 3).to_level(chunk.separator_hash);

HashedChunk {
hash: hash,
level: level,
}
HashedChunk { hash, level }
});

// Builds a tree of hash nodes.
Expand Down
10 changes: 5 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ mod rolling_hash;
mod separator;
mod tree;

pub use chunk::{Chunk, ChunkIter};
pub use polynom::{Polynom, Polynom64};
pub use rolling_hash::{Rabin64, RollingHash64};
pub use separator::{HashToLevel, Separator, SeparatorIter};
pub use tree::{HashedChunk, Node, NodeIter};
pub use crate::chunk::{Chunk, ChunkIter};
pub use crate::polynom::{Polynom, Polynom64};
pub use crate::rolling_hash::{Rabin64, RollingHash64};
pub use crate::separator::{HashToLevel, Separator, SeparatorIter};
pub use crate::tree::{HashedChunk, Node, NodeIter};
84 changes: 42 additions & 42 deletions src/rolling_hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,23 @@ use super::{Polynom, Polynom64};

pub trait RollingHash64 {
fn reset(&mut self);
fn prefill_window<I>(&mut self, iter: &mut I) -> usize

/// Attempts to fill the window with up to `window_size - 1` bytes taken from `iter`.
fn prefill_window<I>(&mut self, iter: I) -> usize
where
I: Iterator<Item = u8>;
fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize

/// Combines `reset` and `prefill_window` into a single call.
///
/// This should have the same effect as calling reset() and prefill_window(),
/// but an implementation may be able to do so more efficiently.
fn reset_and_prefill_window<I>(&mut self, iter: I) -> usize
where
I: Iterator<Item = u8>;
I: Iterator<Item = u8>,
{
self.reset();
self.prefill_window(iter)
}
fn slide(&mut self, byte: &u8);
fn get_hash(&self) -> &Polynom64;
}
Expand All @@ -32,7 +43,7 @@ pub struct Rabin64 {
pub const MOD_POLYNOM: Polynom64 = 0x3DA3358B4DC173;

impl Rabin64 {
pub fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] {
fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] {
let mut out_table = [0; 256];
for (b, elem) in out_table.iter_mut().enumerate() {
let mut hash = (b as Polynom64).modulo(mod_polynom);
Expand All @@ -46,7 +57,7 @@ impl Rabin64 {
out_table
}

pub fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] {
fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] {
let mut mod_table = [0; 256];
let k = mod_polynom.degree();
for (b, elem) in mod_table.iter_mut().enumerate() {
Expand All @@ -57,11 +68,13 @@ impl Rabin64 {
mod_table
}

pub fn new(window_size_nb_bits: u32) -> Rabin64 {
pub fn new(window_size_nb_bits: u32) -> Self {
Self::new_with_polynom(window_size_nb_bits, &MOD_POLYNOM)
}

pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Rabin64 {
pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Self {
// We don't really want to allocate 4 GiB of memory for the window.
assert!(window_size_nb_bits < 32);
let window_size = 1 << window_size_nb_bits;

let window_data = vec![0; window_size];
Expand All @@ -83,68 +96,55 @@ impl Rabin64 {
for v in bytes {
self.hash <<= 8;
self.hash |= *v as Polynom64;
self.hash = self.hash.modulo(&mod_polynom);
self.hash = self.hash.modulo(mod_polynom);
}
}
}

impl RollingHash64 for Rabin64 {
fn reset(&mut self) {
self.window_data.clear();
self.window_data.resize(self.window_size, 0);
self.window_data.fill(0);
self.window_index = 0;
self.hash = 0;

// Not needed.
// self.slide(1);
}

// Attempts to fill the window with up to `window_size - 1` bytes.
fn prefill_window<I>(&mut self, iter: &mut I) -> usize
fn prefill_window<I>(&mut self, iter: I) -> usize
where
I: Iterator<Item = u8>,
{
let mut nb_bytes_read = 0;
for _ in 0..self.window_size - 1 {
match iter.next() {
Some(b) => {
self.slide(&b);
nb_bytes_read += 1;
}
None => break,
}
for byte in iter.take(self.window_size - 1) {
self.slide(&byte);
nb_bytes_read += 1;
}

nb_bytes_read
}

// Combines a reset with a prefill in an optimized way.
fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
fn reset_and_prefill_window<I>(&mut self, iter: I) -> usize
where
I: Iterator<Item = u8>,
{
self.hash = 0;
let mut nb_bytes_read = 0;
for _ in 0..self.window_size - 1 {
match iter.next() {
Some(b) => {
// Take the old value out of the window and the hash.
// ... let's suppose that the buffer contains zeroes, do nothing.

// Put the new value in the window and in the hash.
self.window_data[self.window_index] = b;
let mod_index = (self.hash >> self.polynom_shift) & 255;
self.hash <<= 8;
self.hash |= b as Polynom64;
self.hash ^= self.mod_table[mod_index as usize];

// Move the windowIndex to the next position.
self.window_index = (self.window_index + 1) & self.window_size_mask;

nb_bytes_read += 1;
}
None => break,
}
for b in iter.take(self.window_size - 1) {
// Take the old value out of the window and the hash.
// ... let's suppose that the buffer contains zeroes, do nothing.

// Put the new value in the window and in the hash.
self.window_data[self.window_index] = b;
let mod_index = (self.hash >> self.polynom_shift) & 255;
self.hash <<= 8;
self.hash |= b as Polynom64;
self.hash ^= self.mod_table[mod_index as usize];

// Move the windowIndex to the next position.
self.window_index = (self.window_index + 1) & self.window_size_mask;

nb_bytes_read += 1;
}

// Because we didn't overwrite that element in the loop above.
Expand Down
6 changes: 6 additions & 0 deletions src/separator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ impl HashToLevel {
}
}

impl Default for HashToLevel {
fn default() -> Self {
Self::new()
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
4 changes: 2 additions & 2 deletions src/tree.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/// Example of type to use with the generic structures below.
//pub type Hash256 = [u8; 256/8];
// Example of type to use with the generic structures below.
// pub type Hash256 = [u8; 256/8];

#[derive(Debug)]
pub struct HashedChunk<H> {
Expand Down