Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 4 additions & 18 deletions datasketches/src/cpc/sketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ use crate::codec::assert::ensure_serial_version_is;
use crate::codec::assert::insufficient_data;
use crate::codec::family::Family;
use crate::common::NumStdDev;
use crate::common::canonical_double;
use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
use crate::cpc::DEFAULT_LG_K;
use crate::cpc::Flavor;
Expand All @@ -49,6 +48,7 @@ use crate::error::Error;
use crate::error::ErrorKind;
use crate::hash::DEFAULT_UPDATE_SEED;
use crate::hash::MurmurHash3X64128;
use crate::hash::SketchHashable;
use crate::hash::compute_seed_hash;

/// A Compressed Probabilistic Counting sketch.
Expand Down Expand Up @@ -170,12 +170,10 @@ impl CpcSketch {
self.num_coupons == 0
}

/// Update the sketch with a hashable value.
///
/// For `f32`/`f64` values, use `update_f32`/`update_f64` instead.
pub fn update<T: Hash>(&mut self, value: T) {
/// Update the sketch with a value that implements [`SketchHashable`].
pub fn update<T: SketchHashable>(&mut self, value: T) {
let mut hasher = MurmurHash3X64128::with_seed(self.seed);
value.hash(&mut hasher);
value.to_hashable().hash(&mut hasher);
let (h1, h2) = hasher.finish128();

let k = 1 << self.lg_k;
Expand All @@ -191,18 +189,6 @@ impl CpcSketch {
self.row_col_update(row_col);
}

/// Update the sketch with a f64 value.
pub fn update_f64(&mut self, value: f64) {
// Canonicalize double for compatibility with Java
let canonical = canonical_double(value);
self.update(canonical);
}

/// Update the sketch with a f32 value.
pub fn update_f32(&mut self, value: f32) {
self.update_f64(value as f64);
}

pub(super) fn flavor(&self) -> Flavor {
determine_flavor(self.lg_k, self.num_coupons)
}
Expand Down
6 changes: 3 additions & 3 deletions datasketches/src/cpc/union.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,11 @@ impl CpcUnion {
/// # use datasketches::cpc::CpcSketch;
///
/// let mut s1 = CpcSketch::new(12);
/// s1.update(&"apple");
/// s1.update("apple");
///
/// let mut s2 = CpcSketch::new(12);
/// s2.update(&"apple");
/// s2.update(&"banana");
/// s2.update("apple");
/// s2.update("banana");
///
/// let mut union = CpcUnion::new(12);
/// union.update(&s1);
Expand Down
4 changes: 4 additions & 0 deletions datasketches/src/hash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,14 @@
// specific language governing permissions and limitations
// under the License.

//! Shared hashing utilities.

mod murmurhash;
mod sketch_hashable;
mod xxhash;

pub(crate) use self::murmurhash::MurmurHash3X64128;
pub use self::sketch_hashable::SketchHashable;
pub(crate) use self::xxhash::XxHash64;

/// The seed 9001 used in the sketch update methods is a prime number that was chosen very early
Expand Down
170 changes: 170 additions & 0 deletions datasketches/src/hash/sketch_hashable.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::hash::Hash;

use crate::common::canonical_double;

// Sealing support for `SketchHashable`: `Sealed` is a supertrait of `SketchHashable`, and this
// module is not `pub`, so only types that are given a `Sealed` impl in this file can also
// implement `SketchHashable`.
mod private {
pub trait Sealed {}
}

/// A trait for customizing sketch update hash behavior.
///
/// Sketch update methods call [`SketchHashable::to_hashable`] on the input and hash the returned
/// view instead of hashing the input value directly.
///
/// The trait requires the `private::Sealed` supertrait, which lives in a non-`pub` module, so
/// the set of implementing types is fixed by this file.
pub trait SketchHashable: private::Sealed {
/// Returns a canonical hashable view for use by sketch update operations.
///
/// The returned value's [`Hash`] implementation determines what is fed to the sketch hasher.
fn to_hashable(&self) -> impl Hash;
}

/// Borrowed byte payload whose [`Hash`] implementation writes the bytes and nothing else.
///
/// The standard `Hash` implementations for byte-like types (`&str`, `String`, `&[u8]`,
/// `Vec<u8>`) are not plain byte writes: they go through `Hasher::write_*` helpers that add
/// structural framing, such as the slice length, to the hash stream. That framing is the right
/// choice for general-purpose Rust collections, but it is not what DataSketches update hashing
/// expects.
///
/// The Java and C++ DataSketches implementations feed only the UTF-8 / byte payload of a string
/// or byte input into the sketch hash function, without any Rust-specific length marker.
/// Wrapping the payload in `RawBytes` reproduces that "raw bytes only" contract so that hashes
/// stay compatible across languages.
struct RawBytes<'a>(&'a [u8]);

impl Hash for RawBytes<'_> {
    /// Forwards the wrapped bytes to `state` in a single `write` call.
    fn hash<H>(&self, state: &mut H)
    where
        H: std::hash::Hasher,
    {
        let RawBytes(payload) = *self;
        state.write(payload);
    }
}

/// Implements `Sealed` and [`SketchHashable`] for integer types that are hashed as an `i64`.
///
/// Each entry has the form `$ty => $via`: the value is first cast to `$via` and the result is
/// then cast to `i64`, which becomes the hashable view. Choosing a signed `$via` of the same
/// width for an unsigned `$ty` reinterprets the value as signed before it is widened.
macro_rules! impl_sketch_hashable_via_i64 {
    ($($ty:ty => $via:ty),* $(,)?) => {
        $(
            impl private::Sealed for $ty {}

            impl SketchHashable for $ty {
                fn to_hashable(&self) -> impl Hash {
                    let intermediate: $via = *self as $via;
                    intermediate as i64
                }
            }
        )*
    };
}

/// Implements `Sealed` and [`SketchHashable`] for types that are hashed as themselves.
///
/// The hashable view is a copy of the value, so the type's own [`Hash`] implementation decides
/// what reaches the hasher. Every listed type must therefore be `Copy + Hash`.
macro_rules! impl_sketch_hashable_passthrough {
    ($($ty:ty),* $(,)?) => {
        $(
            impl private::Sealed for $ty {}

            impl SketchHashable for $ty {
                fn to_hashable(&self) -> impl Hash {
                    *self
                }
            }
        )*
    };
}

// Integer types whose hashable view is an `i64`.
//
// Signed types are cast straight to `i64`. The unsigned `u8`/`u16`/`u32` are first cast to the
// signed type of the same width, so values above the signed maximum become negative before
// being widened (for example `200u8` is hashed as `-56i64`).
impl_sketch_hashable_via_i64!(
i8 => i64,
i16 => i64,
i32 => i64,
i64 => i64,
isize => i64,
u8 => i8,
u16 => i16,
u32 => i32,
);

// Types hashed through their own `Hash` implementation, unchanged.
// NOTE(review): `usize` is passed through here while `isize` is cast to `i64` above; presumably
// the hashed width of `usize` then follows the target's pointer width — confirm this is intended.
impl_sketch_hashable_passthrough!(bool, char, i128, u64, u128, usize);

impl private::Sealed for f64 {}

/// `f64` inputs are hashed as the value returned by `canonical_double`, not through a raw
/// bit pattern taken here.
impl SketchHashable for f64 {
    fn to_hashable(&self) -> impl Hash {
        let value: f64 = *self;
        canonical_double(value)
    }
}

impl private::Sealed for f32 {}

/// `f32` inputs are widened to `f64` and then hashed exactly like an `f64` input, so an `f32`
/// and the `f64` it converts to produce the same hashable view.
impl SketchHashable for f32 {
    fn to_hashable(&self) -> impl Hash {
        // `f32` -> `f64` is a lossless widening conversion.
        let widened = f64::from(*self);
        canonical_double(widened)
    }
}

impl private::Sealed for &str {}

/// String slices are hashed as their UTF-8 bytes only.
impl SketchHashable for &str {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self.as_bytes();
        RawBytes(payload)
    }
}

impl private::Sealed for String {}

/// Owned strings are hashed as their UTF-8 bytes only, the same as `&str`.
impl SketchHashable for String {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self.as_bytes();
        RawBytes(payload)
    }
}

impl private::Sealed for &String {}

/// Borrowed `String`s are hashed as their UTF-8 bytes only, the same as `&str`.
impl SketchHashable for &String {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self.as_bytes();
        RawBytes(payload)
    }
}

impl private::Sealed for &[u8] {}

/// Byte slices are hashed as their contents only.
impl SketchHashable for &[u8] {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self;
        RawBytes(payload)
    }
}

impl private::Sealed for Vec<u8> {}

/// Byte vectors are hashed as their contents only, the same as `&[u8]`.
impl SketchHashable for Vec<u8> {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self.as_slice();
        RawBytes(payload)
    }
}

impl private::Sealed for &Vec<u8> {}

/// Borrowed byte vectors are hashed as their contents only, the same as `&[u8]`.
impl SketchHashable for &Vec<u8> {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self.as_slice();
        RawBytes(payload)
    }
}

impl<const N: usize> private::Sealed for [u8; N] {}

/// Fixed-size byte arrays are hashed as their contents only, the same as `&[u8]`.
impl<const N: usize> SketchHashable for [u8; N] {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self.as_slice();
        RawBytes(payload)
    }
}

impl<const N: usize> private::Sealed for &[u8; N] {}

/// Borrowed fixed-size byte arrays are hashed as their contents only, the same as `&[u8]`.
impl<const N: usize> SketchHashable for &[u8; N] {
    fn to_hashable(&self) -> impl Hash {
        let payload: &[u8] = self.as_slice();
        RawBytes(payload)
    }
}
5 changes: 3 additions & 2 deletions datasketches/src/hll/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
use std::hash::Hash;

use crate::hash::MurmurHash3X64128;
use crate::hash::SketchHashable;

mod array4;
mod array6;
Expand Down Expand Up @@ -178,9 +179,9 @@ fn pack_coupon(slot: u32, value: u8) -> u32 {
}

/// Generate a coupon from a hashable value.
fn coupon<H: Hash>(v: H) -> u32 {
fn coupon<H: SketchHashable>(v: H) -> u32 {
let mut hasher = MurmurHash3X64128::default();
v.hash(&mut hasher);
v.to_hashable().hash(&mut hasher);
let (lo, hi) = hasher.finish128();

let addr26 = lo as u32 & KEY_MASK_26;
Expand Down
10 changes: 4 additions & 6 deletions datasketches/src/hll/sketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,13 @@
//! This module provides the main [`HllSketch`] struct, which is the primary interface
//! for creating and using HLL sketches for cardinality estimation.

use std::hash::Hash;

use crate::codec::SketchSlice;
use crate::codec::assert::ensure_serial_version_is;
use crate::codec::assert::insufficient_data;
use crate::codec::family::Family;
use crate::common::NumStdDev;
use crate::error::Error;
use crate::hash::SketchHashable;
use crate::hll::HllType;
use crate::hll::RESIZE_DENOMINATOR;
use crate::hll::RESIZE_NUMERATOR;
Expand Down Expand Up @@ -156,10 +155,9 @@ impl HllSketch {
self.lg_config_k
}

/// Update the sketch with a value
/// Update the sketch with a value that implements [`SketchHashable`].
///
/// This accepts any type that implements `Hash`. The value is hashed
/// and converted to a coupon, which is then inserted into the sketch.
/// The value is hashed and converted to a coupon, which is then inserted into the sketch.
///
/// # Examples
///
Expand All @@ -170,7 +168,7 @@ impl HllSketch {
/// sketch.update("apple");
/// assert!(sketch.estimate() >= 1.0);
/// ```
pub fn update<T: Hash>(&mut self, value: T) {
pub fn update<T: SketchHashable>(&mut self, value: T) {
let coupon = coupon(value);
self.update_with_coupon(coupon);
}
Expand Down
10 changes: 4 additions & 6 deletions datasketches/src/hll/union.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@
//! * Different modes (List, Set, Array4/6/8)
//! * Different target HLL types

use std::hash::Hash;

use crate::common::NumStdDev;
use crate::hash::SketchHashable;
use crate::hll::HllSketch;
use crate::hll::HllType;
use crate::hll::array4::Array4;
Expand Down Expand Up @@ -89,10 +88,9 @@ impl HllUnion {
Self { lg_max_k, gadget }
}

/// Update the union's gadget with a value
/// Update the union's gadget with a value that implements [`SketchHashable`].
///
/// This accepts any type that implements `Hash`. The value is hashed
/// and converted to a coupon, which is then inserted into the sketch.
/// The value is hashed and converted to a coupon, which is then inserted into the sketch.
///
/// # Examples
///
Expand All @@ -103,7 +101,7 @@ impl HllUnion {
/// union.update_value("apple");
/// let _result = union.to_sketch(HllType::Hll8);
/// ```
pub fn update_value<T: Hash>(&mut self, value: T) {
pub fn update_value<T: SketchHashable>(&mut self, value: T) {
self.gadget.update(value);
}

Expand Down
3 changes: 1 addition & 2 deletions datasketches/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ pub mod countmin;
pub mod cpc;
pub mod error;
pub mod frequencies;
pub mod hash;
pub mod hll;
pub mod tdigest;
pub mod theta;

mod hash;
Loading