Skip to content

Commit 78cb18d

Browse files
feat: add MurmurHash2 hash() and unit+integration tests (#374)
* feat: add MurmurHash2 hash() implementation (#355) * feat: add MurmurHash2 hash() implementation test (#355) * feat: add seed‐variation, idempotent and large‐input tests (#355) * style: format with black --------- Co-authored-by: 盐粒 Yanli <yanli.yu@vesoft.com>
1 parent 1f71404 commit 78cb18d

File tree

4 files changed

+144
-0
lines changed

4 files changed

+144
-0
lines changed

nebula3/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .hash import hash

nebula3/utils/hash.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# nebula3/utils/hash.py
2+
from __future__ import annotations

import struct
3+
4+
_M: int = 0xC6A4A7935BD1E995
5+
_R: int = 47
6+
_MASK64: int = (1 << 64) - 1
7+
8+
9+
def _read_u64_le(buf: bytes) -> int:
10+
"""Convert little-endian bytes of up to 8 bytes to an unsigned integer."""
11+
return int.from_bytes(buf, byteorder="little", signed=False)
12+
13+
14+
def hash(data: bytes | str, seed: int = 0xC70F6907) -> int:
15+
"""MurmurHash2 64-bit variant:
16+
:Param data: supports str (utf-8 encoding), bytes, bytearray
17+
:Param seed: defaults to 0xC70F6907
18+
:return: Python int, in the range of signed 64-bit
19+
"""
20+
if isinstance(data, str):
21+
data_as_bytes = data.encode("utf-8")
22+
elif isinstance(data, (bytes, bytearray)):
23+
data_as_bytes = bytes(data)
24+
else:
25+
raise TypeError("Input must be str, bytes, or bytearray")
26+
27+
h = (seed ^ (_M * len(data_as_bytes) & _MASK64)) & _MASK64
28+
off = len(data_as_bytes) // 8 * 8
29+
for i in range(0, off, 8):
30+
k = _read_u64_le(data_as_bytes[i : i + 8])
31+
k = (k * _M) & _MASK64
32+
k ^= k >> _R
33+
k = (k * _M) & _MASK64
34+
h ^= k
35+
h = (h * _M) & _MASK64
36+
37+
tail = data_as_bytes[off:]
38+
if tail:
39+
t = _read_u64_le(tail)
40+
h ^= t
41+
h = (h * _M) & _MASK64
42+
43+
h ^= h >> _R
44+
h = (h * _M) & _MASK64
45+
h ^= h >> _R
46+
47+
if h & (1 << 63):
48+
h -= 1 << 64
49+
return h

tests/test_hash.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env python
2+
# --coding:utf-8--
3+
4+
# Copyright (c) 2020 vesoft inc. All rights reserved.
5+
#
6+
# This source code is licensed under Apache 2.0 License.
7+
8+
import pytest
9+
from nebula3.utils.hash import hash as murmur_hash
10+
11+
# Reference (input, expected signed 64-bit hash) pairs, hashed with the
# default seed.
TEST_VECTORS = [
    # bytes inputs
    (b"", 6142509188972423790),
    (b"a", 4993892634952068459),
    (b"abcdefgh", 8664279048047335611),  # input is exactly 8 bytes long
    (b"abcdefghi", -5409788147785758033),
    # str inputs
    ("to_be_hashed", -1098333533029391540),
    ("中文", -8591787916246384322),
]
19+
20+
21+
@pytest.mark.parametrize("data, expected", TEST_VECTORS)
def test_known_vectors(data, expected):
    """Each reference input must hash to its recorded value."""
    actual = murmur_hash(data)
    assert actual == expected
24+
25+
26+
def test_str_bytes_equiv():
    """A str and its UTF-8 encoded bytes must hash to the same value."""
    text = "pytest"
    hash_of_str = murmur_hash(text)
    hash_of_bytes = murmur_hash(text.encode("utf-8"))
    assert hash_of_str == hash_of_bytes
32+
33+
34+
def test_type_error():
    """Hashing an unsupported input type (here an int) must raise TypeError."""
    unsupported_input = 12345
    with pytest.raises(TypeError):
        murmur_hash(unsupported_input)
40+
41+
42+
def test_seed_variation():
    """The same input hashed with seed 0 and seed 1 must give different values."""
    payload = b"seed_test"
    with_seed_zero = murmur_hash(payload, seed=0)
    with_seed_one = murmur_hash(payload, seed=1)
    assert with_seed_zero != with_seed_one
48+
49+
50+
def test_idempotent():
    """Hashing the same input twice must return the same value both times."""
    payload = b"consistent"
    first_result = murmur_hash(payload)
    second_result = murmur_hash(payload)
    assert first_result == second_result
54+
55+
56+
def test_large_input_performance():
    """A 10,000-byte input must hash without error and yield an int."""
    large_payload = b"x" * 10_000
    assert isinstance(murmur_hash(large_payload), int)

tests/test_hash_integration.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env python
2+
# --coding:utf-8--
3+
4+
# Copyright (c) 2020 vesoft inc. All rights reserved.
5+
#
6+
# This source code is licensed under Apache 2.0 License.
7+
8+
import pytest
9+
from nebula3.Config import Config
10+
from nebula3.gclient.net import ConnectionPool
11+
from nebula3.utils.hash import hash as murmur_hash
12+
13+
14+
@pytest.fixture(scope="module")
def nebula_session():
    """Yield one session, shared by all tests in this module.

    Connects to 127.0.0.1:9669 as user "root" through a connection pool
    capped at 10 connections. On teardown the session is released and the
    pool is closed; the pool is also closed if obtaining the session fails.
    """
    config = Config()
    config.max_connection_pool_size = 10
    pool = ConnectionPool()
    pool.init([("127.0.0.1", 9669)], config)
    try:
        session = pool.get_session("root", "nebula")
        try:
            yield session
        finally:
            # Hand the session's connection back before the pool goes away.
            session.release()
    finally:
        pool.close()
23+
24+
25+
@pytest.mark.parametrize(
    "data", ["", "a", "abcdefgh", "abcdefghi", "to_be_hashed", "中文"]
)
def test_hash_against_server(nebula_session, data):
    """The hash() value returned by the session must equal the local hash."""
    # Value computed locally by the Python implementation.
    local_value = murmur_hash(data)

    # Value computed by running the same input through the session.
    response = nebula_session.execute(f'YIELD hash("{data}")')
    assert response.is_succeeded(), response.error_msg()
    first_row = response.row_values(0)
    remote_value = first_row[0].as_int()

    assert remote_value == local_value

0 commit comments

Comments
 (0)