Skip to content

Commit a25afcf

Browse files
Merge pull request #42 from UmbrellaMalware/main
Add glom-like searching for keys
2 parents 93d0df0 + fc1d8b7 commit a25afcf

File tree

8 files changed

+195
-24
lines changed

8 files changed

+195
-24
lines changed

dictdatabase/dataclasses.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import dataclasses
2+
3+
4+
@dataclasses.dataclass(frozen=True)
class SearchResult:
    """
    Immutable outcome of looking up a key in raw JSON bytes.

    A failed lookup is reported by the search helpers as
    ``SearchResult(start_byte=-1, end_byte=-1, found=False)``.
    """

    # Byte offset at which the located span begins.
    start_byte: int
    # Byte offset at which the located span ends.
    end_byte: int
    # True when the key was located, False otherwise.
    found: bool

dictdatabase/index_manager.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import hashlib
2+
3+
from dictdatabase import utils
4+
5+
6+
class IndexManager:
    """Builds the key information that gets written to the index file."""

    @staticmethod
    def create_index(all_file_bytes: bytes, key: str, start, end):
        """
        Assemble the index entry for a value that has already been located.

        Args:
            all_file_bytes (bytes): The entire file as a byte string.
            key (str): The key of the value being indexed.
            start: Offset in the file at which the value starts.
            end: Offset in the file at which the value ends (used as slice end).

        Returns:
            The tuple (key, start, end, indent_level, indent_with, value_hash, end).
        """
        # NOTE(review): the key is looked up verbatim here; confirm this yields
        # the intended indentation when `key` is a dotted path.
        key_start, _key_end = utils.find_outermost_key_in_json_bytes(
            all_file_bytes, key
        )
        indent_level, indent_with = utils.detect_indentation_in_json_bytes(
            all_file_bytes, key_start
        )
        # The hash covers exactly the bytes of the value.
        digest = hashlib.sha256(all_file_bytes[start:end]).hexdigest()
        return key, start, end, indent_level, indent_with, digest, end

dictdatabase/indexing.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1-
import orjson
21
import os
2+
3+
import orjson
4+
35
from . import config
46

7+
58
# Problem: Multiple read processes will concurrently read and write the same file
69
# In some cases this will result in an empty read error, that's why the try-except exists
710

dictdatabase/io_unsafe.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
11
from __future__ import annotations
2-
from typing import Tuple
2+
3+
import hashlib
4+
import json
35
from dataclasses import dataclass
6+
from typing import Tuple
7+
48
import orjson
5-
import json
6-
import hashlib
7-
from . import config, utils, byte_codes, indexing, io_bytes
9+
10+
from . import byte_codes
11+
from . import config
12+
from . import indexing
13+
from . import io_bytes
14+
from . import searching
15+
from . import utils
16+
from .index_manager import IndexManager
817

918

1019
@dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9
@@ -79,21 +88,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None:
7988

8089
# Not found in index file, search for key in the entire file
8190
all_file_bytes = io_bytes.read(db_name)
82-
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
83-
84-
if key_end == -1:
91+
start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
92+
if not found:
8593
return None
86-
87-
# Key found, now determine the bounding byte indices of the value
88-
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
89-
end = utils.seek_index_through_value_bytes(all_file_bytes, start)
90-
91-
indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
9294
value_bytes = all_file_bytes[start:end]
93-
value_hash = hashlib.sha256(value_bytes).hexdigest()
94-
9595
# Write key info to index file
96-
indexer.write(key, start, end, indent_level, indent_with, value_hash, end)
96+
indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end))
9797
return orjson.loads(value_bytes)
9898

9999

@@ -185,16 +185,16 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
185185
return partial_handle
186186

187187
# Not found in index file, search for key in the entire file
188-
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
188+
position = searching.search_key_position_in_db(all_file_bytes, key)
189189

190-
if key_end == -1:
190+
if not position.found:
191191
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")
192192

193193
# Key found, now determine the bounding byte indices of the value
194-
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
194+
start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0)
195195
end = utils.seek_index_through_value_bytes(all_file_bytes, start)
196196

197-
indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
197+
indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte)
198198

199199
partial_value = orjson.loads(all_file_bytes[start:end])
200200
prefix_bytes = all_file_bytes[:start] if config.use_compression else None

dictdatabase/searching.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from typing import Tuple
2+
3+
import orjson
4+
5+
from dictdatabase import byte_codes
6+
from dictdatabase import utils
7+
from dictdatabase.dataclasses import SearchResult
8+
9+
10+
def find_key_position_in_bytes(file: bytes, key: str) -> SearchResult:
    """
    Locate the value that belongs to a key in JSON bytes.

    The key is looked up with utils.find_outermost_key_in_json_bytes.

    Args:
        file (bytes): The JSON bytes to search in.
        key (str): The key to find.

    Returns:
        A SearchResult whose start_byte and end_byte delimit the value of the
        key. If the key is missing, both are -1 and found is False.
    """
    _, after_key = utils.find_outermost_key_in_json_bytes(file, key)
    if after_key == -1:
        return SearchResult(start_byte=-1, end_byte=-1, found=False)

    # Skip the single space that may sit between the key and its value.
    value_start = after_key
    if file[after_key] == byte_codes.SPACE:
        value_start += 1
    value_end = utils.seek_index_through_value_bytes(file, value_start)
    return SearchResult(start_byte=value_start, end_byte=value_end, found=True)
27+
28+
29+
def search_key_position_in_db(
    file: bytes, key: str, glom_searching=True
) -> SearchResult:
    """
    Locate a key (not its value) in JSON bytes, optionally following a dotted path.

    Args:
        file (bytes): The bytes of the file to search in.
        key (str): The key to search for.
        glom_searching: If True, `key` is split on "." and each segment is
            searched for inside the value of the previous segment. Defaults to
            True.

    Returns:
        A SearchResult holding the two positions reported by
        utils.find_outermost_key_in_json_bytes for the last path segment,
        translated to offsets into `file`. If any segment is missing,
        start_byte and end_byte are -1 and found is False.
    """
    segments = key.split(".") if glom_searching else [key]
    # Absolute bounds (offsets into `file`) of the region searched next.
    scope_start, scope_end = 0, len(file)
    key_start = key_end = -1
    for segment in segments:
        scope = file[scope_start:scope_end]
        rel_key_start, rel_key_end = utils.find_outermost_key_in_json_bytes(
            scope, segment
        )
        if rel_key_end == -1:
            return SearchResult(start_byte=-1, end_byte=-1, found=False)
        # Positions from the helper are relative to `scope`; make them absolute.
        key_start = scope_start + rel_key_start
        key_end = scope_start + rel_key_end
        # Narrow the scope to exactly this segment's value, skipping the
        # single space that may separate key and value.
        rel_value_start = rel_key_end + (
            1 if scope[rel_key_end] == byte_codes.SPACE else 0
        )
        rel_value_end = utils.seek_index_through_value_bytes(scope, rel_value_start)
        # Compute the new end first: both bounds are relative to the old start.
        scope_end = scope_start + rel_value_end
        scope_start += rel_value_start
    return SearchResult(start_byte=key_start, end_byte=key_end, found=True)
47+
48+
49+
def search_value_position_in_db(
    all_file_bytes: bytes, key: str, glom_searching=True
) -> Tuple[int, int, bool]:
    """
    Locate the value of a key in JSON bytes, optionally following a dotted path.

    Args:
        all_file_bytes (bytes): The bytes of the file to search in.
        key (str): The key to search for.
        glom_searching: If True, `key` is split on "." and each segment is
            searched for inside the value of the previous segment. Defaults to
            True.

    Returns:
        A tuple (start, end, found). start and end are offsets into
        `all_file_bytes` delimiting the value; they are -1 and found is False
        if any segment of the path is missing.
    """
    segments = key.split(".") if glom_searching else [key]
    lo, hi = 0, len(all_file_bytes)
    for segment in segments:
        hit = find_key_position_in_bytes(all_file_bytes[lo:hi], segment)
        if not hit.found:
            return -1, -1, False
        # Offsets in `hit` are relative to the slice that began at `lo`.
        lo, hi = lo + hit.start_byte, lo + hit.end_byte
    return lo, hi, True

tests/benchmark/run_parallel.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,9 @@ class Scenario:
8989
ops: int = 10
9090

9191
def print(self):
92-
res = f"Scenario: {'🔹' * self.readers}{'🔻' * self.writers} ({self.readers}r{self.writers}w)"
93-
res += ", 🔸 compression" if self.use_compression else ""
94-
res += ", 💎 big file" if self.big_file else ""
92+
res = f"Scenario: {'*' * self.readers}{'#' * self.writers} ({self.readers}r{self.writers}w)"
93+
res += ", [] compression" if self.use_compression else ""
94+
res += ", {} big file" if self.big_file else ""
9595
print(res)
9696

9797

tests/test_glom_like_searching.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import dictdatabase as DDB
2+
3+
# Fixture: "Ben" appears both nested under "users" and as a top-level key.
data = {
    "users": {
        "Ben": {"age": 30, "job": "Software Engineer"},
        "Bob": {"age": 30, "job": "Plumbers"},
    },
    "Ben": {"job": {"age": 30, "job": "Software Engineer"}},
}
10+
11+
12+
def test_glom_searching():
    """A dotted key path reads the nested value."""
    DDB.at("users").create(data, force_overwrite=True)
    job = DDB.at("users", key="users.Ben.job").read()
    assert job == "Software Engineer"
15+
16+
17+
def test_without_glom_searching():
    """A key without dots reads the top-level entry of that name."""
    DDB.at("users").create(data, force_overwrite=True)
    expected = {"job": {"age": 30, "job": "Software Engineer"}}
    assert DDB.at("users", key="Ben").read() == expected
22+
23+
24+
def test_glom_searching_if_key_not_exists():
    """Reading a path whose middle segment is missing yields None."""
    DDB.at("users").create(data, force_overwrite=True)
    result = DDB.at("users", key="users.Job.Ben").read()
    assert result is None
27+
28+
29+
def test_glom_searching_if_subkey_not_exists():
    """Reading a path whose last segment is missing yields None."""
    DDB.at("users").create(data, force_overwrite=True)
    result = DDB.at("users", key="users.Ben.SUBKEYNOTEXISTS").read()
    assert result is None

tests/test_glom_writing.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import pytest
2+
3+
import dictdatabase as DDB
4+
5+
# Fixture: "Ben" appears both nested under "users" and as a top-level key.
data = {
    "users": {
        "Ben": {"age": 30, "job": "Software Engineer"},
        "Bob": {"age": 30, "job": "Plumbers"},
    },
    "Ben": {"job": {"age": 30, "job": "Software Engineer"}},
}
12+
13+
14+
def test_glom_writing():
    """A change written through a session on a dotted key can be read back."""
    DDB.at("users").create(data, force_overwrite=True)
    with DDB.at("users", key="users.Ben").session() as (session, user):
        user["status"] = "cancelled"
        session.write()
    status = DDB.at("users", key="users.Ben.status").read()
    assert status == "cancelled"
20+
21+
22+
def test_glom_writing_sub_key_not_exists():
    """Opening a session on a missing nested key raises KeyError."""
    DDB.at("users").create(data, force_overwrite=True)
    missing = DDB.at("users", key="users.SUBKEY")
    with pytest.raises(KeyError), missing.session() as (session, entry):
        entry["status"] = "cancelled"
        session.write()

0 commit comments

Comments
 (0)