Skip to content

Commit f6387eb

Browse files
committed
init
1 parent 8fbad87 commit f6387eb

File tree

3 files changed

+96
-2
lines changed

3 files changed

+96
-2
lines changed

dictdatabase/byte_codes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@
88
SPACE = 32
99
TAB = 9
1010
NEWLINE = 10
11+
COLON = 58
1112
COMMA = 44

dictdatabase/indexing.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
from dataclasses import dataclass, field
12
import orjson
23
import os
3-
from . import config
4+
from . import config, utils, byte_codes, io_bytes
45

56
# Problem: Multiple read processes will concurrently read and write the same file
67
# In some cases this will result in an empty read error, that's why the try-except exists
@@ -21,6 +22,42 @@
2122
# - Leave everything as is. While not ideal, it works. When empty read error occurs, don't use the index for that read
2223

2324

25+
26+
27+
28+
29+
@dataclass
class KeyFinderState:
    """
    Mutable scan state that is handed to utils.find_all_top_level_keys and
    carried from one call to the next.

    Every attribute is annotated so that it is a real dataclass field. Without
    annotations the attributes are plain class attributes, and the `indices`
    list would be one single object shared by every instance.
    """

    # True when the previous byte was a backslash, i.e. the current byte is escaped
    skip_next: bool = False
    # True while the scan position is inside a JSON string
    in_str: bool = False
    # Current nesting depth of lists
    list_depth: int = 0
    # Current nesting depth of dicts. Starts at 1 since scanning begins right
    # after the opening curly brace of the root object (see `i`).
    dict_depth: int = 1
    # Index of the first byte of the current key, and index of its closing quote
    key_start: "int | None" = None
    key_end: "int | None" = None
    # Index of the first byte of the current value, and the end index belonging to it
    value_start: "int | None" = None
    value_end: "int | None" = None
    # Collected (key, value_start, value_end) tuples, one list per instance
    indices: "list[tuple[str, int, int]]" = field(default_factory=list)
    # Index of the next byte to inspect. Byte 0 is the opening curly brace.
    i: int = 1
40+
41+
42+
def batched_find_all_top_level_keys(db_name):
43+
state, b = KeyFinderState(), 0
44+
while True:
45+
batch_start = b * 10_000_000
46+
batch_end = batch_start + 10_000_000
47+
48+
batch_bytes = io_bytes.read_bytes(db_name, batch_start, batch_end)
49+
50+
if batch_start == 0 and batch_bytes[0] != byte_codes.OPEN_CURLY:
51+
raise ValueError("The first byte of the database file must be an opening curly brace")
52+
if len(batch_bytes) == 0:
53+
break
54+
utils.find_all_top_level_keys(batch_bytes, state, len(batch_bytes))
55+
return state.indices
56+
57+
58+
59+
60+
2461
class Indexer:
2562
"""
2663
The Indexer takes the name of a database file, and tries to load the .index file
@@ -57,6 +94,7 @@ def __init__(self, db_name: str):
5794
self.data = {}
5895

5996

97+
6098
def get(self, key):
6199
"""
62100
Returns a list of 5 elements for a key if it exists, otherwise None

dictdatabase/utils.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from __future__ import annotations
2+
from dataclasses import dataclass
23
from typing import Tuple
34
import os
45
import glob
56
from . import config, byte_codes
7+
from . indexing import KeyFinderState
68

79

810
def file_info(db_name: str) -> Tuple[str, bool, str, bool]:
@@ -37,17 +39,70 @@ def find_all(file_name: str) -> list[str]:
3739
return files_all
3840

3941

42+
43+
def find_all_top_level_keys(json_bytes: bytes, state: KeyFinderState, batch_size: int) -> KeyFinderState:
    """
    In the bytes of the json object find all top level keys and the start and end
    indices of their values.

    Scanning starts at `state.i` and stops at `batch_size`. For every completed
    top level key-value pair, a tuple `(key, value_start, value_end)` is appended
    to `state.indices`, such that `json_bytes[value_start:value_end]` are exactly
    the bytes of the value, without surrounding whitespace or a trailing comma.
    The key is the decoded raw content between the quotes of the key.

    Each byte is inspected exactly once and no byte beyond `batch_size` is read.
    A scan can therefore be resumed with the same `state` on a longer buffer, as
    long as the bytes that were already scanned keep their indices.

    Args:
    - `json_bytes`: A bytes object containing valid JSON when decoded,
      where the root element is an object
    - `state`: The scan state, it is modified in place
    - `batch_size`: The index in json_bytes at which scanning stops (exclusive)

    Returns:
    - The same `state` object that was passed in.
    """

    whitespace = (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE)

    # Not every definition of the state declares value_start up front
    if not hasattr(state, "value_start"):
        state.value_start = None

    while state.i < batch_size:
        current = json_bytes[state.i]

        if state.dict_depth < 1:
            # The root object is closed, nothing after it can be a key
            pass

        elif state.key_start is None:
            # Waiting for the opening quote of the next key
            if current == byte_codes.QUOTE:
                state.key_start = state.i + 1
                state.in_str = True
            elif current == byte_codes.CLOSE_CURLY:
                # Closing brace of an empty root object
                state.dict_depth -= 1

        elif state.key_end is None:
            # Inside the key, an escaped quote does not end it
            if state.skip_next:
                state.skip_next = False
            elif current == byte_codes.BACKSLASH:
                state.skip_next = True
            elif current == byte_codes.QUOTE:
                state.key_end = state.i
                state.in_str = False

        elif state.value_start is None and (current == byte_codes.COLON or current in whitespace):
            # Separator between the key and its value
            pass

        else:
            # Inside the value, the first byte that arrives here starts it
            if state.value_start is None:
                state.value_start = state.i
            # Must be evaluated before the depth counters are updated below
            top_level = state.dict_depth == 1 and state.list_depth == 0

            if state.skip_next:
                state.skip_next = False
                state.value_end = state.i + 1
            elif state.in_str:
                if current == byte_codes.BACKSLASH:
                    state.skip_next = True
                elif current == byte_codes.QUOTE:
                    state.in_str = False
                state.value_end = state.i + 1
            elif current in whitespace:
                # Whitespace outside of strings never extends the value
                pass
            elif top_level and current in (byte_codes.COMMA, byte_codes.CLOSE_CURLY):
                # A top level comma or the closing brace of the root object
                # terminates the value, the pair is complete.
                key = json_bytes[state.key_start:state.key_end].decode()
                state.indices.append((key, state.value_start, state.value_end))
                state.key_start = None
                state.key_end = None
                state.value_start = None
                state.value_end = None
                if current == byte_codes.CLOSE_CURLY:
                    state.dict_depth -= 1
            else:
                if current == byte_codes.QUOTE:
                    state.in_str = True
                elif current == byte_codes.OPEN_SQUARE:
                    state.list_depth += 1
                elif current == byte_codes.CLOSE_SQUARE:
                    state.list_depth -= 1
                elif current == byte_codes.OPEN_CURLY:
                    state.dict_depth += 1
                elif current == byte_codes.CLOSE_CURLY:
                    state.dict_depth -= 1
                state.value_end = state.i + 1

        state.i += 1

    return state
79+
80+
81+
82+
83+
84+
85+
4086
def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
4187
"""
4288
Finds the index of the next comma or closing bracket/brace after the value
4389
of a key-value pair in a bytes object containing valid JSON when decoded.
4490
91+
Valid start indices are the index after the colon or the index after that.
92+
93+
Example:
94+
95+
01234567
96+
"2": {},
97+
98+
Valid start indices are 4 and 5. Returns 7.
99+
45100
Args:
46101
- `json_bytes`: A bytes object containing valid JSON when decoded
47102
- `index`: The start index in json_bytes
48103
49104
Returns:
50-
- The end index of the value.
105+
- The end index of the first byte right after the value's bytes.
51106
"""
52107

53108
# See https://www.json.org/json-en.html for the JSON syntax

0 commit comments

Comments
 (0)