Skip to content

Commit 6074dd4

Browse files
authored
Fix partial number read (#41)
1 parent bfcb551 commit 6074dd4

File tree

4 files changed

+176
-45
lines changed

4 files changed

+176
-45
lines changed

dictdatabase/byte_codes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# See: https://www.charset.org/utf-8
12
BACKSLASH = 92
23
QUOTE = 34
34
OPEN_SQUARE = 91
@@ -7,3 +8,4 @@
78
SPACE = 32
89
TAB = 9
910
NEWLINE = 10
11+
COMMA = 44

dictdatabase/utils.py

Lines changed: 45 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -52,32 +52,47 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
5252

5353
# See https://www.json.org/json-en.html for the JSON syntax
5454

55-
skip_next, in_str, list_depth, dict_depth = False, False, 0, 0
55+
in_str, list_depth, dict_depth, i, len_json_bytes = False, 0, 0, index, len(json_bytes)
5656

57-
for i in range(index, len(json_bytes)):
58-
if skip_next:
59-
skip_next = False
60-
continue
57+
while i < len_json_bytes:
6158
current = json_bytes[i]
59+
# If backslash, skip the next character
6260
if current == byte_codes.BACKSLASH:
63-
skip_next = True
64-
continue
65-
if current == byte_codes.QUOTE:
61+
i += 1
62+
# If quote, toggle in_str
63+
elif current == byte_codes.QUOTE:
6664
in_str = not in_str
67-
if in_str or current == byte_codes.SPACE:
68-
continue
69-
if current == byte_codes.OPEN_SQUARE:
65+
# Possible exit point where string ends and nesting is zero
66+
if not in_str and list_depth == 0 and dict_depth == 0:
67+
return i + 1
68+
# If in string, skip
69+
elif in_str:
70+
pass
71+
72+
# Invariant: Not in_str, not escaped
73+
74+
# Handle opening brackets
75+
elif current == byte_codes.OPEN_SQUARE:
7076
list_depth += 1
71-
elif current == byte_codes.CLOSE_SQUARE:
72-
list_depth -= 1
7377
elif current == byte_codes.OPEN_CURLY:
7478
dict_depth += 1
75-
elif current == byte_codes.CLOSE_CURLY:
76-
dict_depth -= 1
77-
if list_depth == 0 and dict_depth == 0:
78-
return i + 1
79-
80-
raise TypeError("Invalid JSON syntax")
79+
# Handle closing brackets
80+
elif current in [byte_codes.CLOSE_SQUARE, byte_codes.CLOSE_CURLY]:
81+
if current == byte_codes.CLOSE_SQUARE:
82+
list_depth -= 1
83+
if current == byte_codes.CLOSE_CURLY:
84+
dict_depth -= 1
85+
if list_depth == 0:
86+
if dict_depth == 0:
87+
return i + 1
88+
if dict_depth == -1:
89+
return i # Case: {"a": {}}
90+
elif list_depth == 0 and ((dict_depth == 0 and current in [byte_codes.COMMA, byte_codes.NEWLINE]) or dict_depth == -1):
91+
# Handle commas and newline as exit points
92+
return i
93+
i += 1
94+
95+
raise TypeError("Invalid JSON")
8196

8297

8398
def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
@@ -90,23 +105,20 @@ def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
90105
- `json_bytes`: A bytes object containing valid JSON when decoded
91106
"""
92107

93-
skip_next, in_str, nesting = False, False, 0
94-
for i in range(start, end):
95-
if skip_next:
96-
skip_next = False
97-
continue
98-
current = json_bytes[i]
99-
if current == byte_codes.BACKSLASH:
100-
skip_next = True
101-
continue
102-
if current == byte_codes.QUOTE:
108+
in_str, nesting, i = False, 0, start
109+
while i < end:
110+
byte_i = json_bytes[i]
111+
if byte_i == byte_codes.BACKSLASH:
112+
i += 1
113+
elif byte_i == byte_codes.QUOTE:
103114
in_str = not in_str
104-
if in_str or current == byte_codes.SPACE:
105-
continue
106-
elif current == byte_codes.OPEN_CURLY:
115+
elif in_str:
116+
pass
117+
elif byte_i == byte_codes.OPEN_CURLY:
107118
nesting += 1
108-
elif current == byte_codes.CLOSE_CURLY:
119+
elif byte_i == byte_codes.CLOSE_CURLY:
109120
nesting -= 1
121+
i += 1
110122
return nesting
111123

112124

tests/test_read.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,23 +30,26 @@ def test_invalid_params(use_test_dir, use_compression, use_orjson, indent):
3030

3131
def test_read_integrity(use_test_dir, use_compression, use_orjson, indent):
3232
cases = [
33-
r'{"a": "\\", "b": 2}',
34-
r'{"a": "\\\\", "b": 2}',
35-
r'{"a": "\\\\\"", "b": 2}',
36-
r'{"a": "\\\"\\", "b": 2}',
37-
r'{"a": "\"\\\\", "b": 2}',
38-
r'{"a": "\"", "b": 2}',
39-
r'{"a": "\"\"", "b": 2}',
40-
r'{"a": "\"\"\\", "b": 2}',
41-
r'{"a": "\"\\\"", "b": 2}',
42-
r'{"a": "\\\"\"", "b": 2}',
33+
r'{"a": "\\", "b": 0}',
34+
r'{"a": "\\\\", "b": 1234}',
35+
r'{"a": "\\\\\"", "b": 1234}',
36+
r'{"a": "\\\"\\", "b": 1234}',
37+
r'{"a": "\"\\\\", "b": 1234}',
38+
r'{"a": "\"", "b": 1234}',
39+
r'{"a": "\"\"", "b": 1234}',
40+
r'{"a": "\"\"\\", "b": 1234}',
41+
r'{"a": "\"\\\"", "b": 1234}',
42+
r'{"a": "\\\"\"", "b": 1234}',
4343
]
4444

4545
for case in cases:
4646
with open(f"{DDB.config.storage_directory}/test_read_integrity.json", "w") as f:
4747
f.write(case)
48-
dd = DDB.at("test_read_integrity", key="a").read()
49-
assert dd == json.loads(case)["a"]
48+
key_a = DDB.at("test_read_integrity", key="a").read()
49+
key_b = DDB.at("test_read_integrity", key="b").read()
50+
assert key_a == json.loads(case)["a"]
51+
assert key_b == json.loads(case)["b"]
52+
5053

5154

5255

tests/test_utils.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import itertools
2+
import orjson
3+
from dictdatabase import utils, io_unsafe, byte_codes
4+
5+
6+
def test_seek_index_through_value_bytes(use_test_dir):
    """Check the end index reported for values at hand-picked offsets.

    Every case is ``(json_bytes, start_index, expected_end_index)``. The
    start index sits either on the first byte of a value or on the single
    space in front of it; the expected result is the index just past the
    last byte of that value.
    """
    spaced = b'{"a": 1, "b": {}}'
    compact = b'{"a":1,"b":{}}'
    multi_digit = b'{"a": 1234, "b": {"c": 2}}'

    cases = [
        # Single-digit number, starting on the space and on the digit
        (spaced, 5, 7),
        (spaced, 6, 7),
        (compact, 5, 6),
        # Empty dict as the last value of the enclosing dict
        (spaced, 13, 16),
        (compact, 11, 13),
        # Multi-digit number must be consumed completely
        (multi_digit, 5, 10),
        (multi_digit, 6, 10),
    ]

    for json_bytes, start_index, expected_end in cases:
        assert utils.seek_index_through_value_bytes(json_bytes, start_index) == expected_end
21+
22+
23+
24+
25+
26+
def load_with_orjson(bytes, key):
    """Decode the complete JSON document with orjson and return the entry stored under ``key``."""
    document = orjson.loads(bytes)
    return document[key]
29+
30+
31+
def load_with_seeker(bytes, key):
    """Extract the value of ``key`` by slicing the raw JSON bytes.

    The start of the value is found by searching for the first occurrence
    of ``"<key>":`` and skipping at most one following space. The end is
    determined by ``utils.seek_index_through_value_bytes``. Only the sliced
    bytes are decoded with orjson.

    NOTE(review): the search takes the first textual match, so it assumes
    ``"<key>":`` does not also appear inside an earlier string value.

    Raises:
        KeyError: if ``"<key>":`` does not occur in ``bytes``. This mirrors
            ``load_with_orjson``, which raises ``KeyError`` for a missing key.
    """
    key_bytes = f"\"{key}\":".encode()
    key_pos = bytes.find(key_bytes)
    if key_pos == -1:
        # find() reports "not found" as -1; without this guard the offset
        # arithmetic below would silently point at an unrelated position.
        raise KeyError(key)
    a_val_start = key_pos + len(key_bytes)
    # Serialisers may emit one space between the colon and the value
    if bytes[a_val_start] == byte_codes.SPACE:
        a_val_start += 1
    a_val_end = utils.seek_index_through_value_bytes(bytes, a_val_start)
    return orjson.loads(bytes[a_val_start:a_val_end])
38+
39+
40+
def test_seek_index_through_value_bytes_2(use_test_dir):
    """Cross-check the byte seeker against orjson on many value pairings.

    For every ordered pair of sample values ``(v1, v2)`` the object
    ``{"a": v1, "b": v2}`` is serialised once with ``OPT_INDENT_2`` and once
    without, always with sorted keys. Both keys are then read back through
    ``load_with_orjson`` and ``load_with_seeker``, and the results must match.
    """

    def dump_indented(data):
        return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)

    def dump_compact(data):
        return orjson.dumps(data, option=orjson.OPT_SORT_KEYS)

    dumpers = [dump_indented, dump_compact]

    list_samples = [
        [],
        [1, 2, 3],
        ["xs", "value", "c"],
        [1, "xs", 2, "value", 3, "c"],
        [1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]],
        [{}, {}, {}],
        [{"xs": 1}, {"value": 2}, {"c": 3}],
        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}],
        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]],
    ]
    dict_samples = [
        {},
        {"xs": 1},
        {"xs": 1, "value": 2},
        {"xs": 1, "value": 2, "c": 3},
        {"xs": []},
        {"xs": [], "value": []},
        {"xs": -3.3, "value": ""},
    ]
    number_samples = [
        1,
        1234,
        1.3,
        -1.3,
        32.3,
        0,
        -0,
    ]
    # Strings exercising backslash and quote escaping in several combinations
    string_samples = [
        "",
        "a",
        "hello",
        "a\\b",
        "\\",
        "\\\\",
        "\\\\\"",
        "\\\"\\",
        "\"\\\\",
        "\"",
        "\"\"",
        "\"\"\\",
        "\"\\\"",
        "\\\"\"",
    ]
    values = list_samples + dict_samples + number_samples + string_samples

    for dump, first, second in itertools.product(dumpers, values, values):
        json_bytes = dump({"a": first, "b": second})

        # Read both keys through both loaders before comparing anything
        expected_a = load_with_orjson(json_bytes, "a")
        actual_a = load_with_seeker(json_bytes, "a")
        expected_b = load_with_orjson(json_bytes, "b")
        actual_b = load_with_seeker(json_bytes, "b")

        assert expected_a == actual_a
        assert expected_b == actual_b

0 commit comments

Comments
 (0)