|
1 | 1 | from __future__ import annotations |
| 2 | +from dataclasses import dataclass |
2 | 3 | from typing import Tuple |
3 | 4 | import os |
4 | 5 | import glob |
5 | 6 | from . import config, byte_codes |
| 7 | +from . indexing import KeyFinderState |
6 | 8 |
|
7 | 9 |
|
8 | 10 | def file_info(db_name: str) -> Tuple[str, bool, str, bool]: |
@@ -37,17 +39,70 @@ def find_all(file_name: str) -> list[str]: |
37 | 39 | return files_all |
38 | 40 |
|
39 | 41 |
|
| 42 | + |
def find_all_top_level_keys(json_bytes: bytes, state: KeyFinderState, batch_size: int) -> KeyFinderState:
    """
    In the bytes of the json object find all top level keys and the start and
    end indices of their values.

    Scanning starts at `state.i` and continues while `state.i < batch_size`.
    All progress (depth counters, string/escape flags, key and value
    positions, collected entries) is stored on `state`, which is mutated in
    place.

    Args:
    - `json_bytes`: The bytes of the json object to scan
    - `state`: The scanner state to read from and update
    - `batch_size`: Exclusive upper bound for `state.i` in the main loop

    Returns:
    - The same `state` object that was passed in, after being updated.
    """

    while state.i < batch_size:
        current = json_bytes[state.i]
        if state.skip_next:
            # The byte following a backslash is consumed without being interpreted.
            state.skip_next = False
        elif current == byte_codes.BACKSLASH:
            state.skip_next = True
        elif current == byte_codes.QUOTE:
            if state.dict_depth == 1 and state.list_depth == 0:
                if state.in_str:
                    # Closing quote: the key ends here, then step over the
                    # spaces and colons that separate it from its value.
                    state.key_end = state.i
                    state.i += 1
                    # NOTE(review): this inner loop is not bounded by
                    # batch_size or len(json_bytes); it raises IndexError if
                    # the bytes end right after the key. Confirm callers
                    # never split a batch at that position.
                    while json_bytes[state.i] in (byte_codes.SPACE, byte_codes.COLON):
                        state.i += 1
                    state.value_start = state.i
                    # NOTE(review): state.i now sits on the first byte of the
                    # value, and the unconditional increment at the bottom of
                    # the loop steps past it, so that byte is never examined
                    # by the branches below. Confirm this is intended.
                else:
                    # Opening quote: the key text starts on the next byte.
                    state.key_start = state.i + 1
            # Every unescaped quote toggles the in-string flag, at any depth.
            state.in_str = not state.in_str
        elif state.in_str or current in (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE):
            pass
        elif current == byte_codes.OPEN_SQUARE:
            state.list_depth += 1
        elif current == byte_codes.CLOSE_SQUARE:
            state.list_depth -= 1
        elif current == byte_codes.OPEN_CURLY:
            state.dict_depth += 1
        elif current == byte_codes.CLOSE_CURLY:
            state.dict_depth -= 1
        elif state.list_depth == 0 and state.dict_depth == 1:
            # Any other byte outside a string at the top level records an
            # entry: (decoded key, value start index, index after this byte).
            key = json_bytes[state.key_start:state.key_end].decode()
            state.indices.append((key, state.value_start, state.i + 1))
        state.i += 1

    # The signature promises a KeyFinderState; hand back the updated state
    # instead of implicitly returning None.
    return state
| 79 | + |
| 80 | + |
| 81 | + |
| 82 | + |
| 83 | + |
| 84 | + |
| 85 | + |
40 | 86 | def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int: |
41 | 87 | """ |
42 | 88 | Finds the index of the next comma or closing bracket/brace after the value |
43 | 89 | of a key-value pair in a bytes object containing valid JSON when decoded. |
44 | 90 |
|
| 91 | + Valid start indices are the index after the colon or the index after that. |
| 92 | +
|
| 93 | + Example: |
| 94 | +
|
| 95 | + 01234567 |
| 96 | + "2": {}, |
| 97 | +
|
| 98 | + Valid start indices are 4 and 5. Returns 7. |
| 99 | +
|
45 | 100 | Args: |
46 | 101 | - `json_bytes`: A bytes object containing valid JSON when decoded |
47 | 102 | - `index`: The start index in json_bytes |
48 | 103 |
|
49 | 104 | Returns: |
50 | | - - The end index of the value. |
| 105 | + - The end index of the first byte right after the value's bytes. |
51 | 106 | """ |
52 | 107 |
|
53 | 108 | # See https://www.json.org/json-en.html for the JSON syntax |
|
0 commit comments