Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 29 additions & 16 deletions guardrails/utils/parsing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,34 +72,47 @@ def get_code_block(
return trimmed_output


def try_to_parse(json_string: str) -> Tuple[Optional[Any], Optional[Exception]]:
    """Attempt to decode ``json_string`` as JSON without raising.

    Decoding uses ``strict=False``, so control characters are permitted
    inside JSON string values.

    Args:
        json_string: The text to decode.

    Returns:
        A ``(parsed, error)`` pair: ``(value, None)`` when decoding succeeds,
        or ``(None, exc)`` where ``exc`` is the ``JSONDecodeError`` raised.
    """
    try:
        return json.loads(json_string, strict=False), None
    except json.decoder.JSONDecodeError as decode_error:
        return None, decode_error


def extract_json_from_ouput(
    output: str,
) -> Tuple[Optional[Union[Dict, List]], Optional[Exception]]:
    """Extract and parse the JSON payload contained in an LLM output.

    Candidates are tried in order, stopping at the first one that parses:

    1. a code block tagged ``json``;
    2. any code block;
    3. a brace-delimited object found by regex, accepted only when it spans
       from the first ``{`` to the last ``}`` in ``output``.

    Args:
        output: Raw text returned by the LLM.

    Returns:
        A ``(parsed, error)`` pair. On success ``error`` is ``None``. On
        failure ``parsed`` is ``None`` and ``error`` is the decode error from
        the last candidate that was tried, or a ``ValueError`` when there is
        no decode error to report.
    """
    output_as_dict = None
    error = None

    # Stage 1: a code block explicitly tagged as JSON.
    has_json_block, json_start, json_end = has_code_block(output, "json")
    if has_json_block and json_start is not None and json_end is not None:
        extracted_code_block = get_code_block(output, json_start, json_end, "json")
        output_as_dict, error = try_to_parse(extracted_code_block)

    # Compare against None rather than relying on truthiness: an empty object
    # or array is valid JSON and must not be treated as a failed parse.
    if output_as_dict is None:
        # Stage 2: any code block, whatever its tag.
        has_block, block_start, block_end = has_code_block(output)
        if has_block and block_start is not None and block_end is not None:
            extracted_code_block = get_code_block(output, block_start, block_end)
            output_as_dict, error = try_to_parse(extracted_code_block)

    if output_as_dict is None:
        # Stage 3: a bare JSON object embedded in the text. The recursive
        # pattern matches balanced braces; the first match is only used when
        # it covers everything between the first "{" and the last "}".
        json_pattern = regex.compile(r"\{(?:[^{}]+|\{(?:(?R)|[^{}]+)*\})*\}")
        json_groups = json_pattern.findall(output)
        json_start, json_end = output.find("{"), output.rfind("}")
        if json_groups and len(json_groups[0]) == (json_end - json_start + 1):
            extracted_code_block = json_groups[0]
            output_as_dict, error = try_to_parse(extracted_code_block)

    # A successful try_to_parse always returns error=None, so only the
    # "nothing parsed and nothing to blame" case needs an explicit error.
    if output_as_dict is None and error is None:
        error = ValueError("No valid JSON could be extracted from the llm output!")

    return output_as_dict, error


Expand Down
73 changes: 73 additions & 0 deletions tests/unit_tests/utils/test_parsing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pytest

from guardrails.utils.parsing_utils import (
extract_json_from_ouput,
get_code_block,
has_code_block,
prune_extra_keys,
Expand Down Expand Up @@ -81,6 +82,78 @@ def test_get_code_block(llm_ouput, expected_output, code_type):
assert actual_output == expected_output


# LLM output containing several anonymous, non-JSON code blocks followed by a
# bare JSON object outside of any code block.
too_much_information = """
Sure! Here's a bunch of code blocks you didn't ask for:

```
# Some Markdown
This is markdown
```

```
some_var = "this is python code"
```

```
this:
- is
- yaml
```

{
"finally": "real json"
}

"""
# Expected parse result for too_much_information: only the trailing object.
expected_from_tmi = {"finally": "real json"}

# Expected parse result shared by the parametrized cases whose payload is
# {"a": 1}.
expected_json_code = {"a": 1}

# LLM output whose only code block does not contain JSON.
non_json_block = """
Sure! Here's a code block that's not JSON

```
Definitely not JSON
```
"""

# Text with balanced braces whose contents are not valid JSON.
braces_but_not_json = """
Sometimes I like to add braces around words { like this }
"""

# Top-level JSON arrays. Ideally these would be supported, but the extraction
# regex only matches brace-delimited objects, so arrays are not picked up.
json_array_1 = """
[{ "a": 1 }]
"""
json_array_2 = """
[{ "a": 1 }, { "b": 2 }]
"""


@pytest.mark.parametrize(
    "llm_ouput,expected_output,expect_error",
    [
        (json_code_block, expected_json_code, False),
        (anonymous_code_block, expected_json_code, False),
        (js_code_block, expected_json_code, False),
        (no_code_block, expected_json_code, False),
        (too_much_information, expected_from_tmi, False),
        (not_even_json, None, True),
        (non_json_block, None, True),
        (braces_but_not_json, None, True),
        # This is not desired behaviour, but it is descriptive of the current regex
        (json_array_1, expected_json_code, False),
        (json_array_2, None, True),
    ],
)
def test_extract_json_from_ouput(llm_ouput, expected_output, expect_error):
    """Check both the parsed value and the error returned for each LLM output.

    The error is asserted in both directions: failing cases must report an
    error, and successful cases must not, so a spurious error returned
    alongside a valid result does not go unnoticed.
    """
    output, error = extract_json_from_ouput(llm_ouput)

    assert output == expected_output
    if expect_error:
        assert error is not None
    else:
        assert error is None


with open(
"tests/integration_tests/test_assets/json_schemas/choice_case_openapi.json", "r"
) as choice_case_openapi_file:
Expand Down
Loading