From e177dab5b0d82275723c6692433eb52ea630d40d Mon Sep 17 00:00:00 2001 From: Caleb Courier Date: Thu, 11 Dec 2025 14:22:21 -0600 Subject: [PATCH 1/2] try all methods before giving up instead of accepting first code block found --- guardrails/utils/parsing_utils.py | 45 +++++++----- tests/unit_tests/utils/test_parsing_utils.py | 75 +++++++++++++++++++- 2 files changed, 103 insertions(+), 17 deletions(-) diff --git a/guardrails/utils/parsing_utils.py b/guardrails/utils/parsing_utils.py index 11163549e..b5ed24b9a 100644 --- a/guardrails/utils/parsing_utils.py +++ b/guardrails/utils/parsing_utils.py @@ -72,34 +72,47 @@ def get_code_block( return trimmed_output +def try_to_parse(json_string: str) -> Tuple[Optional[Any], Optional[Exception]]: + error = None + try: + parsed = json.loads(json_string, strict=False) + except json.decoder.JSONDecodeError as e: + parsed = None + error = e + return parsed, error + + def extract_json_from_ouput( output: str, ) -> Tuple[Optional[Union[Dict, List]], Optional[Exception]]: # Find and extract json from code blocks extracted_code_block = output + output_as_dict = None + error = None has_json_block, json_start, json_end = has_code_block(output, "json") if has_json_block and json_start is not None and json_end is not None: extracted_code_block = get_code_block(output, json_start, json_end, "json") - else: + output_as_dict, error = try_to_parse(extracted_code_block) + + if not output_as_dict: has_block, block_start, block_end = has_code_block(output) if has_block and block_start is not None and block_end is not None: extracted_code_block = get_code_block(output, block_start, block_end) - else: - json_pattern = regex.compile(r"\{(?:[^{}]+|\{(?:(?R)|[^{}]+)*\})*\}") - json_groups = json_pattern.findall(output) - json_start, json_end = output.find("{"), output.rfind("}") - if len(json_groups) > 0 and len(json_groups[0]) == ( - json_end - json_start + 1 - ): - extracted_code_block = json_groups[0] + output_as_dict, error = try_to_parse(extracted_code_block) + + if not output_as_dict: + json_pattern = regex.compile(r"\{(?:[^{}]+|\{(?:(?R)|[^{}]+)*\})*\}") + json_groups = json_pattern.findall(output) + json_start, json_end = output.find("{"), output.rfind("}") + if len(json_groups) > 0 and len(json_groups[0]) == (json_end - json_start + 1): + extracted_code_block = json_groups[0] + output_as_dict, error = try_to_parse(extracted_code_block) + + if output_as_dict: + error = None + elif not error: + error = ValueError("No valid JSON could be extracted from the llm output!") - # Treat the output as a JSON string, and load it into a dict. - error = None - try: - output_as_dict = json.loads(extracted_code_block, strict=False) - except json.decoder.JSONDecodeError as e: - output_as_dict = None - error = e return output_as_dict, error diff --git a/tests/unit_tests/utils/test_parsing_utils.py b/tests/unit_tests/utils/test_parsing_utils.py index 6b1e667a9..1208ff174 100644 --- a/tests/unit_tests/utils/test_parsing_utils.py +++ b/tests/unit_tests/utils/test_parsing_utils.py @@ -2,6 +2,7 @@ import pytest from guardrails.utils.parsing_utils import ( + extract_json_from_ouput, get_code_block, has_code_block, prune_extra_keys, @@ -75,12 +76,84 @@ def test_has_code_block(llm_ouput, expected_output): ], ) def test_get_code_block(llm_ouput, expected_output, code_type): - has, start, end = has_code_block(llm_ouput) + has, start, end = has_code_block(llm_ouput, code_type) actual_output = get_code_block(llm_ouput, start, end, code_type) assert actual_output == expected_output +too_much_information = """ +Sure! Here's a bunch of code blocks you didn't ask for: + +``` +# Some Markdown +This is markdown +``` + +``` +some_var = "this is python code" +``` + +``` +this: + - is + - yaml +``` + +{ + "finally": "real json" +} + +""" +expected_from_tmi = {"finally": "real json"} + +expected_json_code = {"a": 1} + +non_json_block = """ +Sure! Here's a code block that's not JSON + +``` +Definitely not JSON +``` +""" + +braces_but_not_json = """ +Sometimes I like to add braces around words { like this } +""" + +# Ideally this should be supported, but our regex doesn't pick this up currently +json_array_1 = """ +[{ "a": 1 }] +""" +json_array_2 = """ +[{ "a": 1 }, { "b": 2 }] +""" + + +@pytest.mark.parametrize( + "llm_ouput,expected_output,expect_error", + [ + (json_code_block, expected_json_code, False), + (anonymous_code_block, expected_json_code, False), + (js_code_block, expected_json_code, False), + (no_code_block, expected_json_code, False), + (too_much_information, expected_from_tmi, False), + (not_even_json, None, True), + (non_json_block, None, True), + (braces_but_not_json, None, True), + # This is not desired behaviour, but it is descriptive of the current regex + (json_array_1, expected_json_code, False), + (json_array_2, None, True), + ], +) +def test_extract_json_from_ouput(llm_ouput, expected_output, expect_error): + output, error = extract_json_from_ouput(llm_ouput) + + assert output == expected_output + if expect_error: + assert error is not None + + with open( "tests/integration_tests/test_assets/json_schemas/choice_case_openapi.json", "r" ) as choice_case_openapi_file: From 1d6d27fae22e9f20281636e24b49cf9a14e0a39a Mon Sep 17 00:00:00 2001 From: Caleb Courier Date: Thu, 11 Dec 2025 14:23:42 -0600 Subject: [PATCH 2/2] revert --- tests/unit_tests/utils/test_parsing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/utils/test_parsing_utils.py b/tests/unit_tests/utils/test_parsing_utils.py index 1208ff174..9bb49eacf 100644 --- a/tests/unit_tests/utils/test_parsing_utils.py +++ b/tests/unit_tests/utils/test_parsing_utils.py @@ -76,7 +76,7 @@ def test_has_code_block(llm_ouput, expected_output): ], ) def test_get_code_block(llm_ouput, expected_output, code_type): - has, start, end = has_code_block(llm_ouput, code_type) + has, start, end = has_code_block(llm_ouput) actual_output = get_code_block(llm_ouput, start, end, code_type) assert actual_output == expected_output