From e177dab5b0d82275723c6692433eb52ea630d40d Mon Sep 17 00:00:00 2001
From: Caleb Courier <caleb.courier@gmail.com>
Date: Thu, 11 Dec 2025 14:22:21 -0600
Subject: [PATCH 1/2] try all methods before giving up instead of accepting
 first code block found

---
 guardrails/utils/parsing_utils.py            | 45 +++++++-----
 tests/unit_tests/utils/test_parsing_utils.py | 75 +++++++++++++++++++-
 2 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/guardrails/utils/parsing_utils.py b/guardrails/utils/parsing_utils.py
index 11163549e..b5ed24b9a 100644
--- a/guardrails/utils/parsing_utils.py
+++ b/guardrails/utils/parsing_utils.py
@@ -72,34 +72,47 @@ def get_code_block(
     return trimmed_output
 
 
+def try_to_parse(json_string: str) -> Tuple[Optional[Any], Optional[Exception]]:
+    """Decode ``json_string``; return ``(value, None)`` or ``(None, error)``."""
+    try:
+        decoded = json.loads(json_string, strict=False)
+    except json.decoder.JSONDecodeError as decode_error:
+        # Hand the failure back to the caller instead of raising.
+        return None, decode_error
+    return decoded, None
+
+
 def extract_json_from_ouput(
     output: str,
 ) -> Tuple[Optional[Union[Dict, List]], Optional[Exception]]:
     # Find and extract json from code blocks
     extracted_code_block = output
+    output_as_dict = None
+    error = None
     has_json_block, json_start, json_end = has_code_block(output, "json")
     if has_json_block and json_start is not None and json_end is not None:
         extracted_code_block = get_code_block(output, json_start, json_end, "json")
-    else:
+        output_as_dict, error = try_to_parse(extracted_code_block)
+
+    if output_as_dict is None:  # only None means failure; {} and [] are valid JSON
         has_block, block_start, block_end = has_code_block(output)
         if has_block and block_start is not None and block_end is not None:
             extracted_code_block = get_code_block(output, block_start, block_end)
-        else:
-            json_pattern = regex.compile(r"\{(?:[^{}]+|\{(?:(?R)|[^{}]+)*\})*\}")
-            json_groups = json_pattern.findall(output)
-            json_start, json_end = output.find("{"), output.rfind("}")
-            if len(json_groups) > 0 and len(json_groups[0]) == (
-                json_end - json_start + 1
-            ):
-                extracted_code_block = json_groups[0]
+            output_as_dict, error = try_to_parse(extracted_code_block)
+
+    if output_as_dict is None:  # last resort: bare JSON object outside any fence
+        json_pattern = regex.compile(r"\{(?:[^{}]+|\{(?:(?R)|[^{}]+)*\})*\}")
+        json_groups = json_pattern.findall(output)
+        json_start, json_end = output.find("{"), output.rfind("}")
+        if len(json_groups) > 0 and len(json_groups[0]) == (json_end - json_start + 1):
+            extracted_code_block = json_groups[0]
+            output_as_dict, error = try_to_parse(extracted_code_block)
+
+    if output_as_dict is not None:  # drop any error left by an earlier attempt
+        error = None
+    elif not error:
+        error = ValueError("No valid JSON could be extracted from the llm output!")
 
-    # Treat the output as a JSON string, and load it into a dict.
-    error = None
-    try:
-        output_as_dict = json.loads(extracted_code_block, strict=False)
-    except json.decoder.JSONDecodeError as e:
-        output_as_dict = None
-        error = e
     return output_as_dict, error
 
 
diff --git a/tests/unit_tests/utils/test_parsing_utils.py b/tests/unit_tests/utils/test_parsing_utils.py
index 6b1e667a9..1208ff174 100644
--- a/tests/unit_tests/utils/test_parsing_utils.py
+++ b/tests/unit_tests/utils/test_parsing_utils.py
@@ -2,6 +2,7 @@
 import pytest
 
 from guardrails.utils.parsing_utils import (
+    extract_json_from_ouput,
     get_code_block,
     has_code_block,
     prune_extra_keys,
@@ -75,12 +76,84 @@ def test_has_code_block(llm_ouput, expected_output):
     ],
 )
 def test_get_code_block(llm_ouput, expected_output, code_type):
-    has, start, end = has_code_block(llm_ouput)
+    has, start, end = has_code_block(llm_ouput, code_type)
     actual_output = get_code_block(llm_ouput, start, end, code_type)
 
     assert actual_output == expected_output
 
 
+too_much_information = """
+Sure! Here's a bunch of code blocks you didn't ask for:
+
+```
+# Some Markdown
+This is markdown
+```
+
+```
+some_var = "this is python code"
+```
+
+```
+this:
+    - is
+    - yaml
+```
+
+{
+    "finally": "real json"
+}
+
+"""
+expected_from_tmi = {"finally": "real json"}
+
+expected_json_code = {"a": 1}
+
+non_json_block = """
+Sure! Here's a code block that's not JSON
+
+```
+Definitely not JSON
+```
+"""
+
+braces_but_not_json = """
+Sometimes I like to add braces around words { like this }
+"""
+
+# Ideally this should be supported, but our regex doesn't pick this up currently
+json_array_1 = """
+[{ "a": 1 }]
+"""
+json_array_2 = """
+[{ "a": 1 }, { "b": 2 }]
+"""
+
+
+@pytest.mark.parametrize(
+    "llm_ouput,expected_output,expect_error",
+    [
+        (json_code_block, expected_json_code, False),
+        (anonymous_code_block, expected_json_code, False),
+        (js_code_block, expected_json_code, False),
+        (no_code_block, expected_json_code, False),
+        (too_much_information, expected_from_tmi, False),
+        (not_even_json, None, True),
+        (non_json_block, None, True),
+        (braces_but_not_json, None, True),
+        # This is not desired behaviour, but it is descriptive of the current regex
+        (json_array_1, expected_json_code, False),
+        (json_array_2, None, True),
+    ],
+)
+def test_extract_json_from_ouput(llm_ouput, expected_output, expect_error):
+    output, error = extract_json_from_ouput(llm_ouput)
+
+    assert output == expected_output
+    # Successful extractions must not carry a stale error from an earlier attempt.
+    assert (error is not None) == expect_error
+
+
 with open(
     "tests/integration_tests/test_assets/json_schemas/choice_case_openapi.json", "r"
 ) as choice_case_openapi_file:

From 1d6d27fae22e9f20281636e24b49cf9a14e0a39a Mon Sep 17 00:00:00 2001
From: Caleb Courier <caleb.courier@gmail.com>
Date: Thu, 11 Dec 2025 14:23:42 -0600
Subject: [PATCH 2/2] revert has_code_block code_type arg in test_get_code_block

---
 tests/unit_tests/utils/test_parsing_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit_tests/utils/test_parsing_utils.py b/tests/unit_tests/utils/test_parsing_utils.py
index 1208ff174..9bb49eacf 100644
--- a/tests/unit_tests/utils/test_parsing_utils.py
+++ b/tests/unit_tests/utils/test_parsing_utils.py
@@ -76,7 +76,7 @@ def test_has_code_block(llm_ouput, expected_output):
     ],
 )
 def test_get_code_block(llm_ouput, expected_output, code_type):
-    has, start, end = has_code_block(llm_ouput, code_type)
+    has, start, end = has_code_block(llm_ouput)
     actual_output = get_code_block(llm_ouput, start, end, code_type)
 
     assert actual_output == expected_output