From f74ab7ce4920d1009d177bea082ee7645ad8e856 Mon Sep 17 00:00:00 2001 From: Minh-Duc Bui Date: Mon, 15 Jun 2026 16:39:37 +0700 Subject: [PATCH] fix(analyzer): parse structured output from text for OpenAI-compatible endpoints The LLM analyzers built their structured-output client with `ChatOpenAI.with_structured_output(schema)`, which defaults to `method="json_schema"` (strict `response_format`). Endpoints that speak the OpenAI API but do not enforce that contract are free to ignore it, and several return the JSON wrapped in a markdown code fence or surrounded by prose. The strict parser then receives non-JSON and raises, so every semantic analyzer fails and the whole scan exits with a JSON decode error. Static analysis still runs, but all LLM findings are lost. Swap the strict client for the canonical langchain chain `llm | PydanticOutputParser`. The parser uses langchain's fence-tolerant `parse_json_markdown` under the hood and validates into the same Pydantic schema, so `parse_response` is unchanged for both the per-file analyzers and the meta-analyzer. Strict endpoints keep working; lenient ones now work too. Inject the parser's `get_format_instructions()` through a single `_with_format` helper at the run-loop chokepoint so the default prompt and the meta-analyzer's overridden prompt both get JSON guidance without duplicating it. Add regression tests that drive fenced and prose-wrapped JSON through the real parser, and update the three tests that asserted the removed `with_structured_output` seam to the direct-mock pattern the other tests use. Closes #69 Signed-off-by: Minh-Duc Bui --- src/skillspector/llm_analyzer_base.py | 35 ++++++-- tests/nodes/test_llm_analyzer_base.py | 118 +++++++++++++++++++------- 2 files changed, 119 insertions(+), 34 deletions(-) diff --git a/src/skillspector/llm_analyzer_base.py b/src/skillspector/llm_analyzer_base.py index a678e4e..99d7047 100644 --- a/src/skillspector/llm_analyzer_base.py +++ b/src/skillspector/llm_analyzer_base.py @@ -32,6 +32,7 @@ from dataclasses import dataclass, field from typing import Literal +from langchain_core.output_parsers import PydanticOutputParser from pydantic import BaseModel, Field from skillspector.llm_utils import get_chat_model @@ -241,9 +242,21 @@ def __init__(self, base_prompt: str, model: str): self.model = model self._input_budget = get_max_input_tokens(model) self._llm = get_chat_model(model=model) - self._structured_llm = ( - self._llm.with_structured_output(self.response_schema) if self.response_schema else None - ) + # Parse structured output from the text response rather than relying on + # the endpoint's native structured-output mode. ChatOpenAI defaults to + # method="json_schema" (strict response_format), which OpenAI-compatible + # endpoints are free to ignore -- they may wrap the JSON in markdown + # fences or surround it with prose. PydanticOutputParser uses langchain's + # fence-tolerant parser and supplies matching format instructions, so + # both strict and lenient endpoints work. + if self.response_schema: + self._parser = PydanticOutputParser(pydantic_object=self.response_schema) + self._structured_llm = self._llm | self._parser + self._format_instructions = self._parser.get_format_instructions() + else: + self._parser = None + self._structured_llm = None + self._format_instructions = "" # -- Batching ----------------------------------------------------------- @@ -303,6 +316,18 @@ def get_batches( # -- Prompt / parse ----------------------------------------------------- + def _with_format(self, prompt: str) -> str: + """Append schema format instructions for the structured-output chain. + + Centralized here so both the default :meth:`build_prompt` and any + subclass override (e.g. the meta-analyzer) get the parser's format + guidance without each prompt template having to embed it. No-op when + running in raw-string mode (no ``response_schema``). + """ + if not self._format_instructions: + return prompt + return f"{prompt}\n\n{self._format_instructions}" + def build_prompt(self, batch: Batch, **kwargs: object) -> str: """Build the LLM prompt for a single batch. @@ -353,7 +378,7 @@ def run_batches( len(batch.findings), ) if self._structured_llm: - response = self._structured_llm.invoke(prompt) + response = self._structured_llm.invoke(self._with_format(prompt)) else: response = self._llm.invoke(prompt).content logger.debug("LLM response for %s", batch.file_label) @@ -388,7 +413,7 @@ async def _process(batch: Batch) -> tuple[Batch, list]: len(batch.findings), ) if self._structured_llm: - response = await self._structured_llm.ainvoke(prompt) + response = await self._structured_llm.ainvoke(self._with_format(prompt)) else: response = (await self._llm.ainvoke(prompt)).content logger.debug("LLM response for %s", batch.file_label) diff --git a/tests/nodes/test_llm_analyzer_base.py b/tests/nodes/test_llm_analyzer_base.py index 9899a7f..87675f7 100644 --- a/tests/nodes/test_llm_analyzer_base.py +++ b/tests/nodes/test_llm_analyzer_base.py @@ -20,6 +20,8 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest +from langchain_core.messages import AIMessage +from langchain_core.runnables import Runnable from skillspector.llm_analyzer_base import ( Batch, @@ -155,15 +157,77 @@ def test_chunk_label(self) -> None: def _mock_get_chat_model(*_args, **_kwargs): - """Return a mock ChatOpenAI that supports with_structured_output.""" - mock_llm = MagicMock() - mock_llm.with_structured_output.return_value = MagicMock() - return mock_llm + """Return a mock chat model for tests that override ``_structured_llm``.""" + return MagicMock() MOCK_PATCH_TARGET = "skillspector.llm_analyzer_base.get_chat_model" +class _FakeChatModel(Runnable): + """Minimal Runnable that returns a fixed ``AIMessage`` content. + + Stands in for an OpenAI-compatible endpoint that ignores + ``response_format`` so the real ``PydanticOutputParser`` in the + structured-output chain is exercised end-to-end. + """ + + def __init__(self, content: str): + self._content = content + + def invoke(self, _input, _config=None, **_kwargs): + return AIMessage(content=self._content) + + async def ainvoke(self, _input, _config=None, **_kwargs): + return AIMessage(content=self._content) + + +class TestStructuredOutputParsing: + """Regression tests: structured chain tolerates non-strict endpoints. + + OpenAI-compatible endpoints may ignore ``response_format`` and wrap the + JSON in markdown fences or surround it with prose. The chain must still + parse into the response schema rather than crashing the scan. + """ + + MODEL = "nvidia/openai/gpt-oss-120b" + + @pytest.mark.parametrize( + "content", + [ + '{"findings": [{"rule_id": "R-1", "message": "m", "severity": "LOW", "start_line": 1}]}', + '```json\n{"findings": [{"rule_id": "R-1", "message": "m", "severity": "LOW", "start_line": 1}]}\n```', + 'Here is the analysis:\n```json\n{"findings": [{"rule_id": "R-1", "message": "m", "severity": "LOW", "start_line": 1}]}\n```\nDone.', + ], + ) + @patch(MOCK_PATCH_TARGET, _mock_get_chat_model) + def test_parses_fenced_and_prose_wrapped_json(self, content: str) -> None: + analyzer = LLMAnalyzerBase(base_prompt="test", model=self.MODEL) + analyzer._structured_llm = _FakeChatModel(content) | analyzer._parser + results = analyzer.run_batches([Batch(file_path="x.py", content="code")]) + _, findings = results[0] + assert len(findings) == 1 + assert findings[0].rule_id == "R-1" + + @patch(MOCK_PATCH_TARGET, _mock_get_chat_model) + def test_empty_findings_object_parses(self) -> None: + analyzer = LLMAnalyzerBase(base_prompt="test", model=self.MODEL) + analyzer._structured_llm = ( + _FakeChatModel('```json\n{"findings": []}\n```') | analyzer._parser + ) + results = analyzer.run_batches([Batch(file_path="x.py", content="code")]) + _, findings = results[0] + assert findings == [] + + @patch(MOCK_PATCH_TARGET, _mock_get_chat_model) + def test_format_instructions_appended_to_prompt(self) -> None: + analyzer = LLMAnalyzerBase(base_prompt="test", model=self.MODEL) + assert analyzer._format_instructions + augmented = analyzer._with_format("PROMPT BODY") + assert augmented.startswith("PROMPT BODY") + assert analyzer._format_instructions in augmented + + # --------------------------------------------------------------------------- # number_lines # --------------------------------------------------------------------------- @@ -1096,22 +1160,22 @@ class TestLLMMetaAnalyzerRunBatches: @patch(MOCK_PATCH_TARGET) def test_run_batches_calls_structured_llm_per_batch(self, mock_get_model: MagicMock) -> None: mock_llm = MagicMock() - mock_structured = MagicMock() mock_get_model.return_value = mock_llm - mock_llm.with_structured_output.return_value = mock_structured - mock_structured.invoke.return_value = MetaAnalyzerResult( - findings=[ - MetaAnalyzerFinding( - pattern_id="E1", - is_vulnerability=True, - confidence=0.9, - intent="malicious", - impact="high", - ) - ], - ) analyzer = LLMMetaAnalyzer(model=self.MODEL) + analyzer._structured_llm.invoke = MagicMock( + return_value=MetaAnalyzerResult( + findings=[ + MetaAnalyzerFinding( + pattern_id="E1", + is_vulnerability=True, + confidence=0.9, + intent="malicious", + impact="high", + ) + ], + ) + ) f1 = Finding(rule_id="E1", message="test", file="a.py", start_line=1) f2 = Finding(rule_id="E2", message="test", file="b.py", start_line=1) batches = [ @@ -1119,7 +1183,7 @@ def test_run_batches_calls_structured_llm_per_batch(self, mock_get_model: MagicM Batch(file_path="b.py", content="code b", findings=[f2]), ] results = analyzer.run_batches(batches, metadata_text="Name: skill") - assert mock_structured.invoke.call_count == 2 + assert analyzer._structured_llm.invoke.call_count == 2 assert len(results) == 2 @patch(MOCK_PATCH_TARGET) @@ -1140,10 +1204,10 @@ class TestLLMMetaAnalyzerARunBatches: @patch(MOCK_PATCH_TARGET) async def test_arun_batches_calls_ainvoke_per_batch(self, mock_get_model: MagicMock) -> None: mock_llm = MagicMock() - mock_structured = MagicMock() mock_get_model.return_value = mock_llm - mock_llm.with_structured_output.return_value = mock_structured - mock_structured.ainvoke = AsyncMock( + + analyzer = LLMMetaAnalyzer(model=self.MODEL) + analyzer._structured_llm.ainvoke = AsyncMock( return_value=MetaAnalyzerResult( findings=[ MetaAnalyzerFinding( @@ -1156,8 +1220,6 @@ async def test_arun_batches_calls_ainvoke_per_batch(self, mock_get_model: MagicM ], ) ) - - analyzer = LLMMetaAnalyzer(model=self.MODEL) f1 = Finding(rule_id="E1", message="test", file="a.py", start_line=1) f2 = Finding(rule_id="E2", message="test", file="b.py", start_line=1) batches = [ @@ -1165,7 +1227,7 @@ async def test_arun_batches_calls_ainvoke_per_batch(self, mock_get_model: MagicM Batch(file_path="b.py", content="code b", findings=[f2]), ] results = await analyzer.arun_batches(batches, metadata_text="Name: skill") - assert mock_structured.ainvoke.call_count == 2 + assert analyzer._structured_llm.ainvoke.call_count == 2 assert len(results) == 2 @patch(MOCK_PATCH_TARGET) @@ -1174,10 +1236,10 @@ async def test_arun_batches_results_compatible_with_apply_filter( mock_get_model: MagicMock, ) -> None: mock_llm = MagicMock() - mock_structured = MagicMock() mock_get_model.return_value = mock_llm - mock_llm.with_structured_output.return_value = mock_structured - mock_structured.ainvoke = AsyncMock( + + analyzer = LLMMetaAnalyzer(model=self.MODEL) + analyzer._structured_llm.ainvoke = AsyncMock( return_value=MetaAnalyzerResult( findings=[ MetaAnalyzerFinding( @@ -1192,8 +1254,6 @@ async def test_arun_batches_results_compatible_with_apply_filter( ], ) ) - - analyzer = LLMMetaAnalyzer(model=self.MODEL) finding = Finding(rule_id="E1", message="test", file="a.py", start_line=1) batches = [Batch(file_path="a.py", content="code", findings=[finding])] batch_results = await analyzer.arun_batches(batches, metadata_text="")