From f74ab7ce4920d1009d177bea082ee7645ad8e856 Mon Sep 17 00:00:00 2001
From: Minh-Duc Bui <minhduc2k31905@gmail.com>
Date: Mon, 15 Jun 2026 16:39:37 +0700
Subject: [PATCH] fix(analyzer): parse structured output from text for
 OpenAI-compatible endpoints

The LLM analyzers built their structured-output client with
`ChatOpenAI.with_structured_output(schema)`, which defaults to
`method="json_schema"` (strict `response_format`). Endpoints that speak the
OpenAI API but do not enforce that contract are free to ignore it, and several
return the JSON wrapped in a markdown code fence or surrounded by prose. The
strict parser then receives non-JSON and raises, so every semantic analyzer
fails and the whole scan exits with a JSON decode error. Static analysis still
runs, but all LLM findings are lost.

Swap the strict client for the canonical langchain chain `llm | PydanticOutputParser`.
The parser uses langchain's fence-tolerant `parse_json_markdown` under the hood
and validates into the same Pydantic schema, so `parse_response` is unchanged
for both the per-file analyzers and the meta-analyzer. Strict endpoints keep
working; lenient ones now work too.

Inject the parser's `get_format_instructions()` through a single `_with_format`
helper at the run-loop chokepoint so the default prompt and the meta-analyzer's
overridden prompt both get JSON guidance without duplicating it.

Add regression tests that drive fenced and prose-wrapped JSON through the real
parser, and update the three tests that asserted the removed
`with_structured_output` seam to the direct-mock pattern the other tests use.

Closes #69

Signed-off-by: Minh-Duc Bui <minhduc2k31905@gmail.com>
---
 src/skillspector/llm_analyzer_base.py |  35 ++++++--
 tests/nodes/test_llm_analyzer_base.py | 118 +++++++++++++++++++-------
 2 files changed, 119 insertions(+), 34 deletions(-)

diff --git a/src/skillspector/llm_analyzer_base.py b/src/skillspector/llm_analyzer_base.py
index a678e4e..99d7047 100644
--- a/src/skillspector/llm_analyzer_base.py
+++ b/src/skillspector/llm_analyzer_base.py
@@ -32,6 +32,7 @@
 from dataclasses import dataclass, field
 from typing import Literal
 
+from langchain_core.output_parsers import PydanticOutputParser
 from pydantic import BaseModel, Field
 
 from skillspector.llm_utils import get_chat_model
@@ -241,9 +242,21 @@ def __init__(self, base_prompt: str, model: str):
         self.model = model
         self._input_budget = get_max_input_tokens(model)
         self._llm = get_chat_model(model=model)
-        self._structured_llm = (
-            self._llm.with_structured_output(self.response_schema) if self.response_schema else None
-        )
+        # Parse structured output from the text response rather than relying on
+        # the endpoint's native structured-output mode. ChatOpenAI defaults to
+        # method="json_schema" (strict response_format), which OpenAI-compatible
+        # endpoints are free to ignore -- they may wrap the JSON in markdown
+        # fences or surround it with prose. PydanticOutputParser uses langchain's
+        # fence-tolerant parser and supplies matching format instructions, so
+        # both strict and lenient endpoints work.
+        if self.response_schema:
+            self._parser = PydanticOutputParser(pydantic_object=self.response_schema)
+            self._structured_llm = self._llm | self._parser
+            self._format_instructions = self._parser.get_format_instructions()
+        else:
+            self._parser = None
+            self._structured_llm = None
+            self._format_instructions = ""
 
     # -- Batching -----------------------------------------------------------
 
@@ -303,6 +316,18 @@ def get_batches(
 
     # -- Prompt / parse -----------------------------------------------------
 
+    def _with_format(self, prompt: str) -> str:
+        """Append schema format instructions for the structured-output chain.
+
+        Centralized here so both the default :meth:`build_prompt` and any
+        subclass override (e.g. the meta-analyzer) get the parser's format
+        guidance without each prompt template having to embed it. No-op when
+        running in raw-string mode (no ``response_schema``).
+        """
+        if not self._format_instructions:
+            return prompt
+        return f"{prompt}\n\n{self._format_instructions}"
+
     def build_prompt(self, batch: Batch, **kwargs: object) -> str:
         """Build the LLM prompt for a single batch.
 
@@ -353,7 +378,7 @@ def run_batches(
                 len(batch.findings),
             )
             if self._structured_llm:
-                response = self._structured_llm.invoke(prompt)
+                response = self._structured_llm.invoke(self._with_format(prompt))
             else:
                 response = self._llm.invoke(prompt).content
             logger.debug("LLM response for %s", batch.file_label)
@@ -388,7 +413,7 @@ async def _process(batch: Batch) -> tuple[Batch, list]:
                     len(batch.findings),
                 )
                 if self._structured_llm:
-                    response = await self._structured_llm.ainvoke(prompt)
+                    response = await self._structured_llm.ainvoke(self._with_format(prompt))
                 else:
                     response = (await self._llm.ainvoke(prompt)).content
                 logger.debug("LLM response for %s", batch.file_label)
diff --git a/tests/nodes/test_llm_analyzer_base.py b/tests/nodes/test_llm_analyzer_base.py
index 9899a7f..87675f7 100644
--- a/tests/nodes/test_llm_analyzer_base.py
+++ b/tests/nodes/test_llm_analyzer_base.py
@@ -20,6 +20,8 @@
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
+from langchain_core.messages import AIMessage
+from langchain_core.runnables import Runnable
 
 from skillspector.llm_analyzer_base import (
     Batch,
@@ -155,15 +157,77 @@ def test_chunk_label(self) -> None:
 
 
 def _mock_get_chat_model(*_args, **_kwargs):
-    """Return a mock ChatOpenAI that supports with_structured_output."""
-    mock_llm = MagicMock()
-    mock_llm.with_structured_output.return_value = MagicMock()
-    return mock_llm
+    """Return a mock chat model for tests that override ``_structured_llm``."""
+    return MagicMock()
 
 
 MOCK_PATCH_TARGET = "skillspector.llm_analyzer_base.get_chat_model"
 
 
+class _FakeChatModel(Runnable):
+    """Minimal Runnable that returns a fixed ``AIMessage`` content.
+
+    Stands in for an OpenAI-compatible endpoint that ignores
+    ``response_format`` so the real ``PydanticOutputParser`` in the
+    structured-output chain is exercised end-to-end.
+    """
+
+    def __init__(self, content: str):
+        self._content = content
+
+    def invoke(self, _input, _config=None, **_kwargs):
+        return AIMessage(content=self._content)
+
+    async def ainvoke(self, _input, _config=None, **_kwargs):
+        return AIMessage(content=self._content)
+
+
+class TestStructuredOutputParsing:
+    """Regression tests: structured chain tolerates non-strict endpoints.
+
+    OpenAI-compatible endpoints may ignore ``response_format`` and wrap the
+    JSON in markdown fences or surround it with prose. The chain must still
+    parse into the response schema rather than crashing the scan.
+    """
+
+    MODEL = "nvidia/openai/gpt-oss-120b"
+
+    @pytest.mark.parametrize(
+        "content",
+        [
+            '{"findings": [{"rule_id": "R-1", "message": "m", "severity": "LOW", "start_line": 1}]}',
+            '```json\n{"findings": [{"rule_id": "R-1", "message": "m", "severity": "LOW", "start_line": 1}]}\n```',
+            'Here is the analysis:\n```json\n{"findings": [{"rule_id": "R-1", "message": "m", "severity": "LOW", "start_line": 1}]}\n```\nDone.',
+        ],
+    )
+    @patch(MOCK_PATCH_TARGET, _mock_get_chat_model)
+    def test_parses_fenced_and_prose_wrapped_json(self, content: str) -> None:
+        analyzer = LLMAnalyzerBase(base_prompt="test", model=self.MODEL)
+        analyzer._structured_llm = _FakeChatModel(content) | analyzer._parser
+        results = analyzer.run_batches([Batch(file_path="x.py", content="code")])
+        _, findings = results[0]
+        assert len(findings) == 1
+        assert findings[0].rule_id == "R-1"
+
+    @patch(MOCK_PATCH_TARGET, _mock_get_chat_model)
+    def test_empty_findings_object_parses(self) -> None:
+        analyzer = LLMAnalyzerBase(base_prompt="test", model=self.MODEL)
+        analyzer._structured_llm = (
+            _FakeChatModel('```json\n{"findings": []}\n```') | analyzer._parser
+        )
+        results = analyzer.run_batches([Batch(file_path="x.py", content="code")])
+        _, findings = results[0]
+        assert findings == []
+
+    @patch(MOCK_PATCH_TARGET, _mock_get_chat_model)
+    def test_format_instructions_appended_to_prompt(self) -> None:
+        analyzer = LLMAnalyzerBase(base_prompt="test", model=self.MODEL)
+        assert analyzer._format_instructions
+        augmented = analyzer._with_format("PROMPT BODY")
+        assert augmented.startswith("PROMPT BODY")
+        assert analyzer._format_instructions in augmented
+
+
 # ---------------------------------------------------------------------------
 # number_lines
 # ---------------------------------------------------------------------------
@@ -1096,22 +1160,22 @@ class TestLLMMetaAnalyzerRunBatches:
     @patch(MOCK_PATCH_TARGET)
     def test_run_batches_calls_structured_llm_per_batch(self, mock_get_model: MagicMock) -> None:
         mock_llm = MagicMock()
-        mock_structured = MagicMock()
         mock_get_model.return_value = mock_llm
-        mock_llm.with_structured_output.return_value = mock_structured
-        mock_structured.invoke.return_value = MetaAnalyzerResult(
-            findings=[
-                MetaAnalyzerFinding(
-                    pattern_id="E1",
-                    is_vulnerability=True,
-                    confidence=0.9,
-                    intent="malicious",
-                    impact="high",
-                )
-            ],
-        )
 
         analyzer = LLMMetaAnalyzer(model=self.MODEL)
+        analyzer._structured_llm.invoke = MagicMock(
+            return_value=MetaAnalyzerResult(
+                findings=[
+                    MetaAnalyzerFinding(
+                        pattern_id="E1",
+                        is_vulnerability=True,
+                        confidence=0.9,
+                        intent="malicious",
+                        impact="high",
+                    )
+                ],
+            )
+        )
         f1 = Finding(rule_id="E1", message="test", file="a.py", start_line=1)
         f2 = Finding(rule_id="E2", message="test", file="b.py", start_line=1)
         batches = [
@@ -1119,7 +1183,7 @@ def test_run_batches_calls_structured_llm_per_batch(self, mock_get_model: MagicM
             Batch(file_path="b.py", content="code b", findings=[f2]),
         ]
         results = analyzer.run_batches(batches, metadata_text="Name: skill")
-        assert mock_structured.invoke.call_count == 2
+        assert analyzer._structured_llm.invoke.call_count == 2
         assert len(results) == 2
 
     @patch(MOCK_PATCH_TARGET)
@@ -1140,10 +1204,10 @@ class TestLLMMetaAnalyzerARunBatches:
     @patch(MOCK_PATCH_TARGET)
     async def test_arun_batches_calls_ainvoke_per_batch(self, mock_get_model: MagicMock) -> None:
         mock_llm = MagicMock()
-        mock_structured = MagicMock()
         mock_get_model.return_value = mock_llm
-        mock_llm.with_structured_output.return_value = mock_structured
-        mock_structured.ainvoke = AsyncMock(
+
+        analyzer = LLMMetaAnalyzer(model=self.MODEL)
+        analyzer._structured_llm.ainvoke = AsyncMock(
             return_value=MetaAnalyzerResult(
                 findings=[
                     MetaAnalyzerFinding(
@@ -1156,8 +1220,6 @@ async def test_arun_batches_calls_ainvoke_per_batch(self, mock_get_model: MagicM
                 ],
             )
         )
-
-        analyzer = LLMMetaAnalyzer(model=self.MODEL)
         f1 = Finding(rule_id="E1", message="test", file="a.py", start_line=1)
         f2 = Finding(rule_id="E2", message="test", file="b.py", start_line=1)
         batches = [
@@ -1165,7 +1227,7 @@ async def test_arun_batches_calls_ainvoke_per_batch(self, mock_get_model: MagicM
             Batch(file_path="b.py", content="code b", findings=[f2]),
         ]
         results = await analyzer.arun_batches(batches, metadata_text="Name: skill")
-        assert mock_structured.ainvoke.call_count == 2
+        assert analyzer._structured_llm.ainvoke.call_count == 2
         assert len(results) == 2
 
     @patch(MOCK_PATCH_TARGET)
@@ -1174,10 +1236,10 @@ async def test_arun_batches_results_compatible_with_apply_filter(
         mock_get_model: MagicMock,
     ) -> None:
         mock_llm = MagicMock()
-        mock_structured = MagicMock()
         mock_get_model.return_value = mock_llm
-        mock_llm.with_structured_output.return_value = mock_structured
-        mock_structured.ainvoke = AsyncMock(
+
+        analyzer = LLMMetaAnalyzer(model=self.MODEL)
+        analyzer._structured_llm.ainvoke = AsyncMock(
             return_value=MetaAnalyzerResult(
                 findings=[
                     MetaAnalyzerFinding(
@@ -1192,8 +1254,6 @@ async def test_arun_batches_results_compatible_with_apply_filter(
                 ],
             )
         )
-
-        analyzer = LLMMetaAnalyzer(model=self.MODEL)
         finding = Finding(rule_id="E1", message="test", file="a.py", start_line=1)
         batches = [Batch(file_path="a.py", content="code", findings=[finding])]
         batch_results = await analyzer.arun_batches(batches, metadata_text="")