From 9bc220f9e2e4892fb01b921631f2e36e7ae2a5d8 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sun, 22 Mar 2026 19:33:45 +0100 Subject: [PATCH 1/7] using instructor for models --- CHANGELOG.md | 14 + .../2026-03-22-add-instructor-library.md | 1087 +++++++++++++++++ maseval/__init__.py | 11 + maseval/benchmark/tau2/domains/base.py | 58 +- maseval/benchmark/tau2/tau2.py | 61 +- maseval/core/instructor.py | 117 ++ maseval/core/model.py | 71 +- maseval/core/simulator.py | 223 ++-- maseval/interface/inference/anthropic.py | 42 + maseval/interface/inference/google_genai.py | 42 + maseval/interface/inference/litellm.py | 52 + maseval/interface/inference/openai.py | 42 + pyproject.toml | 1 + tests/conftest.py | 35 + tests/test_core/test_agentic_user.py | 27 +- tests/test_core/test_llm_simulator.py | 234 ++-- uv.lock | 243 ++-- 17 files changed, 1841 insertions(+), 519 deletions(-) create mode 100644 docs/superpowers/plans/2026-03-22-add-instructor-library.md create mode 100644 maseval/core/instructor.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 53911dc0..0bb4b82b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 **Core** +- Instructor library (`instructor>=1.14.0`) as core dependency for structured LLM output handling with automatic validation and retries. (PR: #PR_NUMBER_PLACEHOLDER) +- `response_model` parameter on `ModelAdapter.chat()` — pass a Pydantic `BaseModel` class to get validated structured outputs via `ChatResponse.structured_response`. Supported on OpenAI, Anthropic, Google GenAI, and LiteLLM adapters. (PR: #PR_NUMBER_PLACEHOLDER) +- `maseval.core.instructor` module with `create_instructor_client()` and `flatten_model_schema()` helpers for creating instructor-patched clients and generating provider-compatible JSON schemas. 
(PR: #PR_NUMBER_PLACEHOLDER) + +### Changed + +**Core** + +- Simulators (`ToolLLMSimulator`, `UserLLMSimulator`, `AgenticUserLLMSimulator`) now use instructor for structured output parsing with automatic validation and retries, replacing manual JSON extraction and retry loops. (PR: #PR_NUMBER_PLACEHOLDER) + +**Benchmarks** + +- Tau2 benchmark uses `flatten_model_schema()` from `maseval.core.instructor` for tool parameter schema generation, replacing the manual `_flatten_schema()` function. (PR: #PR_NUMBER_PLACEHOLDER) + - Usage and cost tracking via `Usage` and `TokenUsage` data classes. `ModelAdapter` tracks token usage automatically after each `chat()` call. Components that implement `UsageTrackableMixin` are collected via `gather_usage()`. Live totals available during benchmark runs via `benchmark.usage` (grand total) and `benchmark.usage_by_component` (per-component breakdowns). Post-hoc analysis via `UsageReporter.from_reports(benchmark.reports)` with breakdowns by task, component, or model. (PR: #45) - Pluggable cost calculation via `CostCalculator` protocol. `StaticPricingCalculator` computes cost from user-supplied per-token rates. `LiteLLMCostCalculator` in `maseval.interface.usage` for automatic pricing via LiteLLM's model database (supports `custom_pricing` overrides and `model_id_map`; requires `litellm`). Pass a `cost_calculator` to `ModelAdapter` or `AgentAdapter` to compute `Usage.cost`. Provider-reported cost always takes precedence. (PR: #45) - `AgentAdapter` now accepts `cost_calculator` and `model_id` parameters. For smolagents, CAMEL, and LlamaIndex, both are auto-detected from the framework's agent object (`LiteLLMCostCalculator` if litellm is installed). LangGraph requires explicit `model_id` since graphs can contain multiple models. Explicit parameters always override auto-detection. 
(PR: #45) diff --git a/docs/superpowers/plans/2026-03-22-add-instructor-library.md b/docs/superpowers/plans/2026-03-22-add-instructor-library.md new file mode 100644 index 00000000..bdf490e8 --- /dev/null +++ b/docs/superpowers/plans/2026-03-22-add-instructor-library.md @@ -0,0 +1,1087 @@ +# Add Instructor Library Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add the [instructor](https://github.com/567-labs/instructor) library as core infrastructure for structured LLM output handling — both internally and as a user-facing API. This is not a patch on top of existing code; it is a clean replacement. The custom JSON extraction, schema flattening, and retry logic are removed, not wrapped with fallbacks. + +**Why:** Reliable structured output from unreliable models is critical for researchers who use cheap/small models due to cost constraints. Hand-rolled JSON parsing and retry logic is finicky, under-tested, and reimplemented in multiple places. Instructor provides a battle-tested foundation (3M+ monthly downloads) for validation, retries with error feedback to the model, and multi-provider support. By making it core infrastructure, every future structured output need builds on this foundation rather than reinventing it. + +**Design principles:** +1. **Clean replacement, not a compatibility layer.** Per AGENTS.md: "Clean, maintainable code is the priority — not backwards compatibility." Old code (`_extract_json_object`, `_flatten_schema`, manual retry loops) is deleted, not preserved as fallbacks. If instructor handles it, the old path is gone. +2. **Infrastructure for future work.** This is a seed ecosystem — the integration points are designed so that upcoming features (partial streaming, custom validators, fallback models) plug in naturally. +3. 
**Follow existing patterns.** The `ModelAdapter` / provider adapter pattern stays. Instructor slots into this architecture cleanly via `_structured_chat()` overrides in each provider adapter. + +**Architecture:** Instructor wraps provider clients via `instructor.from_provider()` to add `response_model` support with automatic validation and retries. All providers use a unified API: `client.chat.completions.create(response_model=..., messages=...)`. We integrate at the `ModelAdapter` level: each adapter creates an instructor-patched client alongside the raw client. The public `chat()` method gains an optional `response_model` parameter. Simulators switch fully to `response_model` — no legacy JSON parsing fallback. Tau2 schema flattening uses instructor's `openai_schema()` as a base. + +**Tech Stack:** Python, instructor (>=1.14.0), pydantic (>=2.10.6, already a core dep) + +**Key instructor API facts (verified against v1.14.4):** +- `instructor.from_provider("provider/model")` — unified client creation (no `from_gemini` or `from_anthropic`) +- `instructor.from_openai(client)` — OpenAI-specific wrapping +- `instructor.from_litellm(completion_fn)` — LiteLLM wrapping +- All wrapped clients use `client.chat.completions.create(response_model=..., messages=...)` uniformly +- `instructor.openai_schema(MyModel)` — generate clean OpenAI-compatible schemas (returns object with `.openai_schema` dict containing `name`, `description`, `parameters`) +- Note: `openai_schema()` still produces `anyOf` for `Optional` fields — we keep `_flatten_schema()` as a thin utility for providers (like Google GenAI) that reject `anyOf` + +**Project conventions (from AGENTS.md):** +- Use `uv add` for dependencies, `uv run` for commands, never `pip install` +- Union syntax: `A | B`, optionals: `Optional[X]`, collections: `List`, `Dict` +- Core (`maseval/core/`) must NOT import from interface (`maseval/interface/`) +- `just all` before committing (format + lint + typecheck + test) + +--- + +## File 
Structure + +| File | Responsibility | Action | +|------|---------------|--------| +| `pyproject.toml` | Dependencies | Modify: add `instructor>=1.14.0` to core deps via `uv add` | +| `maseval/core/model.py` | ModelAdapter base + ChatResponse | Modify: add `response_model` param to `chat()`, add `_structured_chat()`, add `structured_response` field to ChatResponse | +| `maseval/core/instructor.py` | Instructor integration helpers | Create: `create_instructor_client()` helper, `flatten_model_schema()` | +| `maseval/interface/inference/openai.py` | OpenAI adapter | Modify: create instructor client, override `_structured_chat()` | +| `maseval/interface/inference/anthropic.py` | Anthropic adapter | Modify: create instructor client, override `_structured_chat()` | +| `maseval/interface/inference/google_genai.py` | Google adapter | Modify: create instructor client, override `_structured_chat()` | +| `maseval/interface/inference/litellm.py` | LiteLLM adapter | Modify: create instructor client, override `_structured_chat()` | +| `maseval/core/simulator.py` | LLM simulators | Modify: add Pydantic response models, use `response_model` in simulators (legacy JSON parsing deleted — no fallback) | +| `maseval/benchmark/tau2/tau2.py` | Tau2 benchmark | Modify: replace `_flatten_schema()` usage in both `_build_tool_definitions()` (line 897) and `_get_tool_definitions()` (line 1231) with `flatten_model_schema()` | +| `tests/test_core/test_instructor_integration.py` | Instructor integration tests | Create: test `response_model` on ModelAdapter | +| `tests/test_core/test_llm_simulator.py` | Existing simulator tests | Modify: add response model tests, verify existing tests pass | +| `CHANGELOG.md` | Changelog | Modify: add entry under Unreleased | + +--- + +## Task 1: Add instructor dependency and create integration module + +**Files:** +- Modify: `pyproject.toml:24-29` +- Create: `maseval/core/instructor.py` +- Test: `tests/test_core/test_instructor_integration.py` + +- [ ] **Step 1: Write failing test for 
instructor import** + +```python +# tests/test_core/test_instructor_integration.py +"""Test instructor library integration.""" +import pytest + + +@pytest.mark.core +class TestInstructorAvailable: + """Verify instructor is importable as a core dependency.""" + + def test_instructor_importable(self): + """instructor should be importable since it's a core dep.""" + import instructor + assert hasattr(instructor, "from_openai") + + def test_instructor_helpers_importable(self): + """maseval.core.instructor helpers should be importable.""" + from maseval.core.instructor import create_instructor_client + assert callable(create_instructor_client) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/test_core/test_instructor_integration.py -v -x` +Expected: FAIL — instructor not installed, module not found + +- [ ] **Step 3: Add instructor to core dependencies** + +Run: `uv add instructor` + +This updates both `pyproject.toml` and `uv.lock` automatically. + +- [ ] **Step 4: Create maseval/core/instructor.py** + +```python +"""Instructor library integration for structured LLM outputs. + +Provides helpers to create instructor-patched clients from provider SDK clients +and to generate flattened JSON schemas from Pydantic models. + +Instructor adds ``response_model`` support with automatic validation and retries +to any supported LLM provider. + +Example: + ```python + from maseval.core.instructor import create_instructor_client + + # Wrap an OpenAI client + import openai + client = openai.OpenAI() + instructor_client = create_instructor_client(client, provider="openai") + ``` +""" + +from __future__ import annotations + +from typing import Any, Optional, Dict + + +def create_instructor_client( + client: Any, + provider: str, + mode: Optional[str] = None, +) -> Any: + """Create an instructor-patched client from a provider SDK client. 
+ + All patched clients expose a unified API: + ``client.chat.completions.create(response_model=..., messages=...)``. + + Args: + client: The provider SDK client instance (e.g., ``openai.OpenAI()``, + ``anthropic.Anthropic()``). For LiteLLM, pass ``litellm.completion``. + provider: Provider name. One of: ``"openai"``, ``"litellm"``. + For other providers, use ``instructor.from_provider()`` directly. + mode: Optional instructor mode override. If None, uses the default + for the provider. + + Returns: + An instructor-patched client supporting ``response_model``. + + Raises: + ValueError: If provider is not recognized. + """ + import instructor + + kwargs: Dict[str, Any] = {} + if mode is not None: + kwargs["mode"] = getattr(instructor.Mode, mode.upper(), mode) + + if provider == "openai": + return instructor.from_openai(client, **kwargs) + elif provider == "litellm": + return instructor.from_litellm(client, **kwargs) + else: + raise ValueError( + f"Unsupported provider: {provider!r}. " + f"Use instructor.from_provider() directly for other providers." + ) + + +def flatten_model_schema(model: type) -> Dict[str, Any]: + """Generate a flattened JSON schema from a Pydantic model. + + Uses instructor's ``openai_schema`` to produce a clean schema, then + applies additional flattening to remove ``anyOf`` (for ``Optional`` + fields) and other constructs that some providers reject. + + This replaces the manual ``_flatten_schema()`` function that was + previously needed to post-process Pydantic v2 schemas. + + Args: + model: A Pydantic BaseModel subclass. + + Returns: + A flat JSON schema dict suitable for LLM tool parameters. + """ + import instructor + + schema_obj = instructor.openai_schema(model) + schema = schema_obj.openai_schema["parameters"] + + # instructor's openai_schema still produces anyOf for Optional fields. + # Flatten those for provider compatibility (especially Google GenAI). 
+ return _resolve_schema(schema) + + +def _resolve_schema(node: Any) -> Any: + """Recursively resolve anyOf and strip unsupported keys from a schema.""" + if not isinstance(node, dict): + return node + + _STRIP_KEYS = {"$defs", "additionalProperties", "title", "default"} + + # Simplify anyOf (Optional[X] -> X with nullable) + if "anyOf" in node: + variants = node["anyOf"] + non_null = [v for v in variants if not (isinstance(v, dict) and v.get("type") == "null")] + if len(non_null) == 1: + resolved = _resolve_schema(non_null[0]) + if isinstance(resolved, dict): + resolved["nullable"] = True + if "description" in node and "description" not in resolved: + resolved["description"] = node["description"] + return resolved + if non_null: + return _resolve_schema(non_null[0]) + + out: Dict[str, Any] = {} + for key, value in node.items(): + if key in _STRIP_KEYS or key == "anyOf": + continue + if isinstance(value, dict): + out[key] = _resolve_schema(value) + elif isinstance(value, list): + out[key] = [_resolve_schema(v) if isinstance(v, dict) else v for v in value] + else: + out[key] = value + return out +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `uv run pytest tests/test_core/test_instructor_integration.py -v -x` +Expected: PASS + +- [ ] **Step 6: Run full existing test suite** + +Run: `uv run pytest tests/test_core/ -v --tb=short` +Expected: All existing tests PASS + +- [ ] **Step 7: Commit** + +```bash +git add pyproject.toml uv.lock maseval/core/instructor.py tests/test_core/test_instructor_integration.py +git commit -m "feat: add instructor as core dependency with integration helpers" +``` + +--- + +## Task 2: Add response_model support to ChatResponse and ModelAdapter base + +**Files:** +- Modify: `maseval/core/model.py:62-135` (ChatResponse) +- Modify: `maseval/core/model.py:207-342` (ModelAdapter.chat) +- Test: `tests/test_core/test_instructor_integration.py` + +- [ ] **Step 1: Write failing tests for response_model support** + +Append to 
`tests/test_core/test_instructor_integration.py`: + +```python +from pydantic import BaseModel +from conftest import DummyModelAdapter +from maseval.core.model import ChatResponse + + +class WeatherResponse(BaseModel): + city: str + temperature: float + unit: str + + +@pytest.mark.core +class TestChatResponseStructured: + """Test ChatResponse with structured_response field.""" + + def test_chat_response_has_structured_response_field(self): + """ChatResponse should have an optional structured_response field.""" + resp = ChatResponse(content='{"city": "Paris"}') + assert resp.structured_response is None + + def test_chat_response_with_structured_response(self): + """ChatResponse can hold a parsed Pydantic model.""" + weather = WeatherResponse(city="Paris", temperature=20.0, unit="celsius") + resp = ChatResponse(content='{"city": "Paris"}', structured_response=weather) + assert resp.structured_response is not None + assert resp.structured_response.city == "Paris" + + +@pytest.mark.core +class TestModelAdapterResponseModel: + """Test ModelAdapter.chat() with response_model parameter.""" + + def test_chat_accepts_response_model_param(self): + """chat() should accept a response_model keyword argument.""" + import inspect + from maseval.core.model import ModelAdapter + sig = inspect.signature(ModelAdapter.chat) + assert "response_model" in sig.parameters + + def test_chat_without_response_model_unchanged(self): + """chat() without response_model behaves exactly as before.""" + model = DummyModelAdapter(responses=["Hello"]) + result = model.chat([{"role": "user", "content": "Hi"}]) + assert isinstance(result, ChatResponse) + assert result.content == "Hello" + assert result.structured_response is None +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_core/test_instructor_integration.py::TestChatResponseStructured -v -x` +Expected: FAIL — `structured_response` field doesn't exist + +- [ ] **Step 3: Add structured_response field to 
ChatResponse** + +In `maseval/core/model.py`, add to the `ChatResponse` dataclass (after `stop_reason` on line 105): + +```python + structured_response: Optional[Any] = None +``` + +Update the docstring to include: + +``` + structured_response: The validated Pydantic model instance when + ``response_model`` was used with ``chat()``. None otherwise. +``` + +- [ ] **Step 4: Add response_model and max_retries parameters to ModelAdapter.chat()** + +In `maseval/core/model.py`, modify the `chat()` method signature (lines 207-214) to: + +```python + def chat( + self, + messages: Union[List[Dict[str, Any]], MessageHistory], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + response_model: Optional[type] = None, + max_retries: int = 3, + **kwargs: Any, + ) -> ChatResponse: +``` + +Update the docstring Args section to include: + +``` + response_model: Optional Pydantic BaseModel class. When provided, + the model's response is validated against this schema and + returned in ``ChatResponse.structured_response``. Uses + instructor for automatic validation and retries. + max_retries: Number of retries on validation failure when using + ``response_model``. Default is 3. Ignored without ``response_model``. 
+``` + +In the `try` block (around line 284), add branching for response_model: + +```python + try: + if response_model is not None: + result = self._structured_chat( + messages_list, + response_model=response_model, + max_retries=max_retries, + generation_params=generation_params, + tools=tools, + tool_choice=tool_choice, + **kwargs, + ) + else: + result = self._chat_impl( + messages_list, + generation_params=generation_params, + tools=tools, + tool_choice=tool_choice, + **kwargs, + ) +``` + +- [ ] **Step 5: Add _structured_chat() method to ModelAdapter** + +Add after `_chat_impl` (around line 367): + +```python + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Internal structured chat using instructor. + + Subclasses that support instructor must override this method. + The base implementation raises ``NotImplementedError`` — there is + no silent fallback to unstructured output. + + Args: + messages: List of message dicts. + response_model: Pydantic model class for response validation. + max_retries: Number of retries on validation failure. + generation_params: Generation parameters. + tools: Tool definitions, if any. + tool_choice: Tool choice setting, if any. + **kwargs: Additional arguments. + + Returns: + ChatResponse with ``structured_response`` populated. + + Raises: + NotImplementedError: If the subclass does not override this method. + """ + # Base class raises — subclasses must override with their + # instructor-patched client. No silent fallback to unstructured output. + raise NotImplementedError( + f"{type(self).__name__} does not support response_model. " + f"Override _structured_chat() with an instructor-patched client."
+ ) +``` + +- [ ] **Step 6: Run tests to verify they pass** + +Run: `uv run pytest tests/test_core/test_instructor_integration.py -v -x` +Expected: PASS + +- [ ] **Step 7: Run full existing test suite** + +Run: `uv run pytest tests/test_core/ -v --tb=short` +Expected: All existing tests still PASS (the new `structured_response` field defaults to None) + +- [ ] **Step 8: Commit** + +```bash +git add maseval/core/model.py tests/test_core/test_instructor_integration.py +git commit -m "feat: add response_model support to ModelAdapter.chat() and ChatResponse" +``` + +--- + +## Task 3: Implement instructor support in provider adapters + +**Files:** +- Modify: `maseval/interface/inference/openai.py` +- Modify: `maseval/interface/inference/anthropic.py` +- Modify: `maseval/interface/inference/google_genai.py` +- Modify: `maseval/interface/inference/litellm.py` +- Test: `tests/test_core/test_instructor_integration.py` + +**Important:** All provider adapters use instructor's unified API after wrapping. `instructor.from_provider("provider/model")` returns an `Instructor` instance where all calls go through `client.chat.completions.create(response_model=..., messages=...)` regardless of the underlying provider. For OpenAI, we use `instructor.from_openai(client)`. For LiteLLM, `instructor.from_litellm(litellm.completion)`. For Anthropic and Google, we use `instructor.from_provider()` since there are no dedicated `from_anthropic`/`from_gemini` functions in current instructor. 
+ +- [ ] **Step 1: Write failing tests for provider adapter instructor support** + +Append to `tests/test_core/test_instructor_integration.py`: + +```python +from unittest.mock import MagicMock + + +@pytest.mark.core +class TestOpenAIInstructorSupport: + """Test OpenAI adapter creates instructor client.""" + + def test_openai_adapter_has_instructor_client(self): + """OpenAIModelAdapter should create an instructor-patched client.""" + from maseval.interface.inference import OpenAIModelAdapter + mock_client = MagicMock() + adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") + assert hasattr(adapter, "_instructor_client") + + def test_openai_adapter_structured_chat_uses_instructor(self): + """OpenAIModelAdapter._structured_chat should use instructor client.""" + from maseval.interface.inference import OpenAIModelAdapter + mock_client = MagicMock() + adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") + + # Mock the instructor client + mock_response = WeatherResponse(city="Paris", temperature=20.0, unit="celsius") + adapter._instructor_client = MagicMock() + adapter._instructor_client.chat.completions.create.return_value = mock_response + + result = adapter.chat( + [{"role": "user", "content": "Weather in Paris?"}], + response_model=WeatherResponse, + ) + + assert result.structured_response is not None + assert result.structured_response.city == "Paris" + adapter._instructor_client.chat.completions.create.assert_called_once() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_core/test_instructor_integration.py::TestOpenAIInstructorSupport -v -x` +Expected: FAIL — no `_instructor_client` attribute + +- [ ] **Step 3: Add instructor support to OpenAIModelAdapter** + +In `maseval/interface/inference/openai.py`: + +Add to `__init__` (after existing setup, around line 92): +```python + # Create instructor-patched client for structured outputs + from maseval.core.instructor import create_instructor_client + 
self._instructor_client = create_instructor_client(client, provider="openai") +``` + +Add `_structured_chat` override (after `_chat_impl`): +```python + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Use instructor for structured output with validation and retries.""" + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + if self._seed is not None and "seed" not in params: + params["seed"] = self._seed + + result = self._instructor_client.chat.completions.create( + model=self._model_id, + messages=messages, + response_model=response_model, + max_retries=max_retries, + **params, + ) + + # result is a validated Pydantic model instance + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) +``` + +- [ ] **Step 4: Add instructor support to AnthropicModelAdapter** + +In `maseval/interface/inference/anthropic.py`: + +Add to `__init__` (after existing setup, around line 109): +```python + # Create instructor-patched client for structured outputs + import instructor + self._instructor_client = instructor.from_provider("anthropic/" + model_id) +``` + +Note: We use `from_provider` since there's no `from_anthropic` in current instructor. 
+ +Add `_structured_chat` override: +```python + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Use instructor for structured output with validation and retries.""" + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + max_tokens = params.pop("max_tokens", self._max_tokens) + params["max_tokens"] = max_tokens + + result = self._instructor_client.chat.completions.create( + response_model=response_model, + messages=messages, + max_retries=max_retries, + **params, + ) + + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) +``` + +- [ ] **Step 5: Add instructor support to GoogleGenAIModelAdapter** + +In `maseval/interface/inference/google_genai.py`: + +Add to `__init__` (after existing setup, around line 85): +```python + # Create instructor-patched client for structured outputs + import instructor + self._instructor_client = instructor.from_provider("gemini/" + model_id) +``` + +Add `_structured_chat` override: +```python + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Use instructor for structured output with validation and retries.""" + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + if self._seed is not None and "seed" not in params: + params["seed"] = self._seed + + result = 
self._instructor_client.chat.completions.create( + response_model=response_model, + messages=messages, + max_retries=max_retries, + **params, + ) + + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) +``` + +- [ ] **Step 6: Add instructor support to LiteLLMModelAdapter** + +In `maseval/interface/inference/litellm.py`: + +Add to `__init__` (after existing setup, around line 101): +```python + # Create instructor-patched completion function for structured outputs. + # Deferred to first use since litellm is an optional import. + self._instructor_client = None +``` + +Add helper + `_structured_chat` override: +```python + def _get_instructor_client(self) -> Any: + """Lazily create instructor-patched LiteLLM client.""" + if self._instructor_client is None: + try: + import litellm + except ImportError as e: + raise ImportError("LiteLLM is not installed. Install with: pip install maseval[litellm]") from e + from maseval.core.instructor import create_instructor_client + self._instructor_client = create_instructor_client(litellm.completion, provider="litellm") + return self._instructor_client + + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Use instructor for structured output with validation and retries.""" + client = self._get_instructor_client() + + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + if self._seed is not None and "seed" not in params: + params["seed"] = self._seed + if self._api_key: + params["api_key"] = self._api_key + if self._api_base: + params["api_base"] = self._api_base + + result = client( + model=self._model_id, + 
messages=messages, + response_model=response_model, + max_retries=max_retries, + **params, + ) + + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) +``` + +- [ ] **Step 7: Run tests** + +Run: `uv run pytest tests/test_core/test_instructor_integration.py -v -x` +Expected: PASS + +- [ ] **Step 8: Run full test suite** + +Run: `uv run pytest tests/test_core/ -v --tb=short` +Expected: All PASS + +- [ ] **Step 9: Commit** + +```bash +git add maseval/interface/inference/openai.py maseval/interface/inference/anthropic.py maseval/interface/inference/google_genai.py maseval/interface/inference/litellm.py tests/test_core/test_instructor_integration.py +git commit -m "feat: add instructor support to all provider adapters" +``` + +--- + +## Task 4: Rework simulators to use instructor + +**Files:** +- Modify: `maseval/core/simulator.py` +- Test: `tests/test_core/test_llm_simulator.py` + +**Context:** Simulators currently use `model.generate()` (text-in/text-out) and manually parse JSON using `_extract_json_object()` + `json.loads()`. With instructor, we switch fully to `model.chat(messages=[...], response_model=OutputModel)` to get validated Pydantic models directly. The old `_extract_json_object()`, `_parse_output()`, and manual retry logic are deleted — instructor handles validation and retries. 
+ +- [ ] **Step 1: Write failing test for Pydantic response models** + +Append to `tests/test_core/test_llm_simulator.py`: + +```python +@pytest.mark.core +class TestSimulatorResponseModels: + """Test that simulator response Pydantic models work correctly.""" + + def test_tool_simulator_response_model_exists(self): + from maseval.core.simulator import ToolSimulatorResponse + resp = ToolSimulatorResponse(text="success", details={"key": "value"}) + assert resp.text == "success" + assert resp.details == {"key": "value"} + + def test_user_simulator_response_model_exists(self): + from maseval.core.simulator import UserSimulatorResponse + resp = UserSimulatorResponse(text="I need help") + assert resp.text == "I need help" + + def test_agentic_user_simulator_response_model_exists(self): + from maseval.core.simulator import AgenticUserSimulatorResponse + resp = AgenticUserSimulatorResponse( + text="Let me check", + tool_calls=[{"name": "check_status", "arguments": {}}], + ) + assert resp.text == "Let me check" + assert len(resp.tool_calls) == 1 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/test_core/test_llm_simulator.py::TestSimulatorResponseModels -v -x` +Expected: FAIL — models don't exist yet + +- [ ] **Step 3: Add Pydantic response models to simulator.py** + +In `maseval/core/simulator.py`, after imports (before `_extract_json_object`), add: + +```python +from pydantic import BaseModel, Field + + +class ToolSimulatorResponse(BaseModel): + """Expected output format for ToolLLMSimulator.""" + text: str = Field(default="", description="Human-readable description of the tool's output") + details: Dict[str, Any] = Field(default_factory=dict, description="Structured tool output data") + + +class UserSimulatorResponse(BaseModel): + """Expected output format for UserLLMSimulator.""" + text: str = Field(default="", description="The user's response text") + + +class AgenticUserSimulatorResponse(BaseModel): + """Expected output format for 
AgenticUserLLMSimulator."""
+    text: str = Field(default="", description="The user's response text")
+    tool_calls: List[Dict[str, Any]] = Field(default_factory=list, description="List of tool calls")
+```
+
+- [ ] **Step 4: Add _response_model and _parse_structured_response to LLMSimulator base**
+
+In `LLMSimulator` class, add class attribute:
+```python
+    _response_model: Optional[type] = None
+```
+
+Add method:
+```python
+    def _parse_structured_response(self, response: Any) -> Any:
+        """Convert instructor-validated response to expected return format.
+
+        Override in subclasses to convert the Pydantic model instance
+        to the format expected by callers.
+        """
+        return response
+```
+
+- [ ] **Step 5: Rewrite LLMSimulator.__call__ to use instructor directly**
+
+Replace the retry loop with a single structured call. No legacy fallback — instructor handles validation and retries via `response_model`. Delete `_extract_json_object()` and `_parse_output()` methods entirely.
+
+```python
+            try:
+                chat_result = self.model.chat(
+                    messages=[{"role": "user", "content": prompt}],
+                    response_model=self._response_model,
+                    max_retries=self.max_try,
+                    generation_params=generation_params,
+                )
+                parsed_result = self._parse_structured_response(chat_result.structured_response)
+                entry["raw_output"] = chat_result.content
+                entry["parsed_output"] = parsed_result
+                entry["status"] = SimulatorCallStatus.Successful.value
+            except Exception as e:
+                entry["raw_output"] = None
+                entry["status"] = SimulatorCallStatus.ModelCallError.value
+                entry["error"] = str(e)
+            self.logs.append(entry)
+```
+
+- [ ] **Step 6a: Delete legacy parsing code**
+
+Remove from `simulator.py`:
+- `_extract_json_object()` function (lines 13-27)
+- `_parse_output()` methods from all simulator subclasses
+- Manual JSON retry logic in `__call__` (the old `json.loads` / `json.JSONDecodeError` paths)
+
+- [ ] **Step 6b: Wire up response models in simulator subclasses**
+
+In `ToolLLMSimulator`:
+```python
+    _response_model = 
ToolSimulatorResponse
+
+    def _parse_structured_response(self, response: ToolSimulatorResponse) -> Tuple[str, Dict[str, Any]]:
+        return response.text, response.details
+```
+
+In `UserLLMSimulator`:
+```python
+    _response_model = UserSimulatorResponse
+
+    def _parse_structured_response(self, response: UserSimulatorResponse) -> str:
+        return response.text
+```
+
+In `AgenticUserLLMSimulator`:
+```python
+    _response_model = AgenticUserSimulatorResponse
+
+    def _parse_structured_response(self, response: AgenticUserSimulatorResponse) -> Tuple[str, List[Dict[str, Any]]]:
+        return response.text, response.tool_calls
+```
+
+- [ ] **Step 7: Run all simulator tests**
+
+Run: `uv run pytest tests/test_core/test_llm_simulator.py -v --tb=short`
+Expected: All PASS (the legacy `generate()` JSON path is removed — update `DummyModelAdapter` in the tests to return structured output via `chat(response_model=...)`)
+
+- [ ] **Step 8: Run full core test suite**
+
+Run: `uv run pytest tests/test_core/ -v --tb=short`
+Expected: All PASS
+
+- [ ] **Step 9: Commit**
+
+```bash
+git add maseval/core/simulator.py tests/test_core/test_llm_simulator.py
+git commit -m "feat: rework simulators to use instructor for structured output"
+```
+
+---
+
+## Task 5: Replace _flatten_schema in Tau2
+
+**Files:**
+- Modify: `maseval/benchmark/tau2/tau2.py:781-902` and line 1231
+- Test: `tests/test_core/test_instructor_integration.py`
+
+**Context:** `_flatten_schema()` is called in two places:
+1. `_build_tool_definitions()` at line 897
+2. `_get_tool_definitions()` at line 1231
+
+Both must be updated to use `flatten_model_schema()` from `maseval.core.instructor`. 
+ +- [ ] **Step 1: Write failing test for instructor-based schema generation** + +Append to `tests/test_core/test_instructor_integration.py`: + +```python +@pytest.mark.core +class TestInstructorSchemaGeneration: + """Test that flatten_model_schema produces clean schemas.""" + + def test_generates_clean_schema(self): + from pydantic import BaseModel, Field + from typing import Optional + from maseval.core.instructor import flatten_model_schema + + class OrderParams(BaseModel): + order_id: str = Field(description="The order ID") + status: Optional[str] = Field(default=None, description="Filter by status") + + flat = flatten_model_schema(OrderParams) + assert "$ref" not in str(flat) + assert "$defs" not in str(flat) + assert "anyOf" not in str(flat) + assert flat["type"] == "object" + assert "order_id" in flat["properties"] + + def test_handles_nested_models(self): + from pydantic import BaseModel, Field + from maseval.core.instructor import flatten_model_schema + + class Address(BaseModel): + street: str + city: str + + class Person(BaseModel): + name: str + address: Address + + flat = flatten_model_schema(Person) + assert "$ref" not in str(flat) + assert "address" in flat["properties"] +``` + +- [ ] **Step 2: Run test to verify it passes (flatten_model_schema was created in Task 1)** + +Run: `uv run pytest tests/test_core/test_instructor_integration.py::TestInstructorSchemaGeneration -v -x` +Expected: PASS (function already exists from Task 1) + +- [ ] **Step 3: Replace _flatten_schema calls in tau2.py** + +In `maseval/benchmark/tau2/tau2.py`: + +1. Add import at the top of the `_build_tool_definitions` function (replace the existing `from typing import Any as TypingAny` line area): +```python + from maseval.core.instructor import flatten_model_schema +``` + +2. Replace line 897: +```python +# Before: +"parameters": _flatten_schema(params_model.model_json_schema()), +# After: +"parameters": flatten_model_schema(params_model), +``` + +3. 
Replace line 1231 (in `_get_tool_definitions()`): +```python +# Before: +"parameters": _flatten_schema(params_model.model_json_schema()), +# After: +from maseval.core.instructor import flatten_model_schema +... +"parameters": flatten_model_schema(params_model), +``` + +(Add the import once at the top of `_get_tool_definitions`, not inline at line 1231.) + +- [ ] **Step 4: Delete _flatten_schema() function** + +Remove the `_flatten_schema()` function (lines 781-834) from `tau2.py`. + +- [ ] **Step 5: Run tests** + +Run: `uv run pytest tests/test_core/ -v --tb=short` +Expected: PASS + +Run: `uv run pytest tests/ -v --tb=short -m "not (slow or credentialed or smoke)"` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add maseval/benchmark/tau2/tau2.py tests/test_core/test_instructor_integration.py +git commit -m "feat: replace _flatten_schema with instructor-based schema generation in Tau2" +``` + +--- + +## Task 6: Update exports and changelog + +**Files:** +- Modify: `maseval/core/__init__.py` (if it has explicit exports) +- Modify: `CHANGELOG.md` + +- [ ] **Step 1: Check and update exports** + +Check what's currently exported from `maseval/core/__init__.py` and `maseval/__init__.py`. If they have explicit `__all__` or import statements, add: + +```python +from .instructor import create_instructor_client, flatten_model_schema +``` + +Also export the simulator response models if they're useful to users: +```python +from .simulator import ToolSimulatorResponse, UserSimulatorResponse, AgenticUserSimulatorResponse +``` + +- [ ] **Step 2: Update CHANGELOG.md** + +Add under `## Unreleased`: + +```markdown +### Added + +- Added `instructor` as a core dependency for structured LLM output handling with automatic validation and retries. +- Added `response_model` parameter to `ModelAdapter.chat()` — pass a Pydantic `BaseModel` class to get validated structured outputs via `ChatResponse.structured_response`. 
+- Added `structured_response` field to `ChatResponse` for accessing parsed Pydantic model instances.
+- Added `maseval.core.instructor` module with `create_instructor_client()` and `flatten_model_schema()` helpers.
+- Added Pydantic response models for simulators: `ToolSimulatorResponse`, `UserSimulatorResponse`, `AgenticUserSimulatorResponse`.
+- Simulators now use instructor for structured output parsing with automatic validation and retries (legacy JSON extraction removed).
+
+### Changed
+
+- Replaced manual `_flatten_schema()` in Tau2 benchmark with instructor-based `flatten_model_schema()`.
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add maseval/core/__init__.py maseval/__init__.py CHANGELOG.md
+git commit -m "chore: update exports and changelog for instructor integration"
+```
+
+---
+
+## Task 7: Final validation
+
+- [ ] **Step 1: Run linter and formatter**
+
+Run: `uv run ruff format . && uv run ruff check . --fix`
+
+- [ ] **Step 2: Run type checker**
+
+Run: `uv run ty check`
+
+- [ ] **Step 3: Run full test suite**
+
+Run: `uv run pytest tests/ -v --tb=short -m "not (slow or credentialed or smoke)"`
+Expected: All PASS
+
+- [ ] **Step 4: Verify end-to-end import**
+
+Run:
+```bash
+uv run python3 -c "
+from maseval.core.instructor import create_instructor_client, flatten_model_schema
+from maseval.core.model import ChatResponse, ModelAdapter
+from maseval.core.simulator import ToolSimulatorResponse, UserSimulatorResponse, AgenticUserSimulatorResponse
+print('All imports successful')
+
+from pydantic import BaseModel, Field
+class TestModel(BaseModel):
+    name: str = Field(description='A name')
+    age: int = Field(description='An age')
+
+schema = flatten_model_schema(TestModel)
+print(f'Schema: {schema}')
+assert 'anyOf' not in str(schema)
+print('Schema generation works correctly')
+"
+```
+
+- [ ] **Step 5: Run just all (format + lint + typecheck + test)**
+
+Run: `just all`
+
+- [ ] **Step 6: Review git log**
+
+Run: `git log --oneline main..HEAD`
+Expected: Clean 
series of feature commits + +- [ ] **Step 7: Final cleanup commit if needed** + +```bash +git status +# Only commit if there are changes +git diff --cached --quiet || git commit -m "chore: final cleanup for instructor integration" +``` diff --git a/maseval/__init__.py b/maseval/__init__.py index c50012ac..f76b93af 100644 --- a/maseval/__init__.py +++ b/maseval/__init__.py @@ -31,7 +31,11 @@ SimulatorError, ToolSimulatorError, UserSimulatorError, + ToolSimulatorResponse, + UserSimulatorResponse, + AgenticUserSimulatorResponse, ) +from .core.instructor import create_instructor_client, flatten_model_schema from .core.model import ModelAdapter, ChatResponse from .core.user import User, LLMUser, AgenticLLMUser, TerminationReason from .core.evaluator import Evaluator @@ -108,6 +112,13 @@ # Model adapters "ModelAdapter", "ChatResponse", + # Instructor integration + "create_instructor_client", + "flatten_model_schema", + # Simulator response models + "ToolSimulatorResponse", + "UserSimulatorResponse", + "AgenticUserSimulatorResponse", # Exceptions and validation "MASEvalError", "AgentError", diff --git a/maseval/benchmark/tau2/domains/base.py b/maseval/benchmark/tau2/domains/base.py index 06ded629..4ed2f57c 100644 --- a/maseval/benchmark/tau2/domains/base.py +++ b/maseval/benchmark/tau2/domains/base.py @@ -132,59 +132,6 @@ def decorator(func: Callable) -> Callable: return decorator -# ============================================================================= -# JSON Schema Helpers (for get_tool_metadata) -# ============================================================================= - - -def _resolve_node(node: Any, defs: Dict[str, Any]) -> Any: - """Resolve a JSON schema node, inlining ``$ref`` references and simplifying ``anyOf``.""" - if not isinstance(node, dict): - return node - - if "$ref" in node: - ref_name = node["$ref"].rsplit("/", 1)[-1] - if ref_name in defs: - return _resolve_node(dict(defs[ref_name]), defs) - return node - - # Simplify anyOf (typically 
Optional[X] -> X with nullable flag) - if "anyOf" in node: - variants = node["anyOf"] - non_null = [v for v in variants if not (isinstance(v, dict) and v.get("type") == "null")] - if len(non_null) == 1: - resolved = _resolve_node(non_null[0], defs) - resolved["nullable"] = True - if "description" in node and "description" not in resolved: - resolved["description"] = node["description"] - return resolved - if non_null: - return _resolve_node(non_null[0], defs) - - out: Dict[str, Any] = {} - for key, value in node.items(): - if key in ("$defs", "title", "default"): - continue - if isinstance(value, dict): - out[key] = _resolve_node(value, defs) - elif isinstance(value, list): - out[key] = [_resolve_node(v, defs) if isinstance(v, dict) else v for v in value] - else: - out[key] = value - return out - - -def _resolve_schema_properties(schema: Dict[str, Any]) -> Dict[str, Any]: - """Extract resolved per-parameter schemas from a JSON Schema ``properties`` block.""" - defs = schema.get("$defs", {}) - properties = schema.get("properties", {}) - - resolved: Dict[str, Any] = {} - for name, prop in properties.items(): - resolved[name] = _resolve_node(prop, defs) - return resolved - - # ============================================================================= # ToolKit Base Class # ============================================================================= @@ -413,10 +360,11 @@ def get_tool_metadata(self, tool_name: str) -> Dict[str, Any]: model_fields[param_name] = (anno, default) params_model = create_model("parameters", **model_fields) - schema = params_model.model_json_schema() # Resolve $ref/$defs and extract per-parameter schemas - inputs = _resolve_schema_properties(schema) + from maseval.core.instructor import flatten_model_schema + + inputs = flatten_model_schema(params_model).get("properties", {}) return { "description": description, diff --git a/maseval/benchmark/tau2/tau2.py b/maseval/benchmark/tau2/tau2.py index 555e41f7..b2bf5c4c 100644 --- 
a/maseval/benchmark/tau2/tau2.py +++ b/maseval/benchmark/tau2/tau2.py @@ -72,6 +72,7 @@ def get_model_adapter(self, model_id, **kwargs): from maseval.core.exceptions import UserExhaustedError from maseval.core.seeding import DefaultSeedGenerator, SeedGenerator +from maseval.core.instructor import flatten_model_schema from maseval.benchmark.tau2.environment import Tau2Environment from maseval.benchmark.tau2.evaluator import Tau2Evaluator @@ -778,62 +779,6 @@ def execution_loop( # type: ignore[override] """.strip() -def _flatten_schema(schema: Dict[str, Any]) -> Dict[str, Any]: - """Flatten a JSON schema by inlining ``$ref`` references and removing unsupported keys. - - Pydantic v2's ``model_json_schema()`` emits ``$ref`` / ``$defs`` for nested - models and ``anyOf`` for ``Optional`` fields. Google GenAI rejects all of - these. This helper recursively resolves them so the resulting schema is - self-contained and compatible with every provider. - - Args: - schema: A JSON schema dict (typically from ``Model.model_json_schema()``). - - Returns: - A flattened copy with ``$ref``, ``$defs``, ``anyOf``, ``title``, - ``default``, and ``additionalProperties`` removed / inlined. 
- """ - _STRIP_KEYS = {"$defs", "additionalProperties", "title", "default"} - - def _resolve(node: Any, defs: Dict[str, Any]) -> Any: - if not isinstance(node, dict): - return node - - # Inline $ref - if "$ref" in node: - ref_name = node["$ref"].rsplit("/", 1)[-1] - if ref_name in defs: - return _resolve(dict(defs[ref_name]), defs) - return node - - # Simplify anyOf (Optional[X] → X with nullable) - if "anyOf" in node: - variants = node["anyOf"] - non_null = [v for v in variants if not (isinstance(v, dict) and v.get("type") == "null")] - if len(non_null) == 1: - resolved = _resolve(non_null[0], defs) - resolved["nullable"] = True - if "description" in node and "description" not in resolved: - resolved["description"] = node["description"] - return resolved - if non_null: - return _resolve(non_null[0], defs) - - out: Dict[str, Any] = {} - for key, value in node.items(): - if key in _STRIP_KEYS or key == "anyOf": - continue - if isinstance(value, dict): - out[key] = _resolve(value, defs) - elif isinstance(value, list): - out[key] = [_resolve(v, defs) if isinstance(v, dict) else v for v in value] - else: - out[key] = value - return out - - return _resolve(schema, schema.get("$defs", {})) - - def _build_tool_definitions(tools: Dict[str, Callable]) -> List[Dict[str, Any]]: """Build OpenAI-format tool definitions from a dict of callables. 
@@ -894,7 +839,7 @@ def _build_tool_definitions(tools: Dict[str, Callable]) -> List[Dict[str, Any]]: "function": { "name": name, "description": description, - "parameters": _flatten_schema(params_model.model_json_schema()), + "parameters": flatten_model_schema(params_model), }, } ) @@ -1228,7 +1173,7 @@ def _get_tool_definitions(self) -> List[Dict[str, Any]]: "function": { "name": name, "description": description, - "parameters": _flatten_schema(params_model.model_json_schema()), + "parameters": flatten_model_schema(params_model), }, } ) diff --git a/maseval/core/instructor.py b/maseval/core/instructor.py new file mode 100644 index 00000000..9e284fec --- /dev/null +++ b/maseval/core/instructor.py @@ -0,0 +1,117 @@ +"""Instructor library integration for structured LLM outputs. + +Provides helpers to create instructor-patched clients from provider SDK clients +and to generate flattened JSON schemas from Pydantic models. + +Instructor adds ``response_model`` support with automatic validation and retries +to any supported LLM provider. + +Example: + ```python + from maseval.core.instructor import create_instructor_client + + # Wrap an OpenAI client + import openai + client = openai.OpenAI() + instructor_client = create_instructor_client(client, provider="openai") + ``` +""" + +from __future__ import annotations + +from typing import Any, Optional, Dict + + +def create_instructor_client( + client: Any, + provider: str, + mode: Optional[str] = None, +) -> Any: + """Create an instructor-patched client from a provider SDK client. + + All patched clients expose a unified API: + ``client.chat.completions.create(response_model=..., messages=...)``. + + Args: + client: The provider SDK client instance (e.g., ``openai.OpenAI()``, + ``anthropic.Anthropic()``). For LiteLLM, pass ``litellm.completion``. + provider: Provider name. One of: ``"openai"``, ``"litellm"``. + For other providers, use ``instructor.from_provider()`` directly. + mode: Optional instructor mode override. 
If None, uses the default + for the provider. + + Returns: + An instructor-patched client supporting ``response_model``. + + Raises: + ValueError: If provider is not recognized. + """ + import instructor + + kwargs: Dict[str, Any] = {} + if mode is not None: + kwargs["mode"] = getattr(instructor.Mode, mode.upper(), mode) + + if provider == "openai": + return instructor.from_openai(client, **kwargs) + elif provider == "litellm": + return instructor.from_litellm(client, **kwargs) + else: + raise ValueError(f"Unsupported provider: {provider!r}. Use instructor.from_provider() directly for other providers.") + + +def flatten_model_schema(model: type) -> Dict[str, Any]: + """Generate a flattened JSON schema from a Pydantic model. + + Uses instructor's ``openai_schema`` to produce a clean schema, then + applies additional flattening to remove ``anyOf`` (for ``Optional`` + fields) and other constructs that some providers reject. + + Args: + model: A Pydantic BaseModel subclass. + + Returns: + A flat JSON schema dict suitable for LLM tool parameters. + """ + import instructor + + schema_obj = instructor.openai_schema(model) + schema = schema_obj.openai_schema["parameters"] + + # instructor's openai_schema still produces anyOf for Optional fields. + # Flatten those for provider compatibility (especially Google GenAI). 
+ return _resolve_schema(schema) + + +def _resolve_schema(node: Any) -> Any: + """Recursively resolve anyOf and strip unsupported keys from a schema.""" + if not isinstance(node, dict): + return node + + _STRIP_KEYS = {"$defs", "additionalProperties", "title", "default"} + + # Simplify anyOf (Optional[X] -> X with nullable) + if "anyOf" in node: + variants = node["anyOf"] + non_null = [v for v in variants if not (isinstance(v, dict) and v.get("type") == "null")] + if len(non_null) == 1: + resolved = _resolve_schema(non_null[0]) + if isinstance(resolved, dict): + resolved["nullable"] = True + if "description" in node and "description" not in resolved: + resolved["description"] = node["description"] + return resolved + if non_null: + return _resolve_schema(non_null[0]) + + out: Dict[str, Any] = {} + for key, value in node.items(): + if key in _STRIP_KEYS or key == "anyOf": + continue + if isinstance(value, dict): + out[key] = _resolve_schema(value) + elif isinstance(value, list): + out[key] = [_resolve_schema(v) if isinstance(v, dict) else v for v in value] + else: + out[key] = value + return out diff --git a/maseval/core/model.py b/maseval/core/model.py index 528f5ac2..2f668039 100644 --- a/maseval/core/model.py +++ b/maseval/core/model.py @@ -79,6 +79,8 @@ class ChatResponse: model: The model ID that generated this response, if available. stop_reason: Why the model stopped generating. Common values: 'end_turn', 'tool_use', 'max_tokens', 'stop_sequence'. + structured_response: The validated Pydantic model instance when + ``response_model`` was used with ``chat()``. None otherwise. Example: ```python @@ -103,6 +105,7 @@ class ChatResponse: usage: Optional[Dict[str, Any]] = None model: Optional[str] = None stop_reason: Optional[str] = None + structured_response: Optional[Any] = None def to_message(self) -> Dict[str, Any]: """Convert this response to an OpenAI-compatible message dict. 
@@ -210,6 +213,8 @@ def chat( generation_params: Optional[Dict[str, Any]] = None, tools: Optional[List[Dict[str, Any]]] = None, tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + response_model: Optional[type] = None, + max_retries: int = 3, **kwargs: Any, ) -> ChatResponse: """Send messages to the model and get a response. @@ -232,6 +237,12 @@ def chat( - "none": Model won't use tools - "required": Model must use a tool - {"type": "function", "function": {"name": "..."}}: Use specific tool + response_model: Optional Pydantic BaseModel class. When provided, + the model's response is validated against this schema and + returned in ``ChatResponse.structured_response``. Uses + instructor for automatic validation and retries. + max_retries: Number of retries on validation failure when using + ``response_model``. Default is 3. Ignored without ``response_model``. **kwargs: Additional provider-specific arguments. Returns: @@ -282,13 +293,24 @@ def chat( messages_list = messages try: - result = self._chat_impl( - messages_list, - generation_params=generation_params, - tools=tools, - tool_choice=tool_choice, - **kwargs, - ) + if response_model is not None: + result = self._structured_chat( + messages_list, + response_model=response_model, + max_retries=max_retries, + generation_params=generation_params, + tools=tools, + tool_choice=tool_choice, + **kwargs, + ) + else: + result = self._chat_impl( + messages_list, + generation_params=generation_params, + tools=tools, + tool_choice=tool_choice, + **kwargs, + ) duration = time.time() - start_time self.logs.append( @@ -366,6 +388,41 @@ def _chat_impl( ChatResponse with the model's output. 
""" + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Internal structured chat using instructor. + + Subclasses that support instructor should override this method + with their instructor-patched client. The base class raises + ``NotImplementedError`` — no silent fallback to unstructured output. + + Args: + messages: List of message dicts. + response_model: Pydantic model class for response validation. + max_retries: Number of retries on validation failure. + generation_params: Generation parameters. + tools: Tool definitions, if any. + tool_choice: Tool choice setting, if any. + **kwargs: Additional arguments. + + Returns: + ChatResponse with ``structured_response`` populated. + + Raises: + NotImplementedError: If the adapter does not support ``response_model``. + """ + raise NotImplementedError( + f"{type(self).__name__} does not support response_model. Override _structured_chat() with an instructor-patched client." + ) + def generate( self, prompt: str, diff --git a/maseval/core/simulator.py b/maseval/core/simulator.py index bd81580d..2555761a 100644 --- a/maseval/core/simulator.py +++ b/maseval/core/simulator.py @@ -3,6 +3,8 @@ import json import os from datetime import datetime +from pydantic import BaseModel, Field + from .model import ModelAdapter from .tracing import TraceableMixin from .exceptions import EnvironmentError, UserError @@ -10,21 +12,24 @@ from enum import Enum -def _extract_json_object(output: str) -> str: - """Extract a JSON object from LLM output that may contain surrounding noise. 
+class ToolSimulatorResponse(BaseModel): + """Expected output format for ToolLLMSimulator.""" - LLMs may wrap valid JSON in markdown fences, prepend reasoning/thinking - tokens, or append stop/EOS tokens. This function extracts the substring - from the first '{' to the last '}' so that json.loads can parse it. + text: str = Field(default="", description="Human-readable description of the tool's output") + details: Dict[str, Any] = Field(default_factory=dict, description="Structured tool output data") + + +class UserSimulatorResponse(BaseModel): + """Expected output format for UserLLMSimulator.""" + + text: str = Field(default="", description="The user's response text") - Falls back to the stripped output if no braces are found, letting - json.loads raise a clear error on truly invalid output. - """ - json_start = output.find("{") - json_end = output.rfind("}") - if json_start != -1 and json_end != -1 and json_end > json_start: - return output[json_start : json_end + 1] - return output.strip() + +class AgenticUserSimulatorResponse(BaseModel): + """Expected output format for AgenticUserLLMSimulator.""" + + text: str = Field(default="", description="The user's response text") + tool_calls: List[Dict[str, Any]] = Field(default_factory=list, description="List of tool calls") class SimulatorError(Exception): @@ -151,6 +156,9 @@ class LLMSimulator(ABC, TraceableMixin): # Override in subclasses to specify component name for error messages _component_name: Optional[str] = None + # Override in subclasses to specify the Pydantic response model + _response_model: Optional[type] = None + def __init__( self, model: ModelAdapter, @@ -164,7 +172,8 @@ def __init__( Args: model (ModelAdapter): The language model to use for generation. template (str, optional): A prompt template. - max_try (int, optional): Maximum number of model calls to attempt. Defaults to 3. + max_try (int, optional): Maximum number of retries for structured output + validation via instructor. Defaults to 3. 
generation_params (Dict[str, Any], optional): Default generation parameters for the model. This overwrites the ModelAdapter's defaults if provided. Both can be overridden at call time. Defaults to None. """ @@ -206,91 +215,63 @@ def _create_error( component=self._component_name, ) + def _parse_structured_response(self, response: Any) -> Any: + """Convert instructor-validated response to expected return format. + + Override in subclasses to convert the Pydantic model instance + to the format expected by callers. + """ + return response + def __call__(self, generation_params: Optional[Dict[str, Any]] = None, **kwargs) -> Any: """ - Generates a simulated output. + Generates a simulated output using instructor for structured output. """ prompt = self._fill_prompt_template(**kwargs) request_id = str(uuid.uuid4()) - attempts = 0 - parsed_result = None # merging of LLM default and call-time generation params done here, so subclasses # can just call super().__call__(generation_params=...) and have it handled generation_params = self.generation_params | (generation_params or {}) - # For each attempt, append a separate history entry with the request id. - while attempts < self.max_try and parsed_result is None: - attempts += 1 - raw_output = None - entry = { - "id": request_id, - "timestamp": datetime.now().isoformat(), - "input": kwargs, - "prompt": prompt, - "generation_params": generation_params, - "raw_output": None, - "parsed_output": None, - "status": None, - "error": None, - } - - try: - raw_output = self.model.generate(prompt, generation_params=generation_params) - entry["raw_output"] = raw_output - except Exception as e: - # record model call error attempt by updating the pre-created entry - entry["raw_output"] = None - entry["status"] = SimulatorCallStatus.ModelCallError.value - entry["error"] = str(e) - # rich.print( - # f"[yellow]Warning:[/yellow] Attempt {attempts} failed to call model: {e}" - # + (" Retrying..." 
if attempts < self.max_try else "") - # ) - self.logs.append(entry) - continue - - # try parsing the raw output - try: - parsed_result = self._parse_output(raw_output) - # update the existing entry with successful result - entry["parsed_output"] = parsed_result - entry["status"] = SimulatorCallStatus.Successful.value - - except (json.JSONDecodeError, AttributeError) as e: - # update the existing entry with parsing error info - entry["status"] = SimulatorCallStatus.ModelParsingError.value - entry["error"] = str(e) - # rich.print( - # f"[yellow]Warning:[/yellow] Attempt {attempts} failed to parse LLM output: {e}" - # + (" Retrying..." if attempts < self.max_try else "") - # ) - self.logs.append(entry) + entry = { + "id": request_id, + "timestamp": datetime.now().isoformat(), + "input": kwargs, + "prompt": prompt, + "generation_params": generation_params, + "raw_output": None, + "parsed_output": None, + "status": None, + "error": None, + } - if parsed_result is not None: + try: + chat_result = self.model.chat( + messages=[{"role": "user", "content": prompt}], + response_model=self._response_model, + max_retries=self.max_try, + generation_params=generation_params, + ) + parsed_result = self._parse_structured_response(chat_result.structured_response) + entry["raw_output"] = chat_result.content + entry["parsed_output"] = parsed_result + entry["status"] = SimulatorCallStatus.Successful.value + self.logs.append(entry) return parsed_result + except Exception as e: + entry["raw_output"] = None + entry["status"] = SimulatorCallStatus.ModelCallError.value + entry["error"] = str(e) + self.logs.append(entry) - # All attempts failed - raise exception with details - last_error = None - for log in reversed(self.logs): - if log.get("id") == request_id and log.get("error"): - last_error = log["error"] - break - - raise self._create_error( - message=f"{self.__class__.__name__} failed to parse model output after {self.max_try} attempts", - attempts=self.max_try, - 
last_error=last_error, - logs=[log for log in self.logs if log.get("id") == request_id], - ) - - def _call_model_and_parse(self, prompt: str) -> Any: - """ - Calls the model with the given prompt and attempts to parse the output. - """ - raw_output = self.model.generate(prompt) - return self._parse_output(raw_output) + raise self._create_error( + message=f"{self.__class__.__name__} failed: {e}", + attempts=self.max_try, + last_error=str(e), + logs=[log for log in self.logs if log.get("id") == request_id], + ) @abstractmethod def _fill_prompt_template(self, **kwargs) -> str: @@ -299,13 +280,6 @@ def _fill_prompt_template(self, **kwargs) -> str: """ pass - @abstractmethod - def _parse_output(self, output: str) -> Any: - """ - Parses the raw output from the model. - """ - pass - def gather_traces(self) -> dict[str, Any]: """Gather execution traces from this simulator. @@ -351,6 +325,8 @@ class ToolLLMSimulator(LLMSimulator): ENVIRONMENT_ERROR (not the agent's fault). """ + _response_model = ToolSimulatorResponse + def __init__( self, model: ModelAdapter, @@ -405,13 +381,8 @@ def _create_error( def __call__(self, generation_params: Optional[Dict[str, Any]] = None, **actual_inputs: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]: # type: ignore[override] return super().__call__(generation_params=generation_params, **actual_inputs) - def _parse_output(self, output: str) -> tuple[str, Dict[str, Any]]: # type: ignore[override] - # LLMs may wrap valid JSON in markdown fences, prepend reasoning/thinking - # tokens, or append stop tokens. Extract the outermost { ... } span to - # robustly parse the expected JSON object. 
- text_stripped = _extract_json_object(output) - output_data = json.loads(text_stripped) - return output_data.get("text", ""), output_data.get("details", {}) + def _parse_structured_response(self, response: ToolSimulatorResponse) -> Tuple[str, Dict[str, Any]]: # type: ignore[override] + return response.text, response.details def _fill_prompt_template(self, **kwargs) -> str: """ @@ -439,6 +410,7 @@ class UserLLMSimulator(LLMSimulator): """ _component_name = "user_simulator" + _response_model = UserSimulatorResponse def __init__( self, @@ -525,31 +497,8 @@ def __call__( # ty: ignore[invalid-method-override] """ return super().__call__(generation_params=generation_params, conversation_history=conversation_history) - def _parse_output(self, output: str) -> str: # type: ignore[override] - """ - Parses the raw JSON output from the model. - """ - # LLMs may wrap valid JSON in markdown fences, prepend reasoning/thinking - # tokens, or append stop tokens. Extract the outermost { ... } span. - text_stripped = _extract_json_object(output) - try: - output_data = json.loads(text_stripped) - except json.JSONDecodeError as e: - raise json.JSONDecodeError( - f"UserLLMSimulator failed to decode JSON from model output. Extracted: {text_stripped[:200]!r}", - e.doc, - e.pos, - ) from e - text = output_data.get("text", "") - - # Stop tokens may appear outside the JSON object (e.g. "} "). - # The upstream User._check_stop_token checks the parsed text string, - # so we need to preserve the stop token if it was in the raw output - # but ended up outside the extracted JSON. 
- if self.stop_token and self.stop_token in output and self.stop_token not in text: - text = text + " " + self.stop_token - - return text + def _parse_structured_response(self, response: UserSimulatorResponse) -> str: # type: ignore[override] + return response.text def _fill_prompt_template(self, **kwargs) -> str: """ @@ -588,6 +537,7 @@ class AgenticUserLLMSimulator(LLMSimulator): """A simulator that uses an LLM to act as an agentic user (capable of using tools).""" _component_name = "user_simulator" + _response_model = AgenticUserSimulatorResponse def __init__( self, @@ -653,29 +603,8 @@ def __call__( """ return super().__call__(generation_params=generation_params, conversation_history=conversation_history) - def _parse_output(self, output: str) -> Tuple[str, List[Dict[str, Any]]]: # type: ignore[override] - """Parse the raw JSON output from the model.""" - # LLMs may wrap valid JSON in markdown fences, prepend reasoning/thinking - # tokens, or append stop tokens. Extract the outermost { ... } span. - text_stripped = _extract_json_object(output) - try: - output_data = json.loads(text_stripped) - except json.JSONDecodeError as e: - raise json.JSONDecodeError( - f"AgenticUserLLMSimulator failed to decode JSON from model output. Extracted: {text_stripped[:200]!r}", - e.doc, - e.pos, - ) from e - - text = output_data.get("text", "") - tool_calls = output_data.get("tool_calls", []) - - # Preserve stop token if it appeared outside the JSON object - # (see UserLLMSimulator._parse_output for rationale). 
- if self.stop_token and self.stop_token in output and self.stop_token not in text: - text = text + " " + self.stop_token - - return text, tool_calls + def _parse_structured_response(self, response: AgenticUserSimulatorResponse) -> Tuple[str, List[Dict[str, Any]]]: # type: ignore[override] + return response.text, response.tool_calls def _fill_prompt_template(self, **kwargs) -> str: """Fill the prompt template with the message history, user profile, and tools.""" diff --git a/maseval/interface/inference/anthropic.py b/maseval/interface/inference/anthropic.py index dfd07579..4409ff3f 100644 --- a/maseval/interface/inference/anthropic.py +++ b/maseval/interface/inference/anthropic.py @@ -108,6 +108,9 @@ def __init__( self._default_generation_params = default_generation_params or {} self._max_tokens = max_tokens + # Instructor-patched client created lazily in _structured_chat + self._instructor_client = None + @property def model_id(self) -> str: return self._model_id @@ -370,6 +373,45 @@ def _parse_response(self, response: Any) -> ChatResponse: stop_reason=stop_reason, ) + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> "ChatResponse": + """Use instructor for structured output with validation and retries.""" + if self._instructor_client is None: + from instructor import from_anthropic + + self._instructor_client = from_anthropic(self._client) + + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + max_tokens = params.pop("max_tokens", self._max_tokens) + params["max_tokens"] = max_tokens + + result = self._instructor_client.chat.completions.create( + model=self._model_id, + response_model=response_model, + messages=messages, + 
max_retries=max_retries, + **params, + ) + + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this Anthropic model adapter. diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index 5bbf33ce..0b4b1a70 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -84,6 +84,9 @@ def __init__( self._model_id = model_id self._default_generation_params = default_generation_params or {} + # Instructor-patched client created lazily in _structured_chat + self._instructor_client = None + @property def model_id(self) -> str: return self._model_id @@ -316,6 +319,45 @@ def _parse_response(self, response: Any) -> ChatResponse: stop_reason=stop_reason, ) + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> "ChatResponse": + """Use instructor for structured output with validation and retries.""" + if self._instructor_client is None: + from instructor import from_genai + + self._instructor_client = from_genai(self._client) + + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + if self._seed is not None and "seed" not in params: + params["seed"] = self._seed + + result = self._instructor_client.chat.completions.create( + model=self._model_id, + response_model=response_model, + messages=messages, + max_retries=max_retries, + **params, + ) + + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) + def gather_config(self) -> 
Dict[str, Any]: """Gather configuration from this Google GenAI model adapter. diff --git a/maseval/interface/inference/litellm.py b/maseval/interface/inference/litellm.py index b12b618e..c47177b5 100644 --- a/maseval/interface/inference/litellm.py +++ b/maseval/interface/inference/litellm.py @@ -212,6 +212,58 @@ def _chat_impl( stop_reason=getattr(choice, "finish_reason", None), ) + def _get_instructor_client(self) -> Any: + """Lazily create instructor-patched LiteLLM client.""" + if not hasattr(self, "_instructor_client") or self._instructor_client is None: + try: + import litellm + except ImportError as e: + raise ImportError("LiteLLM is not installed. Install with: uv add maseval[litellm]") from e + from maseval.core.instructor import create_instructor_client + + self._instructor_client = create_instructor_client(litellm.completion, provider="litellm") + return self._instructor_client + + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> "ChatResponse": + """Use instructor for structured output with validation and retries.""" + client = self._get_instructor_client() + + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + if self._seed is not None and "seed" not in params: + params["seed"] = self._seed + if self._api_key: + params["api_key"] = self._api_key + if self._api_base: + params["api_base"] = self._api_base + + result = client.chat.completions.create( + model=self._model_id, + messages=messages, + response_model=response_model, + max_retries=max_retries, + **params, + ) + + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) + def gather_config(self) 
-> Dict[str, Any]: """Gather configuration from this LiteLLM model adapter. diff --git a/maseval/interface/inference/openai.py b/maseval/interface/inference/openai.py index ff0c4245..ae01df3d 100644 --- a/maseval/interface/inference/openai.py +++ b/maseval/interface/inference/openai.py @@ -91,6 +91,9 @@ def __init__( self._model_id = model_id self._default_generation_params = default_generation_params or {} + # Instructor-patched client created lazily in _structured_chat + self._instructor_client = None + @property def model_id(self) -> str: return self._model_id @@ -286,6 +289,45 @@ def _parse_dict_response(self, response: Dict[str, Any]) -> ChatResponse: stop_reason=choice.get("finish_reason"), ) + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> "ChatResponse": + """Use instructor for structured output with validation and retries.""" + if self._instructor_client is None: + from maseval.core.instructor import create_instructor_client + + self._instructor_client = create_instructor_client(self._client, provider="openai") + + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + if self._seed is not None and "seed" not in params: + params["seed"] = self._seed + + result = self._instructor_client.chat.completions.create( + model=self._model_id, + messages=messages, + response_model=response_model, + max_retries=max_retries, + **params, + ) + + return ChatResponse( + content=result.model_dump_json(), + structured_response=result, + role="assistant", + model=self._model_id, + ) + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this OpenAI model adapter. 
diff --git a/pyproject.toml b/pyproject.toml index 45805087..075484d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "tqdm>=4.66.0", "rich>=14.1.0", "pydantic>=2.10.6", + "instructor>=1.14.5", ] # Enable optional dependencies for end users diff --git a/tests/conftest.py b/tests/conftest.py index 00407458..e21f3e39 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -116,6 +116,41 @@ def _chat_impl( stop_reason=self._stop_reason, ) + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Any] = None, + **kwargs: Any, + ) -> ChatResponse: + """Mock structured chat that parses response content via response_model. + + Simulates instructor behavior: retries on validation failure by calling + _chat_impl again up to max_retries times. Raises on exhaustion. + """ + last_error = None + for _ in range(max_retries): + result = self._chat_impl(messages, generation_params, tools, tool_choice, **kwargs) + if result.content and response_model is not None: + try: + structured = response_model.model_validate_json(result.content) + return ChatResponse( + content=result.content, + tool_calls=result.tool_calls, + role=result.role, + model=result.model, + usage=result.usage, + stop_reason=result.stop_reason, + structured_response=structured, + ) + except Exception as e: + last_error = e + continue + raise ValueError(f"Failed to validate response after {max_retries} retries: {last_error}") + class CountingFakeModelAdapter(ModelAdapter): """Model adapter that returns different responses on successive calls. 
diff --git a/tests/test_core/test_agentic_user.py b/tests/test_core/test_agentic_user.py index 8e8aa0ad..4de8b933 100644 --- a/tests/test_core/test_agentic_user.py +++ b/tests/test_core/test_agentic_user.py @@ -30,12 +30,37 @@ def _chat_impl( response_text = self.responses[self.call_count % len(self.responses)] self.call_count += 1 - # Create ChatResponse with the text content chat_response = ChatResponse() chat_response.content = response_text chat_response.tool_calls = None return chat_response + def _structured_chat( + self, + messages: List[Dict[str, Any]], + response_model: type, + max_retries: int = 3, + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Mock structured chat with retry.""" + last_error = None + for _ in range(max_retries): + result = self._chat_impl(messages, generation_params, tools, tool_choice, **kwargs) + if result.content and response_model is not None: + try: + structured = response_model.model_validate_json(result.content) + return ChatResponse( + content=result.content, + structured_response=structured, + ) + except Exception as e: + last_error = e + continue + raise ValueError(f"Failed to validate response after {max_retries} retries: {last_error}") + class DummyAgenticLLMUser(AgenticLLMUser): def get_tool(self): diff --git a/tests/test_core/test_llm_simulator.py b/tests/test_core/test_llm_simulator.py index 5ab8f9bf..160522c4 100644 --- a/tests/test_core/test_llm_simulator.py +++ b/tests/test_core/test_llm_simulator.py @@ -1,31 +1,55 @@ """Test LLM Simulator functionality. -These tests verify that LLMSimulator retry logic and tracing work correctly. +These tests verify that LLMSimulator structured output and tracing work correctly. 
+Simulators use instructor for structured output via response_model, so retries +happen inside the model adapter's _structured_chat method, not in the simulator loop. """ import pytest from maseval.core.simulator import ( ToolLLMSimulator, + UserLLMSimulator, + AgenticUserLLMSimulator, SimulatorCallStatus, ToolSimulatorError, + UserSimulatorError, + ToolSimulatorResponse, + UserSimulatorResponse, + AgenticUserSimulatorResponse, ) +@pytest.mark.core +class TestSimulatorResponseModels: + """Test that simulator response Pydantic models work correctly.""" + + def test_tool_simulator_response_model(self): + resp = ToolSimulatorResponse(text="success", details={"key": "value"}) + assert resp.text == "success" + assert resp.details == {"key": "value"} + + def test_user_simulator_response_model(self): + resp = UserSimulatorResponse(text="I need help") + assert resp.text == "I need help" + + def test_agentic_user_simulator_response_model(self): + resp = AgenticUserSimulatorResponse( + text="Let me check", + tool_calls=[{"name": "check_status", "arguments": {}}], + ) + assert resp.text == "Let me check" + assert len(resp.tool_calls) == 1 + + @pytest.mark.core class TestLLMSimulator: - """Tests for LLMSimulator retry and tracing.""" + """Tests for LLMSimulator structured output and tracing.""" - def test_llm_simulator_retry_logic(self, dummy_model): - """Test that simulator retries on parsing errors.""" - # Model returns invalid JSON first, then valid + def test_llm_simulator_success(self, dummy_model): + """Test that simulator succeeds with valid JSON.""" from conftest import DummyModelAdapter - model = DummyModelAdapter( - responses=[ - "invalid json", - '{"text": "Tool executed successfully", "details": {"result": "success"}}', - ] - ) + model = DummyModelAdapter(responses=['{"text": "Tool executed successfully", "details": {"result": "success"}}']) simulator = ToolLLMSimulator( model=model, @@ -37,22 +61,24 @@ def test_llm_simulator_retry_logic(self, dummy_model): result 
= simulator(actual_inputs={"param": "test"}) - # Should eventually succeed - ToolLLMSimulator returns (text, details) tuple assert result is not None assert isinstance(result, tuple) text, details = result - assert isinstance(details, dict) + assert text == "Tool executed successfully" assert details.get("result") == "success" + assert len(simulator.logs) == 1 + assert simulator.logs[0]["status"] == SimulatorCallStatus.Successful.value - # Should have 2 attempts captured in logs (1 fail, 1 success) - assert len(simulator.logs) == 2 - - def test_llm_simulator_parsing_error_retry(self, dummy_model): - """Test that parsing errors trigger retries and raise SimulatorError on exhaustion.""" + def test_llm_simulator_retry_logic(self, dummy_model): + """Test that instructor retries on invalid JSON and eventually succeeds.""" from conftest import DummyModelAdapter - # All responses are invalid JSON - model = DummyModelAdapter(responses=["bad", "bad", "bad"]) + model = DummyModelAdapter( + responses=[ + "invalid json", + '{"text": "Tool executed successfully", "details": {"result": "success"}}', + ] + ) simulator = ToolLLMSimulator( model=model, @@ -62,45 +88,43 @@ def test_llm_simulator_parsing_error_retry(self, dummy_model): max_try=3, ) - # Should raise ToolSimulatorError after max_try attempts - with pytest.raises(ToolSimulatorError) as exc_info: - simulator(actual_inputs={"param": "test"}) + result = simulator(actual_inputs={"param": "test"}) + + # Should succeed after instructor retries internally + assert result is not None + assert isinstance(result, tuple) + text, details = result + assert details.get("result") == "success" - # Verify exception details - err = exc_info.value - assert err.attempts == 3 - assert err.last_error is not None - assert len(err.logs) == 3 # All 3 attempts in exception logs - assert len(simulator.logs) == 3 # All 3 attempts logged in simulator + # Simulator sees 1 successful call (retries are internal to _structured_chat) + assert 
len(simulator.logs) == 1 - def test_llm_simulator_max_attempts_respected(self, dummy_model): - """Test that max_try limit is respected.""" + def test_llm_simulator_parsing_error_raises(self, dummy_model): + """Test that all-invalid JSON raises SimulatorError after retries.""" from conftest import DummyModelAdapter - model = DummyModelAdapter(responses=["invalid"] * 10) + model = DummyModelAdapter(responses=["bad", "bad", "bad"]) simulator = ToolLLMSimulator( model=model, tool_name="test_tool", tool_description="A test tool", tool_inputs={"param": {"type": "string"}}, - max_try=2, # Only allow 2 attempts + max_try=3, ) - # Should raise after 2 attempts - with pytest.raises(ToolSimulatorError) as exc_info: + with pytest.raises(ToolSimulatorError): simulator(actual_inputs={"param": "test"}) - # Should stop after 2 attempts, not continue to 10 - err = exc_info.value - assert len(simulator.logs) == 2 - assert err.attempts == 2 + # Simulator logs 1 entry (the failed call) + assert len(simulator.logs) == 1 + assert simulator.logs[0]["status"] == SimulatorCallStatus.ModelCallError.value def test_llm_simulator_history_structure(self, dummy_model): """Test that history entries have correct structure.""" from conftest import DummyModelAdapter - model = DummyModelAdapter(responses=['{"result": "success"}']) + model = DummyModelAdapter(responses=['{"text": "ok", "details": {}}']) simulator = ToolLLMSimulator( model=model, @@ -112,7 +136,6 @@ def test_llm_simulator_history_structure(self, dummy_model): _ = simulator(actual_inputs={"param": "test"}) - # Check log entry structure entry = simulator.logs[0] assert "id" in entry assert "timestamp" in entry @@ -126,7 +149,7 @@ def test_llm_simulator_status_tracking(self, dummy_model): """Test that status is correctly tracked.""" from conftest import DummyModelAdapter - model = DummyModelAdapter(responses=['{"result": "success"}']) + model = DummyModelAdapter(responses=['{"text": "ok", "details": {}}']) simulator = ToolLLMSimulator( 
model=model, @@ -145,7 +168,7 @@ def test_llm_simulator_gather_traces(self, dummy_model): """Test that gather_traces includes complete history.""" from conftest import DummyModelAdapter - model = DummyModelAdapter(responses=['{"result": "success"}']) + model = DummyModelAdapter(responses=['{"text": "ok", "details": {}}']) simulator = ToolLLMSimulator( model=model, @@ -173,8 +196,6 @@ class TestUserLLMSimulatorValidation: def test_stop_token_without_condition_raises(self, dummy_model): """ValueError raised when stop_token set but early_stopping_condition is None.""" - from maseval.core.simulator import UserLLMSimulator - with pytest.raises(ValueError, match="must both be set or both be None"): UserLLMSimulator( model=dummy_model, @@ -185,8 +206,6 @@ def test_stop_token_without_condition_raises(self, dummy_model): def test_condition_without_stop_token_raises(self, dummy_model): """ValueError raised when early_stopping_condition set but stop_token is None.""" - from maseval.core.simulator import UserLLMSimulator - with pytest.raises(ValueError, match="must both be set or both be None"): UserLLMSimulator( model=dummy_model, @@ -196,9 +215,6 @@ def test_condition_without_stop_token_raises(self, dummy_model): ) def test_both_none_is_valid(self, dummy_model): - """No error when both stop_token and early_stopping_condition are None.""" - from maseval.core.simulator import UserLLMSimulator - simulator = UserLLMSimulator( model=dummy_model, user_profile={"name": "test"}, @@ -208,9 +224,6 @@ def test_both_none_is_valid(self, dummy_model): assert simulator.early_stopping_condition is None def test_both_set_is_valid(self, dummy_model): - """No error when both stop_token and early_stopping_condition are set.""" - from maseval.core.simulator import UserLLMSimulator - simulator = UserLLMSimulator( model=dummy_model, user_profile={"name": "test"}, @@ -222,21 +235,13 @@ def test_both_set_is_valid(self, dummy_model): assert simulator.early_stopping_condition == "all goals 
accomplished" -# ============================================================================= -# UserLLMSimulator Response Tests -# ============================================================================= - - @pytest.mark.core class TestUserLLMSimulatorResponse: """Tests for UserLLMSimulator response generation.""" def test_user_simulator_generates_response(self, dummy_model): - """UserLLMSimulator generates a response from conversation history.""" from conftest import DummyModelAdapter - from maseval.core.simulator import UserLLMSimulator - # UserLLMSimulator expects JSON output with "text" field model = DummyModelAdapter(responses=['{"text": "I need help with my order."}']) simulator = UserLLMSimulator( @@ -252,9 +257,7 @@ def test_user_simulator_generates_response(self, dummy_model): assert result == "I need help with my order." def test_user_simulator_fills_template(self, dummy_model): - """UserLLMSimulator fills prompt template with profile and scenario.""" from conftest import DummyModelAdapter - from maseval.core.simulator import UserLLMSimulator model = DummyModelAdapter(responses=['{"text": "Test response"}']) @@ -264,18 +267,14 @@ def test_user_simulator_fills_template(self, dummy_model): scenario="Account inquiry scenario", ) - # Call to trigger prompt filling simulator(conversation_history=[{"role": "agent", "content": "Hello"}]) - # Check that the prompt was filled (via logs) assert len(simulator.logs) > 0 prompt = simulator.logs[0].get("prompt", "") assert "Jane" in prompt or "12345" in prompt or "Account inquiry" in prompt def test_user_simulator_with_early_stopping(self, dummy_model): - """UserLLMSimulator includes early stopping instructions when configured.""" from conftest import DummyModelAdapter - from maseval.core.simulator import UserLLMSimulator model = DummyModelAdapter(responses=['{"text": "Thanks, goodbye! 
"}']) @@ -290,7 +289,6 @@ def test_user_simulator_with_early_stopping(self, dummy_model): result = simulator(conversation_history=[{"role": "agent", "content": "Your issue is fixed."}]) assert result is not None - # The prompt should include early stopping instructions prompt = simulator.logs[0].get("prompt", "") assert "" in prompt or "issue is resolved" in prompt @@ -305,9 +303,6 @@ class TestAgenticUserLLMSimulatorValidation: """Tests for AgenticUserLLMSimulator initialization and validation.""" def test_agentic_user_simulator_initialization(self, dummy_model): - """AgenticUserLLMSimulator initializes with required parameters.""" - from maseval.core.simulator import AgenticUserLLMSimulator - simulator = AgenticUserLLMSimulator( model=dummy_model, user_profile={"name": "test", "phone": "555-1234"}, @@ -319,9 +314,6 @@ def test_agentic_user_simulator_initialization(self, dummy_model): assert simulator.tools == [] def test_agentic_user_simulator_with_tools(self, dummy_model): - """AgenticUserLLMSimulator initializes with tools.""" - from maseval.core.simulator import AgenticUserLLMSimulator - tools = [ {"name": "check_balance", "description": "Check account balance", "inputs": {}}, {"name": "make_payment", "description": "Make a payment", "inputs": {"amount": {"type": "number"}}}, @@ -338,10 +330,6 @@ def test_agentic_user_simulator_with_tools(self, dummy_model): assert simulator.tools[0]["name"] == "check_balance" def test_agentic_user_stop_token_validation(self, dummy_model): - """AgenticUserLLMSimulator validates stop_token and early_stopping_condition.""" - from maseval.core.simulator import AgenticUserLLMSimulator - - # stop_token without condition should raise with pytest.raises(ValueError, match="must both be set or both be None"): AgenticUserLLMSimulator( model=dummy_model, @@ -350,7 +338,6 @@ def test_agentic_user_stop_token_validation(self, dummy_model): stop_token="", ) - # condition without stop_token should raise with pytest.raises(ValueError, 
match="must both be set or both be None"): AgenticUserLLMSimulator( model=dummy_model, @@ -360,9 +347,6 @@ def test_agentic_user_stop_token_validation(self, dummy_model): ) def test_agentic_user_both_early_stopping_params_valid(self, dummy_model): - """AgenticUserLLMSimulator accepts both early stopping params together.""" - from maseval.core.simulator import AgenticUserLLMSimulator - simulator = AgenticUserLLMSimulator( model=dummy_model, user_profile={"name": "test"}, @@ -380,18 +364,15 @@ class TestAgenticUserLLMSimulatorResponse: """Tests for AgenticUserLLMSimulator response generation.""" def test_agentic_user_generates_text_response(self, dummy_model): - """AgenticUserLLMSimulator generates text response from JSON output.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = DummyModelAdapter(responses=['{"text": "I need to check my balance.", "tool_calls": []}']) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "John"}, scenario="Account inquiry", ) - simulator.model = model # Override with our test model result = simulator(conversation_history=[{"role": "agent", "content": "How can I help?"}]) @@ -401,21 +382,18 @@ def test_agentic_user_generates_text_response(self, dummy_model): assert tool_calls == [] def test_agentic_user_generates_tool_calls(self, dummy_model): - """AgenticUserLLMSimulator generates tool calls from JSON output.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = DummyModelAdapter(responses=['{"text": "Let me check.", "tool_calls": [{"name": "check_signal", "arguments": {}}]}']) tools = [{"name": "check_signal", "description": "Check phone signal"}] simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Jane"}, scenario="Phone issue", tools=tools, ) - simulator.model = model result = simulator(conversation_history=[{"role": "agent", 
"content": "What's the problem?"}]) @@ -424,64 +402,36 @@ def test_agentic_user_generates_tool_calls(self, dummy_model): assert len(tool_calls) == 1 assert tool_calls[0]["name"] == "check_signal" - def test_agentic_user_parses_markdown_json(self, dummy_model): - """AgenticUserLLMSimulator parses JSON wrapped in markdown code blocks.""" - from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator - - # Response wrapped in markdown code block - model = DummyModelAdapter(responses=['```json\n{"text": "Parsed correctly", "tool_calls": []}\n```']) - - simulator = AgenticUserLLMSimulator( - model=dummy_model, - user_profile={"name": "Test"}, - scenario="Test", - ) - simulator.model = model - - result = simulator(conversation_history=[]) - - text, tool_calls = result - assert text == "Parsed correctly" - def test_agentic_user_invalid_json_raises(self, dummy_model): - """AgenticUserLLMSimulator raises on invalid JSON after retries.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator, UserSimulatorError model = DummyModelAdapter(responses=["not valid json", "still not valid", "nope"]) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Test", max_try=3, ) - simulator.model = model - with pytest.raises(UserSimulatorError) as exc_info: + with pytest.raises(UserSimulatorError): simulator(conversation_history=[]) - assert exc_info.value.attempts == 3 - @pytest.mark.core class TestAgenticUserLLMSimulatorPrompt: """Tests for AgenticUserLLMSimulator prompt template filling.""" def test_prompt_includes_user_profile(self, dummy_model): - """Prompt template includes user profile information.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = DummyModelAdapter(responses=['{"text": "test", "tool_calls": []}']) simulator = AgenticUserLLMSimulator( - model=dummy_model, + 
model=model, user_profile={"name": "Alice", "customer_id": "C12345"}, scenario="Customer support call", ) - simulator.model = model simulator(conversation_history=[{"role": "agent", "content": "Hello"}]) @@ -489,18 +439,15 @@ def test_prompt_includes_user_profile(self, dummy_model): assert "Alice" in prompt or "C12345" in prompt def test_prompt_includes_scenario(self, dummy_model): - """Prompt template includes scenario description.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = DummyModelAdapter(responses=['{"text": "test", "tool_calls": []}']) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Billing dispute about overcharges", ) - simulator.model = model simulator(conversation_history=[]) @@ -508,9 +455,7 @@ def test_prompt_includes_scenario(self, dummy_model): assert "Billing dispute" in prompt or "overcharges" in prompt def test_prompt_includes_tool_instructions(self, dummy_model): - """Prompt template includes tool instructions when tools are provided.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = DummyModelAdapter(responses=['{"text": "test", "tool_calls": []}']) @@ -519,12 +464,11 @@ def test_prompt_includes_tool_instructions(self, dummy_model): ] simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Test", tools=tools, ) - simulator.model = model simulator(conversation_history=[]) @@ -532,20 +476,17 @@ def test_prompt_includes_tool_instructions(self, dummy_model): assert "toggle_wifi" in prompt or "Toggle WiFi" in prompt def test_prompt_includes_early_stopping_instructions(self, dummy_model): - """Prompt template includes early stopping instructions when configured.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = 
DummyModelAdapter(responses=['{"text": "test", "tool_calls": []}']) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Test", stop_token="", early_stopping_condition="problem is solved", ) - simulator.model = model simulator(conversation_history=[]) @@ -553,18 +494,15 @@ def test_prompt_includes_early_stopping_instructions(self, dummy_model): assert "" in prompt or "problem is solved" in prompt def test_prompt_includes_conversation_history(self, dummy_model): - """Prompt template includes formatted conversation history.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = DummyModelAdapter(responses=['{"text": "test", "tool_calls": []}']) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Test", ) - simulator.model = model history = [ {"role": "agent", "content": "Welcome to support."}, @@ -583,18 +521,15 @@ class TestAgenticUserLLMSimulatorTracing: """Tests for AgenticUserLLMSimulator tracing and logging.""" def test_logs_successful_calls(self, dummy_model): - """AgenticUserLLMSimulator logs successful calls.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator, SimulatorCallStatus model = DummyModelAdapter(responses=['{"text": "Success", "tool_calls": []}']) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Test", ) - simulator.model = model simulator(conversation_history=[]) @@ -602,40 +537,35 @@ def test_logs_successful_calls(self, dummy_model): assert simulator.logs[0]["status"] == SimulatorCallStatus.Successful.value def test_logs_failed_calls(self, dummy_model): - """AgenticUserLLMSimulator logs failed parsing attempts.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator, SimulatorCallStatus, UserSimulatorError 
model = DummyModelAdapter(responses=["bad json", "still bad"]) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Test", max_try=2, ) - simulator.model = model with pytest.raises(UserSimulatorError): simulator(conversation_history=[]) - assert len(simulator.logs) == 2 - for log in simulator.logs: - assert log["status"] == SimulatorCallStatus.ModelParsingError.value + # With instructor-based retries, the simulator logs 1 entry + # (retries happen inside _structured_chat) + assert len(simulator.logs) == 1 + assert simulator.logs[0]["status"] == SimulatorCallStatus.ModelCallError.value def test_gather_traces_returns_complete_info(self, dummy_model): - """gather_traces returns complete tracing information.""" from conftest import DummyModelAdapter - from maseval.core.simulator import AgenticUserLLMSimulator model = DummyModelAdapter(responses=['{"text": "test", "tool_calls": []}']) simulator = AgenticUserLLMSimulator( - model=dummy_model, + model=model, user_profile={"name": "Test"}, scenario="Test", ) - simulator.model = model simulator(conversation_history=[]) diff --git a/uv.lock b/uv.lock index 37478089..c1e20868 100644 --- a/uv.lock +++ b/uv.lock @@ -517,12 +517,13 @@ wheels = [ [[package]] name = "camel-ai" -version = "0.2.79" +version = "0.2.90" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "astor" }, { name = "colorama" }, { name = "docstring-parser" }, + { name = "google-search-results" }, { name = "httpx" }, { name = "jsonschema" }, { name = "mcp" }, @@ -530,12 +531,13 @@ dependencies = [ { name = "pillow" }, { name = "psutil" }, { name = "pydantic" }, + { name = "pyyaml" }, { name = "tiktoken" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/39/c5/85be0a6ee5103264109cf30356f6218904cf8c8024fe52b1e21c3a32f029/camel_ai-0.2.79.tar.gz", hash = "sha256:29ef94cbe33893eedfd854e0f5adefd3da1b427f5479e46b16ce614c6eef19d2", size = 1003596, 
upload-time = "2025-11-13T17:14:39.436Z" } +sdist = { url = "https://files.pythonhosted.org/packages/26/b3/da958646c69b42cfafd3fc081b77bb19bb625773766da6ffa2d77c9c66e0/camel_ai-0.2.90.tar.gz", hash = "sha256:43f11673390cc8d4451d6b1bb2913ddef6131f411f47da02d16a3989c8096d02", size = 1212904, upload-time = "2026-03-22T08:37:04.611Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ad/f0/71588517218d7d288246e144ef9f9aaef15c05d7cbc1257b07084d2824b3/camel_ai-0.2.79-py3-none-any.whl", hash = "sha256:287c325dcc11006f90b48d6846876299d3558d2b5bd459e9e562935ba536c864", size = 1465014, upload-time = "2025-11-13T17:14:37.298Z" }, + { url = "https://files.pythonhosted.org/packages/eb/e2/7005080797edcc760dcf7695ece29f97e18189ea2c56cd7514592801f0c9/camel_ai-0.2.90-py3-none-any.whl", hash = "sha256:9998c434779a1a847d9ccddce1c069f22fb9667b19ba06a2452c479882169082", size = 1700705, upload-time = "2026-03-22T08:37:02.259Z" }, ] [[package]] @@ -1224,6 +1226,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/69/1bcf70f81de1b4a9f21b3a62ec0c83bdff991c88d6cc2267d02408457e88/dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53", size = 25197, upload-time = "2022-11-28T23:32:31.219Z" }, ] +[[package]] +name = "diskcache" +version = "5.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916, upload-time = "2023-08-31T06:12:00.316Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, +] + [[package]] name = 
"distlib" version = "0.4.0" @@ -1759,6 +1770,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/93/94bc7a89ef4e7ed3666add55cd859d1483a22737251df659bf1aa46e9405/google_genai-1.56.0-py3-none-any.whl", hash = "sha256:9e6b11e0c105ead229368cb5849a480e4d0185519f8d9f538d61ecfcf193b052", size = 426563, upload-time = "2025-12-17T12:35:03.717Z" }, ] +[[package]] +name = "google-search-results" +version = "2.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/30/b3a6f6a2e00f8153549c2fa345c58ae1ce8e5f3153c2fe0484d444c3abcb/google_search_results-2.4.2.tar.gz", hash = "sha256:603a30ecae2af8e600b22635757a6df275dad4b934f975e67878ccd640b78245", size = 18818, upload-time = "2023-03-10T11:13:09.953Z" } + [[package]] name = "googleapis-common-protos" version = "1.72.0" @@ -2063,6 +2083,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/97/9c/1646ca469bc2dc299ac393c8d31136c6c22a35ca1e373fa462ac01100d37/inputimeout-1.0.4-py3-none-any.whl", hash = "sha256:f4e23d27753cfc25268eefc8d52a3edc46280ad831d226617c51882423475a43", size = 4639, upload-time = "2018-03-02T14:28:06.903Z" }, ] +[[package]] +name = "instructor" +version = "1.14.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "diskcache" }, + { name = "docstring-parser" }, + { name = "jinja2" }, + { name = "jiter" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "requests" }, + { name = "rich" }, + { name = "tenacity" }, + { name = "typer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0b/ef/986d059424db204ed57b29d8c07fda35de2a2c72dee8ea7994bc90a6f767/instructor-1.14.5.tar.gz", hash = "sha256:fcb6432867f2fe4a5986e8bf389dcc64ed2ad4039a12a2dff85464e51c2f171a", size = 69950754, upload-time = "2026-01-29T14:18:50.454Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/45/04/e442e1356c97b03a6d30d2b462f7c0bdfbf207e75f6833815fd1225a75b4/instructor-1.14.5-py3-none-any.whl", hash = "sha256:2a5a31222b008c0989be1cc001e33a237f49506e80ac5833f6d36d7690bae7b1", size = 177445, upload-time = "2026-01-29T14:18:53.641Z" }, +] + [[package]] name = "ipykernel" version = "6.31.0" @@ -2231,99 +2274,99 @@ wheels = [ [[package]] name = "jiter" -version = "0.12.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/45/9d/e0660989c1370e25848bb4c52d061c71837239738ad937e83edca174c273/jiter-0.12.0.tar.gz", hash = "sha256:64dfcd7d5c168b38d3f9f8bba7fc639edb3418abcc74f22fdbe6b8938293f30b", size = 168294, upload-time = "2025-11-09T20:49:23.302Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/91/13cb9505f7be74a933f37da3af22e029f6ba64f5669416cb8b2774bc9682/jiter-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e7acbaba9703d5de82a2c98ae6a0f59ab9770ab5af5fa35e43a303aee962cf65", size = 316652, upload-time = "2025-11-09T20:46:41.021Z" }, - { url = "https://files.pythonhosted.org/packages/4e/76/4e9185e5d9bb4e482cf6dec6410d5f78dfeb374cfcecbbe9888d07c52daa/jiter-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:364f1a7294c91281260364222f535bc427f56d4de1d8ffd718162d21fbbd602e", size = 319829, upload-time = "2025-11-09T20:46:43.281Z" }, - { url = "https://files.pythonhosted.org/packages/86/af/727de50995d3a153138139f259baae2379d8cb0522c0c00419957bc478a6/jiter-0.12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85ee4d25805d4fb23f0a5167a962ef8e002dbfb29c0989378488e32cf2744b62", size = 350568, upload-time = "2025-11-09T20:46:45.075Z" }, - { url = "https://files.pythonhosted.org/packages/6a/c1/d6e9f4b7a3d5ac63bcbdfddeb50b2dcfbdc512c86cffc008584fdc350233/jiter-0.12.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:796f466b7942107eb889c08433b6e31b9a7ed31daceaecf8af1be26fb26c0ca8", 
size = 369052, upload-time = "2025-11-09T20:46:46.818Z" }, - { url = "https://files.pythonhosted.org/packages/eb/be/00824cd530f30ed73fa8a4f9f3890a705519e31ccb9e929f1e22062e7c76/jiter-0.12.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35506cb71f47dba416694e67af996bbdefb8e3608f1f78799c2e1f9058b01ceb", size = 481585, upload-time = "2025-11-09T20:46:48.319Z" }, - { url = "https://files.pythonhosted.org/packages/74/b6/2ad7990dff9504d4b5052eef64aa9574bd03d722dc7edced97aad0d47be7/jiter-0.12.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:726c764a90c9218ec9e4f99a33d6bf5ec169163f2ca0fc21b654e88c2abc0abc", size = 380541, upload-time = "2025-11-09T20:46:49.643Z" }, - { url = "https://files.pythonhosted.org/packages/b5/c7/f3c26ecbc1adbf1db0d6bba99192143d8fe8504729d9594542ecc4445784/jiter-0.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa47810c5565274810b726b0dc86d18dce5fd17b190ebdc3890851d7b2a0e74", size = 364423, upload-time = "2025-11-09T20:46:51.731Z" }, - { url = "https://files.pythonhosted.org/packages/18/51/eac547bf3a2d7f7e556927278e14c56a0604b8cddae75815d5739f65f81d/jiter-0.12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8ec0259d3f26c62aed4d73b198c53e316ae11f0f69c8fbe6682c6dcfa0fcce2", size = 389958, upload-time = "2025-11-09T20:46:53.432Z" }, - { url = "https://files.pythonhosted.org/packages/2c/1f/9ca592e67175f2db156cff035e0d817d6004e293ee0c1d73692d38fcb596/jiter-0.12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:79307d74ea83465b0152fa23e5e297149506435535282f979f18b9033c0bb025", size = 522084, upload-time = "2025-11-09T20:46:54.848Z" }, - { url = "https://files.pythonhosted.org/packages/83/ff/597d9cdc3028f28224f53e1a9d063628e28b7a5601433e3196edda578cdd/jiter-0.12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cf6e6dd18927121fec86739f1a8906944703941d000f0639f3eb6281cc601dca", size = 513054, upload-time = 
"2025-11-09T20:46:56.487Z" }, - { url = "https://files.pythonhosted.org/packages/24/6d/1970bce1351bd02e3afcc5f49e4f7ef3dabd7fb688f42be7e8091a5b809a/jiter-0.12.0-cp310-cp310-win32.whl", hash = "sha256:b6ae2aec8217327d872cbfb2c1694489057b9433afce447955763e6ab015b4c4", size = 206368, upload-time = "2025-11-09T20:46:58.638Z" }, - { url = "https://files.pythonhosted.org/packages/e3/6b/eb1eb505b2d86709b59ec06681a2b14a94d0941db091f044b9f0e16badc0/jiter-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:c7f49ce90a71e44f7e1aa9e7ec415b9686bbc6a5961e57eab511015e6759bc11", size = 204847, upload-time = "2025-11-09T20:47:00.295Z" }, - { url = "https://files.pythonhosted.org/packages/32/f9/eaca4633486b527ebe7e681c431f529b63fe2709e7c5242fc0f43f77ce63/jiter-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d8f8a7e317190b2c2d60eb2e8aa835270b008139562d70fe732e1c0020ec53c9", size = 316435, upload-time = "2025-11-09T20:47:02.087Z" }, - { url = "https://files.pythonhosted.org/packages/10/c1/40c9f7c22f5e6ff715f28113ebaba27ab85f9af2660ad6e1dd6425d14c19/jiter-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2218228a077e784c6c8f1a8e5d6b8cb1dea62ce25811c356364848554b2056cd", size = 320548, upload-time = "2025-11-09T20:47:03.409Z" }, - { url = "https://files.pythonhosted.org/packages/6b/1b/efbb68fe87e7711b00d2cfd1f26bb4bfc25a10539aefeaa7727329ffb9cb/jiter-0.12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9354ccaa2982bf2188fd5f57f79f800ef622ec67beb8329903abf6b10da7d423", size = 351915, upload-time = "2025-11-09T20:47:05.171Z" }, - { url = "https://files.pythonhosted.org/packages/15/2d/c06e659888c128ad1e838123d0638f0efad90cc30860cb5f74dd3f2fc0b3/jiter-0.12.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f2607185ea89b4af9a604d4c7ec40e45d3ad03ee66998b031134bc510232bb7", size = 368966, upload-time = "2025-11-09T20:47:06.508Z" }, - { url = 
"https://files.pythonhosted.org/packages/6b/20/058db4ae5fb07cf6a4ab2e9b9294416f606d8e467fb74c2184b2a1eeacba/jiter-0.12.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3a585a5e42d25f2e71db5f10b171f5e5ea641d3aa44f7df745aa965606111cc2", size = 482047, upload-time = "2025-11-09T20:47:08.382Z" }, - { url = "https://files.pythonhosted.org/packages/49/bb/dc2b1c122275e1de2eb12905015d61e8316b2f888bdaac34221c301495d6/jiter-0.12.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd9e21d34edff5a663c631f850edcb786719c960ce887a5661e9c828a53a95d9", size = 380835, upload-time = "2025-11-09T20:47:09.81Z" }, - { url = "https://files.pythonhosted.org/packages/23/7d/38f9cd337575349de16da575ee57ddb2d5a64d425c9367f5ef9e4612e32e/jiter-0.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a612534770470686cd5431478dc5a1b660eceb410abade6b1b74e320ca98de6", size = 364587, upload-time = "2025-11-09T20:47:11.529Z" }, - { url = "https://files.pythonhosted.org/packages/f0/a3/b13e8e61e70f0bb06085099c4e2462647f53cc2ca97614f7fedcaa2bb9f3/jiter-0.12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3985aea37d40a908f887b34d05111e0aae822943796ebf8338877fee2ab67725", size = 390492, upload-time = "2025-11-09T20:47:12.993Z" }, - { url = "https://files.pythonhosted.org/packages/07/71/e0d11422ed027e21422f7bc1883c61deba2d9752b720538430c1deadfbca/jiter-0.12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b1207af186495f48f72529f8d86671903c8c10127cac6381b11dddc4aaa52df6", size = 522046, upload-time = "2025-11-09T20:47:14.6Z" }, - { url = "https://files.pythonhosted.org/packages/9f/59/b968a9aa7102a8375dbbdfbd2aeebe563c7e5dddf0f47c9ef1588a97e224/jiter-0.12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef2fb241de583934c9915a33120ecc06d94aa3381a134570f59eed784e87001e", size = 513392, upload-time = "2025-11-09T20:47:16.011Z" }, - { url = 
"https://files.pythonhosted.org/packages/ca/e4/7df62002499080dbd61b505c5cb351aa09e9959d176cac2aa8da6f93b13b/jiter-0.12.0-cp311-cp311-win32.whl", hash = "sha256:453b6035672fecce8007465896a25b28a6b59cfe8fbc974b2563a92f5a92a67c", size = 206096, upload-time = "2025-11-09T20:47:17.344Z" }, - { url = "https://files.pythonhosted.org/packages/bb/60/1032b30ae0572196b0de0e87dce3b6c26a1eff71aad5fe43dee3082d32e0/jiter-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:ca264b9603973c2ad9435c71a8ec8b49f8f715ab5ba421c85a51cde9887e421f", size = 204899, upload-time = "2025-11-09T20:47:19.365Z" }, - { url = "https://files.pythonhosted.org/packages/49/d5/c145e526fccdb834063fb45c071df78b0cc426bbaf6de38b0781f45d956f/jiter-0.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:cb00ef392e7d684f2754598c02c409f376ddcef857aae796d559e6cacc2d78a5", size = 188070, upload-time = "2025-11-09T20:47:20.75Z" }, - { url = "https://files.pythonhosted.org/packages/92/c9/5b9f7b4983f1b542c64e84165075335e8a236fa9e2ea03a0c79780062be8/jiter-0.12.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:305e061fa82f4680607a775b2e8e0bcb071cd2205ac38e6ef48c8dd5ebe1cf37", size = 314449, upload-time = "2025-11-09T20:47:22.999Z" }, - { url = "https://files.pythonhosted.org/packages/98/6e/e8efa0e78de00db0aee82c0cf9e8b3f2027efd7f8a71f859d8f4be8e98ef/jiter-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c1860627048e302a528333c9307c818c547f214d8659b0705d2195e1a94b274", size = 319855, upload-time = "2025-11-09T20:47:24.779Z" }, - { url = "https://files.pythonhosted.org/packages/20/26/894cd88e60b5d58af53bec5c6759d1292bd0b37a8b5f60f07abf7a63ae5f/jiter-0.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df37577a4f8408f7e0ec3205d2a8f87672af8f17008358063a4d6425b6081ce3", size = 350171, upload-time = "2025-11-09T20:47:26.469Z" }, - { url = 
"https://files.pythonhosted.org/packages/f5/27/a7b818b9979ac31b3763d25f3653ec3a954044d5e9f5d87f2f247d679fd1/jiter-0.12.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:75fdd787356c1c13a4f40b43c2156276ef7a71eb487d98472476476d803fb2cf", size = 365590, upload-time = "2025-11-09T20:47:27.918Z" }, - { url = "https://files.pythonhosted.org/packages/ba/7e/e46195801a97673a83746170b17984aa8ac4a455746354516d02ca5541b4/jiter-0.12.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1eb5db8d9c65b112aacf14fcd0faae9913d07a8afea5ed06ccdd12b724e966a1", size = 479462, upload-time = "2025-11-09T20:47:29.654Z" }, - { url = "https://files.pythonhosted.org/packages/ca/75/f833bfb009ab4bd11b1c9406d333e3b4357709ed0570bb48c7c06d78c7dd/jiter-0.12.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:73c568cc27c473f82480abc15d1301adf333a7ea4f2e813d6a2c7d8b6ba8d0df", size = 378983, upload-time = "2025-11-09T20:47:31.026Z" }, - { url = "https://files.pythonhosted.org/packages/71/b3/7a69d77943cc837d30165643db753471aff5df39692d598da880a6e51c24/jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4321e8a3d868919bcb1abb1db550d41f2b5b326f72df29e53b2df8b006eb9403", size = 361328, upload-time = "2025-11-09T20:47:33.286Z" }, - { url = "https://files.pythonhosted.org/packages/b0/ac/a78f90caf48d65ba70d8c6efc6f23150bc39dc3389d65bbec2a95c7bc628/jiter-0.12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0a51bad79f8cc9cac2b4b705039f814049142e0050f30d91695a2d9a6611f126", size = 386740, upload-time = "2025-11-09T20:47:34.703Z" }, - { url = "https://files.pythonhosted.org/packages/39/b6/5d31c2cc8e1b6a6bcf3c5721e4ca0a3633d1ab4754b09bc7084f6c4f5327/jiter-0.12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2a67b678f6a5f1dd6c36d642d7db83e456bc8b104788262aaefc11a22339f5a9", size = 520875, upload-time = "2025-11-09T20:47:36.058Z" }, - { url = 
"https://files.pythonhosted.org/packages/30/b5/4df540fae4e9f68c54b8dab004bd8c943a752f0b00efd6e7d64aa3850339/jiter-0.12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efe1a211fe1fd14762adea941e3cfd6c611a136e28da6c39272dbb7a1bbe6a86", size = 511457, upload-time = "2025-11-09T20:47:37.932Z" }, - { url = "https://files.pythonhosted.org/packages/07/65/86b74010e450a1a77b2c1aabb91d4a91dd3cd5afce99f34d75fd1ac64b19/jiter-0.12.0-cp312-cp312-win32.whl", hash = "sha256:d779d97c834b4278276ec703dc3fc1735fca50af63eb7262f05bdb4e62203d44", size = 204546, upload-time = "2025-11-09T20:47:40.47Z" }, - { url = "https://files.pythonhosted.org/packages/1c/c7/6659f537f9562d963488e3e55573498a442503ced01f7e169e96a6110383/jiter-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:e8269062060212b373316fe69236096aaf4c49022d267c6736eebd66bbbc60bb", size = 205196, upload-time = "2025-11-09T20:47:41.794Z" }, - { url = "https://files.pythonhosted.org/packages/21/f4/935304f5169edadfec7f9c01eacbce4c90bb9a82035ac1de1f3bd2d40be6/jiter-0.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:06cb970936c65de926d648af0ed3d21857f026b1cf5525cb2947aa5e01e05789", size = 186100, upload-time = "2025-11-09T20:47:43.007Z" }, - { url = "https://files.pythonhosted.org/packages/3d/a6/97209693b177716e22576ee1161674d1d58029eb178e01866a0422b69224/jiter-0.12.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6cc49d5130a14b732e0612bc76ae8db3b49898732223ef8b7599aa8d9810683e", size = 313658, upload-time = "2025-11-09T20:47:44.424Z" }, - { url = "https://files.pythonhosted.org/packages/06/4d/125c5c1537c7d8ee73ad3d530a442d6c619714b95027143f1b61c0b4dfe0/jiter-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37f27a32ce36364d2fa4f7fdc507279db604d27d239ea2e044c8f148410defe1", size = 318605, upload-time = "2025-11-09T20:47:45.973Z" }, - { url = 
"https://files.pythonhosted.org/packages/99/bf/a840b89847885064c41a5f52de6e312e91fa84a520848ee56c97e4fa0205/jiter-0.12.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbc0944aa3d4b4773e348cda635252824a78f4ba44328e042ef1ff3f6080d1cf", size = 349803, upload-time = "2025-11-09T20:47:47.535Z" }, - { url = "https://files.pythonhosted.org/packages/8a/88/e63441c28e0db50e305ae23e19c1d8fae012d78ed55365da392c1f34b09c/jiter-0.12.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:da25c62d4ee1ffbacb97fac6dfe4dcd6759ebdc9015991e92a6eae5816287f44", size = 365120, upload-time = "2025-11-09T20:47:49.284Z" }, - { url = "https://files.pythonhosted.org/packages/0a/7c/49b02714af4343970eb8aca63396bc1c82fa01197dbb1e9b0d274b550d4e/jiter-0.12.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:048485c654b838140b007390b8182ba9774621103bd4d77c9c3f6f117474ba45", size = 479918, upload-time = "2025-11-09T20:47:50.807Z" }, - { url = "https://files.pythonhosted.org/packages/69/ba/0a809817fdd5a1db80490b9150645f3aae16afad166960bcd562be194f3b/jiter-0.12.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:635e737fbb7315bef0037c19b88b799143d2d7d3507e61a76751025226b3ac87", size = 379008, upload-time = "2025-11-09T20:47:52.211Z" }, - { url = "https://files.pythonhosted.org/packages/5f/c3/c9fc0232e736c8877d9e6d83d6eeb0ba4e90c6c073835cc2e8f73fdeef51/jiter-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e017c417b1ebda911bd13b1e40612704b1f5420e30695112efdbed8a4b389ed", size = 361785, upload-time = "2025-11-09T20:47:53.512Z" }, - { url = "https://files.pythonhosted.org/packages/96/61/61f69b7e442e97ca6cd53086ddc1cf59fb830549bc72c0a293713a60c525/jiter-0.12.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:89b0bfb8b2bf2351fba36bb211ef8bfceba73ef58e7f0c68fb67b5a2795ca2f9", size = 386108, upload-time = "2025-11-09T20:47:54.893Z" }, - { url = 
"https://files.pythonhosted.org/packages/e9/2e/76bb3332f28550c8f1eba3bf6e5efe211efda0ddbbaf24976bc7078d42a5/jiter-0.12.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f5aa5427a629a824a543672778c9ce0c5e556550d1569bb6ea28a85015287626", size = 519937, upload-time = "2025-11-09T20:47:56.253Z" }, - { url = "https://files.pythonhosted.org/packages/84/d6/fa96efa87dc8bff2094fb947f51f66368fa56d8d4fc9e77b25d7fbb23375/jiter-0.12.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed53b3d6acbcb0fd0b90f20c7cb3b24c357fe82a3518934d4edfa8c6898e498c", size = 510853, upload-time = "2025-11-09T20:47:58.32Z" }, - { url = "https://files.pythonhosted.org/packages/8a/28/93f67fdb4d5904a708119a6ab58a8f1ec226ff10a94a282e0215402a8462/jiter-0.12.0-cp313-cp313-win32.whl", hash = "sha256:4747de73d6b8c78f2e253a2787930f4fffc68da7fa319739f57437f95963c4de", size = 204699, upload-time = "2025-11-09T20:47:59.686Z" }, - { url = "https://files.pythonhosted.org/packages/c4/1f/30b0eb087045a0abe2a5c9c0c0c8da110875a1d3be83afd4a9a4e548be3c/jiter-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:e25012eb0c456fcc13354255d0338cd5397cce26c77b2832b3c4e2e255ea5d9a", size = 204258, upload-time = "2025-11-09T20:48:01.01Z" }, - { url = "https://files.pythonhosted.org/packages/2c/f4/2b4daf99b96bce6fc47971890b14b2a36aef88d7beb9f057fafa032c6141/jiter-0.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:c97b92c54fe6110138c872add030a1f99aea2401ddcdaa21edf74705a646dd60", size = 185503, upload-time = "2025-11-09T20:48:02.35Z" }, - { url = "https://files.pythonhosted.org/packages/39/ca/67bb15a7061d6fe20b9b2a2fd783e296a1e0f93468252c093481a2f00efa/jiter-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:53839b35a38f56b8be26a7851a48b89bc47e5d88e900929df10ed93b95fea3d6", size = 317965, upload-time = "2025-11-09T20:48:03.783Z" }, - { url = 
"https://files.pythonhosted.org/packages/18/af/1788031cd22e29c3b14bc6ca80b16a39a0b10e611367ffd480c06a259831/jiter-0.12.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94f669548e55c91ab47fef8bddd9c954dab1938644e715ea49d7e117015110a4", size = 345831, upload-time = "2025-11-09T20:48:05.55Z" }, - { url = "https://files.pythonhosted.org/packages/05/17/710bf8472d1dff0d3caf4ced6031060091c1320f84ee7d5dcbed1f352417/jiter-0.12.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:351d54f2b09a41600ffea43d081522d792e81dcfb915f6d2d242744c1cc48beb", size = 361272, upload-time = "2025-11-09T20:48:06.951Z" }, - { url = "https://files.pythonhosted.org/packages/fb/f1/1dcc4618b59761fef92d10bcbb0b038b5160be653b003651566a185f1a5c/jiter-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2a5e90604620f94bf62264e7c2c038704d38217b7465b863896c6d7c902b06c7", size = 204604, upload-time = "2025-11-09T20:48:08.328Z" }, - { url = "https://files.pythonhosted.org/packages/d9/32/63cb1d9f1c5c6632a783c0052cde9ef7ba82688f7065e2f0d5f10a7e3edb/jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3", size = 185628, upload-time = "2025-11-09T20:48:09.572Z" }, - { url = "https://files.pythonhosted.org/packages/a8/99/45c9f0dbe4a1416b2b9a8a6d1236459540f43d7fb8883cff769a8db0612d/jiter-0.12.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:c46d927acd09c67a9fb1416df45c5a04c27e83aae969267e98fba35b74e99525", size = 312478, upload-time = "2025-11-09T20:48:10.898Z" }, - { url = "https://files.pythonhosted.org/packages/4c/a7/54ae75613ba9e0f55fcb0bc5d1f807823b5167cc944e9333ff322e9f07dd/jiter-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:774ff60b27a84a85b27b88cd5583899c59940bcc126caca97eb2a9df6aa00c49", size = 318706, upload-time = "2025-11-09T20:48:12.266Z" }, - { url = 
"https://files.pythonhosted.org/packages/59/31/2aa241ad2c10774baf6c37f8b8e1f39c07db358f1329f4eb40eba179c2a2/jiter-0.12.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5433fab222fb072237df3f637d01b81f040a07dcac1cb4a5c75c7aa9ed0bef1", size = 351894, upload-time = "2025-11-09T20:48:13.673Z" }, - { url = "https://files.pythonhosted.org/packages/54/4f/0f2759522719133a9042781b18cc94e335b6d290f5e2d3e6899d6af933e3/jiter-0.12.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8c593c6e71c07866ec6bfb790e202a833eeec885022296aff6b9e0b92d6a70e", size = 365714, upload-time = "2025-11-09T20:48:15.083Z" }, - { url = "https://files.pythonhosted.org/packages/dc/6f/806b895f476582c62a2f52c453151edd8a0fde5411b0497baaa41018e878/jiter-0.12.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90d32894d4c6877a87ae00c6b915b609406819dce8bc0d4e962e4de2784e567e", size = 478989, upload-time = "2025-11-09T20:48:16.706Z" }, - { url = "https://files.pythonhosted.org/packages/86/6c/012d894dc6e1033acd8db2b8346add33e413ec1c7c002598915278a37f79/jiter-0.12.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:798e46eed9eb10c3adbbacbd3bdb5ecd4cf7064e453d00dbef08802dae6937ff", size = 378615, upload-time = "2025-11-09T20:48:18.614Z" }, - { url = "https://files.pythonhosted.org/packages/87/30/d718d599f6700163e28e2c71c0bbaf6dace692e7df2592fd793ac9276717/jiter-0.12.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3f1368f0a6719ea80013a4eb90ba72e75d7ea67cfc7846db2ca504f3df0169a", size = 364745, upload-time = "2025-11-09T20:48:20.117Z" }, - { url = "https://files.pythonhosted.org/packages/8f/85/315b45ce4b6ddc7d7fceca24068543b02bdc8782942f4ee49d652e2cc89f/jiter-0.12.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65f04a9d0b4406f7e51279710b27484af411896246200e461d80d3ba0caa901a", size = 386502, upload-time = "2025-11-09T20:48:21.543Z" }, - { url = 
"https://files.pythonhosted.org/packages/74/0b/ce0434fb40c5b24b368fe81b17074d2840748b4952256bab451b72290a49/jiter-0.12.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:fd990541982a24281d12b67a335e44f117e4c6cbad3c3b75c7dea68bf4ce3a67", size = 519845, upload-time = "2025-11-09T20:48:22.964Z" }, - { url = "https://files.pythonhosted.org/packages/e8/a3/7a7a4488ba052767846b9c916d208b3ed114e3eb670ee984e4c565b9cf0d/jiter-0.12.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:b111b0e9152fa7df870ecaebb0bd30240d9f7fff1f2003bcb4ed0f519941820b", size = 510701, upload-time = "2025-11-09T20:48:24.483Z" }, - { url = "https://files.pythonhosted.org/packages/c3/16/052ffbf9d0467b70af24e30f91e0579e13ded0c17bb4a8eb2aed3cb60131/jiter-0.12.0-cp314-cp314-win32.whl", hash = "sha256:a78befb9cc0a45b5a5a0d537b06f8544c2ebb60d19d02c41ff15da28a9e22d42", size = 205029, upload-time = "2025-11-09T20:48:25.749Z" }, - { url = "https://files.pythonhosted.org/packages/e4/18/3cf1f3f0ccc789f76b9a754bdb7a6977e5d1d671ee97a9e14f7eb728d80e/jiter-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:e1fe01c082f6aafbe5c8faf0ff074f38dfb911d53f07ec333ca03f8f6226debf", size = 204960, upload-time = "2025-11-09T20:48:27.415Z" }, - { url = "https://files.pythonhosted.org/packages/02/68/736821e52ecfdeeb0f024b8ab01b5a229f6b9293bbdb444c27efade50b0f/jiter-0.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:d72f3b5a432a4c546ea4bedc84cce0c3404874f1d1676260b9c7f048a9855451", size = 185529, upload-time = "2025-11-09T20:48:29.125Z" }, - { url = "https://files.pythonhosted.org/packages/30/61/12ed8ee7a643cce29ac97c2281f9ce3956eb76b037e88d290f4ed0d41480/jiter-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6ded41aeba3603f9728ed2b6196e4df875348ab97b28fc8afff115ed42ba7a7", size = 318974, upload-time = "2025-11-09T20:48:30.87Z" }, - { url = 
"https://files.pythonhosted.org/packages/2d/c6/f3041ede6d0ed5e0e79ff0de4c8f14f401bbf196f2ef3971cdbe5fd08d1d/jiter-0.12.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a947920902420a6ada6ad51892082521978e9dd44a802663b001436e4b771684", size = 345932, upload-time = "2025-11-09T20:48:32.658Z" }, - { url = "https://files.pythonhosted.org/packages/d5/5d/4d94835889edd01ad0e2dbfc05f7bdfaed46292e7b504a6ac7839aa00edb/jiter-0.12.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:add5e227e0554d3a52cf390a7635edaffdf4f8fce4fdbcef3cc2055bb396a30c", size = 367243, upload-time = "2025-11-09T20:48:34.093Z" }, - { url = "https://files.pythonhosted.org/packages/fd/76/0051b0ac2816253a99d27baf3dda198663aff882fa6ea7deeb94046da24e/jiter-0.12.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f9b1cda8fcb736250d7e8711d4580ebf004a46771432be0ae4796944b5dfa5d", size = 479315, upload-time = "2025-11-09T20:48:35.507Z" }, - { url = "https://files.pythonhosted.org/packages/70/ae/83f793acd68e5cb24e483f44f482a1a15601848b9b6f199dacb970098f77/jiter-0.12.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:deeb12a2223fe0135c7ff1356a143d57f95bbf1f4a66584f1fc74df21d86b993", size = 380714, upload-time = "2025-11-09T20:48:40.014Z" }, - { url = "https://files.pythonhosted.org/packages/b1/5e/4808a88338ad2c228b1126b93fcd8ba145e919e886fe910d578230dabe3b/jiter-0.12.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c596cc0f4cb574877550ce4ecd51f8037469146addd676d7c1a30ebe6391923f", size = 365168, upload-time = "2025-11-09T20:48:41.462Z" }, - { url = "https://files.pythonhosted.org/packages/0c/d4/04619a9e8095b42aef436b5aeb4c0282b4ff1b27d1db1508df9f5dc82750/jiter-0.12.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ab4c823b216a4aeab3fdbf579c5843165756bd9ad87cc6b1c65919c4715f783", size = 387893, upload-time = "2025-11-09T20:48:42.921Z" }, - { url = 
"https://files.pythonhosted.org/packages/17/ea/d3c7e62e4546fdc39197fa4a4315a563a89b95b6d54c0d25373842a59cbe/jiter-0.12.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e427eee51149edf962203ff8db75a7514ab89be5cb623fb9cea1f20b54f1107b", size = 520828, upload-time = "2025-11-09T20:48:44.278Z" }, - { url = "https://files.pythonhosted.org/packages/cc/0b/c6d3562a03fd767e31cb119d9041ea7958c3c80cb3d753eafb19b3b18349/jiter-0.12.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:edb868841f84c111255ba5e80339d386d937ec1fdce419518ce1bd9370fac5b6", size = 511009, upload-time = "2025-11-09T20:48:45.726Z" }, - { url = "https://files.pythonhosted.org/packages/aa/51/2cb4468b3448a8385ebcd15059d325c9ce67df4e2758d133ab9442b19834/jiter-0.12.0-cp314-cp314t-win32.whl", hash = "sha256:8bbcfe2791dfdb7c5e48baf646d37a6a3dcb5a97a032017741dea9f817dca183", size = 205110, upload-time = "2025-11-09T20:48:47.033Z" }, - { url = "https://files.pythonhosted.org/packages/b2/c5/ae5ec83dec9c2d1af805fd5fe8f74ebded9c8670c5210ec7820ce0dbeb1e/jiter-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2fa940963bf02e1d8226027ef461e36af472dea85d36054ff835aeed944dd873", size = 205223, upload-time = "2025-11-09T20:48:49.076Z" }, - { url = "https://files.pythonhosted.org/packages/97/9a/3c5391907277f0e55195550cf3fa8e293ae9ee0c00fb402fec1e38c0c82f/jiter-0.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:506c9708dd29b27288f9f8f1140c3cb0e3d8ddb045956d7757b1fa0e0f39a473", size = 185564, upload-time = "2025-11-09T20:48:50.376Z" }, - { url = "https://files.pythonhosted.org/packages/fe/54/5339ef1ecaa881c6948669956567a64d2670941925f245c434f494ffb0e5/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:4739a4657179ebf08f85914ce50332495811004cc1747852e8b2041ed2aab9b8", size = 311144, upload-time = "2025-11-09T20:49:10.503Z" }, - { url = 
"https://files.pythonhosted.org/packages/27/74/3446c652bffbd5e81ab354e388b1b5fc1d20daac34ee0ed11ff096b1b01a/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:41da8def934bf7bec16cb24bd33c0ca62126d2d45d81d17b864bd5ad721393c3", size = 305877, upload-time = "2025-11-09T20:49:12.269Z" }, - { url = "https://files.pythonhosted.org/packages/a1/f4/ed76ef9043450f57aac2d4fbeb27175aa0eb9c38f833be6ef6379b3b9a86/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c44ee814f499c082e69872d426b624987dbc5943ab06e9bbaa4f81989fdb79e", size = 340419, upload-time = "2025-11-09T20:49:13.803Z" }, - { url = "https://files.pythonhosted.org/packages/21/01/857d4608f5edb0664aa791a3d45702e1a5bcfff9934da74035e7b9803846/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd2097de91cf03eaa27b3cbdb969addf83f0179c6afc41bbc4513705e013c65d", size = 347212, upload-time = "2025-11-09T20:49:15.643Z" }, - { url = "https://files.pythonhosted.org/packages/cb/f5/12efb8ada5f5c9edc1d4555fe383c1fb2eac05ac5859258a72d61981d999/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:e8547883d7b96ef2e5fe22b88f8a4c8725a56e7f4abafff20fd5272d634c7ecb", size = 309974, upload-time = "2025-11-09T20:49:17.187Z" }, - { url = "https://files.pythonhosted.org/packages/85/15/d6eb3b770f6a0d332675141ab3962fd4a7c270ede3515d9f3583e1d28276/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:89163163c0934854a668ed783a2546a0617f71706a2551a4a0666d91ab365d6b", size = 304233, upload-time = "2025-11-09T20:49:18.734Z" }, - { url = "https://files.pythonhosted.org/packages/8c/3e/e7e06743294eea2cf02ced6aa0ff2ad237367394e37a0e2b4a1108c67a36/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d96b264ab7d34bbb2312dedc47ce07cd53f06835eacbc16dde3761f47c3a9e7f", size = 
338537, upload-time = "2025-11-09T20:49:20.317Z" }, - { url = "https://files.pythonhosted.org/packages/2f/9c/6753e6522b8d0ef07d3a3d239426669e984fb0eba15a315cdbc1253904e4/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c", size = 346110, upload-time = "2025-11-09T20:49:21.817Z" }, +version = "0.11.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a3/68/0357982493a7b20925aece061f7fb7a2678e3b232f8d73a6edb7e5304443/jiter-0.11.1.tar.gz", hash = "sha256:849dcfc76481c0ea0099391235b7ca97d7279e0fa4c86005457ac7c88e8b76dc", size = 168385, upload-time = "2025-10-17T11:31:15.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/10/d099def5716452c8d5ffa527405373a44ddaf8e3c9d4f6de1e1344cffd90/jiter-0.11.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ed58841a491bbbf3f7c55a6b68fff568439ab73b2cce27ace0e169057b5851df", size = 310078, upload-time = "2025-10-17T11:28:36.186Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/b81d010b0031ffa96dfb590628562ac5f513ce56aa2ab451d29fb3fedeb9/jiter-0.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:499beb9b2d7e51d61095a8de39ebcab1d1778f2a74085f8305a969f6cee9f3e4", size = 317138, upload-time = "2025-10-17T11:28:38.294Z" }, + { url = "https://files.pythonhosted.org/packages/89/12/31ea12af9d79671cc7bd893bf0ccaf3467624c0fc7146a0cbfe7b549bcfa/jiter-0.11.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b87b2821795e28cc990939b68ce7a038edea680a24910bd68a79d54ff3f03c02", size = 348964, upload-time = "2025-10-17T11:28:40.103Z" }, + { url = "https://files.pythonhosted.org/packages/bc/d2/95cb6dc5ff962410667a29708c7a6c0691cc3c4866a0bfa79d085b56ebd6/jiter-0.11.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:83f6fa494d8bba14ab100417c80e70d32d737e805cb85be2052d771c76fcd1f8", size = 363289, upload-time = "2025-10-17T11:28:41.49Z" }, + { url = "https://files.pythonhosted.org/packages/b8/3e/37006ad5843a0bc3a3ec3a6c44710d7a154113befaf5f26d2fe190668b63/jiter-0.11.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5fbc6aea1daa2ec6f5ed465f0c5e7b0607175062ceebbea5ca70dd5ddab58083", size = 487243, upload-time = "2025-10-17T11:28:43.209Z" }, + { url = "https://files.pythonhosted.org/packages/80/5c/d38c8c801a322a0c0de47b9618c16fd766366f087ce37c4e55ae8e3c8b03/jiter-0.11.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:302288e2edc43174bb2db838e94688d724f9aad26c5fb9a74f7a5fb427452a6a", size = 376139, upload-time = "2025-10-17T11:28:44.821Z" }, + { url = "https://files.pythonhosted.org/packages/b0/cd/442ad2389a5570b0ee673f93e14bbe8cdecd3e08a9ba7756081d84065e4c/jiter-0.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85db563fe3b367bb568af5d29dea4d4066d923b8e01f3417d25ebecd958de815", size = 359279, upload-time = "2025-10-17T11:28:46.152Z" }, + { url = "https://files.pythonhosted.org/packages/9a/35/8f5810d0e7d00bc395889085dbc1ccc36d454b56f28b2a5359dfd1bab48d/jiter-0.11.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f1c1ba2b6b22f775444ef53bc2d5778396d3520abc7b2e1da8eb0c27cb3ffb10", size = 384911, upload-time = "2025-10-17T11:28:48.03Z" }, + { url = "https://files.pythonhosted.org/packages/3c/bd/8c069ceb0bafcf6b4aa5de0c27f02faf50468df39564a02e1a12389ad6c2/jiter-0.11.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:523be464b14f8fd0cc78da6964b87b5515a056427a2579f9085ce30197a1b54a", size = 517879, upload-time = "2025-10-17T11:28:49.902Z" }, + { url = "https://files.pythonhosted.org/packages/bc/3c/9163efcf762f79f47433078b4f0a1bddc56096082c02c6cae2f47f07f56f/jiter-0.11.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:25b99b3f04cd2a38fefb22e822e35eb203a2cd37d680dbbc0c0ba966918af336", size = 508739, upload-time = "2025-10-17T11:28:51.785Z" }, + { url = "https://files.pythonhosted.org/packages/44/07/50690f257935845d3114b95b5dd03749eeaab5e395cbb522f9e957da4551/jiter-0.11.1-cp310-cp310-win32.whl", hash = "sha256:47a79e90545a596bb9104109777894033347b11180d4751a216afef14072dbe7", size = 203948, upload-time = "2025-10-17T11:28:54.368Z" }, + { url = "https://files.pythonhosted.org/packages/d2/3a/5964a944bf2e98ffd566153fdc2a6a368fcb11b58cc46832ca8c75808dba/jiter-0.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:cace75621ae9bd66878bf69fbd4dfc1a28ef8661e0c2d0eb72d3d6f1268eddf5", size = 207522, upload-time = "2025-10-17T11:28:56.79Z" }, + { url = "https://files.pythonhosted.org/packages/8b/34/c9e6cfe876f9a24f43ed53fe29f052ce02bd8d5f5a387dbf46ad3764bef0/jiter-0.11.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b0088ff3c374ce8ce0168523ec8e97122ebb788f950cf7bb8e39c7dc6a876a2", size = 310160, upload-time = "2025-10-17T11:28:59.174Z" }, + { url = "https://files.pythonhosted.org/packages/bc/9f/b06ec8181d7165858faf2ac5287c54fe52b2287760b7fe1ba9c06890255f/jiter-0.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:74433962dd3c3090655e02e461267095d6c84f0741c7827de11022ef8d7ff661", size = 316573, upload-time = "2025-10-17T11:29:00.905Z" }, + { url = "https://files.pythonhosted.org/packages/66/49/3179d93090f2ed0c6b091a9c210f266d2d020d82c96f753260af536371d0/jiter-0.11.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d98030e345e6546df2cc2c08309c502466c66c4747b043f1a0d415fada862b8", size = 348998, upload-time = "2025-10-17T11:29:02.321Z" }, + { url = "https://files.pythonhosted.org/packages/ae/9d/63db2c8eabda7a9cad65a2e808ca34aaa8689d98d498f5a2357d7a2e2cec/jiter-0.11.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d6db0b2e788db46bec2cf729a88b6dd36959af2abd9fa2312dfba5acdd96dcb", size = 363413, upload-time = 
"2025-10-17T11:29:03.787Z" }, + { url = "https://files.pythonhosted.org/packages/25/ff/3e6b3170c5053053c7baddb8d44e2bf11ff44cd71024a280a8438ae6ba32/jiter-0.11.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55678fbbda261eafe7289165dd2ddd0e922df5f9a1ae46d7c79a5a15242bd7d1", size = 487144, upload-time = "2025-10-17T11:29:05.37Z" }, + { url = "https://files.pythonhosted.org/packages/b0/50/b63fcadf699893269b997f4c2e88400bc68f085c6db698c6e5e69d63b2c1/jiter-0.11.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a6b74fae8e40497653b52ce6ca0f1b13457af769af6fb9c1113efc8b5b4d9be", size = 376215, upload-time = "2025-10-17T11:29:07.123Z" }, + { url = "https://files.pythonhosted.org/packages/39/8c/57a8a89401134167e87e73471b9cca321cf651c1fd78c45f3a0f16932213/jiter-0.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a55a453f8b035eb4f7852a79a065d616b7971a17f5e37a9296b4b38d3b619e4", size = 359163, upload-time = "2025-10-17T11:29:09.047Z" }, + { url = "https://files.pythonhosted.org/packages/4b/96/30b0cdbffbb6f753e25339d3dbbe26890c9ef119928314578201c758aace/jiter-0.11.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2638148099022e6bdb3f42904289cd2e403609356fb06eb36ddec2d50958bc29", size = 385344, upload-time = "2025-10-17T11:29:10.69Z" }, + { url = "https://files.pythonhosted.org/packages/c6/d5/31dae27c1cc9410ad52bb514f11bfa4f286f7d6ef9d287b98b8831e156ec/jiter-0.11.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:252490567a5d990986f83b95a5f1ca1bf205ebd27b3e9e93bb7c2592380e29b9", size = 517972, upload-time = "2025-10-17T11:29:12.174Z" }, + { url = "https://files.pythonhosted.org/packages/61/1e/5905a7a3aceab80de13ab226fd690471a5e1ee7e554dc1015e55f1a6b896/jiter-0.11.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d431d52b0ca2436eea6195f0f48528202100c7deda354cb7aac0a302167594d5", size = 508408, upload-time = "2025-10-17T11:29:13.597Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/12/1c49b97aa49077e136e8591cef7162f0d3e2860ae457a2d35868fd1521ef/jiter-0.11.1-cp311-cp311-win32.whl", hash = "sha256:db6f41e40f8bae20c86cb574b48c4fd9f28ee1c71cb044e9ec12e78ab757ba3a", size = 203937, upload-time = "2025-10-17T11:29:14.894Z" }, + { url = "https://files.pythonhosted.org/packages/6d/9d/2255f7c17134ee9892c7e013c32d5bcf4bce64eb115402c9fe5e727a67eb/jiter-0.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:0cc407b8e6cdff01b06bb80f61225c8b090c3df108ebade5e0c3c10993735b19", size = 207589, upload-time = "2025-10-17T11:29:16.166Z" }, + { url = "https://files.pythonhosted.org/packages/3c/28/6307fc8f95afef84cae6caf5429fee58ef16a582c2ff4db317ceb3e352fa/jiter-0.11.1-cp311-cp311-win_arm64.whl", hash = "sha256:fe04ea475392a91896d1936367854d346724a1045a247e5d1c196410473b8869", size = 188391, upload-time = "2025-10-17T11:29:17.488Z" }, + { url = "https://files.pythonhosted.org/packages/15/8b/318e8af2c904a9d29af91f78c1e18f0592e189bbdb8a462902d31fe20682/jiter-0.11.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:c92148eec91052538ce6823dfca9525f5cfc8b622d7f07e9891a280f61b8c96c", size = 305655, upload-time = "2025-10-17T11:29:18.859Z" }, + { url = "https://files.pythonhosted.org/packages/f7/29/6c7de6b5d6e511d9e736312c0c9bfcee8f9b6bef68182a08b1d78767e627/jiter-0.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ecd4da91b5415f183a6be8f7158d127bdd9e6a3174138293c0d48d6ea2f2009d", size = 315645, upload-time = "2025-10-17T11:29:20.889Z" }, + { url = "https://files.pythonhosted.org/packages/ac/5f/ef9e5675511ee0eb7f98dd8c90509e1f7743dbb7c350071acae87b0145f3/jiter-0.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7e3ac25c00b9275684d47aa42febaa90a9958e19fd1726c4ecf755fbe5e553b", size = 348003, upload-time = "2025-10-17T11:29:22.712Z" }, + { url = 
"https://files.pythonhosted.org/packages/56/1b/abe8c4021010b0a320d3c62682769b700fb66f92c6db02d1a1381b3db025/jiter-0.11.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:57d7305c0a841858f866cd459cd9303f73883fb5e097257f3d4a3920722c69d4", size = 365122, upload-time = "2025-10-17T11:29:24.408Z" }, + { url = "https://files.pythonhosted.org/packages/2a/2d/4a18013939a4f24432f805fbd5a19893e64650b933edb057cd405275a538/jiter-0.11.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e86fa10e117dce22c547f31dd6d2a9a222707d54853d8de4e9a2279d2c97f239", size = 488360, upload-time = "2025-10-17T11:29:25.724Z" }, + { url = "https://files.pythonhosted.org/packages/f0/77/38124f5d02ac4131f0dfbcfd1a19a0fac305fa2c005bc4f9f0736914a1a4/jiter-0.11.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ae5ef1d48aec7e01ee8420155d901bb1d192998fa811a65ebb82c043ee186711", size = 376884, upload-time = "2025-10-17T11:29:27.056Z" }, + { url = "https://files.pythonhosted.org/packages/7b/43/59fdc2f6267959b71dd23ce0bd8d4aeaf55566aa435a5d00f53d53c7eb24/jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb68e7bf65c990531ad8715e57d50195daf7c8e6f1509e617b4e692af1108939", size = 358827, upload-time = "2025-10-17T11:29:28.698Z" }, + { url = "https://files.pythonhosted.org/packages/7d/d0/b3cc20ff5340775ea3bbaa0d665518eddecd4266ba7244c9cb480c0c82ec/jiter-0.11.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43b30c8154ded5845fa454ef954ee67bfccce629b2dea7d01f795b42bc2bda54", size = 385171, upload-time = "2025-10-17T11:29:30.078Z" }, + { url = "https://files.pythonhosted.org/packages/d2/bc/94dd1f3a61f4dc236f787a097360ec061ceeebebf4ea120b924d91391b10/jiter-0.11.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:586cafbd9dd1f3ce6a22b4a085eaa6be578e47ba9b18e198d4333e598a91db2d", size = 518359, upload-time = "2025-10-17T11:29:31.464Z" }, + { url = 
"https://files.pythonhosted.org/packages/7e/8c/12ee132bd67e25c75f542c227f5762491b9a316b0dad8e929c95076f773c/jiter-0.11.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:677cc2517d437a83bb30019fd4cf7cad74b465914c56ecac3440d597ac135250", size = 509205, upload-time = "2025-10-17T11:29:32.895Z" }, + { url = "https://files.pythonhosted.org/packages/39/d5/9de848928ce341d463c7e7273fce90ea6d0ea4343cd761f451860fa16b59/jiter-0.11.1-cp312-cp312-win32.whl", hash = "sha256:fa992af648fcee2b850a3286a35f62bbbaeddbb6dbda19a00d8fbc846a947b6e", size = 205448, upload-time = "2025-10-17T11:29:34.217Z" }, + { url = "https://files.pythonhosted.org/packages/ee/b0/8002d78637e05009f5e3fb5288f9d57d65715c33b5d6aa20fd57670feef5/jiter-0.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:88b5cae9fa51efeb3d4bd4e52bfd4c85ccc9cac44282e2a9640893a042ba4d87", size = 204285, upload-time = "2025-10-17T11:29:35.446Z" }, + { url = "https://files.pythonhosted.org/packages/9f/a2/bb24d5587e4dff17ff796716542f663deee337358006a80c8af43ddc11e5/jiter-0.11.1-cp312-cp312-win_arm64.whl", hash = "sha256:9a6cae1ab335551917f882f2c3c1efe7617b71b4c02381e4382a8fc80a02588c", size = 188712, upload-time = "2025-10-17T11:29:37.027Z" }, + { url = "https://files.pythonhosted.org/packages/7c/4b/e4dd3c76424fad02a601d570f4f2a8438daea47ba081201a721a903d3f4c/jiter-0.11.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:71b6a920a5550f057d49d0e8bcc60945a8da998019e83f01adf110e226267663", size = 305272, upload-time = "2025-10-17T11:29:39.249Z" }, + { url = "https://files.pythonhosted.org/packages/67/83/2cd3ad5364191130f4de80eacc907f693723beaab11a46c7d155b07a092c/jiter-0.11.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b3de72e925388453a5171be83379549300db01284f04d2a6f244d1d8de36f94", size = 314038, upload-time = "2025-10-17T11:29:40.563Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/3c/8e67d9ba524e97d2f04c8f406f8769a23205026b13b0938d16646d6e2d3e/jiter-0.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc19dd65a2bd3d9c044c5b4ebf657ca1e6003a97c0fc10f555aa4f7fb9821c00", size = 345977, upload-time = "2025-10-17T11:29:42.009Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a5/489ce64d992c29bccbffabb13961bbb0435e890d7f2d266d1f3df5e917d2/jiter-0.11.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d58faaa936743cd1464540562f60b7ce4fd927e695e8bc31b3da5b914baa9abd", size = 364503, upload-time = "2025-10-17T11:29:43.459Z" }, + { url = "https://files.pythonhosted.org/packages/d4/c0/e321dd83ee231d05c8fe4b1a12caf1f0e8c7a949bf4724d58397104f10f2/jiter-0.11.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:902640c3103625317291cb73773413b4d71847cdf9383ba65528745ff89f1d14", size = 487092, upload-time = "2025-10-17T11:29:44.835Z" }, + { url = "https://files.pythonhosted.org/packages/f9/5e/8f24ec49c8d37bd37f34ec0112e0b1a3b4b5a7b456c8efff1df5e189ad43/jiter-0.11.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30405f726e4c2ed487b176c09f8b877a957f535d60c1bf194abb8dadedb5836f", size = 376328, upload-time = "2025-10-17T11:29:46.175Z" }, + { url = "https://files.pythonhosted.org/packages/7f/70/ded107620e809327cf7050727e17ccfa79d6385a771b7fe38fb31318ef00/jiter-0.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3217f61728b0baadd2551844870f65219ac4a1285d5e1a4abddff3d51fdabe96", size = 356632, upload-time = "2025-10-17T11:29:47.454Z" }, + { url = "https://files.pythonhosted.org/packages/19/53/c26f7251613f6a9079275ee43c89b8a973a95ff27532c421abc2a87afb04/jiter-0.11.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b1364cc90c03a8196f35f396f84029f12abe925415049204446db86598c8b72c", size = 384358, upload-time = "2025-10-17T11:29:49.377Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/16/e0f2cc61e9c4d0b62f6c1bd9b9781d878a427656f88293e2a5335fa8ff07/jiter-0.11.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:53a54bf8e873820ab186b2dca9f6c3303f00d65ae5e7b7d6bda1b95aa472d646", size = 517279, upload-time = "2025-10-17T11:29:50.968Z" }, + { url = "https://files.pythonhosted.org/packages/60/5c/4cd095eaee68961bca3081acbe7c89e12ae24a5dae5fd5d2a13e01ed2542/jiter-0.11.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7e29aca023627b0e0c2392d4248f6414d566ff3974fa08ff2ac8dbb96dfee92a", size = 508276, upload-time = "2025-10-17T11:29:52.619Z" }, + { url = "https://files.pythonhosted.org/packages/4f/25/f459240e69b0e09a7706d96ce203ad615ca36b0fe832308d2b7123abf2d0/jiter-0.11.1-cp313-cp313-win32.whl", hash = "sha256:f153e31d8bca11363751e875c0a70b3d25160ecbaee7b51e457f14498fb39d8b", size = 205593, upload-time = "2025-10-17T11:29:53.938Z" }, + { url = "https://files.pythonhosted.org/packages/7c/16/461bafe22bae79bab74e217a09c907481a46d520c36b7b9fe71ee8c9e983/jiter-0.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:f773f84080b667c69c4ea0403fc67bb08b07e2b7ce1ef335dea5868451e60fed", size = 203518, upload-time = "2025-10-17T11:29:55.216Z" }, + { url = "https://files.pythonhosted.org/packages/7b/72/c45de6e320edb4fa165b7b1a414193b3cae302dd82da2169d315dcc78b44/jiter-0.11.1-cp313-cp313-win_arm64.whl", hash = "sha256:635ecd45c04e4c340d2187bcb1cea204c7cc9d32c1364d251564bf42e0e39c2d", size = 188062, upload-time = "2025-10-17T11:29:56.631Z" }, + { url = "https://files.pythonhosted.org/packages/65/9b/4a57922437ca8753ef823f434c2dec5028b237d84fa320f06a3ba1aec6e8/jiter-0.11.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d892b184da4d94d94ddb4031296931c74ec8b325513a541ebfd6dfb9ae89904b", size = 313814, upload-time = "2025-10-17T11:29:58.509Z" }, + { url = 
"https://files.pythonhosted.org/packages/76/50/62a0683dadca25490a4bedc6a88d59de9af2a3406dd5a576009a73a1d392/jiter-0.11.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa22c223a3041dacb2fcd37c70dfd648b44662b4a48e242592f95bda5ab09d58", size = 344987, upload-time = "2025-10-17T11:30:00.208Z" }, + { url = "https://files.pythonhosted.org/packages/da/00/2355dbfcbf6cdeaddfdca18287f0f38ae49446bb6378e4a5971e9356fc8a/jiter-0.11.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:330e8e6a11ad4980cd66a0f4a3e0e2e0f646c911ce047014f984841924729789", size = 356399, upload-time = "2025-10-17T11:30:02.084Z" }, + { url = "https://files.pythonhosted.org/packages/c9/07/c2bd748d578fa933d894a55bff33f983bc27f75fc4e491b354bef7b78012/jiter-0.11.1-cp313-cp313t-win_amd64.whl", hash = "sha256:09e2e386ebf298547ca3a3704b729471f7ec666c2906c5c26c1a915ea24741ec", size = 203289, upload-time = "2025-10-17T11:30:03.656Z" }, + { url = "https://files.pythonhosted.org/packages/e6/ee/ace64a853a1acbd318eb0ca167bad1cf5ee037207504b83a868a5849747b/jiter-0.11.1-cp313-cp313t-win_arm64.whl", hash = "sha256:fe4a431c291157e11cee7c34627990ea75e8d153894365a3bc84b7a959d23ca8", size = 188284, upload-time = "2025-10-17T11:30:05.046Z" }, + { url = "https://files.pythonhosted.org/packages/8d/00/d6006d069e7b076e4c66af90656b63da9481954f290d5eca8c715f4bf125/jiter-0.11.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:0fa1f70da7a8a9713ff8e5f75ec3f90c0c870be6d526aa95e7c906f6a1c8c676", size = 304624, upload-time = "2025-10-17T11:30:06.678Z" }, + { url = "https://files.pythonhosted.org/packages/fc/45/4a0e31eb996b9ccfddbae4d3017b46f358a599ccf2e19fbffa5e531bd304/jiter-0.11.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:569ee559e5046a42feb6828c55307cf20fe43308e3ae0d8e9e4f8d8634d99944", size = 315042, upload-time = "2025-10-17T11:30:08.87Z" }, + { url = 
"https://files.pythonhosted.org/packages/e7/91/22f5746f5159a28c76acdc0778801f3c1181799aab196dbea2d29e064968/jiter-0.11.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f69955fa1d92e81987f092b233f0be49d4c937da107b7f7dcf56306f1d3fcce9", size = 346357, upload-time = "2025-10-17T11:30:10.222Z" }, + { url = "https://files.pythonhosted.org/packages/f5/4f/57620857d4e1dc75c8ff4856c90cb6c135e61bff9b4ebfb5dc86814e82d7/jiter-0.11.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:090f4c9d4a825e0fcbd0a2647c9a88a0f366b75654d982d95a9590745ff0c48d", size = 365057, upload-time = "2025-10-17T11:30:11.585Z" }, + { url = "https://files.pythonhosted.org/packages/ce/34/caf7f9cc8ae0a5bb25a5440cc76c7452d264d1b36701b90fdadd28fe08ec/jiter-0.11.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbf3d8cedf9e9d825233e0dcac28ff15c47b7c5512fdfe2e25fd5bbb6e6b0cee", size = 487086, upload-time = "2025-10-17T11:30:13.052Z" }, + { url = "https://files.pythonhosted.org/packages/50/17/85b5857c329d533d433fedf98804ebec696004a1f88cabad202b2ddc55cf/jiter-0.11.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2aa9b1958f9c30d3d1a558b75f0626733c60eb9b7774a86b34d88060be1e67fe", size = 376083, upload-time = "2025-10-17T11:30:14.416Z" }, + { url = "https://files.pythonhosted.org/packages/85/d3/2d9f973f828226e6faebdef034097a2918077ea776fb4d88489949024787/jiter-0.11.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42d1ca16590b768c5e7d723055acd2633908baacb3628dd430842e2e035aa90", size = 357825, upload-time = "2025-10-17T11:30:15.765Z" }, + { url = "https://files.pythonhosted.org/packages/f4/55/848d4dabf2c2c236a05468c315c2cb9dc736c5915e65449ccecdba22fb6f/jiter-0.11.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5db4c2486a023820b701a17aec9c5a6173c5ba4393f26662f032f2de9c848b0f", size = 383933, upload-time = "2025-10-17T11:30:17.34Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/6c/204c95a4fbb0e26dfa7776c8ef4a878d0c0b215868011cc904bf44f707e2/jiter-0.11.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:4573b78777ccfac954859a6eff45cbd9d281d80c8af049d0f1a3d9fc323d5c3a", size = 517118, upload-time = "2025-10-17T11:30:18.684Z" }, + { url = "https://files.pythonhosted.org/packages/88/25/09956644ea5a2b1e7a2a0f665cb69a973b28f4621fa61fc0c0f06ff40a31/jiter-0.11.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7593ac6f40831d7961cb67633c39b9fef6689a211d7919e958f45710504f52d3", size = 508194, upload-time = "2025-10-17T11:30:20.719Z" }, + { url = "https://files.pythonhosted.org/packages/09/49/4d1657355d7f5c9e783083a03a3f07d5858efa6916a7d9634d07db1c23bd/jiter-0.11.1-cp314-cp314-win32.whl", hash = "sha256:87202ec6ff9626ff5f9351507def98fcf0df60e9a146308e8ab221432228f4ea", size = 203961, upload-time = "2025-10-17T11:30:22.073Z" }, + { url = "https://files.pythonhosted.org/packages/76/bd/f063bd5cc2712e7ca3cf6beda50894418fc0cfeb3f6ff45a12d87af25996/jiter-0.11.1-cp314-cp314-win_amd64.whl", hash = "sha256:a5dd268f6531a182c89d0dd9a3f8848e86e92dfff4201b77a18e6b98aa59798c", size = 202804, upload-time = "2025-10-17T11:30:23.452Z" }, + { url = "https://files.pythonhosted.org/packages/52/ca/4d84193dfafef1020bf0bedd5e1a8d0e89cb67c54b8519040effc694964b/jiter-0.11.1-cp314-cp314-win_arm64.whl", hash = "sha256:5d761f863f912a44748a21b5c4979c04252588ded8d1d2760976d2e42cd8d991", size = 188001, upload-time = "2025-10-17T11:30:24.915Z" }, + { url = "https://files.pythonhosted.org/packages/d5/fa/3b05e5c9d32efc770a8510eeb0b071c42ae93a5b576fd91cee9af91689a1/jiter-0.11.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2cc5a3965285ddc33e0cab933e96b640bc9ba5940cea27ebbbf6695e72d6511c", size = 312561, upload-time = "2025-10-17T11:30:26.742Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/d3/335822eb216154ddb79a130cbdce88fdf5c3e2b43dc5dba1fd95c485aaf5/jiter-0.11.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b572b3636a784c2768b2342f36a23078c8d3aa6d8a30745398b1bab58a6f1a8", size = 344551, upload-time = "2025-10-17T11:30:28.252Z" }, + { url = "https://files.pythonhosted.org/packages/31/6d/a0bed13676b1398f9b3ba61f32569f20a3ff270291161100956a577b2dd3/jiter-0.11.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad93e3d67a981f96596d65d2298fe8d1aa649deb5374a2fb6a434410ee11915e", size = 363051, upload-time = "2025-10-17T11:30:30.009Z" }, + { url = "https://files.pythonhosted.org/packages/a4/03/313eda04aa08545a5a04ed5876e52f49ab76a4d98e54578896ca3e16313e/jiter-0.11.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a83097ce379e202dcc3fe3fc71a16d523d1ee9192c8e4e854158f96b3efe3f2f", size = 485897, upload-time = "2025-10-17T11:30:31.429Z" }, + { url = "https://files.pythonhosted.org/packages/5f/13/a1011b9d325e40b53b1b96a17c010b8646013417f3902f97a86325b19299/jiter-0.11.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7042c51e7fbeca65631eb0c332f90c0c082eab04334e7ccc28a8588e8e2804d9", size = 375224, upload-time = "2025-10-17T11:30:33.18Z" }, + { url = "https://files.pythonhosted.org/packages/92/da/1b45026b19dd39b419e917165ff0ea629dbb95f374a3a13d2df95e40a6ac/jiter-0.11.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a68d679c0e47649a61df591660507608adc2652442de7ec8276538ac46abe08", size = 356606, upload-time = "2025-10-17T11:30:34.572Z" }, + { url = "https://files.pythonhosted.org/packages/7a/0c/9acb0e54d6a8ba59ce923a180ebe824b4e00e80e56cefde86cc8e0a948be/jiter-0.11.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a1b0da75dbf4b6ec0b3c9e604d1ee8beaf15bc046fff7180f7d89e3cdbd3bb51", size = 384003, upload-time = "2025-10-17T11:30:35.987Z" }, + { url = 
"https://files.pythonhosted.org/packages/3f/2b/e5a5fe09d6da2145e4eed651e2ce37f3c0cf8016e48b1d302e21fb1628b7/jiter-0.11.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:69dd514bf0fa31c62147d6002e5ca2b3e7ef5894f5ac6f0a19752385f4e89437", size = 516946, upload-time = "2025-10-17T11:30:37.425Z" }, + { url = "https://files.pythonhosted.org/packages/5f/fe/db936e16e0228d48eb81f9934e8327e9fde5185e84f02174fcd22a01be87/jiter-0.11.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:bb31ac0b339efa24c0ca606febd8b77ef11c58d09af1b5f2be4c99e907b11111", size = 507614, upload-time = "2025-10-17T11:30:38.977Z" }, + { url = "https://files.pythonhosted.org/packages/86/db/c4438e8febfb303486d13c6b72f5eb71cf851e300a0c1f0b4140018dd31f/jiter-0.11.1-cp314-cp314t-win32.whl", hash = "sha256:b2ce0d6156a1d3ad41da3eec63b17e03e296b78b0e0da660876fccfada86d2f7", size = 204043, upload-time = "2025-10-17T11:30:40.308Z" }, + { url = "https://files.pythonhosted.org/packages/36/59/81badb169212f30f47f817dfaabf965bc9b8204fed906fab58104ee541f9/jiter-0.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f4db07d127b54c4a2d43b4cf05ff0193e4f73e0dd90c74037e16df0b29f666e1", size = 204046, upload-time = "2025-10-17T11:30:41.692Z" }, + { url = "https://files.pythonhosted.org/packages/dd/01/43f7b4eb61db3e565574c4c5714685d042fb652f9eef7e5a3de6aafa943a/jiter-0.11.1-cp314-cp314t-win_arm64.whl", hash = "sha256:28e4fdf2d7ebfc935523e50d1efa3970043cfaa161674fe66f9642409d001dfe", size = 188069, upload-time = "2025-10-17T11:30:43.23Z" }, + { url = "https://files.pythonhosted.org/packages/9d/51/bd41562dd284e2a18b6dc0a99d195fd4a3560d52ab192c42e56fe0316643/jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:e642b5270e61dd02265866398707f90e365b5db2eb65a4f30c789d826682e1f6", size = 306871, upload-time = "2025-10-17T11:31:03.616Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/cb/64e7f21dd357e8cd6b3c919c26fac7fc198385bbd1d85bb3b5355600d787/jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:464ba6d000585e4e2fd1e891f31f1231f497273414f5019e27c00a4b8f7a24ad", size = 301454, upload-time = "2025-10-17T11:31:05.338Z" }, + { url = "https://files.pythonhosted.org/packages/55/b0/54bdc00da4ef39801b1419a01035bd8857983de984fd3776b0be6b94add7/jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:055568693ab35e0bf3a171b03bb40b2dcb10352359e0ab9b5ed0da2bf1eb6f6f", size = 336801, upload-time = "2025-10-17T11:31:06.893Z" }, + { url = "https://files.pythonhosted.org/packages/de/8f/87176ed071d42e9db415ed8be787ef4ef31a4fa27f52e6a4fbf34387bd28/jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c69ea798d08a915ba4478113efa9e694971e410056392f4526d796f136d3fa", size = 343452, upload-time = "2025-10-17T11:31:08.259Z" }, + { url = "https://files.pythonhosted.org/packages/a6/bc/950dd7f170c6394b6fdd73f989d9e729bd98907bcc4430ef080a72d06b77/jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:0d4d6993edc83cf75e8c6828a8d6ce40a09ee87e38c7bfba6924f39e1337e21d", size = 302626, upload-time = "2025-10-17T11:31:09.645Z" }, + { url = "https://files.pythonhosted.org/packages/3a/65/43d7971ca82ee100b7b9b520573eeef7eabc0a45d490168ebb9a9b5bb8b2/jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f78d151c83a87a6cf5461d5ee55bc730dd9ae227377ac6f115b922989b95f838", size = 297034, upload-time = "2025-10-17T11:31:10.975Z" }, + { url = "https://files.pythonhosted.org/packages/19/4c/000e1e0c0c67e96557a279f8969487ea2732d6c7311698819f977abae837/jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9022974781155cd5521d5cb10997a03ee5e31e8454c9d999dcdccd253f2353f", size = 
337328, upload-time = "2025-10-17T11:31:12.399Z" }, + { url = "https://files.pythonhosted.org/packages/d9/71/71408b02c6133153336d29fa3ba53000f1e1a3f78bb2fc2d1a1865d2e743/jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18c77aaa9117510d5bdc6a946baf21b1f0cfa58ef04d31c8d016f206f2118960", size = 343697, upload-time = "2025-10-17T11:31:13.773Z" }, ] [[package]] @@ -3440,6 +3483,7 @@ version = "0.3.0" source = { editable = "." } dependencies = [ { name = "gitpython" }, + { name = "instructor" }, { name = "pydantic" }, { name = "rich" }, { name = "tqdm" }, @@ -3687,6 +3731,7 @@ requires-dist = [ { name = "gitpython", specifier = ">=3.1.0" }, { name = "google-genai", marker = "extra == 'google-genai'", specifier = ">=1.37.0" }, { name = "h5py", marker = "extra == 'disco'", specifier = ">=3.0.0" }, + { name = "instructor", specifier = ">=1.14.5" }, { name = "ipykernel", marker = "extra == 'examples'", specifier = ">=6.0.0" }, { name = "ipython", marker = "extra == 'disco'", specifier = ">=8.0.0" }, { name = "ipywidgets", marker = "extra == 'examples'", specifier = ">=8.0.0" }, @@ -4870,7 +4915,7 @@ wheels = [ [[package]] name = "openai" -version = "1.109.1" +version = "2.29.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -4882,9 +4927,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c6/a1/a303104dc55fc546a3f6914c842d3da471c64eec92043aef8f652eb6c524/openai-1.109.1.tar.gz", hash = "sha256:d173ed8dbca665892a6db099b4a2dfac624f94d20a93f46eb0b56aae940ed869", size = 564133, upload-time = "2025-09-24T13:00:53.075Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/15/203d537e58986b5673e7f232453a2a2f110f22757b15921cbdeea392e520/openai-2.29.0.tar.gz", hash = "sha256:32d09eb2f661b38d3edd7d7e1a2943d1633f572596febe64c0cd370c86d52bec", size = 671128, upload-time = 
"2026-03-17T17:53:49.599Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/2a/7dd3d207ec669cacc1f186fd856a0f61dbc255d24f6fdc1a6715d6051b0f/openai-1.109.1-py3-none-any.whl", hash = "sha256:6bcaf57086cf59159b8e27447e4e7dd019db5d29a438072fbd49c290c7e65315", size = 948627, upload-time = "2025-09-24T13:00:50.754Z" }, + { url = "https://files.pythonhosted.org/packages/d0/b1/35b6f9c8cf9318e3dbb7146cc82dab4cf61182a8d5406fc9b50864362895/openai-2.29.0-py3-none-any.whl", hash = "sha256:b7c5de513c3286d17c5e29b92c4c98ceaf0d775244ac8159aeb1bddf840eb42a", size = 1141533, upload-time = "2026-03-17T17:53:47.348Z" }, ] [[package]] From caab7927a6a02300c1b3a76495cd60af0c934290 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sun, 22 Mar 2026 21:08:34 +0100 Subject: [PATCH 2/7] added optional dependency for google + instructor --- pyproject.toml | 5 +- tests/test_core/test_instructor.py | 179 ++++++++++++++++++ .../test_model_integration/test_live_api.py | 85 +++++++++ uv.lock | 13 ++ 4 files changed, 281 insertions(+), 1 deletion(-) create mode 100644 tests/test_core/test_instructor.py diff --git a/pyproject.toml b/pyproject.toml index 075484d4..23ba940f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,10 @@ camel = ["camel-ai>=0.2.0", "litellm>=1.0.0"] # Inference engines anthropic = ["anthropic>=0.40.0"] openai = ["openai>=1.107.2"] -google-genai = ["google-genai>=1.37.0"] +google-genai = [ + "google-genai>=1.37.0", + "jsonref>=1.1.0", +] transformers = ["transformers>=4.37.0"] litellm = ["litellm>=1.0.0"] diff --git a/tests/test_core/test_instructor.py b/tests/test_core/test_instructor.py new file mode 100644 index 00000000..9bf16cf5 --- /dev/null +++ b/tests/test_core/test_instructor.py @@ -0,0 +1,179 @@ +"""Tests for maseval.core.instructor module. + +Tests the schema flattening logic (``flatten_model_schema``) and the +instructor client factory (``create_instructor_client``). 
+ +Schema tests are pure unit tests — they exercise Pydantic model → JSON +schema conversion without any mocking or network access. +""" + +import pytest +from typing import Optional, List +from pydantic import BaseModel, Field + +from maseval.core.instructor import flatten_model_schema, create_instructor_client + +# ── Test models ────────────────────────────────────────────────────────────── + + +class SimpleModel(BaseModel): + """Model with all required fields.""" + + name: str = Field(description="The name") + age: int = Field(description="The age") + score: float = Field(description="The score") + + +class OptionalFieldsModel(BaseModel): + """Model with Optional fields that produce anyOf in Pydantic v2 schemas.""" + + required_field: str = Field(description="Always required") + optional_field: Optional[str] = Field(default=None, description="May be absent") + optional_int: Optional[int] = Field(default=None, description="Optional number") + + +class Address(BaseModel): + street: str + city: str + + +class NestedModel(BaseModel): + """Model with a nested sub-model that produces $ref/$defs.""" + + name: str = Field(description="Person name") + address: Address = Field(description="Home address") + + +class ListFieldModel(BaseModel): + """Model with list fields.""" + + tags: List[str] = Field(description="A list of tags") + scores: List[float] = Field(description="A list of scores") + + +# ── flatten_model_schema tests ─────────────────────────────────────────────── + +STRIPPED_KEYS = {"$defs", "additionalProperties", "title", "default"} + + +def _assert_no_stripped_keys(schema: dict) -> None: + """Recursively verify no stripped keys exist anywhere in the schema.""" + for key in STRIPPED_KEYS: + assert key not in schema, f"Found stripped key {key!r} in schema" + for value in schema.values(): + if isinstance(value, dict): + _assert_no_stripped_keys(value) + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + 
_assert_no_stripped_keys(item) + + +@pytest.mark.core +class TestFlattenModelSchema: + """Tests for flatten_model_schema().""" + + def test_simple_model_produces_flat_schema(self): + """Simple model with all required fields produces correct types and descriptions.""" + schema = flatten_model_schema(SimpleModel) + + assert schema["type"] == "object" + props = schema["properties"] + + assert props["name"]["type"] == "string" + assert props["name"]["description"] == "The name" + assert props["age"]["type"] == "integer" + assert props["score"]["type"] == "number" + + assert "name" in schema["required"] + assert "age" in schema["required"] + assert "score" in schema["required"] + + def test_optional_fields_resolved_to_nullable(self): + """Optional[X] fields have anyOf removed and nullable added.""" + schema = flatten_model_schema(OptionalFieldsModel) + props = schema["properties"] + + # Required field is straightforward + assert props["required_field"]["type"] == "string" + assert "nullable" not in props["required_field"] + + # Optional fields should have base type + nullable + assert props["optional_field"]["type"] == "string" + assert props["optional_field"]["nullable"] is True + + assert props["optional_int"]["type"] == "integer" + assert props["optional_int"]["nullable"] is True + + # No anyOf should remain anywhere + assert "anyOf" not in str(schema) + + def test_optional_field_preserves_description(self): + """Description on Optional fields survives the anyOf resolution.""" + schema = flatten_model_schema(OptionalFieldsModel) + props = schema["properties"] + + assert props["optional_field"]["description"] == "May be absent" + assert props["optional_int"]["description"] == "Optional number" + + def test_nested_model_defs_stripped(self): + """Nested model $defs are stripped from the schema. + + Note: $ref references are preserved but $defs definitions are removed. 
+ In practice this doesn't arise — Tau2 uses create_model() with simple + types (str, int, float) which don't produce $ref. + """ + schema = flatten_model_schema(NestedModel) + + # $defs should be stripped + assert "$defs" not in schema + + # Description is preserved on the nested field + assert schema["properties"]["address"]["description"] == "Home address" + + def test_stripped_keys_removed(self): + """$defs, additionalProperties, title, default are stripped recursively.""" + schema = flatten_model_schema(NestedModel) + _assert_no_stripped_keys(schema) + + def test_list_fields_preserved(self): + """List[X] fields produce correct array schemas.""" + schema = flatten_model_schema(ListFieldModel) + props = schema["properties"] + + assert props["tags"]["type"] == "array" + assert props["scores"]["type"] == "array" + + +# ── create_instructor_client tests ─────────────────────────────────────────── + + +@pytest.mark.core +class TestCreateInstructorClient: + """Tests for create_instructor_client().""" + + def test_unknown_provider_raises_value_error(self): + """Unsupported provider raises ValueError with helpful message.""" + with pytest.raises(ValueError, match="Unsupported provider"): + create_instructor_client(object(), provider="not-a-real-provider") + + def test_openai_provider_returns_patched_client(self): + """OpenAI client is wrapped and exposes chat.completions.create.""" + from openai import OpenAI + + client = OpenAI(api_key="test-key-not-real") + patched = create_instructor_client(client, provider="openai") + + assert hasattr(patched, "chat") + assert hasattr(patched.chat, "completions") + assert callable(patched.chat.completions.create) + + def test_litellm_provider_returns_patched_client(self): + """LiteLLM completion function is wrapped and exposes chat.completions.create.""" + litellm = pytest.importorskip("litellm") + + patched = create_instructor_client(litellm.completion, provider="litellm") + + assert hasattr(patched, "chat") + assert 
hasattr(patched.chat, "completions") + assert callable(patched.chat.completions.create) diff --git a/tests/test_interface/test_model_integration/test_live_api.py b/tests/test_interface/test_model_integration/test_live_api.py index 40e8793b..0d98d472 100644 --- a/tests/test_interface/test_model_integration/test_live_api.py +++ b/tests/test_interface/test_model_integration/test_live_api.py @@ -8,6 +8,7 @@ - ``ChatResponse`` fields are populated correctly (content, role, usage, stop_reason) - Tool calling produces properly structured ``tool_calls`` dicts +- Structured output via ``response_model`` returns validated Pydantic instances - The adapter's format conversions survive a real API round-trip These tests require API keys and incur small costs (~$0.001 per run). @@ -28,6 +29,7 @@ import os import pytest +from pydantic import BaseModel, Field pytestmark = [pytest.mark.interface, pytest.mark.credentialed] @@ -35,6 +37,15 @@ requires_anthropic = pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="ANTHROPIC_API_KEY not set") requires_google = pytest.mark.skipif(not os.environ.get("GOOGLE_API_KEY"), reason="GOOGLE_API_KEY not set") + +# Shared response model used across all structured output tests. +class Capital(BaseModel): + """A country's capital city.""" + + city: str = Field(description="Name of the capital city") + country: str = Field(description="Name of the country") + + # Shared tool definition used across all provider tests. 
WEATHER_TOOL = { "type": "function", @@ -110,6 +121,25 @@ def test_tool_call_response(self): assert isinstance(args, dict) assert "city" in args + @requires_openai + def test_structured_output(self): + """Structured output via response_model returns a validated Pydantic instance.""" + from openai import OpenAI + from maseval.interface.inference.openai import OpenAIModelAdapter + + client = OpenAI() + adapter = OpenAIModelAdapter(client=client, model_id="gpt-4o-mini") + response = adapter.chat( + [{"role": "user", "content": "What is the capital of France?"}], + response_model=Capital, + generation_params={"max_tokens": 50}, + ) + + assert isinstance(response.structured_response, Capital) + assert response.structured_response.city.lower() == "paris" + assert response.structured_response.country.lower() == "france" + assert response.content is not None # JSON serialization of the model + # ============================================================================= # Anthropic @@ -167,6 +197,24 @@ def test_tool_use_response(self): assert isinstance(args, dict) assert "city" in args + @requires_anthropic + def test_structured_output(self): + """Structured output via response_model returns a validated Pydantic instance.""" + from anthropic import Anthropic + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + client = Anthropic() + adapter = AnthropicModelAdapter(client=client, model_id="claude-3-5-haiku-20241022", max_tokens=100) + response = adapter.chat( + [{"role": "user", "content": "What is the capital of France?"}], + response_model=Capital, + ) + + assert isinstance(response.structured_response, Capital) + assert response.structured_response.city.lower() == "paris" + assert response.structured_response.country.lower() == "france" + assert response.content is not None + # ============================================================================= # Google GenAI @@ -223,6 +271,25 @@ def test_function_call_response(self): assert 
isinstance(args, dict) assert "city" in args + @requires_google + def test_structured_output(self): + """Structured output via response_model returns a validated Pydantic instance.""" + from google import genai + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + client = genai.Client() + adapter = GoogleGenAIModelAdapter(client=client, model_id="gemini-2.0-flash") + response = adapter.chat( + [{"role": "user", "content": "What is the capital of France?"}], + response_model=Capital, + generation_params={"max_output_tokens": 50}, + ) + + assert isinstance(response.structured_response, Capital) + assert response.structured_response.city.lower() == "paris" + assert response.structured_response.country.lower() == "france" + assert response.content is not None + # ============================================================================= # LiteLLM (routes through OpenAI) @@ -282,3 +349,21 @@ def test_tool_call_response(self): args = json.loads(tc["function"]["arguments"]) assert isinstance(args, dict) assert "city" in args + + @requires_openai + def test_structured_output(self): + """Structured output via response_model returns a validated Pydantic instance.""" + pytest.importorskip("litellm") + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + adapter = LiteLLMModelAdapter(model_id="gpt-4o-mini") + response = adapter.chat( + [{"role": "user", "content": "What is the capital of France?"}], + response_model=Capital, + generation_params={"max_tokens": 50}, + ) + + assert isinstance(response.structured_response, Capital) + assert response.structured_response.city.lower() == "paris" + assert response.structured_response.country.lower() == "france" + assert response.content is not None diff --git a/uv.lock b/uv.lock index c1e20868..6c5e0a4d 100644 --- a/uv.lock +++ b/uv.lock @@ -2420,6 +2420,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/71/92/5e77f98553e9e75130c78900d000368476aed74276eb8ae8796f65f00918/jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942", size = 7595, upload-time = "2024-06-10T19:24:40.698Z" }, ] +[[package]] +name = "jsonref" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" }, +] + [[package]] name = "jsonschema" version = "4.25.1" @@ -3513,6 +3522,7 @@ all = [ { name = "ipywidgets" }, { name = "javascript" }, { name = "jsonlines" }, + { name = "jsonref" }, { name = "jupyter" }, { name = "keybert" }, { name = "langchain" }, @@ -3600,6 +3610,7 @@ examples = [ { name = "ipython", version = "9.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "ipywidgets" }, { name = "jsonlines" }, + { name = "jsonref" }, { name = "jupyter" }, { name = "langchain" }, { name = "langchain-google-genai" }, @@ -3633,6 +3644,7 @@ gaia2 = [ ] google-genai = [ { name = "google-genai" }, + { name = "jsonref" }, ] langfuse = [ { name = "langfuse" }, @@ -3737,6 +3749,7 @@ requires-dist = [ { name = "ipywidgets", marker = "extra == 'examples'", specifier = ">=8.0.0" }, { name = "javascript", marker = "extra == 'multiagentbench'", specifier = ">=1!1.2.0" }, { name = "jsonlines", marker = "extra == 'disco'", specifier = ">=4.0.0" }, + { name = 
"jsonref", marker = "extra == 'google-genai'", specifier = ">=1.1.0" }, { name = "jupyter", marker = "extra == 'examples'", specifier = ">=1.0.0" }, { name = "keybert", marker = "extra == 'multiagentbench'", specifier = ">=0.8.0" }, { name = "langchain", marker = "extra == 'examples'", specifier = ">=0.3.27" }, From 159a4d54eefcab828da0c8f09e29bb7cf88db3fb Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sun, 22 Mar 2026 21:34:17 +0100 Subject: [PATCH 3/7] improved testing --- tests/conftest.py | 11 +++++++ .../test_model_integration/test_live_api.py | 31 +++++++++++-------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e21f3e39..85b93391 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,19 @@ """Shared fixtures for MASEval tests.""" +import os + import pytest from typing import Any, Dict, List, Optional, Sequence, Tuple +# Load .env for local development (CI injects secrets via environment). +if not os.environ.get("CI"): + try: + from dotenv import load_dotenv + + load_dotenv() + except ImportError: + pass + from maseval import ( Benchmark, AgentAdapter, diff --git a/tests/test_interface/test_model_integration/test_live_api.py b/tests/test_interface/test_model_integration/test_live_api.py index 0d98d472..66ceea63 100644 --- a/tests/test_interface/test_model_integration/test_live_api.py +++ b/tests/test_interface/test_model_integration/test_live_api.py @@ -37,6 +37,12 @@ requires_anthropic = pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="ANTHROPIC_API_KEY not set") requires_google = pytest.mark.skipif(not os.environ.get("GOOGLE_API_KEY"), reason="GOOGLE_API_KEY not set") +# Model IDs used across tests. Update here when models are rotated. 
+OPENAI_MODEL = "gpt-4o-mini" +ANTHROPIC_MODEL = "claude-haiku-4-5" +GOOGLE_MODEL = "gemini-2.0-flash" +LITELLM_MODEL = "gpt-4o-mini" + # Shared response model used across all structured output tests. class Capital(BaseModel): @@ -78,7 +84,7 @@ def test_text_response(self): from maseval.interface.inference.openai import OpenAIModelAdapter client = OpenAI() - adapter = OpenAIModelAdapter(client=client, model_id="gpt-4o-mini") + adapter = OpenAIModelAdapter(client=client, model_id=OPENAI_MODEL) response = adapter.chat( [{"role": "user", "content": "Say 'test' and nothing else."}], generation_params={"max_tokens": 10}, @@ -102,7 +108,7 @@ def test_tool_call_response(self): from maseval.interface.inference.openai import OpenAIModelAdapter client = OpenAI() - adapter = OpenAIModelAdapter(client=client, model_id="gpt-4o-mini") + adapter = OpenAIModelAdapter(client=client, model_id=OPENAI_MODEL) response = adapter.chat( [{"role": "user", "content": "What is the weather in Paris? You must use the get_weather tool."}], tools=[WEATHER_TOOL], @@ -128,7 +134,7 @@ def test_structured_output(self): from maseval.interface.inference.openai import OpenAIModelAdapter client = OpenAI() - adapter = OpenAIModelAdapter(client=client, model_id="gpt-4o-mini") + adapter = OpenAIModelAdapter(client=client, model_id=OPENAI_MODEL) response = adapter.chat( [{"role": "user", "content": "What is the capital of France?"}], response_model=Capital, @@ -156,7 +162,7 @@ def test_text_response(self): from maseval.interface.inference.anthropic import AnthropicModelAdapter client = Anthropic() - adapter = AnthropicModelAdapter(client=client, model_id="claude-3-5-haiku-20241022", max_tokens=10) + adapter = AnthropicModelAdapter(client=client, model_id=ANTHROPIC_MODEL, max_tokens=10) response = adapter.chat( [{"role": "user", "content": "Say 'test' and nothing else."}], ) @@ -179,7 +185,7 @@ def test_tool_use_response(self): from maseval.interface.inference.anthropic import AnthropicModelAdapter client = 
Anthropic() - adapter = AnthropicModelAdapter(client=client, model_id="claude-3-5-haiku-20241022", max_tokens=100) + adapter = AnthropicModelAdapter(client=client, model_id=ANTHROPIC_MODEL, max_tokens=100) response = adapter.chat( [{"role": "user", "content": "What is the weather in Paris? You must use the get_weather tool."}], tools=[WEATHER_TOOL], @@ -204,7 +210,7 @@ def test_structured_output(self): from maseval.interface.inference.anthropic import AnthropicModelAdapter client = Anthropic() - adapter = AnthropicModelAdapter(client=client, model_id="claude-3-5-haiku-20241022", max_tokens=100) + adapter = AnthropicModelAdapter(client=client, model_id=ANTHROPIC_MODEL, max_tokens=100) response = adapter.chat( [{"role": "user", "content": "What is the capital of France?"}], response_model=Capital, @@ -231,7 +237,7 @@ def test_text_response(self): from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter client = genai.Client() - adapter = GoogleGenAIModelAdapter(client=client, model_id="gemini-2.0-flash") + adapter = GoogleGenAIModelAdapter(client=client, model_id=GOOGLE_MODEL) response = adapter.chat( [{"role": "user", "content": "Say 'test' and nothing else."}], generation_params={"max_output_tokens": 10}, @@ -253,7 +259,7 @@ def test_function_call_response(self): from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter client = genai.Client() - adapter = GoogleGenAIModelAdapter(client=client, model_id="gemini-2.0-flash") + adapter = GoogleGenAIModelAdapter(client=client, model_id=GOOGLE_MODEL) response = adapter.chat( [{"role": "user", "content": "What is the weather in Paris? 
You must use the get_weather tool."}], tools=[WEATHER_TOOL], @@ -278,11 +284,10 @@ def test_structured_output(self): from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter client = genai.Client() - adapter = GoogleGenAIModelAdapter(client=client, model_id="gemini-2.0-flash") + adapter = GoogleGenAIModelAdapter(client=client, model_id=GOOGLE_MODEL) response = adapter.chat( [{"role": "user", "content": "What is the capital of France?"}], response_model=Capital, - generation_params={"max_output_tokens": 50}, ) assert isinstance(response.structured_response, Capital) @@ -309,7 +314,7 @@ def test_text_response(self): pytest.importorskip("litellm") from maseval.interface.inference.litellm import LiteLLMModelAdapter - adapter = LiteLLMModelAdapter(model_id="gpt-4o-mini") + adapter = LiteLLMModelAdapter(model_id=LITELLM_MODEL) response = adapter.chat( [{"role": "user", "content": "Say 'test' and nothing else."}], generation_params={"max_tokens": 10}, @@ -331,7 +336,7 @@ def test_tool_call_response(self): pytest.importorskip("litellm") from maseval.interface.inference.litellm import LiteLLMModelAdapter - adapter = LiteLLMModelAdapter(model_id="gpt-4o-mini") + adapter = LiteLLMModelAdapter(model_id=LITELLM_MODEL) response = adapter.chat( [{"role": "user", "content": "What is the weather in Paris? 
You must use the get_weather tool."}], tools=[WEATHER_TOOL], @@ -356,7 +361,7 @@ def test_structured_output(self): pytest.importorskip("litellm") from maseval.interface.inference.litellm import LiteLLMModelAdapter - adapter = LiteLLMModelAdapter(model_id="gpt-4o-mini") + adapter = LiteLLMModelAdapter(model_id=LITELLM_MODEL) response = adapter.chat( [{"role": "user", "content": "What is the capital of France?"}], response_model=Capital, From 7f7517665245875c13f9493a52a9f9535666ae07 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sun, 22 Mar 2026 21:41:19 +0100 Subject: [PATCH 4/7] updated changelog --- CHANGELOG.md | 10 +- .../2026-03-22-add-instructor-library.md | 1087 ----------------- 2 files changed, 5 insertions(+), 1092 deletions(-) delete mode 100644 docs/superpowers/plans/2026-03-22-add-instructor-library.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bb4b82b..057f970e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,19 +11,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 **Core** -- Instructor library (`instructor>=1.14.0`) as core dependency for structured LLM output handling with automatic validation and retries. (PR: #PR_NUMBER_PLACEHOLDER) -- `response_model` parameter on `ModelAdapter.chat()` — pass a Pydantic `BaseModel` class to get validated structured outputs via `ChatResponse.structured_response`. Supported on OpenAI, Anthropic, Google GenAI, and LiteLLM adapters. (PR: #PR_NUMBER_PLACEHOLDER) -- `maseval.core.instructor` module with `create_instructor_client()` and `flatten_model_schema()` helpers for creating instructor-patched clients and generating provider-compatible JSON schemas. (PR: #PR_NUMBER_PLACEHOLDER) +- Instructor library (`instructor>=1.14.0`) as core dependency for structured LLM output handling with automatic validation and retries. 
(PR: #49) +- `response_model` parameter on `ModelAdapter.chat()` — pass a Pydantic `BaseModel` class to get validated structured outputs via `ChatResponse.structured_response`. Supported on OpenAI, Anthropic, Google GenAI, and LiteLLM adapters. (PR: #49) +- `maseval.core.instructor` module with `create_instructor_client()` and `flatten_model_schema()` helpers for creating instructor-patched clients and generating provider-compatible JSON schemas. (PR: #49) ### Changed **Core** -- Simulators (`ToolLLMSimulator`, `UserLLMSimulator`, `AgenticUserLLMSimulator`) now use instructor for structured output parsing with automatic validation and retries, replacing manual JSON extraction and retry loops. (PR: #PR_NUMBER_PLACEHOLDER) +- Simulators (`ToolLLMSimulator`, `UserLLMSimulator`, `AgenticUserLLMSimulator`) now use instructor for structured output parsing with automatic validation and retries, replacing manual JSON extraction and retry loops. (PR: #49) **Benchmarks** -- Tau2 benchmark uses `flatten_model_schema()` from `maseval.core.instructor` for tool parameter schema generation, replacing the manual `_flatten_schema()` function. (PR: #PR_NUMBER_PLACEHOLDER) +- Tau2 benchmark uses `flatten_model_schema()` from `maseval.core.instructor` for tool parameter schema generation, replacing the manual `_flatten_schema()` function. (PR: #49) - Usage and cost tracking via `Usage` and `TokenUsage` data classes. `ModelAdapter` tracks token usage automatically after each `chat()` call. Components that implement `UsageTrackableMixin` are collected via `gather_usage()`. Live totals available during benchmark runs via `benchmark.usage` (grand total) and `benchmark.usage_by_component` (per-component breakdowns). Post-hoc analysis via `UsageReporter.from_reports(benchmark.reports)` with breakdowns by task, component, or model. (PR: #45) - Pluggable cost calculation via `CostCalculator` protocol. `StaticPricingCalculator` computes cost from user-supplied per-token rates. 
`LiteLLMCostCalculator` in `maseval.interface.usage` for automatic pricing via LiteLLM's model database (supports `custom_pricing` overrides and `model_id_map`; requires `litellm`). Pass a `cost_calculator` to `ModelAdapter` or `AgentAdapter` to compute `Usage.cost`. Provider-reported cost always takes precedence. (PR: #45) diff --git a/docs/superpowers/plans/2026-03-22-add-instructor-library.md b/docs/superpowers/plans/2026-03-22-add-instructor-library.md deleted file mode 100644 index bdf490e8..00000000 --- a/docs/superpowers/plans/2026-03-22-add-instructor-library.md +++ /dev/null @@ -1,1087 +0,0 @@ -# Add Instructor Library Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add the [instructor](https://github.com/567-labs/instructor) library as core infrastructure for structured LLM output handling — both internally and as a user-facing API. This is not a patch on top of existing code; it is a clean replacement. The custom JSON extraction, schema flattening, and retry logic are removed, not wrapped with fallbacks. - -**Why:** Reliable structured output from unreliable models is critical for researchers who use cheap/small models due to cost constraints. Hand-rolled JSON parsing and retry logic is finicky, under-tested, and reimplemented in multiple places. Instructor provides a battle-tested foundation (3M+ monthly downloads) for validation, retries with error feedback to the model, and multi-provider support. By making it core infrastructure, every future structured output need builds on this foundation rather than reinventing it. - -**Design principles:** -1. **Clean replacement, not a compatibility layer.** Per AGENTS.md: "Clean, maintainable code is the priority — not backwards compatibility." 
Old code (`_extract_json_object`, `_flatten_schema`, manual retry loops) is deleted, not preserved as fallbacks. If instructor handles it, the old path is gone. -2. **Infrastructure for future work.** This is a seed ecosystem — the integration points are designed so that upcoming features (partial streaming, custom validators, fallback models) plug in naturally. -3. **Follow existing patterns.** The `ModelAdapter` / provider adapter pattern stays. Instructor slots into this architecture cleanly via `_structured_chat()` overrides in each provider adapter. - -**Architecture:** Instructor wraps provider clients via `instructor.from_provider()` to add `response_model` support with automatic validation and retries. All providers use a unified API: `client.chat.completions.create(response_model=..., messages=...)`. We integrate at the `ModelAdapter` level: each adapter creates an instructor-patched client alongside the raw client. The public `chat()` method gains an optional `response_model` parameter. Simulators switch fully to `response_model` — no legacy JSON parsing fallback. Tau2 schema flattening uses instructor's `openai_schema()` as a base. 
- -**Tech Stack:** Python, instructor (>=1.14.0), pydantic (>=2.10.6, already a core dep) - -**Key instructor API facts (verified against v1.14.4):** -- `instructor.from_provider("provider/model")` — unified client creation (no `from_gemini` or `from_anthropic`) -- `instructor.from_openai(client)` — OpenAI-specific wrapping -- `instructor.from_litellm(completion_fn)` — LiteLLM wrapping -- All wrapped clients use `client.chat.completions.create(response_model=..., messages=...)` uniformly -- `instructor.openai_schema(MyModel)` — generate clean OpenAI-compatible schemas (returns object with `.openai_schema` dict containing `name`, `description`, `parameters`) -- Note: `openai_schema()` still produces `anyOf` for `Optional` fields — we keep `_flatten_schema()` as a thin utility for providers (like Google GenAI) that reject `anyOf` - -**Project conventions (from AGENTS.md):** -- Use `uv add` for dependencies, `uv run` for commands, never `pip install` -- Union syntax: `A | B`, optionals: `Optional[X]`, collections: `List`, `Dict` -- Core (`maseval/core/`) must NOT import from interface (`maseval/interface/`) -- `just all` before committing (format + lint + typecheck + test) - ---- - -## File Structure - -| File | Responsibility | Action | -|------|---------------|--------| -| `pyproject.toml` | Dependencies | Modify: add `instructor>=1.14.0` to core deps via `uv add` | -| `maseval/core/model.py` | ModelAdapter base + ChatResponse | Modify: add `response_model` param to `chat()`, add `_structured_chat()`, add `structured_response` field to ChatResponse | -| `maseval/core/instructor.py` | Instructor integration helpers | Create: `create_instructor_client()` helper, `flatten_model_schema()` | -| `maseval/interface/inference/openai.py` | OpenAI adapter | Modify: create instructor client, override `_structured_chat()` | -| `maseval/interface/inference/anthropic.py` | Anthropic adapter | Modify: create instructor client, override `_structured_chat()` | -| 
`maseval/interface/inference/google_genai.py` | Google adapter | Modify: create instructor client, override `_structured_chat()` | -| `maseval/interface/inference/litellm.py` | LiteLLM adapter | Modify: create instructor client, override `_structured_chat()` | -| `maseval/core/simulator.py` | LLM simulators | Modify: add Pydantic response models, use `response_model` in simulators with legacy fallback | -| `maseval/benchmark/tau2/tau2.py` | Tau2 benchmark | Modify: replace `_flatten_schema()` usage in both `_build_tool_definitions()` (line 897) and `_get_tool_definitions()` (line 1231) with `flatten_model_schema()` | -| `tests/test_core/test_instructor_integration.py` | Instructor integration tests | Create: test `response_model` on ModelAdapter | -| `tests/test_core/test_llm_simulator.py` | Existing simulator tests | Modify: add response model tests, verify existing tests pass | -| `CHANGELOG.md` | Changelog | Modify: add entry under Unreleased | - ---- - -## Task 1: Add instructor dependency and create integration module - -**Files:** -- Modify: `pyproject.toml:24-29` -- Create: `maseval/core/instructor.py` -- Test: `tests/test_core/test_instructor_integration.py` - -- [ ] **Step 1: Write failing test for instructor import** - -```python -# tests/test_core/test_instructor_integration.py -"""Test instructor library integration.""" -import pytest - - -@pytest.mark.core -class TestInstructorAvailable: - """Verify instructor is importable as a core dependency.""" - - def test_instructor_importable(self): - """instructor should be importable since it's a core dep.""" - import instructor - assert hasattr(instructor, "from_openai") - - def test_instructor_helpers_importable(self): - """maseval.core.instructor helpers should be importable.""" - from maseval.core.instructor import create_instructor_client - assert callable(create_instructor_client) -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `uv run pytest tests/test_core/test_instructor_integration.py 
-v -x` -Expected: FAIL — instructor not installed, module not found - -- [ ] **Step 3: Add instructor to core dependencies** - -Run: `uv add instructor` - -This updates both `pyproject.toml` and `uv.lock` automatically. - -- [ ] **Step 4: Create maseval/core/instructor.py** - -```python -"""Instructor library integration for structured LLM outputs. - -Provides helpers to create instructor-patched clients from provider SDK clients -and to generate flattened JSON schemas from Pydantic models. - -Instructor adds ``response_model`` support with automatic validation and retries -to any supported LLM provider. - -Example: - ```python - from maseval.core.instructor import create_instructor_client - - # Wrap an OpenAI client - import openai - client = openai.OpenAI() - instructor_client = create_instructor_client(client, provider="openai") - ``` -""" - -from __future__ import annotations - -from typing import Any, Optional, Dict - - -def create_instructor_client( - client: Any, - provider: str, - mode: Optional[str] = None, -) -> Any: - """Create an instructor-patched client from a provider SDK client. - - All patched clients expose a unified API: - ``client.chat.completions.create(response_model=..., messages=...)``. - - Args: - client: The provider SDK client instance (e.g., ``openai.OpenAI()``, - ``anthropic.Anthropic()``). For LiteLLM, pass ``litellm.completion``. - provider: Provider name. One of: ``"openai"``, ``"litellm"``. - For other providers, use ``instructor.from_provider()`` directly. - mode: Optional instructor mode override. If None, uses the default - for the provider. - - Returns: - An instructor-patched client supporting ``response_model``. - - Raises: - ValueError: If provider is not recognized. 
- """ - import instructor - - kwargs: Dict[str, Any] = {} - if mode is not None: - kwargs["mode"] = getattr(instructor.Mode, mode.upper(), mode) - - if provider == "openai": - return instructor.from_openai(client, **kwargs) - elif provider == "litellm": - return instructor.from_litellm(client, **kwargs) - else: - raise ValueError( - f"Unsupported provider: {provider!r}. " - f"Use instructor.from_provider() directly for other providers." - ) - - -def flatten_model_schema(model: type) -> Dict[str, Any]: - """Generate a flattened JSON schema from a Pydantic model. - - Uses instructor's ``openai_schema`` to produce a clean schema, then - applies additional flattening to remove ``anyOf`` (for ``Optional`` - fields) and other constructs that some providers reject. - - This replaces the manual ``_flatten_schema()`` function that was - previously needed to post-process Pydantic v2 schemas. - - Args: - model: A Pydantic BaseModel subclass. - - Returns: - A flat JSON schema dict suitable for LLM tool parameters. - """ - import instructor - - schema_obj = instructor.openai_schema(model) - schema = schema_obj.openai_schema["parameters"] - - # instructor's openai_schema still produces anyOf for Optional fields. - # Flatten those for provider compatibility (especially Google GenAI). 
- return _resolve_schema(schema) - - -def _resolve_schema(node: Any) -> Any: - """Recursively resolve anyOf and strip unsupported keys from a schema.""" - if not isinstance(node, dict): - return node - - _STRIP_KEYS = {"$defs", "additionalProperties", "title", "default"} - - # Simplify anyOf (Optional[X] -> X with nullable) - if "anyOf" in node: - variants = node["anyOf"] - non_null = [v for v in variants if not (isinstance(v, dict) and v.get("type") == "null")] - if len(non_null) == 1: - resolved = _resolve_schema(non_null[0]) - if isinstance(resolved, dict): - resolved["nullable"] = True - if "description" in node and "description" not in resolved: - resolved["description"] = node["description"] - return resolved - if non_null: - return _resolve_schema(non_null[0]) - - out: Dict[str, Any] = {} - for key, value in node.items(): - if key in _STRIP_KEYS or key == "anyOf": - continue - if isinstance(value, dict): - out[key] = _resolve_schema(value) - elif isinstance(value, list): - out[key] = [_resolve_schema(v) if isinstance(v, dict) else v for v in value] - else: - out[key] = value - return out -``` - -- [ ] **Step 5: Run tests to verify they pass** - -Run: `uv run pytest tests/test_core/test_instructor_integration.py -v -x` -Expected: PASS - -- [ ] **Step 6: Run full existing test suite** - -Run: `uv run pytest tests/test_core/ -v --tb=short` -Expected: All existing tests PASS - -- [ ] **Step 7: Commit** - -```bash -git add pyproject.toml uv.lock maseval/core/instructor.py tests/test_core/test_instructor_integration.py -git commit -m "feat: add instructor as core dependency with integration helpers" -``` - ---- - -## Task 2: Add response_model support to ChatResponse and ModelAdapter base - -**Files:** -- Modify: `maseval/core/model.py:62-135` (ChatResponse) -- Modify: `maseval/core/model.py:207-342` (ModelAdapter.chat) -- Test: `tests/test_core/test_instructor_integration.py` - -- [ ] **Step 1: Write failing tests for response_model support** - -Append to 
`tests/test_core/test_instructor_integration.py`: - -```python -from pydantic import BaseModel -from conftest import DummyModelAdapter -from maseval.core.model import ChatResponse - - -class WeatherResponse(BaseModel): - city: str - temperature: float - unit: str - - -@pytest.mark.core -class TestChatResponseStructured: - """Test ChatResponse with structured_response field.""" - - def test_chat_response_has_structured_response_field(self): - """ChatResponse should have an optional structured_response field.""" - resp = ChatResponse(content='{"city": "Paris"}') - assert resp.structured_response is None - - def test_chat_response_with_structured_response(self): - """ChatResponse can hold a parsed Pydantic model.""" - weather = WeatherResponse(city="Paris", temperature=20.0, unit="celsius") - resp = ChatResponse(content='{"city": "Paris"}', structured_response=weather) - assert resp.structured_response is not None - assert resp.structured_response.city == "Paris" - - -@pytest.mark.core -class TestModelAdapterResponseModel: - """Test ModelAdapter.chat() with response_model parameter.""" - - def test_chat_accepts_response_model_param(self): - """chat() should accept a response_model keyword argument.""" - import inspect - from maseval.core.model import ModelAdapter - sig = inspect.signature(ModelAdapter.chat) - assert "response_model" in sig.parameters - - def test_chat_without_response_model_unchanged(self): - """chat() without response_model behaves exactly as before.""" - model = DummyModelAdapter(responses=["Hello"]) - result = model.chat([{"role": "user", "content": "Hi"}]) - assert isinstance(result, ChatResponse) - assert result.content == "Hello" - assert result.structured_response is None -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `uv run pytest tests/test_core/test_instructor_integration.py::TestChatResponseStructured -v -x` -Expected: FAIL — `structured_response` field doesn't exist - -- [ ] **Step 3: Add structured_response field to 
ChatResponse** - -In `maseval/core/model.py`, add to the `ChatResponse` dataclass (after `stop_reason` on line 105): - -```python - structured_response: Optional[Any] = None -``` - -Update the docstring to include: - -``` - structured_response: The validated Pydantic model instance when - ``response_model`` was used with ``chat()``. None otherwise. -``` - -- [ ] **Step 4: Add response_model and max_retries parameters to ModelAdapter.chat()** - -In `maseval/core/model.py`, modify the `chat()` method signature (lines 207-214) to: - -```python - def chat( - self, - messages: Union[List[Dict[str, Any]], MessageHistory], - generation_params: Optional[Dict[str, Any]] = None, - tools: Optional[List[Dict[str, Any]]] = None, - tool_choice: Optional[Union[str, Dict[str, Any]]] = None, - response_model: Optional[type] = None, - max_retries: int = 3, - **kwargs: Any, - ) -> ChatResponse: -``` - -Update the docstring Args section to include: - -``` - response_model: Optional Pydantic BaseModel class. When provided, - the model's response is validated against this schema and - returned in ``ChatResponse.structured_response``. Uses - instructor for automatic validation and retries. - max_retries: Number of retries on validation failure when using - ``response_model``. Default is 3. Ignored without ``response_model``. 
-``` - -In the `try` block (around line 284), add branching for response_model: - -```python - try: - if response_model is not None: - result = self._structured_chat( - messages_list, - response_model=response_model, - max_retries=max_retries, - generation_params=generation_params, - tools=tools, - tool_choice=tool_choice, - **kwargs, - ) - else: - result = self._chat_impl( - messages_list, - generation_params=generation_params, - tools=tools, - tool_choice=tool_choice, - **kwargs, - ) -``` - -- [ ] **Step 5: Add _structured_chat() method to ModelAdapter** - -Add after `_chat_impl` (around line 367): - -```python - def _structured_chat( - self, - messages: List[Dict[str, Any]], - response_model: type, - max_retries: int = 3, - generation_params: Optional[Dict[str, Any]] = None, - tools: Optional[List[Dict[str, Any]]] = None, - tool_choice: Optional[Union[str, Dict[str, Any]]] = None, - **kwargs: Any, - ) -> ChatResponse: - """Internal structured chat using instructor. - - Subclasses that support instructor should override this method. - The default implementation falls back to ``_chat_impl`` and attempts - manual JSON parsing of the response content. - - Args: - messages: List of message dicts. - response_model: Pydantic model class for response validation. - max_retries: Number of retries on validation failure. - generation_params: Generation parameters. - tools: Tool definitions, if any. - tool_choice: Tool choice setting, if any. - **kwargs: Additional arguments. - - Returns: - ChatResponse with ``structured_response`` populated. - """ - # Base class raises — subclasses must override with their - # instructor-patched client. No silent fallback to unstructured output. - raise NotImplementedError( - f"{type(self).__name__} does not support response_model. " - f"Override _structured_chat() with an instructor-patched client." 
- ) -``` - -- [ ] **Step 6: Run tests to verify they pass** - -Run: `uv run pytest tests/test_core/test_instructor_integration.py -v -x` -Expected: PASS - -- [ ] **Step 7: Run full existing test suite** - -Run: `uv run pytest tests/test_core/ -v --tb=short` -Expected: All existing tests still PASS (the new `structured_response` field defaults to None) - -- [ ] **Step 8: Commit** - -```bash -git add maseval/core/model.py tests/test_core/test_instructor_integration.py -git commit -m "feat: add response_model support to ModelAdapter.chat() and ChatResponse" -``` - ---- - -## Task 3: Implement instructor support in provider adapters - -**Files:** -- Modify: `maseval/interface/inference/openai.py` -- Modify: `maseval/interface/inference/anthropic.py` -- Modify: `maseval/interface/inference/google_genai.py` -- Modify: `maseval/interface/inference/litellm.py` -- Test: `tests/test_core/test_instructor_integration.py` - -**Important:** All provider adapters use instructor's unified API after wrapping. `instructor.from_provider("provider/model")` returns an `Instructor` instance where all calls go through `client.chat.completions.create(response_model=..., messages=...)` regardless of the underlying provider. For OpenAI, we use `instructor.from_openai(client)`. For LiteLLM, `instructor.from_litellm(litellm.completion)`. For Anthropic and Google, we use `instructor.from_provider()` since there are no dedicated `from_anthropic`/`from_gemini` functions in current instructor. 
- -- [ ] **Step 1: Write failing tests for provider adapter instructor support** - -Append to `tests/test_core/test_instructor_integration.py`: - -```python -from unittest.mock import MagicMock - - -@pytest.mark.core -class TestOpenAIInstructorSupport: - """Test OpenAI adapter creates instructor client.""" - - def test_openai_adapter_has_instructor_client(self): - """OpenAIModelAdapter should create an instructor-patched client.""" - from maseval.interface.inference import OpenAIModelAdapter - mock_client = MagicMock() - adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") - assert hasattr(adapter, "_instructor_client") - - def test_openai_adapter_structured_chat_uses_instructor(self): - """OpenAIModelAdapter._structured_chat should use instructor client.""" - from maseval.interface.inference import OpenAIModelAdapter - mock_client = MagicMock() - adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") - - # Mock the instructor client - mock_response = WeatherResponse(city="Paris", temperature=20.0, unit="celsius") - adapter._instructor_client = MagicMock() - adapter._instructor_client.chat.completions.create.return_value = mock_response - - result = adapter.chat( - [{"role": "user", "content": "Weather in Paris?"}], - response_model=WeatherResponse, - ) - - assert result.structured_response is not None - assert result.structured_response.city == "Paris" - adapter._instructor_client.chat.completions.create.assert_called_once() -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `uv run pytest tests/test_core/test_instructor_integration.py::TestOpenAIInstructorSupport -v -x` -Expected: FAIL — no `_instructor_client` attribute - -- [ ] **Step 3: Add instructor support to OpenAIModelAdapter** - -In `maseval/interface/inference/openai.py`: - -Add to `__init__` (after existing setup, around line 92): -```python - # Create instructor-patched client for structured outputs - from maseval.core.instructor import create_instructor_client - 
self._instructor_client = create_instructor_client(client, provider="openai") -``` - -Add `_structured_chat` override (after `_chat_impl`): -```python - def _structured_chat( - self, - messages: List[Dict[str, Any]], - response_model: type, - max_retries: int = 3, - generation_params: Optional[Dict[str, Any]] = None, - tools: Optional[List[Dict[str, Any]]] = None, - tool_choice: Optional[Union[str, Dict[str, Any]]] = None, - **kwargs: Any, - ) -> ChatResponse: - """Use instructor for structured output with validation and retries.""" - params = dict(self._default_generation_params) - if generation_params: - params.update(generation_params) - params.update(kwargs) - - if self._seed is not None and "seed" not in params: - params["seed"] = self._seed - - result = self._instructor_client.chat.completions.create( - model=self._model_id, - messages=messages, - response_model=response_model, - max_retries=max_retries, - **params, - ) - - # result is a validated Pydantic model instance - return ChatResponse( - content=result.model_dump_json(), - structured_response=result, - role="assistant", - model=self._model_id, - ) -``` - -- [ ] **Step 4: Add instructor support to AnthropicModelAdapter** - -In `maseval/interface/inference/anthropic.py`: - -Add to `__init__` (after existing setup, around line 109): -```python - # Create instructor-patched client for structured outputs - import instructor - self._instructor_client = instructor.from_provider("anthropic/" + model_id) -``` - -Note: We use `from_provider` since there's no `from_anthropic` in current instructor. 
- -Add `_structured_chat` override: -```python - def _structured_chat( - self, - messages: List[Dict[str, Any]], - response_model: type, - max_retries: int = 3, - generation_params: Optional[Dict[str, Any]] = None, - tools: Optional[List[Dict[str, Any]]] = None, - tool_choice: Optional[Union[str, Dict[str, Any]]] = None, - **kwargs: Any, - ) -> ChatResponse: - """Use instructor for structured output with validation and retries.""" - params = dict(self._default_generation_params) - if generation_params: - params.update(generation_params) - params.update(kwargs) - - max_tokens = params.pop("max_tokens", self._max_tokens) - params["max_tokens"] = max_tokens - - result = self._instructor_client.chat.completions.create( - response_model=response_model, - messages=messages, - max_retries=max_retries, - **params, - ) - - return ChatResponse( - content=result.model_dump_json(), - structured_response=result, - role="assistant", - model=self._model_id, - ) -``` - -- [ ] **Step 5: Add instructor support to GoogleGenAIModelAdapter** - -In `maseval/interface/inference/google_genai.py`: - -Add to `__init__` (after existing setup, around line 85): -```python - # Create instructor-patched client for structured outputs - import instructor - self._instructor_client = instructor.from_provider("gemini/" + model_id) -``` - -Add `_structured_chat` override: -```python - def _structured_chat( - self, - messages: List[Dict[str, Any]], - response_model: type, - max_retries: int = 3, - generation_params: Optional[Dict[str, Any]] = None, - tools: Optional[List[Dict[str, Any]]] = None, - tool_choice: Optional[Union[str, Dict[str, Any]]] = None, - **kwargs: Any, - ) -> ChatResponse: - """Use instructor for structured output with validation and retries.""" - params = dict(self._default_generation_params) - if generation_params: - params.update(generation_params) - params.update(kwargs) - - if self._seed is not None and "seed" not in params: - params["seed"] = self._seed - - result = 
self._instructor_client.chat.completions.create( - response_model=response_model, - messages=messages, - max_retries=max_retries, - **params, - ) - - return ChatResponse( - content=result.model_dump_json(), - structured_response=result, - role="assistant", - model=self._model_id, - ) -``` - -- [ ] **Step 6: Add instructor support to LiteLLMModelAdapter** - -In `maseval/interface/inference/litellm.py`: - -Add to `__init__` (after existing setup, around line 101): -```python - # Create instructor-patched completion function for structured outputs. - # Deferred to first use since litellm is an optional import. - self._instructor_client = None -``` - -Add helper + `_structured_chat` override: -```python - def _get_instructor_client(self) -> Any: - """Lazily create instructor-patched LiteLLM client.""" - if self._instructor_client is None: - try: - import litellm - except ImportError as e: - raise ImportError("LiteLLM is not installed. Install with: pip install maseval[litellm]") from e - from maseval.core.instructor import create_instructor_client - self._instructor_client = create_instructor_client(litellm.completion, provider="litellm") - return self._instructor_client - - def _structured_chat( - self, - messages: List[Dict[str, Any]], - response_model: type, - max_retries: int = 3, - generation_params: Optional[Dict[str, Any]] = None, - tools: Optional[List[Dict[str, Any]]] = None, - tool_choice: Optional[Union[str, Dict[str, Any]]] = None, - **kwargs: Any, - ) -> ChatResponse: - """Use instructor for structured output with validation and retries.""" - client = self._get_instructor_client() - - params = dict(self._default_generation_params) - if generation_params: - params.update(generation_params) - params.update(kwargs) - - if self._seed is not None and "seed" not in params: - params["seed"] = self._seed - if self._api_key: - params["api_key"] = self._api_key - if self._api_base: - params["api_base"] = self._api_base - - result = client( - model=self._model_id, - 
messages=messages, - response_model=response_model, - max_retries=max_retries, - **params, - ) - - return ChatResponse( - content=result.model_dump_json(), - structured_response=result, - role="assistant", - model=self._model_id, - ) -``` - -- [ ] **Step 7: Run tests** - -Run: `uv run pytest tests/test_core/test_instructor_integration.py -v -x` -Expected: PASS - -- [ ] **Step 8: Run full test suite** - -Run: `uv run pytest tests/test_core/ -v --tb=short` -Expected: All PASS - -- [ ] **Step 9: Commit** - -```bash -git add maseval/interface/inference/openai.py maseval/interface/inference/anthropic.py maseval/interface/inference/google_genai.py maseval/interface/inference/litellm.py tests/test_core/test_instructor_integration.py -git commit -m "feat: add instructor support to all provider adapters" -``` - ---- - -## Task 4: Rework simulators to use instructor - -**Files:** -- Modify: `maseval/core/simulator.py` -- Test: `tests/test_core/test_llm_simulator.py` - -**Context:** Simulators currently use `model.generate()` (text-in/text-out) and manually parse JSON using `_extract_json_object()` + `json.loads()`. With instructor, we switch fully to `model.chat(messages=[...], response_model=OutputModel)` to get validated Pydantic models directly. The old `_extract_json_object()`, `_parse_output()`, and manual retry logic are deleted — instructor handles validation and retries. 
- -- [ ] **Step 1: Write failing test for Pydantic response models** - -Append to `tests/test_core/test_llm_simulator.py`: - -```python -@pytest.mark.core -class TestSimulatorResponseModels: - """Test that simulator response Pydantic models work correctly.""" - - def test_tool_simulator_response_model_exists(self): - from maseval.core.simulator import ToolSimulatorResponse - resp = ToolSimulatorResponse(text="success", details={"key": "value"}) - assert resp.text == "success" - assert resp.details == {"key": "value"} - - def test_user_simulator_response_model_exists(self): - from maseval.core.simulator import UserSimulatorResponse - resp = UserSimulatorResponse(text="I need help") - assert resp.text == "I need help" - - def test_agentic_user_simulator_response_model_exists(self): - from maseval.core.simulator import AgenticUserSimulatorResponse - resp = AgenticUserSimulatorResponse( - text="Let me check", - tool_calls=[{"name": "check_status", "arguments": {}}], - ) - assert resp.text == "Let me check" - assert len(resp.tool_calls) == 1 -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `uv run pytest tests/test_core/test_llm_simulator.py::TestSimulatorResponseModels -v -x` -Expected: FAIL — models don't exist yet - -- [ ] **Step 3: Add Pydantic response models to simulator.py** - -In `maseval/core/simulator.py`, after imports (before `_extract_json_object`), add: - -```python -from pydantic import BaseModel, Field - - -class ToolSimulatorResponse(BaseModel): - """Expected output format for ToolLLMSimulator.""" - text: str = Field(default="", description="Human-readable description of the tool's output") - details: Dict[str, Any] = Field(default_factory=dict, description="Structured tool output data") - - -class UserSimulatorResponse(BaseModel): - """Expected output format for UserLLMSimulator.""" - text: str = Field(default="", description="The user's response text") - - -class AgenticUserSimulatorResponse(BaseModel): - """Expected output format for 
AgenticUserLLMSimulator.""" - text: str = Field(default="", description="The user's response text") - tool_calls: List[Dict[str, Any]] = Field(default_factory=list, description="List of tool calls") -``` - -- [ ] **Step 4: Add _response_model and _parse_structured_response to LLMSimulator base** - -In `LLMSimulator` class, add class attribute: -```python - _response_model: Optional[type] = None -``` - -Add method: -```python - def _parse_structured_response(self, response: Any) -> Any: - """Convert instructor-validated response to expected return format. - - Override in subclasses to convert the Pydantic model instance - to the format expected by callers. - """ - return response -``` - -- [ ] **Step 5: Rewrite LLMSimulator.__call__ to use instructor directly** - -Replace the inner loop body. No legacy fallback — instructor handles validation and retries via `response_model`. Delete `_extract_json_object()` and `_parse_output()` methods entirely. - -```python - try: - chat_result = self.model.chat( - messages=[{"role": "user", "content": prompt}], - response_model=self._response_model, - max_retries=self.max_try, - generation_params=generation_params, - ) - parsed_result = self._parse_structured_response(chat_result.structured_response) - entry["raw_output"] = chat_result.content - entry["parsed_output"] = parsed_result - entry["status"] = SimulatorCallStatus.Successful.value - except Exception as e: - entry["raw_output"] = None - entry["status"] = SimulatorCallStatus.ModelCallError.value - entry["error"] = str(e) - self.logs.append(entry) -``` - -- [ ] **Step 6: Delete legacy parsing code** - -Remove from `simulator.py`: -- `_extract_json_object()` function (lines 13-27) -- `_parse_output()` methods from all simulator subclasses -- Manual JSON retry logic in `__call__` (the old `json.loads` / `json.JSONDecodeError` paths) - -- [ ] **Step 6: Wire up response models in simulator subclasses** - -In `ToolLLMSimulator`: -```python - _response_model = 
ToolSimulatorResponse - - def _parse_structured_response(self, response: ToolSimulatorResponse) -> tuple[str, Dict[str, Any]]: - return response.text, response.details -``` - -In `UserLLMSimulator`: -```python - _response_model = UserSimulatorResponse - - def _parse_structured_response(self, response: UserSimulatorResponse) -> str: - return response.text -``` - -In `AgenticUserLLMSimulator`: -```python - _response_model = AgenticUserSimulatorResponse - - def _parse_structured_response(self, response: AgenticUserSimulatorResponse) -> Tuple[str, List[Dict[str, Any]]]: - return response.text, response.tool_calls -``` - -- [ ] **Step 7: Run all simulator tests** - -Run: `uv run pytest tests/test_core/test_llm_simulator.py -v --tb=short` -Expected: All existing tests still PASS (DummyModelAdapter returns text via `generate()`, so the fallback path is exercised) - -- [ ] **Step 8: Run full core test suite** - -Run: `uv run pytest tests/test_core/ -v --tb=short` -Expected: All PASS - -- [ ] **Step 9: Commit** - -```bash -git add maseval/core/simulator.py tests/test_core/test_llm_simulator.py -git commit -m "feat: add instructor-based structured output to simulators with legacy fallback" -``` - ---- - -## Task 5: Replace _flatten_schema in Tau2 - -**Files:** -- Modify: `maseval/benchmark/tau2/tau2.py:781-902` and line 1231 -- Test: `tests/test_core/test_instructor_integration.py` - -**Context:** `_flatten_schema()` is called in two places: -1. `_build_tool_definitions()` at line 897 -2. `_get_tool_definitions()` at line 1231 - -Both must be updated to use `flatten_model_schema()` from `maseval.core.instructor`. 
- -- [ ] **Step 1: Write failing test for instructor-based schema generation** - -Append to `tests/test_core/test_instructor_integration.py`: - -```python -@pytest.mark.core -class TestInstructorSchemaGeneration: - """Test that flatten_model_schema produces clean schemas.""" - - def test_generates_clean_schema(self): - from pydantic import BaseModel, Field - from typing import Optional - from maseval.core.instructor import flatten_model_schema - - class OrderParams(BaseModel): - order_id: str = Field(description="The order ID") - status: Optional[str] = Field(default=None, description="Filter by status") - - flat = flatten_model_schema(OrderParams) - assert "$ref" not in str(flat) - assert "$defs" not in str(flat) - assert "anyOf" not in str(flat) - assert flat["type"] == "object" - assert "order_id" in flat["properties"] - - def test_handles_nested_models(self): - from pydantic import BaseModel, Field - from maseval.core.instructor import flatten_model_schema - - class Address(BaseModel): - street: str - city: str - - class Person(BaseModel): - name: str - address: Address - - flat = flatten_model_schema(Person) - assert "$ref" not in str(flat) - assert "address" in flat["properties"] -``` - -- [ ] **Step 2: Run test to verify it passes (flatten_model_schema was created in Task 1)** - -Run: `uv run pytest tests/test_core/test_instructor_integration.py::TestInstructorSchemaGeneration -v -x` -Expected: PASS (function already exists from Task 1) - -- [ ] **Step 3: Replace _flatten_schema calls in tau2.py** - -In `maseval/benchmark/tau2/tau2.py`: - -1. Add import at the top of the `_build_tool_definitions` function (replace the existing `from typing import Any as TypingAny` line area): -```python - from maseval.core.instructor import flatten_model_schema -``` - -2. Replace line 897: -```python -# Before: -"parameters": _flatten_schema(params_model.model_json_schema()), -# After: -"parameters": flatten_model_schema(params_model), -``` - -3. 
Replace line 1231 (in `_get_tool_definitions()`): -```python -# Before: -"parameters": _flatten_schema(params_model.model_json_schema()), -# After: -from maseval.core.instructor import flatten_model_schema -... -"parameters": flatten_model_schema(params_model), -``` - -(Add the import once at the top of `_get_tool_definitions`, not inline at line 1231.) - -- [ ] **Step 4: Delete _flatten_schema() function** - -Remove the `_flatten_schema()` function (lines 781-834) from `tau2.py`. - -- [ ] **Step 5: Run tests** - -Run: `uv run pytest tests/test_core/ -v --tb=short` -Expected: PASS - -Run: `uv run pytest tests/ -v --tb=short -m "not (slow or credentialed or smoke)"` -Expected: All PASS - -- [ ] **Step 6: Commit** - -```bash -git add maseval/benchmark/tau2/tau2.py tests/test_core/test_instructor_integration.py -git commit -m "feat: replace _flatten_schema with instructor-based schema generation in Tau2" -``` - ---- - -## Task 6: Update exports and changelog - -**Files:** -- Modify: `maseval/core/__init__.py` (if it has explicit exports) -- Modify: `CHANGELOG.md` - -- [ ] **Step 1: Check and update exports** - -Check what's currently exported from `maseval/core/__init__.py` and `maseval/__init__.py`. If they have explicit `__all__` or import statements, add: - -```python -from .instructor import create_instructor_client, flatten_model_schema -``` - -Also export the simulator response models if they're useful to users: -```python -from .simulator import ToolSimulatorResponse, UserSimulatorResponse, AgenticUserSimulatorResponse -``` - -- [ ] **Step 2: Update CHANGELOG.md** - -Add under `## Unreleased`: - -```markdown -### Added - -- Added `instructor` as a core dependency for structured LLM output handling with automatic validation and retries. -- Added `response_model` parameter to `ModelAdapter.chat()` — pass a Pydantic `BaseModel` class to get validated structured outputs via `ChatResponse.structured_response`. 
-- Added `structured_response` field to `ChatResponse` for accessing parsed Pydantic model instances. -- Added `maseval.core.instructor` module with `create_instructor_client()` and `flatten_model_schema()` helpers. -- Added Pydantic response models for simulators: `ToolSimulatorResponse`, `UserSimulatorResponse`, `AgenticUserSimulatorResponse`. -- Simulators now use instructor for structured output parsing with automatic fallback to legacy JSON extraction. - -### Changed - -- Replaced manual `_flatten_schema()` in Tau2 benchmark with instructor-based `flatten_model_schema()`. -``` - -- [ ] **Step 3: Commit** - -```bash -git add maseval/core/__init__.py maseval/__init__.py CHANGELOG.md -git commit -m "chore: update exports and changelog for instructor integration" -``` - ---- - -## Task 7: Final validation - -- [ ] **Step 1: Run linter and formatter** - -Run: `uv run ruff format . && uv run ruff check . --fix` - -- [ ] **Step 2: Run type checker** - -Run: `uv run ty check` - -- [ ] **Step 3: Run full test suite** - -Run: `uv run pytest tests/ -v --tb=short -m "not (slow or credentialed or smoke)"` -Expected: All PASS - -- [ ] **Step 4: Verify end-to-end import** - -Run: -```bash -uv run python3 -c " -from maseval.core.instructor import create_instructor_client, flatten_model_schema -from maseval.core.model import ChatResponse, ModelAdapter -from maseval.core.simulator import ToolSimulatorResponse, UserSimulatorResponse, AgenticUserSimulatorResponse -print('All imports successful') - -from pydantic import BaseModel, Field -class TestModel(BaseModel): - name: str = Field(description='A name') - age: int = Field(description='An age') - -schema = flatten_model_schema(TestModel) -print(f'Schema: {schema}') -assert 'anyOf' not in str(schema) -print('Schema generation works correctly') -" -``` - -- [ ] **Step 5: Run just all (format + lint + typecheck + test)** - -Run: `just all` - -- [ ] **Step 6: Review git log** - -Run: `git log --oneline main..HEAD` -Expected: Clean 
series of feature commits - -- [ ] **Step 7: Final cleanup commit if needed** - -```bash -git status -# Only commit if there are changes -git diff --cached --quiet || git commit -m "chore: final cleanup for instructor integration" -``` From 3a3dad6e75a3d61ad597393ecba068f214836da3 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sun, 22 Mar 2026 21:55:05 +0100 Subject: [PATCH 5/7] fixed issues --- maseval/core/instructor.py | 2 +- maseval/interface/inference/google_genai.py | 6 +++--- tests/conftest.py | 2 +- tests/test_core/test_agentic_user.py | 2 +- tests/test_core/test_usage.py | 10 +++++----- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/maseval/core/instructor.py b/maseval/core/instructor.py index 9e284fec..31789464 100644 --- a/maseval/core/instructor.py +++ b/maseval/core/instructor.py @@ -75,7 +75,7 @@ def flatten_model_schema(model: type) -> Dict[str, Any]: """ import instructor - schema_obj = instructor.openai_schema(model) + schema_obj = instructor.openai_schema(model) # ty: ignore[invalid-argument-type] schema = schema_obj.openai_schema["parameters"] # instructor's openai_schema still produces anyOf for Optional fields. 
diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index 0b4b1a70..7d1024f0 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -345,14 +345,14 @@ def _structured_chat( result = self._instructor_client.chat.completions.create( model=self._model_id, - response_model=response_model, - messages=messages, + response_model=response_model, # ty: ignore[invalid-argument-type] + messages=messages, # ty: ignore[invalid-argument-type] max_retries=max_retries, **params, ) return ChatResponse( - content=result.model_dump_json(), + content=result.model_dump_json(), # ty: ignore[possibly-missing-attribute] structured_response=result, role="assistant", model=self._model_id, diff --git a/tests/conftest.py b/tests/conftest.py index 85b93391..12c6a25a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -147,7 +147,7 @@ def _structured_chat( result = self._chat_impl(messages, generation_params, tools, tool_choice, **kwargs) if result.content and response_model is not None: try: - structured = response_model.model_validate_json(result.content) + structured = response_model.model_validate_json(result.content) # ty: ignore[unresolved-attribute] return ChatResponse( content=result.content, tool_calls=result.tool_calls, diff --git a/tests/test_core/test_agentic_user.py b/tests/test_core/test_agentic_user.py index 4de8b933..45c5730a 100644 --- a/tests/test_core/test_agentic_user.py +++ b/tests/test_core/test_agentic_user.py @@ -51,7 +51,7 @@ def _structured_chat( result = self._chat_impl(messages, generation_params, tools, tool_choice, **kwargs) if result.content and response_model is not None: try: - structured = response_model.model_validate_json(result.content) + structured = response_model.model_validate_json(result.content) # ty: ignore[unresolved-attribute] return ChatResponse( content=result.content, structured_response=structured, diff --git 
a/tests/test_core/test_usage.py b/tests/test_core/test_usage.py index 383aa0f1..940940e1 100644 --- a/tests/test_core/test_usage.py +++ b/tests/test_core/test_usage.py @@ -465,7 +465,7 @@ def test_basic_pipeline(self): 100 input * $0.01 + 50 output * $0.02 = $2.00 """ - from tests.conftest import DummyModelAdapter + from conftest import DummyModelAdapter calc = StaticPricingCalculator( { @@ -493,7 +493,7 @@ def test_pipeline_multiple_calls_accumulate(self): Call 2: 100 input * $0.01 + 50 output * $0.02 = $2.00 Total = $4.00, 200 input, 100 output """ - from tests.conftest import DummyModelAdapter + from conftest import DummyModelAdapter calc = StaticPricingCalculator( { @@ -522,7 +522,7 @@ def test_pipeline_provider_cost_takes_precedence(self): Calculator would compute $2.00. Provider cost should win. """ - from tests.conftest import DummyModelAdapter + from conftest import DummyModelAdapter calc = StaticPricingCalculator( { @@ -542,7 +542,7 @@ def test_pipeline_provider_cost_takes_precedence(self): def test_pipeline_no_calculator_no_provider_cost(self): """Without calculator or provider cost, cost is None.""" - from tests.conftest import DummyModelAdapter + from conftest import DummyModelAdapter adapter = DummyModelAdapter( model_id="test-model", @@ -565,7 +565,7 @@ def test_pipeline_with_cached_tokens(self): Output: 100 * $0.015 = $1.50 Total = $2.34 """ - from tests.conftest import DummyModelAdapter + from conftest import DummyModelAdapter calc = StaticPricingCalculator( { From 3ca8ad319b8d8938d6302f45955916d379621ea2 Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sun, 22 Mar 2026 22:37:22 +0100 Subject: [PATCH 6/7] improved testing --- tests/test_core/test_history.py | 139 ++++ tests/test_core/test_instructor.py | 11 + .../test_model_adapters.py | 730 +++++++++++++++++- 3 files changed, 877 insertions(+), 3 deletions(-) create mode 100644 tests/test_core/test_history.py diff --git a/tests/test_core/test_history.py 
b/tests/test_core/test_history.py new file mode 100644 index 00000000..2bdf69dd --- /dev/null +++ b/tests/test_core/test_history.py @@ -0,0 +1,139 @@ +"""Tests for MessageHistory and ToolInvocationHistory.""" + +import pytest + +from maseval.core.history import MessageHistory + + +@pytest.mark.core +class TestMessageHistory: + """Tests for MessageHistory conversation container.""" + + def test_add_tool_call_with_metadata(self): + """Tool call messages store optional metadata.""" + history = MessageHistory() + tc = [{"id": "call_1", "type": "function", "function": {"name": "f", "arguments": "{}"}}] + history.add_tool_call(tc, content="Calling f", metadata={"key": "val"}) + + msg = history[0] + assert msg["role"] == "assistant" + assert msg["tool_calls"] == tc + assert msg["content"] == "Calling f" + assert msg["metadata"] == {"key": "val"} + assert "timestamp" in msg + + def test_add_tool_call_without_content(self): + """Tool call messages work without optional text content.""" + history = MessageHistory() + tc = [{"id": "call_1", "type": "function", "function": {"name": "f", "arguments": "{}"}}] + history.add_tool_call(tc) + + msg = history[0] + assert "content" not in msg + + def test_add_tool_response_with_name_and_metadata(self): + """Tool response messages store name and metadata.""" + history = MessageHistory() + history.add_tool_response( + tool_call_id="call_1", + content="result", + name="my_tool", + metadata={"took_ms": 42}, + ) + + msg = history[0] + assert msg["role"] == "tool" + assert msg["tool_call_id"] == "call_1" + assert msg["content"] == "result" + assert msg["name"] == "my_tool" + assert msg["metadata"] == {"took_ms": 42} + assert "timestamp" in msg + + def test_add_tool_response_minimal(self): + """Tool response works with only required fields.""" + history = MessageHistory() + history.add_tool_response(tool_call_id="call_1", content="ok") + + msg = history[0] + assert msg["role"] == "tool" + assert "name" not in msg + assert "metadata" not in 
msg + + def test_clear(self): + """Clear removes all messages.""" + history = MessageHistory() + history.add_message("user", "hello") + history.add_message("assistant", "hi") + assert len(history) == 2 + + history.clear() + assert len(history) == 0 + assert not history # bool check + + def test_filter_by_role(self): + """filter_by_role returns only messages with matching role.""" + history = MessageHistory() + history.add_message("user", "q1") + history.add_message("assistant", "a1") + history.add_message("user", "q2") + history.add_message("system", "sys") + + user_msgs = history.filter_by_role("user") + assert len(user_msgs) == 2 + assert all(m["role"] == "user" for m in user_msgs) + + system_msgs = history.filter_by_role("system") + assert len(system_msgs) == 1 + + def test_get_last_message(self): + """get_last_message returns last or None if empty.""" + history = MessageHistory() + assert history.get_last_message() is None + + history.add_message("user", "first") + history.add_message("assistant", "second") + last = history.get_last_message() + assert last is not None + assert last["content"] == "second" + + def test_to_openai_format_strips_metadata_and_timestamps(self): + """to_openai_format returns only OpenAI-compatible fields.""" + history = MessageHistory() + history.add_message("user", "hello", metadata={"key": "val"}) + history.add_message("assistant", "hi") + + tc = [{"id": "call_1", "type": "function", "function": {"name": "f", "arguments": "{}"}}] + history.add_tool_call(tc, content="calling") + history.add_tool_response(tool_call_id="call_1", content="done", name="f") + + openai_msgs = history.to_openai_format() + + # All should have role + assert all("role" in m for m in openai_msgs) + # None should have metadata or timestamp + assert all("metadata" not in m for m in openai_msgs) + assert all("timestamp" not in m for m in openai_msgs) + + # Check tool call message preserves tool_calls + tc_msg = openai_msgs[2] + assert "tool_calls" in tc_msg + + # 
Check tool response preserves tool_call_id and name + tr_msg = openai_msgs[3] + assert tr_msg["tool_call_id"] == "call_1" + assert tr_msg["name"] == "f" + + def test_explicit_timestamp_preserved(self): + """Explicit timestamps are used instead of auto-generated ones.""" + history = MessageHistory() + history.add_message("user", "hello", timestamp="2024-01-01T00:00:00") + assert history[0]["timestamp"] == "2024-01-01T00:00:00" + + history.add_tool_call( + [{"id": "c1", "type": "function", "function": {"name": "f", "arguments": "{}"}}], + timestamp="2024-01-01T00:00:01", + ) + assert history[1]["timestamp"] == "2024-01-01T00:00:01" + + history.add_tool_response(tool_call_id="c1", content="ok", timestamp="2024-01-01T00:00:02") + assert history[2]["timestamp"] == "2024-01-01T00:00:02" diff --git a/tests/test_core/test_instructor.py b/tests/test_core/test_instructor.py index 9bf16cf5..8c478199 100644 --- a/tests/test_core/test_instructor.py +++ b/tests/test_core/test_instructor.py @@ -177,3 +177,14 @@ def test_litellm_provider_returns_patched_client(self): assert hasattr(patched, "chat") assert hasattr(patched.chat, "completions") assert callable(patched.chat.completions.create) + + def test_mode_override(self): + """Mode parameter is passed through to instructor.""" + from openai import OpenAI + + client = OpenAI(api_key="test-key-not-real") + # JSON mode is valid for OpenAI + patched = create_instructor_client(client, provider="openai", mode="json") + + assert hasattr(patched, "chat") + assert callable(patched.chat.completions.create) diff --git a/tests/test_interface/test_model_integration/test_model_adapters.py b/tests/test_interface/test_model_integration/test_model_adapters.py index 50f418ca..3956afb5 100644 --- a/tests/test_interface/test_model_integration/test_model_adapters.py +++ b/tests/test_interface/test_model_integration/test_model_adapters.py @@ -10,6 +10,8 @@ and provides adapter-specific configuration. 
""" +from unittest.mock import MagicMock, patch + import pytest @@ -1479,7 +1481,7 @@ def test_openai_user_seed_overrides_adapter_seed(self): def test_litellm_adapter_passes_seed_to_api(self): """LiteLLM adapter includes seed in API call.""" pytest.importorskip("litellm") - from unittest.mock import patch, MagicMock + from unittest.mock import MagicMock from maseval.interface.inference import LiteLLMModelAdapter mock_response = MagicMock() @@ -1504,7 +1506,7 @@ def test_litellm_adapter_passes_seed_to_api(self): def test_litellm_adapter_no_seed_when_not_set(self): """LiteLLM adapter doesn't include seed when not set.""" pytest.importorskip("litellm") - from unittest.mock import patch, MagicMock + from unittest.mock import MagicMock from maseval.interface.inference import LiteLLMModelAdapter mock_response = MagicMock() @@ -1529,7 +1531,7 @@ def test_litellm_adapter_no_seed_when_not_set(self): def test_litellm_user_seed_overrides_adapter_seed(self): """LiteLLM user-provided seed in generation_params takes precedence.""" pytest.importorskip("litellm") - from unittest.mock import patch, MagicMock + from unittest.mock import MagicMock from maseval.interface.inference import LiteLLMModelAdapter mock_response = MagicMock() @@ -1767,3 +1769,725 @@ def __init__(self): # LiteLLM litellm_config = LiteLLMModelAdapter(model_id="gpt-3.5-turbo", default_generation_params=params).gather_config() assert "default_generation_params" in litellm_config + + +# ==================== Google GenAI Message Conversion Tests ==================== + + +@pytest.mark.interface +class TestGoogleGenAIMessageConversion: + """Test GoogleGenAIModelAdapter._convert_messages for edge cases.""" + + def _make_adapter(self): + """Create an adapter with a no-op client for testing _convert_messages.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockClient: + class Models: + def generate_content(self, model, contents, 
config=None): + class Response: + text = "ok" + candidates = [] + usage_metadata = None + + return Response() + + def __init__(self): + self.models = self.Models() + + return GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + + def test_system_message_extracted(self): + """System messages become system_instruction, not contents.""" + adapter = self._make_adapter() + system, contents = adapter._convert_messages( + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hi"}, + ] + ) + assert system == "You are helpful." + assert len(contents) == 1 + assert contents[0]["role"] == "user" + + def test_assistant_text_only(self): + """Assistant messages map to 'model' role.""" + adapter = self._make_adapter() + _, contents = adapter._convert_messages( + [ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello!"}, + ] + ) + assert contents[1]["role"] == "model" + assert contents[1]["parts"] == [{"text": "Hello!"}] + + def test_assistant_with_tool_calls(self): + """Assistant tool_calls are converted to function_call parts.""" + adapter = self._make_adapter() + _, contents = adapter._convert_messages( + [ + {"role": "user", "content": "Weather?"}, + { + "role": "assistant", + "content": "Let me check.", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"city": "Paris"}', + }, + } + ], + }, + ] + ) + model_msg = contents[1] + assert model_msg["role"] == "model" + assert len(model_msg["parts"]) == 2 + assert model_msg["parts"][0] == {"text": "Let me check."} + assert model_msg["parts"][1]["function_call"]["name"] == "get_weather" + assert model_msg["parts"][1]["function_call"]["args"] == {"city": "Paris"} + + def test_assistant_with_invalid_json_arguments(self): + """Invalid JSON in tool call arguments falls back to empty dict.""" + adapter = self._make_adapter() + _, contents = adapter._convert_messages( + [ + {"role": "user", 
"content": "Go"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": { + "name": "do_thing", + "arguments": "not json at all", + }, + } + ], + }, + ] + ) + fc = contents[1]["parts"][0]["function_call"] + assert fc["args"] == {} + + def test_tool_response_messages(self): + """Tool messages are converted to function_response parts.""" + adapter = self._make_adapter() + _, contents = adapter._convert_messages( + [ + {"role": "user", "content": "Weather?"}, + {"role": "tool", "name": "get_weather", "tool_call_id": "call_1", "content": "72°F"}, + ] + ) + assert contents[1]["role"] == "function" + fr = contents[1]["parts"][0]["function_response"] + assert fr["name"] == "get_weather" + assert fr["response"] == {"result": "72°F"} + + def test_consecutive_tool_responses_merged(self): + """Consecutive tool messages are merged into a single function contents entry.""" + adapter = self._make_adapter() + _, contents = adapter._convert_messages( + [ + {"role": "user", "content": "Do both"}, + {"role": "tool", "name": "tool_a", "tool_call_id": "c1", "content": "result_a"}, + {"role": "tool", "name": "tool_b", "tool_call_id": "c2", "content": "result_b"}, + ] + ) + # Should be merged into one function entry + assert len(contents) == 2 # user + merged function + assert contents[1]["role"] == "function" + assert len(contents[1]["parts"]) == 2 + + +# ==================== OpenAI Edge Case Tests ==================== + + +@pytest.mark.interface +class TestOpenAIAdapterEdgeCases: + """Test OpenAI adapter edge cases for uncovered lines.""" + + def test_parse_dict_response_no_choices(self): + """Dict response without choices returns string representation.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + class Completions: + def create(self, **kwargs): + return {"data": "some raw response"} + + completions = Completions() + + chat = 
Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + assert response.content is not None + + def test_parse_dict_response_completion_style(self): + """Dict response with 'text' in choice (completion-style) is parsed.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + class Completions: + def create(self, **kwargs): + return {"choices": [{"text": "Completion text", "finish_reason": "stop"}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + assert response.content == "Completion text" + + def test_parse_dict_response_unknown_choice_format(self): + """Dict response with neither 'message' nor 'text' falls back to str.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + class Completions: + def create(self, **kwargs): + return {"choices": [{"unknown_key": "val"}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + assert response.content is not None + + def test_callable_client_fallback(self): + """Adapter falls back to calling client directly if no known methods exist.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class CallableClient: + """Client without chat.completions but is callable.""" + + def __call__(self, **kwargs): + return {"choices": [{"message": {"content": "Direct call"}}]} + + adapter = OpenAIModelAdapter(client=CallableClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + assert response.content == "Direct call" + + def 
test_non_callable_client_raises(self): + """Adapter raises TypeError if client has no suitable method.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class BadClient: + """Client with no usable methods and not callable.""" + + pass + + adapter = OpenAIModelAdapter(client=BadClient(), model_id="gpt-4") + with pytest.raises(TypeError, match="Unable to call client"): + adapter.chat([{"role": "user", "content": "Hi"}]) + + +# ==================== Anthropic Edge Case Tests ==================== + + +@pytest.mark.interface +class TestAnthropicAdapterEdgeCases: + """Test Anthropic adapter edge cases.""" + + def test_convert_tool_choice_unknown_string(self): + """Unknown tool_choice string defaults to auto.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + class Messages: + def create(self, **kwargs): + class TextBlock: + type = "text" + text = "ok" + + class Response: + content = [TextBlock()] + usage = None + model = "claude-sonnet-4-5-20250514" + stop_reason = "end_turn" + + return Response() + + messages = Messages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-sonnet-4-5-20250514") + result = adapter._convert_tool_choice("some_unknown_value") + assert result == {"type": "auto"} + + def test_convert_messages_assistant_tool_calls_invalid_json(self): + """Invalid JSON in assistant tool call arguments falls back to empty dict.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + class Messages: + def create(self, **kwargs): + class TextBlock: + type = "text" + text = "ok" + + class Response: + content = [TextBlock()] + usage = None + model = "claude-sonnet-4-5-20250514" + stop_reason = "end_turn" + + return Response() + + messages = Messages() + + adapter = AnthropicModelAdapter(client=MockClient(), 
model_id="claude-sonnet-4-5-20250514") + _, converted = adapter._convert_messages( + [ + {"role": "user", "content": "Go"}, + { + "role": "assistant", + "content": "Using tool", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": { + "name": "do_thing", + "arguments": "not valid json {{{", + }, + } + ], + }, + ] + ) + # Should have tool_use block with empty input dict + assistant_msg = converted[1] + tool_block = [b for b in assistant_msg["content"] if b["type"] == "tool_use"][0] + assert tool_block["input"] == {} + + +# ==================== ModelAdapter.chat() with response_model Tests ==================== + + +@pytest.mark.core +class TestModelAdapterStructuredChat: + """Test that ModelAdapter.chat() routes to _structured_chat when response_model is given.""" + + def test_chat_routes_to_structured_chat(self): + """chat() with response_model calls _structured_chat, not _chat_impl.""" + from conftest import DummyModelAdapter + from pydantic import BaseModel + + class SimpleResponse(BaseModel): + answer: str + + model = DummyModelAdapter(responses=['{"answer": "42"}']) + response = model.chat( + [{"role": "user", "content": "What is 6*7?"}], + response_model=SimpleResponse, + ) + + assert response.structured_response is not None + assert isinstance(response.structured_response, SimpleResponse) + assert response.structured_response.answer == "42" + + def test_chat_without_response_model_uses_chat_impl(self): + """chat() without response_model uses _chat_impl directly.""" + from conftest import DummyModelAdapter + + model = DummyModelAdapter(responses=["plain text"]) + response = model.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "plain text" + assert response.structured_response is None + + def test_chat_with_message_history(self): + """chat() accepts MessageHistory and converts to list.""" + from conftest import DummyModelAdapter + from maseval.core.history import MessageHistory + + history = MessageHistory() + 
history.add_message("user", "Hello!") + + model = DummyModelAdapter(responses=["Hi there!"]) + response = model.chat(history) + + assert response.content == "Hi there!" + + def test_base_structured_chat_raises_not_implemented(self): + """Base ModelAdapter._structured_chat raises NotImplementedError.""" + from maseval.core.model import ModelAdapter, ChatResponse + + class MinimalAdapter(ModelAdapter): + @property + def model_id(self): + return "minimal" + + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): + return ChatResponse(content="text") + + adapter = MinimalAdapter() + with pytest.raises(NotImplementedError, match="does not support response_model"): + adapter.chat( + [{"role": "user", "content": "Hi"}], + response_model=object, + ) + + +# ==================== Adapter _structured_chat Tests ==================== +# +# These tests inject a mock _instructor_client to exercise the adapter's +# _structured_chat code path without requiring real API credentials. +# This covers parameter merging, seed propagation, and response wrapping. 
+# ========================================================================= + + +def _make_mock_instructor_result(): + """Create a mock Pydantic-like result object.""" + result = MagicMock() + result.model_dump_json.return_value = '{"answer": "42"}' + return result + + +@pytest.mark.interface +class TestOpenAIStructuredChat: + """Test OpenAIModelAdapter._structured_chat with mocked instructor.""" + + def test_structured_chat_returns_chat_response(self): + """_structured_chat returns ChatResponse with structured_response.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + class Completions: + def create(self, **kwargs): + return {"choices": [{"message": {"content": "ok"}}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + + # Inject mock instructor client + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + adapter._instructor_client = mock_instructor + + response = adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + generation_params={"temperature": 0.5}, + ) + + assert response.content == '{"answer": "42"}' + assert response.structured_response is mock_result + assert response.model == "gpt-4" + mock_instructor.chat.completions.create.assert_called_once() + + def test_structured_chat_propagates_seed(self): + """_structured_chat includes seed in params.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + class Completions: + def create(self, **kwargs): + return {"choices": [{"message": {"content": "ok"}}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4", seed=42) + + mock_result = 
_make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + adapter._instructor_client = mock_instructor + + adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + ) + + call_kwargs = mock_instructor.chat.completions.create.call_args + assert call_kwargs.kwargs.get("seed") == 42 + + +@pytest.mark.interface +class TestAnthropicStructuredChat: + """Test AnthropicModelAdapter._structured_chat with mocked instructor.""" + + def test_structured_chat_returns_chat_response(self): + """_structured_chat returns ChatResponse with structured_response.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + class Messages: + def create(self, **kwargs): + class TextBlock: + type = "text" + text = "ok" + + class Response: + content = [TextBlock()] + usage = None + model = "claude-sonnet-4-5-20250514" + stop_reason = "end_turn" + + return Response() + + messages = Messages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-sonnet-4-5-20250514") + + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + adapter._instructor_client = mock_instructor + + response = adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + generation_params={"temperature": 0.5}, + ) + + assert response.content == '{"answer": "42"}' + assert response.structured_response is mock_result + assert response.model == "claude-sonnet-4-5-20250514" + + def test_structured_chat_uses_max_tokens(self): + """_structured_chat passes max_tokens from adapter default.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + class Messages: + def create(self, **kwargs): + class TextBlock: 
+ type = "text" + text = "ok" + + class Response: + content = [TextBlock()] + usage = None + model = "claude-sonnet-4-5-20250514" + stop_reason = "end_turn" + + return Response() + + messages = Messages() + + adapter = AnthropicModelAdapter( + client=MockClient(), + model_id="claude-sonnet-4-5-20250514", + max_tokens=500, + ) + + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + adapter._instructor_client = mock_instructor + + adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + ) + + call_kwargs = mock_instructor.chat.completions.create.call_args + assert call_kwargs.kwargs.get("max_tokens") == 500 + + +@pytest.mark.interface +class TestGoogleGenAIStructuredChat: + """Test GoogleGenAIModelAdapter._structured_chat with mocked instructor.""" + + def test_structured_chat_returns_chat_response(self): + """_structured_chat returns ChatResponse with structured_response.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + class Response: + text = "ok" + + return Response() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + adapter._instructor_client = mock_instructor + + response = adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + generation_params={"temperature": 0.5}, + ) + + assert response.content == '{"answer": "42"}' + assert response.structured_response is mock_result + assert response.model == "gemini-pro" + + def test_structured_chat_propagates_seed(self): + 
"""_structured_chat includes seed in params.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + class Response: + text = "ok" + + return Response() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro", seed=99) + + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + adapter._instructor_client = mock_instructor + + adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + ) + + call_kwargs = mock_instructor.chat.completions.create.call_args + assert call_kwargs.kwargs.get("seed") == 99 + + +@pytest.mark.interface +class TestLiteLLMStructuredChat: + """Test LiteLLMModelAdapter._structured_chat with mocked instructor.""" + + def test_structured_chat_returns_chat_response(self): + """_structured_chat returns ChatResponse with structured_response.""" + pytest.importorskip("litellm") + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + adapter = LiteLLMModelAdapter(model_id="gpt-4o-mini") + + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + + # Pre-inject to skip _get_instructor_client + adapter._instructor_client = mock_instructor + + response = adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + generation_params={"temperature": 0.7}, + ) + + assert response.content == '{"answer": "42"}' + assert response.structured_response is mock_result + assert response.model == "gpt-4o-mini" + + def test_structured_chat_propagates_api_credentials(self): + """_structured_chat passes api_key and api_base.""" + 
pytest.importorskip("litellm") + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + adapter = LiteLLMModelAdapter( + model_id="gpt-4o-mini", + api_key="test-key", + api_base="https://custom.api/v1", + seed=77, + ) + + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + adapter._instructor_client = mock_instructor + + adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + ) + + call_kwargs = mock_instructor.chat.completions.create.call_args + assert call_kwargs.kwargs.get("api_key") == "test-key" + assert call_kwargs.kwargs.get("api_base") == "https://custom.api/v1" + assert call_kwargs.kwargs.get("seed") == 77 + + def test_get_instructor_client_creates_patched_client(self): + """_get_instructor_client lazily creates instructor client from litellm.""" + pytest.importorskip("litellm") + pytest.importorskip("instructor") + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + adapter = LiteLLMModelAdapter(model_id="gpt-4o-mini") + client = adapter._get_instructor_client() + + assert client is not None + assert hasattr(client, "chat") + # Calling again returns same instance (lazy caching) + assert adapter._get_instructor_client() is client + + +# ==================== OpenAI Legacy Client Fallback Tests ==================== + + +@pytest.mark.interface +class TestOpenAILegacyClientFallback: + """Test OpenAI adapter legacy client method fallbacks.""" + + def test_legacy_client_with_create_method(self): + """Adapter uses client.create() if no chat.completions.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class LegacyClient: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Legacy create"}}]} + + adapter = OpenAIModelAdapter(client=LegacyClient(), model_id="gpt-4") + response = 
adapter.chat([{"role": "user", "content": "Hi"}]) + assert response.content == "Legacy create" + + def test_legacy_client_without_model_param(self): + """Adapter retries without model param on TypeError.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class LegacyClient: + def create(self, messages, **kwargs): + # Accepts messages but not model + return {"choices": [{"message": {"content": "No model param"}}]} + + adapter = OpenAIModelAdapter(client=LegacyClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + assert response.content == "No model param" From 4d6b25a89057cfd973498e8118e74adb6149d85c Mon Sep 17 00:00:00 2001 From: cemde <42615086+cemde@users.noreply.github.com> Date: Sun, 22 Mar 2026 22:54:14 +0100 Subject: [PATCH 7/7] [skip ci] fixed docstring --- maseval/core/model.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/maseval/core/model.py b/maseval/core/model.py index 2f668039..5e0c4566 100644 --- a/maseval/core/model.py +++ b/maseval/core/model.py @@ -237,12 +237,17 @@ def chat( - "none": Model won't use tools - "required": Model must use a tool - {"type": "function", "function": {"name": "..."}}: Use specific tool - response_model: Optional Pydantic BaseModel class. When provided, - the model's response is validated against this schema and - returned in ``ChatResponse.structured_response``. Uses - instructor for automatic validation and retries. - max_retries: Number of retries on validation failure when using - ``response_model``. Default is 3. Ignored without ``response_model``. + response_model: Optional Pydantic BaseModel class for structured + output. When provided, the response is validated against this + schema and returned in ``ChatResponse.structured_response``. 
+                Powered by `instructor <https://python.useinstructor.com/>`_:
+                the Pydantic model is converted to a tool/function schema and
+                sent to the provider to guide generation, then the response is
+                parsed back into a Pydantic instance and validated client-side.
+                On validation failure, instructor re-prompts automatically.
+            max_retries: Number of client-side retries on validation failure
+                when using ``response_model``. Default is 3. Ignored without
+                ``response_model``.
+            **kwargs: Additional provider-specific arguments.
 
         Returns: