Skip to content

Commit a527525

Browse files
juanmicheline, xingyaoww, openhands-agent, enyst
authored
Claude Opus 4.5 as a reasoning model with custom effort param (#1250)
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: enyst <engel.nyst@gmail.com>
1 parent 1e8692b commit a527525

File tree

8 files changed

+319
-36
lines changed

8 files changed

+319
-36
lines changed

.github/scripts/check_deprecations.py

Lines changed: 77 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ class DeprecationRecord:
6464
deprecated_in: str | None
6565
path: Path
6666
line: int
67-
kind: Literal["decorator", "warn_call"]
67+
kind: Literal["decorator", "warn_call", "cleanup_call"]
6868
package: str
6969

7070

@@ -246,28 +246,47 @@ def _gather_warn_calls(
246246
else:
247247
continue
248248

249-
if func_name != "warn_deprecated":
250-
continue
249+
if func_name == "warn_deprecated":
250+
identifier_node = node.args[0] if node.args else None
251+
if identifier_node is None:
252+
continue
253+
identifier = ast.unparse(identifier_node)
251254

252-
identifier_node = node.args[0] if node.args else None
253-
if identifier_node is None:
254-
continue
255-
identifier = ast.unparse(identifier_node)
256-
257-
removed_expr = _extract_kw(node, "removed_in")
258-
deprecated_expr = _extract_kw(node, "deprecated_in")
259-
260-
yield DeprecationRecord(
261-
identifier=identifier,
262-
removed_in=_parse_removed_value(removed_expr, path=path, line=node.lineno),
263-
deprecated_in=_parse_deprecated_value(
264-
deprecated_expr, path=path, line=node.lineno
265-
),
266-
path=path,
267-
line=node.lineno,
268-
kind="warn_call",
269-
package=package,
270-
)
255+
removed_expr = _extract_kw(node, "removed_in")
256+
deprecated_expr = _extract_kw(node, "deprecated_in")
257+
258+
yield DeprecationRecord(
259+
identifier=identifier,
260+
removed_in=_parse_removed_value(
261+
removed_expr, path=path, line=node.lineno
262+
),
263+
deprecated_in=_parse_deprecated_value(
264+
deprecated_expr, path=path, line=node.lineno
265+
),
266+
path=path,
267+
line=node.lineno,
268+
kind="warn_call",
269+
package=package,
270+
)
271+
elif func_name == "warn_cleanup":
272+
identifier_node = node.args[0] if node.args else None
273+
if identifier_node is None:
274+
continue
275+
identifier = ast.unparse(identifier_node)
276+
277+
cleanup_expr = _extract_kw(node, "cleanup_by")
278+
279+
yield DeprecationRecord(
280+
identifier=identifier,
281+
removed_in=_parse_removed_value(
282+
cleanup_expr, path=path, line=node.lineno
283+
),
284+
deprecated_in=None,
285+
path=path,
286+
line=node.lineno,
287+
kind="cleanup_call",
288+
package=package,
289+
)
271290

272291

273292
def _build_identifier(node: ast.AST) -> str:
@@ -329,6 +348,14 @@ def _should_fail(current_version: str, record: DeprecationRecord) -> bool:
329348
def _format_record(record: DeprecationRecord) -> str:
330349
location = record.path.relative_to(REPO_ROOT)
331350
removed = record.removed_in if record.removed_in is not None else "(none)"
351+
352+
if record.kind == "cleanup_call":
353+
return (
354+
f"- [{record.package}] {record.identifier} ({record.kind})\n"
355+
f" cleanup by: {removed}\n"
356+
f" defined at: {location}:{record.line}"
357+
)
358+
332359
deprecated = (
333360
record.deprecated_in if record.deprecated_in is not None else "(unknown)"
334361
)
@@ -371,14 +398,34 @@ def main(argv: Sequence[str] | None = None) -> int:
371398
package_summaries.append((package.name, current_version, len(records)))
372399

373400
if overdue:
374-
print("The following deprecated features have passed their removal deadline:\n")
375-
for record in overdue:
376-
print(_format_record(record))
377-
print()
378-
print(
379-
"Update or remove the listed features before publishing a version that "
380-
"meets or exceeds their removal deadline."
381-
)
401+
deprecated_items = [r for r in overdue if r.kind != "cleanup_call"]
402+
cleanup_items = [r for r in overdue if r.kind == "cleanup_call"]
403+
404+
if deprecated_items:
405+
print(
406+
"The following deprecated features have passed their removal "
407+
"deadline:\n"
408+
)
409+
for record in deprecated_items:
410+
print(_format_record(record))
411+
print()
412+
413+
if cleanup_items:
414+
print("The following workarounds have passed their cleanup deadline:\n")
415+
for record in cleanup_items:
416+
print(_format_record(record))
417+
print()
418+
419+
if deprecated_items:
420+
print(
421+
"Update or remove the listed features before publishing a version that "
422+
"meets or exceeds their removal deadline."
423+
)
424+
if cleanup_items:
425+
print(
426+
"Remove the listed workarounds before publishing a version that "
427+
"meets or exceeds their cleanup deadline."
428+
)
382429
return 1
383430

384431
for package_name, version, count in package_summaries:

openhands-sdk/openhands/sdk/llm/options/chat_options.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from openhands.sdk.llm.options.common import apply_defaults_if_absent
66
from openhands.sdk.llm.utils.model_features import get_features
7+
from openhands.sdk.utils.deprecation import warn_cleanup
78

89

910
def select_chat_options(
@@ -34,12 +35,33 @@ def select_chat_options(
3435

3536
# Reasoning-model quirks
3637
if get_features(llm.model).supports_reasoning_effort:
37-
# Preferred: use reasoning_effort
38-
if llm.reasoning_effort is not None:
39-
out["reasoning_effort"] = llm.reasoning_effort
40-
# Anthropic/OpenAI reasoning models ignore temp/top_p
38+
# Claude models use different parameter format
39+
if "claude-opus-4-5" in llm.model.lower():
40+
warn_cleanup(
41+
"Claude Opus 4.5 effort parameter workaround",
42+
cleanup_by="1.4.0",
43+
details=(
44+
"LiteLLM does not yet redirect reasoning_effort to "
45+
"output_config.effort for Claude Opus 4.5. Remove this workaround "
46+
"once LiteLLM adds native support."
47+
),
48+
)
49+
# Claude uses output_config.effort instead of reasoning_effort
50+
if llm.reasoning_effort is not None:
51+
out["output_config"] = {"effort": llm.reasoning_effort}
52+
# Claude requires beta header for effort parameter
53+
if "extra_headers" not in out:
54+
out["extra_headers"] = {}
55+
out["extra_headers"]["anthropic-beta"] = "effort-2025-11-24"
56+
else:
57+
# OpenAI/other models use reasoning_effort parameter
58+
if llm.reasoning_effort is not None:
59+
out["reasoning_effort"] = llm.reasoning_effort
60+
61+
# All reasoning models ignore temp/top_p
4162
out.pop("temperature", None)
4263
out.pop("top_p", None)
64+
4365
# Gemini 2.5-pro default to low if not set
4466
if "gemini-2.5-pro" in llm.model:
4567
if llm.reasoning_effort in {None, "none"}:

openhands-sdk/openhands/sdk/llm/utils/model_features.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ class ModelFeatures:
4242
"gemini-2.5-pro",
4343
# OpenAI GPT-5 family (includes mini variants)
4444
"gpt-5",
45+
# Anthropic Opus 4.5
46+
"claude-opus-4-5",
4547
]
4648

4749
EXTENDED_THINKING_PATTERNS: list[str] = [
@@ -63,6 +65,7 @@ class ModelFeatures:
6365
"claude-opus-4",
6466
# Anthropic Haiku 4.5 variants (dash only; official IDs use hyphens)
6567
"claude-haiku-4-5",
68+
"claude-opus-4-5",
6669
]
6770

6871
SUPPORTS_STOP_WORDS_FALSE_PATTERNS: list[str] = [

openhands-sdk/openhands/sdk/llm/utils/verified_models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
VERIFIED_ANTHROPIC_MODELS = [
1717
"claude-sonnet-4-5-20250929",
1818
"claude-haiku-4-5-20251001",
19+
"claude-opus-4-5-20251101",
1920
"claude-sonnet-4-20250514",
2021
"claude-opus-4-20250514",
2122
"claude-opus-4-1-20250805",
@@ -40,6 +41,7 @@
4041
"gpt-5-2025-08-07",
4142
"gpt-5-codex",
4243
"claude-haiku-4-5-20251001",
44+
"claude-opus-4-5-20251101",
4345
"kimi-k2-thinking",
4446
"gpt-5-mini-2025-08-07",
4547
"claude-opus-4-1-20250805",

openhands-sdk/openhands/sdk/utils/deprecation.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,55 @@ def warn_deprecated(
112112
warnings.warn(warning, stacklevel=stacklevel)
113113

114114

115+
def warn_cleanup(
116+
workaround: str,
117+
*,
118+
cleanup_by: str | date,
119+
current_version: str | None = None,
120+
details: str = "",
121+
stacklevel: int = 2,
122+
) -> None:
123+
"""Emit a warning for temporary workarounds that need cleanup by a deadline.
124+
125+
Use this helper for temporary code that addresses upstream issues, compatibility
126+
shims, or other workarounds that should be removed once external conditions
127+
change (e.g., when a library adds support for a feature, or when an API
128+
stabilizes). The deprecation check workflow will fail when the cleanup deadline
129+
is reached, ensuring the workaround is removed before the specified version or
130+
date.
131+
132+
Args:
133+
workaround: Description of the temporary workaround
134+
cleanup_by: Version string or date when this workaround must be removed
135+
current_version: Override the detected package version (for testing)
136+
details: Additional context about why cleanup is needed
137+
stacklevel: Stack level for warning emission
138+
"""
139+
current_version = current_version or _current_version()
140+
141+
should_cleanup = False
142+
if isinstance(cleanup_by, date):
143+
should_cleanup = date.today() >= cleanup_by
144+
else:
145+
try:
146+
current = pkg_version.parse(current_version)
147+
target = pkg_version.parse(str(cleanup_by))
148+
should_cleanup = current >= target
149+
except pkg_version.InvalidVersion:
150+
pass
151+
152+
if should_cleanup:
153+
message = (
154+
f"Cleanup required: {workaround}. "
155+
f"This workaround was scheduled for removal by {cleanup_by}."
156+
)
157+
if details:
158+
message += f" {details}"
159+
warnings.warn(message, UserWarning, stacklevel=stacklevel)
160+
161+
115162
__all__ = [
116163
"deprecated",
117164
"warn_deprecated",
165+
"warn_cleanup",
118166
]

tests/sdk/llm/test_chat_options.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
from dataclasses import dataclass
2+
from typing import Any
3+
4+
from openhands.sdk.llm.options.chat_options import select_chat_options
5+
6+
7+
@dataclass
8+
class DummyLLM:
9+
model: str
10+
top_k: int | None = None
11+
top_p: float | None = 1.0
12+
temperature: float | None = 0.0
13+
max_output_tokens: int = 1024
14+
extra_headers: dict[str, str] | None = None
15+
reasoning_effort: str | None = None
16+
extended_thinking_budget: int | None = None
17+
safety_settings: list[dict[str, Any]] | None = None
18+
litellm_extra_body: dict[str, Any] | None = None
19+
20+
21+
def test_opus_4_5_uses_effort_and_beta_header_and_strips_temp_top_p():
22+
llm = DummyLLM(
23+
model="claude-opus-4-5-20251101",
24+
top_p=0.9,
25+
temperature=0.7,
26+
reasoning_effort="medium",
27+
)
28+
out = select_chat_options(llm, user_kwargs={}, has_tools=True)
29+
30+
# Uses output_config.effort instead of reasoning_effort
31+
assert out.get("output_config") == {"effort": "medium"}
32+
assert "reasoning_effort" not in out
33+
34+
# Adds the required beta header for effort
35+
headers = out.get("extra_headers") or {}
36+
assert headers.get("anthropic-beta") == "effort-2025-11-24"
37+
38+
# Strips temperature/top_p for reasoning models
39+
assert "temperature" not in out
40+
assert "top_p" not in out
41+
42+
43+
def test_gpt5_uses_reasoning_effort_and_strips_temp_top_p():
44+
llm = DummyLLM(
45+
model="gpt-5-mini-2025-08-07",
46+
temperature=0.5,
47+
top_p=0.8,
48+
reasoning_effort="high",
49+
)
50+
out = select_chat_options(llm, user_kwargs={}, has_tools=True)
51+
52+
assert out.get("reasoning_effort") == "high"
53+
assert "output_config" not in out
54+
headers = out.get("extra_headers") or {}
55+
assert "anthropic-beta" not in headers
56+
assert "temperature" not in out
57+
assert "top_p" not in out
58+
59+
60+
def test_gemini_2_5_pro_defaults_reasoning_effort_low_when_none():
61+
llm = DummyLLM(model="gemini-2.5-pro-experimental", reasoning_effort=None)
62+
out = select_chat_options(llm, user_kwargs={}, has_tools=True)
63+
64+
assert out.get("reasoning_effort") == "low"
65+
66+
67+
def test_non_reasoning_model_preserves_temp_and_top_p():
68+
llm = DummyLLM(model="gpt-4o", temperature=0.6, top_p=0.7)
69+
out = select_chat_options(llm, user_kwargs={}, has_tools=True)
70+
71+
# Non-reasoning models should retain temperature/top_p defaults
72+
assert out.get("temperature") == 0.6
73+
assert out.get("top_p") == 0.7
74+
75+
76+
def test_azure_renames_max_completion_tokens_to_max_tokens():
77+
llm = DummyLLM(model="azure/gpt-4o")
78+
out = select_chat_options(llm, user_kwargs={}, has_tools=True)
79+
80+
assert "max_completion_tokens" not in out
81+
assert out.get("max_tokens") == llm.max_output_tokens
82+
83+
84+
def test_tools_removed_when_has_tools_false():
85+
llm = DummyLLM(model="gpt-4o")
86+
uk = {"tools": ["t1"], "tool_choice": "auto"}
87+
out = select_chat_options(llm, user_kwargs=uk, has_tools=False)
88+
89+
assert "tools" not in out
90+
assert "tool_choice" not in out
91+
92+
93+
def test_extra_body_is_forwarded():
94+
llm = DummyLLM(model="gpt-4o", litellm_extra_body={"x": 1})
95+
out = select_chat_options(llm, user_kwargs={}, has_tools=True)
96+
97+
assert out.get("extra_body") == {"x": 1}

tests/sdk/llm/test_model_features.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ def test_model_matches(name, pattern, expected):
3030
("o1", True),
3131
("o3-mini", True),
3232
("o3", True),
33+
# Anthropic Opus 4.5 (dash variant only)
34+
("claude-opus-4-5", True),
3335
("gpt-4o", False),
3436
("claude-3-5-sonnet", False),
3537
("gemini-1.5-pro", False),
@@ -51,12 +53,12 @@ def test_reasoning_effort_support(model, expected_reasoning):
5153
# AWS Bedrock model ids (provider-prefixed)
5254
("bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0", True),
5355
("bedrock/anthropic.claude-3-haiku-20240307-v1:0", True),
54-
# Anthropic Haiku 4.5 variants (dash only; official IDs use hyphens)
56+
# Anthropic 4.5 variants (dash only; official IDs use hyphens)
5557
("claude-haiku-4-5", True),
5658
("us.anthropic.claude-haiku-4-5-20251001", True),
5759
("bedrock/anthropic.claude-3-opus-20240229-v1:0", True),
58-
# Anthropic 4.5 variants (dash only; official IDs use hyphens)
5960
("claude-sonnet-4-5", True),
61+
("claude-opus-4-5", True),
6062
# User-facing model names (no provider prefix)
6163
("anthropic.claude-3-5-sonnet-20241022", True),
6264
("anthropic.claude-3-haiku-20240307", True),

0 commit comments

Comments
 (0)