From f457ccafeeb627e4a369bd2a49fab58ec6718288 Mon Sep 17 00:00:00 2001 From: yaojin Date: Sun, 22 Mar 2026 20:26:04 +0800 Subject: [PATCH 1/6] fix(llm): increase api_key_encrypted length to 1024 for Minimax support Fixes #164 - Minimax API keys exceed previous 500 char limit --- .../versions/increase_api_key_length.py | 33 +++++++++++++++++++ backend/app/models/llm.py | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 backend/alembic/versions/increase_api_key_length.py diff --git a/backend/alembic/versions/increase_api_key_length.py b/backend/alembic/versions/increase_api_key_length.py new file mode 100644 index 00000000..e3fe5765 --- /dev/null +++ b/backend/alembic/versions/increase_api_key_length.py @@ -0,0 +1,33 @@ +"""Increase api_key_encrypted column length to support Minimax API keys. + +Revision ID: increase_api_key_length +Revises: add_notification_agent_id +Create Date: 2026-03-22 +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +revision: str = 'increase_api_key_length' +down_revision: Union[str, None] = 'add_notification_agent_id' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Increase api_key_encrypted column length from 500 to 1024 + # Minimax API keys are very long and exceed the previous 500 char limit + op.execute(""" + ALTER TABLE llm_models + ALTER COLUMN api_key_encrypted TYPE VARCHAR(1024) + """) + + +def downgrade() -> None: + # Revert to 500 chars (may fail if data exceeds 500 chars) + op.execute(""" + ALTER TABLE llm_models + ALTER COLUMN api_key_encrypted TYPE VARCHAR(500) + """) diff --git a/backend/app/models/llm.py b/backend/app/models/llm.py index 6e58b68a..6f35f46b 100644 --- a/backend/app/models/llm.py +++ b/backend/app/models/llm.py @@ -19,7 +19,7 @@ class LLMModel(Base): tenant_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), ForeignKey("tenants.id"), 
nullable=True, index=True) provider: Mapped[str] = mapped_column(String(50), nullable=False) # anthropic, openai, deepseek, etc. model: Mapped[str] = mapped_column(String(100), nullable=False) # claude-opus-4-6, gpt-4o, etc. - api_key_encrypted: Mapped[str] = mapped_column(String(500), nullable=False) + api_key_encrypted: Mapped[str] = mapped_column(String(1024), nullable=False) base_url: Mapped[str | None] = mapped_column(String(500)) label: Mapped[str] = mapped_column(String(200), nullable=False) # Display name max_tokens_per_day: Mapped[int | None] = mapped_column(Integer) From 9e9659bf60b949ffcf20cbdd78426e26e0d48c3a Mon Sep 17 00:00:00 2001 From: yaojin Date: Sun, 22 Mar 2026 20:27:18 +0800 Subject: [PATCH 2/6] feat(llm): unified failover policy across all execution paths Implements #154 - Unify Primary/Fallback LLM Failover Policy Changes: - Add llm_failover.py module with error classification and failover logic - Add llm_caller.py service for unified LLM calling - Update call_llm() to support fallback_model parameter - Update _call_agent_llm() to use unified failover - Update task_executor and scheduler to use new unified functions Failover rules: 1. Try primary if available 2. If primary missing/unavailable, use fallback directly 3. If primary fails with retryable error, retry once on fallback 4. 
If error is non-retryable (auth/validation), do not switch --- backend/app/api/feishu.py | 97 ++------- backend/app/api/websocket.py | 100 +++++++--- backend/app/services/llm_caller.py | 274 ++++++++++++++++++++++++++ backend/app/services/llm_failover.py | 249 +++++++++++++++++++++++ backend/app/services/scheduler.py | 85 ++------ backend/app/services/task_executor.py | 106 ++-------- 6 files changed, 634 insertions(+), 277 deletions(-) create mode 100644 backend/app/services/llm_caller.py create mode 100644 backend/app/services/llm_failover.py diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py index 80819a9f..6f99ddee 100644 --- a/backend/app/api/feishu.py +++ b/backend/app/api/feishu.py @@ -1022,90 +1022,19 @@ async def _download_post_images(agent_id, config, message_id, image_keys): async def _call_agent_llm(db: AsyncSession, agent_id: uuid.UUID, user_text: str, history: list[dict] | None = None, user_id=None, on_chunk=None, on_thinking=None) -> str: """Call the agent's configured LLM model with conversation history. - - Reuses the same call_llm function as the WebSocket chat endpoint so that - all providers (OpenRouter, Qwen, etc.) work identically on both channels. - """ - from app.models.agent import Agent - from app.models.llm import LLMModel - from app.api.websocket import call_llm - - # Load agent and model - agent_result = await db.execute(select(Agent).where(Agent.id == agent_id)) - agent = agent_result.scalar_one_or_none() - if not agent: - return "⚠️ 数字员工未找到" - - if is_agent_expired(agent): - return "This Agent has expired and is off duty. Please contact your admin to extend its service." 
- - # Load primary model - model = None - if agent.primary_model_id: - model_result = await db.execute(select(LLMModel).where(LLMModel.id == agent.primary_model_id)) - model = model_result.scalar_one_or_none() - - # Load fallback model - fallback_model = None - if agent.fallback_model_id: - fb_result = await db.execute(select(LLMModel).where(LLMModel.id == agent.fallback_model_id)) - fallback_model = fb_result.scalar_one_or_none() - - # Config-level fallback: primary missing -> use fallback - if not model and fallback_model: - model = fallback_model - fallback_model = None - logger.warning(f"[Channel] Primary model unavailable, using fallback: {model.model}") - - if not model: - return f"⚠️ {agent.name} 未配置 LLM 模型,请在管理后台设置。" - - # Build conversation messages (without system prompt — call_llm adds it) - messages: list[dict] = [] - if history: - messages.extend(history[-10:]) - messages.append({"role": "user", "content": user_text}) - - # Use actual user_id so the system prompt knows who it's chatting with - effective_user_id = user_id or agent_id - try: - reply = await call_llm( - model, - messages, - agent.name, - agent.role_description or "", - agent_id=agent_id, - user_id=effective_user_id, - supports_vision=getattr(model, 'supports_vision', False), - on_chunk=on_chunk, - on_thinking=on_thinking, - ) - return reply - except Exception as e: - import traceback - traceback.print_exc() - error_msg = str(e) or repr(e) - logger.error(f"[LLM] Primary model error: {error_msg}") - # Runtime fallback: primary model failed -> retry with fallback model - if fallback_model: - logger.info(f"[LLM] Retrying with fallback model: {fallback_model.model}") - try: - reply = await call_llm( - fallback_model, - messages, - agent.name, - agent.role_description or "", - agent_id=agent_id, - user_id=effective_user_id, - supports_vision=getattr(fallback_model, 'supports_vision', False), - on_chunk=on_chunk, - on_thinking=on_thinking, - ) - return reply - except Exception as e2: - 
traceback.print_exc() - return f"⚠️ 调用模型出错: Primary: {str(e)[:80]} | Fallback: {str(e2)[:80]}" - return f"⚠️ 调用模型出错: {error_msg[:150]}" + DEPRECATED: Use app.services.llm_caller.call_agent_llm instead. + This function is kept for backward compatibility with existing imports. + """ + from app.services.llm_caller import call_agent_llm + return await call_agent_llm( + db=db, + agent_id=agent_id, + user_text=user_text, + history=history, + user_id=user_id, + on_chunk=on_chunk, + on_thinking=on_thinking, + ) diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 396fa142..9acc25a8 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -107,14 +107,73 @@ async def call_llm( on_tool_call=None, on_thinking=None, supports_vision=False, + fallback_model: LLMModel | None = None, ) -> str: - """Call LLM via unified client with function-calling tool loop. + """Call LLM via unified client with function-calling tool loop and failover support. Args: on_chunk: Optional async callback(text: str) for streaming chunks to client. on_thinking: Optional async callback(text: str) for reasoning/thinking content. on_tool_call: Optional async callback(dict) for tool call status updates. + fallback_model: Optional fallback model for runtime failover. 
""" + from app.services.llm_failover import classify_error, FailoverErrorType + + async def _call_single(_model: LLMModel) -> str: + """Internal: call a single model without failover.""" + return await _call_llm_core( + _model, messages, agent_name, role_description, + agent_id, user_id, on_chunk, on_tool_call, on_thinking, supports_vision + ) + + # Config-level fallback: if no primary, use fallback directly + if model is None and fallback_model is not None: + model = fallback_model + fallback_model = None + + if model is None: + return "⚠️ 未配置 LLM 模型" + + # Try primary model + try: + return await _call_single(model) + except Exception as e: + error_type = classify_error(e) + error_msg = str(e) or repr(e) + logger.warning(f"[call_llm] Primary failed ({error_type.value}): {error_msg[:150]}") + + # Non-retryable: don't attempt fallback + if error_type == FailoverErrorType.NON_RETRYABLE: + return f"[LLM Error] {error_msg}" + + # No fallback available + if fallback_model is None: + return f"[LLM Error] {error_msg}" + + # Runtime fallback: retry with fallback model + logger.info(f"[call_llm] Retrying with fallback: {fallback_model.provider}/{fallback_model.model}") + try: + return await _call_single(fallback_model) + except Exception as e2: + error_msg2 = str(e2) or repr(e2) + logger.error(f"[call_llm] Fallback also failed: {error_msg2[:150]}") + return f"⚠️ 调用模型出错: Primary: {error_msg[:80]} | Fallback: {error_msg2[:80]}" + + + +async def _call_llm_core( + model: LLMModel, + messages: list[dict], + agent_name: str, + role_description: str, + agent_id=None, + user_id=None, + on_chunk=None, + on_tool_call=None, + on_thinking=None, + supports_vision=False, +) -> str: + """Core LLM call implementation (single model, no failover).""" from app.services.agent_tools import AGENT_TOOLS, execute_tool, get_agent_tools_for_llm from app.services.llm_utils import create_llm_client, get_max_tokens, LLMMessage, LLMError @@ -217,7 +276,7 @@ async def call_llm( timeout=120.0, ) except 
Exception as e: - return f"[Error] Failed to create LLM client: {e}" + raise LLMError(f"Failed to create LLM client: {e}") max_tokens = get_max_tokens(model.provider, model.model, getattr(model, 'max_output_tokens', None)) @@ -258,14 +317,15 @@ async def call_llm( on_thinking=on_thinking, ) except LLMError as e: - # Record accumulated tokens before returning error + # Record accumulated tokens before raising logger.error( f"[LLM] LLMError provider={getattr(model, 'provider', '?')} " f"model={getattr(model, 'model', '?')} round={round_i + 1}: {e}" ) if agent_id and _accumulated_tokens > 0: await record_token_usage(agent_id, _accumulated_tokens) - return f"[LLM Error] {e}" + await client.close() + raise # Re-raise for failover handling except Exception as e: logger.error( f"[LLM] Unexpected error provider={getattr(model, 'provider', '?')} " @@ -274,7 +334,8 @@ async def call_llm( ) if agent_id and _accumulated_tokens > 0: await record_token_usage(agent_id, _accumulated_tokens) - return f"[LLM call error] {type(e).__name__}: {str(e)[:200]}" + await client.close() + raise # Re-raise for failover handling # ── Track tokens for this round ── real_tokens = extract_usage_tokens(response.usage) @@ -377,7 +438,7 @@ async def call_llm( if agent_id and _accumulated_tokens > 0: await record_token_usage(agent_id, _accumulated_tokens) await client.close() - return "[Error] Too many tool call rounds" + raise LLMError("Too many tool call rounds") @router.websocket("/ws/chat/{agent_id}") @@ -737,6 +798,7 @@ async def thinking_to_ws(text: str): on_tool_call=tool_call_to_ws, on_thinking=thinking_to_ws, supports_vision=getattr(llm_model, 'supports_vision', False), + fallback_model=fallback_llm_model, )) # Listen for abort while LLM is running @@ -803,30 +865,8 @@ async def thinking_to_ws(text: str): logger.error(f"[WS] LLM error: {e}") import traceback traceback.print_exc() - # Runtime fallback: primary model failed -> retry with fallback model - if fallback_llm_model: - 
logger.info(f"[WS] Primary model failed, retrying with fallback: {fallback_llm_model.model}") - try: - await websocket.send_json({"type": "info", "content": f"Primary model error, switching to fallback model ({fallback_llm_model.model})..."}) - assistant_response = await call_llm( - fallback_llm_model, - conversation[-ctx_size:], - agent_name, - role_description, - agent_id=agent_id, - user_id=user_id, - on_chunk=stream_to_ws, - on_tool_call=tool_call_to_ws, - on_thinking=thinking_to_ws, - supports_vision=getattr(fallback_llm_model, 'supports_vision', False), - ) - logger.info(f"[WS] Fallback LLM response: {assistant_response[:80]}") - except Exception as e2: - logger.error(f"[WS] Fallback LLM also failed: {e2}") - traceback.print_exc() - assistant_response = f"[LLM call error] Primary: {str(e)[:100]} | Fallback: {str(e2)[:100]}" - else: - assistant_response = f"[LLM call error] {str(e)[:200]}" + # call_llm now handles failover internally, just return the error message + assistant_response = str(e) if str(e) else "[LLM call error]" else: assistant_response = f"⚠️ {agent_name} has no LLM model configured. Please select a model in the agent's Settings tab." diff --git a/backend/app/services/llm_caller.py b/backend/app/services/llm_caller.py new file mode 100644 index 00000000..20d92d8d --- /dev/null +++ b/backend/app/services/llm_caller.py @@ -0,0 +1,274 @@ +"""Unified LLM calling service with failover support for all execution paths. + +This module provides a shared entry point for all LLM calls across: +- WebSocket chat +- IM channels (Feishu, Slack, Teams, Discord, WeCom, DingTalk) +- Background services (task executor, scheduler, heartbeat, etc.) + +All paths now support: +1. Config-level fallback: if primary missing, use fallback directly +2. 
Runtime failover: if primary fails with retryable error, try fallback once +""" + +from __future__ import annotations + +import uuid +from typing import TYPE_CHECKING + +from loguru import logger +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.services.llm_failover import classify_error, FailoverErrorType +from app.services.llm_utils import LLMMessage + +if TYPE_CHECKING: + from app.models.agent import Agent + from app.models.llm import LLMModel + + +async def call_agent_llm( + db: AsyncSession, + agent_id: uuid.UUID, + user_text: str, + history: list[dict] | None = None, + user_id: uuid.UUID | None = None, + on_chunk=None, + on_thinking=None, + supports_vision: bool = False, +) -> str: + """Call the agent's LLM with automatic failover support. + + This is the unified entry point for ALL LLM calls across all channels. + + Args: + db: Database session + agent_id: Agent UUID + user_text: User message text + history: Optional conversation history (last N messages) + user_id: Optional user UUID (for personalized context) + on_chunk: Optional streaming callback + on_thinking: Optional thinking/reasoning callback + supports_vision: Whether the model supports vision + + Returns: + LLM response string, or error message if both primary and fallback fail + """ + from app.models.agent import Agent + from app.models.llm import LLMModel + from app.api.websocket import call_llm + + # Load agent + agent_result = await db.execute(select(Agent).where(Agent.id == agent_id)) + agent: Agent | None = agent_result.scalar_one_or_none() + if not agent: + return "⚠️ 数字员工未找到" + + from app.core.permissions import is_agent_expired + if is_agent_expired(agent): + return "This Agent has expired and is off duty. Please contact your admin to extend its service." 
+ + # Load primary model + primary_model: LLMModel | None = None + if agent.primary_model_id: + model_result = await db.execute(select(LLMModel).where(LLMModel.id == agent.primary_model_id)) + primary_model = model_result.scalar_one_or_none() + + # Load fallback model + fallback_model: LLMModel | None = None + if agent.fallback_model_id: + fb_result = await db.execute(select(LLMModel).where(LLMModel.id == agent.fallback_model_id)) + fallback_model = fb_result.scalar_one_or_none() + + # Config-level fallback: primary missing -> use fallback + if not primary_model and fallback_model: + primary_model = fallback_model + fallback_model = None + logger.warning(f"[call_agent_llm] Primary model unavailable, using fallback: {primary_model.model}") + + if not primary_model: + return f"⚠️ {agent.name} 未配置 LLM 模型,请在管理后台设置。" + + # Build conversation messages + messages: list[dict] = [] + if history: + messages.extend(history[-10:]) + messages.append({"role": "user", "content": user_text}) + + # Use unified call_llm with failover + try: + reply = await call_llm( + primary_model, + messages, + agent.name, + agent.role_description or "", + agent_id=agent_id, + user_id=user_id or agent_id, + supports_vision=supports_vision or getattr(primary_model, 'supports_vision', False), + on_chunk=on_chunk, + on_thinking=on_thinking, + fallback_model=fallback_model, + ) + return reply + except Exception as e: + # call_llm should handle failover internally, but catch any unexpected errors + error_msg = str(e) or repr(e) + logger.error(f"[call_agent_llm] Unexpected error: {error_msg}") + return f"⚠️ 调用模型出错: {error_msg[:150]}" + + +async def call_agent_llm_with_tools( + db: AsyncSession, + agent_id: uuid.UUID, + system_prompt: str, + user_prompt: str, + max_rounds: int = 50, +) -> str: + """Call agent LLM with tool-calling loop (for background services). + + Used by scheduler, heartbeat, and other background tasks. 
+ + Args: + db: Database session + agent_id: Agent UUID + system_prompt: System prompt/context + user_prompt: User/instruction message + max_rounds: Maximum tool-calling rounds + + Returns: + Final response string + """ + from app.models.agent import Agent + from app.models.llm import LLMModel + from app.services.agent_tools import execute_tool, get_agent_tools_for_llm + from app.services.llm_utils import create_llm_client, get_max_tokens, LLMError + + # Load agent and models + agent_result = await db.execute(select(Agent).where(Agent.id == agent_id)) + agent: Agent | None = agent_result.scalar_one_or_none() + if not agent: + return "⚠️ Agent not found" + + # Load models + primary_model: LLMModel | None = None + if agent.primary_model_id: + model_result = await db.execute(select(LLMModel).where(LLMModel.id == agent.primary_model_id)) + primary_model = model_result.scalar_one_or_none() + + fallback_model: LLMModel | None = None + if agent.fallback_model_id: + fb_result = await db.execute(select(LLMModel).where(LLMModel.id == agent.fallback_model_id)) + fallback_model = fb_result.scalar_one_or_none() + + # Config-level fallback + if not primary_model and fallback_model: + primary_model = fallback_model + fallback_model = None + + if not primary_model: + return f"⚠️ {agent.name} has no LLM model configured" + + # Build messages + messages = [ + LLMMessage(role="system", content=system_prompt), + LLMMessage(role="user", content=user_prompt), + ] + + # Load tools + tools_for_llm = await get_agent_tools_for_llm(agent_id) + + async def _try_model(model: LLMModel) -> tuple[str, bool]: + """Try to complete with a model. 
Returns (response, success).""" + try: + client = create_llm_client( + provider=model.provider, + api_key=model.api_key_encrypted, + model=model.model, + base_url=model.base_url, + timeout=120.0, + ) + + max_tokens = get_max_tokens( + model.provider, model.model, + getattr(model, 'max_output_tokens', None) + ) + + # Tool-calling loop + api_messages = list(messages) # Copy + for round_i in range(max_rounds): + try: + response = await client.complete( + messages=api_messages, + tools=tools_for_llm if tools_for_llm else None, + temperature=0.7, + max_tokens=max_tokens, + ) + except Exception as e: + await client.close() + raise + + if not response.tool_calls: + await client.close() + return response.content or "[Empty response]", True + + # Execute tool calls + api_messages.append(LLMMessage( + role="assistant", + content=response.content or None, + tool_calls=[{ + "id": tc["id"], + "type": "function", + "function": tc["function"], + } for tc in response.tool_calls], + )) + + for tc in response.tool_calls: + fn = tc["function"] + tool_name = fn["name"] + raw_args = fn.get("arguments", "{}") + try: + import json + args = json.loads(raw_args) if raw_args else {} + except json.JSONDecodeError: + args = {} + + result = await execute_tool( + tool_name, args, + agent_id=agent_id, + user_id=agent.creator_id, + ) + api_messages.append(LLMMessage( + role="tool", + tool_call_id=tc["id"], + content=str(result), + )) + + await client.close() + return "[Error] Too many tool call rounds", False + + except Exception as e: + return f"[Error] {e}", False + + # Try primary model + reply, success = await _try_model(primary_model) + if success: + return reply + + # Primary failed - check if retryable + error_type = classify_error(Exception(reply)) + if error_type == FailoverErrorType.NON_RETRYABLE or not fallback_model: + return reply + + # Try fallback model + logger.info(f"[call_agent_llm_with_tools] Retrying with fallback: {fallback_model.model}") + reply2, success2 = await 
_try_model(fallback_model) + if success2: + return reply2 + + return f"⚠️ Both models failed | Primary: {reply[:80]} | Fallback: {reply2[:80]}" + + +__all__ = [ + "call_agent_llm", + "call_agent_llm_with_tools", +] diff --git a/backend/app/services/llm_failover.py b/backend/app/services/llm_failover.py new file mode 100644 index 00000000..c619542a --- /dev/null +++ b/backend/app/services/llm_failover.py @@ -0,0 +1,249 @@ +"""Unified LLM failover executor for all execution paths. + +Provides a shared failover policy across chat/channel/background paths: +1. Try primary if available +2. If primary missing/unavailable, use fallback directly +3. If primary fails with retryable error, retry once on fallback +4. If error is non-retryable (auth/validation/schema), do not switch +5. Max attempts per request: 2 (primary + fallback) +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from enum import Enum +from typing import Awaitable, Callable, TypeVar + +from loguru import logger + +from app.services.llm_client import LLMError, LLMMessage, LLMResponse +from app.services.llm_utils import create_llm_client, get_max_tokens + + +class FailoverErrorType(Enum): + """Classification of LLM errors for failover decisions.""" + + RETRYABLE = "retryable" # Network timeout, 429, 5xx, transient errors + NON_RETRYABLE = "non_retryable" # Auth, validation, schema errors + UNKNOWN = "unknown" + + +@dataclass +class FailoverResult: + """Result of a failover invocation.""" + + content: str + success: bool + model_used: str # "primary" or "fallback" + error: str | None = None + + +# Type variable for the invoke function return type +T = TypeVar("T") + + +def classify_error(error: Exception) -> FailoverErrorType: + """Classify an exception as retryable or non-retryable. 
+ + Retryable errors: + - Network timeout / connection errors + - Provider 429 (rate limit) + - Provider 5xx (server errors) + - Explicit transient provider errors + + Non-retryable errors: + - Auth errors (401, 403) + - Validation errors (400, 422) + - Schema errors + - Content policy violations + """ + error_msg = str(error).lower() + error_type = type(error).__name__.lower() + + # Non-retryable: authentication and authorization + if any(kw in error_msg for kw in ["auth", "unauthorized", "forbidden", "invalid api key", "api key invalid"]): + return FailoverErrorType.NON_RETRYABLE + + # Non-retryable: validation and schema + if any(kw in error_msg for kw in ["validation", "invalid request", "schema", "bad request"]): + return FailoverErrorType.NON_RETRYABLE + + # Non-retryable: content policy + if any(kw in error_msg for kw in ["content policy", "content_filter", "safety", "moderation"]): + return FailoverErrorType.NON_RETRYABLE + + # Retryable: rate limiting + if any(kw in error_msg for kw in ["rate limit", "429", "too many requests"]): + return FailoverErrorType.RETRYABLE + + # Retryable: server errors + if any(kw in error_msg for kw in ["500", "502", "503", "504", "server error", "internal error"]): + return FailoverErrorType.RETRYABLE + + # Retryable: network and timeout + if any(kw in error_msg for kw in ["timeout", "connection", "network", "unreachable", "refused", "reset", "dns"]): + return FailoverErrorType.RETRYABLE + + # Retryable: transient errors + if any(kw in error_msg for kw in ["temporary", "transient", "unavailable", "overloaded", "busy"]): + return FailoverErrorType.RETRYABLE + + # LLMError with specific patterns + if isinstance(error, LLMError): + # Check the error message for HTTP status codes + if any(code in error_msg for code in ["401", "403", "400", "422"]): + return FailoverErrorType.NON_RETRYABLE + if any(code in error_msg for code in ["429", "500", "502", "503", "504", "408"]): + return FailoverErrorType.RETRYABLE + + return 
FailoverErrorType.UNKNOWN + + +async def invoke_with_failover( + primary_model, + fallback_model, + invoke_fn: Callable[..., Awaitable[T]], + *args, + **kwargs, +) -> tuple[T | None, str, str | None]: + """Invoke LLM with automatic failover from primary to fallback. + + Args: + primary_model: The primary LLM model config (can be None) + fallback_model: The fallback LLM model config (can be None) + invoke_fn: Async function to call the LLM (e.g., client.complete) + *args, **kwargs: Arguments to pass to invoke_fn + + Returns: + Tuple of (result, model_used, error) + - result: The LLM response or None if both failed + - model_used: "primary", "fallback", or "none" + - error: Error message if both failed, None otherwise + """ + # Config-level fallback: if no primary, use fallback directly + if primary_model is None and fallback_model is not None: + logger.info("[Failover] Primary model not configured, using fallback directly") + primary_model = fallback_model + fallback_model = None + + if primary_model is None: + return None, "none", "No LLM model configured (primary or fallback)" + + # Try primary model + try: + logger.debug(f"[Failover] Invoking primary model: {primary_model.provider}/{primary_model.model}") + result = await invoke_fn(*args, **kwargs) + return result, "primary", None + except Exception as e: + error_type = classify_error(e) + error_msg = str(e) or repr(e) + + logger.warning( + f"[Failover] Primary model failed ({error_type.value}): {error_msg[:150]}" + ) + + # Non-retryable errors: don't attempt fallback + if error_type == FailoverErrorType.NON_RETRYABLE: + logger.info("[Failover] Non-retryable error, not attempting fallback") + return None, "none", f"Primary failed (non-retryable): {error_msg}" + + # No fallback available + if fallback_model is None: + logger.warning("[Failover] No fallback model available") + return None, "none", f"Primary failed: {error_msg}" + + # Runtime fallback: retry with fallback model + logger.info(f"[Failover] Retrying 
with fallback model: {fallback_model.provider}/{fallback_model.model}") + + try: + # Update kwargs with fallback model if needed + if "model" in kwargs: + kwargs["model"] = fallback_model + + result = await invoke_fn(*args, **kwargs) + logger.info("[Failover] Fallback model succeeded") + return result, "fallback", None + + except Exception as e2: + error_msg2 = str(e2) or repr(e2) + logger.error(f"[Failover] Fallback model also failed: {error_msg2[:150]}") + return None, "none", f"Primary: {error_msg[:80]} | Fallback: {error_msg2[:80]}" + + +async def call_llm_with_failover( + primary_model, + fallback_model, + messages: list[LLMMessage], + tools: list | None = None, + temperature: float = 0.7, + max_tokens: int | None = None, + timeout: float = 120.0, + stream: bool = False, + on_chunk=None, + on_thinking=None, +) -> tuple[LLMResponse | None, str, str | None]: + """Call LLM with automatic failover support. + + This is the unified entry point for all LLM calls with failover. + + Args: + primary_model: Primary LLM model config + fallback_model: Fallback LLM model config + messages: List of LLMMessage + tools: Optional tool definitions + temperature: Sampling temperature + max_tokens: Max output tokens + timeout: Request timeout + stream: Whether to use streaming API + on_chunk: Callback for streaming chunks + on_thinking: Callback for thinking/reasoning content + + Returns: + Tuple of (response, model_used, error) + """ + async def _invoke(model): + client = create_llm_client( + provider=model.provider, + api_key=model.api_key_encrypted, + model=model.model, + base_url=model.base_url, + timeout=timeout, + ) + + _max_tokens = max_tokens or get_max_tokens( + model.provider, model.model, getattr(model, "max_output_tokens", None) + ) + + try: + if stream: + response = await client.stream( + messages=messages, + tools=tools, + temperature=temperature, + max_tokens=_max_tokens, + on_chunk=on_chunk, + on_thinking=on_thinking, + ) + else: + response = await client.complete( 
+ messages=messages, + tools=tools, + temperature=temperature, + max_tokens=_max_tokens, + ) + return response + finally: + await client.close() + + return await invoke_with_failover(primary_model, fallback_model, _invoke, primary_model) + + +# Backward compatibility: re-export for convenience +__all__ = [ + "FailoverErrorType", + "FailoverResult", + "classify_error", + "invoke_with_failover", + "call_llm_with_failover", +] diff --git a/backend/app/services/scheduler.py b/backend/app/services/scheduler.py index 2fcfc98a..714b5727 100644 --- a/backend/app/services/scheduler.py +++ b/backend/app/services/scheduler.py @@ -60,85 +60,22 @@ async def _execute_schedule(schedule_id: uuid.UUID, agent_id: uuid.UUID, instruc logger.warning(f"Schedule {schedule_id}: LLM model {model_id} not found") return - # Build context and call LLM + # Build context and call LLM with failover support from app.services.agent_context import build_agent_context - from app.services.agent_tools import execute_tool, get_agent_tools_for_llm - from app.services.llm_utils import create_llm_client, get_max_tokens, LLMMessage, LLMError + from app.services.llm_caller import call_agent_llm_with_tools system_prompt = await build_agent_context(agent_id, agent.name, agent.role_description or "") - messages = [ - LLMMessage(role="system", content=system_prompt), - LLMMessage(role="user", content=f"[自动调度任务] {instruction}"), - ] - - # Load tools dynamically from DB (respects per-agent config and MCP tools) - tools_for_llm = await get_agent_tools_for_llm(agent_id) - - # Create unified LLM client - try: - client = create_llm_client( - provider=model.provider, - api_key=model.api_key_encrypted, - model=model.model, - base_url=model.base_url, - timeout=120.0, - ) - except Exception as e: - logger.error(f"Schedule {schedule_id}: Failed to create LLM client: {e}") - return + user_prompt = f"[自动调度任务] {instruction}" - # Tool-calling loop (max 50 rounds for scheduled tasks) - reply = "" - for round_i in range(50): - 
try: - response = await client.complete( - messages=messages, - tools=tools_for_llm if tools_for_llm else None, - temperature=0.7, - max_tokens=get_max_tokens(model.provider, model.model, getattr(model, 'max_output_tokens', None)), - ) - except LLMError as e: - logger.error(f"Schedule {schedule_id}: LLM error: {e}") - reply = f"(LLM 错误: {e})" - break - except Exception as e: - logger.error(f"Schedule {schedule_id}: LLM call error: {e}") - reply = f"(LLM 调用异常: {str(e)[:200]})" - break - - if response.tool_calls: - # Add assistant message with tool calls - messages.append(LLMMessage( - role="assistant", - content=response.content or None, - tool_calls=[{ - "id": tc["id"], - "type": "function", - "function": tc["function"], - } for tc in response.tool_calls], - reasoning_content=response.reasoning_content, - )) - - for tc in response.tool_calls: - fn = tc["function"] - try: - args = json.loads(fn["arguments"]) if fn.get("arguments") else {} - except Exception: - args = {} - tool_result = await execute_tool(fn["name"], args, agent_id, agent.creator_id) - messages.append(LLMMessage( - role="tool", - tool_call_id=tc["id"], - content=str(tool_result), - )) - else: - reply = response.content or "" - break - else: - reply = "(已达到最大工具调用轮数)" - - await client.close() + # Call LLM with unified failover support + reply = await call_agent_llm_with_tools( + db=db, + agent_id=agent_id, + system_prompt=system_prompt, + user_prompt=user_prompt, + max_rounds=50, + ) # Log activity from app.services.activity_logger import log_activity diff --git a/backend/app/services/task_executor.py b/backend/app/services/task_executor.py index 32df20cd..29f493fd 100644 --- a/backend/app/services/task_executor.py +++ b/backend/app/services/task_executor.py @@ -78,7 +78,7 @@ async def execute_task(task_id: uuid.UUID, agent_id: uuid.UUID) -> None: # Step 3: Build full agent context (same as chat dialog) from app.services.agent_context import build_agent_context - system_prompt = await 
build_agent_context(agent_id, agent_name, agent.role_description or "") + system_prompt = await build_agent_context(agent_id, agent.name, agent.role_description or "") # Add task-execution-specific instructions task_addendum = """ @@ -110,101 +110,29 @@ async def execute_task(task_id: uuid.UUID, agent_id: uuid.UUID) -> None: user_prompt += f"\n任务描述: {task_description}" user_prompt += "\n\n请认真完成此任务,给出详细的执行结果。" - # Step 4: Call LLM with tool loop - from app.services.llm_utils import create_llm_client, get_max_tokens, LLMMessage, LLMError - - messages = [ - LLMMessage(role="system", content=system_prompt), - LLMMessage(role="user", content=user_prompt), - ] - - # Normalize base_url - if not model.base_url: - await _log_error(task_id, f"未配置 {model.provider} 的 API 地址") - if task_type == 'supervision': - await _restore_supervision_status(task_id) - return - - # Create unified LLM client - try: - client = create_llm_client( - provider=model.provider, - api_key=model.api_key_encrypted, - model=model.model, - base_url=model.base_url, - timeout=1200.0, - ) - except Exception as e: - await _log_error(task_id, f"创建 LLM 客户端失败: {e}") - if task_type == 'supervision': - await _restore_supervision_status(task_id) - return - - # Load tools (same as chat dialog) - from app.services.agent_tools import execute_tool, get_agent_tools_for_llm - tools_for_llm = await get_agent_tools_for_llm(agent_id) + # Step 4: Call LLM with unified failover support + from app.services.llm_caller import call_agent_llm_with_tools try: logger.info(f"[TaskExec] Calling LLM with tools for task: {task_title}") - reply = "" - - # Tool-calling loop (max 50 rounds for task execution) - for round_i in range(50): - try: - response = await client.complete( - messages=messages, - tools=tools_for_llm if tools_for_llm else None, - temperature=0.7, - max_tokens=get_max_tokens(model.provider, model.model, getattr(model, 'max_output_tokens', None)), - ) - except LLMError as e: - await _log_error(task_id, f"LLM 错误: {e}") - 
if task_type == 'supervision': - await _restore_supervision_status(task_id) - return - except Exception as e: - await _log_error(task_id, f"调用模型失败: {str(e)[:200]}") - if task_type == 'supervision': - await _restore_supervision_status(task_id) - return - if response.tool_calls: - # Add assistant message with tool calls - messages.append(LLMMessage( - role="assistant", - content=response.content or None, - tool_calls=[{ - "id": tc["id"], - "type": "function", - "function": tc["function"], - } for tc in response.tool_calls], - reasoning_content=response.reasoning_content, - )) - - for tc in response.tool_calls: - fn = tc["function"] - tool_name = fn["name"] - raw_args = fn.get("arguments", "{}") - logger.info(f"[TaskExec] Round {round_i+1} calling tool: {tool_name}({json.dumps(raw_args, ensure_ascii=False)[:100]})") - try: - args = json.loads(raw_args) if raw_args else {} - except Exception: - args = {} + reply = await call_agent_llm_with_tools( + db=db, # Use existing session + agent_id=agent_id, + system_prompt=system_prompt, + user_prompt=user_prompt, + max_rounds=50, + ) - tool_result = await execute_tool(tool_name, args, agent_id, creator_id) - messages.append(LLMMessage( - role="tool", - tool_call_id=tc["id"], - content=str(tool_result), - )) - else: - reply = response.content or "" - break - else: - reply = "(已达到最大工具调用轮数)" + if reply.startswith("⚠️") or reply.startswith("[Error]"): + # LLM call failed (both primary and fallback) + await _log_error(task_id, f"LLM 调用失败: {reply}") + if task_type == 'supervision': + await _restore_supervision_status(task_id) + return - await client.close() logger.info(f"[TaskExec] LLM reply: {reply[:80]}") + except Exception as e: error_msg = str(e) or repr(e) logger.error(f"[TaskExec] Error: {error_msg}") From efa59636e3e63cc1546e640494bced1a80a7f811 Mon Sep 17 00:00:00 2001 From: yaojin Date: Sun, 22 Mar 2026 20:52:26 +0800 Subject: [PATCH 3/6] refactor(llm): use Option B wrapper approach for failover Per issue #154 review 
feedback: - Restore call_llm to return error strings (not raise exceptions) - Create call_llm_with_failover wrapper that inspects return values - Add FailoverGuard with idempotency/streaming/once-only checks - Add on_failover callback for user-visible notifications - Extract helper functions for better code organization Benefits: - Zero risk to existing callers (call_llm unchanged) - Incremental migration possible - Guard checks prevent unsafe failovers - User-visible failover notifications --- backend/app/api/websocket.py | 463 +++++++++++++++++++++++------ backend/app/services/llm_caller.py | 19 +- 2 files changed, 377 insertions(+), 105 deletions(-) diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py index 9acc25a8..1b846207 100644 --- a/backend/app/api/websocket.py +++ b/backend/app/api/websocket.py @@ -107,108 +107,167 @@ async def call_llm( on_tool_call=None, on_thinking=None, supports_vision=False, - fallback_model: LLMModel | None = None, ) -> str: - """Call LLM via unified client with function-calling tool loop and failover support. + """Call LLM via unified client with function-calling tool loop. Args: on_chunk: Optional async callback(text: str) for streaming chunks to client. on_thinking: Optional async callback(text: str) for reasoning/thinking content. on_tool_call: Optional async callback(dict) for tool call status updates. - fallback_model: Optional fallback model for runtime failover. + + Returns: + LLM response string, or error message if call fails. 
""" - from app.services.llm_failover import classify_error, FailoverErrorType + return await _call_llm_core( + model, messages, agent_name, role_description, + agent_id, user_id, on_chunk, on_tool_call, on_thinking, supports_vision + ) - async def _call_single(_model: LLMModel) -> str: - """Internal: call a single model without failover.""" - return await _call_llm_core( - _model, messages, agent_name, role_description, - agent_id, user_id, on_chunk, on_tool_call, on_thinking, supports_vision - ) - # Config-level fallback: if no primary, use fallback directly - if model is None and fallback_model is not None: - model = fallback_model - fallback_model = None - if model is None: - return "⚠️ 未配置 LLM 模型" +async def _get_agent_config(agent_id) -> tuple[int, str | None]: + """Get agent config: max_tool_rounds and token limit status.""" + if not agent_id: + return 50, None - # Try primary model try: - return await _call_single(model) - except Exception as e: - error_type = classify_error(e) - error_msg = str(e) or repr(e) - logger.warning(f"[call_llm] Primary failed ({error_type.value}): {error_msg[:150]}") + from app.models.agent import Agent as AgentModel + async with async_session() as _db: + _ar = await _db.execute(select(AgentModel).where(AgentModel.id == agent_id)) + _agent = _ar.scalar_one_or_none() + if _agent: + max_rounds = _agent.max_tool_rounds or 50 + if _agent.max_tokens_per_day and _agent.tokens_used_today >= _agent.max_tokens_per_day: + return max_rounds, f"⚠️ Daily token usage has reached the limit ({_agent.tokens_used_today:,}/{_agent.max_tokens_per_day:,}). Please try again tomorrow or ask admin to increase the limit." + if _agent.max_tokens_per_month and _agent.tokens_used_month >= _agent.max_tokens_per_month: + return max_rounds, f"⚠️ Monthly token usage has reached the limit ({_agent.tokens_used_month:,}/{_agent.max_tokens_per_month:,}). Please ask admin to increase the limit." 
+ return max_rounds, None + except Exception: + pass + return 50, None + - # Non-retryable: don't attempt fallback - if error_type == FailoverErrorType.NON_RETRYABLE: - return f"[LLM Error] {error_msg}" +async def _get_user_name(user_id) -> str | None: + """Get user's display name for personalized context.""" + if not user_id: + return None + try: + from app.models.user import User as _UserModel + async with async_session() as _udb: + _ur = await _udb.execute(select(_UserModel).where(_UserModel.id == user_id)) + _u = _ur.scalar_one_or_none() + if _u: + return _u.display_name or _u.username + except Exception: + pass + return None - # No fallback available - if fallback_model is None: - return f"[LLM Error] {error_msg}" - # Runtime fallback: retry with fallback model - logger.info(f"[call_llm] Retrying with fallback: {fallback_model.provider}/{fallback_model.model}") - try: - return await _call_single(fallback_model) - except Exception as e2: - error_msg2 = str(e2) or repr(e2) - logger.error(f"[call_llm] Fallback also failed: {error_msg2[:150]}") - return f"⚠️ 调用模型出错: Primary: {error_msg[:80]} | Fallback: {error_msg2[:80]}" +def _convert_messages_for_vision( + api_messages: list, supports_vision: bool +) -> list: + """Convert image markers to vision format if supported, or strip them.""" + import re as _re_v + + if supports_vision: + # Vision format: convert image markers to OpenAI Vision API format + for i, msg in enumerate(api_messages): + if msg.role != "user" or not msg.content or not isinstance(msg.content, str): + continue + content_str = msg.content + pattern = r'\[image_data:(data:image/[^;]+;base64,[A-Za-z0-9+/=]+)\]' + images = _re_v.findall(pattern, content_str) + if not images: + continue + text = _re_v.sub(pattern, '', content_str).strip() + parts = [{"type": "image_url", "image_url": {"url": img}} for img in images] + if text: + parts.append({"type": "text", "text": text}) + api_messages[i] = type(msg)(role=msg.role, content=parts) + else: + # Strip 
base64 markers for non-vision models + _img_pattern = r'\[image_data:data:image/[^;]+;base64,[A-Za-z0-9+/=]+\]' + for i, msg in enumerate(api_messages): + if msg.role != "user" or not isinstance(msg.content, str): + continue + if "[image_data:" in msg.content: + _n_imgs = len(_re_v.findall(_img_pattern, msg.content)) + cleaned = _re_v.sub(_img_pattern, '', msg.content).strip() + if _n_imgs > 0: + cleaned += f"\n[用户发送了 {_n_imgs} 张图片,但当前模型不支持视觉,无法查看图片内容]" + api_messages[i] = type(msg)(role=msg.role, content=cleaned) + return api_messages +def _check_tool_requires_args(tool_name: str, args: dict) -> tuple[bool, str]: + """Check if tool requires arguments and return (should_execute, result_or_error).""" + _TOOLS_REQUIRING_ARGS = {"write_file", "read_file", "delete_file", "read_document", "send_message_to_agent", "send_feishu_message", "send_email"} + if not args and tool_name in _TOOLS_REQUIRING_ARGS: + return False, f"Error: {tool_name} was called with empty arguments. You must provide the required parameters. Please retry with the correct arguments." 
+ return True, "" -async def _call_llm_core( - model: LLMModel, - messages: list[dict], - agent_name: str, - role_description: str, - agent_id=None, - user_id=None, - on_chunk=None, - on_tool_call=None, - on_thinking=None, - supports_vision=False, + +async def _process_tool_call( + tc: dict, + api_messages: list, + agent_id, + user_id, + on_tool_call, + full_reasoning_content: str, ) -> str: - """Core LLM call implementation (single model, no failover).""" - from app.services.agent_tools import AGENT_TOOLS, execute_tool, get_agent_tools_for_llm - from app.services.llm_utils import create_llm_client, get_max_tokens, LLMMessage, LLMError + """Process a single tool call and return result.""" + from app.services.agent_tools import execute_tool + import json - # ── Token limit check & config ── - _max_tool_rounds = 50 # default - if agent_id: + fn = tc["function"] + tool_name = fn["name"] + raw_args = fn.get("arguments", "{}") + logger.info(f"[LLM] Calling tool: {tool_name}({json.dumps(raw_args, ensure_ascii=False)[:100]})") + + try: + args = json.loads(raw_args) if raw_args else {} + except json.JSONDecodeError: + args = {} + + # Guard: check if tool requires arguments + should_execute, error_msg = _check_tool_requires_args(tool_name, args) + if not should_execute: + return error_msg + + # Notify client about tool call (in-progress) + if on_tool_call: try: - from app.models.agent import Agent as AgentModel - async with async_session() as _db: - _ar = await _db.execute(select(AgentModel).where(AgentModel.id == agent_id)) - _agent = _ar.scalar_one_or_none() - if _agent: - _max_tool_rounds = _agent.max_tool_rounds or 50 - if _agent.max_tokens_per_day and _agent.tokens_used_today >= _agent.max_tokens_per_day: - return f"⚠️ Daily token usage has reached the limit ({_agent.tokens_used_today:,}/{_agent.max_tokens_per_day:,}). Please try again tomorrow or ask admin to increase the limit." 
- if _agent.max_tokens_per_month and _agent.tokens_used_month >= _agent.max_tokens_per_month: - return f"⚠️ Monthly token usage has reached the limit ({_agent.tokens_used_month:,}/{_agent.max_tokens_per_month:,}). Please ask admin to increase the limit." + await on_tool_call({ + "name": tool_name, + "args": args, + "status": "running", + "reasoning_content": full_reasoning_content + }) except Exception: pass - # Build rich prompt with soul, memory, skills, relationships - from app.services.agent_context import build_agent_context - # Look up current user's display name so the agent knows who it's talking to - _current_user_name = None - if user_id: + # Execute tool + result = await execute_tool( + tool_name, args, + agent_id=agent_id, + user_id=user_id or agent_id, + ) + logger.debug(f"[LLM] Tool result: {result[:100]}") + + # Notify client about tool call result + if on_tool_call: try: - from app.models.user import User as _UserModel - async with async_session() as _udb: - _ur = await _udb.execute(select(_UserModel).where(_UserModel.id == user_id)) - _u = _ur.scalar_one_or_none() - if _u: - _current_user_name = _u.display_name or _u.username + await on_tool_call({ + "name": tool_name, + "args": args, + "status": "done", + "result": result, + "reasoning_content": full_reasoning_content + }) except Exception: pass - system_prompt = await build_agent_context(agent_id, agent_name, role_description, current_user_name=_current_user_name) + + return str(result) # Load tools dynamically from DB tools_for_llm = await get_agent_tools_for_llm(agent_id) if agent_id else AGENT_TOOLS @@ -276,7 +335,7 @@ async def _call_llm_core( timeout=120.0, ) except Exception as e: - raise LLMError(f"Failed to create LLM client: {e}") + return f"[Error] Failed to create LLM client: {e}" max_tokens = get_max_tokens(model.provider, model.model, getattr(model, 'max_output_tokens', None)) @@ -317,7 +376,7 @@ async def _call_llm_core( on_thinking=on_thinking, ) except LLMError as e: - # Record 
accumulated tokens before raising + # Record accumulated tokens before returning error logger.error( f"[LLM] LLMError provider={getattr(model, 'provider', '?')} " f"model={getattr(model, 'model', '?')} round={round_i + 1}: {e}" @@ -325,7 +384,7 @@ async def _call_llm_core( if agent_id and _accumulated_tokens > 0: await record_token_usage(agent_id, _accumulated_tokens) await client.close() - raise # Re-raise for failover handling + return f"[LLM Error] {e}" except Exception as e: logger.error( f"[LLM] Unexpected error provider={getattr(model, 'provider', '?')} " @@ -335,7 +394,7 @@ async def _call_llm_core( if agent_id and _accumulated_tokens > 0: await record_token_usage(agent_id, _accumulated_tokens) await client.close() - raise # Re-raise for failover handling + return f"[LLM call error] {type(e).__name__}: {str(e)[:200]}" # ── Track tokens for this round ── real_tokens = extract_usage_tokens(response.usage) @@ -438,7 +497,7 @@ async def _call_llm_core( if agent_id and _accumulated_tokens > 0: await record_token_usage(agent_id, _accumulated_tokens) await client.close() - raise LLMError("Too many tool call rounds") + return "[Error] Too many tool call rounds" @router.websocket("/ws/chat/{agent_id}") @@ -786,20 +845,27 @@ async def thinking_to_ws(text: str): import asyncio as _aio - # Run call_llm as a cancellable task - llm_task = _aio.create_task(call_llm( - llm_model, - conversation[-ctx_size:], - agent_name, - role_description, - agent_id=agent_id, - user_id=user_id, - on_chunk=stream_to_ws, - on_tool_call=tool_call_to_ws, - on_thinking=thinking_to_ws, - supports_vision=getattr(llm_model, 'supports_vision', False), - fallback_model=fallback_llm_model, - )) + # Run call_llm_with_failover as a cancellable task + async def _call_with_failover(): + async def _on_failover(reason: str): + await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"}) + + return await call_llm_with_failover( + primary_model=llm_model, + 
fallback_model=fallback_llm_model, + messages=conversation[-ctx_size:], + agent_name=agent_name, + role_description=role_description, + agent_id=agent_id, + user_id=user_id, + on_chunk=stream_to_ws, + on_tool_call=tool_call_to_ws, + on_thinking=thinking_to_ws, + supports_vision=getattr(llm_model, 'supports_vision', False), + on_failover=_on_failover, + ) + + llm_task = _aio.create_task(_call_with_failover()) # Listen for abort while LLM is running aborted = False @@ -865,8 +931,7 @@ async def thinking_to_ws(text: str): logger.error(f"[WS] LLM error: {e}") import traceback traceback.print_exc() - # call_llm now handles failover internally, just return the error message - assistant_response = str(e) if str(e) else "[LLM call error]" + assistant_response = f"[LLM call error] {str(e)[:200]}" else: assistant_response = f"⚠️ {agent_name} has no LLM model configured. Please select a model in the agent's Settings tab." @@ -933,3 +998,209 @@ async def thinking_to_ws(text: str): await websocket.close(code=1011) except Exception: pass + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Unified Failover Wrapper (Option B implementation per issue #154) +# ═══════════════════════════════════════════════════════════════════════════════ + +class FailoverGuard: + """Guard state for failover decisions.""" + + def __init__(self): + self.tool_executed = False + self.streaming_started = False + self.failover_done = False + + def mark_tool_executed(self): + """Mark that a side-effecting tool has been executed.""" + self.tool_executed = True + + def mark_streaming_started(self): + """Mark that streaming output has started.""" + self.streaming_started = True + + def mark_failover_done(self): + """Mark that failover has already happened once.""" + self.failover_done = True + + def can_failover(self) -> bool: + """Check if failover is allowed based on guard rules.""" + if self.failover_done: + return False # Only failover once + if self.tool_executed: + 
return False # Don't failover after side effects + if self.streaming_started: + return False # Don't failover after streaming started + return True + + +def is_retryable_error(result: str) -> bool: + """Check if an error result is retryable (network, timeout, 429, 5xx). + + Non-retryable: auth errors (401, 403), validation (400, 422), content policy + Retryable: timeout, connection, 429, 5xx, transient errors + """ + if not (result.startswith("[LLM Error]") or result.startswith("[LLM call error]") or result.startswith("[Error]")): + return False + + result_lower = result.lower() + + # Non-retryable: authentication and authorization + if any(kw in result_lower for kw in ["auth", "unauthorized", "forbidden", "invalid api key", "api key invalid", "401", "403"]): + return False + + # Non-retryable: validation and schema + if any(kw in result_lower for kw in ["validation", "invalid request", "schema", "bad request", "400", "422"]): + return False + + # Non-retryable: content policy + if any(kw in result_lower for kw in ["content policy", "content_filter", "safety", "moderation"]): + return False + + # Retryable by default (any other error is potentially retryable) + return True + + +async def call_llm_with_failover( + primary_model: LLMModel, + fallback_model: LLMModel | None, + messages: list[dict], + agent_name: str, + role_description: str, + agent_id=None, + user_id=None, + on_chunk=None, + on_thinking=None, + on_tool_call=None, + supports_vision=False, + on_failover=None, +) -> str: + """Call LLM with automatic failover support (wrapper approach). + + This is the unified entry point for all LLM calls with failover. 
+ Implements Option B from issue #154 review: + - Inspects return values for retryable errors + - Applies guard checks before failover + - Notifies caller when failover happens + + Args: + primary_model: Primary LLM model + fallback_model: Fallback LLM model (can be None) + messages: Conversation messages + agent_name: Agent display name + role_description: Agent role description + agent_id: Optional agent UUID + user_id: Optional user UUID + on_chunk: Optional streaming callback + on_thinking: Optional thinking callback + on_tool_call: Optional tool call callback + supports_vision: Whether model supports vision + on_failover: Optional callback(reason: str) called when failover happens + + Returns: + LLM response string (from primary or fallback) + """ + from app.services.agent_tools import execute_tool, get_agent_tools_for_llm + from app.services.llm_utils import create_llm_client, get_max_tokens, LLMMessage, LLMError + + guard = FailoverGuard() + + # Config-level fallback: if no primary, use fallback directly + if primary_model is None and fallback_model is not None: + logger.info("[Failover] Primary model not configured, using fallback directly") + primary_model = fallback_model + fallback_model = None + + if primary_model is None: + return "⚠️ 未配置 LLM 模型" + + # Wrapper callbacks to track state for guard checks + async def _wrapped_on_chunk(text: str): + guard.mark_streaming_started() + if on_chunk: + await on_chunk(text) + + async def _wrapped_on_tool_call(data: dict): + if data.get("status") == "done": + guard.mark_tool_executed() + if on_tool_call: + await on_tool_call(data) + + # Try primary model + primary_result = await call_llm( + primary_model, + messages, + agent_name, + role_description, + agent_id=agent_id, + user_id=user_id, + on_chunk=_wrapped_on_chunk, + on_tool_call=_wrapped_on_tool_call, + on_thinking=on_thinking, + supports_vision=supports_vision, + ) + + # Check if we need to failover + if not is_retryable_error(primary_result): + return 
primary_result + + # Check guard conditions + if not guard.can_failover(): + if guard.tool_executed: + logger.warning("[Failover] Blocked: side-effecting tool already executed") + elif guard.streaming_started: + logger.warning("[Failover] Blocked: streaming already started") + elif guard.failover_done: + logger.warning("[Failover] Blocked: failover already done once") + return primary_result + + # No fallback available + if fallback_model is None: + logger.warning("[Failover] No fallback model available") + return primary_result + + # Runtime failover: retry with fallback model + logger.info(f"[Failover] Retrying with fallback model: {fallback_model.provider}/{fallback_model.model}") + + if on_failover: + try: + await on_failover(f"Switched to fallback model: {fallback_model.model}") + except Exception: + pass + + guard.mark_failover_done() + + # Call fallback with fresh callbacks (streaming/tool state is per-call) + fallback_guard = FailoverGuard() + fallback_guard.mark_failover_done() # Don't failover again + + async def _fallback_on_chunk(text: str): + fallback_guard.mark_streaming_started() + if on_chunk: + await on_chunk(text) + + async def _fallback_on_tool_call(data: dict): + if data.get("status") == "done": + fallback_guard.mark_tool_executed() + if on_tool_call: + await on_tool_call(data) + + fallback_result = await call_llm( + fallback_model, + messages, + agent_name, + role_description, + agent_id=agent_id, + user_id=user_id, + on_chunk=_fallback_on_chunk, + on_tool_call=_fallback_on_tool_call, + on_thinking=on_thinking, + supports_vision=getattr(fallback_model, 'supports_vision', False), + ) + + # Combine error messages if fallback also failed + if is_retryable_error(fallback_result) or fallback_result.startswith("⚠️") or fallback_result.startswith("[Error]"): + return f"⚠️ 调用模型出错: Primary: {primary_result[:80]} | Fallback: {fallback_result[:80]}" + + return fallback_result diff --git a/backend/app/services/llm_caller.py 
b/backend/app/services/llm_caller.py index 20d92d8d..f352030b 100644 --- a/backend/app/services/llm_caller.py +++ b/backend/app/services/llm_caller.py @@ -95,23 +95,24 @@ async def call_agent_llm( messages.extend(history[-10:]) messages.append({"role": "user", "content": user_text}) - # Use unified call_llm with failover + # Use unified call_llm_with_failover + from app.api.websocket import call_llm_with_failover try: - reply = await call_llm( - primary_model, - messages, - agent.name, - agent.role_description or "", + reply = await call_llm_with_failover( + primary_model=primary_model, + fallback_model=fallback_model, + messages=messages, + agent_name=agent.name, + role_description=agent.role_description or "", agent_id=agent_id, user_id=user_id or agent_id, - supports_vision=supports_vision or getattr(primary_model, 'supports_vision', False), on_chunk=on_chunk, on_thinking=on_thinking, - fallback_model=fallback_model, + supports_vision=supports_vision or getattr(primary_model, 'supports_vision', False), ) return reply except Exception as e: - # call_llm should handle failover internally, but catch any unexpected errors + # call_llm_with_failover should handle failover internally, but catch any unexpected errors error_msg = str(e) or repr(e) logger.error(f"[call_agent_llm] Unexpected error: {error_msg}") return f"⚠️ 调用模型出错: {error_msg[:150]}" From 89689761b4ddf27d5120331d05ab564c458d02a8 Mon Sep 17 00:00:00 2001 From: yaojin Date: Tue, 24 Mar 2026 00:32:47 +0800 Subject: [PATCH 4/6] fix: increase api_key length for Minimax & unify LLM failover - Increase api_key_encrypted column to 1024 chars for Minimax support (#164) - Add unified failover policy across all LLM execution paths (#154) - Add temperature configuration UI for LLM models - Update Docker and restart scripts for development --- .../versions/increase_api_key_length.py | 2 +- backend/app/api/tenants.py | 14 +- backend/app/services/llm_failover.py | 179 +----------------- docker-compose.yml | 4 +- 
frontend/Dockerfile | 3 +- frontend/src/pages/EnterpriseSettings.tsx | 24 ++- restart.sh | 23 ++- 7 files changed, 61 insertions(+), 188 deletions(-) diff --git a/backend/alembic/versions/increase_api_key_length.py b/backend/alembic/versions/increase_api_key_length.py index e3fe5765..95d05171 100644 --- a/backend/alembic/versions/increase_api_key_length.py +++ b/backend/alembic/versions/increase_api_key_length.py @@ -17,7 +17,7 @@ def upgrade() -> None: - # Increase api_key_encrypted column length from 500 to 2000 + # Increase api_key_encrypted column length from 500 to 1024 # Minimax API keys are very long and exceed the previous 500 char limit op.execute(""" ALTER TABLE llm_models diff --git a/backend/app/api/tenants.py b/backend/app/api/tenants.py index 60625c1e..3a16f297 100644 --- a/backend/app/api/tenants.py +++ b/backend/app/api/tenants.py @@ -209,8 +209,11 @@ async def get_tenant( """Get tenant details. Platform admins can view any; org_admins only their own.""" if current_user.role not in ("platform_admin", "org_admin"): raise HTTPException(status_code=403, detail="Admin access required") - if current_user.role == "org_admin" and str(current_user.tenant_id) != str(tenant_id): - raise HTTPException(status_code=403, detail="Access denied") + if current_user.role == "org_admin": + if not current_user.tenant_id: + raise HTTPException(status_code=403, detail="Organization admin must belong to a company") + if current_user.tenant_id != tenant_id: + raise HTTPException(status_code=403, detail="Access denied") result = await db.execute(select(Tenant).where(Tenant.id == tenant_id)) tenant = result.scalar_one_or_none() if not tenant: @@ -226,8 +229,11 @@ async def update_tenant( db: AsyncSession = Depends(get_db), ): """Update tenant settings. 
Platform admins can update any; org_admins only their own.""" - if current_user.role == "org_admin" and str(current_user.tenant_id) != str(tenant_id): - raise HTTPException(status_code=403, detail="Can only update your own company") + if current_user.role == "org_admin": + if not current_user.tenant_id: + raise HTTPException(status_code=403, detail="Organization admin must belong to a company") + if current_user.tenant_id != tenant_id: + raise HTTPException(status_code=403, detail="Can only update your own company") result = await db.execute(select(Tenant).where(Tenant.id == tenant_id)) tenant = result.scalar_one_or_none() if not tenant: diff --git a/backend/app/services/llm_failover.py b/backend/app/services/llm_failover.py index c619542a..2f3e287b 100644 --- a/backend/app/services/llm_failover.py +++ b/backend/app/services/llm_failover.py @@ -1,24 +1,13 @@ -"""Unified LLM failover executor for all execution paths. - -Provides a shared failover policy across chat/channel/background paths: -1. Try primary if available -2. If primary missing/unavailable, use fallback directly -3. If primary fails with retryable error, retry once on fallback -4. If error is non-retryable (auth/validation/schema), do not switch -5. Max attempts per request: 2 (primary + fallback) +"""Unified LLM failover error classification. + +Provides error classification for failover decisions across all execution paths. 
""" from __future__ import annotations -import asyncio -from dataclasses import dataclass from enum import Enum -from typing import Awaitable, Callable, TypeVar - -from loguru import logger -from app.services.llm_client import LLMError, LLMMessage, LLMResponse -from app.services.llm_utils import create_llm_client, get_max_tokens +from app.services.llm_client import LLMError class FailoverErrorType(Enum): @@ -29,20 +18,6 @@ class FailoverErrorType(Enum): UNKNOWN = "unknown" -@dataclass -class FailoverResult: - """Result of a failover invocation.""" - - content: str - success: bool - model_used: str # "primary" or "fallback" - error: str | None = None - - -# Type variable for the invoke function return type -T = TypeVar("T") - - def classify_error(error: Exception) -> FailoverErrorType: """Classify an exception as retryable or non-retryable. @@ -59,7 +34,6 @@ def classify_error(error: Exception) -> FailoverErrorType: - Content policy violations """ error_msg = str(error).lower() - error_type = type(error).__name__.lower() # Non-retryable: authentication and authorization if any(kw in error_msg for kw in ["auth", "unauthorized", "forbidden", "invalid api key", "api key invalid"]): @@ -100,150 +74,7 @@ def classify_error(error: Exception) -> FailoverErrorType: return FailoverErrorType.UNKNOWN -async def invoke_with_failover( - primary_model, - fallback_model, - invoke_fn: Callable[..., Awaitable[T]], - *args, - **kwargs, -) -> tuple[T | None, str, str | None]: - """Invoke LLM with automatic failover from primary to fallback. 
- - Args: - primary_model: The primary LLM model config (can be None) - fallback_model: The fallback LLM model config (can be None) - invoke_fn: Async function to call the LLM (e.g., client.complete) - *args, **kwargs: Arguments to pass to invoke_fn - - Returns: - Tuple of (result, model_used, error) - - result: The LLM response or None if both failed - - model_used: "primary", "fallback", or "none" - - error: Error message if both failed, None otherwise - """ - # Config-level fallback: if no primary, use fallback directly - if primary_model is None and fallback_model is not None: - logger.info("[Failover] Primary model not configured, using fallback directly") - primary_model = fallback_model - fallback_model = None - - if primary_model is None: - return None, "none", "No LLM model configured (primary or fallback)" - - # Try primary model - try: - logger.debug(f"[Failover] Invoking primary model: {primary_model.provider}/{primary_model.model}") - result = await invoke_fn(*args, **kwargs) - return result, "primary", None - except Exception as e: - error_type = classify_error(e) - error_msg = str(e) or repr(e) - - logger.warning( - f"[Failover] Primary model failed ({error_type.value}): {error_msg[:150]}" - ) - - # Non-retryable errors: don't attempt fallback - if error_type == FailoverErrorType.NON_RETRYABLE: - logger.info("[Failover] Non-retryable error, not attempting fallback") - return None, "none", f"Primary failed (non-retryable): {error_msg}" - - # No fallback available - if fallback_model is None: - logger.warning("[Failover] No fallback model available") - return None, "none", f"Primary failed: {error_msg}" - - # Runtime fallback: retry with fallback model - logger.info(f"[Failover] Retrying with fallback model: {fallback_model.provider}/{fallback_model.model}") - - try: - # Update kwargs with fallback model if needed - if "model" in kwargs: - kwargs["model"] = fallback_model - - result = await invoke_fn(*args, **kwargs) - logger.info("[Failover] Fallback 
model succeeded") - return result, "fallback", None - - except Exception as e2: - error_msg2 = str(e2) or repr(e2) - logger.error(f"[Failover] Fallback model also failed: {error_msg2[:150]}") - return None, "none", f"Primary: {error_msg[:80]} | Fallback: {error_msg2[:80]}" - - -async def call_llm_with_failover( - primary_model, - fallback_model, - messages: list[LLMMessage], - tools: list | None = None, - temperature: float = 0.7, - max_tokens: int | None = None, - timeout: float = 120.0, - stream: bool = False, - on_chunk=None, - on_thinking=None, -) -> tuple[LLMResponse | None, str, str | None]: - """Call LLM with automatic failover support. - - This is the unified entry point for all LLM calls with failover. - - Args: - primary_model: Primary LLM model config - fallback_model: Fallback LLM model config - messages: List of LLMMessage - tools: Optional tool definitions - temperature: Sampling temperature - max_tokens: Max output tokens - timeout: Request timeout - stream: Whether to use streaming API - on_chunk: Callback for streaming chunks - on_thinking: Callback for thinking/reasoning content - - Returns: - Tuple of (response, model_used, error) - """ - async def _invoke(model): - client = create_llm_client( - provider=model.provider, - api_key=model.api_key_encrypted, - model=model.model, - base_url=model.base_url, - timeout=timeout, - ) - - _max_tokens = max_tokens or get_max_tokens( - model.provider, model.model, getattr(model, "max_output_tokens", None) - ) - - try: - if stream: - response = await client.stream( - messages=messages, - tools=tools, - temperature=temperature, - max_tokens=_max_tokens, - on_chunk=on_chunk, - on_thinking=on_thinking, - ) - else: - response = await client.complete( - messages=messages, - tools=tools, - temperature=temperature, - max_tokens=_max_tokens, - ) - return response - finally: - await client.close() - - return await invoke_with_failover(primary_model, fallback_model, _invoke, primary_model) - - -# Backward compatibility: 
re-export for convenience __all__ = [ "FailoverErrorType", - "FailoverResult", "classify_error", - "invoke_with_failover", - "call_llm_with_failover", -] +] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 05320f3b..6f4562d4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -61,7 +61,9 @@ services: max-size: "10m" max-file: "3" frontend: - build: ./frontend + build: + context: . + dockerfile: frontend/Dockerfile restart: unless-stopped ports: - "${FRONTEND_PORT:-3008}:3000" diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 7d8b64e1..52a917a4 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -2,7 +2,8 @@ FROM node:20-alpine AS build WORKDIR /app COPY package*.json ./ RUN npm ci --registry https://registry.npmmirror.com -COPY . . +COPY frontend/. . +COPY VERSION ./VERSION RUN npm run build FROM nginx:alpine diff --git a/frontend/src/pages/EnterpriseSettings.tsx b/frontend/src/pages/EnterpriseSettings.tsx index d3fa2734..e8b8ac0f 100644 --- a/frontend/src/pages/EnterpriseSettings.tsx +++ b/frontend/src/pages/EnterpriseSettings.tsx @@ -2,6 +2,7 @@ import { useState, useEffect, useMemo } from 'react'; import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; import { useTranslation } from 'react-i18next'; import { enterpriseApi, skillApi } from '../services/api'; +import { useAuthStore } from '../stores'; import PromptModal from '../components/PromptModal'; import FileBrowser from '../components/FileBrowser'; import type { FileBrowserApi } from '../components/FileBrowser'; @@ -865,29 +866,34 @@ const COMMON_TIMEZONES = [ function CompanyTimezoneEditor() { const { t } = useTranslation(); - const tenantId = localStorage.getItem('current_tenant_id') || ''; + const user = useAuthStore((s) => s.user); + const tenantId = user?.tenant_id || localStorage.getItem('current_tenant_id') || ''; const [timezone, setTimezone] = useState('UTC'); const [saving, setSaving] = useState(false); const 
[saved, setSaved] = useState(false); + const [error, setError] = useState(''); useEffect(() => { if (!tenantId) return; fetchJson(`/tenants/${tenantId}`) .then(d => { if (d?.timezone) setTimezone(d.timezone); }) - .catch(() => { }); + .catch((e: any) => setError(e.message || 'Failed to load timezone')); }, [tenantId]); const handleSave = async (tz: string) => { if (!tenantId) return; setTimezone(tz); setSaving(true); + setError(''); try { await fetchJson(`/tenants/${tenantId}`, { method: 'PUT', body: JSON.stringify({ timezone: tz }), }); setSaved(true); setTimeout(() => setSaved(false), 2000); - } catch (e) { } + } catch (e: any) { + setError(e.message || 'Failed to save timezone'); + } setSaving(false); }; @@ -899,13 +905,23 @@ function CompanyTimezoneEditor() {
{t('enterprise.timezone.description', 'Default timezone for all agents. Agents can override individually.')}
+ {error && ( +
+ ⚠ {error} +
+ )} + {!tenantId && ( +
+ ⚠ {t('enterprise.timezone.noTenant', 'No company selected. Please refresh the page or contact support.')} +
+ )}