Merged

Changes from all commits (46 commits)
d149370
Add streaming support for Responses API
enyst Oct 19, 2025
d331abf
Document LLM streaming refactor plan
enyst Oct 20, 2025
e31b728
Refactor streaming chunk model and visualizer
enyst Oct 20, 2025
3983ce4
Merge remote-tracking branch 'upstream/main' into streaming-responses
enyst Oct 21, 2025
a341d0e
Merge remote-tracking branch 'upstream/main' into streaming-responses
enyst Oct 21, 2025
031fcf1
Merge remote-tracking branch 'upstream/main' into streaming-responses
enyst Oct 23, 2025
287c9c2
Merge branch 'main' into streaming-responses
enyst Oct 23, 2025
21bcaa5
Merge main branch into streaming-responses
openhands-agent Nov 14, 2025
f920696
Merge branch 'main' into streaming-responses
enyst Nov 20, 2025
a65dbda
Simplify streaming visualizer and always-persist streaming panels
enyst Nov 20, 2025
27f9653
Merge main into streaming-responses and resolve conflicts
openhands-agent Nov 25, 2025
dbbd0cf
Fix merge conflicts and type errors after merging main
openhands-agent Nov 25, 2025
7ac405d
Fix circular import and update tests for streaming API
openhands-agent Nov 25, 2025
847eaaa
Trigger CI re-run
openhands-agent Nov 25, 2025
80c06f7
remove md
xingyaoww Nov 26, 2025
9859171
rename example
xingyaoww Nov 26, 2025
71fce09
make LLMStreamChunk a basemodel
xingyaoww Nov 26, 2025
6a67bac
clean up some merges
xingyaoww Nov 26, 2025
ab8961a
simplify local convo and remove streaming event since that's probably…
xingyaoww Nov 26, 2025
fa57f08
update the right init
xingyaoww Nov 26, 2025
66e2092
rm streaming visualizer
xingyaoww Nov 26, 2025
9d1914c
some attempt to simplify
xingyaoww Nov 26, 2025
2491734
revert facts
xingyaoww Nov 26, 2025
777f4de
remove extra tests
xingyaoww Nov 26, 2025
db995d8
implement chat completion streaming
xingyaoww Nov 26, 2025
06cf551
fix
xingyaoww Nov 26, 2025
95622ba
fix chunk
xingyaoww Nov 26, 2025
f7a07fa
simplify example
xingyaoww Nov 26, 2025
df87e8e
get streaming example to work!
xingyaoww Nov 26, 2025
d7734c6
ignore warnings
xingyaoww Nov 26, 2025
5b6a58b
Fix failing tests and pre-commit checks for streaming implementation
openhands-agent Nov 26, 2025
38e2fd6
update streaming example
xingyaoww Nov 26, 2025
f34ccd1
Merge branch 'main' into xw/completions-streaming
xingyaoww Nov 26, 2025
7e7fd35
remove unused metadata
xingyaoww Nov 26, 2025
7f8cd32
Update openhands-sdk/openhands/sdk/conversation/impl/local_conversati…
xingyaoww Nov 26, 2025
f223f05
revert loop
xingyaoww Nov 26, 2025
767741e
Merge commit '7f8cd32533928a41f00e06675d003d3b2c34cc92' into xw/compl…
xingyaoww Nov 26, 2025
39db2f3
move imports
xingyaoww Nov 26, 2025
1753bbc
Revert "move imports"
xingyaoww Nov 26, 2025
48584ab
add a comment
xingyaoww Nov 26, 2025
8ee4341
report example cost
xingyaoww Nov 26, 2025
cd1bbb0
revert tests for responses API which is not implemented yet
xingyaoww Nov 26, 2025
ca4418e
Fix failing tests to match streaming implementation
openhands-agent Nov 26, 2025
c7819bf
Replace Responses API streaming tests with Chat Completion streaming …
openhands-agent Nov 26, 2025
ccfb3e6
Merge branch 'main' into xw/completions-streaming
xingyaoww Nov 26, 2025
8be913e
Remove unnecessary metadata mocking from test_agent_utils
openhands-agent Nov 26, 2025
131 changes: 131 additions & 0 deletions examples/01_standalone_sdk/29_llm_streaming.py
@@ -0,0 +1,131 @@
import os
import sys
from typing import Literal

from pydantic import SecretStr

from openhands.sdk import (
    Conversation,
    get_logger,
)
from openhands.sdk.llm import LLM
from openhands.sdk.llm.streaming import ModelResponseStream
from openhands.tools.preset.default import get_default_agent


logger = get_logger(__name__)


api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set LLM_API_KEY or OPENAI_API_KEY in your environment.")

model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
base_url = os.getenv("LLM_BASE_URL")
llm = LLM(
    model=model,
    api_key=SecretStr(api_key),
    base_url=base_url,
    usage_id="stream-demo",
    stream=True,
)

agent = get_default_agent(llm=llm, cli_mode=True)


# Define streaming states
StreamingState = Literal["thinking", "content", "tool_name", "tool_args"]
# Track state across on_token calls for boundary detection
_current_state: StreamingState | None = None


[Inline review thread on on_token]

Collaborator: Oh, I think this maybe belongs in the visualizer? Otherwise it doesn't work for anything else; every client would need to rewrite this.

Collaborator (author): I think getting streaming supported in the visualizer is probably a bit too advanced for now: we'd need to figure out edge cases and come up with a standard data structure for streaming responses, since different models may return different things and LiteLLM did not unify this. In this PR I was mainly hoping to get the scaffold / initial MVP for streaming, without going too deep into the rabbit hole, to keep the PR size and scope reasonable 🤣

Maybe we can do a visualizer in a later PR; in the meantime it's good to at least have some level of streaming ability.

Collaborator: Yes! I was actually thinking about this in the back of my mind, and I think that is totally the way to go. Let's get this in and take it from here.

The essential structure is fine, and as it is, I think it unlocks some potential for client developers to build further or improve.

def on_token(chunk: ModelResponseStream) -> None:
"""
Handle all types of streaming tokens including content,
tool calls, and thinking blocks with dynamic boundary detection.
"""
global _current_state

choices = chunk.choices
for choice in choices:
delta = choice.delta
if delta is not None:
# Handle thinking blocks (reasoning content)
reasoning_content = getattr(delta, "reasoning_content", None)
if isinstance(reasoning_content, str) and reasoning_content:
if _current_state != "thinking":
if _current_state is not None:
sys.stdout.write("\n")
sys.stdout.write("THINKING: ")
_current_state = "thinking"
sys.stdout.write(reasoning_content)
sys.stdout.flush()

# Handle regular content
content = getattr(delta, "content", None)
if isinstance(content, str) and content:
if _current_state != "content":
if _current_state is not None:
sys.stdout.write("\n")
sys.stdout.write("CONTENT: ")
_current_state = "content"
sys.stdout.write(content)
sys.stdout.flush()

# Handle tool calls
tool_calls = getattr(delta, "tool_calls", None)
if tool_calls:
for tool_call in tool_calls:
tool_name = (
tool_call.function.name if tool_call.function.name else ""
)
tool_args = (
tool_call.function.arguments
if tool_call.function.arguments
else ""
)
if tool_name:
if _current_state != "tool_name":
if _current_state is not None:
sys.stdout.write("\n")
sys.stdout.write("TOOL NAME: ")
_current_state = "tool_name"
sys.stdout.write(tool_name)
sys.stdout.flush()
if tool_args:
if _current_state != "tool_args":
if _current_state is not None:
sys.stdout.write("\n")
sys.stdout.write("TOOL ARGS: ")
_current_state = "tool_args"
sys.stdout.write(tool_args)
sys.stdout.flush()


conversation = Conversation(
    agent=agent,
    workspace=os.getcwd(),
    token_callbacks=[on_token],
)

story_prompt = (
"Tell me a long story about LLM streaming, write it a file, "
"make sure it has multiple paragraphs. "
)
conversation.send_message(story_prompt)
print("Token Streaming:")
print("-" * 100 + "\n")
conversation.run()

cleanup_prompt = (
"Thank you. Please delete the streaming story file now that I've read it, "
"then confirm the deletion."
)
conversation.send_message(cleanup_prompt)
print("Token Streaming:")
print("-" * 100 + "\n")
conversation.run()

# Report cost
cost = llm.metrics.accumulated_cost
print(f"EXAMPLE_COST: {cost}")
4 changes: 4 additions & 0 deletions openhands-sdk/openhands/sdk/__init__.py
@@ -21,11 +21,13 @@
    LLM,
    ImageContent,
    LLMRegistry,
    LLMStreamChunk,
    Message,
    RedactedThinkingBlock,
    RegistryEvent,
    TextContent,
    ThinkingBlock,
    TokenCallbackType,
)
from openhands.sdk.logger import get_logger
from openhands.sdk.mcp import (
@@ -58,6 +60,8 @@
__all__ = [
"LLM",
"LLMRegistry",
"LLMStreamChunk",
"TokenCallbackType",
"ConversationStats",
"RegistryEvent",
"Message",
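With these exports in place, client code can import the streaming types from the package root. A small sketch; which of LLMStreamChunk or ModelResponseStream a token callback actually receives is not shown in this diff, so treat the annotation as an assumption:

from openhands.sdk import LLMStreamChunk, TokenCallbackType


def log_chunk(chunk: LLMStreamChunk) -> None:
    # Forward or inspect the raw streaming chunk here.
    print(type(chunk).__name__)


callback: TokenCallbackType = log_chunk  # assumed to satisfy the public alias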
7 changes: 6 additions & 1 deletion openhands-sdk/openhands/sdk/agent/agent.py
@@ -13,6 +13,7 @@
from openhands.sdk.conversation import (
    ConversationCallbackType,
    ConversationState,
    ConversationTokenCallbackType,
    LocalConversation,
)
from openhands.sdk.conversation.state import ConversationExecutionStatus
@@ -135,6 +136,7 @@ def step(
        self,
        conversation: LocalConversation,
        on_event: ConversationCallbackType,
        on_token: ConversationTokenCallbackType | None = None,
    ) -> None:
        state = conversation.state
        # Check for pending actions (implicit confirmation)
@@ -167,7 +169,10 @@

        try:
            llm_response = make_llm_completion(
-                self.llm, _messages, tools=list(self.tools_map.values())
+                self.llm,
+                _messages,
+                tools=list(self.tools_map.values()),
+                on_token=on_token,
            )
        except FunctionCallValidationError as e:
            logger.warning(f"LLM generated malformed function call: {e}")
9 changes: 8 additions & 1 deletion openhands-sdk/openhands/sdk/agent/base.py
@@ -20,7 +20,10 @@

if TYPE_CHECKING:
    from openhands.sdk.conversation import ConversationState, LocalConversation
-    from openhands.sdk.conversation.types import ConversationCallbackType
+    from openhands.sdk.conversation.types import (
+        ConversationCallbackType,
+        ConversationTokenCallbackType,
+    )


logger = get_logger(__name__)
@@ -239,6 +242,7 @@ def step(
        self,
        conversation: "LocalConversation",
        on_event: "ConversationCallbackType",
        on_token: "ConversationTokenCallbackType | None" = None,
    ) -> None:
        """Taking a step in the conversation.

@@ -250,6 +254,9 @@
        4.1 If conversation is finished, set state.execution_status to FINISHED
        4.2 Otherwise, just return, Conversation will kick off the next step

        If the underlying LLM supports streaming, partial deltas are forwarded to
        ``on_token`` before the full response is returned.

        NOTE: state will be mutated in-place.
        """

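Since on_token defaults to None, existing callers of step keep working. A minimal sketch of driving one step with both callbacks, assuming an agent and a LocalConversation named conversation already exist:

# Sketch: one agent step with an event callback and a token callback.
def on_event(event) -> None:
    print(f"[event] {type(event).__name__}")


def on_token(chunk) -> None:
    print("[token]", end="", flush=True)


agent.step(conversation, on_event=on_event, on_token=on_token)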
5 changes: 5 additions & 0 deletions openhands-sdk/openhands/sdk/agent/utils.py
@@ -12,6 +12,7 @@

from openhands.sdk.context.condenser.base import CondenserBase
from openhands.sdk.context.view import View
from openhands.sdk.conversation.types import ConversationTokenCallbackType
from openhands.sdk.event.base import Event, LLMConvertibleEvent
from openhands.sdk.event.condenser import Condensation
from openhands.sdk.llm import LLM, LLMResponse, Message
@@ -182,13 +183,15 @@ def make_llm_completion(
    llm: LLM,
    messages: list[Message],
    tools: list[ToolDefinition] | None = None,
    on_token: ConversationTokenCallbackType | None = None,
) -> LLMResponse:
    """Make an LLM completion call with the provided messages and tools.

    Args:
        llm: The LLM instance to use for completion
        messages: The messages to send to the LLM
        tools: Optional list of tools to provide to the LLM
        on_token: Optional callback for streaming token updates

    Returns:
        LLMResponse from the LLM completion call
@@ -200,10 +203,12 @@
            include=None,
            store=False,
            add_security_risk_prediction=True,
            on_token=on_token,
        )
    else:
        return llm.completion(
            messages=messages,
            tools=tools or [],
            add_security_risk_prediction=True,
            on_token=on_token,
        )
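
A hedged sketch of calling this helper directly with a streaming callback; the Message/TextContent construction follows the exports shown earlier in this diff, but the exact message shape is an assumption:

from openhands.sdk import Message, TextContent
from openhands.sdk.agent.utils import make_llm_completion


def print_text(chunk) -> None:
    # Print only text deltas; tool-call chunks are ignored for brevity.
    for choice in chunk.choices:
        delta = choice.delta
        if delta is not None and getattr(delta, "content", None):
            print(delta.content, end="", flush=True)


# `llm` is an LLM configured with stream=True, as in the example above.
response = make_llm_completion(
    llm,
    messages=[Message(role="user", content=[TextContent(text="Say hi.")])],
    tools=[],
    on_token=print_text,
)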
6 changes: 5 additions & 1 deletion openhands-sdk/openhands/sdk/conversation/__init__.py
@@ -11,7 +11,10 @@
    ConversationState,
)
from openhands.sdk.conversation.stuck_detector import StuckDetector
-from openhands.sdk.conversation.types import ConversationCallbackType
+from openhands.sdk.conversation.types import (
+    ConversationCallbackType,
+    ConversationTokenCallbackType,
+)
from openhands.sdk.conversation.visualizer import (
    ConversationVisualizerBase,
    DefaultConversationVisualizer,
@@ -24,6 +27,7 @@
"ConversationState",
"ConversationExecutionStatus",
"ConversationCallbackType",
"ConversationTokenCallbackType",
"DefaultConversationVisualizer",
"ConversationVisualizerBase",
"SecretRegistry",
21 changes: 15 additions & 6 deletions openhands-sdk/openhands/sdk/conversation/base.py
@@ -1,12 +1,16 @@
from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping
from pathlib import Path
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Protocol, TypeVar, cast

from openhands.sdk.conversation.conversation_stats import ConversationStats
from openhands.sdk.conversation.events_list_base import EventsListBase
from openhands.sdk.conversation.secret_registry import SecretValue
-from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID
+from openhands.sdk.conversation.types import (
+    ConversationCallbackType,
+    ConversationID,
+    ConversationTokenCallbackType,
+)
from openhands.sdk.llm.llm import LLM
from openhands.sdk.llm.message import Message
from openhands.sdk.observability.laminar import (
@@ -27,6 +31,13 @@
    from openhands.sdk.conversation.state import ConversationExecutionStatus


CallbackType = TypeVar(
"CallbackType",
ConversationCallbackType,
ConversationTokenCallbackType,
)


class ConversationStateProtocol(Protocol):
"""Protocol defining the interface for conversation state objects."""

@@ -235,9 +246,7 @@ def ask_agent(self, question: str) -> str:
        ...

    @staticmethod
-    def compose_callbacks(
-        callbacks: Iterable[ConversationCallbackType],
-    ) -> ConversationCallbackType:
+    def compose_callbacks(callbacks: Iterable[CallbackType]) -> CallbackType:
"""Compose multiple callbacks into a single callback function.

        Args:
@@ -252,4 +261,4 @@ def composed(event) -> None:
            if cb:
                cb(event)

-        return composed
+        return cast(CallbackType, composed)
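
Because the TypeVar is constrained to both callback kinds, compose_callbacks now composes event callbacks and token callbacks alike without losing the static type. A small sketch:

from openhands.sdk.conversation.base import BaseConversation


def log_chunk(chunk) -> None:
    print(".", end="", flush=True)


def buffer_chunk(chunk) -> None:
    pass  # e.g. append the delta to an in-memory buffer


# The composed callable has the same callback type as its inputs, so it can be
# passed anywhere a single ConversationTokenCallbackType is expected.
combined = BaseConversation.compose_callbacks([log_chunk, buffer_chunk])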
11 changes: 10 additions & 1 deletion openhands-sdk/openhands/sdk/conversation/conversation.py
@@ -4,7 +4,11 @@
from openhands.sdk.agent.base import AgentBase
from openhands.sdk.conversation.base import BaseConversation
from openhands.sdk.conversation.secret_registry import SecretValue
-from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID
+from openhands.sdk.conversation.types import (
+    ConversationCallbackType,
+    ConversationID,
+    ConversationTokenCallbackType,
+)
from openhands.sdk.conversation.visualizer import (
    ConversationVisualizerBase,
    DefaultConversationVisualizer,
@@ -49,6 +53,7 @@ def __new__(
        persistence_dir: str | Path | None = None,
        conversation_id: ConversationID | None = None,
        callbacks: list[ConversationCallbackType] | None = None,
        token_callbacks: list[ConversationTokenCallbackType] | None = None,
        max_iteration_per_run: int = 500,
        stuck_detection: bool = True,
        visualizer: (
@@ -65,6 +70,7 @@
        workspace: RemoteWorkspace,
        conversation_id: ConversationID | None = None,
        callbacks: list[ConversationCallbackType] | None = None,
        token_callbacks: list[ConversationTokenCallbackType] | None = None,
        max_iteration_per_run: int = 500,
        stuck_detection: bool = True,
        visualizer: (
@@ -81,6 +87,7 @@
        persistence_dir: str | Path | None = None,
        conversation_id: ConversationID | None = None,
        callbacks: list[ConversationCallbackType] | None = None,
        token_callbacks: list[ConversationTokenCallbackType] | None = None,
        max_iteration_per_run: int = 500,
        stuck_detection: bool = True,
        visualizer: (
@@ -104,6 +111,7 @@
            agent=agent,
            conversation_id=conversation_id,
            callbacks=callbacks,
            token_callbacks=token_callbacks,
            max_iteration_per_run=max_iteration_per_run,
            stuck_detection=stuck_detection,
            visualizer=visualizer,
@@ -115,6 +123,7 @@
            agent=agent,
            conversation_id=conversation_id,
            callbacks=callbacks,
            token_callbacks=token_callbacks,
            max_iteration_per_run=max_iteration_per_run,
            stuck_detection=stuck_detection,
            visualizer=visualizer,