diff --git a/docs.json b/docs.json
index dc0e6e64..19e0d0dd 100644
--- a/docs.json
+++ b/docs.json
@@ -269,6 +269,7 @@
             "sdk/guides/agent-server/api-sandbox",
             "sdk/guides/agent-server/cloud-workspace",
             "sdk/guides/agent-server/custom-tools",
+            "sdk/guides/agent-server/settings-secrets-api",
             {
               "group": "API Reference",
               "openapi": {
diff --git a/sdk/guides/agent-acp.mdx b/sdk/guides/agent-acp.mdx
index b4ac3ca0..292fd5fe 100644
--- a/sdk/guides/agent-acp.mdx
+++ b/sdk/guides/agent-acp.mdx
@@ -185,7 +185,8 @@ This example is available on GitHub: [examples/01_standalone_sdk/40_acp_agent_ex
 This example shows how to use an ACP-compatible server (claude-agent-acp)
 as the agent backend instead of direct LLM calls. It also demonstrates
 ``ask_agent()`` — a stateless side-question that forks the ACP session
-and leaves the main conversation untouched.
+and leaves the main conversation untouched — and sending an image alongside
+text to verify multimodal (vision) input support.
 
 Prerequisites:
 - Node.js / npx available
@@ -197,23 +198,41 @@ Usage:
 
 import os
 
+from openhands.sdk import ImageContent, Message, TextContent
 from openhands.sdk.agent import ACPAgent
 from openhands.sdk.conversation import Conversation
 
+IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png"
+
 agent = ACPAgent(acp_command=["npx", "-y", "@agentclientprotocol/claude-agent-acp"])
 
 try:
     cwd = os.getcwd()
     conversation = Conversation(agent=agent, workspace=cwd)
 
-    # --- Main conversation turn ---
+    # --- Main conversation turn (text only) ---
     conversation.send_message(
         "List the Python source files under openhands-sdk/openhands/sdk/agent/, "
         "then read the __init__.py and summarize what agent classes are exported."
     )
     conversation.run()
 
+    # --- Image input turn (text + image) ---
+    print("\n--- image input ---")
+    conversation.send_message(
+        Message(
+            role="user",
+            content=[
+                TextContent(
+                    text="Describe what you see in this image in one sentence."
+                ),
+                ImageContent(image_urls=[IMAGE_URL]),
+            ],
+        )
+    )
+    conversation.run()
+
     # --- ask_agent: stateless side-question via fork_session ---
     print("\n--- ask_agent ---")
     response = conversation.ask_agent(
@@ -294,9 +313,9 @@ os.environ["ANTHROPIC_API_KEY"] = llm_api_key
 
 runtime_api_key = os.getenv("RUNTIME_API_KEY")
 assert runtime_api_key, "RUNTIME_API_KEY required"
 
-# If GITHUB_SHA is set (e.g. running in CI of a PR), use that to ensure consistency
-# Otherwise, use the latest image from main
-server_image_sha = os.getenv("GITHUB_SHA") or "main"
+# SDK_SHA is the canonical commit SHA set by CI workflows (avoids the
+# built-in GITHUB_SHA which resolves to the merge-commit on PRs).
+server_image_sha = os.getenv("SDK_SHA") or os.getenv("GITHUB_SHA") or "main"
 server_image = f"ghcr.io/openhands/agent-server:{server_image_sha[:7]}-python-amd64"
 logger.info(f"Using server image: {server_image}")
 
diff --git a/sdk/guides/agent-delegation.mdx b/sdk/guides/agent-delegation.mdx
index dac6a8f4..d043bd93 100644
--- a/sdk/guides/agent-delegation.mdx
+++ b/sdk/guides/agent-delegation.mdx
@@ -179,15 +179,13 @@ from openhands.sdk import (
 from openhands.sdk.context import Skill
 from openhands.sdk.subagent import register_agent
 from openhands.sdk.tool import register_tool
+from openhands.tools import register_builtins_agents
 from openhands.tools.delegate import (
     DelegateTool,
     DelegationVisualizer,
 )
-from openhands.tools.preset.default import get_default_tools, register_builtins_agents
-
-ONLY_RUN_SIMPLE_DELEGATION = False
 
 logger = get_logger(__name__)
 
 # Configure LLM and agent
@@ -198,91 +196,6 @@ llm = LLM(
     usage_id="agent",
 )
 
-cwd = os.getcwd()
-
-tools = get_default_tools(enable_browser=True)
-tools.append(Tool(name=DelegateTool.name))
-register_builtins_agents()
-
-main_agent = Agent(
-    llm=llm,
-    tools=tools,
-)
-conversation = Conversation(
-    agent=main_agent,
-    workspace=cwd,
-    visualizer=DelegationVisualizer(name="Delegator"),
-)
-
-conversation.send_message(
-    "Forget about coding. Let's switch to travel planning. "
-    "Let's plan a trip to London. I have two issues I need to solve: "
-    "Lodging: what are the best areas to stay at while keeping budget in mind? "
-    "Activities: what are the top 5 must-see attractions and hidden gems? "
-    "Please use the delegation tools to handle these two tasks in parallel. "
-    "Make sure the sub-agents use their own knowledge "
-    "and dont rely on internet access. "
-    "They should keep it short. After getting the results, merge both analyses "
-    "into a single consolidated report.\n\n"
-)
-conversation.run()
-
-conversation.send_message(
-    "Ask the lodging sub-agent what it thinks about Covent Garden."
-)
-conversation.run()
-
-# Report cost for simple delegation example
-cost_simple = conversation.conversation_stats.get_combined_metrics().accumulated_cost
-print(f"EXAMPLE_COST (simple delegation): {cost_simple}")
-
-print("Simple delegation example done!", "\n" * 20)
-
-if ONLY_RUN_SIMPLE_DELEGATION:
-    # For CI: always emit the EXAMPLE_COST marker before exiting.
-    print(f"EXAMPLE_COST: {cost_simple}")
-    exit(0)
-
-
-# -------- Agent Delegation Second Part: Built-in Agent Types (Explore + Bash) --------
-
-main_agent = Agent(
-    llm=llm,
-    tools=[Tool(name=DelegateTool.name)],
-)
-conversation = Conversation(
-    agent=main_agent,
-    workspace=cwd,
-    visualizer=DelegationVisualizer(name="Delegator (builtins)"),
-)
-
-builtin_task_message = (
-    "Demonstrate SDK built-in sub-agent types. "
-    "1) Spawn an 'explore' sub-agent and ask it to list the markdown files in "
-    "openhands-sdk/openhands/sdk/subagent/builtins/ and summarize what each "
-    "built-in agent type is for (based on the file contents). "
-    "2) Spawn a 'bash' sub-agent and ask it to run `python --version` in the "
-    "terminal and return the exact output. "
-    "3) Merge both results into a short report. "
-    "Do not use internet access."
-)
-
-print("=" * 100)
-print("Demonstrating built-in agent delegation (explore + bash)...")
-print("=" * 100)
-
-conversation.send_message(builtin_task_message)
-conversation.run()
-
-# Report cost for builtin agent types example
-cost_builtin = conversation.conversation_stats.get_combined_metrics().accumulated_cost
-print(f"EXAMPLE_COST (builtin agents): {cost_builtin}")
-
-print("Built-in agent delegation example done!", "\n" * 20)
-
-
-# -------- Agent Delegation Third Part: User-Defined Agent Types --------
-
 
 def create_lodging_planner(llm: LLM) -> Agent:
     """Create a lodging planner focused on London stays."""
@@ -343,6 +256,7 @@ register_agent(
     factory_func=create_activities_planner,
     description="Creates time-efficient London activity itineraries.",
 )
+register_builtins_agents()
 
 # Make the delegation tool available to the main agent
 register_tool("DelegateTool", DelegateTool)
@@ -353,26 +267,26 @@ main_agent = Agent(
 )
 conversation = Conversation(
     agent=main_agent,
-    workspace=cwd,
+    workspace=os.getcwd(),
     visualizer=DelegationVisualizer(name="Delegator"),
 )
 
-task_message = (
-    "Plan a 3-day London trip. "
-    "1) Spawn two sub-agents: lodging_planner (hotel options) and "
-    "activities_planner (itinerary). "
-    "2) Ask lodging_planner for 3-4 central London hotel recommendations with "
-    "neighborhoods, quick pros/cons, and transit notes by budget. "
-    "3) Ask activities_planner for a concise 3-day itinerary with nearby stops, "
-    "    food/coffee suggestions, and any ticket/reservation notes. "
-    "4) Share both sub-agent results and propose a combined plan."
-)
-
 print("=" * 100)
 print("Demonstrating London trip delegation (lodging + activities)...")
 print("=" * 100)
 
-conversation.send_message(task_message)
+conversation.send_message("""
+Let's plan a trip to London. I have two specific areas to address:
+
+Lodging: What are the best areas to stay in while keeping a budget in mind?
+Activities: What are the top five must-see attractions and hidden gems?
+
+Please use delegation tools to handle these two tasks in parallel.
+Ensure the sub-agents use their own internal knowledge and do not
+rely on internet access. Keep the responses concise.
+Once you have the results, use the bash sub-agent to write a file
+named london_trip_report.txt containing the findings in the working directory.
+""")
 conversation.run()
 
 conversation.send_message(
@@ -384,12 +298,9 @@ conversation.run()
 cost_user_defined = (
     conversation.conversation_stats.get_combined_metrics().accumulated_cost
 )
-print(f"EXAMPLE_COST (user-defined agents): {cost_user_defined}")
+print(f"EXAMPLE_COST: {cost_user_defined}")
 
 print("All done!")
-
-# Full example cost report for CI workflow
-print(f"EXAMPLE_COST: {cost_simple + cost_builtin + cost_user_defined}")
 ```
diff --git a/sdk/guides/agent-file-based.mdx b/sdk/guides/agent-file-based.mdx
index f8f5f0f7..ed576c4b 100644
--- a/sdk/guides/agent-file-based.mdx
+++ b/sdk/guides/agent-file-based.mdx
@@ -518,7 +518,7 @@ grammar_checker = AgentDefinition(
 register_agent(
     name=grammar_checker.name,
     factory_func=agent_definition_to_factory(grammar_checker),
-    description=grammar_checker,
+    description=grammar_checker.description,
 )
 
 # 3. Set up the orchestrator agent with the DelegateTool
diff --git a/sdk/guides/agent-server/api-sandbox.mdx b/sdk/guides/agent-server/api-sandbox.mdx
index 2fea0916..79d47705 100644
--- a/sdk/guides/agent-server/api-sandbox.mdx
+++ b/sdk/guides/agent-server/api-sandbox.mdx
@@ -151,9 +151,9 @@ if not runtime_api_key:
     exit(1)
 
-# If GITHUB_SHA is set (e.g. running in CI of a PR), use that to ensure consistency
-# Otherwise, use the latest image from main
-server_image_sha = os.getenv("GITHUB_SHA") or "main"
+# SDK_SHA is the canonical commit SHA set by CI workflows (avoids the
+# built-in GITHUB_SHA which resolves to the merge-commit on PRs).
+server_image_sha = os.getenv("SDK_SHA") or os.getenv("GITHUB_SHA") or "main"
 server_image = f"ghcr.io/openhands/agent-server:{server_image_sha[:7]}-python-amd64"
 logger.info(f"Using server image: {server_image}")
 
diff --git a/sdk/guides/agent-server/apptainer-sandbox.mdx b/sdk/guides/agent-server/apptainer-sandbox.mdx
index 19adceb0..411920a0 100644
--- a/sdk/guides/agent-server/apptainer-sandbox.mdx
+++ b/sdk/guides/agent-server/apptainer-sandbox.mdx
@@ -77,11 +77,11 @@ def get_server_image():
     """Get the server image tag, using PR-specific image in CI."""
     platform_str = detect_platform()
     arch = "arm64" if "arm64" in platform_str else "amd64"
-    # If GITHUB_SHA is set (e.g. running in CI of a PR), use that to ensure consistency
-    # Otherwise, use the latest image from main
-    github_sha = os.getenv("GITHUB_SHA")
-    if github_sha:
-        return f"ghcr.io/openhands/agent-server:{github_sha[:7]}-python-{arch}"
+    # SDK_SHA is the canonical commit SHA set by CI workflows (avoids the
+    # built-in GITHUB_SHA which resolves to the merge-commit on PRs).
+    sha = os.getenv("SDK_SHA") or os.getenv("GITHUB_SHA")
+    if sha:
+        return f"ghcr.io/openhands/agent-server:{sha[:7]}-python-{arch}"
     return "ghcr.io/openhands/agent-server:latest-python"
 
@@ -95,7 +95,7 @@ logger.info(f"Using server image: {server_image}")
 
 with ApptainerWorkspace(
     # use pre-built image for faster startup
     server_image=server_image,
-    host_port=8010,
+    # host_port auto-selects an available port when not specified
     platform=detect_platform(),
 ) as workspace:
     # 3) Create agent
diff --git a/sdk/guides/agent-server/cloud-workspace.mdx b/sdk/guides/agent-server/cloud-workspace.mdx
index 43cc07e9..3900c3a9 100644
--- a/sdk/guides/agent-server/cloud-workspace.mdx
+++ b/sdk/guides/agent-server/cloud-workspace.mdx
@@ -310,10 +310,9 @@ with OpenHandsCloudWorkspace(
     cloud_api_key=cloud_api_key,
 ) as workspace:
     # --- LLM from SaaS account settings ---
-    # get_llm() calls GET /users/me?expose_secrets=true,
-    # sending your Cloud API key plus the sandbox session
-    # key that OpenHands Cloud issued for this workspace.
-    # It returns a fully configured LLM instance.
+    # get_llm() calls GET /users/me?expose_secrets=true
+    # (dual auth: Bearer + session key) and returns a
+    # fully configured LLM instance.
     # Override any parameter: workspace.get_llm(model="gpt-4o")
     llm = workspace.get_llm()
     logger.info(f"LLM configured: model={llm.model}")
diff --git a/sdk/guides/agent-server/docker-sandbox.mdx b/sdk/guides/agent-server/docker-sandbox.mdx
index 301d5df3..daee6e43 100644
--- a/sdk/guides/agent-server/docker-sandbox.mdx
+++ b/sdk/guides/agent-server/docker-sandbox.mdx
@@ -160,11 +160,11 @@ def get_server_image():
     """Get the server image tag, using PR-specific image in CI."""
     platform_str = detect_platform()
     arch = "arm64" if "arm64" in platform_str else "amd64"
-    # If GITHUB_SHA is set (e.g. running in CI of a PR), use that to ensure consistency
-    # Otherwise, use the latest image from main
-    github_sha = os.getenv("GITHUB_SHA")
-    if github_sha:
-        return f"ghcr.io/openhands/agent-server:{github_sha[:7]}-python-{arch}"
+    # SDK_SHA is the canonical commit SHA set by CI workflows (avoids the
+    # built-in GITHUB_SHA which resolves to the merge-commit on PRs).
+    sha = os.getenv("SDK_SHA") or os.getenv("GITHUB_SHA")
+    if sha:
+        return f"ghcr.io/openhands/agent-server:{sha[:7]}-python-{arch}"
     return "ghcr.io/openhands/agent-server:latest-python"
 
@@ -173,7 +173,7 @@ def get_server_image():
 
 # image or `DockerDevWorkspace` to automatically build the image on-demand.
 # with DockerDevWorkspace(
 #     # dynamically build agent-server image
-#     base_image="nikolaik/python-nodejs:python3.13-nodejs22",
+#     base_image="nikolaik/python-nodejs:python3.13-nodejs22-slim",
 #     host_port=8010,
 #     platform=detect_platform(),
 # ) as workspace:
@@ -182,7 +182,7 @@
 logger.info(f"Using server image: {server_image}")
 with DockerWorkspace(
     # use pre-built image for faster startup
     server_image=server_image,
-    host_port=8010,
+    # host_port auto-selects an available port when not specified
     platform=detect_platform(),
 ) as workspace:
     # 3) Create agent
@@ -364,11 +364,11 @@ def get_server_image():
     """Get the server image tag, using PR-specific image in CI."""
     platform_str = detect_platform()
     arch = "arm64" if "arm64" in platform_str else "amd64"
-    # If GITHUB_SHA is set (e.g. running in CI of a PR), use that to ensure consistency
-    # Otherwise, use the latest image from main
-    github_sha = os.getenv("GITHUB_SHA")
-    if github_sha:
-        return f"ghcr.io/openhands/agent-server:{github_sha[:7]}-python-{arch}"
+    # SDK_SHA is the canonical commit SHA set by CI workflows (avoids the
+    # built-in GITHUB_SHA which resolves to the merge-commit on PRs).
+    sha = os.getenv("SDK_SHA") or os.getenv("GITHUB_SHA")
+    if sha:
+        return f"ghcr.io/openhands/agent-server:{sha[:7]}-python-{arch}"
     return "ghcr.io/openhands/agent-server:latest-python"
 
@@ -561,11 +561,11 @@ def get_server_image():
     """Get the server image tag, using PR-specific image in CI."""
     platform_str = detect_platform()
     arch = "arm64" if "arm64" in platform_str else "amd64"
-    # If GITHUB_SHA is set (e.g. running in CI of a PR), use that to ensure consistency
-    # Otherwise, use the latest image from main
-    github_sha = os.getenv("GITHUB_SHA")
-    if github_sha:
-        return f"ghcr.io/openhands/agent-server:{github_sha[:7]}-python-{arch}"
+    # SDK_SHA is the canonical commit SHA set by CI workflows (avoids the
+    # built-in GITHUB_SHA which resolves to the merge-commit on PRs).
+    sha = os.getenv("SDK_SHA") or os.getenv("GITHUB_SHA")
+    if sha:
+        return f"ghcr.io/openhands/agent-server:{sha[:7]}-python-{arch}"
     return "ghcr.io/openhands/agent-server:latest-python"
 
@@ -574,7 +574,7 @@ def get_server_image():
 # automatically build the image on-demand.
 # with DockerDevWorkspace(
 #     # dynamically build agent-server image
-#     base_image="nikolaik/python-nodejs:python3.13-nodejs22",
+#     base_image="nikolaik/python-nodejs:python3.13-nodejs22-slim",
 #     host_port=8010,
 #     platform=detect_platform(),
 # ) as workspace:
@@ -582,7 +582,7 @@ server_image = get_server_image()
 logger.info(f"Using server image: {server_image}")
 with DockerWorkspace(
     server_image=server_image,
-    host_port=8011,
+    # host_port auto-selects an available port when not specified
     platform=detect_platform(),
     extra_ports=True,  # Expose extra ports for VSCode and VNC
 ) as workspace:
diff --git a/sdk/guides/agent-server/local-server.mdx b/sdk/guides/agent-server/local-server.mdx
index 541c5038..6e0cb7e9 100644
--- a/sdk/guides/agent-server/local-server.mdx
+++ b/sdk/guides/agent-server/local-server.mdx
@@ -114,16 +114,21 @@ import sys
 import tempfile
 import threading
 import time
+from pathlib import Path
 
 from pydantic import SecretStr
 
 from openhands.sdk import LLM, Conversation, RemoteConversation, Workspace, get_logger
-from openhands.sdk.event import ConversationStateUpdateEvent
+from openhands.sdk.event import ConversationStateUpdateEvent, HookExecutionEvent
+from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher
 from openhands.tools.preset.default import get_default_agent
 
 logger = get_logger(__name__)
 
+# Hook script directory for this example
+HOOK_SCRIPTS_DIR = Path(__file__).parent / "hook_scripts"
+
 
 def _stream_output(stream, prefix, target_stream):
     """Stream output from subprocess to target stream with prefix."""
@@ -278,20 +283,62 @@ with ManagedAPIServer(port=8001) as server:
         )
         logger.info(f"Output: {result.stdout}")
 
+        # Configure hooks - demonstrating the hooks system with RemoteConversation
+        # Server-side hooks (PreToolUse, PostToolUse, UserPromptSubmit, Stop) are
+        # executed by the agent server. Client-side hooks (SessionStart, SessionEnd)
+        # are executed locally.
+
+        hook_config = HookConfig(
+            # Stop hook - run Python syntax check before allowing agent to finish.
+            # If any Python file has syntax errors, the hook returns "deny" with the
+            # error output, which gets sent back to the agent as feedback, and the
+            # agent continues working to fix the issue.
+            stop=[
+                HookMatcher(
+                    matcher="*",  # Match all stop reasons
+                    hooks=[
+                        HookDefinition(
+                            command=str(HOOK_SCRIPTS_DIR / "pycompile_check.sh"),
+                            timeout=60,
+                        )
+                    ],
+                )
+            ],
+        )
+
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
             callbacks=[event_callback],
+            hook_config=hook_config,
         )
         assert isinstance(conversation, RemoteConversation)
 
+        # Track hook execution events
+        hook_events: list[HookExecutionEvent] = []
+
+        def hook_event_tracker(event):
+            """Additional callback to track hook execution events."""
+            if isinstance(event, HookExecutionEvent):
+                hook_events.append(event)
+                logger.info(f"šŸŖ HookExecutionEvent captured: {event.hook_event_type}")
+
+        # Append our hook tracker to the existing callbacks
+        conversation._callbacks.append(hook_event_tracker)
+
         try:
             logger.info(f"\nšŸ“‹ Conversation ID: {conversation.state.id}")
 
-            # Send first message and run
-            logger.info("šŸ“ Sending first message...")
+            # Test scenario: Ask the agent to create a Python file with syntax errors
+            # The stop hook should detect the syntax error and send feedback back
+            # to the agent to fix it
+            logger.info("šŸ“ Sending message to test on_stop hook with syntax check...")
             conversation.send_message(
-                "Read the current repo and write 3 facts about the project into FACTS.txt."
+                "Create a Python file called 'test_broken.py' in the current directory "
+                "with an obvious syntax error (like 'def broken(:\n    pass' - missing "
+                "closing parenthesis). After creating the file, immediately use the "
+                "finish action. If you receive any feedback about errors, fix them and "
+                "try to finish again."
             )
 
             # Generate title using a specific LLM
@@ -299,10 +346,41 @@ with ManagedAPIServer(port=8001) as server:
             logger.info(f"Generated conversation title: {title}")
 
             logger.info("šŸš€ Running conversation...")
-            conversation.run()
+            logger.info(
+                "Expected behavior: Agent creates broken .py file -> tries to finish "
+                "-> stop hook runs syntax check -> check fails -> hook sends feedback "
+                "-> agent fixes the syntax error -> tries to finish again -> passes"
+            )
 
-            logger.info("āœ… First task completed!")
-            logger.info(f"Agent status: {conversation.state.execution_status}")
+            # Keep running until the agent actually finishes
+            # When a stop hook denies, the state goes: running -> finished -> running
+            # The client's run() may return when it sees 'finished', so we need to
+            # check if the agent is still running and continue
+            max_runs = 10  # Allow enough retries for agent to fix issues
+            run_count = 0
+            while run_count < max_runs:
+                run_count += 1
+                logger.info(f"šŸ”„ Run attempt #{run_count}")
+                conversation.run()
+                current_status = conversation.state.execution_status
+                logger.info(f"   After run(), status = {current_status}")
+
+                # Small delay to let any pending state updates arrive
+                time.sleep(0.5)
+                current_status = conversation.state.execution_status
+                logger.info(f"   After delay, status = {current_status}")
+
+                if current_status.value == "finished":
+                    logger.info("   āœ… Agent finished!")
+                    break
+                elif current_status.value == "running":
+                    logger.info("   Agent still running (hook denied stop), continuing...")
+                else:
+                    logger.info(f"   Unexpected status: {current_status}, stopping")
+                    break
+
+            logger.info("āœ… Task completed!")
+            logger.info(f"Final agent status: {conversation.state.execution_status}")
 
             # Wait for events to stop coming (no events for 2 seconds)
             logger.info("ā³ Waiting for events to stop...")
@@ -310,10 +388,50 @@ with ManagedAPIServer(port=8001) as server:
                 time.sleep(0.1)
             logger.info("āœ… Events have stopped")
 
-            logger.info("šŸš€ Running conversation again...")
-            conversation.send_message("Great! Now delete that file.")
-            conversation.run()
-            logger.info("āœ… Second task completed!")
+            # Analyze hook execution events
+            logger.info("\n" + "=" * 50)
+            logger.info("šŸ“Š Hook Execution Events Analysis")
+            logger.info("=" * 50)
+
+            logger.info(f"Total HookExecutionEvents received: {len(hook_events)}")
+            for i, he in enumerate(hook_events, 1):
+                logger.info(f"\n  Hook Event #{i}:")
+                logger.info(f"   Type: {he.hook_event_type}")
+                logger.info(f"   Command: {he.hook_command}")
+                logger.info(f"   Success: {he.success}")
+                logger.info(f"   Blocked: {he.blocked}")
+                logger.info(f"   Exit Code: {he.exit_code}")
+                if he.additional_context:
+                    # Truncate for readability
+                    ctx = (
+                        he.additional_context[:500] + "..."
+                        if len(he.additional_context) > 500
+                        else he.additional_context
+                    )
+                    logger.info(f"   Additional Context: {ctx}")
+                if he.error:
+                    logger.info(f"   Error: {he.error}")
+
+            # Count stop hooks that were denied (syntax check failed)
+            stop_events = [e for e in hook_events if e.hook_event_type == "Stop"]
+            denied_stops = [e for e in stop_events if e.blocked]
+
+            logger.info(f"\nStop hook events: {len(stop_events)}")
+            logger.info(f"Denied stops (syntax check failures): {len(denied_stops)}")
+
+            if denied_stops:
+                logger.info(
+                    "\nāœ… SUCCESS: Stop hook denied at least once due to "
+                    "a syntax check failure!"
+                )
+                logger.info(
+                    "   The agent should have received feedback and fixed the issue."
+                )
+            else:
+                logger.info(
+                    "\nāš ļø  No denied stops detected. Either the syntax check passed "
+                    "on first try or the hook didn't work as expected."
+                )
 
             # Demonstrate state.events functionality
             logger.info("\n" + "=" * 50)
@@ -324,10 +442,10 @@ with ManagedAPIServer(port=8001) as server:
 
             total_events = len(conversation.state.events)
             logger.info(f"šŸ“ˆ Total events in conversation: {total_events}")
 
-            # Get recent events (last 5) using state.events
-            logger.info("\nšŸ” Getting last 5 events using state.events...")
+            # Get recent events (last 10) using state.events
+            logger.info("\nšŸ” Getting last 10 events using state.events...")
             all_events = conversation.state.events
-            recent_events = all_events[-5:] if len(all_events) >= 5 else all_events
+            recent_events = all_events[-10:] if len(all_events) >= 10 else all_events
 
             for i, event in enumerate(recent_events, 1):
                 event_type = type(event).__name__
@@ -335,7 +453,7 @@ with ManagedAPIServer(port=8001) as server:
                 logger.info(f"   {i}. {event_type} at {timestamp}")
 
             # Let's see what the actual event types are
-            logger.info("\nšŸ” Event types found:")
+            logger.info("\nšŸ” Event types found in recent events:")
             event_types = set()
             for event in recent_events:
                 event_type = type(event).__name__
diff --git a/sdk/guides/agent-server/settings-secrets-api.mdx b/sdk/guides/agent-server/settings-secrets-api.mdx
new file mode 100644
index 00000000..c9ba61f7
--- /dev/null
+++ b/sdk/guides/agent-server/settings-secrets-api.mdx
@@ -0,0 +1,153 @@
+---
+title: Settings and Secrets API
+description: Manage agent settings and custom secrets through the agent server REST API.
+---
+
+import RunExampleCode from "/sdk/shared-snippets/how-to-run-example.mdx";
+
+> A ready-to-run example is available [here](#ready-to-run-example)!
+
+The Settings and Secrets API provides REST endpoints for managing agent configuration and custom secrets through a local agent server.
+This is the recommended pattern for frontend clients that need to:
+
+- Store secrets securely via the Settings API (encrypted at rest)
+- Pass encrypted secrets when starting conversations via `secrets_encrypted=True`
+- Never have access to plaintext secrets after initial storage
+
+## Key Concepts
+
+### Settings Endpoints
+
+The agent server exposes settings management via REST:
+
+- **GET /api/settings** - Retrieve current settings
+- **PATCH /api/settings** - Update settings with a partial diff
+
+```python icon="python"
+# Store LLM configuration - API key is encrypted at rest
+response = client.patch(
+    "/api/settings",
+    json={
+        "agent_settings_diff": {
+            "llm": {
+                "model": "anthropic/claude-sonnet-4-5-20250929",
+                "api_key": api_key,
+            }
+        }
+    },
+)
+settings = response.json()
+# API key is redacted by default
+assert settings["agent_settings"]["llm"]["api_key"] == "**********"
+```
+
+### Encrypted Secrets for Starting Conversations
+
+Frontend clients use the `X-Expose-Secrets: encrypted` header to get cipher-encrypted secrets:
+
+```python icon="python"
+# Get settings with cipher-encrypted secrets
+response = client.get(
+    "/api/settings",
+    headers={"X-Expose-Secrets": "encrypted"},
+)
+encrypted_settings = response.json()
+
+# Encrypted keys start with "gAAAAA" (Fernet token format)
+encrypted_api_key = encrypted_settings["agent_settings"]["llm"]["api_key"]
+```
+
+Then use the encrypted LLM config when starting a conversation:
+
+```python icon="python"
+# Extract LLM config from settings (includes encrypted api_key)
+encrypted_llm = encrypted_settings["agent_settings"]["llm"]
+
+# Start conversation with encrypted secrets
+start_request = {
+    "agent": {
+        "kind": "Agent",
+        "llm": encrypted_llm,  # Use entire LLM config from settings
+        "tools": [{"name": "TerminalTool"}, {"name": "FileEditorTool"}],
+    },
+    "workspace": {"working_dir": "/tmp/demo"},
+    "secrets_encrypted": True,  # Server will decrypt the API key
+    "initial_message": {
+        "role": "user",
+        "content": [{"type": "text", "text": "Create a hello.txt file"}],
+        "run": True,
+    },
+}
+response = client.post("/api/conversations", json=start_request)
+```
+
+The server decrypts the secrets before using them, ensuring the frontend never has access to plaintext secrets after initial storage.
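+
+To see why encrypted values start with `gAAAAA`, here is a minimal sketch of the
+round trip using a Fernet cipher. This is illustration only: the `cipher` object
+below is a stand-in, since the agent server manages its own cipher key
+internally and this is not its actual implementation.
+
+```python icon="python"
+from cryptography.fernet import Fernet
+
+# The server holds a symmetric key; clients never see it.
+cipher = Fernet(Fernet.generate_key())
+
+# Roughly what the settings store produces for X-Expose-Secrets: encrypted
+token = cipher.encrypt(b"sk-example-api-key")
+print(token[:6])  # b'gAAAAA' - every Fernet token begins with this prefix
+
+# Roughly what the server does when secrets_encrypted=True
+plaintext = cipher.decrypt(token).decode()
+assert plaintext == "sk-example-api-key"
+```
+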
+### Custom Secrets CRUD Operations
+
+Custom secrets can be created, listed, retrieved, and deleted:
+
+```python icon="python"
+# Create a secret
+client.put(
+    "/api/settings/secrets",
+    json={
+        "name": "MY_PROJECT_TOKEN",
+        "value": "secret-token-abc123",
+        "description": "Example project token",
+    },
+)
+
+# List secrets (values not exposed)
+secrets = client.get("/api/settings/secrets").json()["secrets"]
+
+# Get secret value
+value = client.get("/api/settings/secrets/MY_PROJECT_TOKEN").text
+
+# Delete secret
+client.delete("/api/settings/secrets/MY_PROJECT_TOKEN")
+```
+
+### Secret Name Validation
+
+Secret names must follow environment variable naming conventions:
+
+- Start with a letter (a-z, A-Z)
+- Contain only letters, numbers, and underscores
+- Be 1-64 characters long
+
+Invalid names are rejected with a 422 response:
+
+```python icon="python"
+# Invalid: starts with number - returns 422
+response = client.put(
+    "/api/settings/secrets",
+    json={"name": "123_invalid", "value": "test"},
+)
+
+# Invalid: contains hyphen - returns 422
+response = client.put(
+    "/api/settings/secrets",
+    json={"name": "invalid-name", "value": "test"},
+)
+```
+
+## Ready-to-Run Example
+
+This example demonstrates the complete encrypted secrets workflow:
+1. Store LLM API key via `PATCH /api/settings` (encrypted at rest)
+2. Fetch settings with `X-Expose-Secrets: encrypted` header
+3. Start conversation via `POST /api/conversations` with `secrets_encrypted=True`
+4. Poll conversation state and verify agent task completion
+5. Test custom secrets CRUD operations
+
+```python icon="python" expandable examples/02_remote_agent_server/12_settings_and_secrets_api.py
+
+```
+
+<RunExampleCode />
+
+## Next Steps
+
+- **[Local Agent Server](/sdk/guides/agent-server/local-server)** - Run agents through a local HTTP server
+- **[Docker Sandboxed Server](/sdk/guides/agent-server/docker-sandbox)** - Run server in Docker for isolation
+- **[Agent Server Overview](/sdk/guides/agent-server/overview)** - Architecture and implementation details
diff --git a/sdk/guides/agent-settings.mdx b/sdk/guides/agent-settings.mdx
index 12f9e26e..05e8ed6c 100644
--- a/sdk/guides/agent-settings.mdx
+++ b/sdk/guides/agent-settings.mdx
@@ -74,10 +74,10 @@ This example is available on GitHub: [examples/01_standalone_sdk/46_agent_settin
 
 ```python icon="python" expandable examples/01_standalone_sdk/46_agent_settings.py
 
-"""Create, serialize, and deserialize AgentSettings, then build a working agent.
+"""Create, serialize, and deserialize OpenHandsAgentSettings, then build an agent.
 
 Demonstrates:
-1. Configuring an agent entirely through AgentSettings (LLM, tools, condenser).
+1. Configuring an agent entirely through OpenHandsAgentSettings (LLM, tools, condenser).
 2. Serializing settings to JSON and restoring them.
 3. Building an Agent from settings via ``create_agent()``.
 4. Running a short conversation to prove the settings take effect.
@@ -89,7 +89,7 @@ import os
 
 from pydantic import SecretStr
 
-from openhands.sdk import LLM, AgentSettings, Conversation, Tool
+from openhands.sdk import LLM, Conversation, OpenHandsAgentSettings, Tool
 from openhands.sdk.settings import CondenserSettings
 from openhands.tools.file_editor import FileEditorTool
 from openhands.tools.terminal import TerminalTool
@@ -99,7 +99,7 @@ from openhands.tools.terminal import TerminalTool
 
 api_key = os.getenv("LLM_API_KEY")
 assert api_key is not None, "LLM_API_KEY environment variable is not set."
 
-settings = AgentSettings(
+settings = OpenHandsAgentSettings(
     llm=LLM(
         model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
         api_key=SecretStr(api_key),
@@ -118,7 +118,7 @@ print("Serialized settings (JSON):")
 print(json.dumps(payload, indent=2, default=str)[:800], "…")
 print()
 
-restored = AgentSettings.model_validate(payload)
+restored = OpenHandsAgentSettings.model_validate(payload)
 assert restored.condenser.enabled is True
 assert restored.condenser.max_size == 50
 assert len(restored.tools) == 2
@@ -149,7 +149,7 @@ print()
 
 # ── 4. Different settings → different behavior ───────────────────────────
 # Now create settings with ONLY the terminal tool and condenser disabled.
 
-terminal_only_settings = AgentSettings(
+terminal_only_settings = OpenHandsAgentSettings(
     llm=settings.llm,
     tools=[Tool(name=TerminalTool.name)],
     condenser=CondenserSettings(enabled=False),
diff --git a/sdk/guides/browser-session-recording.mdx b/sdk/guides/browser-session-recording.mdx
index 39a50f09..176a6859 100644
--- a/sdk/guides/browser-session-recording.mdx
+++ b/sdk/guides/browser-session-recording.mdx
@@ -64,7 +64,10 @@ from openhands.sdk import (
 )
 from openhands.sdk.tool import Tool
 from openhands.tools.browser_use import BrowserToolSet
-from openhands.tools.browser_use.definition import BROWSER_RECORDING_OUTPUT_DIR
+from openhands.tools.browser_use.definition import (
+    BROWSER_RECORDING_OUTPUT_DIR,
+    BrowserNavigateAction,
+)
 
 logger = get_logger(__name__)
 
@@ -108,31 +111,39 @@ conversation = Conversation(
 
 # The prompt instructs the agent to:
 # 1. Start recording the browser session
-# 2. Browse to a website and perform some actions
+# 2. Navigate to a page and get its content
 # 3. Stop recording (auto-saves to file)
 PROMPT = """
 Please complete the following task to demonstrate browser session recording:
 
-1. First, use `browser_start_recording` to begin recording the browser session.
-
-2. Then navigate to https://docs.openhands.dev/ and:
-   - Get the page content
-   - Scroll down the page
-   - Get the browser state to see interactive elements
-
-3. Next, navigate to https://docs.openhands.dev/openhands/usage/cli/installation and:
-   - Get the page content
-   - Scroll down to see more content
-
-4. Finally, use `browser_stop_recording` to stop the recording.
-   Events are automatically saved.
+1. Use `browser_start_recording` to begin recording.
+2. Navigate to https://docs.openhands.dev/ and:
+   - Get the page content
+   - Scroll down the page
+   - Get the browser state to see interactive elements
+3. Use `browser_stop_recording` to stop and save the recording.
 """
 
 print("=" * 80)
 print("Browser Session Recording Example")
 print("=" * 80)
 print("\nTask: Record an agent's browser session and save it for replay")
-print("\nStarting conversation with agent...\n")
+
+# Pre-initialize the browser so CDP is ready before the agent starts.
+# This avoids wasting LLM calls if the browser fails to connect.
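+# (execute_tool() runs the tool call directly, without an LLM turn, so this
+# readiness check costs no tokens.)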
+print("\nInitializing browser...")
+
+init_obs = conversation.execute_tool(
+    "browser_navigate",
+    BrowserNavigateAction(url="about:blank"),
+)
+if init_obs.is_error:
+    print(f"Browser initialization failed: {init_obs.text}")
+    print("Ensure Chrome/Chromium is installed and accessible.")
+    exit(1)
+print("Browser initialized successfully.\n")
+
+print("Starting conversation with agent...\n")
 
 conversation.send_message(PROMPT)
 conversation.run()
@@ -213,6 +224,9 @@ print("=" * 100)
 cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
 print(f"Conversation ID: {conversation.id}")
 print(f"EXAMPLE_COST: {cost}")
+
+# Close conversation to shut down browser and other tool executors
+conversation.close()
 ```
diff --git a/sdk/guides/custom-tools.mdx b/sdk/guides/custom-tools.mdx
index 57cb96ce..f2e802a2 100644
--- a/sdk/guides/custom-tools.mdx
+++ b/sdk/guides/custom-tools.mdx
@@ -411,24 +411,29 @@ llm = LLM(
 
 cwd = os.getcwd()
 
-def _make_bash_and_grep_tools(conv_state) -> list[ToolDefinition]:
-    """Create terminal and custom grep tools sharing one executor."""
-
-    terminal_executor = TerminalExecutor(working_dir=conv_state.workspace.working_dir)
-    # terminal_tool = terminal_tool.set_executor(executor=terminal_executor)
-    terminal_tool = TerminalTool.create(conv_state, executor=terminal_executor)[0]
-
-    # Use the GrepTool.create() method with shared terminal_executor
-    grep_tool = GrepTool.create(conv_state, terminal_executor=terminal_executor)[0]
-
-    return [terminal_tool, grep_tool]
+class BashAndGrepToolSet(ToolDefinition[Action, Observation]):
+    """Create terminal and grep tools sharing one terminal executor."""
 
+    @classmethod
+    def create(cls, conv_state, **params) -> Sequence[ToolDefinition]:
+        terminal_executor = TerminalExecutor(
+            working_dir=conv_state.workspace.working_dir
+        )
+        terminal_tool = TerminalTool.create(
+            conv_state, executor=terminal_executor, **params
+        )[0]
+        grep_tool = GrepTool.create(
+            conv_state,
+            terminal_executor=terminal_executor,
+        )[0]
+        return [terminal_tool, grep_tool]
 
-register_tool("BashAndGrepToolSet", _make_bash_and_grep_tools)
+register_tool(BashAndGrepToolSet.name, BashAndGrepToolSet)
 
 tools = [
     Tool(name=FileEditorTool.name),
-    Tool(name="BashAndGrepToolSet"),
+    Tool(name=BashAndGrepToolSet.name),
 ]
 
 # Agent
diff --git a/sdk/guides/github-workflows/pr-review.mdx b/sdk/guides/github-workflows/pr-review.mdx
index 51c5d5ff..ebe7329d 100644
--- a/sdk/guides/github-workflows/pr-review.mdx
+++ b/sdk/guides/github-workflows/pr-review.mdx
@@ -151,28 +151,15 @@ jobs:
       github.event.requested_reviewer.login == 'openhands-agent'
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout for composite action
-        uses: actions/checkout@v4
-        with:
-          repository: OpenHands/software-agent-sdk
-          # Use a specific version tag or branch (e.g., 'v1.0.0' or 'main')
-          ref: main
-          sparse-checkout: .github/actions/pr-review
-
       - name: Run PR Review
-        uses: ./.github/actions/pr-review
+        uses: OpenHands/extensions/plugins/pr-review@main
         with:
-          # LLM model(s) to use. Can be comma-separated for A/B testing
-          # - one model will be randomly selected per review
           llm-model: anthropic/claude-sonnet-4-5-20250929
           llm-base-url: ''
-          # [DEPRECATED] review-style is no longer used; standard and roasted are merged
-          # review-style: roasted
-          # Extensions version to use (version tag or branch name)
-          extensions-version: main
-          # Secrets
           llm-api-key: ${{ secrets.LLM_API_KEY }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
+          # Optional: Laminar API key for observability
+          lmnr-api-key: ${{ secrets.LMNR_PROJECT_API_KEY }}
 ```
 
 ### Action Inputs
diff --git a/sdk/guides/llm-subscriptions.mdx b/sdk/guides/llm-subscriptions.mdx
index 524d6e71..c2966ba8 100644
--- a/sdk/guides/llm-subscriptions.mdx
+++ b/sdk/guides/llm-subscriptions.mdx
@@ -100,6 +100,7 @@ to access OpenAI's Codex models without consuming API credits.
 
 The subscription_login() method handles:
 - OAuth PKCE authentication flow
+- Device-code authentication for remote/headless environments
 - Credential caching (~/.openhands/auth/)
 - Automatic token refresh
 
@@ -111,21 +112,44 @@ Supported models:
 
 Requirements:
 - Active ChatGPT Plus or Pro subscription
-- Browser access for initial OAuth login
+- Browser access for initial OAuth login, or another browser/device for
+  device-code login
+
+Environment variables:
+- OPENHANDS_SUBSCRIPTION_MODEL: Model to use (default: gpt-5.2-codex)
+- OPENHANDS_SUBSCRIPTION_AUTH_METHOD: "browser" or "device_code"
+  (default: browser)
+- OPENHANDS_SUBSCRIPTION_FORCE_LOGIN: Set to "1" to force fresh login
+- SUBSCRIPTION_LOGIN_ONLY: Set to "1" to verify login without running an agent
 """
 
 import os
+from typing import Literal
 
 from openhands.sdk import LLM, Agent, Conversation, Tool
 from openhands.tools.file_editor import FileEditorTool
 from openhands.tools.terminal import TerminalTool
 
+AuthMethod = Literal["browser", "device_code"]
+
+
 # First time: Opens browser for OAuth login
 # Subsequent calls: Reuses cached credentials (auto-refreshes if expired)
+model = os.getenv("OPENHANDS_SUBSCRIPTION_MODEL", "gpt-5.2-codex")
+auth_method_env = os.getenv("OPENHANDS_SUBSCRIPTION_AUTH_METHOD", "browser")
+if auth_method_env not in ("browser", "device_code"):
+    raise ValueError(
+        "OPENHANDS_SUBSCRIPTION_AUTH_METHOD must be 'browser' or 'device_code'"
+    )
+auth_method: AuthMethod = auth_method_env
+force_login = os.getenv("OPENHANDS_SUBSCRIPTION_FORCE_LOGIN") == "1"
+
 llm = LLM.subscription_login(
     vendor="openai",
-    model="gpt-5.2-codex",  # or "gpt-5.2", "gpt-5.1-codex-max", "gpt-5.1-codex-mini"
+    model=model,  # or "gpt-5.2", "gpt-5.1-codex-max", "gpt-5.1-codex-mini"
+    auth_method=auth_method,
+    force_login=force_login,
 )
 
 # Alternative: Force a fresh login (useful if credentials are stale)
@@ -135,9 +159,23 @@
 # llm = LLM.subscription_login(
 #     vendor="openai", model="gpt-5.2-codex", open_browser=False
 # )
+#
+# Alternative: Use device-code login for remote/headless environments
+# llm = LLM.subscription_login(
+#     vendor="openai",
+#     model="gpt-5.2-codex",
+#     auth_method="device_code",
+#     force_login=True,
+# )
 
 # Verify subscription mode is active
 print(f"Using subscription mode: {llm.is_subscription}")
+print(f"Model: {llm.model}")
+print(f"Auth method: {auth_method}")
+
+if os.getenv("SUBSCRIPTION_LOGIN_ONLY") == "1":
+    print("Login verified; skipping agent run because SUBSCRIPTION_LOGIN_ONLY=1.")
+    raise SystemExit(0)
 
 # Use the LLM with an agent as usual
 agent = Agent(
diff --git a/sdk/guides/plugins.mdx b/sdk/guides/plugins.mdx
index 74099f54..e05a2e0a 100644
--- a/sdk/guides/plugins.mdx
+++ b/sdk/guides/plugins.mdx
@@ -404,13 +404,13 @@ def demo_enable_disable_plugin(installed_dir: Path, plugin_name: str) -> None:
     ]
 
     metadata = json.loads((installed_dir / ".installed.json").read_text())
-    assert metadata["plugins"][plugin_name]["enabled"] is False
+    assert metadata["extensions"][plugin_name]["enabled"] is False
 
     assert enable_plugin(plugin_name, installed_dir=installed_dir) is True
     print_state("After re-enable", installed_dir)
 
     metadata = json.loads((installed_dir / ".installed.json").read_text())
-    assert metadata["plugins"][plugin_name]["enabled"] is True
+    assert metadata["extensions"][plugin_name]["enabled"] is True
     assert plugin_name in [
         plugin.name for plugin in load_installed_plugins(installed_dir=installed_dir)
     ]
diff --git a/sdk/guides/skill.mdx b/sdk/guides/skill.mdx
index 5bf4d441..c8ae727e 100644
--- a/sdk/guides/skill.mdx
+++ b/sdk/guides/skill.mdx
@@ -391,7 +391,7 @@ Usage:
 import sys
 from pathlib import Path
 
-from openhands.sdk.plugin import Marketplace
+from openhands.sdk.marketplace import Marketplace
 from openhands.sdk.skills import (
     install_skills_from_marketplace,
     list_installed_skills,
 )
@@ -470,6 +470,7 @@ def main():
 
 if __name__ == "__main__":
     main()
+    print("EXAMPLE_COST: 0")
 ```
 
@@ -769,7 +770,7 @@ from pathlib import Path
 
 from pydantic import SecretStr
 
 from openhands.sdk import LLM, Agent, AgentContext, Conversation
-from openhands.sdk.context.skills import (
+from openhands.sdk.skills import (
     discover_skill_resources,
     load_skills_from_dir,
 )