Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion docs/02_using_agents.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Using Agents

AskUI Vision Agent provides three predefined agent types for different automation targets. All agents share the same core API (`act()`, `get()`, `locate()`) but are optimized for their respective platforms. Each agent comes with its own system prompt tailored to its platform-specific tools and capabilities.
AskUI Vision Agent provides four predefined agent types for different automation targets. All agents share the same core API (`act()`, `get()`, `locate()`) but are optimized for their respective platforms. Each agent comes with its own system prompt tailored to its platform-specific tools and capabilities.

## ComputerAgent

Expand Down Expand Up @@ -47,10 +47,30 @@ with WebVisionAgent() as agent:

**Default tools:** All `ComputerAgent` tools plus `goto`, `back`, `forward`, `get_page_title`, `get_page_url`

## MultiDeviceAgent

Use this agent when you need to control a desktop computer and an Android device within the same task. The agent has access to both the full set of computer tools (via AskUI Agent OS) and Android tools (via ADB), and can switch between devices seamlessly during execution.

This is useful for cross-device workflows, such as triggering an action on the desktop and verifying the result on a mobile device, or transferring data between devices.

```python
from askui import MultiDeviceAgent

with MultiDeviceAgent(android_device_sn="emulator-5554") as agent:
agent.act("Open the web app on the computer and send a push notification, then verify it appears on the Android device")
```

If you have multiple Android devices connected, pass the serial number of the target device via `android_device_sn`. You can find serial numbers by running `adb devices`. If omitted, no device is preselected and the agent will select one at runtime.

Requires the `android` dependency installed (`pip install askui[android]`) and a connected device (physical or emulator).

**Default tools:** All `ComputerAgent` tools plus all `AndroidAgent` tools. Additional tools can be provided via the `act_tools` parameter.

## Choosing an Agent

| Target | Agent | Backend |
|--------|-------|---------|
| Desktop (Windows/macOS/Linux) | `ComputerAgent` | AskUI Agent OS (gRPC) |
| Android devices | `AndroidAgent` | ADB |
| Desktop + Android | `MultiDeviceAgent` | AskUI Agent OS (gRPC) + ADB |
| Web browsers | `WebVisionAgent` | Playwright |
5 changes: 4 additions & 1 deletion src/askui/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
except ImportError:
_ANDROID_AGENT_AVAILABLE = False

if _ANDROID_AGENT_AVAILABLE:
from .multi_device_agent import MultiDeviceAgent

try:
from .web_agent import WebVisionAgent
from .web_testing_agent import WebTestingAgent
Expand Down Expand Up @@ -107,7 +110,7 @@
]

if _ANDROID_AGENT_AVAILABLE:
__all__ += ["AndroidAgent", "AndroidVisionAgent"]
__all__ += ["AndroidAgent", "AndroidVisionAgent", "MultiDeviceAgent"]

if _WEB_AGENTS_AVAILABLE:
__all__ += ["WebVisionAgent", "WebTestingAgent"]
165 changes: 165 additions & 0 deletions src/askui/multi_device_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
from typing import Annotated, Optional, Type, overload

from pydantic import Field

from askui.agent import ComputerAgent
from askui.agent_base import Agent
from askui.agent_settings import AgentSettings
from askui.android_agent import AndroidAgent
from askui.locators.locators import Locator
from askui.models.shared.settings import GetSettings, LocateSettings
from askui.models.shared.tools import Tool
from askui.models.types.geometry import Point
from askui.models.types.response_schemas import ResponseSchema
from askui.prompts.act_prompts import create_multidevice_agent_prompt
from askui.reporting import CompositeReporter, Reporter
from askui.retry import Retry
from askui.utils.source_utils import InputSource


class MultiDeviceAgent(Agent):
"""
Multi device agent that combines a computer and an Android agent.
It can be used to perform actions on both devices simultaneously.

Args:
display (int, optional): The display number for computer screen
interactions. Defaults to `1`.
reporters (list[Reporter] | None, optional): List of reporter instances.
tools (AgentToolbox | None, optional): Not supported; use `act_tools`.
retry (Retry | None, optional): Retry instance for failed actions.
act_tools (list[Tool] | None, optional): Additional tools for `act()`.
android_device_sn (str | None, optional): Android device serial number
to select on open.

Example:
```python
from askui import MultiDeviceAgent

with MultiDeviceAgent(android_device_sn="emulator-5554") as agent:
agent.computer.click("Start")
agent.android.tap("OK")
agent.act("Fill the form on the phone and submit from the desktop")
```
"""

def __init__(
self,
desktop_display: Annotated[int, Field(ge=1)] = 1,
android_device_sn: str | int = 0,
reporters: list[Reporter] | None = None,
retry: Retry | None = None,
act_tools: list[Tool] | None = None,
settings: AgentSettings | None = None,
) -> None:
reporter = CompositeReporter(reporters=reporters)

# Initialize the base agent
super().__init__(
reporter=reporter,
retry=retry,
settings=settings,
)

# Initialize the computer agent
self._computer_agent = ComputerAgent(
display=desktop_display,
reporters=[reporter],
settings=settings,
)

# Initialize the Android agent
self._android_agent = AndroidAgent(
device=android_device_sn,
reporters=[reporter],
settings=settings,
)

# Combine the tool collections of the computer and Android agents
self.act_tool_collection = (
self._computer_agent.act_tool_collection
+ self._android_agent.act_tool_collection
)

self.act_tool_collection.append_tool(*(act_tools or []))

self.act_settings.messages.system = create_multidevice_agent_prompt()

@property
def computer(self) -> ComputerAgent:
"""The composed computer agent."""
return self._computer_agent

@property
def android(self) -> AndroidAgent:
"""The composed Android agent."""
return self._android_agent

@overload
def get(
self,
query: Annotated[str, Field(min_length=1)],
response_schema: None = None,
source: Optional[InputSource] = None,
get_settings: GetSettings | None = None,
) -> str: ...
@overload
def get(
self,
query: Annotated[str, Field(min_length=1)],
response_schema: Type[ResponseSchema],
source: Optional[InputSource] = None,
get_settings: GetSettings | None = None,
) -> ResponseSchema: ...

def get(
self,
query: Annotated[str, Field(min_length=1)],
response_schema: Type[ResponseSchema] | None = None,
source: Optional[InputSource] = None,
get_settings: GetSettings | None = None,
) -> ResponseSchema | str:
"""Not supported on `MultiDeviceAgent`.

Use `agent.computer.get()` or `agent.android.get()` instead.

Raises:
NotImplementedError: Always.
"""
error_msg = (
"MultiDeviceAgent does not support get() directly."
" Use agent.computer.get() or agent.android.get()"
" instead."
)
raise NotImplementedError(error_msg)

def locate(
self,
locator: str | Locator,
screenshot: Optional[InputSource] = None,
locate_settings: LocateSettings | None = None,
) -> Point:
"""Not supported on `MultiDeviceAgent`.

Use `agent.computer.locate()` or `agent.android.locate()`
instead.

Raises:
NotImplementedError: Always.
"""
error_msg = (
"MultiDeviceAgent does not support locate() directly."
" Use agent.computer.locate() or"
" agent.android.locate() instead."
)
raise NotImplementedError(error_msg)

def close(self) -> None:
self._computer_agent.act_agent_os_facade.disconnect()
self._android_agent.act_agent_os_facade.disconnect()
super().close()

def open(self) -> None:
self._computer_agent.open()
self._android_agent.open()
super().open()