diff --git a/docs/02_using_agents.md b/docs/02_using_agents.md index 8d736e52..a75e531b 100644 --- a/docs/02_using_agents.md +++ b/docs/02_using_agents.md @@ -1,6 +1,6 @@ # Using Agents -AskUI Vision Agent provides three predefined agent types for different automation targets. All agents share the same core API (`act()`, `get()`, `locate()`) but are optimized for their respective platforms. Each agent comes with its own system prompt tailored to its platform-specific tools and capabilities. +AskUI Vision Agent provides four predefined agent types for different automation targets. All agents share the same core API (`act()`, `get()`, `locate()`) but are optimized for their respective platforms. Each agent comes with its own system prompt tailored to its platform-specific tools and capabilities. ## ComputerAgent @@ -47,10 +47,30 @@ with WebVisionAgent() as agent: **Default tools:** All `ComputerAgent` tools plus `goto`, `back`, `forward`, `get_page_title`, `get_page_url` +## MultiDeviceAgent + +Use this agent when you need to control a desktop computer and an Android device within the same task. The agent has access to both the full set of computer tools (via AskUI Agent OS) and Android tools (via ADB), and can switch between devices seamlessly during execution. + +This is useful for cross-device workflows, such as triggering an action on the desktop and verifying the result on a mobile device, or transferring data between devices. + +```python +from askui import MultiDeviceAgent + +with MultiDeviceAgent(android_device_sn="emulator-5554") as agent: + agent.act("Open the web app on the computer and send a push notification, then verify it appears on the Android device") +``` + +If you have multiple Android devices connected, pass the serial number of the target device via `android_device_sn`. You can find serial numbers by running `adb devices`. If omitted, no device is preselected and the agent will select one at runtime. 
+ +Requires the `android` dependency installed (`pip install askui[android]`) and a connected device (physical or emulator). + +**Default tools:** All `ComputerAgent` tools plus all `AndroidAgent` tools. Additional tools can be provided via the `act_tools` parameter. + ## Choosing an Agent | Target | Agent | Backend | |--------|-------|---------| | Desktop (Windows/macOS/Linux) | `ComputerAgent` | AskUI Agent OS (gRPC) | | Android devices | `AndroidAgent` | ADB | +| Desktop + Android | `MultiDeviceAgent` | AskUI Agent OS (gRPC) + ADB | | Web browsers | `WebVisionAgent` | Playwright | diff --git a/src/askui/__init__.py b/src/askui/__init__.py index 9d71f276..69925c9b 100644 --- a/src/askui/__init__.py +++ b/src/askui/__init__.py @@ -54,6 +54,9 @@ except ImportError: _ANDROID_AGENT_AVAILABLE = False +if _ANDROID_AGENT_AVAILABLE: + from .multi_device_agent import MultiDeviceAgent + try: from .web_agent import WebVisionAgent from .web_testing_agent import WebTestingAgent @@ -107,7 +110,7 @@ ] if _ANDROID_AGENT_AVAILABLE: - __all__ += ["AndroidAgent", "AndroidVisionAgent"] + __all__ += ["AndroidAgent", "AndroidVisionAgent", "MultiDeviceAgent"] if _WEB_AGENTS_AVAILABLE: __all__ += ["WebVisionAgent", "WebTestingAgent"] diff --git a/src/askui/multi_device_agent.py b/src/askui/multi_device_agent.py new file mode 100644 index 00000000..6dba3ee2 --- /dev/null +++ b/src/askui/multi_device_agent.py @@ -0,0 +1,165 @@ +from typing import Annotated, Optional, Type, overload + +from pydantic import Field + +from askui.agent import ComputerAgent +from askui.agent_base import Agent +from askui.agent_settings import AgentSettings +from askui.android_agent import AndroidAgent +from askui.locators.locators import Locator +from askui.models.shared.settings import GetSettings, LocateSettings +from askui.models.shared.tools import Tool +from askui.models.types.geometry import Point +from askui.models.types.response_schemas import ResponseSchema +from askui.prompts.act_prompts import 
create_multidevice_agent_prompt  # tail of the `from askui.prompts.act_prompts import` begun on the previous line
from askui.reporting import CompositeReporter, Reporter
from askui.retry import Retry
from askui.utils.source_utils import InputSource


class MultiDeviceAgent(Agent):
    """
    Agent that combines a computer agent and an Android agent.

    A `ComputerAgent` and an `AndroidAgent` are created internally and their
    `act()` tool collections are merged, so a single `act()` goal can use the
    tools of both devices. `get()` and `locate()` are not supported on this
    class and always raise `NotImplementedError`; call them on
    `agent.computer` or `agent.android` instead.

    Args:
        desktop_display (int, optional): Display number passed to the composed
            `ComputerAgent` as `display`. Defaults to `1`.
        android_device_sn (str | int, optional): Android device to use,
            forwarded unchanged to the composed `AndroidAgent` as `device`.
            Defaults to `0`.
            NOTE(review): an `int` is presumably an index into the
            `adb devices` list and a `str` a serial number - confirm against
            `AndroidAgent`.
        reporters (list[Reporter] | None, optional): Reporter instances. They
            are wrapped in one `CompositeReporter` that is shared by this
            agent and both composed agents.
        retry (Retry | None, optional): Retry instance passed to the base
            `Agent`. It is not passed to the composed agents.
        act_tools (list[Tool] | None, optional): Additional tools appended to
            the merged tool collection used by `act()`.
        settings (AgentSettings | None, optional): Settings passed to the base
            `Agent` and to both composed agents.

    Example:
        ```python
        from askui import MultiDeviceAgent

        with MultiDeviceAgent(android_device_sn="emulator-5554") as agent:
            agent.computer.click("Start")
            agent.android.tap("OK")
            agent.act("Fill the form on the phone and submit from the desktop")
        ```
    """

    def __init__(
        self,
        desktop_display: Annotated[int, Field(ge=1)] = 1,
        android_device_sn: str | int = 0,
        reporters: list[Reporter] | None = None,
        retry: Retry | None = None,
        act_tools: list[Tool] | None = None,
        settings: AgentSettings | None = None,
    ) -> None:
        # One composite reporter is shared by the base agent and both
        # composed agents.
        reporter = CompositeReporter(reporters=reporters)

        # Initialize the base agent
        super().__init__(
            reporter=reporter,
            retry=retry,
            settings=settings,
        )

        # Initialize the computer agent
        self._computer_agent = ComputerAgent(
            display=desktop_display,
            reporters=[reporter],
            settings=settings,
        )

        # Initialize the Android agent
        self._android_agent = AndroidAgent(
            device=android_device_sn,
            reporters=[reporter],
            settings=settings,
        )

        # Combine the tool collections of the computer and Android agents
        self.act_tool_collection = (
            self._computer_agent.act_tool_collection
            + self._android_agent.act_tool_collection
        )

        # User-supplied tools are appended after the built-in ones.
        self.act_tool_collection.append_tool(*(act_tools or []))

        # Replace the system prompt with the multi-device prompt.
        self.act_settings.messages.system = create_multidevice_agent_prompt()

    @property
    def computer(self) -> ComputerAgent:
        """The composed computer agent."""
        return self._computer_agent

    @property
    def android(self) -> AndroidAgent:
        """The composed Android agent."""
        return self._android_agent

    @overload
    def get(
        self,
        query: Annotated[str, Field(min_length=1)],
        response_schema: None = None,
        source: Optional[InputSource] = None,
        get_settings: GetSettings | None = None,
    ) -> str: ...
    @overload
    def get(
        self,
        query: Annotated[str, Field(min_length=1)],
        response_schema: Type[ResponseSchema],
        source: Optional[InputSource] = None,
        get_settings: GetSettings | None = None,
    ) -> ResponseSchema: ...

    def get(
        self,
        query: Annotated[str, Field(min_length=1)],
        response_schema: Type[ResponseSchema] | None = None,
        source: Optional[InputSource] = None,
        get_settings: GetSettings | None = None,
    ) -> ResponseSchema | str:
        """Not supported on `MultiDeviceAgent`.

        Use `agent.computer.get()` or `agent.android.get()` instead.
        All arguments are ignored.

        Raises:
            NotImplementedError: Always.
        """
        error_msg = (
            "MultiDeviceAgent does not support get() directly."
            " Use agent.computer.get() or agent.android.get()"
            " instead."
        )
        raise NotImplementedError(error_msg)

    def locate(
        self,
        locator: str | Locator,
        screenshot: Optional[InputSource] = None,
        locate_settings: LocateSettings | None = None,
    ) -> Point:
        """Not supported on `MultiDeviceAgent`.

        Use `agent.computer.locate()` or `agent.android.locate()`
        instead. All arguments are ignored.

        Raises:
            NotImplementedError: Always.
        """
        error_msg = (
            "MultiDeviceAgent does not support locate() directly."
            " Use agent.computer.locate() or"
            " agent.android.locate() instead."
        )
        raise NotImplementedError(error_msg)

    def close(self) -> None:
        """Disconnect both composed agents, then close the base agent.

        Every step is attempted even if an earlier one raises, so a failure
        while disconnecting one device does not leave the other connected.
        The exception still propagates to the caller.
        """
        try:
            self._computer_agent.act_agent_os_facade.disconnect()
        finally:
            try:
                self._android_agent.act_agent_os_facade.disconnect()
            finally:
                super().close()

    def open(self) -> None:
        """Open the computer agent, the Android agent, then the base agent.

        If a later step raises, the connections opened by the earlier steps
        are disconnected again (the same calls `close()` uses) before the
        exception is re-raised, so a failure part-way does not leave an
        already-opened connection behind.
        """
        self._computer_agent.open()
        try:
            self._android_agent.open()
            try:
                super().open()
            except BaseException:
                # Android was opened successfully; undo it.
                self._android_agent.act_agent_os_facade.disconnect()
                raise
        except BaseException:
            # The computer agent was opened successfully; undo it.
            self._computer_agent.act_agent_os_facade.disconnect()
            raise