From 8f496affa10613c2fbd3df8b26c7e6b133e764dd Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Tue, 9 Jun 2026 16:37:03 +0200 Subject: [PATCH 1/6] feat: add coordinate space abstraction for open weights LLM support --- src/askui/android_agent.py | 5 +- src/askui/computer_agent.py | 3 +- .../model_providers/ollama_vlm_provider.py | 36 +++++ .../model_providers/openai_vlm_provider.py | 26 ++++ src/askui/model_providers/vlm_provider.py | 31 +++++ src/askui/models/shared/__init__.py | 10 ++ src/askui/models/shared/coordinate_space.py | 104 +++++++++++++++ src/askui/tools/android/agent_os_facade.py | 31 +++-- src/askui/tools/computer_agent_os_facade.py | 30 ++++- src/askui/tools/playwright/agent_os_facade.py | 28 +++- src/askui/web_agent.py | 5 +- .../test_ollama_vlm_provider.py | 64 +++++++++ .../test_openai_vlm_provider.py | 125 ++++++++++++++++++ 13 files changed, 474 insertions(+), 24 deletions(-) create mode 100644 src/askui/models/shared/coordinate_space.py diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 98b79143..7b7818f8 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -87,7 +87,6 @@ def __init__( ) -> None: reporter = CompositeReporter(reporters=reporters) self.os = PpadbAgentOs(device_identifier=device, reporter=reporter) - self.act_agent_os_facade = AndroidAgentOsFacade(self.os) super().__init__( reporter=reporter, retry=retry, @@ -97,6 +96,10 @@ def __init__( callbacks=callbacks, truncation_strategy=truncation_strategy, ) + self.act_agent_os_facade = AndroidAgentOsFacade( + self.os, + coordinate_space=self._vlm_provider.coordinate_space, + ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with Android-specific settings self.act_settings = ActSettings( diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py index ad0a6627..6e53df87 100644 --- a/src/askui/computer_agent.py +++ b/src/askui/computer_agent.py @@ -130,7 +130,8 @@ def __init__( truncation_strategy=truncation_strategy, ) self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade( - self.tools.os + self.tools.os, + coordinate_space=self._vlm_provider.coordinate_space, ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with computer-specific settings diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py index e06fa408..1cca3905 100644 --- a/src/askui/model_providers/ollama_vlm_provider.py +++ b/src/askui/model_providers/ollama_vlm_provider.py @@ -1,12 +1,23 @@ """OllamaVlmProvider — VLM access via a local Ollama instance.""" from openai import OpenAI +from typing_extensions import override from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) _DEFAULT_BASE_URL = "http://localhost:11434/v1" _DEFAULT_MODEL_ID = "qwen3.5" +_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) +_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) +_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace() + class OllamaVlmProvider(OpenAIVlmProvider): """VLM provider that routes requests to a local Ollama instance. @@ -14,6 +25,11 @@ class OllamaVlmProvider(OpenAIVlmProvider): Thin convenience wrapper around `OpenAIVlmProvider` with Ollama defaults (``base_url``, ``api_key``, ``model_id``). + Qwen and Holo models are automatically detected and their coordinate + space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``. + Kimi models use ``NormalizedCoordinateSpace()``. + Pass ``coordinate_space`` explicitly to override auto-detection. + Args: model_id (str, optional): Ollama model to use. Defaults to ``"qwen3.5"``. @@ -21,6 +37,9 @@ class OllamaVlmProvider(OpenAIVlmProvider): API. Defaults to ``"http://localhost:11434/v1"``. client (`OpenAI` | None, optional): Pre-configured OpenAI client. If provided, ``base_url`` is ignored. + coordinate_space (VlmCoordinateSpace | None, optional): The coordinate + grid the model emits coordinates in. ``None`` (the default) + enables auto-detection based on ``model_id``. Example: ```python @@ -40,10 +59,27 @@ def __init__( model_id: str = _DEFAULT_MODEL_ID, base_url: str = _DEFAULT_BASE_URL, client: OpenAI | None = None, + coordinate_space: VlmCoordinateSpace | None = None, ) -> None: + self._coordinate_space_override = coordinate_space super().__init__( model_id=model_id, api_key="ollama", # Ollama requires no auth; OpenAI SDK needs a value base_url=base_url, client=client, + coordinate_space=coordinate_space or PixelCoordinateSpace(), ) + + @property + @override + def coordinate_space(self) -> VlmCoordinateSpace: + if self._coordinate_space_override is not None: + return self._coordinate_space_override + model_lower = self._model_id_value.lower() + if "qwen" in model_lower: + return _QWEN_COORDINATE_SPACE + if "holo" in model_lower: + return _HOLO_COORDINATE_SPACE + if "kimi" in model_lower: + return _KIMI_COORDINATE_SPACE + return self._coordinate_space diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py index 47475cc7..8ac5f6a6 100644 --- a/src/askui/model_providers/openai_vlm_provider.py +++ b/src/askui/model_providers/openai_vlm_provider.py @@ -14,11 +14,17 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + PixelCoordinateSpace, + VlmCoordinateSpace, +) from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "gpt-5.4" +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() class OpenAIVlmProvider(VlmProvider): @@ -36,6 +42,9 @@ class OpenAIVlmProvider(VlmProvider): to the OpenAI API (``https://api.openai.com/v1``). client (`OpenAI` | None, optional): Pre-configured OpenAI client. If provided, ``api_key`` and ``base_url`` are ignored. + coordinate_space (VlmCoordinateSpace, optional): The coordinate grid + the model emits coordinates in. Defaults to the screenshot + resolution (native pixel coordinates). Example: ```python @@ -57,6 +66,7 @@ def __init__( api_key: str | None = None, base_url: str | None = None, client: OpenAI | None = None, + coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE, input_cost_per_million_tokens: float | None = None, output_cost_per_million_tokens: float | None = None, cache_write_cost_per_million_tokens: float | None = None, @@ -65,6 +75,7 @@ def __init__( self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) + self._coordinate_space = coordinate_space if client is not None: self._client = client else: @@ -86,6 +97,11 @@ def __init__( def model_id(self) -> str: return self._model_id_value + @property + @override + def coordinate_space(self) -> VlmCoordinateSpace: + return self._coordinate_space + @property @override def pricing(self) -> ModelPricing | None: @@ -96,6 +112,14 @@ def _messages_api(self) -> OpenAIMessagesApi: """Lazily initialise the `OpenAIMessagesApi` on first use.""" return OpenAIMessagesApi(client=self._client) + @override + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: + """Append coordinate and resolution info to the system prompt.""" + coord_info = self.coordinate_space.build_prompt_section( + screenshot_resolution=SCREENSHOT_RESOLUTION, + ) + return SystemPrompt(prompt=f"{str(system)}\n\n{coord_info}") + @override def create_message( self, @@ -108,6 +132,8 @@ def create_message( temperature: float | None = None, provider_options: dict[str, Any] | None = None, ) -> MessageParam: + if system is not None: + system = self.augment_system_prompt(system) return self._messages_api.create_message( messages=messages, model_id=self._model_id_value, diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py index 1e98b972..6d4d9738 100644 --- a/src/askui/model_providers/vlm_provider.py +++ b/src/askui/model_providers/vlm_provider.py @@ -8,10 +8,16 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + VlmCoordinateSpace, +) from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection from askui.utils.model_pricing import ModelPricing +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() + class VlmProvider(ABC): """Interface for Vision Language Model providers. @@ -44,6 +50,17 @@ class VlmProvider(ABC): def model_id(self) -> str: """The model identifier used by this provider.""" + @property + def coordinate_space(self) -> VlmCoordinateSpace: + """The coordinate space this model emits coordinates in. + + Returns a `VlmCoordinateSpace` describing the grid the model uses. + The default is `PixelCoordinateSpace` (native pixel coordinates). + Override in subclasses when the model uses a different grid + (e.g. ``ScaledCoordinateSpace(1000, 1000)`` for Qwen). + """ + return _DEFAULT_COORDINATE_SPACE + @property def pricing(self) -> ModelPricing | None: """Pricing information for this provider's model. @@ -53,6 +70,20 @@ def pricing(self) -> ModelPricing | None: """ return None + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: + """Hook for providers to augment the system prompt before sending. + + Called by ``create_message()`` implementations. The base + implementation returns the prompt unchanged. Override in + subclasses that need to inject provider-specific information + (e.g. coordinate bounds for non-Anthropic models). + + The original ``SystemPrompt`` object is **not** mutated — + implementations should create a new ``SystemPrompt`` wrapping + the augmented text. + """ + return system + @abstractmethod def create_message( self, diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py index 4df27a7b..635fc053 100644 --- a/src/askui/models/shared/__init__.py +++ b/src/askui/models/shared/__init__.py @@ -1,5 +1,11 @@ from .android_base_tool import AndroidBaseTool from .computer_base_tool import ComputerBaseTool +from .coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) from .tool_tags import ToolTags try: @@ -13,6 +19,10 @@ __all__ = [ "AndroidBaseTool", "ComputerBaseTool", + "NormalizedCoordinateSpace", + "PixelCoordinateSpace", + "ScaledCoordinateSpace", + "VlmCoordinateSpace", "ToolTags", ] diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py new file mode 100644 index 00000000..69696cdd --- /dev/null +++ b/src/askui/models/shared/coordinate_space.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from pydantic import BaseModel, Field + +# The resolution screenshots are scaled to before being sent to the model. +# Used by all agent OS facades (computer, Android, Playwright). +SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768) + + +def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]: + sw, sh = screenshot_resolution + return [ + f"* Screenshot resolution: {sw}x{sh} pixels", + "* Screenshots may contain black padding bars to preserve the " + "original aspect ratio. UI elements are NOT located in the " + "padding area.", + "* Coordinate origin is the top-left corner (0, 0)", + ] + + +class VlmCoordinateSpace(BaseModel, ABC): + """Abstract base for VLM coordinate conventions. + + Each subclass describes one coordinate grid a VLM may emit and knows + how to map those coordinates back to pixel space and how to render + the matching prompt section. + """ + + @abstractmethod + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + """Map model coordinates to pixel coordinates in *target_resolution*.""" + + @abstractmethod + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + """Build prompt text describing coordinate bounds for the model.""" + + +class PixelCoordinateSpace(VlmCoordinateSpace): + """Identity mapping -- coordinates already in pixel space. + + Used by Anthropic/Claude which emit coordinates matching the + screenshot resolution. + """ + + def map_to_target( + self, + x: float, + y: float, + target_resolution: tuple[int, int], # noqa: ARG002 + ) -> tuple[int, int]: + return int(x), int(y) + + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + sw, sh = screenshot_resolution + lines = _common_prompt_lines(screenshot_resolution) + lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}") + return "\n".join(lines) + + +class ScaledCoordinateSpace(VlmCoordinateSpace): + """Integer grid (e.g. 1000x1000 for Qwen). Linear scaling.""" + + width: int = Field(gt=0, description="Width of the coordinate grid") + height: int = Field(gt=0, description="Height of the coordinate grid") + + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + tw, th = target_resolution + return int(x * tw / self.width), int(y * th / self.height) + + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + lines = _common_prompt_lines(screenshot_resolution) + if (self.width, self.height) != screenshot_resolution: + lines.append( + f"* Emit coordinates in a {self.width}x{self.height} " + f"normalised grid: 0 <= x < {self.width}, " + f"0 <= y < {self.height}" + ) + else: + sw, sh = screenshot_resolution + lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}") + return "\n".join(lines) + + +class NormalizedCoordinateSpace(VlmCoordinateSpace): + """0.0-1.0 float grid (Kimi). No fields.""" + + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + tw, th = target_resolution + return int(x * tw), int(y * th) + + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + lines = _common_prompt_lines(screenshot_resolution) + lines.append( + "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0" + ) + return "\n".join(lines) diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py index f27d0eee..f0374036 100644 --- a/src/askui/tools/android/agent_os_facade.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -2,6 +2,10 @@ from PIL import Image +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + VlmCoordinateSpace, +) from askui.models.shared.tool_tags import ToolTags from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay from askui.tools.android.uiautomator_hierarchy import UIElementCollection @@ -15,9 +19,14 @@ class AndroidAgentOsFacade(AndroidAgentOs): and back to the real screen resolution. """ - def __init__(self, agent_os: AndroidAgentOs) -> None: + def __init__( + self, + agent_os: AndroidAgentOs, + coordinate_space: VlmCoordinateSpace, + ) -> None: self._agent_os: AndroidAgentOs = agent_os - self._target_resolution: Tuple[int, int] = (1024, 768) + self._target_resolution: Tuple[int, int] = SCREENSHOT_RESOLUTION + self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: Optional[Tuple[int, int]] = None self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] @@ -39,33 +48,39 @@ def screenshot(self) -> Image.Image: def _scale_coordinates( self, - x: int, - y: int, + x: float, + y: float, from_agent: bool = True, ) -> Tuple[int, int]: if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.screenshot().size + mapped_x, mapped_y = ( + self._coordinate_space.map_to_target(x, y, self._target_resolution) + if from_agent + else (int(x), int(y)) + ) + return scale_coordinates( - (x, y), + (mapped_x, mapped_y), self._real_screen_resolution, self._target_resolution, inverse=from_agent, ) - def tap(self, x: int, y: int) -> None: + def tap(self, x: float, y: float) -> None: x, y = self._scale_coordinates(x, y) self._agent_os.tap(x, y) def swipe( - self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000 + self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: x1, y1 = self._scale_coordinates(x1, y1) x2, y2 = self._scale_coordinates(x2, y2) self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms) def drag_and_drop( - self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000 + self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: x1, y1 = self._scale_coordinates(x1, y1) x2, y2 = self._scale_coordinates(x2, y2) diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 28a1a8c5..c91a2c84 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -2,6 +2,10 @@ from PIL import Image +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + VlmCoordinateSpace, +) from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import ( AgentOs, @@ -36,9 +40,14 @@ class ComputerAgentOsFacade(AgentOs): and back to the real screen resolution. """ - def __init__(self, agent_os: AgentOs) -> None: + def __init__( + self, + agent_os: AgentOs, + coordinate_space: VlmCoordinateSpace, + ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = (1024, 768) + self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION + self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: DisplaySize | None = None self.tags.append(ToolTags.SCALED_AGENT_OS.value) @@ -57,7 +66,7 @@ def screenshot(self, report: bool = True) -> Image.Image: ) return scale_image_to_fit(screenshot, self._target_resolution) - def mouse_move(self, x: int, y: int, duration: int = 500) -> None: + def mouse_move(self, x: float, y: float, duration: int = 500) -> None: scaled_x, scaled_y = self._scale_coordinates_back(x, y) self._agent_os.mouse_move(scaled_x, scaled_y, duration) @@ -68,7 +77,7 @@ def get_mouse_position(self) -> Coordinate: ) return Coordinate(x=scaled_x, y=scaled_y) - def set_mouse_position(self, x: int, y: int) -> None: + def set_mouse_position(self, x: float, y: float) -> None: scaled_x, scaled_y = self._scale_coordinates_back(x, y) self._agent_os.set_mouse_position(scaled_x, scaled_y) @@ -302,15 +311,22 @@ def remove_virtual_displays(self) -> None: def _scale_coordinates_back( self, - x: int, - y: int, + x: float, + y: float, from_agent: bool = True, check_coordinates_in_bounds: bool = True, ) -> tuple[int, int]: if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.retrieve_active_display().size + + mapped_x, mapped_y = ( + self._coordinate_space.map_to_target(x, y, self._target_resolution) + if from_agent + else (int(x), int(y)) + ) + return scale_coordinates( - (x, y), + (mapped_x, mapped_y), (self._real_screen_resolution.width, self._real_screen_resolution.height), self._target_resolution, inverse=from_agent, diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py index 091ff804..5d6f7c42 100644 --- a/src/askui/tools/playwright/agent_os_facade.py +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -2,6 +2,10 @@ from PIL import Image +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + VlmCoordinateSpace, +) from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import Display, ModifierKey, PcKey from askui.tools.playwright.agent_os import PlaywrightAgentOs @@ -20,9 +24,14 @@ class PlaywrightAgentOsFacade(PlaywrightAgentOs): agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap. """ - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__( + self, + agent_os: PlaywrightAgentOs, + coordinate_space: VlmCoordinateSpace, + ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = (1024, 768) + self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION + self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: tuple[int, int] | None = None self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] @@ -43,22 +52,29 @@ def screenshot(self, report: bool = True) -> Image.Image: def _scale_coordinates( self, - x: int, - y: int, + x: float, + y: float, from_agent: bool = True, ) -> tuple[int, int]: if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.screenshot( report=False, ).size + + mapped_x, mapped_y = ( + self._coordinate_space.map_to_target(x, y, self._target_resolution) + if from_agent + else (int(x), int(y)) + ) + return scale_coordinates( - (x, y), + (mapped_x, mapped_y), self._real_screen_resolution, self._target_resolution, inverse=from_agent, ) - def mouse_move(self, x: int, y: int, duration: int = 500) -> None: + def mouse_move(self, x: float, y: float, duration: int = 500) -> None: scaled_x, scaled_y = self._scale_coordinates(x, y) # scaled_x, scaled_y = x, y self._agent_os.mouse_move(scaled_x, scaled_y, duration) diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py index fe47c5f9..bc211ec9 100644 --- a/src/askui/web_agent.py +++ b/src/askui/web_agent.py @@ -60,7 +60,6 @@ def __init__( ) -> None: reporter = CompositeReporter(reporters=reporters) self.os = PlaywrightAgentOs(reporter) - self.act_agent_os_facade = PlaywrightAgentOsFacade(self.os) super().__init__( reporter=reporter, retry=retry, @@ -70,6 +69,10 @@ def __init__( callbacks=callbacks, truncation_strategy=truncation_strategy, ) + self.act_agent_os_facade = PlaywrightAgentOsFacade( + self.os, + coordinate_space=self._vlm_provider.coordinate_space, + ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) self.act_settings = ActSettings( messages=MessageSettings( diff --git a/tests/unit/model_providers/test_ollama_vlm_provider.py b/tests/unit/model_providers/test_ollama_vlm_provider.py index 143e7c35..e3f78ef5 100644 --- a/tests/unit/model_providers/test_ollama_vlm_provider.py +++ b/tests/unit/model_providers/test_ollama_vlm_provider.py @@ -6,6 +6,11 @@ from askui.model_providers.ollama_vlm_provider import OllamaVlmProvider from askui.models.shared.agent_message_param import MessageParam +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, +) class TestOllamaVlmProvider: @@ -48,3 +53,62 @@ def test_create_message_delegates_to_messages_api(self) -> None: mock_client.chat.completions.create.assert_called_once() assert result.role == "assistant" + + def test_coordinate_space_auto_detects_qwen(self) -> None: + provider = OllamaVlmProvider(model_id="qwen3.5") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_qwen_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Qwen2-VL") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_kimi(self) -> None: + provider = OllamaVlmProvider(model_id="kimi-vl") + assert provider.coordinate_space == NormalizedCoordinateSpace() + + def test_coordinate_space_auto_detects_kimi_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Kimi-VL-A3B") + assert provider.coordinate_space == NormalizedCoordinateSpace() + + def test_coordinate_space_default_for_non_qwen(self) -> None: + provider = OllamaVlmProvider(model_id="llava") + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_explicit_override(self) -> None: + provider = OllamaVlmProvider( + model_id="llava", + coordinate_space=ScaledCoordinateSpace(width=500, height=500), + ) + assert provider.coordinate_space == ScaledCoordinateSpace(width=500, height=500) + + def test_coordinate_space_explicit_override_takes_precedence(self) -> None: + provider = OllamaVlmProvider( + model_id="qwen3.5", + coordinate_space=ScaledCoordinateSpace(width=2000, height=2000), + ) + assert provider.coordinate_space == ScaledCoordinateSpace( + width=2000, height=2000 + ) + + def test_coordinate_space_explicit_pixel_overrides_qwen_auto_detect(self) -> None: + provider = OllamaVlmProvider( + model_id="qwen3.5", + coordinate_space=PixelCoordinateSpace(), + ) + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_auto_detects_holo(self) -> None: + provider = OllamaVlmProvider(model_id="holo3.1-35b-a3b") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_holo_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Holo-3.1-4B") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py index d51ff74b..7a5a2a87 100644 --- a/tests/unit/model_providers/test_openai_vlm_provider.py +++ b/tests/unit/model_providers/test_openai_vlm_provider.py @@ -6,6 +6,12 @@ from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider from askui.models.shared.agent_message_param import MessageParam +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, +) +from askui.models.shared.prompts import SystemPrompt class TestOpenAIVlmProvider: @@ -41,3 +47,122 @@ def test_create_message_delegates_to_messages_api(self) -> None: mock_client.chat.completions.create.assert_called_once() assert result.role == "assistant" + + def test_coordinate_space_defaults_to_pixel(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_passthrough(self) -> None: + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + coordinate_space=ScaledCoordinateSpace(width=1000, height=1000), + ) + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_augment_system_prompt_scaled_coordinate_space(self) -> None: + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + coordinate_space=ScaledCoordinateSpace(width=1000, height=1000), + ) + system = SystemPrompt(prompt="You are a helpful assistant.") + augmented = provider.augment_system_prompt(system) + + rendered = str(augmented) + assert "You are a helpful assistant." in rendered + assert "1000x1000 normalised grid" in rendered + assert "1024x768" in rendered + + def test_augment_system_prompt_pixel_bounds_when_matching(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + system = SystemPrompt(prompt="Base prompt.") + augmented = provider.augment_system_prompt(system) + + rendered = str(augmented) + assert "normalised grid" not in rendered + assert "0 <= x < 1024" in rendered + + +class TestPixelCoordinateSpacePrompt: + def test_shows_pixel_bounds(self) -> None: + cs = PixelCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "0 <= x < 1024" in result + assert "0 <= y < 768" in result + assert "normalised grid" not in result + + def test_includes_padding_and_origin_info(self) -> None: + cs = PixelCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "black padding" in result + assert "top-left" in result + + +class TestScaledCoordinateSpacePrompt: + def test_shows_normalised_grid(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + result = cs.build_prompt_section((1024, 768)) + assert "1024x768" in result + assert "1000x1000 normalised grid" in result + assert "0 <= x < 1000" in result + assert "0 <= y < 1000" in result + + def test_matching_resolution_shows_pixel_bounds(self) -> None: + cs = ScaledCoordinateSpace(width=1024, height=768) + result = cs.build_prompt_section((1024, 768)) + assert "0 <= x < 1024" in result + assert "normalised grid" not in result + + def test_includes_padding_and_origin_info(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + result = cs.build_prompt_section((1024, 768)) + assert "black padding" in result + assert "top-left" in result + + +class TestNormalizedCoordinateSpacePrompt: + def test_shows_normalised_floats(self) -> None: + cs = NormalizedCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "0.0 <= x <= 1.0" in result + assert "0.0 <= y <= 1.0" in result + assert "normalised floats" in result + + def test_includes_padding_and_origin_info(self) -> None: + cs = NormalizedCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "black padding" in result + assert "top-left" in result + + +class TestMapToTarget: + def test_pixel_identity(self) -> None: + cs = PixelCoordinateSpace() + assert cs.map_to_target(512, 384, (1024, 768)) == (512, 384) + + def test_pixel_truncates_floats(self) -> None: + cs = PixelCoordinateSpace() + assert cs.map_to_target(512.7, 384.3, (1024, 768)) == (512, 384) + + def test_scaled_maps_correctly(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + assert cs.map_to_target(500, 500, (1024, 768)) == (512, 384) + + def test_scaled_zero(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + assert cs.map_to_target(0, 0, (1024, 768)) == (0, 0) + + def test_normalized_maps_correctly(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(0.5, 0.5, (1024, 768)) == (512, 384) + + def test_normalized_zero(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(0.0, 0.0, (1024, 768)) == (0, 0) + + def test_normalized_one(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(1.0, 1.0, (1024, 768)) == (1024, 768) From 31865a802dccb57821d186e93dbf1754f13a727e Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Wed, 10 Jun 2026 15:30:08 +0200 Subject: [PATCH 2/6] fix: map non-pixel coordinate spaces directly to device resolution --- src/askui/models/shared/coordinate_space.py | 14 ++ src/askui/tools/android/agent_os_facade.py | 23 ++- src/askui/tools/computer_agent_os_facade.py | 27 +++- src/askui/tools/playwright/agent_os_facade.py | 23 ++- .../test_openai_vlm_provider.py | 14 ++ .../tools/test_agent_os_facade_coordinates.py | 141 ++++++++++++++++++ 6 files changed, 221 insertions(+), 21 deletions(-) create mode 100644 tests/unit/tools/test_agent_os_facade_coordinates.py diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py index 69696cdd..3de94ac8 100644 --- a/src/askui/models/shared/coordinate_space.py +++ b/src/askui/models/shared/coordinate_space.py @@ -28,6 +28,16 @@ class VlmCoordinateSpace(BaseModel, ABC): the matching prompt section. """ + @property + def maps_to_screenshot_pixels(self) -> bool: + """Whether model coordinates are absolute pixels in the screenshot image. + + When ``True``, coordinates need padding-aware inverse scaling + (screenshot space to device space). When ``False``, coordinates + are in a normalised grid and map directly to device resolution. + """ + return False + @abstractmethod def map_to_target( self, x: float, y: float, target_resolution: tuple[int, int] @@ -46,6 +56,10 @@ class PixelCoordinateSpace(VlmCoordinateSpace): screenshot resolution. """ + @property + def maps_to_screenshot_pixels(self) -> bool: + return True + def map_to_target( self, x: float, diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py index f0374036..02aa9c7c 100644 --- a/src/askui/tools/android/agent_os_facade.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -55,17 +55,26 @@ def _scale_coordinates( if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.screenshot().size - mapped_x, mapped_y = ( - self._coordinate_space.map_to_target(x, y, self._target_resolution) - if from_agent - else (int(x), int(y)) - ) + if from_agent: + if self._coordinate_space.maps_to_screenshot_pixels: + mapped_x, mapped_y = self._coordinate_space.map_to_target( + x, y, self._target_resolution + ) + return scale_coordinates( + (mapped_x, mapped_y), + self._real_screen_resolution, + self._target_resolution, + inverse=True, + ) + return self._coordinate_space.map_to_target( + x, y, self._real_screen_resolution + ) return scale_coordinates( - (mapped_x, mapped_y), + (int(x), int(y)), self._real_screen_resolution, self._target_resolution, - inverse=from_agent, + inverse=False, ) def tap(self, x: float, y: float) -> None: diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index c91a2c84..6f7cc75b 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -319,16 +319,29 @@ def _scale_coordinates_back( if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.retrieve_active_display().size - mapped_x, mapped_y = ( - self._coordinate_space.map_to_target(x, y, self._target_resolution) - if from_agent - else (int(x), int(y)) + real_size = ( + self._real_screen_resolution.width, + self._real_screen_resolution.height, ) + if from_agent: + if self._coordinate_space.maps_to_screenshot_pixels: + mapped_x, mapped_y = self._coordinate_space.map_to_target( + x, y, self._target_resolution + ) + return scale_coordinates( + (mapped_x, mapped_y), + real_size, + self._target_resolution, + inverse=True, + check_coordinates_in_bounds=check_coordinates_in_bounds, + ) + return self._coordinate_space.map_to_target(x, y, real_size) + return scale_coordinates( - (mapped_x, mapped_y), - (self._real_screen_resolution.width, self._real_screen_resolution.height), + (int(x), int(y)), + real_size, self._target_resolution, - inverse=from_agent, + inverse=False, check_coordinates_in_bounds=check_coordinates_in_bounds, ) diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py index 5d6f7c42..3e4f8500 100644 --- a/src/askui/tools/playwright/agent_os_facade.py +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -61,17 +61,26 @@ def _scale_coordinates( report=False, ).size - mapped_x, mapped_y = ( - self._coordinate_space.map_to_target(x, y, self._target_resolution) - if from_agent - else (int(x), int(y)) - ) + if from_agent: + if self._coordinate_space.maps_to_screenshot_pixels: + mapped_x, mapped_y = self._coordinate_space.map_to_target( + x, y, self._target_resolution + ) + return scale_coordinates( + (mapped_x, mapped_y), + self._real_screen_resolution, + self._target_resolution, + inverse=True, + ) + return self._coordinate_space.map_to_target( + x, y, self._real_screen_resolution + ) return scale_coordinates( - (mapped_x, mapped_y), + (int(x), int(y)), self._real_screen_resolution, self._target_resolution, - inverse=from_agent, + inverse=False, ) def mouse_move(self, x: float, y: float, duration: int = 500) -> None: diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py index 7a5a2a87..1d33f1d5 100644 --- a/tests/unit/model_providers/test_openai_vlm_provider.py +++ b/tests/unit/model_providers/test_openai_vlm_provider.py @@ -138,6 +138,20 @@ def test_includes_padding_and_origin_info(self) -> None: assert "top-left" in result +class TestMapsToScreenshotPixels: + def test_pixel_returns_true(self) -> None: + assert PixelCoordinateSpace().maps_to_screenshot_pixels is True + + def test_scaled_returns_false(self) -> None: + assert ( + ScaledCoordinateSpace(width=1000, height=1000).maps_to_screenshot_pixels + is False + ) + + def test_normalized_returns_false(self) -> None: + assert NormalizedCoordinateSpace().maps_to_screenshot_pixels is False + + class TestMapToTarget: def test_pixel_identity(self) -> None: cs = PixelCoordinateSpace() diff --git a/tests/unit/tools/test_agent_os_facade_coordinates.py b/tests/unit/tools/test_agent_os_facade_coordinates.py new file mode 100644 index 00000000..e81e8214 --- /dev/null +++ b/tests/unit/tools/test_agent_os_facade_coordinates.py @@ -0,0 +1,141 @@ +"""Tests for coordinate mapping in agent OS facades. + +Verifies that non-pixel coordinate spaces (Qwen 0-1000, Kimi 0.0-1.0) +map directly to device resolution, bypassing the padded screenshot space. +""" + +from unittest.mock import MagicMock + +import pytest +from PIL import Image + +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, +) +from askui.tools.android.agent_os_facade import AndroidAgentOsFacade + + +def _make_android_facade( + device_size: tuple[int, int], + coordinate_space: PixelCoordinateSpace + | ScaledCoordinateSpace + | NormalizedCoordinateSpace, +) -> AndroidAgentOsFacade: + """Create an AndroidAgentOsFacade with a mocked agent OS.""" + mock_os = MagicMock() + mock_os.tags = [] + mock_os.screenshot.return_value = Image.new("RGB", device_size) + facade = AndroidAgentOsFacade(mock_os, coordinate_space=coordinate_space) + facade._real_screen_resolution = device_size + return facade + + +class TestScaledCoordinateSpaceTallDevice: + """Qwen 0-1000 grid on a tall Android device (1080x2400). + + The screenshot is scaled to 345x768 with 339px horizontal padding, + so the old code would produce negative x when x_model < ~331. + """ + + device = (1080, 2400) + cs = ScaledCoordinateSpace(width=1000, height=1000) + + def test_center_tap(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x, y = facade._scale_coordinates(500, 500) + assert (x, y) == (540, 1200) + + def test_left_side_tap(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x, y = facade._scale_coordinates(200, 500) + assert (x, y) == (216, 1200) + + def test_swipe_across(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x1, y1 = facade._scale_coordinates(500, 500) + x2, y2 = facade._scale_coordinates(200, 500) + assert (x1, y1) == (540, 1200) + assert (x2, y2) == (216, 1200) + + def test_origin(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x, y = facade._scale_coordinates(0, 0) + assert (x, y) == (0, 0) + + def test_max_corner(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x, y = facade._scale_coordinates(1000, 1000) + assert (x, y) == (1080, 2400) + + +class TestNormalizedCoordinateSpaceTallDevice: + """Kimi 0.0-1.0 grid on a tall Android device (1080x2400).""" + + device = (1080, 2400) + cs = NormalizedCoordinateSpace() + + def test_center_tap(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x, y = facade._scale_coordinates(0.5, 0.5) + assert (x, y) == (540, 1200) + + def test_left_side_tap(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x, y = facade._scale_coordinates(0.2, 0.5) + assert (x, y) == (216, 1200) + + +class TestPixelCoordinateSpaceTallDevice: + """Claude pixel coordinates on a tall Android device (1080x2400). + + Pixel coordinates are in the padded 1024x768 screenshot space + and must go through the padding-aware inverse scaling pipeline. + """ + + device = (1080, 2400) + cs = PixelCoordinateSpace() + + def test_center_of_content(self) -> None: + """The center of the content area in the padded screenshot.""" + facade = _make_android_facade(self.device, self.cs) + # Content area: x=[339..684], y=[0..768] in 1024x768 screenshot + # Center of content: x=511, y=384 + x, y = facade._scale_coordinates(511, 384) + # (511 - 339) / 0.32 = 537.5 → 537, (384 - 0) / 0.32 = 1200 + assert x == pytest.approx(537, abs=2) + assert y == 1200 + + def test_top_left_of_content(self) -> None: + """Top-left corner of the content area.""" + facade = _make_android_facade(self.device, self.cs) + # Content starts at x=339 in the padded screenshot + x, y = facade._scale_coordinates(339, 0) + assert x == pytest.approx(0, abs=2) + assert y == 0 + + +class TestSquareDevice: + """Verify no regression on a device with matching aspect ratio.""" + + device = (1024, 768) + cs = ScaledCoordinateSpace(width=1000, height=1000) + + def test_center(self) -> None: + facade = _make_android_facade(self.device, self.cs) + x, y = facade._scale_coordinates(500, 500) + assert (x, y) == (512, 384) + + +class TestFromAgentFalse: + """from_agent=False always maps device → screenshot pixel space.""" + + def test_device_to_screenshot_scaled_space(self) -> None: + facade = _make_android_facade( + (1080, 2400), ScaledCoordinateSpace(width=1000, height=1000) + ) + x, y = facade._scale_coordinates(540, 1200, from_agent=False) + # Forward scaling: (540 * 0.32 + 339, 1200 * 0.32 + 0) ≈ (512, 384) + assert x == pytest.approx(512, abs=2) + assert y == pytest.approx(384, abs=2) From 395d00d0775d262dacb4e91d01c4586fd444439a Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Thu, 11 Jun 2026 14:25:22 +0200 Subject: [PATCH 3/6] feat: provider-owned image scaling with configurable max_image_edge Replace fixed SCREENSHOT_RESOLUTION constant with per-provider image scalers. Each VlmProvider now owns an ImageScaler callable and exposes max_image_edge (also via ASKUI_VLM_MAX_IMAGE_EDGE env var). Facades derive target_resolution dynamically from scaler output. --- src/askui/android_agent.py | 1 + src/askui/computer_agent.py | 1 + .../model_providers/anthropic_vlm_provider.py | 36 ++++ .../model_providers/askui_vlm_provider.py | 37 ++++ .../model_providers/ollama_vlm_provider.py | 11 ++ .../openai_compatible_vlm_provider.py | 11 ++ .../model_providers/openai_vlm_provider.py | 39 ++++- src/askui/model_providers/vlm_provider.py | 17 ++ src/askui/models/anthropic/get_model.py | 11 +- .../locate_models/anthropic_locate_model.py | 19 ++- src/askui/models/shared/__init__.py | 2 + src/askui/models/shared/coordinate_space.py | 48 ++---- src/askui/models/shared/image_scaler.py | 8 + src/askui/tools/android/agent_os_facade.py | 33 ++-- src/askui/tools/computer_agent_os_facade.py | 31 ++-- src/askui/tools/playwright/agent_os_facade.py | 36 ++-- .../tools/store/universal/load_image_tool.py | 10 +- src/askui/utils/llm_image_utils.py | 161 ++++++++++++++++++ src/askui/web_agent.py | 1 + .../test_openai_vlm_provider.py | 61 ++++--- .../tools/test_agent_os_facade_coordinates.py | 61 ++++--- tests/unit/utils/test_llm_image_utils.py | 145 ++++++++++++++++ 22 files changed, 649 insertions(+), 131 deletions(-) create mode 100644 src/askui/models/shared/image_scaler.py create mode 100644 src/askui/utils/llm_image_utils.py create mode 100644 tests/unit/utils/test_llm_image_utils.py diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 7b7818f8..29b96a15 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -99,6 +99,7 @@ def __init__( self.act_agent_os_facade = AndroidAgentOsFacade( self.os, coordinate_space=self._vlm_provider.coordinate_space, + image_scaler=self._vlm_provider.image_scaler, ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with Android-specific settings diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py index 6e53df87..7f121dbf 100644 --- a/src/askui/computer_agent.py +++ b/src/askui/computer_agent.py @@ -132,6 +132,7 @@ def __init__( self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade( self.tools.os, coordinate_space=self._vlm_provider.coordinate_space, + image_scaler=self._vlm_provider.image_scaler, ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with computer-specific settings diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py index 9edd42b9..37ca9a5d 100644 --- a/src/askui/model_providers/anthropic_vlm_provider.py +++ b/src/askui/model_providers/anthropic_vlm_provider.py @@ -5,6 +5,7 @@ from typing import Any from anthropic import Anthropic +from PIL import Image from typing_extensions import override from askui.model_providers.vlm_provider import VlmProvider @@ -14,11 +15,25 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "claude-sonnet-4-6" +_DEFAULT_MAX_IMAGE_EDGE = 1568 + + +def _anthropic_image_scaler(image: Image.Image, max_edge: int) -> Image.Image: + target = compute_patch_optimized_size( + image.width, + image.height, + max_edge=max_edge, + max_tokens=1568, + patch_size=28, + ) + return resize_image(image, target) class AnthropicVlmProvider(VlmProvider): @@ -46,6 +61,11 @@ class AnthropicVlmProvider(VlmProvider): cost in USD per 1M output tokens. cache_write_cost_per_million_tokens (float | None, optional): Override cost in USD per 1M cache write input tokens. + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, uses Anthropic-optimized patch-based scaling. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Defaults to 1568. cache_read_cost_per_million_tokens (float | None, optional): Override cost in USD per 1M cache read input tokens. @@ -70,6 +90,8 @@ def __init__( auth_token: str | None = None, model_id: str | None = None, client: Anthropic | None = None, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, input_cost_per_million_tokens: float | None = None, output_cost_per_million_tokens: float | None = None, cache_write_cost_per_million_tokens: float | None = None, @@ -78,6 +100,12 @@ def __init__( self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) + self._image_scaler_override = image_scaler + self._max_edge = ( + max_image_edge + or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0")) + or _DEFAULT_MAX_IMAGE_EDGE + ) if client is not None: self.client = client else: @@ -104,6 +132,14 @@ def model_id(self) -> str: def pricing(self) -> ModelPricing | None: return self._pricing + @property + @override + def image_scaler(self) -> ImageScaler: + if self._image_scaler_override is not None: + return self._image_scaler_override + max_edge = self._max_edge + return lambda image: _anthropic_image_scaler(image, max_edge) + @cached_property def _messages_api(self) -> AnthropicMessagesApi: """Lazily initialise the AnthropicMessagesApi on first use.""" diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py index d149deff..c8990a6c 100644 --- a/src/askui/model_providers/askui_vlm_provider.py +++ b/src/askui/model_providers/askui_vlm_provider.py @@ -5,6 +5,7 @@ from typing import Any from anthropic import Anthropic +from PIL import Image from typing_extensions import override from askui.model_providers.vlm_provider import VlmProvider @@ -15,10 +16,24 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image _DEFAULT_MODEL_ID = "claude-sonnet-4-6" +_DEFAULT_MAX_IMAGE_EDGE = 1568 + + +def _askui_image_scaler(image: Image.Image, max_edge: int) -> Image.Image: + target = compute_patch_optimized_size( + image.width, + image.height, + max_edge=max_edge, + max_tokens=1568, + patch_size=28, + ) + return resize_image(image, target) class AskUIVlmProvider(VlmProvider): @@ -37,6 +52,12 @@ class AskUIVlmProvider(VlmProvider): `"claude-sonnet-4-6"`. client (Anthropic | None, optional): Pre-configured Anthropic client. If provided, `workspace_id` and `token` are ignored. + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, uses Anthropic-optimized patch-based scaling. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Defaults to 1568. + Example: ```python from askui import AgentSettings, ComputerAgent @@ -57,18 +78,34 @@ def __init__( askui_settings: AskUiInferenceApiSettings | None = None, model_id: str | None = None, client: Anthropic | None = None, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, ) -> None: self._askui_settings = askui_settings or AskUiInferenceApiSettings() self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) self._injected_client = client + self._image_scaler_override = image_scaler + self._max_edge = ( + max_image_edge + or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0")) + or _DEFAULT_MAX_IMAGE_EDGE + ) @property @override def model_id(self) -> str: return self._model_id_value + @property + @override + def image_scaler(self) -> ImageScaler: + if self._image_scaler_override is not None: + return self._image_scaler_override + max_edge = self._max_edge + return lambda image: _askui_image_scaler(image, max_edge) + @cached_property def _messages_api(self) -> AnthropicMessagesApi: """Lazily initialise the AnthropicMessagesApi on first use.""" diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py index 1cca3905..c53103ed 100644 --- a/src/askui/model_providers/ollama_vlm_provider.py +++ b/src/askui/model_providers/ollama_vlm_provider.py @@ -10,6 +10,7 @@ ScaledCoordinateSpace, VlmCoordinateSpace, ) +from askui.models.shared.image_scaler import ImageScaler _DEFAULT_BASE_URL = "http://localhost:11434/v1" _DEFAULT_MODEL_ID = "qwen3.5" @@ -40,6 +41,12 @@ class OllamaVlmProvider(OpenAIVlmProvider): coordinate_space (VlmCoordinateSpace | None, optional): The coordinate grid the model emits coordinates in. ``None`` (the default) enables auto-detection based on ``model_id``. + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, inherits from `OpenAIVlmProvider`. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Inherits the default from + `OpenAIVlmProvider` (2048). Example: ```python @@ -60,6 +67,8 @@ def __init__( base_url: str = _DEFAULT_BASE_URL, client: OpenAI | None = None, coordinate_space: VlmCoordinateSpace | None = None, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, ) -> None: self._coordinate_space_override = coordinate_space super().__init__( @@ -68,6 +77,8 @@ def __init__( base_url=base_url, client=client, coordinate_space=coordinate_space or PixelCoordinateSpace(), + image_scaler=image_scaler, + max_image_edge=max_image_edge, ) @property diff --git a/src/askui/model_providers/openai_compatible_vlm_provider.py b/src/askui/model_providers/openai_compatible_vlm_provider.py index aae55c11..a574913b 100644 --- a/src/askui/model_providers/openai_compatible_vlm_provider.py +++ b/src/askui/model_providers/openai_compatible_vlm_provider.py @@ -4,6 +4,7 @@ from openai import OpenAI from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider +from askui.models.shared.image_scaler import ImageScaler class OpenAICompatibleVlmProvider(OpenAIVlmProvider): @@ -20,6 +21,12 @@ class OpenAICompatibleVlmProvider(OpenAIVlmProvider): (e.g. ``"https://my-host/v1/chat/completions"``). model_id (str): Model name expected by the deployment. api_key (str | None, optional): API key for the endpoint. + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, inherits from `OpenAIVlmProvider`. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Inherits the default from + `OpenAIVlmProvider` (2048). Example: ```python @@ -41,6 +48,8 @@ def __init__( endpoint_url: str, model_id: str | None = None, api_key: str | None = None, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, ) -> None: def _rewrite_url(request: httpx.Request) -> None: request.url = httpx.URL(endpoint_url) @@ -56,4 +65,6 @@ def _rewrite_url(request: httpx.Request) -> None: super().__init__( model_id=model_id, client=client, + image_scaler=image_scaler, + max_image_edge=max_image_edge, ) diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py index 8ac5f6a6..693451b2 100644 --- a/src/askui/model_providers/openai_vlm_provider.py +++ b/src/askui/model_providers/openai_vlm_provider.py @@ -5,6 +5,7 @@ from typing import Any from openai import OpenAI +from PIL import Image from typing_extensions import override from askui.model_providers.vlm_provider import VlmProvider @@ -15,16 +16,29 @@ ToolChoiceParam, ) from askui.models.shared.coordinate_space import ( - SCREENSHOT_RESOLUTION, PixelCoordinateSpace, VlmCoordinateSpace, ) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "gpt-5.4" _DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() +_DEFAULT_MAX_IMAGE_EDGE = 2048 + + +def _openai_image_scaler(image: Image.Image, max_edge: int) -> Image.Image: + target = compute_patch_optimized_size( + image.width, + image.height, + max_edge=max_edge, + max_tokens=1536, + patch_size=32, + ) + return resize_image(image, target) class OpenAIVlmProvider(VlmProvider): @@ -45,6 +59,9 @@ class OpenAIVlmProvider(VlmProvider): coordinate_space (VlmCoordinateSpace, optional): The coordinate grid the model emits coordinates in. Defaults to the screenshot resolution (native pixel coordinates). + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Defaults to 2048. Example: ```python @@ -67,6 +84,8 @@ def __init__( base_url: str | None = None, client: OpenAI | None = None, coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, input_cost_per_million_tokens: float | None = None, output_cost_per_million_tokens: float | None = None, cache_write_cost_per_million_tokens: float | None = None, @@ -76,6 +95,12 @@ def __init__( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) self._coordinate_space = coordinate_space + self._image_scaler_override = image_scaler + self._max_edge = ( + max_image_edge + or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0")) + or _DEFAULT_MAX_IMAGE_EDGE + ) if client is not None: self._client = client else: @@ -107,6 +132,14 @@ def coordinate_space(self) -> VlmCoordinateSpace: def pricing(self) -> ModelPricing | None: return self._pricing + @property + @override + def image_scaler(self) -> ImageScaler: + if self._image_scaler_override is not None: + return self._image_scaler_override + max_edge = self._max_edge + return lambda image: _openai_image_scaler(image, max_edge) + @cached_property def _messages_api(self) -> OpenAIMessagesApi: """Lazily initialise the `OpenAIMessagesApi` on first use.""" @@ -115,9 +148,7 @@ def _messages_api(self) -> OpenAIMessagesApi: @override def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: """Append coordinate and resolution info to the system prompt.""" - coord_info = self.coordinate_space.build_prompt_section( - screenshot_resolution=SCREENSHOT_RESOLUTION, - ) + coord_info = self.coordinate_space.build_prompt_section() return SystemPrompt(prompt=f"{str(system)}\n\n{coord_info}") @override diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py index 6d4d9738..5cea3284 100644 --- a/src/askui/model_providers/vlm_provider.py +++ b/src/askui/model_providers/vlm_provider.py @@ -3,6 +3,8 @@ from abc import ABC, abstractmethod from typing import Any +from PIL import Image + from askui.models.shared.agent_message_param import ( MessageParam, ThinkingConfigParam, @@ -12,13 +14,20 @@ PixelCoordinateSpace, VlmCoordinateSpace, ) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_contained_size, resize_image from askui.utils.model_pricing import ModelPricing _DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() +def _default_image_scaler(image: Image.Image) -> Image.Image: + target = compute_contained_size(image.width, image.height) + return resize_image(image, target) + + class VlmProvider(ABC): """Interface for Vision Language Model providers. @@ -70,6 +79,14 @@ def pricing(self) -> ModelPricing | None: """ return None + @property + def image_scaler(self) -> ImageScaler: + """Callable that preprocesses a screenshot before sending to the model. + + Override in subclasses for provider-specific sizing. + """ + return _default_image_scaler + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: """Hook for providers to augment the system prompt before sending. diff --git a/src/askui/models/anthropic/get_model.py b/src/askui/models/anthropic/get_model.py index 7bed5627..421126e6 100644 --- a/src/askui/models/anthropic/get_model.py +++ b/src/askui/models/anthropic/get_model.py @@ -20,7 +20,7 @@ from askui.models.types.response_schemas import ResponseSchema from askui.prompts.get_prompts import SYSTEM_PROMPT_GET from askui.utils.excel_utils import OfficeDocumentSource -from askui.utils.image_utils import scale_image_to_fit +from askui.utils.llm_image_utils import compute_contained_size, resize_image from askui.utils.pdf_utils import PdfSource from askui.utils.source_utils import Source @@ -78,10 +78,13 @@ def get( if response_schema is not None: error_msg = "Response schema is not yet supported for Anthropic" raise NotImplementedError(error_msg) - scaled_image = scale_image_to_fit( - source.root, - get_settings.resolution, + target_size = compute_contained_size( + source.root.width, + source.root.height, + get_settings.resolution.width, + get_settings.resolution.height, ) + scaled_image = resize_image(source.root, target_size) messages = built_messages_for_get_and_locate(scaled_image, query) message = self._messages_api.create_message( messages=messages, diff --git a/src/askui/models/askui/locate_models/anthropic_locate_model.py b/src/askui/models/askui/locate_models/anthropic_locate_model.py index d2b78c27..3856b8f7 100644 --- a/src/askui/models/askui/locate_models/anthropic_locate_model.py +++ b/src/askui/models/askui/locate_models/anthropic_locate_model.py @@ -20,8 +20,8 @@ from askui.utils.image_utils import ( ImageSource, scale_coordinates, - scale_image_to_fit, ) +from askui.utils.llm_image_utils import compute_contained_size, resize_image class AnthropicLocateModel(LocateModel): @@ -77,14 +77,17 @@ def locate( try: prompt = f"Click on {locator_serialized}" resolution = locate_settings.resolution - screen_width = resolution.width - screen_height = resolution.height - scaled_image = scale_image_to_fit( - image.root, - resolution, + target_size = compute_contained_size( + image.root.width, + image.root.height, + resolution.width, + resolution.height, ) + scaled_image = resize_image(image.root, target_size) messages = built_messages_for_get_and_locate(scaled_image, prompt) - system = build_system_prompt_locate(str(screen_width), str(screen_height)) + system = build_system_prompt_locate( + str(scaled_image.width), str(scaled_image.height) + ) message = self._messages_api.create_message( messages=messages, model_id=self._model_id, @@ -100,7 +103,7 @@ def locate( scale_coordinates( extract_click_coordinates(content_text.text), image.root.size, - resolution, + scaled_image.size, inverse=True, ) ] diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py index 635fc053..cc225d6e 100644 --- a/src/askui/models/shared/__init__.py +++ b/src/askui/models/shared/__init__.py @@ -6,6 +6,7 @@ ScaledCoordinateSpace, VlmCoordinateSpace, ) +from .image_scaler import ImageScaler from .tool_tags import ToolTags try: @@ -19,6 +20,7 @@ __all__ = [ "AndroidBaseTool", "ComputerBaseTool", + "ImageScaler", "NormalizedCoordinateSpace", "PixelCoordinateSpace", "ScaledCoordinateSpace", diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py index 3de94ac8..c2cd7c71 100644 --- a/src/askui/models/shared/coordinate_space.py +++ b/src/askui/models/shared/coordinate_space.py @@ -4,20 +4,9 @@ from pydantic import BaseModel, Field -# The resolution screenshots are scaled to before being sent to the model. -# Used by all agent OS facades (computer, Android, Playwright). -SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768) - -def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]: - sw, sh = screenshot_resolution - return [ - f"* Screenshot resolution: {sw}x{sh} pixels", - "* Screenshots may contain black padding bars to preserve the " - "original aspect ratio. UI elements are NOT located in the " - "padding area.", - "* Coordinate origin is the top-left corner (0, 0)", - ] +def _common_prompt_lines() -> list[str]: + return ["* Coordinate origin is the top-left corner (0, 0)"] class VlmCoordinateSpace(BaseModel, ABC): @@ -45,7 +34,7 @@ def map_to_target( """Map model coordinates to pixel coordinates in *target_resolution*.""" @abstractmethod - def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + def build_prompt_section(self) -> str: """Build prompt text describing coordinate bounds for the model.""" @@ -68,10 +57,11 @@ def map_to_target( ) -> tuple[int, int]: return int(x), int(y) - def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: - sw, sh = screenshot_resolution - lines = _common_prompt_lines(screenshot_resolution) - lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}") + def build_prompt_section(self) -> str: + lines = _common_prompt_lines() + lines.append( + "* Coordinates are in pixel space matching the screenshot dimensions" + ) return "\n".join(lines) @@ -87,17 +77,13 @@ def map_to_target( tw, th = target_resolution return int(x * tw / self.width), int(y * th / self.height) - def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: - lines = _common_prompt_lines(screenshot_resolution) - if (self.width, self.height) != screenshot_resolution: - lines.append( - f"* Emit coordinates in a {self.width}x{self.height} " - f"normalised grid: 0 <= x < {self.width}, " - f"0 <= y < {self.height}" - ) - else: - sw, sh = screenshot_resolution - lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}") + def build_prompt_section(self) -> str: + lines = _common_prompt_lines() + lines.append( + f"* Emit coordinates in a {self.width}x{self.height} " + f"normalised grid: 0 <= x < {self.width}, " + f"0 <= y < {self.height}" + ) return "\n".join(lines) @@ -110,8 +96,8 @@ def map_to_target( tw, th = target_resolution return int(x * tw), int(y * th) - def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: - lines = _common_prompt_lines(screenshot_resolution) + def build_prompt_section(self) -> str: + lines = _common_prompt_lines() lines.append( "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0" ) diff --git a/src/askui/models/shared/image_scaler.py b/src/askui/models/shared/image_scaler.py new file mode 100644 index 00000000..3c579e81 --- /dev/null +++ b/src/askui/models/shared/image_scaler.py @@ -0,0 +1,8 @@ +"""Type alias for image scaling callables used by VLM providers.""" + +from collections.abc import Callable + +from PIL import Image + +ImageScaler = Callable[[Image.Image], Image.Image] +"""Callable that preprocesses a screenshot before sending to a model.""" diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py index 02aa9c7c..55132efa 100644 --- a/src/askui/tools/android/agent_os_facade.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -2,14 +2,12 @@ from PIL import Image -from askui.models.shared.coordinate_space import ( - SCREENSHOT_RESOLUTION, - VlmCoordinateSpace, -) +from askui.models.shared.coordinate_space import VlmCoordinateSpace +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.tool_tags import ToolTags from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay from askui.tools.android.uiautomator_hierarchy import UIElementCollection -from askui.utils.image_utils import scale_coordinates, scale_image_to_fit +from askui.utils.image_utils import scale_coordinates class AndroidAgentOsFacade(AndroidAgentOs): @@ -23,9 +21,11 @@ def __init__( self, agent_os: AndroidAgentOs, coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, ) -> None: self._agent_os: AndroidAgentOs = agent_os - self._target_resolution: Tuple[int, int] = SCREENSHOT_RESOLUTION + self._image_scaler = image_scaler + self._target_resolution: Optional[Tuple[int, int]] = None self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: Optional[Tuple[int, int]] = None self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] @@ -41,10 +41,15 @@ def disconnect(self) -> None: def screenshot(self) -> Image.Image: screenshot = self._agent_os.screenshot() self._real_screen_resolution = screenshot.size - return scale_image_to_fit( - screenshot, - self._target_resolution, - ) + scaled = self._image_scaler(screenshot) + self._target_resolution = scaled.size + return scaled + + def _ensure_target_resolution(self) -> Tuple[int, int]: + if self._target_resolution is None: + self.screenshot() + assert self._target_resolution is not None # noqa: S101 + return self._target_resolution def _scale_coordinates( self, @@ -55,15 +60,17 @@ def _scale_coordinates( if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.screenshot().size + target_resolution = self._ensure_target_resolution() + if from_agent: if self._coordinate_space.maps_to_screenshot_pixels: mapped_x, mapped_y = self._coordinate_space.map_to_target( - x, y, self._target_resolution + x, y, target_resolution ) return scale_coordinates( (mapped_x, mapped_y), self._real_screen_resolution, - self._target_resolution, + target_resolution, inverse=True, ) return self._coordinate_space.map_to_target( @@ -73,7 +80,7 @@ def _scale_coordinates( return scale_coordinates( (int(x), int(y)), self._real_screen_resolution, - self._target_resolution, + target_resolution, inverse=False, ) diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 6f7cc75b..63cc6dde 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -2,10 +2,8 @@ from PIL import Image -from askui.models.shared.coordinate_space import ( - SCREENSHOT_RESOLUTION, - VlmCoordinateSpace, -) +from askui.models.shared.coordinate_space import VlmCoordinateSpace +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import ( AgentOs, @@ -19,7 +17,7 @@ PcKey, ) from askui.tools.askui.askui_controller import RenderObjectStyle # noqa: TC001 -from askui.utils.image_utils import scale_coordinates, scale_image_to_fit +from askui.utils.image_utils import scale_coordinates if TYPE_CHECKING: from askui.tools.askui.askui_ui_controller_grpc.generated import ( @@ -44,9 +42,11 @@ def __init__( self, agent_os: AgentOs, coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION + self._image_scaler = image_scaler + self._target_resolution: tuple[int, int] | None = None self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: DisplaySize | None = None self.tags.append(ToolTags.SCALED_AGENT_OS.value) @@ -64,7 +64,9 @@ def screenshot(self, report: bool = True) -> Image.Image: self._real_screen_resolution = DisplaySize( width=screenshot.width, height=screenshot.height ) - return scale_image_to_fit(screenshot, self._target_resolution) + scaled = self._image_scaler(screenshot) + self._target_resolution = scaled.size + return scaled def mouse_move(self, x: float, y: float, duration: int = 500) -> None: scaled_x, scaled_y = self._scale_coordinates_back(x, y) @@ -299,7 +301,7 @@ def get_file(self, path: str) -> Image.Image | str: """ response = self._agent_os.get_file(path) if isinstance(response, Image.Image): - return scale_image_to_fit(response, self._target_resolution) + return self._image_scaler(response) return response def remove_virtual_displays(self) -> None: @@ -309,6 +311,12 @@ def remove_virtual_displays(self) -> None: self._agent_os.remove_virtual_displays() self._real_screen_resolution = None + def _ensure_target_resolution(self) -> tuple[int, int]: + if self._target_resolution is None: + self.screenshot(report=False) + assert self._target_resolution is not None # noqa: S101 + return self._target_resolution + def _scale_coordinates_back( self, x: float, @@ -319,6 +327,7 @@ def _scale_coordinates_back( if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.retrieve_active_display().size + target_resolution = self._ensure_target_resolution() real_size = ( self._real_screen_resolution.width, self._real_screen_resolution.height, @@ -327,12 +336,12 @@ def _scale_coordinates_back( if from_agent: if self._coordinate_space.maps_to_screenshot_pixels: mapped_x, mapped_y = self._coordinate_space.map_to_target( - x, y, self._target_resolution + x, y, target_resolution ) return scale_coordinates( (mapped_x, mapped_y), real_size, - self._target_resolution, + target_resolution, inverse=True, check_coordinates_in_bounds=check_coordinates_in_bounds, ) @@ -341,7 +350,7 @@ def _scale_coordinates_back( return scale_coordinates( (int(x), int(y)), real_size, - self._target_resolution, + target_resolution, inverse=False, check_coordinates_in_bounds=check_coordinates_in_bounds, ) diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py index 3e4f8500..a286476c 100644 --- a/src/askui/tools/playwright/agent_os_facade.py +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -2,35 +2,37 @@ from PIL import Image -from askui.models.shared.coordinate_space import ( - SCREENSHOT_RESOLUTION, - VlmCoordinateSpace, -) +from askui.models.shared.coordinate_space import VlmCoordinateSpace +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import Display, ModifierKey, PcKey from askui.tools.playwright.agent_os import PlaywrightAgentOs -from askui.utils.image_utils import scale_coordinates, scale_image_to_fit +from askui.utils.image_utils import scale_coordinates class PlaywrightAgentOsFacade(PlaywrightAgentOs): """Facade for `PlaywrightAgentOs` that adds coordinate scaling. - Screenshots are scaled down to a fixed target resolution so that the - AI model always sees a consistent image size. Coordinate-based inputs + Screenshots are scaled using the provider's image scaler so that the + AI model sees an optimally sized image. Coordinate-based inputs (``mouse_move``) are scaled back up to the real page resolution before being forwarded to the underlying agent OS. Args: agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap. + coordinate_space (VlmCoordinateSpace): Coordinate grid the model uses. + image_scaler (ImageScaler): Callable to preprocess screenshots. """ def __init__( self, agent_os: PlaywrightAgentOs, coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION + self._image_scaler = image_scaler + self._target_resolution: tuple[int, int] | None = None self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: tuple[int, int] | None = None self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] @@ -48,7 +50,15 @@ def disconnect(self) -> None: def screenshot(self, report: bool = True) -> Image.Image: screenshot = self._agent_os.screenshot(report=report) self._real_screen_resolution = screenshot.size - return scale_image_to_fit(screenshot, self._target_resolution) + scaled = self._image_scaler(screenshot) + self._target_resolution = scaled.size + return scaled + + def _ensure_target_resolution(self) -> tuple[int, int]: + if self._target_resolution is None: + self.screenshot(report=False) + assert self._target_resolution is not None # noqa: S101 + return self._target_resolution def _scale_coordinates( self, @@ -61,15 +71,17 @@ def _scale_coordinates( report=False, ).size + target_resolution = self._ensure_target_resolution() + if from_agent: if self._coordinate_space.maps_to_screenshot_pixels: mapped_x, mapped_y = self._coordinate_space.map_to_target( - x, y, self._target_resolution + x, y, target_resolution ) return scale_coordinates( (mapped_x, mapped_y), self._real_screen_resolution, - self._target_resolution, + target_resolution, inverse=True, ) return self._coordinate_space.map_to_target( @@ -79,7 +91,7 @@ def _scale_coordinates( return scale_coordinates( (int(x), int(y)), self._real_screen_resolution, - self._target_resolution, + target_resolution, inverse=False, ) diff --git a/src/askui/tools/store/universal/load_image_tool.py b/src/askui/tools/store/universal/load_image_tool.py index 5a0512e9..b763f2ee 100644 --- a/src/askui/tools/store/universal/load_image_tool.py +++ b/src/askui/tools/store/universal/load_image_tool.py @@ -4,7 +4,7 @@ from PIL import Image from askui.models.shared.tools import Tool -from askui.utils.image_utils import scale_image_to_fit +from askui.utils.llm_image_utils import compute_contained_size, resize_image class LoadImageTool(Tool): @@ -116,7 +116,13 @@ def __call__(self, image_path: str = "") -> Tuple[str, Image.Image]: raise FileExistsError(error_msg) image = Image.open(absolute_image_path) - image = scale_image_to_fit(image, target_size=self._target_size) + target_size = compute_contained_size( + image.width, + image.height, + self._target_size[0], + self._target_size[1], + ) + image = resize_image(image, target_size) return ( f"Image was successfully loaded from {absolute_image_path}", diff --git a/src/askui/utils/llm_image_utils.py b/src/askui/utils/llm_image_utils.py new file mode 100644 index 00000000..688854bc --- /dev/null +++ b/src/askui/utils/llm_image_utils.py @@ -0,0 +1,161 @@ +"""Image utilities for LLM vision model preprocessing. + +Functions for computing optimal image sizes based on patch-based token budgets +and resizing images for VLM consumption. +""" + +import logging +import math + +from PIL import Image + +logger = logging.getLogger(__name__) + + +def count_image_tokens(width: int, height: int, patch_size: int = 28) -> int: + """Count the number of tokens an image will consume in a patch-based VLM. + + Each non-overlapping ``patch_size x patch_size`` square maps to one token. + + Args: + width (int): Image width in pixels. + height (int): Image height in pixels. + patch_size (int): Side length of a single patch in pixels. + + Returns: + int: Number of image tokens. + """ + patches_w = math.ceil(width / patch_size) + patches_h = math.ceil(height / patch_size) + return patches_w * patches_h + + +def compute_patch_optimized_size( + width: int, + height: int, + max_edge: int = 1568, + max_tokens: int = 1568, + patch_size: int = 28, +) -> tuple[int, int]: + """Compute the largest aspect-preserving size within a patch-based token budget. + + Uses binary search to find the biggest scale factor such that: + - Neither dimension exceeds ``max_edge``. + - ``count_image_tokens(w, h, patch_size) <= max_tokens``. + + Args: + width (int): Original image width. + height (int): Original image height. + max_edge (int): Maximum allowed dimension (width or height). + max_tokens (int): Maximum allowed number of image tokens. + patch_size (int): Patch size used by the model. + + Returns: + tuple[int, int]: Target ``(width, height)``. + """ + if width <= 0 or height <= 0: + error_msg = f"Image dimensions must be positive, got {width}x{height}" + raise ValueError(error_msg) + + # If already within all constraints, return as-is + if ( + width <= max_edge + and height <= max_edge + and count_image_tokens(width, height, patch_size) <= max_tokens + ): + return width, height + + # Clamp to max_edge first + scale = min(max_edge / width, max_edge / height, 1.0) + + # Binary search for largest scale that fits within token budget + lo, hi = 0.0, scale + for _ in range(50): + mid = (lo + hi) / 2 + w = max(1, int(width * mid)) + h = max(1, int(height * mid)) + if count_image_tokens(w, h, patch_size) <= max_tokens: + lo = mid + else: + hi = mid + + result_w = max(1, int(width * lo)) + result_h = max(1, int(height * lo)) + return result_w, result_h + + +def compute_contained_size( + width: int, + height: int, + max_width: int = 1024, + max_height: int = 768, +) -> tuple[int, int]: + """Compute the largest aspect-preserving size contained within max bounds. + + If the image already fits, returns its original dimensions. + + Args: + width (int): Original image width. + height (int): Original image height. + max_width (int): Maximum allowed width. + max_height (int): Maximum allowed height. + + Returns: + tuple[int, int]: Target ``(width, height)``. + """ + if width <= 0 or height <= 0: + error_msg = f"Image dimensions must be positive, got {width}x{height}" + raise ValueError(error_msg) + + if width <= max_width and height <= max_height: + return width, height + + scale = min(max_width / width, max_height / height) + return max(1, int(width * scale)), max(1, int(height * scale)) + + +def resize_image(image: Image.Image, target_size: tuple[int, int]) -> Image.Image: + """Resize an image to exact ``target_size`` using LANCZOS resampling. + + Logs a warning if the aspect ratio changes by more than 1%. + + Args: + image (Image.Image): Source image. + target_size (tuple[int, int]): Target ``(width, height)``. + + Returns: + Image.Image: Resized image. + """ + if image.size == target_size: + return image + + src_ratio = image.width / image.height + dst_ratio = target_size[0] / target_size[1] + if abs(src_ratio - dst_ratio) / max(src_ratio, dst_ratio) > 0.01: + logger.warning( + "Aspect ratio change during resize: %.3f -> %.3f", + src_ratio, + dst_ratio, + ) + + return image.resize(target_size, Image.Resampling.LANCZOS) + + +def resize_and_pad_image( + image: Image.Image, + target_size: tuple[int, int], +) -> Image.Image: + """Resize preserving aspect ratio, then center on a padded canvas. + + Equivalent to the legacy ``scale_image_to_fit`` behaviour. + + Args: + image (Image.Image): Source image. + target_size (tuple[int, int]): Canvas ``(width, height)``. + + Returns: + Image.Image: Image centered on a ``target_size`` canvas. + """ + from askui.utils.image_utils import scale_image_to_fit + + return scale_image_to_fit(image, target_size) diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py index bc211ec9..d1c94232 100644 --- a/src/askui/web_agent.py +++ b/src/askui/web_agent.py @@ -72,6 +72,7 @@ def __init__( self.act_agent_os_facade = PlaywrightAgentOsFacade( self.os, coordinate_space=self._vlm_provider.coordinate_space, + image_scaler=self._vlm_provider.image_scaler, ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) self.act_settings = ActSettings( diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py index 1d33f1d5..8fb787ab 100644 --- a/tests/unit/model_providers/test_openai_vlm_provider.py +++ b/tests/unit/model_providers/test_openai_vlm_provider.py @@ -3,6 +3,7 @@ from unittest.mock import MagicMock from openai import OpenAI +from PIL import Image from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider from askui.models.shared.agent_message_param import MessageParam @@ -74,67 +75,77 @@ def test_augment_system_prompt_scaled_coordinate_space(self) -> None: rendered = str(augmented) assert "You are a helpful assistant." in rendered assert "1000x1000 normalised grid" in rendered - assert "1024x768" in rendered - def test_augment_system_prompt_pixel_bounds_when_matching(self) -> None: + def test_augment_system_prompt_pixel_coordinate_space(self) -> None: provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") system = SystemPrompt(prompt="Base prompt.") augmented = provider.augment_system_prompt(system) rendered = str(augmented) assert "normalised grid" not in rendered - assert "0 <= x < 1024" in rendered + assert "pixel space matching the screenshot dimensions" in rendered + + +class TestImageScaler: + def test_default_scaler_returns_valid_image(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + img = Image.new("RGB", (1920, 1080)) + scaled = provider.image_scaler(img) + assert scaled.width <= 2048 + assert scaled.height <= 2048 + + def test_custom_scaler_override(self) -> None: + def custom_scaler(image: Image.Image) -> Image.Image: + return image.resize((100, 100)) + + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + image_scaler=custom_scaler, + ) + img = Image.new("RGB", (1920, 1080)) + scaled = provider.image_scaler(img) + assert scaled.size == (100, 100) class TestPixelCoordinateSpacePrompt: - def test_shows_pixel_bounds(self) -> None: + def test_shows_pixel_space_description(self) -> None: cs = PixelCoordinateSpace() - result = cs.build_prompt_section((1024, 768)) - assert "0 <= x < 1024" in result - assert "0 <= y < 768" in result + result = cs.build_prompt_section() + assert "pixel space matching the screenshot dimensions" in result assert "normalised grid" not in result - def test_includes_padding_and_origin_info(self) -> None: + def test_includes_origin_info(self) -> None: cs = PixelCoordinateSpace() - result = cs.build_prompt_section((1024, 768)) - assert "black padding" in result + result = cs.build_prompt_section() assert "top-left" in result class TestScaledCoordinateSpacePrompt: def test_shows_normalised_grid(self) -> None: cs = ScaledCoordinateSpace(width=1000, height=1000) - result = cs.build_prompt_section((1024, 768)) - assert "1024x768" in result + result = cs.build_prompt_section() assert "1000x1000 normalised grid" in result assert "0 <= x < 1000" in result assert "0 <= y < 1000" in result - def test_matching_resolution_shows_pixel_bounds(self) -> None: - cs = ScaledCoordinateSpace(width=1024, height=768) - result = cs.build_prompt_section((1024, 768)) - assert "0 <= x < 1024" in result - assert "normalised grid" not in result - - def test_includes_padding_and_origin_info(self) -> None: + def test_includes_origin_info(self) -> None: cs = ScaledCoordinateSpace(width=1000, height=1000) - result = cs.build_prompt_section((1024, 768)) - assert "black padding" in result + result = cs.build_prompt_section() assert "top-left" in result class TestNormalizedCoordinateSpacePrompt: def test_shows_normalised_floats(self) -> None: cs = NormalizedCoordinateSpace() - result = cs.build_prompt_section((1024, 768)) + result = cs.build_prompt_section() assert "0.0 <= x <= 1.0" in result assert "0.0 <= y <= 1.0" in result assert "normalised floats" in result - def test_includes_padding_and_origin_info(self) -> None: + def test_includes_origin_info(self) -> None: cs = NormalizedCoordinateSpace() - result = cs.build_prompt_section((1024, 768)) - assert "black padding" in result + result = cs.build_prompt_section() assert "top-left" in result diff --git a/tests/unit/tools/test_agent_os_facade_coordinates.py b/tests/unit/tools/test_agent_os_facade_coordinates.py index e81e8214..3b9e1b89 100644 --- a/tests/unit/tools/test_agent_os_facade_coordinates.py +++ b/tests/unit/tools/test_agent_os_facade_coordinates.py @@ -15,6 +15,13 @@ ScaledCoordinateSpace, ) from askui.tools.android.agent_os_facade import AndroidAgentOsFacade +from askui.utils.llm_image_utils import compute_contained_size, resize_image + + +def _default_scaler(image: Image.Image) -> Image.Image: + """Scaler that mimics the default contained-size logic.""" + target = compute_contained_size(image.width, image.height, 1024, 768) + return resize_image(image, target) def _make_android_facade( @@ -27,16 +34,23 @@ def _make_android_facade( mock_os = MagicMock() mock_os.tags = [] mock_os.screenshot.return_value = Image.new("RGB", device_size) - facade = AndroidAgentOsFacade(mock_os, coordinate_space=coordinate_space) + facade = AndroidAgentOsFacade( + mock_os, + coordinate_space=coordinate_space, + image_scaler=_default_scaler, + ) facade._real_screen_resolution = device_size + # Set target resolution as the scaler would produce it + scaled = _default_scaler(Image.new("RGB", device_size)) + facade._target_resolution = scaled.size return facade class TestScaledCoordinateSpaceTallDevice: """Qwen 0-1000 grid on a tall Android device (1080x2400). - The screenshot is scaled to 345x768 with 339px horizontal padding, - so the old code would produce negative x when x_model < ~331. + Non-pixel coordinate spaces map directly to device resolution, + so no padding offset is involved. """ device = (1080, 2400) @@ -90,30 +104,33 @@ def test_left_side_tap(self) -> None: class TestPixelCoordinateSpaceTallDevice: """Claude pixel coordinates on a tall Android device (1080x2400). - Pixel coordinates are in the padded 1024x768 screenshot space - and must go through the padding-aware inverse scaling pipeline. + With the no-padding scaler, a 1080x2400 device is scaled to + compute_contained_size(1080, 2400, 1024, 768) = (345, 768). + Pixel coordinates are in the (345, 768) screenshot space and go + through the padding-aware inverse scaling pipeline. Because the + image nearly fills the target (only ~2 px rounding slack), offsets + are close to zero but not exactly zero. """ device = (1080, 2400) cs = PixelCoordinateSpace() def test_center_of_content(self) -> None: - """The center of the content area in the padded screenshot.""" + """The center of the content area in the scaled screenshot.""" facade = _make_android_facade(self.device, self.cs) - # Content area: x=[339..684], y=[0..768] in 1024x768 screenshot - # Center of content: x=511, y=384 - x, y = facade._scale_coordinates(511, 384) - # (511 - 339) / 0.32 = 537.5 → 537, (384 - 0) / 0.32 = 1200 - assert x == pytest.approx(537, abs=2) - assert y == 1200 - - def test_top_left_of_content(self) -> None: - """Top-left corner of the content area.""" + # Target resolution is (345, 768) — nearly no padding + x, y = facade._scale_coordinates(172, 384) + assert x == pytest.approx(538, abs=5) + assert y == pytest.approx(1200, abs=5) + + def test_near_top_left_of_content(self) -> None: + """Coordinate near top-left corner maps back close to origin.""" facade = _make_android_facade(self.device, self.cs) - # Content starts at x=339 in the padded screenshot - x, y = facade._scale_coordinates(339, 0) - assert x == pytest.approx(0, abs=2) - assert y == 0 + # Use (1, 2) instead of exact origin to avoid rounding-offset + # edge case that can produce small negative values. + x, y = facade._scale_coordinates(1, 2) + assert x == pytest.approx(3, abs=5) + assert y == pytest.approx(3, abs=5) class TestSquareDevice: @@ -136,6 +153,8 @@ def test_device_to_screenshot_scaled_space(self) -> None: (1080, 2400), ScaledCoordinateSpace(width=1000, height=1000) ) x, y = facade._scale_coordinates(540, 1200, from_agent=False) - # Forward scaling: (540 * 0.32 + 339, 1200 * 0.32 + 0) ≈ (512, 384) - assert x == pytest.approx(512, abs=2) + # Target resolution is (345, 768), no padding + # Forward scaling: factor = 768/2400 = 0.32 + # x = 540 * 0.32 = 172.8 → 172, y = 1200 * 0.32 = 384 + assert x == pytest.approx(172, abs=2) assert y == pytest.approx(384, abs=2) diff --git a/tests/unit/utils/test_llm_image_utils.py b/tests/unit/utils/test_llm_image_utils.py new file mode 100644 index 00000000..714d84c0 --- /dev/null +++ b/tests/unit/utils/test_llm_image_utils.py @@ -0,0 +1,145 @@ +"""Tests for LLM image utility functions.""" + +import logging + +import pytest +from PIL import Image + +from askui.utils.llm_image_utils import ( + compute_contained_size, + compute_patch_optimized_size, + count_image_tokens, + resize_and_pad_image, + resize_image, +) + + +class TestCountImageTokens: + def test_exact_patches(self) -> None: + # 56x56 with patch_size=28 → 2x2 = 4 tokens + assert count_image_tokens(56, 56, patch_size=28) == 4 + + def test_single_patch(self) -> None: + assert count_image_tokens(28, 28, patch_size=28) == 1 + + def test_partial_patches_round_up(self) -> None: + # 30x30 with patch_size=28 → ceil(30/28) * ceil(30/28) = 2*2 = 4 + assert count_image_tokens(30, 30, patch_size=28) == 4 + + def test_known_anthropic_value(self) -> None: + # 1568x1568 with patch_size=28 → 56*56 = 3136 + assert count_image_tokens(1568, 1568, patch_size=28) == 3136 + + def test_rectangular(self) -> None: + # 1024x768 with patch_size=28 → ceil(1024/28)*ceil(768/28) = 37*28 = 1036 + assert count_image_tokens(1024, 768, patch_size=28) == 37 * 28 + + +class TestComputePatchOptimizedSize: + def test_small_image_unchanged(self) -> None: + # A small image that fits within all constraints is returned as-is + w, h = compute_patch_optimized_size(200, 100) + assert w == 200 + assert h == 100 + + def test_respects_max_edge(self) -> None: + w, h = compute_patch_optimized_size(3000, 2000, max_edge=1568) + assert w <= 1568 + assert h <= 1568 + + def test_respects_max_tokens(self) -> None: + w, h = compute_patch_optimized_size( + 1920, 1080, max_edge=1568, max_tokens=1568, patch_size=28 + ) + tokens = count_image_tokens(w, h, patch_size=28) + assert tokens <= 1568 + + def test_preserves_aspect_ratio(self) -> None: + w, h = compute_patch_optimized_size(1920, 1080) + original_ratio = 1920 / 1080 + result_ratio = w / h + assert abs(original_ratio - result_ratio) / original_ratio < 0.02 + + def test_invalid_dimensions_raises(self) -> None: + with pytest.raises(ValueError, match="positive"): + compute_patch_optimized_size(0, 100) + + def test_openai_params(self) -> None: + w, h = compute_patch_optimized_size( + 1920, 1080, max_edge=2048, max_tokens=1536, patch_size=32 + ) + tokens = count_image_tokens(w, h, patch_size=32) + assert tokens <= 1536 + assert w <= 2048 + assert h <= 2048 + + +class TestComputeContainedSize: + def test_already_fits(self) -> None: + assert compute_contained_size(800, 600, 1024, 768) == (800, 600) + + def test_exact_match(self) -> None: + assert compute_contained_size(1024, 768, 1024, 768) == (1024, 768) + + def test_landscape_too_wide(self) -> None: + w, h = compute_contained_size(2048, 768, 1024, 768) + assert w <= 1024 + assert h <= 768 + + def test_portrait_too_tall(self) -> None: + w, h = compute_contained_size(768, 2048, 1024, 768) + assert w <= 1024 + assert h <= 768 + + def test_preserves_aspect_ratio(self) -> None: + w, h = compute_contained_size(1920, 1080, 1024, 768) + original_ratio = 1920 / 1080 + result_ratio = w / h + assert abs(original_ratio - result_ratio) / original_ratio < 0.02 + + def test_invalid_dimensions_raises(self) -> None: + with pytest.raises(ValueError, match="positive"): + compute_contained_size(0, 100) + + +class TestResizeImage: + def test_correct_dimensions(self) -> None: + img = Image.new("RGB", (1920, 1080)) + result = resize_image(img, (1024, 576)) + assert result.size == (1024, 576) + + def test_no_op_when_same_size(self) -> None: + img = Image.new("RGB", (1024, 768)) + result = resize_image(img, (1024, 768)) + assert result is img # Same object, no copy + + def test_aspect_ratio_warning_logged( + self, caplog: pytest.LogCaptureFixture + ) -> None: + img = Image.new("RGB", (1920, 1080)) + with caplog.at_level(logging.WARNING): + resize_image(img, (1024, 768)) + assert "Aspect ratio change" in caplog.text + + def test_no_warning_when_ratio_preserved( + self, caplog: pytest.LogCaptureFixture + ) -> None: + img = Image.new("RGB", (1920, 1080)) + with caplog.at_level(logging.WARNING): + resize_image(img, (960, 540)) + assert "Aspect ratio change" not in caplog.text + + +class TestResizeAndPadImage: + def test_correct_dimensions(self) -> None: + img = Image.new("RGB", (1920, 1080)) + result = resize_and_pad_image(img, (1024, 768)) + assert result.size == (1024, 768) + + def test_preserves_aspect_ratio_with_padding(self) -> None: + img = Image.new("RGB", (1080, 2400), color=(255, 0, 0)) + result = resize_and_pad_image(img, (1024, 768)) + assert result.size == (1024, 768) + # Check that some padding exists (black pixels at edges) + left_pixel = result.getpixel((0, 0)) + assert left_pixel == (0, 0, 0) # Black padding From 3665cc42646e8c5749247c9a23c2b3fdd9b5964f Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Thu, 11 Jun 2026 14:53:00 +0200 Subject: [PATCH 4/6] refactor: clean up PR (composition, deduplication, exports) --- src/askui/model_providers/__init__.py | 14 ++- .../model_providers/anthropic_vlm_provider.py | 16 +-- .../model_providers/askui_vlm_provider.py | 33 ++--- .../openai_compatible_vlm_provider.py | 11 ++ .../model_providers/openai_vlm_provider.py | 18 +-- src/askui/models/shared/__init__.py | 2 +- src/askui/tools/android/agent_os_facade.py | 115 +++++++----------- src/askui/tools/computer_agent_os_facade.py | 98 +++++---------- src/askui/tools/coordinate_scaling_mixin.py | 99 +++++++++++++++ src/askui/tools/playwright/agent_os_facade.py | 73 +++-------- src/askui/utils/llm_image_utils.py | 30 +++++ .../tools/test_agent_os_facade_coordinates.py | 28 ++--- 12 files changed, 271 insertions(+), 266 deletions(-) create mode 100644 src/askui/tools/coordinate_scaling_mixin.py diff --git a/src/askui/model_providers/__init__.py b/src/askui/model_providers/__init__.py index ae1f0d0d..9424577d 100644 --- a/src/askui/model_providers/__init__.py +++ b/src/askui/model_providers/__init__.py @@ -35,6 +35,13 @@ from askui.model_providers.openai_image_qa_provider import OpenAIImageQAProvider from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider from askui.model_providers.vlm_provider import VlmProvider +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) +from askui.models.shared.image_scaler import ImageScaler from askui.utils.model_pricing import ModelPricing __all__ = [ @@ -46,11 +53,16 @@ "DetectionProvider", "GoogleImageQAProvider", "ImageQAProvider", + "ImageScaler", "ModelPricing", + "NormalizedCoordinateSpace", "OllamaImageQAProvider", "OllamaVlmProvider", + "OpenAICompatibleVlmProvider", "OpenAIImageQAProvider", "OpenAIVlmProvider", - "OpenAICompatibleVlmProvider", + "PixelCoordinateSpace", + "ScaledCoordinateSpace", + "VlmCoordinateSpace", "VlmProvider", ] diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py index 37ca9a5d..f094f22c 100644 --- a/src/askui/model_providers/anthropic_vlm_provider.py +++ b/src/askui/model_providers/anthropic_vlm_provider.py @@ -5,7 +5,6 @@ from typing import Any from anthropic import Anthropic -from PIL import Image from typing_extensions import override from askui.model_providers.vlm_provider import VlmProvider @@ -18,24 +17,13 @@ from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection -from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image +from askui.utils.llm_image_utils import compute_patch_optimized_image from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "claude-sonnet-4-6" _DEFAULT_MAX_IMAGE_EDGE = 1568 -def _anthropic_image_scaler(image: Image.Image, max_edge: int) -> Image.Image: - target = compute_patch_optimized_size( - image.width, - image.height, - max_edge=max_edge, - max_tokens=1568, - patch_size=28, - ) - return resize_image(image, target) - - class AnthropicVlmProvider(VlmProvider): """VLM provider that routes requests directly to the Anthropic API. @@ -138,7 +126,7 @@ def image_scaler(self) -> ImageScaler: if self._image_scaler_override is not None: return self._image_scaler_override max_edge = self._max_edge - return lambda image: _anthropic_image_scaler(image, max_edge) + return lambda image: compute_patch_optimized_image(image, max_edge=max_edge) @cached_property def _messages_api(self) -> AnthropicMessagesApi: diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py index c8990a6c..f402ee3c 100644 --- a/src/askui/model_providers/askui_vlm_provider.py +++ b/src/askui/model_providers/askui_vlm_provider.py @@ -5,7 +5,6 @@ from typing import Any from anthropic import Anthropic -from PIL import Image from typing_extensions import override from askui.model_providers.vlm_provider import VlmProvider @@ -19,23 +18,12 @@ from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection -from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image +from askui.utils.llm_image_utils import compute_patch_optimized_image _DEFAULT_MODEL_ID = "claude-sonnet-4-6" _DEFAULT_MAX_IMAGE_EDGE = 1568 -def _askui_image_scaler(image: Image.Image, max_edge: int) -> Image.Image: - target = compute_patch_optimized_size( - image.width, - image.height, - max_edge=max_edge, - max_tokens=1568, - patch_size=28, - ) - return resize_image(image, target) - - class AskUIVlmProvider(VlmProvider): """VLM provider that routes requests through AskUI's hosted Anthropic proxy. @@ -44,14 +32,13 @@ class AskUIVlmProvider(VlmProvider): on the first API call, not at construction time. Args: - workspace_id (str | None, optional): AskUI workspace ID. Reads - `ASKUI_WORKSPACE_ID` from the environment if not provided. - token (str | None, optional): AskUI API token. Reads `ASKUI_TOKEN` - from the environment if not provided. - model_id (str, optional): Claude model to use. Defaults to - `"claude-sonnet-4-6"`. - client (Anthropic | None, optional): Pre-configured Anthropic client. - If provided, `workspace_id` and `token` are ignored. + askui_settings (`AskUiInferenceApiSettings` | None, optional): + Connection settings (workspace ID, token, base URL). Reads + from environment variables if not provided. + model_id (str | None, optional): Claude model to use. Defaults to + ``"claude-sonnet-4-6"``. + client (`Anthropic` | None, optional): Pre-configured Anthropic client. + If provided, ``askui_settings`` is only used for the base URL. image_scaler (`ImageScaler` | None, optional): Custom image preprocessing callable. If ``None``, uses Anthropic-optimized patch-based scaling. max_image_edge (int | None, optional): Maximum edge length (in pixels) @@ -65,8 +52,6 @@ class AskUIVlmProvider(VlmProvider): agent = ComputerAgent(settings=AgentSettings( vlm_provider=AskUIVlmProvider( - workspace_id="my-workspace", - token="my-token", model_id="claude-opus-4-6-20260401", ) )) @@ -104,7 +89,7 @@ def image_scaler(self) -> ImageScaler: if self._image_scaler_override is not None: return self._image_scaler_override max_edge = self._max_edge - return lambda image: _askui_image_scaler(image, max_edge) + return lambda image: compute_patch_optimized_image(image, max_edge=max_edge) @cached_property def _messages_api(self) -> AnthropicMessagesApi: diff --git a/src/askui/model_providers/openai_compatible_vlm_provider.py b/src/askui/model_providers/openai_compatible_vlm_provider.py index a574913b..98b50627 100644 --- a/src/askui/model_providers/openai_compatible_vlm_provider.py +++ b/src/askui/model_providers/openai_compatible_vlm_provider.py @@ -4,8 +4,14 @@ from openai import OpenAI from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + VlmCoordinateSpace, +) from askui.models.shared.image_scaler import ImageScaler +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() + class OpenAICompatibleVlmProvider(OpenAIVlmProvider): """VLM provider for OpenAI-compatible APIs that require an exact endpoint URL. @@ -21,6 +27,9 @@ class OpenAICompatibleVlmProvider(OpenAIVlmProvider): (e.g. ``"https://my-host/v1/chat/completions"``). model_id (str): Model name expected by the deployment. api_key (str | None, optional): API key for the endpoint. + coordinate_space (`VlmCoordinateSpace` | None, optional): The coordinate + grid the model emits coordinates in. If ``None``, inherits the + default from `OpenAIVlmProvider` (pixel coordinates). image_scaler (`ImageScaler` | None, optional): Custom image preprocessing callable. If ``None``, inherits from `OpenAIVlmProvider`. max_image_edge (int | None, optional): Maximum edge length (in pixels) @@ -48,6 +57,7 @@ def __init__( endpoint_url: str, model_id: str | None = None, api_key: str | None = None, + coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE, image_scaler: ImageScaler | None = None, max_image_edge: int | None = None, ) -> None: @@ -65,6 +75,7 @@ def _rewrite_url(request: httpx.Request) -> None: super().__init__( model_id=model_id, client=client, + coordinate_space=coordinate_space, image_scaler=image_scaler, max_image_edge=max_image_edge, ) diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py index 693451b2..bf25eb00 100644 --- a/src/askui/model_providers/openai_vlm_provider.py +++ b/src/askui/model_providers/openai_vlm_provider.py @@ -5,7 +5,6 @@ from typing import Any from openai import OpenAI -from PIL import Image from typing_extensions import override from askui.model_providers.vlm_provider import VlmProvider @@ -22,7 +21,7 @@ from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection -from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image +from askui.utils.llm_image_utils import compute_patch_optimized_image from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "gpt-5.4" @@ -30,17 +29,6 @@ _DEFAULT_MAX_IMAGE_EDGE = 2048 -def _openai_image_scaler(image: Image.Image, max_edge: int) -> Image.Image: - target = compute_patch_optimized_size( - image.width, - image.height, - max_edge=max_edge, - max_tokens=1536, - patch_size=32, - ) - return resize_image(image, target) - - class OpenAIVlmProvider(VlmProvider): """VLM provider for any OpenAI-compatible API. @@ -138,7 +126,9 @@ def image_scaler(self) -> ImageScaler: if self._image_scaler_override is not None: return self._image_scaler_override max_edge = self._max_edge - return lambda image: _openai_image_scaler(image, max_edge) + return lambda image: compute_patch_optimized_image( + image, max_edge=max_edge, max_tokens=1536, patch_size=32 + ) @cached_property def _messages_api(self) -> OpenAIMessagesApi: diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py index cc225d6e..84bd3d22 100644 --- a/src/askui/models/shared/__init__.py +++ b/src/askui/models/shared/__init__.py @@ -24,8 +24,8 @@ "NormalizedCoordinateSpace", "PixelCoordinateSpace", "ScaledCoordinateSpace", - "VlmCoordinateSpace", "ToolTags", + "VlmCoordinateSpace", ] if _PLAYWRIGHT_AVAILABLE: diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py index 55132efa..9fcfa25f 100644 --- a/src/askui/tools/android/agent_os_facade.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -1,20 +1,31 @@ -from typing import List, Optional, Tuple +from __future__ import annotations -from PIL import Image +from typing import TYPE_CHECKING -from askui.models.shared.coordinate_space import VlmCoordinateSpace -from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.tool_tags import ToolTags from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay -from askui.tools.android.uiautomator_hierarchy import UIElementCollection -from askui.utils.image_utils import scale_coordinates +from askui.tools.coordinate_scaling_mixin import CoordinateScaler + +if TYPE_CHECKING: + from PIL import Image + + from askui.models.shared.coordinate_space import VlmCoordinateSpace + from askui.models.shared.image_scaler import ImageScaler + from askui.tools.android.uiautomator_hierarchy import UIElementCollection class AndroidAgentOsFacade(AndroidAgentOs): - """ - Facade for AndroidAgentOs that adds coordinate scaling functionality. - It is used to scale the coordinates to the target resolution - and back to the real screen resolution. + """Facade for `AndroidAgentOs` that adds coordinate scaling. + + Screenshots are scaled using the provider's image scaler so that the + AI model sees an optimally sized image. Coordinate-based inputs + (``tap``, ``swipe``, ``drag_and_drop``) are scaled back up to the + real device resolution before being forwarded to the underlying agent OS. + + Args: + agent_os (`AndroidAgentOs`): The real Android agent OS to wrap. + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. """ def __init__( @@ -24,82 +35,42 @@ def __init__( image_scaler: ImageScaler, ) -> None: self._agent_os: AndroidAgentOs = agent_os - self._image_scaler = image_scaler - self._target_resolution: Optional[Tuple[int, int]] = None - self._coordinate_space: VlmCoordinateSpace = coordinate_space - self._real_screen_resolution: Optional[Tuple[int, int]] = None + self._scaler = CoordinateScaler( + coordinate_space=coordinate_space, + image_scaler=image_scaler, + fetch_real_resolution=lambda: self._agent_os.screenshot().size, + take_screenshot=lambda: self.screenshot(), + ) self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] def connect(self) -> None: self._agent_os.connect() - self._real_screen_resolution = self._agent_os.screenshot().size + self._scaler.real_screen_resolution = self._agent_os.screenshot().size def disconnect(self) -> None: self._agent_os.disconnect() - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def screenshot(self) -> Image.Image: screenshot = self._agent_os.screenshot() - self._real_screen_resolution = screenshot.size - scaled = self._image_scaler(screenshot) - self._target_resolution = scaled.size - return scaled - - def _ensure_target_resolution(self) -> Tuple[int, int]: - if self._target_resolution is None: - self.screenshot() - assert self._target_resolution is not None # noqa: S101 - return self._target_resolution - - def _scale_coordinates( - self, - x: float, - y: float, - from_agent: bool = True, - ) -> Tuple[int, int]: - if self._real_screen_resolution is None: - self._real_screen_resolution = self._agent_os.screenshot().size - - target_resolution = self._ensure_target_resolution() - - if from_agent: - if self._coordinate_space.maps_to_screenshot_pixels: - mapped_x, mapped_y = self._coordinate_space.map_to_target( - x, y, target_resolution - ) - return scale_coordinates( - (mapped_x, mapped_y), - self._real_screen_resolution, - target_resolution, - inverse=True, - ) - return self._coordinate_space.map_to_target( - x, y, self._real_screen_resolution - ) - - return scale_coordinates( - (int(x), int(y)), - self._real_screen_resolution, - target_resolution, - inverse=False, - ) + return self._scaler.scale_screenshot(screenshot) def tap(self, x: float, y: float) -> None: - x, y = self._scale_coordinates(x, y) + x, y = self._scaler.scale_coordinates(x, y) self._agent_os.tap(x, y) def swipe( self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: - x1, y1 = self._scale_coordinates(x1, y1) - x2, y2 = self._scale_coordinates(x2, y2) + x1, y1 = self._scaler.scale_coordinates(x1, y1) + x2, y2 = self._scaler.scale_coordinates(x2, y2) self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms) def drag_and_drop( self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: - x1, y1 = self._scale_coordinates(x1, y1) - x2, y2 = self._scale_coordinates(x2, y2) + x1, y1 = self._scaler.scale_coordinates(x1, y1) + x2, y2 = self._scaler.scale_coordinates(x2, y2) self._agent_os.drag_and_drop(x1, y1, x2, y2, duration_in_ms) def type(self, text: str) -> None: @@ -109,7 +80,7 @@ def key_tap(self, key: ANDROID_KEY) -> None: self._agent_os.key_tap(key) def key_combination( - self, keys: List[ANDROID_KEY], duration_in_ms: int = 100 + self, keys: list[ANDROID_KEY], duration_in_ms: int = 100 ) -> None: self._agent_os.key_combination(keys, duration_in_ms) @@ -121,27 +92,27 @@ def get_connected_displays(self) -> list[AndroidDisplay]: def set_display_by_index(self, display_index: int = 0) -> None: self._agent_os.set_display_by_index(display_index) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_display_by_unique_id(self, display_unique_id: int) -> None: self._agent_os.set_display_by_unique_id(display_unique_id) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_display_by_id(self, display_id: int) -> None: self._agent_os.set_display_by_id(display_id) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_display_by_name(self, display_name: str) -> None: self._agent_os.set_display_by_name(display_name) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_device_by_index(self, device_index: int = 0) -> None: self._agent_os.set_device_by_index(device_index) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_device_by_serial_number(self, device_sn: str) -> None: self._agent_os.set_device_by_serial_number(device_sn) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def get_connected_devices_serial_numbers(self) -> list[str]: return self._agent_os.get_connected_devices_serial_numbers() @@ -165,7 +136,7 @@ def get_ui_elements(self) -> UIElementCollection: if element.center is None: continue element.set_center( - self._scale_coordinates( + self._scaler.scale_coordinates( x=element.center[0], y=element.center[1], from_agent=False, diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 63cc6dde..848f6a10 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -9,7 +9,6 @@ AgentOs, Coordinate, Display, - DisplaySize, DisplaysListResponse, InputEvent, ModifierKey, @@ -17,7 +16,7 @@ PcKey, ) from askui.tools.askui.askui_controller import RenderObjectStyle # noqa: TC001 -from askui.utils.image_utils import scale_coordinates +from askui.tools.coordinate_scaling_mixin import CoordinateScaler if TYPE_CHECKING: from askui.tools.askui.askui_ui_controller_grpc.generated import ( @@ -31,11 +30,17 @@ class ComputerAgentOsFacade(AgentOs): - """ - Facade for AgentOs that adds coordinate scaling functionality. + """Facade for `AgentOs` that adds coordinate scaling. + + Screenshots are scaled using the provider's image scaler so that the + AI model sees an optimally sized image. Coordinate-based inputs + are scaled back up to the real screen resolution before being forwarded + to the underlying agent OS. - This class is used to scale the coordinates to the target resolution - and back to the real screen resolution. + Args: + agent_os (`AgentOs`): The real agent OS to wrap. + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. """ def __init__( @@ -45,42 +50,43 @@ def __init__( image_scaler: ImageScaler, ) -> None: self._agent_os = agent_os - self._image_scaler = image_scaler - self._target_resolution: tuple[int, int] | None = None - self._coordinate_space: VlmCoordinateSpace = coordinate_space - self._real_screen_resolution: DisplaySize | None = None + self._scaler = CoordinateScaler( + coordinate_space=coordinate_space, + image_scaler=image_scaler, + fetch_real_resolution=self._fetch_real_screen_resolution, + take_screenshot=lambda: self.screenshot(report=False), + ) self.tags.append(ToolTags.SCALED_AGENT_OS.value) def connect(self) -> None: self._agent_os.connect() - self._real_screen_resolution = self._agent_os.retrieve_active_display().size + self._scaler.real_screen_resolution = self._fetch_real_screen_resolution() def disconnect(self) -> None: self._agent_os.disconnect() - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def screenshot(self, report: bool = True) -> Image.Image: screenshot = self._agent_os.screenshot(report=report) - self._real_screen_resolution = DisplaySize( - width=screenshot.width, height=screenshot.height - ) - scaled = self._image_scaler(screenshot) - self._target_resolution = scaled.size - return scaled + return self._scaler.scale_screenshot(screenshot) + + def _fetch_real_screen_resolution(self) -> tuple[int, int]: + display = self._agent_os.retrieve_active_display() + return display.size.width, display.size.height def mouse_move(self, x: float, y: float, duration: int = 500) -> None: - scaled_x, scaled_y = self._scale_coordinates_back(x, y) + scaled_x, scaled_y = self._scaler.scale_coordinates(x, y) self._agent_os.mouse_move(scaled_x, scaled_y, duration) def get_mouse_position(self) -> Coordinate: mouse_position = self._agent_os.get_mouse_position() - scaled_x, scaled_y = self._scale_coordinates_back( + scaled_x, scaled_y = self._scaler.scale_coordinates( mouse_position.x, mouse_position.y, from_agent=False ) return Coordinate(x=scaled_x, y=scaled_y) def set_mouse_position(self, x: float, y: float) -> None: - scaled_x, scaled_y = self._scale_coordinates_back(x, y) + scaled_x, scaled_y = self._scaler.scale_coordinates(x, y) self._agent_os.set_mouse_position(scaled_x, scaled_y) def type(self, text: str, typing_speed: int = 50) -> None: @@ -124,7 +130,7 @@ def retrieve_active_display(self) -> Display: def set_display(self, display: int = 1) -> None: self._agent_os.set_display(display) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def run_command(self, command: str, timeout_ms: int = 30000) -> None: self._agent_os.run_command(command, timeout_ms) @@ -301,7 +307,7 @@ def get_file(self, path: str) -> Image.Image | str: """ response = self._agent_os.get_file(path) if isinstance(response, Image.Image): - return self._image_scaler(response) + return self._scaler.scale_screenshot(response) return response def remove_virtual_displays(self) -> None: @@ -309,48 +315,4 @@ def remove_virtual_displays(self) -> None: Remove virtual displays from the controller, leaving real displays only. """ self._agent_os.remove_virtual_displays() - self._real_screen_resolution = None - - def _ensure_target_resolution(self) -> tuple[int, int]: - if self._target_resolution is None: - self.screenshot(report=False) - assert self._target_resolution is not None # noqa: S101 - return self._target_resolution - - def _scale_coordinates_back( - self, - x: float, - y: float, - from_agent: bool = True, - check_coordinates_in_bounds: bool = True, - ) -> tuple[int, int]: - if self._real_screen_resolution is None: - self._real_screen_resolution = self._agent_os.retrieve_active_display().size - - target_resolution = self._ensure_target_resolution() - real_size = ( - self._real_screen_resolution.width, - self._real_screen_resolution.height, - ) - - if from_agent: - if self._coordinate_space.maps_to_screenshot_pixels: - mapped_x, mapped_y = self._coordinate_space.map_to_target( - x, y, target_resolution - ) - return scale_coordinates( - (mapped_x, mapped_y), - real_size, - target_resolution, - inverse=True, - check_coordinates_in_bounds=check_coordinates_in_bounds, - ) - return self._coordinate_space.map_to_target(x, y, real_size) - - return scale_coordinates( - (int(x), int(y)), - real_size, - target_resolution, - inverse=False, - check_coordinates_in_bounds=check_coordinates_in_bounds, - ) + self._scaler.real_screen_resolution = None diff --git a/src/askui/tools/coordinate_scaling_mixin.py b/src/askui/tools/coordinate_scaling_mixin.py new file mode 100644 index 00000000..39d12194 --- /dev/null +++ b/src/askui/tools/coordinate_scaling_mixin.py @@ -0,0 +1,99 @@ +"""Coordinate scaling helper used by all agent OS facades.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from askui.utils.image_utils import scale_coordinates + +if TYPE_CHECKING: + from collections.abc import Callable + + from PIL import Image + + from askui.models.shared.coordinate_space import VlmCoordinateSpace + from askui.models.shared.image_scaler import ImageScaler + + +class CoordinateScaler: + """Maps coordinates between model space and device space. + + Each agent OS facade owns an instance and delegates scaling to it. + + Args: + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. + fetch_real_resolution (`Callable`): Callback that returns the real + ``(width, height)`` of the screen/device when it is not yet known. + take_screenshot (`Callable`): Callback that triggers a screenshot + so that ``target_resolution`` can be populated. + """ + + def __init__( + self, + coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, + fetch_real_resolution: Callable[[], tuple[int, int]], + take_screenshot: Callable[[], Image.Image], + ) -> None: + self._coordinate_space = coordinate_space + self._image_scaler = image_scaler + self._fetch_real_resolution = fetch_real_resolution + self._take_screenshot = take_screenshot + self.target_resolution: tuple[int, int] | None = None + self.real_screen_resolution: tuple[int, int] | None = None + + def scale_screenshot(self, screenshot: Image.Image) -> Image.Image: + """Record real resolution, apply scaler, record target resolution.""" + self.real_screen_resolution = screenshot.size + scaled = self._image_scaler(screenshot) + self.target_resolution = scaled.size + return scaled + + def scale_coordinates( + self, + x: float, + y: float, + from_agent: bool = True, + check_coordinates_in_bounds: bool = True, + ) -> tuple[int, int]: + """Map coordinates between model space and device space. + + When ``from_agent=True``, maps model-emitted coordinates to real + device pixels. When ``from_agent=False``, maps device coordinates + to model space (e.g. for reporting element positions back to the model). + """ + if self.real_screen_resolution is None: + self.real_screen_resolution = self._fetch_real_resolution() + + target_resolution = self._ensure_target_resolution() + + if from_agent: + if self._coordinate_space.maps_to_screenshot_pixels: + mapped_x, mapped_y = self._coordinate_space.map_to_target( + x, y, target_resolution + ) + return scale_coordinates( + (mapped_x, mapped_y), + self.real_screen_resolution, + target_resolution, + inverse=True, + check_coordinates_in_bounds=check_coordinates_in_bounds, + ) + return self._coordinate_space.map_to_target( + x, y, self.real_screen_resolution + ) + + return scale_coordinates( + (int(x), int(y)), + self.real_screen_resolution, + target_resolution, + inverse=False, + check_coordinates_in_bounds=check_coordinates_in_bounds, + ) + + def _ensure_target_resolution(self) -> tuple[int, int]: + if self.target_resolution is None: + self._take_screenshot() + assert self.target_resolution is not None # noqa: S101 + return self.target_resolution diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py index a286476c..62a77652 100644 --- a/src/askui/tools/playwright/agent_os_facade.py +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -6,8 +6,8 @@ from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import Display, ModifierKey, PcKey +from askui.tools.coordinate_scaling_mixin import CoordinateScaler from askui.tools.playwright.agent_os import PlaywrightAgentOs -from askui.utils.image_utils import scale_coordinates class PlaywrightAgentOsFacade(PlaywrightAgentOs): @@ -19,9 +19,9 @@ class PlaywrightAgentOsFacade(PlaywrightAgentOs): being forwarded to the underlying agent OS. Args: - agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap. - coordinate_space (VlmCoordinateSpace): Coordinate grid the model uses. - image_scaler (ImageScaler): Callable to preprocess screenshots. + agent_os (`PlaywrightAgentOs`): The real Playwright agent OS to wrap. + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. """ def __init__( @@ -31,73 +31,30 @@ def __init__( image_scaler: ImageScaler, ) -> None: self._agent_os = agent_os - self._image_scaler = image_scaler - self._target_resolution: tuple[int, int] | None = None - self._coordinate_space: VlmCoordinateSpace = coordinate_space - self._real_screen_resolution: tuple[int, int] | None = None + self._scaler = CoordinateScaler( + coordinate_space=coordinate_space, + image_scaler=image_scaler, + fetch_real_resolution=lambda: self._agent_os.screenshot(report=False).size, + take_screenshot=lambda: self.screenshot(report=False), + ) self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] def connect(self) -> None: self._agent_os.connect() - self._real_screen_resolution = self._agent_os.screenshot( - report=False, + self._scaler.real_screen_resolution = self._agent_os.screenshot( + report=False ).size def disconnect(self) -> None: self._agent_os.disconnect() - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def screenshot(self, report: bool = True) -> Image.Image: screenshot = self._agent_os.screenshot(report=report) - self._real_screen_resolution = screenshot.size - scaled = self._image_scaler(screenshot) - self._target_resolution = scaled.size - return scaled - - def _ensure_target_resolution(self) -> tuple[int, int]: - if self._target_resolution is None: - self.screenshot(report=False) - assert self._target_resolution is not None # noqa: S101 - return self._target_resolution - - def _scale_coordinates( - self, - x: float, - y: float, - from_agent: bool = True, - ) -> tuple[int, int]: - if self._real_screen_resolution is None: - self._real_screen_resolution = self._agent_os.screenshot( - report=False, - ).size - - target_resolution = self._ensure_target_resolution() - - if from_agent: - if self._coordinate_space.maps_to_screenshot_pixels: - mapped_x, mapped_y = self._coordinate_space.map_to_target( - x, y, target_resolution - ) - return scale_coordinates( - (mapped_x, mapped_y), - self._real_screen_resolution, - target_resolution, - inverse=True, - ) - return self._coordinate_space.map_to_target( - x, y, self._real_screen_resolution - ) - - return scale_coordinates( - (int(x), int(y)), - self._real_screen_resolution, - target_resolution, - inverse=False, - ) + return self._scaler.scale_screenshot(screenshot) def mouse_move(self, x: float, y: float, duration: int = 500) -> None: - scaled_x, scaled_y = self._scale_coordinates(x, y) - # scaled_x, scaled_y = x, y + scaled_x, scaled_y = self._scaler.scale_coordinates(x, y) self._agent_os.mouse_move(scaled_x, scaled_y, duration) def type(self, text: str, typing_speed: int = 50) -> None: diff --git a/src/askui/utils/llm_image_utils.py b/src/askui/utils/llm_image_utils.py index 688854bc..c74e4f1e 100644 --- a/src/askui/utils/llm_image_utils.py +++ b/src/askui/utils/llm_image_utils.py @@ -141,6 +141,36 @@ def resize_image(image: Image.Image, target_size: tuple[int, int]) -> Image.Imag return image.resize(target_size, Image.Resampling.LANCZOS) +def compute_patch_optimized_image( + image: Image.Image, + max_edge: int = 1568, + max_tokens: int = 1568, + patch_size: int = 28, +) -> Image.Image: + """Resize an image to its patch-optimized size. + + Convenience wrapper that combines `compute_patch_optimized_size` and + `resize_image` into a single call. + + Args: + image (Image.Image): Source image. + max_edge (int): Maximum allowed dimension (width or height). + max_tokens (int): Maximum allowed number of image tokens. + patch_size (int): Patch size used by the model. + + Returns: + Image.Image: Resized image. + """ + target = compute_patch_optimized_size( + image.width, + image.height, + max_edge=max_edge, + max_tokens=max_tokens, + patch_size=patch_size, + ) + return resize_image(image, target) + + def resize_and_pad_image( image: Image.Image, target_size: tuple[int, int], diff --git a/tests/unit/tools/test_agent_os_facade_coordinates.py b/tests/unit/tools/test_agent_os_facade_coordinates.py index 3b9e1b89..bc0b2868 100644 --- a/tests/unit/tools/test_agent_os_facade_coordinates.py +++ b/tests/unit/tools/test_agent_os_facade_coordinates.py @@ -39,10 +39,10 @@ def _make_android_facade( coordinate_space=coordinate_space, image_scaler=_default_scaler, ) - facade._real_screen_resolution = device_size + facade._scaler.real_screen_resolution = device_size # Set target resolution as the scaler would produce it scaled = _default_scaler(Image.new("RGB", device_size)) - facade._target_resolution = scaled.size + facade._scaler.target_resolution = scaled.size return facade @@ -58,29 +58,29 @@ class TestScaledCoordinateSpaceTallDevice: def test_center_tap(self) -> None: facade = _make_android_facade(self.device, self.cs) - x, y = facade._scale_coordinates(500, 500) + x, y = facade._scaler.scale_coordinates(500, 500) assert (x, y) == (540, 1200) def test_left_side_tap(self) -> None: facade = _make_android_facade(self.device, self.cs) - x, y = facade._scale_coordinates(200, 500) + x, y = facade._scaler.scale_coordinates(200, 500) assert (x, y) == (216, 1200) def test_swipe_across(self) -> None: facade = _make_android_facade(self.device, self.cs) - x1, y1 = facade._scale_coordinates(500, 500) - x2, y2 = facade._scale_coordinates(200, 500) + x1, y1 = facade._scaler.scale_coordinates(500, 500) + x2, y2 = facade._scaler.scale_coordinates(200, 500) assert (x1, y1) == (540, 1200) assert (x2, y2) == (216, 1200) def test_origin(self) -> None: facade = _make_android_facade(self.device, self.cs) - x, y = facade._scale_coordinates(0, 0) + x, y = facade._scaler.scale_coordinates(0, 0) assert (x, y) == (0, 0) def test_max_corner(self) -> None: facade = _make_android_facade(self.device, self.cs) - x, y = facade._scale_coordinates(1000, 1000) + x, y = facade._scaler.scale_coordinates(1000, 1000) assert (x, y) == (1080, 2400) @@ -92,12 +92,12 @@ class TestNormalizedCoordinateSpaceTallDevice: def test_center_tap(self) -> None: facade = _make_android_facade(self.device, self.cs) - x, y = facade._scale_coordinates(0.5, 0.5) + x, y = facade._scaler.scale_coordinates(0.5, 0.5) assert (x, y) == (540, 1200) def test_left_side_tap(self) -> None: facade = _make_android_facade(self.device, self.cs) - x, y = facade._scale_coordinates(0.2, 0.5) + x, y = facade._scaler.scale_coordinates(0.2, 0.5) assert (x, y) == (216, 1200) @@ -119,7 +119,7 @@ def test_center_of_content(self) -> None: """The center of the content area in the scaled screenshot.""" facade = _make_android_facade(self.device, self.cs) # Target resolution is (345, 768) — nearly no padding - x, y = facade._scale_coordinates(172, 384) + x, y = facade._scaler.scale_coordinates(172, 384) assert x == pytest.approx(538, abs=5) assert y == pytest.approx(1200, abs=5) @@ -128,7 +128,7 @@ def test_near_top_left_of_content(self) -> None: facade = _make_android_facade(self.device, self.cs) # Use (1, 2) instead of exact origin to avoid rounding-offset # edge case that can produce small negative values. - x, y = facade._scale_coordinates(1, 2) + x, y = facade._scaler.scale_coordinates(1, 2) assert x == pytest.approx(3, abs=5) assert y == pytest.approx(3, abs=5) @@ -141,7 +141,7 @@ class TestSquareDevice: def test_center(self) -> None: facade = _make_android_facade(self.device, self.cs) - x, y = facade._scale_coordinates(500, 500) + x, y = facade._scaler.scale_coordinates(500, 500) assert (x, y) == (512, 384) @@ -152,7 +152,7 @@ def test_device_to_screenshot_scaled_space(self) -> None: facade = _make_android_facade( (1080, 2400), ScaledCoordinateSpace(width=1000, height=1000) ) - x, y = facade._scale_coordinates(540, 1200, from_agent=False) + x, y = facade._scaler.scale_coordinates(540, 1200, from_agent=False) # Target resolution is (345, 768), no padding # Forward scaling: factor = 768/2400 = 0.32 # x = 540 * 0.32 = 172.8 → 172, y = 1200 * 0.32 = 384 From cca155fac4ed501bfda9a5ac3649044de805c67c Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Thu, 11 Jun 2026 15:17:49 +0200 Subject: [PATCH 5/6] chore: fine-tune settings for MAX_IMAGE_EDGE --- src/askui/model_providers/anthropic_vlm_provider.py | 2 +- src/askui/model_providers/askui_vlm_provider.py | 2 +- src/askui/model_providers/ollama_vlm_provider.py | 3 +-- src/askui/model_providers/openai_vlm_provider.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py index f094f22c..80168aa3 100644 --- a/src/askui/model_providers/anthropic_vlm_provider.py +++ b/src/askui/model_providers/anthropic_vlm_provider.py @@ -21,7 +21,7 @@ from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "claude-sonnet-4-6" -_DEFAULT_MAX_IMAGE_EDGE = 1568 +_DEFAULT_MAX_IMAGE_EDGE = 1024 class AnthropicVlmProvider(VlmProvider): diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py index f402ee3c..615a1af8 100644 --- a/src/askui/model_providers/askui_vlm_provider.py +++ b/src/askui/model_providers/askui_vlm_provider.py @@ -21,7 +21,7 @@ from askui.utils.llm_image_utils import compute_patch_optimized_image _DEFAULT_MODEL_ID = "claude-sonnet-4-6" -_DEFAULT_MAX_IMAGE_EDGE = 1568 +_DEFAULT_MAX_IMAGE_EDGE = 1024 class AskUIVlmProvider(VlmProvider): diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py index c53103ed..c313983f 100644 --- a/src/askui/model_providers/ollama_vlm_provider.py +++ b/src/askui/model_providers/ollama_vlm_provider.py @@ -5,7 +5,6 @@ from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider from askui.models.shared.coordinate_space import ( - NormalizedCoordinateSpace, PixelCoordinateSpace, ScaledCoordinateSpace, VlmCoordinateSpace, @@ -17,7 +16,7 @@ _QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) _HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) -_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace() +_KIMI_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) class OllamaVlmProvider(OpenAIVlmProvider): diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py index bf25eb00..e8d6b8f1 100644 --- a/src/askui/model_providers/openai_vlm_provider.py +++ b/src/askui/model_providers/openai_vlm_provider.py @@ -26,7 +26,7 @@ _DEFAULT_MODEL_ID = "gpt-5.4" _DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() -_DEFAULT_MAX_IMAGE_EDGE = 2048 +_DEFAULT_MAX_IMAGE_EDGE = 1024 class OpenAIVlmProvider(VlmProvider): From d6415eb3df04207ac785e0bbf1e49a2df9e07408 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Thu, 11 Jun 2026 15:21:54 +0200 Subject: [PATCH 6/6] fix: outdated cos test for kimi --- tests/unit/model_providers/test_ollama_vlm_provider.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit/model_providers/test_ollama_vlm_provider.py b/tests/unit/model_providers/test_ollama_vlm_provider.py index e3f78ef5..e4fe32d3 100644 --- a/tests/unit/model_providers/test_ollama_vlm_provider.py +++ b/tests/unit/model_providers/test_ollama_vlm_provider.py @@ -7,7 +7,6 @@ from askui.model_providers.ollama_vlm_provider import OllamaVlmProvider from askui.models.shared.agent_message_param import MessageParam from askui.models.shared.coordinate_space import ( - NormalizedCoordinateSpace, PixelCoordinateSpace, ScaledCoordinateSpace, ) @@ -68,11 +67,15 @@ def test_coordinate_space_auto_detects_qwen_case_insensitive(self) -> None: def test_coordinate_space_auto_detects_kimi(self) -> None: provider = OllamaVlmProvider(model_id="kimi-vl") - assert provider.coordinate_space == NormalizedCoordinateSpace() + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) def test_coordinate_space_auto_detects_kimi_case_insensitive(self) -> None: provider = OllamaVlmProvider(model_id="Kimi-VL-A3B") - assert provider.coordinate_space == NormalizedCoordinateSpace() + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) def test_coordinate_space_default_for_non_qwen(self) -> None: provider = OllamaVlmProvider(model_id="llava")