From 8f496affa10613c2fbd3df8b26c7e6b133e764dd Mon Sep 17 00:00:00 2001
From: philipph-askui <philipp.hallgarten@askui.com>
Date: Tue, 9 Jun 2026 16:37:03 +0200
Subject: [PATCH 1/6] feat: add coordinate space abstraction for open weights
 LLM support

---
 src/askui/android_agent.py                    |   5 +-
 src/askui/computer_agent.py                   |   3 +-
 .../model_providers/ollama_vlm_provider.py    |  36 +++++
 .../model_providers/openai_vlm_provider.py    |  26 ++++
 src/askui/model_providers/vlm_provider.py     |  31 +++++
 src/askui/models/shared/__init__.py           |  10 ++
 src/askui/models/shared/coordinate_space.py   | 104 +++++++++++++++
 src/askui/tools/android/agent_os_facade.py    |  31 +++--
 src/askui/tools/computer_agent_os_facade.py   |  30 ++++-
 src/askui/tools/playwright/agent_os_facade.py |  28 +++-
 src/askui/web_agent.py                        |   5 +-
 .../test_ollama_vlm_provider.py               |  64 +++++++++
 .../test_openai_vlm_provider.py               | 125 ++++++++++++++++++
 13 files changed, 474 insertions(+), 24 deletions(-)
 create mode 100644 src/askui/models/shared/coordinate_space.py

diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py
index 98b79143..7b7818f8 100644
--- a/src/askui/android_agent.py
+++ b/src/askui/android_agent.py
@@ -87,7 +87,6 @@ def __init__(
     ) -> None:
         reporter = CompositeReporter(reporters=reporters)
         self.os = PpadbAgentOs(device_identifier=device, reporter=reporter)
-        self.act_agent_os_facade = AndroidAgentOsFacade(self.os)
         super().__init__(
             reporter=reporter,
             retry=retry,
@@ -97,6 +96,10 @@ def __init__(
             callbacks=callbacks,
             truncation_strategy=truncation_strategy,
         )
+        self.act_agent_os_facade = AndroidAgentOsFacade(
+            self.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
+        )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with Android-specific settings
         self.act_settings = ActSettings(
diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py
index ad0a6627..6e53df87 100644
--- a/src/askui/computer_agent.py
+++ b/src/askui/computer_agent.py
@@ -130,7 +130,8 @@ def __init__(
             truncation_strategy=truncation_strategy,
         )
         self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
-            self.tools.os
+            self.tools.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
         )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with computer-specific settings
diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py
index e06fa408..1cca3905 100644
--- a/src/askui/model_providers/ollama_vlm_provider.py
+++ b/src/askui/model_providers/ollama_vlm_provider.py
@@ -1,12 +1,23 @@
 """OllamaVlmProvider — VLM access via a local Ollama instance."""
 
 from openai import OpenAI
+from typing_extensions import override
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
 
 _DEFAULT_BASE_URL = "http://localhost:11434/v1"
 _DEFAULT_MODEL_ID = "qwen3.5"
 
+_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace()
+
 
 class OllamaVlmProvider(OpenAIVlmProvider):
     """VLM provider that routes requests to a local Ollama instance.
@@ -14,6 +25,11 @@ class OllamaVlmProvider(OpenAIVlmProvider):
     Thin convenience wrapper around `OpenAIVlmProvider` with Ollama
     defaults (``base_url``, ``api_key``, ``model_id``).
 
+    Qwen and Holo models are automatically detected and their coordinate
+    space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``.
+    Kimi models use ``NormalizedCoordinateSpace()``.
+    Pass ``coordinate_space`` explicitly to override auto-detection.
+
     Args:
         model_id (str, optional): Ollama model to use. Defaults to
             ``"qwen3.5"``.
@@ -21,6 +37,9 @@ class OllamaVlmProvider(OpenAIVlmProvider):
             API. Defaults to ``"http://localhost:11434/v1"``.
         client (`OpenAI` | None, optional): Pre-configured OpenAI client.
             If provided, ``base_url`` is ignored.
+        coordinate_space (VlmCoordinateSpace | None, optional): Coordinate grid
+            the model emits.  ``None`` (the default) auto-detects it from
+            ``model_id``, falling back to `PixelCoordinateSpace`.
 
     Example:
         ```python
@@ -40,10 +59,27 @@ def __init__(
         model_id: str = _DEFAULT_MODEL_ID,
         base_url: str = _DEFAULT_BASE_URL,
         client: OpenAI | None = None,
+        coordinate_space: VlmCoordinateSpace | None = None,
     ) -> None:
+        self._coordinate_space_override = coordinate_space
         super().__init__(
             model_id=model_id,
             api_key="ollama",  # Ollama requires no auth; OpenAI SDK needs a value
             base_url=base_url,
             client=client,
+            coordinate_space=coordinate_space or PixelCoordinateSpace(),
         )
+
+    @property
+    @override
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        if self._coordinate_space_override is not None:
+            return self._coordinate_space_override
+        model_lower = self._model_id_value.lower()
+        if "qwen" in model_lower:
+            return _QWEN_COORDINATE_SPACE
+        if "holo" in model_lower:
+            return _HOLO_COORDINATE_SPACE
+        if "kimi" in model_lower:
+            return _KIMI_COORDINATE_SPACE
+        return self._coordinate_space
diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py
index 47475cc7..8ac5f6a6 100644
--- a/src/askui/model_providers/openai_vlm_provider.py
+++ b/src/askui/model_providers/openai_vlm_provider.py
@@ -14,11 +14,17 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "gpt-5.4"
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
 
 
 class OpenAIVlmProvider(VlmProvider):
@@ -36,6 +42,9 @@ class OpenAIVlmProvider(VlmProvider):
             to the OpenAI API (``https://api.openai.com/v1``).
         client (`OpenAI` | None, optional): Pre-configured OpenAI client.
             If provided, ``api_key`` and ``base_url`` are ignored.
+        coordinate_space (VlmCoordinateSpace, optional): The coordinate grid
+            the model emits coordinates in.  Defaults to
+            `PixelCoordinateSpace` (native pixel coordinates).
 
     Example:
         ```python
@@ -57,6 +66,7 @@ def __init__(
         api_key: str | None = None,
         base_url: str | None = None,
         client: OpenAI | None = None,
+        coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
         input_cost_per_million_tokens: float | None = None,
         output_cost_per_million_tokens: float | None = None,
         cache_write_cost_per_million_tokens: float | None = None,
@@ -65,6 +75,7 @@ def __init__(
         self._model_id_value = (
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
+        self._coordinate_space = coordinate_space
         if client is not None:
             self._client = client
         else:
@@ -86,6 +97,11 @@ def __init__(
     def model_id(self) -> str:
         return self._model_id_value
 
+    @property
+    @override
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        return self._coordinate_space
+
     @property
     @override
     def pricing(self) -> ModelPricing | None:
@@ -96,6 +112,14 @@ def _messages_api(self) -> OpenAIMessagesApi:
         """Lazily initialise the `OpenAIMessagesApi` on first use."""
         return OpenAIMessagesApi(client=self._client)
 
+    @override
+    def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
+        """Append coordinate and resolution info to the system prompt."""
+        coord_info = self.coordinate_space.build_prompt_section(
+            screenshot_resolution=SCREENSHOT_RESOLUTION,
+        )
+        return SystemPrompt(prompt=f"{str(system)}\n\n{coord_info}")
+
     @override
     def create_message(
         self,
@@ -108,6 +132,8 @@ def create_message(
         temperature: float | None = None,
         provider_options: dict[str, Any] | None = None,
     ) -> MessageParam:
+        if system is not None:
+            system = self.augment_system_prompt(system)
         return self._messages_api.create_message(
             messages=messages,
             model_id=self._model_id_value,
diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py
index 1e98b972..6d4d9738 100644
--- a/src/askui/model_providers/vlm_provider.py
+++ b/src/askui/model_providers/vlm_provider.py
@@ -8,10 +8,16 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.coordinate_space import (
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
 from askui.utils.model_pricing import ModelPricing
 
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
+
 
 class VlmProvider(ABC):
     """Interface for Vision Language Model providers.
@@ -44,6 +50,17 @@ class VlmProvider(ABC):
     def model_id(self) -> str:
         """The model identifier used by this provider."""
 
+    @property
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        """The coordinate space this model emits coordinates in.
+
+        Returns a `VlmCoordinateSpace` describing the grid the model uses.
+        The default is `PixelCoordinateSpace` (native pixel coordinates).
+        Override in subclasses when the model uses a different grid
+        (e.g. ``ScaledCoordinateSpace(width=1000, height=1000)`` for Qwen).
+        """
+        return _DEFAULT_COORDINATE_SPACE
+
     @property
     def pricing(self) -> ModelPricing | None:
         """Pricing information for this provider's model.
@@ -53,6 +70,20 @@ def pricing(self) -> ModelPricing | None:
         """
         return None
 
+    def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
+        """Hook for providers to augment the system prompt before sending.
+
+        Called by ``create_message()`` implementations.  The base
+        implementation returns the prompt unchanged.  Override in
+        subclasses that need to inject provider-specific information
+        (e.g. coordinate bounds for non-Anthropic models).
+
+        The original ``SystemPrompt`` object is **not** mutated —
+        implementations should create a new ``SystemPrompt`` wrapping
+        the augmented text.
+        """
+        return system
+
     @abstractmethod
     def create_message(
         self,
diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py
index 4df27a7b..635fc053 100644
--- a/src/askui/models/shared/__init__.py
+++ b/src/askui/models/shared/__init__.py
@@ -1,5 +1,11 @@
 from .android_base_tool import AndroidBaseTool
 from .computer_base_tool import ComputerBaseTool
+from .coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from .tool_tags import ToolTags
 
 try:
@@ -13,6 +19,10 @@
 __all__ = [
     "AndroidBaseTool",
     "ComputerBaseTool",
+    "NormalizedCoordinateSpace",
+    "PixelCoordinateSpace",
+    "ScaledCoordinateSpace",
+    "VlmCoordinateSpace",
     "ToolTags",
 ]
 
diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py
new file mode 100644
index 00000000..69696cdd
--- /dev/null
+++ b/src/askui/models/shared/coordinate_space.py
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel, Field
+
+# The resolution screenshots are scaled to before being sent to the model.
+# Used by all agent OS facades (computer, Android, Playwright).
+SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768)
+
+
+def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]:
+    """Return the prompt bullets shared by every coordinate space."""
+    sw, sh = screenshot_resolution
+    return [
+        f"* Screenshot resolution: {sw}x{sh} pixels",
+        "* Screenshots may contain black padding bars to preserve the "
+        "original aspect ratio. UI elements are NOT located in the "
+        "padding area.",
+        "* Coordinate origin is the top-left corner (0, 0)",
+    ]
+
+
+def _normalized_to_pixel(value: float, size: int) -> int:
+    """Scale a normalised coordinate onto a pixel axis of length *size*.
+
+    The normalised prompt advertises the closed range ``0.0 <= v <= 1.0``,
+    but ``int(1.0 * size)`` is one past the last pixel index. The exact
+    endpoint is therefore mapped to ``size - 1``. All other values are
+    truncated unchanged so out-of-range input is not silently masked.
+    """
+    if value == 1.0:
+        return size - 1
+    return int(value * size)
+
+
+class VlmCoordinateSpace(BaseModel, ABC):
+    """Abstract base for VLM coordinate conventions.
+
+    Each subclass describes one coordinate grid a VLM may emit and knows
+    how to map those coordinates back to pixel space and how to render
+    the matching prompt section.
+    """
+
+    @abstractmethod
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        """Map model coordinates to pixel coordinates in *target_resolution*.
+
+        Args:
+            x (float): Horizontal coordinate as emitted by the model.
+            y (float): Vertical coordinate as emitted by the model.
+            target_resolution (tuple[int, int]): ``(width, height)`` of the
+                pixel grid to map into.
+
+        Returns:
+            tuple[int, int]: The ``(x, y)`` position in pixels.
+        """
+
+    @abstractmethod
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """Build prompt text describing coordinate bounds for the model.
+
+        Args:
+            screenshot_resolution (tuple[int, int]): ``(width, height)`` of
+                the screenshots sent to the model.
+
+        Returns:
+            str: Newline-separated bullet lines for the system prompt.
+        """
+
+
+class PixelCoordinateSpace(VlmCoordinateSpace):
+    """Identity mapping -- coordinates already in pixel space.
+
+    Used by Anthropic/Claude which emit coordinates matching the
+    screenshot resolution.
+    """
+
+    def map_to_target(
+        self,
+        x: float,
+        y: float,
+        target_resolution: tuple[int, int],  # noqa: ARG002
+    ) -> tuple[int, int]:
+        # Already pixels: only truncate to int, no scaling.
+        return int(x), int(y)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        sw, sh = screenshot_resolution
+        lines = _common_prompt_lines(screenshot_resolution)
+        lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}")
+        return "\n".join(lines)
+
+
+class ScaledCoordinateSpace(VlmCoordinateSpace):
+    """Integer grid (e.g. 1000x1000 for Qwen). Linear scaling."""
+
+    width: int = Field(gt=0, description="Width of the coordinate grid")
+    height: int = Field(gt=0, description="Height of the coordinate grid")
+
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        tw, th = target_resolution
+        return int(x * tw / self.width), int(y * th / self.height)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        lines = _common_prompt_lines(screenshot_resolution)
+        if (self.width, self.height) != screenshot_resolution:
+            lines.append(
+                f"* Emit coordinates in a {self.width}x{self.height} "
+                f"normalised grid: 0 <= x < {self.width}, "
+                f"0 <= y < {self.height}"
+            )
+        else:
+            # Grid equals the screenshot size, so plain pixel bounds apply.
+            sw, sh = screenshot_resolution
+            lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}")
+        return "\n".join(lines)
+
+
+class NormalizedCoordinateSpace(VlmCoordinateSpace):
+    """0.0-1.0 float grid (Kimi). No fields."""
+
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        tw, th = target_resolution
+        return _normalized_to_pixel(x, tw), _normalized_to_pixel(y, th)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        lines = _common_prompt_lines(screenshot_resolution)
+        lines.append(
+            "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0"
+        )
+        return "\n".join(lines)
diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py
index f27d0eee..f0374036 100644
--- a/src/askui/tools/android/agent_os_facade.py
+++ b/src/askui/tools/android/agent_os_facade.py
@@ -2,6 +2,10 @@
 
 from PIL import Image
 
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
 from askui.tools.android.uiautomator_hierarchy import UIElementCollection
@@ -15,9 +19,14 @@ class AndroidAgentOsFacade(AndroidAgentOs):
     and back to the real screen resolution.
     """
 
-    def __init__(self, agent_os: AndroidAgentOs) -> None:
+    def __init__(
+        self,
+        agent_os: AndroidAgentOs,
+        coordinate_space: VlmCoordinateSpace,
+    ) -> None:
         self._agent_os: AndroidAgentOs = agent_os
-        self._target_resolution: Tuple[int, int] = (1024, 768)
+        self._target_resolution: Tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: Optional[Tuple[int, int]] = None
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
 
@@ -39,33 +48,39 @@ def screenshot(self) -> Image.Image:
 
     def _scale_coordinates(
         self,
-        x: int,
-        y: int,
+        x: float,
+        y: float,
         from_agent: bool = True,
     ) -> Tuple[int, int]:
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.screenshot().size
 
+        mapped_x, mapped_y = (
+            self._coordinate_space.map_to_target(x, y, self._target_resolution)
+            if from_agent
+            else (int(x), int(y))
+        )
+
         return scale_coordinates(
-            (x, y),
+            (mapped_x, mapped_y),
             self._real_screen_resolution,
             self._target_resolution,
             inverse=from_agent,
         )
 
-    def tap(self, x: int, y: int) -> None:
+    def tap(self, x: float, y: float) -> None:
         x, y = self._scale_coordinates(x, y)
         self._agent_os.tap(x, y)
 
     def swipe(
-        self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
+        self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000
     ) -> None:
         x1, y1 = self._scale_coordinates(x1, y1)
         x2, y2 = self._scale_coordinates(x2, y2)
         self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms)
 
     def drag_and_drop(
-        self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
+        self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000
     ) -> None:
         x1, y1 = self._scale_coordinates(x1, y1)
         x2, y2 = self._scale_coordinates(x2, y2)
diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py
index 28a1a8c5..c91a2c84 100644
--- a/src/askui/tools/computer_agent_os_facade.py
+++ b/src/askui/tools/computer_agent_os_facade.py
@@ -2,6 +2,10 @@
 
 from PIL import Image
 
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.agent_os import (
     AgentOs,
@@ -36,9 +40,14 @@ class ComputerAgentOsFacade(AgentOs):
     and back to the real screen resolution.
     """
 
-    def __init__(self, agent_os: AgentOs) -> None:
+    def __init__(
+        self,
+        agent_os: AgentOs,
+        coordinate_space: VlmCoordinateSpace,
+    ) -> None:
         self._agent_os = agent_os
-        self._target_resolution: tuple[int, int] = (1024, 768)
+        self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: DisplaySize | None = None
         self.tags.append(ToolTags.SCALED_AGENT_OS.value)
 
@@ -57,7 +66,7 @@ def screenshot(self, report: bool = True) -> Image.Image:
         )
         return scale_image_to_fit(screenshot, self._target_resolution)
 
-    def mouse_move(self, x: int, y: int, duration: int = 500) -> None:
+    def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
         scaled_x, scaled_y = self._scale_coordinates_back(x, y)
         self._agent_os.mouse_move(scaled_x, scaled_y, duration)
 
@@ -68,7 +77,7 @@ def get_mouse_position(self) -> Coordinate:
         )
         return Coordinate(x=scaled_x, y=scaled_y)
 
-    def set_mouse_position(self, x: int, y: int) -> None:
+    def set_mouse_position(self, x: float, y: float) -> None:
         scaled_x, scaled_y = self._scale_coordinates_back(x, y)
         self._agent_os.set_mouse_position(scaled_x, scaled_y)
 
@@ -302,15 +311,22 @@ def remove_virtual_displays(self) -> None:
 
     def _scale_coordinates_back(
         self,
-        x: int,
-        y: int,
+        x: float,
+        y: float,
         from_agent: bool = True,
         check_coordinates_in_bounds: bool = True,
     ) -> tuple[int, int]:
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.retrieve_active_display().size
+
+        mapped_x, mapped_y = (
+            self._coordinate_space.map_to_target(x, y, self._target_resolution)
+            if from_agent
+            else (int(x), int(y))
+        )
+
         return scale_coordinates(
-            (x, y),
+            (mapped_x, mapped_y),
             (self._real_screen_resolution.width, self._real_screen_resolution.height),
             self._target_resolution,
             inverse=from_agent,
diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py
index 091ff804..5d6f7c42 100644
--- a/src/askui/tools/playwright/agent_os_facade.py
+++ b/src/askui/tools/playwright/agent_os_facade.py
@@ -2,6 +2,10 @@
 
 from PIL import Image
 
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.agent_os import Display, ModifierKey, PcKey
 from askui.tools.playwright.agent_os import PlaywrightAgentOs
@@ -20,9 +24,14 @@ class PlaywrightAgentOsFacade(PlaywrightAgentOs):
         agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap.
     """
 
-    def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+    def __init__(
+        self,
+        agent_os: PlaywrightAgentOs,
+        coordinate_space: VlmCoordinateSpace,
+    ) -> None:
         self._agent_os = agent_os
-        self._target_resolution: tuple[int, int] = (1024, 768)
+        self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: tuple[int, int] | None = None
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
 
@@ -43,22 +52,29 @@ def screenshot(self, report: bool = True) -> Image.Image:
 
     def _scale_coordinates(
         self,
-        x: int,
-        y: int,
+        x: float,
+        y: float,
         from_agent: bool = True,
     ) -> tuple[int, int]:
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.screenshot(
                 report=False,
             ).size
+
+        mapped_x, mapped_y = (
+            self._coordinate_space.map_to_target(x, y, self._target_resolution)
+            if from_agent
+            else (int(x), int(y))
+        )
+
         return scale_coordinates(
-            (x, y),
+            (mapped_x, mapped_y),
             self._real_screen_resolution,
             self._target_resolution,
             inverse=from_agent,
         )
 
-    def mouse_move(self, x: int, y: int, duration: int = 500) -> None:
+    def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
         scaled_x, scaled_y = self._scale_coordinates(x, y)
         # scaled_x, scaled_y = x, y
         self._agent_os.mouse_move(scaled_x, scaled_y, duration)
diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py
index fe47c5f9..bc211ec9 100644
--- a/src/askui/web_agent.py
+++ b/src/askui/web_agent.py
@@ -60,7 +60,6 @@ def __init__(
     ) -> None:
         reporter = CompositeReporter(reporters=reporters)
         self.os = PlaywrightAgentOs(reporter)
-        self.act_agent_os_facade = PlaywrightAgentOsFacade(self.os)
         super().__init__(
             reporter=reporter,
             retry=retry,
@@ -70,6 +69,10 @@ def __init__(
             callbacks=callbacks,
             truncation_strategy=truncation_strategy,
         )
+        self.act_agent_os_facade = PlaywrightAgentOsFacade(
+            self.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
+        )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         self.act_settings = ActSettings(
             messages=MessageSettings(
diff --git a/tests/unit/model_providers/test_ollama_vlm_provider.py b/tests/unit/model_providers/test_ollama_vlm_provider.py
index 143e7c35..e3f78ef5 100644
--- a/tests/unit/model_providers/test_ollama_vlm_provider.py
+++ b/tests/unit/model_providers/test_ollama_vlm_provider.py
@@ -6,6 +6,11 @@
 
 from askui.model_providers.ollama_vlm_provider import OllamaVlmProvider
 from askui.models.shared.agent_message_param import MessageParam
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+)
 
 
 class TestOllamaVlmProvider:
@@ -48,3 +53,62 @@ def test_create_message_delegates_to_messages_api(self) -> None:
 
         mock_client.chat.completions.create.assert_called_once()
         assert result.role == "assistant"
+
+    def test_coordinate_space_auto_detects_qwen(self) -> None:
+        provider = OllamaVlmProvider(model_id="qwen3.5")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_coordinate_space_auto_detects_qwen_case_insensitive(self) -> None:
+        provider = OllamaVlmProvider(model_id="Qwen2-VL")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_coordinate_space_auto_detects_kimi(self) -> None:
+        provider = OllamaVlmProvider(model_id="kimi-vl")
+        assert provider.coordinate_space == NormalizedCoordinateSpace()
+
+    def test_coordinate_space_auto_detects_kimi_case_insensitive(self) -> None:
+        provider = OllamaVlmProvider(model_id="Kimi-VL-A3B")
+        assert provider.coordinate_space == NormalizedCoordinateSpace()
+
+    def test_coordinate_space_default_for_non_qwen(self) -> None:
+        provider = OllamaVlmProvider(model_id="llava")
+        assert provider.coordinate_space == PixelCoordinateSpace()
+
+    def test_coordinate_space_explicit_override(self) -> None:
+        provider = OllamaVlmProvider(
+            model_id="llava",
+            coordinate_space=ScaledCoordinateSpace(width=500, height=500),
+        )
+        assert provider.coordinate_space == ScaledCoordinateSpace(width=500, height=500)
+
+    def test_coordinate_space_explicit_override_takes_precedence(self) -> None:
+        provider = OllamaVlmProvider(
+            model_id="qwen3.5",
+            coordinate_space=ScaledCoordinateSpace(width=2000, height=2000),
+        )
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=2000, height=2000
+        )
+
+    def test_coordinate_space_explicit_pixel_overrides_qwen_auto_detect(self) -> None:
+        provider = OllamaVlmProvider(
+            model_id="qwen3.5",
+            coordinate_space=PixelCoordinateSpace(),
+        )
+        assert provider.coordinate_space == PixelCoordinateSpace()
+
+    def test_coordinate_space_auto_detects_holo(self) -> None:
+        provider = OllamaVlmProvider(model_id="holo3.1-35b-a3b")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_coordinate_space_auto_detects_holo_case_insensitive(self) -> None:
+        provider = OllamaVlmProvider(model_id="Holo-3.1-4B")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py
index d51ff74b..7a5a2a87 100644
--- a/tests/unit/model_providers/test_openai_vlm_provider.py
+++ b/tests/unit/model_providers/test_openai_vlm_provider.py
@@ -6,6 +6,12 @@
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
 from askui.models.shared.agent_message_param import MessageParam
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+)
+from askui.models.shared.prompts import SystemPrompt
 
 
 class TestOpenAIVlmProvider:
@@ -41,3 +47,122 @@ def test_create_message_delegates_to_messages_api(self) -> None:
 
         mock_client.chat.completions.create.assert_called_once()
         assert result.role == "assistant"
+
+    def test_coordinate_space_defaults_to_pixel(self) -> None:
+        provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test")
+        assert provider.coordinate_space == PixelCoordinateSpace()
+
+    def test_coordinate_space_passthrough(self) -> None:
+        provider = OpenAIVlmProvider(
+            model_id="gpt-4o",
+            api_key="sk-test",
+            coordinate_space=ScaledCoordinateSpace(width=1000, height=1000),
+        )
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_augment_system_prompt_scaled_coordinate_space(self) -> None:
+        provider = OpenAIVlmProvider(
+            model_id="gpt-4o",
+            api_key="sk-test",
+            coordinate_space=ScaledCoordinateSpace(width=1000, height=1000),
+        )
+        system = SystemPrompt(prompt="You are a helpful assistant.")
+        augmented = provider.augment_system_prompt(system)
+
+        rendered = str(augmented)
+        assert "You are a helpful assistant." in rendered
+        assert "1000x1000 normalised grid" in rendered
+        assert "1024x768" in rendered
+
+    def test_augment_system_prompt_pixel_bounds_when_matching(self) -> None:
+        provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test")
+        system = SystemPrompt(prompt="Base prompt.")
+        augmented = provider.augment_system_prompt(system)
+
+        rendered = str(augmented)
+        assert "normalised grid" not in rendered
+        assert "0 <= x < 1024" in rendered
+
+
+class TestPixelCoordinateSpacePrompt:
+    def test_shows_pixel_bounds(self) -> None:
+        cs = PixelCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "0 <= x < 1024" in result
+        assert "0 <= y < 768" in result
+        assert "normalised grid" not in result
+
+    def test_includes_padding_and_origin_info(self) -> None:
+        cs = PixelCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "black padding" in result
+        assert "top-left" in result
+
+
+class TestScaledCoordinateSpacePrompt:
+    def test_shows_normalised_grid(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        result = cs.build_prompt_section((1024, 768))
+        assert "1024x768" in result
+        assert "1000x1000 normalised grid" in result
+        assert "0 <= x < 1000" in result
+        assert "0 <= y < 1000" in result
+
+    def test_matching_resolution_shows_pixel_bounds(self) -> None:
+        cs = ScaledCoordinateSpace(width=1024, height=768)
+        result = cs.build_prompt_section((1024, 768))
+        assert "0 <= x < 1024" in result
+        assert "normalised grid" not in result
+
+    def test_includes_padding_and_origin_info(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        result = cs.build_prompt_section((1024, 768))
+        assert "black padding" in result
+        assert "top-left" in result
+
+
+class TestNormalizedCoordinateSpacePrompt:
+    def test_shows_normalised_floats(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "0.0 <= x <= 1.0" in result
+        assert "0.0 <= y <= 1.0" in result
+        assert "normalised floats" in result
+
+    def test_includes_padding_and_origin_info(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "black padding" in result
+        assert "top-left" in result
+
+
+class TestMapToTarget:
+    def test_pixel_identity(self) -> None:
+        cs = PixelCoordinateSpace()
+        assert cs.map_to_target(512, 384, (1024, 768)) == (512, 384)
+
+    def test_pixel_truncates_floats(self) -> None:
+        cs = PixelCoordinateSpace()
+        assert cs.map_to_target(512.7, 384.3, (1024, 768)) == (512, 384)
+
+    def test_scaled_maps_correctly(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        assert cs.map_to_target(500, 500, (1024, 768)) == (512, 384)
+
+    def test_scaled_zero(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        assert cs.map_to_target(0, 0, (1024, 768)) == (0, 0)
+
+    def test_normalized_maps_correctly(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        assert cs.map_to_target(0.5, 0.5, (1024, 768)) == (512, 384)
+
+    def test_normalized_zero(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        assert cs.map_to_target(0.0, 0.0, (1024, 768)) == (0, 0)
+
+    def test_normalized_one(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        assert cs.map_to_target(1.0, 1.0, (1024, 768)) == (1024, 768)

From 31865a802dccb57821d186e93dbf1754f13a727e Mon Sep 17 00:00:00 2001
From: philipph-askui <philipp.hallgarten@askui.com>
Date: Wed, 10 Jun 2026 15:30:08 +0200
Subject: [PATCH 2/6] fix: map non-pixel coordinate spaces directly to device
 resolution

---
 src/askui/models/shared/coordinate_space.py   |  14 ++
 src/askui/tools/android/agent_os_facade.py    |  23 ++-
 src/askui/tools/computer_agent_os_facade.py   |  27 +++-
 src/askui/tools/playwright/agent_os_facade.py |  23 ++-
 .../test_openai_vlm_provider.py               |  14 ++
 .../tools/test_agent_os_facade_coordinates.py | 141 ++++++++++++++++++
 6 files changed, 221 insertions(+), 21 deletions(-)
 create mode 100644 tests/unit/tools/test_agent_os_facade_coordinates.py

diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py
index 69696cdd..3de94ac8 100644
--- a/src/askui/models/shared/coordinate_space.py
+++ b/src/askui/models/shared/coordinate_space.py
@@ -28,6 +28,16 @@ class VlmCoordinateSpace(BaseModel, ABC):
     the matching prompt section.
     """
 
+    @property
+    def maps_to_screenshot_pixels(self) -> bool:
+        """Whether model coordinates are absolute pixels in the screenshot image.
+
+        When ``True``, coordinates need padding-aware inverse scaling
+        (screenshot space to device space).  When ``False``, coordinates
+        are in a normalised grid and map directly to device resolution.
+        """
+        return False
+
     @abstractmethod
     def map_to_target(
         self, x: float, y: float, target_resolution: tuple[int, int]
@@ -46,6 +56,10 @@ class PixelCoordinateSpace(VlmCoordinateSpace):
     screenshot resolution.
     """
 
+    @property
+    def maps_to_screenshot_pixels(self) -> bool:
+        return True
+
     def map_to_target(
         self,
         x: float,
diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py
index f0374036..02aa9c7c 100644
--- a/src/askui/tools/android/agent_os_facade.py
+++ b/src/askui/tools/android/agent_os_facade.py
@@ -55,17 +55,26 @@ def _scale_coordinates(
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.screenshot().size
 
-        mapped_x, mapped_y = (
-            self._coordinate_space.map_to_target(x, y, self._target_resolution)
-            if from_agent
-            else (int(x), int(y))
-        )
+        if from_agent:
+            if self._coordinate_space.maps_to_screenshot_pixels:
+                mapped_x, mapped_y = self._coordinate_space.map_to_target(
+                    x, y, self._target_resolution
+                )
+                return scale_coordinates(
+                    (mapped_x, mapped_y),
+                    self._real_screen_resolution,
+                    self._target_resolution,
+                    inverse=True,
+                )
+            return self._coordinate_space.map_to_target(
+                x, y, self._real_screen_resolution
+            )
 
         return scale_coordinates(
-            (mapped_x, mapped_y),
+            (int(x), int(y)),
             self._real_screen_resolution,
             self._target_resolution,
-            inverse=from_agent,
+            inverse=False,
         )
 
     def tap(self, x: float, y: float) -> None:
diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py
index c91a2c84..6f7cc75b 100644
--- a/src/askui/tools/computer_agent_os_facade.py
+++ b/src/askui/tools/computer_agent_os_facade.py
@@ -319,16 +319,29 @@ def _scale_coordinates_back(
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.retrieve_active_display().size
 
-        mapped_x, mapped_y = (
-            self._coordinate_space.map_to_target(x, y, self._target_resolution)
-            if from_agent
-            else (int(x), int(y))
+        real_size = (
+            self._real_screen_resolution.width,
+            self._real_screen_resolution.height,
         )
 
+        if from_agent:
+            if self._coordinate_space.maps_to_screenshot_pixels:
+                mapped_x, mapped_y = self._coordinate_space.map_to_target(
+                    x, y, self._target_resolution
+                )
+                return scale_coordinates(
+                    (mapped_x, mapped_y),
+                    real_size,
+                    self._target_resolution,
+                    inverse=True,
+                    check_coordinates_in_bounds=check_coordinates_in_bounds,
+                )
+            return self._coordinate_space.map_to_target(x, y, real_size)
+
         return scale_coordinates(
-            (mapped_x, mapped_y),
-            (self._real_screen_resolution.width, self._real_screen_resolution.height),
+            (int(x), int(y)),
+            real_size,
             self._target_resolution,
-            inverse=from_agent,
+            inverse=False,
             check_coordinates_in_bounds=check_coordinates_in_bounds,
         )
diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py
index 5d6f7c42..3e4f8500 100644
--- a/src/askui/tools/playwright/agent_os_facade.py
+++ b/src/askui/tools/playwright/agent_os_facade.py
@@ -61,17 +61,26 @@ def _scale_coordinates(
                 report=False,
             ).size
 
-        mapped_x, mapped_y = (
-            self._coordinate_space.map_to_target(x, y, self._target_resolution)
-            if from_agent
-            else (int(x), int(y))
-        )
+        if from_agent:
+            if self._coordinate_space.maps_to_screenshot_pixels:
+                mapped_x, mapped_y = self._coordinate_space.map_to_target(
+                    x, y, self._target_resolution
+                )
+                return scale_coordinates(
+                    (mapped_x, mapped_y),
+                    self._real_screen_resolution,
+                    self._target_resolution,
+                    inverse=True,
+                )
+            return self._coordinate_space.map_to_target(
+                x, y, self._real_screen_resolution
+            )
 
         return scale_coordinates(
-            (mapped_x, mapped_y),
+            (int(x), int(y)),
             self._real_screen_resolution,
             self._target_resolution,
-            inverse=from_agent,
+            inverse=False,
         )
 
     def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py
index 7a5a2a87..1d33f1d5 100644
--- a/tests/unit/model_providers/test_openai_vlm_provider.py
+++ b/tests/unit/model_providers/test_openai_vlm_provider.py
@@ -138,6 +138,20 @@ def test_includes_padding_and_origin_info(self) -> None:
         assert "top-left" in result
 
 
+class TestMapsToScreenshotPixels:
+    def test_pixel_returns_true(self) -> None:
+        assert PixelCoordinateSpace().maps_to_screenshot_pixels is True
+
+    def test_scaled_returns_false(self) -> None:
+        assert (
+            ScaledCoordinateSpace(width=1000, height=1000).maps_to_screenshot_pixels
+            is False
+        )
+
+    def test_normalized_returns_false(self) -> None:
+        assert NormalizedCoordinateSpace().maps_to_screenshot_pixels is False
+
+
 class TestMapToTarget:
     def test_pixel_identity(self) -> None:
         cs = PixelCoordinateSpace()
diff --git a/tests/unit/tools/test_agent_os_facade_coordinates.py b/tests/unit/tools/test_agent_os_facade_coordinates.py
new file mode 100644
index 00000000..e81e8214
--- /dev/null
+++ b/tests/unit/tools/test_agent_os_facade_coordinates.py
@@ -0,0 +1,141 @@
+"""Tests for coordinate mapping in agent OS facades.
+
+Verifies that non-pixel coordinate spaces (Qwen 0-1000, Kimi 0.0-1.0)
+map directly to device resolution, bypassing the padded screenshot space.
+"""
+
+from unittest.mock import MagicMock
+
+import pytest
+from PIL import Image
+
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+)
+from askui.tools.android.agent_os_facade import AndroidAgentOsFacade
+
+
+def _make_android_facade(
+    device_size: tuple[int, int],
+    coordinate_space: PixelCoordinateSpace
+    | ScaledCoordinateSpace
+    | NormalizedCoordinateSpace,
+) -> AndroidAgentOsFacade:
+    """Create an AndroidAgentOsFacade with a mocked agent OS."""
+    mock_os = MagicMock()
+    mock_os.tags = []
+    mock_os.screenshot.return_value = Image.new("RGB", device_size)
+    facade = AndroidAgentOsFacade(mock_os, coordinate_space=coordinate_space)
+    facade._real_screen_resolution = device_size
+    return facade
+
+
+class TestScaledCoordinateSpaceTallDevice:
+    """Qwen 0-1000 grid on a tall Android device (1080x2400).
+
+    The screenshot is scaled to 345x768 with 339px horizontal padding,
+    so the old code would produce negative x when x_model < ~331.
+    """
+
+    device = (1080, 2400)
+    cs = ScaledCoordinateSpace(width=1000, height=1000)
+
+    def test_center_tap(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x, y = facade._scale_coordinates(500, 500)
+        assert (x, y) == (540, 1200)
+
+    def test_left_side_tap(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x, y = facade._scale_coordinates(200, 500)
+        assert (x, y) == (216, 1200)
+
+    def test_swipe_across(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x1, y1 = facade._scale_coordinates(500, 500)
+        x2, y2 = facade._scale_coordinates(200, 500)
+        assert (x1, y1) == (540, 1200)
+        assert (x2, y2) == (216, 1200)
+
+    def test_origin(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x, y = facade._scale_coordinates(0, 0)
+        assert (x, y) == (0, 0)
+
+    def test_max_corner(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x, y = facade._scale_coordinates(1000, 1000)
+        assert (x, y) == (1080, 2400)
+
+
+class TestNormalizedCoordinateSpaceTallDevice:
+    """Kimi 0.0-1.0 grid on a tall Android device (1080x2400)."""
+
+    device = (1080, 2400)
+    cs = NormalizedCoordinateSpace()
+
+    def test_center_tap(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x, y = facade._scale_coordinates(0.5, 0.5)
+        assert (x, y) == (540, 1200)
+
+    def test_left_side_tap(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x, y = facade._scale_coordinates(0.2, 0.5)
+        assert (x, y) == (216, 1200)
+
+
+class TestPixelCoordinateSpaceTallDevice:
+    """Claude pixel coordinates on a tall Android device (1080x2400).
+
+    Pixel coordinates are in the padded 1024x768 screenshot space
+    and must go through the padding-aware inverse scaling pipeline.
+    """
+
+    device = (1080, 2400)
+    cs = PixelCoordinateSpace()
+
+    def test_center_of_content(self) -> None:
+        """The center of the content area in the padded screenshot."""
+        facade = _make_android_facade(self.device, self.cs)
+        # Content area: x=[339..684], y=[0..768] in 1024x768 screenshot
+        # Center of content: x=511, y=384
+        x, y = facade._scale_coordinates(511, 384)
+        # (511 - 339) / 0.32 = 537.5 → 537, (384 - 0) / 0.32 = 1200
+        assert x == pytest.approx(537, abs=2)
+        assert y == 1200
+
+    def test_top_left_of_content(self) -> None:
+        """Top-left corner of the content area."""
+        facade = _make_android_facade(self.device, self.cs)
+        # Content starts at x=339 in the padded screenshot
+        x, y = facade._scale_coordinates(339, 0)
+        assert x == pytest.approx(0, abs=2)
+        assert y == 0
+
+
+class TestSquareDevice:
+    """Verify no regression on a device with matching aspect ratio."""
+
+    device = (1024, 768)
+    cs = ScaledCoordinateSpace(width=1000, height=1000)
+
+    def test_center(self) -> None:
+        facade = _make_android_facade(self.device, self.cs)
+        x, y = facade._scale_coordinates(500, 500)
+        assert (x, y) == (512, 384)
+
+
+class TestFromAgentFalse:
+    """from_agent=False always maps device → screenshot pixel space."""
+
+    def test_device_to_screenshot_scaled_space(self) -> None:
+        facade = _make_android_facade(
+            (1080, 2400), ScaledCoordinateSpace(width=1000, height=1000)
+        )
+        x, y = facade._scale_coordinates(540, 1200, from_agent=False)
+        # Forward scaling: (540 * 0.32 + 339, 1200 * 0.32 + 0) ≈ (512, 384)
+        assert x == pytest.approx(512, abs=2)
+        assert y == pytest.approx(384, abs=2)

From 395d00d0775d262dacb4e91d01c4586fd444439a Mon Sep 17 00:00:00 2001
From: philipph-askui <philipp.hallgarten@askui.com>
Date: Thu, 11 Jun 2026 14:25:22 +0200
Subject: [PATCH 3/6] feat: provider-owned image scaling with configurable
 max_image_edge

Replace fixed SCREENSHOT_RESOLUTION constant with per-provider image
scalers. Each VlmProvider now owns an ImageScaler callable and exposes
max_image_edge (also via ASKUI_VLM_MAX_IMAGE_EDGE env var). Facades
derive target_resolution dynamically from scaler output.
---
 src/askui/android_agent.py                    |   1 +
 src/askui/computer_agent.py                   |   1 +
 .../model_providers/anthropic_vlm_provider.py |  36 ++++
 .../model_providers/askui_vlm_provider.py     |  37 ++++
 .../model_providers/ollama_vlm_provider.py    |  11 ++
 .../openai_compatible_vlm_provider.py         |  11 ++
 .../model_providers/openai_vlm_provider.py    |  39 ++++-
 src/askui/model_providers/vlm_provider.py     |  17 ++
 src/askui/models/anthropic/get_model.py       |  11 +-
 .../locate_models/anthropic_locate_model.py   |  19 ++-
 src/askui/models/shared/__init__.py           |   2 +
 src/askui/models/shared/coordinate_space.py   |  48 ++----
 src/askui/models/shared/image_scaler.py       |   8 +
 src/askui/tools/android/agent_os_facade.py    |  33 ++--
 src/askui/tools/computer_agent_os_facade.py   |  31 ++--
 src/askui/tools/playwright/agent_os_facade.py |  36 ++--
 .../tools/store/universal/load_image_tool.py  |  10 +-
 src/askui/utils/llm_image_utils.py            | 161 ++++++++++++++++++
 src/askui/web_agent.py                        |   1 +
 .../test_openai_vlm_provider.py               |  61 ++++---
 .../tools/test_agent_os_facade_coordinates.py |  61 ++++---
 tests/unit/utils/test_llm_image_utils.py      | 145 ++++++++++++++++
 22 files changed, 649 insertions(+), 131 deletions(-)
 create mode 100644 src/askui/models/shared/image_scaler.py
 create mode 100644 src/askui/utils/llm_image_utils.py
 create mode 100644 tests/unit/utils/test_llm_image_utils.py

diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py
index 7b7818f8..29b96a15 100644
--- a/src/askui/android_agent.py
+++ b/src/askui/android_agent.py
@@ -99,6 +99,7 @@ def __init__(
         self.act_agent_os_facade = AndroidAgentOsFacade(
             self.os,
             coordinate_space=self._vlm_provider.coordinate_space,
+            image_scaler=self._vlm_provider.image_scaler,
         )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with Android-specific settings
diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py
index 6e53df87..7f121dbf 100644
--- a/src/askui/computer_agent.py
+++ b/src/askui/computer_agent.py
@@ -132,6 +132,7 @@ def __init__(
         self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
             self.tools.os,
             coordinate_space=self._vlm_provider.coordinate_space,
+            image_scaler=self._vlm_provider.image_scaler,
         )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with computer-specific settings
diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py
index 9edd42b9..37ca9a5d 100644
--- a/src/askui/model_providers/anthropic_vlm_provider.py
+++ b/src/askui/model_providers/anthropic_vlm_provider.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 from anthropic import Anthropic
+from PIL import Image
 from typing_extensions import override
 
 from askui.model_providers.vlm_provider import VlmProvider
@@ -14,11 +15,25 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
+from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
+_DEFAULT_MAX_IMAGE_EDGE = 1568
+
+
+def _anthropic_image_scaler(image: Image.Image, max_edge: int) -> Image.Image:
+    target = compute_patch_optimized_size(
+        image.width,
+        image.height,
+        max_edge=max_edge,
+        max_tokens=1568,
+        patch_size=28,
+    )
+    return resize_image(image, target)
 
 
 class AnthropicVlmProvider(VlmProvider):
@@ -46,6 +61,11 @@ class AnthropicVlmProvider(VlmProvider):
             cost in USD per 1M output tokens.
         cache_write_cost_per_million_tokens (float | None, optional): Override
             cost in USD per 1M cache write input tokens.
         cache_read_cost_per_million_tokens (float | None, optional): Override
             cost in USD per 1M cache read input tokens.
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, uses Anthropic-optimized patch-based scaling.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Defaults to 1568.
 
@@ -70,6 +90,8 @@ def __init__(
         auth_token: str | None = None,
         model_id: str | None = None,
         client: Anthropic | None = None,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
         input_cost_per_million_tokens: float | None = None,
         output_cost_per_million_tokens: float | None = None,
         cache_write_cost_per_million_tokens: float | None = None,
@@ -78,6 +100,12 @@ def __init__(
         self._model_id_value = (
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
+        self._image_scaler_override = image_scaler
+        self._max_edge = (
+            max_image_edge
+            or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
+            or _DEFAULT_MAX_IMAGE_EDGE
+        )
         if client is not None:
             self.client = client
         else:
@@ -104,6 +132,14 @@ def model_id(self) -> str:
     def pricing(self) -> ModelPricing | None:
         return self._pricing
 
+    @property
+    @override
+    def image_scaler(self) -> ImageScaler:
+        if self._image_scaler_override is not None:
+            return self._image_scaler_override
+        max_edge = self._max_edge
+        return lambda image: _anthropic_image_scaler(image, max_edge)
+
     @cached_property
     def _messages_api(self) -> AnthropicMessagesApi:
         """Lazily initialise the AnthropicMessagesApi on first use."""
diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py
index d149deff..c8990a6c 100644
--- a/src/askui/model_providers/askui_vlm_provider.py
+++ b/src/askui/model_providers/askui_vlm_provider.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 from anthropic import Anthropic
+from PIL import Image
 from typing_extensions import override
 
 from askui.model_providers.vlm_provider import VlmProvider
@@ -15,10 +16,24 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
+from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
+_DEFAULT_MAX_IMAGE_EDGE = 1568
+
+
+def _askui_image_scaler(image: Image.Image, max_edge: int) -> Image.Image:
+    target = compute_patch_optimized_size(
+        image.width,
+        image.height,
+        max_edge=max_edge,
+        max_tokens=1568,
+        patch_size=28,
+    )
+    return resize_image(image, target)
 
 
 class AskUIVlmProvider(VlmProvider):
@@ -37,6 +52,12 @@ class AskUIVlmProvider(VlmProvider):
             `"claude-sonnet-4-6"`.
         client (Anthropic | None, optional): Pre-configured Anthropic client.
             If provided, `workspace_id` and `token` are ignored.
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, uses Anthropic-optimized patch-based scaling.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Defaults to 1568.
+
     Example:
         ```python
         from askui import AgentSettings, ComputerAgent
@@ -57,18 +78,34 @@ def __init__(
         askui_settings: AskUiInferenceApiSettings | None = None,
         model_id: str | None = None,
         client: Anthropic | None = None,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
     ) -> None:
         self._askui_settings = askui_settings or AskUiInferenceApiSettings()
         self._model_id_value = (
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
         self._injected_client = client
+        self._image_scaler_override = image_scaler
+        self._max_edge = (
+            max_image_edge
+            or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
+            or _DEFAULT_MAX_IMAGE_EDGE
+        )
 
     @property
     @override
     def model_id(self) -> str:
         return self._model_id_value
 
+    @property
+    @override
+    def image_scaler(self) -> ImageScaler:
+        if self._image_scaler_override is not None:
+            return self._image_scaler_override
+        max_edge = self._max_edge
+        return lambda image: _askui_image_scaler(image, max_edge)
+
     @cached_property
     def _messages_api(self) -> AnthropicMessagesApi:
         """Lazily initialise the AnthropicMessagesApi on first use."""
diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py
index 1cca3905..c53103ed 100644
--- a/src/askui/model_providers/ollama_vlm_provider.py
+++ b/src/askui/model_providers/ollama_vlm_provider.py
@@ -10,6 +10,7 @@
     ScaledCoordinateSpace,
     VlmCoordinateSpace,
 )
+from askui.models.shared.image_scaler import ImageScaler
 
 _DEFAULT_BASE_URL = "http://localhost:11434/v1"
 _DEFAULT_MODEL_ID = "qwen3.5"
@@ -40,6 +41,12 @@ class OllamaVlmProvider(OpenAIVlmProvider):
         coordinate_space (VlmCoordinateSpace | None, optional): The coordinate
             grid the model emits coordinates in.  ``None`` (the default)
             enables auto-detection based on ``model_id``.
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, inherits from `OpenAIVlmProvider`.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Inherits the default from
+            `OpenAIVlmProvider` (2048).
 
     Example:
         ```python
@@ -60,6 +67,8 @@ def __init__(
         base_url: str = _DEFAULT_BASE_URL,
         client: OpenAI | None = None,
         coordinate_space: VlmCoordinateSpace | None = None,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
     ) -> None:
         self._coordinate_space_override = coordinate_space
         super().__init__(
@@ -68,6 +77,8 @@ def __init__(
             base_url=base_url,
             client=client,
             coordinate_space=coordinate_space or PixelCoordinateSpace(),
+            image_scaler=image_scaler,
+            max_image_edge=max_image_edge,
         )
 
     @property
diff --git a/src/askui/model_providers/openai_compatible_vlm_provider.py b/src/askui/model_providers/openai_compatible_vlm_provider.py
index aae55c11..a574913b 100644
--- a/src/askui/model_providers/openai_compatible_vlm_provider.py
+++ b/src/askui/model_providers/openai_compatible_vlm_provider.py
@@ -4,6 +4,7 @@
 from openai import OpenAI
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
+from askui.models.shared.image_scaler import ImageScaler
 
 
 class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
@@ -20,6 +21,12 @@ class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
             (e.g. ``"https://my-host/v1/chat/completions"``).
         model_id (str): Model name expected by the deployment.
         api_key (str | None, optional): API key for the endpoint.
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, inherits from `OpenAIVlmProvider`.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Inherits the default from
+            `OpenAIVlmProvider` (2048).
 
     Example:
         ```python
@@ -41,6 +48,8 @@ def __init__(
         endpoint_url: str,
         model_id: str | None = None,
         api_key: str | None = None,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
     ) -> None:
         def _rewrite_url(request: httpx.Request) -> None:
             request.url = httpx.URL(endpoint_url)
@@ -56,4 +65,6 @@ def _rewrite_url(request: httpx.Request) -> None:
         super().__init__(
             model_id=model_id,
             client=client,
+            image_scaler=image_scaler,
+            max_image_edge=max_image_edge,
         )
diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py
index 8ac5f6a6..693451b2 100644
--- a/src/askui/model_providers/openai_vlm_provider.py
+++ b/src/askui/model_providers/openai_vlm_provider.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 from openai import OpenAI
+from PIL import Image
 from typing_extensions import override
 
 from askui.model_providers.vlm_provider import VlmProvider
@@ -15,16 +16,29 @@
     ToolChoiceParam,
 )
 from askui.models.shared.coordinate_space import (
-    SCREENSHOT_RESOLUTION,
     PixelCoordinateSpace,
     VlmCoordinateSpace,
 )
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
+from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "gpt-5.4"
 _DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
+_DEFAULT_MAX_IMAGE_EDGE = 2048
+
+
+def _openai_image_scaler(image: Image.Image, max_edge: int) -> Image.Image:
+    target = compute_patch_optimized_size(
+        image.width,
+        image.height,
+        max_edge=max_edge,
+        max_tokens=1536,
+        patch_size=32,
+    )
+    return resize_image(image, target)
 
 
 class OpenAIVlmProvider(VlmProvider):
@@ -45,6 +59,9 @@ class OpenAIVlmProvider(VlmProvider):
         coordinate_space (VlmCoordinateSpace, optional): The coordinate grid
             the model emits coordinates in.  Defaults to the screenshot
             resolution (native pixel coordinates).
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Defaults to 2048.
 
     Example:
         ```python
@@ -67,6 +84,8 @@ def __init__(
         base_url: str | None = None,
         client: OpenAI | None = None,
         coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
         input_cost_per_million_tokens: float | None = None,
         output_cost_per_million_tokens: float | None = None,
         cache_write_cost_per_million_tokens: float | None = None,
@@ -76,6 +95,12 @@ def __init__(
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
         self._coordinate_space = coordinate_space
+        self._image_scaler_override = image_scaler
+        self._max_edge = (
+            max_image_edge
+            or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
+            or _DEFAULT_MAX_IMAGE_EDGE
+        )
         if client is not None:
             self._client = client
         else:
@@ -107,6 +132,14 @@ def coordinate_space(self) -> VlmCoordinateSpace:
     def pricing(self) -> ModelPricing | None:
         return self._pricing
 
+    @property
+    @override
+    def image_scaler(self) -> ImageScaler:
+        if self._image_scaler_override is not None:
+            return self._image_scaler_override
+        max_edge = self._max_edge
+        return lambda image: _openai_image_scaler(image, max_edge)
+
     @cached_property
     def _messages_api(self) -> OpenAIMessagesApi:
         """Lazily initialise the `OpenAIMessagesApi` on first use."""
@@ -115,9 +148,7 @@ def _messages_api(self) -> OpenAIMessagesApi:
     @override
     def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
         """Append coordinate and resolution info to the system prompt."""
-        coord_info = self.coordinate_space.build_prompt_section(
-            screenshot_resolution=SCREENSHOT_RESOLUTION,
-        )
+        coord_info = self.coordinate_space.build_prompt_section()
         return SystemPrompt(prompt=f"{str(system)}\n\n{coord_info}")
 
     @override
diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py
index 6d4d9738..5cea3284 100644
--- a/src/askui/model_providers/vlm_provider.py
+++ b/src/askui/model_providers/vlm_provider.py
@@ -3,6 +3,8 @@
 from abc import ABC, abstractmethod
 from typing import Any
 
+from PIL import Image
+
 from askui.models.shared.agent_message_param import (
     MessageParam,
     ThinkingConfigParam,
@@ -12,13 +14,20 @@
     PixelCoordinateSpace,
     VlmCoordinateSpace,
 )
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
+from askui.utils.llm_image_utils import compute_contained_size, resize_image
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
 
 
+def _default_image_scaler(image: Image.Image) -> Image.Image:
+    target = compute_contained_size(image.width, image.height)
+    return resize_image(image, target)
+
+
 class VlmProvider(ABC):
     """Interface for Vision Language Model providers.
 
@@ -70,6 +79,14 @@ def pricing(self) -> ModelPricing | None:
         """
         return None
 
+    @property
+    def image_scaler(self) -> ImageScaler:
+        """Callable that preprocesses a screenshot before sending to the model.
+
+        Override in subclasses for provider-specific sizing.
+        """
+        return _default_image_scaler
+
     def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
         """Hook for providers to augment the system prompt before sending.
 
diff --git a/src/askui/models/anthropic/get_model.py b/src/askui/models/anthropic/get_model.py
index 7bed5627..421126e6 100644
--- a/src/askui/models/anthropic/get_model.py
+++ b/src/askui/models/anthropic/get_model.py
@@ -20,7 +20,7 @@
 from askui.models.types.response_schemas import ResponseSchema
 from askui.prompts.get_prompts import SYSTEM_PROMPT_GET
 from askui.utils.excel_utils import OfficeDocumentSource
-from askui.utils.image_utils import scale_image_to_fit
+from askui.utils.llm_image_utils import compute_contained_size, resize_image
 from askui.utils.pdf_utils import PdfSource
 from askui.utils.source_utils import Source
 
@@ -78,10 +78,13 @@ def get(
             if response_schema is not None:
                 error_msg = "Response schema is not yet supported for Anthropic"
                 raise NotImplementedError(error_msg)
-            scaled_image = scale_image_to_fit(
-                source.root,
-                get_settings.resolution,
+            target_size = compute_contained_size(
+                source.root.width,
+                source.root.height,
+                get_settings.resolution.width,
+                get_settings.resolution.height,
             )
+            scaled_image = resize_image(source.root, target_size)
             messages = built_messages_for_get_and_locate(scaled_image, query)
             message = self._messages_api.create_message(
                 messages=messages,
diff --git a/src/askui/models/askui/locate_models/anthropic_locate_model.py b/src/askui/models/askui/locate_models/anthropic_locate_model.py
index d2b78c27..3856b8f7 100644
--- a/src/askui/models/askui/locate_models/anthropic_locate_model.py
+++ b/src/askui/models/askui/locate_models/anthropic_locate_model.py
@@ -20,8 +20,8 @@
 from askui.utils.image_utils import (
     ImageSource,
     scale_coordinates,
-    scale_image_to_fit,
 )
+from askui.utils.llm_image_utils import compute_contained_size, resize_image
 
 
 class AnthropicLocateModel(LocateModel):
@@ -77,14 +77,17 @@ def locate(
         try:
             prompt = f"Click on {locator_serialized}"
             resolution = locate_settings.resolution
-            screen_width = resolution.width
-            screen_height = resolution.height
-            scaled_image = scale_image_to_fit(
-                image.root,
-                resolution,
+            target_size = compute_contained_size(
+                image.root.width,
+                image.root.height,
+                resolution.width,
+                resolution.height,
             )
+            scaled_image = resize_image(image.root, target_size)
             messages = built_messages_for_get_and_locate(scaled_image, prompt)
-            system = build_system_prompt_locate(str(screen_width), str(screen_height))
+            system = build_system_prompt_locate(
+                str(scaled_image.width), str(scaled_image.height)
+            )
             message = self._messages_api.create_message(
                 messages=messages,
                 model_id=self._model_id,
@@ -100,7 +103,7 @@ def locate(
                 scale_coordinates(
                     extract_click_coordinates(content_text.text),
                     image.root.size,
-                    resolution,
+                    scaled_image.size,
                     inverse=True,
                 )
             ]
diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py
index 635fc053..cc225d6e 100644
--- a/src/askui/models/shared/__init__.py
+++ b/src/askui/models/shared/__init__.py
@@ -6,6 +6,7 @@
     ScaledCoordinateSpace,
     VlmCoordinateSpace,
 )
+from .image_scaler import ImageScaler
 from .tool_tags import ToolTags
 
 try:
@@ -19,6 +20,7 @@
 __all__ = [
     "AndroidBaseTool",
     "ComputerBaseTool",
+    "ImageScaler",
     "NormalizedCoordinateSpace",
     "PixelCoordinateSpace",
     "ScaledCoordinateSpace",
diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py
index 3de94ac8..c2cd7c71 100644
--- a/src/askui/models/shared/coordinate_space.py
+++ b/src/askui/models/shared/coordinate_space.py
@@ -4,20 +4,9 @@
 
 from pydantic import BaseModel, Field
 
-# The resolution screenshots are scaled to before being sent to the model.
-# Used by all agent OS facades (computer, Android, Playwright).
-SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768)
 
-
-def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]:
-    sw, sh = screenshot_resolution
-    return [
-        f"* Screenshot resolution: {sw}x{sh} pixels",
-        "* Screenshots may contain black padding bars to preserve the "
-        "original aspect ratio. UI elements are NOT located in the "
-        "padding area.",
-        "* Coordinate origin is the top-left corner (0, 0)",
-    ]
+def _common_prompt_lines() -> list[str]:
+    return ["* Coordinate origin is the top-left corner (0, 0)"]
 
 
 class VlmCoordinateSpace(BaseModel, ABC):
@@ -45,7 +34,7 @@ def map_to_target(
         """Map model coordinates to pixel coordinates in *target_resolution*."""
 
     @abstractmethod
-    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+    def build_prompt_section(self) -> str:
         """Build prompt text describing coordinate bounds for the model."""
 
 
@@ -68,10 +57,11 @@ def map_to_target(
     ) -> tuple[int, int]:
         return int(x), int(y)
 
-    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
-        sw, sh = screenshot_resolution
-        lines = _common_prompt_lines(screenshot_resolution)
-        lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}")
+    def build_prompt_section(self) -> str:
+        lines = _common_prompt_lines()
+        lines.append(
+            "* Coordinates are in pixel space matching the screenshot dimensions"
+        )
         return "\n".join(lines)
 
 
@@ -87,17 +77,13 @@ def map_to_target(
         tw, th = target_resolution
         return int(x * tw / self.width), int(y * th / self.height)
 
-    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
-        lines = _common_prompt_lines(screenshot_resolution)
-        if (self.width, self.height) != screenshot_resolution:
-            lines.append(
-                f"* Emit coordinates in a {self.width}x{self.height} "
-                f"normalised grid: 0 <= x < {self.width}, "
-                f"0 <= y < {self.height}"
-            )
-        else:
-            sw, sh = screenshot_resolution
-            lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}")
+    def build_prompt_section(self) -> str:
+        lines = _common_prompt_lines()
+        lines.append(
+            f"* Emit coordinates in a {self.width}x{self.height} "
+            f"normalised grid: 0 <= x < {self.width}, "
+            f"0 <= y < {self.height}"
+        )
         return "\n".join(lines)
 
 
@@ -110,8 +96,8 @@ def map_to_target(
         tw, th = target_resolution
         return int(x * tw), int(y * th)
 
-    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
-        lines = _common_prompt_lines(screenshot_resolution)
+    def build_prompt_section(self) -> str:
+        lines = _common_prompt_lines()
         lines.append(
             "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0"
         )
diff --git a/src/askui/models/shared/image_scaler.py b/src/askui/models/shared/image_scaler.py
new file mode 100644
index 00000000..3c579e81
--- /dev/null
+++ b/src/askui/models/shared/image_scaler.py
@@ -0,0 +1,8 @@
+"""Type alias for image scaling callables used by VLM providers."""
+
+from collections.abc import Callable
+
+from PIL import Image
+
+ImageScaler = Callable[[Image.Image], Image.Image]
+"""Callable that preprocesses a screenshot before sending to a model."""
diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py
index 02aa9c7c..55132efa 100644
--- a/src/askui/tools/android/agent_os_facade.py
+++ b/src/askui/tools/android/agent_os_facade.py
@@ -2,14 +2,12 @@
 
 from PIL import Image
 
-from askui.models.shared.coordinate_space import (
-    SCREENSHOT_RESOLUTION,
-    VlmCoordinateSpace,
-)
+from askui.models.shared.coordinate_space import VlmCoordinateSpace
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
 from askui.tools.android.uiautomator_hierarchy import UIElementCollection
-from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
+from askui.utils.image_utils import scale_coordinates
 
 
 class AndroidAgentOsFacade(AndroidAgentOs):
@@ -23,9 +21,11 @@ def __init__(
         self,
         agent_os: AndroidAgentOs,
         coordinate_space: VlmCoordinateSpace,
+        image_scaler: ImageScaler,
     ) -> None:
         self._agent_os: AndroidAgentOs = agent_os
-        self._target_resolution: Tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._image_scaler = image_scaler
+        self._target_resolution: Optional[Tuple[int, int]] = None
         self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: Optional[Tuple[int, int]] = None
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
@@ -41,10 +41,15 @@ def disconnect(self) -> None:
     def screenshot(self) -> Image.Image:
         screenshot = self._agent_os.screenshot()
         self._real_screen_resolution = screenshot.size
-        return scale_image_to_fit(
-            screenshot,
-            self._target_resolution,
-        )
+        scaled = self._image_scaler(screenshot)
+        self._target_resolution = scaled.size
+        return scaled
+
+    def _ensure_target_resolution(self) -> Tuple[int, int]:
+        if self._target_resolution is None:
+            self.screenshot()
+        assert self._target_resolution is not None  # noqa: S101
+        return self._target_resolution
 
     def _scale_coordinates(
         self,
@@ -55,15 +60,17 @@ def _scale_coordinates(
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.screenshot().size
 
+        target_resolution = self._ensure_target_resolution()
+
         if from_agent:
             if self._coordinate_space.maps_to_screenshot_pixels:
                 mapped_x, mapped_y = self._coordinate_space.map_to_target(
-                    x, y, self._target_resolution
+                    x, y, target_resolution
                 )
                 return scale_coordinates(
                     (mapped_x, mapped_y),
                     self._real_screen_resolution,
-                    self._target_resolution,
+                    target_resolution,
                     inverse=True,
                 )
             return self._coordinate_space.map_to_target(
@@ -73,7 +80,7 @@ def _scale_coordinates(
         return scale_coordinates(
             (int(x), int(y)),
             self._real_screen_resolution,
-            self._target_resolution,
+            target_resolution,
             inverse=False,
         )
 
diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py
index 6f7cc75b..63cc6dde 100644
--- a/src/askui/tools/computer_agent_os_facade.py
+++ b/src/askui/tools/computer_agent_os_facade.py
@@ -2,10 +2,8 @@
 
 from PIL import Image
 
-from askui.models.shared.coordinate_space import (
-    SCREENSHOT_RESOLUTION,
-    VlmCoordinateSpace,
-)
+from askui.models.shared.coordinate_space import VlmCoordinateSpace
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.agent_os import (
     AgentOs,
@@ -19,7 +17,7 @@
     PcKey,
 )
 from askui.tools.askui.askui_controller import RenderObjectStyle  # noqa: TC001
-from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
+from askui.utils.image_utils import scale_coordinates
 
 if TYPE_CHECKING:
     from askui.tools.askui.askui_ui_controller_grpc.generated import (
@@ -44,9 +42,11 @@ def __init__(
         self,
         agent_os: AgentOs,
         coordinate_space: VlmCoordinateSpace,
+        image_scaler: ImageScaler,
     ) -> None:
         self._agent_os = agent_os
-        self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._image_scaler = image_scaler
+        self._target_resolution: tuple[int, int] | None = None
         self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: DisplaySize | None = None
         self.tags.append(ToolTags.SCALED_AGENT_OS.value)
@@ -64,7 +64,9 @@ def screenshot(self, report: bool = True) -> Image.Image:
         self._real_screen_resolution = DisplaySize(
             width=screenshot.width, height=screenshot.height
         )
-        return scale_image_to_fit(screenshot, self._target_resolution)
+        scaled = self._image_scaler(screenshot)
+        self._target_resolution = scaled.size
+        return scaled
 
     def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
         scaled_x, scaled_y = self._scale_coordinates_back(x, y)
@@ -299,7 +301,7 @@ def get_file(self, path: str) -> Image.Image | str:
         """
         response = self._agent_os.get_file(path)
         if isinstance(response, Image.Image):
-            return scale_image_to_fit(response, self._target_resolution)
+            return self._image_scaler(response)
         return response
 
     def remove_virtual_displays(self) -> None:
@@ -309,6 +311,12 @@ def remove_virtual_displays(self) -> None:
         self._agent_os.remove_virtual_displays()
         self._real_screen_resolution = None
 
+    def _ensure_target_resolution(self) -> tuple[int, int]:
+        if self._target_resolution is None:
+            self.screenshot(report=False)
+        assert self._target_resolution is not None  # noqa: S101
+        return self._target_resolution
+
     def _scale_coordinates_back(
         self,
         x: float,
@@ -319,6 +327,7 @@ def _scale_coordinates_back(
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.retrieve_active_display().size
 
+        target_resolution = self._ensure_target_resolution()
         real_size = (
             self._real_screen_resolution.width,
             self._real_screen_resolution.height,
@@ -327,12 +336,12 @@ def _scale_coordinates_back(
         if from_agent:
             if self._coordinate_space.maps_to_screenshot_pixels:
                 mapped_x, mapped_y = self._coordinate_space.map_to_target(
-                    x, y, self._target_resolution
+                    x, y, target_resolution
                 )
                 return scale_coordinates(
                     (mapped_x, mapped_y),
                     real_size,
-                    self._target_resolution,
+                    target_resolution,
                     inverse=True,
                     check_coordinates_in_bounds=check_coordinates_in_bounds,
                 )
@@ -341,7 +350,7 @@ def _scale_coordinates_back(
         return scale_coordinates(
             (int(x), int(y)),
             real_size,
-            self._target_resolution,
+            target_resolution,
             inverse=False,
             check_coordinates_in_bounds=check_coordinates_in_bounds,
         )
diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py
index 3e4f8500..a286476c 100644
--- a/src/askui/tools/playwright/agent_os_facade.py
+++ b/src/askui/tools/playwright/agent_os_facade.py
@@ -2,35 +2,37 @@
 
 from PIL import Image
 
-from askui.models.shared.coordinate_space import (
-    SCREENSHOT_RESOLUTION,
-    VlmCoordinateSpace,
-)
+from askui.models.shared.coordinate_space import VlmCoordinateSpace
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.agent_os import Display, ModifierKey, PcKey
 from askui.tools.playwright.agent_os import PlaywrightAgentOs
-from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
+from askui.utils.image_utils import scale_coordinates
 
 
 class PlaywrightAgentOsFacade(PlaywrightAgentOs):
     """Facade for `PlaywrightAgentOs` that adds coordinate scaling.
 
-    Screenshots are scaled down to a fixed target resolution so that the
-    AI model always sees a consistent image size.  Coordinate-based inputs
+    Screenshots are scaled using the provider's image scaler so that the
+    AI model sees an optimally sized image.  Coordinate-based inputs
     (``mouse_move``) are scaled back up to the real page resolution before
     being forwarded to the underlying agent OS.
 
     Args:
         agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap.
+        coordinate_space (VlmCoordinateSpace): Coordinate grid the model uses.
+        image_scaler (ImageScaler): Callable to preprocess screenshots.
     """
 
     def __init__(
         self,
         agent_os: PlaywrightAgentOs,
         coordinate_space: VlmCoordinateSpace,
+        image_scaler: ImageScaler,
     ) -> None:
         self._agent_os = agent_os
-        self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._image_scaler = image_scaler
+        self._target_resolution: tuple[int, int] | None = None
         self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: tuple[int, int] | None = None
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
@@ -48,7 +50,15 @@ def disconnect(self) -> None:
     def screenshot(self, report: bool = True) -> Image.Image:
         screenshot = self._agent_os.screenshot(report=report)
         self._real_screen_resolution = screenshot.size
-        return scale_image_to_fit(screenshot, self._target_resolution)
+        scaled = self._image_scaler(screenshot)
+        self._target_resolution = scaled.size
+        return scaled
+
+    def _ensure_target_resolution(self) -> tuple[int, int]:
+        if self._target_resolution is None:
+            self.screenshot(report=False)
+        assert self._target_resolution is not None  # noqa: S101
+        return self._target_resolution
 
     def _scale_coordinates(
         self,
@@ -61,15 +71,17 @@ def _scale_coordinates(
                 report=False,
             ).size
 
+        target_resolution = self._ensure_target_resolution()
+
         if from_agent:
             if self._coordinate_space.maps_to_screenshot_pixels:
                 mapped_x, mapped_y = self._coordinate_space.map_to_target(
-                    x, y, self._target_resolution
+                    x, y, target_resolution
                 )
                 return scale_coordinates(
                     (mapped_x, mapped_y),
                     self._real_screen_resolution,
-                    self._target_resolution,
+                    target_resolution,
                     inverse=True,
                 )
             return self._coordinate_space.map_to_target(
@@ -79,7 +91,7 @@ def _scale_coordinates(
         return scale_coordinates(
             (int(x), int(y)),
             self._real_screen_resolution,
-            self._target_resolution,
+            target_resolution,
             inverse=False,
         )
 
diff --git a/src/askui/tools/store/universal/load_image_tool.py b/src/askui/tools/store/universal/load_image_tool.py
index 5a0512e9..b763f2ee 100644
--- a/src/askui/tools/store/universal/load_image_tool.py
+++ b/src/askui/tools/store/universal/load_image_tool.py
@@ -4,7 +4,7 @@
 from PIL import Image
 
 from askui.models.shared.tools import Tool
-from askui.utils.image_utils import scale_image_to_fit
+from askui.utils.llm_image_utils import compute_contained_size, resize_image
 
 
 class LoadImageTool(Tool):
@@ -116,7 +116,13 @@ def __call__(self, image_path: str = "") -> Tuple[str, Image.Image]:
             raise FileExistsError(error_msg)
 
         image = Image.open(absolute_image_path)
-        image = scale_image_to_fit(image, target_size=self._target_size)
+        target_size = compute_contained_size(
+            image.width,
+            image.height,
+            self._target_size[0],
+            self._target_size[1],
+        )
+        image = resize_image(image, target_size)
 
         return (
             f"Image was successfully loaded from {absolute_image_path}",
diff --git a/src/askui/utils/llm_image_utils.py b/src/askui/utils/llm_image_utils.py
new file mode 100644
index 00000000..688854bc
--- /dev/null
+++ b/src/askui/utils/llm_image_utils.py
@@ -0,0 +1,161 @@
+"""Image utilities for LLM vision model preprocessing.
+
+Functions for computing optimal image sizes based on patch-based token budgets
+and resizing images for VLM consumption.
+"""
+
+import logging
+import math
+
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+
+def count_image_tokens(width: int, height: int, patch_size: int = 28) -> int:
+    """Return how many tokens a patch-based VLM spends on an image.
+
+    The image is tiled with ``patch_size x patch_size`` squares; every tile,
+    including partially covered ones at the edges, costs one token.
+
+    Args:
+        width (int): Image width in pixels.
+        height (int): Image height in pixels.
+        patch_size (int): Side length of a single patch in pixels.
+
+    Returns:
+        int: Number of image tokens.
+    """
+    columns, rows = (math.ceil(edge / patch_size) for edge in (width, height))
+    return columns * rows
+
+
+def compute_patch_optimized_size(
+    width: int,
+    height: int,
+    max_edge: int = 1568,
+    max_tokens: int = 1568,
+    patch_size: int = 28,
+) -> tuple[int, int]:
+    """Compute the largest aspect-preserving size within a patch-based token budget.
+
+    Finds the biggest scale factor (never above 1.0, so no upscaling) such that:
+    - Neither dimension exceeds ``max_edge``.
+    - ``count_image_tokens(w, h, patch_size) <= max_tokens``.
+
+    Args:
+        width (int): Original image width.
+        height (int): Original image height.
+        max_edge (int): Maximum allowed dimension (width or height).
+        max_tokens (int): Maximum allowed number of image tokens.
+        patch_size (int): Patch size used by the model.
+
+    Returns:
+        tuple[int, int]: Target ``(width, height)``.
+
+    Raises:
+        ValueError: If ``width`` or ``height`` is not positive.
+    """
+    if width <= 0 or height <= 0:
+        error_msg = f"Image dimensions must be positive, got {width}x{height}"
+        raise ValueError(error_msg)
+
+    def _fits(factor: float) -> tuple[int, int] | None:
+        """Return the size at *factor* if it is within the token budget."""
+        size = max(1, int(width * factor)), max(1, int(height * factor))
+        return size if count_image_tokens(*size, patch_size) <= max_tokens else None
+
+    # Clamp to max_edge first; a scale of 1.0 leaves the image untouched.
+    scale = min(max_edge / width, max_edge / height, 1.0)
+    # The bisection below only probes midpoints and converges just under
+    # ``hi``, so accept the clamped size directly when it already fits.
+    if (clamped := _fits(scale)) is not None:
+        return clamped
+
+    # Binary search for largest scale that fits within token budget
+    lo, hi = 0.0, scale
+    for _ in range(50):
+        mid = (lo + hi) / 2
+        if _fits(mid) is not None:
+            lo = mid
+        else:
+            hi = mid
+
+    return max(1, int(width * lo)), max(1, int(height * lo))
+
+
+def compute_contained_size(
+    width: int,
+    height: int,
+    max_width: int = 1024,
+    max_height: int = 768,
+) -> tuple[int, int]:
+    """Fit an image inside a bounding box without changing its aspect ratio.
+
+    Images that already fit are never enlarged; their size is returned as-is.
+
+    Args:
+        width (int): Original image width.
+        height (int): Original image height.
+        max_width (int): Maximum allowed width.
+        max_height (int): Maximum allowed height.
+
+    Returns:
+        tuple[int, int]: Target ``(width, height)``.
+    """
+    if height <= 0 or width <= 0:
+        error_msg = f"Image dimensions must be positive, got {width}x{height}"
+        raise ValueError(error_msg)
+
+    if max_width >= width and max_height >= height:
+        return width, height
+    # The tighter of the two axis ratios decides how far to shrink.
+    shrink = min(max_width / width, max_height / height)
+    return max(1, int(width * shrink)), max(1, int(height * shrink))
+
+
+def resize_image(image: Image.Image, target_size: tuple[int, int]) -> Image.Image:
+    """Scale an image to exactly ``target_size`` with LANCZOS resampling.
+
+    The input is returned untouched when it already has the requested size.
+    A warning is logged if the aspect ratio shifts by more than 1%.
+
+    Args:
+        image (Image.Image): Source image.
+        target_size (tuple[int, int]): Target ``(width, height)``.
+
+    Returns:
+        Image.Image: Resized image.
+    """
+    if image.size == target_size:
+        return image
+
+    ratio_before = image.width / image.height
+    ratio_after = target_size[0] / target_size[1]
+    # Relative drift, measured against the larger of the two ratios.
+    drift = abs(ratio_before - ratio_after) / max(ratio_before, ratio_after)
+    if drift > 0.01:
+        message = "Aspect ratio change during resize: %.3f -> %.3f"
+        logger.warning(message, ratio_before, ratio_after)
+
+    return image.resize(target_size, Image.Resampling.LANCZOS)
+
+
+def resize_and_pad_image(
+    image: Image.Image,
+    target_size: tuple[int, int],
+) -> Image.Image:
+    """Resize preserving aspect ratio, then center on a padded canvas.
+
+    Thin wrapper that delegates to ``askui.utils.image_utils.scale_image_to_fit``.
+
+    Args:
+        image (Image.Image): Source image.
+        target_size (tuple[int, int]): Canvas ``(width, height)``.
+
+    Returns:
+        Image.Image: Image centered on a ``target_size`` canvas.
+    """
+    from askui.utils.image_utils import scale_image_to_fit
+
+    return scale_image_to_fit(image, target_size)
diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py
index bc211ec9..d1c94232 100644
--- a/src/askui/web_agent.py
+++ b/src/askui/web_agent.py
@@ -72,6 +72,7 @@ def __init__(
         self.act_agent_os_facade = PlaywrightAgentOsFacade(
             self.os,
             coordinate_space=self._vlm_provider.coordinate_space,
+            image_scaler=self._vlm_provider.image_scaler,
         )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         self.act_settings = ActSettings(
diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py
index 1d33f1d5..8fb787ab 100644
--- a/tests/unit/model_providers/test_openai_vlm_provider.py
+++ b/tests/unit/model_providers/test_openai_vlm_provider.py
@@ -3,6 +3,7 @@
 from unittest.mock import MagicMock
 
 from openai import OpenAI
+from PIL import Image
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
 from askui.models.shared.agent_message_param import MessageParam
@@ -74,67 +75,77 @@ def test_augment_system_prompt_scaled_coordinate_space(self) -> None:
         rendered = str(augmented)
         assert "You are a helpful assistant." in rendered
         assert "1000x1000 normalised grid" in rendered
-        assert "1024x768" in rendered
 
-    def test_augment_system_prompt_pixel_bounds_when_matching(self) -> None:
+    def test_augment_system_prompt_pixel_coordinate_space(self) -> None:
         provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test")
         system = SystemPrompt(prompt="Base prompt.")
         augmented = provider.augment_system_prompt(system)
 
         rendered = str(augmented)
         assert "normalised grid" not in rendered
-        assert "0 <= x < 1024" in rendered
+        assert "pixel space matching the screenshot dimensions" in rendered
+
+
+class TestImageScaler:
+    def test_default_scaler_returns_valid_image(self) -> None:
+        provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test")
+        img = Image.new("RGB", (1920, 1080))
+        scaled = provider.image_scaler(img)
+        assert scaled.width <= 2048
+        assert scaled.height <= 2048
+
+    def test_custom_scaler_override(self) -> None:
+        def custom_scaler(image: Image.Image) -> Image.Image:
+            return image.resize((100, 100))
+
+        provider = OpenAIVlmProvider(
+            model_id="gpt-4o",
+            api_key="sk-test",
+            image_scaler=custom_scaler,
+        )
+        img = Image.new("RGB", (1920, 1080))
+        scaled = provider.image_scaler(img)
+        assert scaled.size == (100, 100)
 
 
 class TestPixelCoordinateSpacePrompt:
-    def test_shows_pixel_bounds(self) -> None:
+    def test_shows_pixel_space_description(self) -> None:
         cs = PixelCoordinateSpace()
-        result = cs.build_prompt_section((1024, 768))
-        assert "0 <= x < 1024" in result
-        assert "0 <= y < 768" in result
+        result = cs.build_prompt_section()
+        assert "pixel space matching the screenshot dimensions" in result
         assert "normalised grid" not in result
 
-    def test_includes_padding_and_origin_info(self) -> None:
+    def test_includes_origin_info(self) -> None:
         cs = PixelCoordinateSpace()
-        result = cs.build_prompt_section((1024, 768))
-        assert "black padding" in result
+        result = cs.build_prompt_section()
         assert "top-left" in result
 
 
 class TestScaledCoordinateSpacePrompt:
     def test_shows_normalised_grid(self) -> None:
         cs = ScaledCoordinateSpace(width=1000, height=1000)
-        result = cs.build_prompt_section((1024, 768))
-        assert "1024x768" in result
+        result = cs.build_prompt_section()
         assert "1000x1000 normalised grid" in result
         assert "0 <= x < 1000" in result
         assert "0 <= y < 1000" in result
 
-    def test_matching_resolution_shows_pixel_bounds(self) -> None:
-        cs = ScaledCoordinateSpace(width=1024, height=768)
-        result = cs.build_prompt_section((1024, 768))
-        assert "0 <= x < 1024" in result
-        assert "normalised grid" not in result
-
-    def test_includes_padding_and_origin_info(self) -> None:
+    def test_includes_origin_info(self) -> None:
         cs = ScaledCoordinateSpace(width=1000, height=1000)
-        result = cs.build_prompt_section((1024, 768))
-        assert "black padding" in result
+        result = cs.build_prompt_section()
         assert "top-left" in result
 
 
 class TestNormalizedCoordinateSpacePrompt:
     def test_shows_normalised_floats(self) -> None:
         cs = NormalizedCoordinateSpace()
-        result = cs.build_prompt_section((1024, 768))
+        result = cs.build_prompt_section()
         assert "0.0 <= x <= 1.0" in result
         assert "0.0 <= y <= 1.0" in result
         assert "normalised floats" in result
 
-    def test_includes_padding_and_origin_info(self) -> None:
+    def test_includes_origin_info(self) -> None:
         cs = NormalizedCoordinateSpace()
-        result = cs.build_prompt_section((1024, 768))
-        assert "black padding" in result
+        result = cs.build_prompt_section()
         assert "top-left" in result
 
 
diff --git a/tests/unit/tools/test_agent_os_facade_coordinates.py b/tests/unit/tools/test_agent_os_facade_coordinates.py
index e81e8214..3b9e1b89 100644
--- a/tests/unit/tools/test_agent_os_facade_coordinates.py
+++ b/tests/unit/tools/test_agent_os_facade_coordinates.py
@@ -15,6 +15,13 @@
     ScaledCoordinateSpace,
 )
 from askui.tools.android.agent_os_facade import AndroidAgentOsFacade
+from askui.utils.llm_image_utils import compute_contained_size, resize_image
+
+
+def _default_scaler(image: Image.Image) -> Image.Image:
+    """Scaler that mimics the default contained-size logic."""
+    target = compute_contained_size(image.width, image.height, 1024, 768)
+    return resize_image(image, target)
 
 
 def _make_android_facade(
@@ -27,16 +34,23 @@ def _make_android_facade(
     mock_os = MagicMock()
     mock_os.tags = []
     mock_os.screenshot.return_value = Image.new("RGB", device_size)
-    facade = AndroidAgentOsFacade(mock_os, coordinate_space=coordinate_space)
+    facade = AndroidAgentOsFacade(
+        mock_os,
+        coordinate_space=coordinate_space,
+        image_scaler=_default_scaler,
+    )
     facade._real_screen_resolution = device_size
+    # Set target resolution as the scaler would produce it
+    scaled = _default_scaler(Image.new("RGB", device_size))
+    facade._target_resolution = scaled.size
     return facade
 
 
 class TestScaledCoordinateSpaceTallDevice:
     """Qwen 0-1000 grid on a tall Android device (1080x2400).
 
-    The screenshot is scaled to 345x768 with 339px horizontal padding,
-    so the old code would produce negative x when x_model < ~331.
+    Non-pixel coordinate spaces map directly to device resolution,
+    so no padding offset is involved.
     """
 
     device = (1080, 2400)
@@ -90,30 +104,33 @@ def test_left_side_tap(self) -> None:
 class TestPixelCoordinateSpaceTallDevice:
     """Claude pixel coordinates on a tall Android device (1080x2400).
 
-    Pixel coordinates are in the padded 1024x768 screenshot space
-    and must go through the padding-aware inverse scaling pipeline.
+    With the no-padding scaler, a 1080x2400 device is scaled to
+    compute_contained_size(1080, 2400, 1024, 768) = (345, 768).
+    Pixel coordinates are in the (345, 768) screenshot space and go
+    through the padding-aware inverse scaling pipeline.  Because the
+    image nearly fills the target (only ~2 px rounding slack), offsets
+    are close to zero but not exactly zero.
     """
 
     device = (1080, 2400)
     cs = PixelCoordinateSpace()
 
     def test_center_of_content(self) -> None:
-        """The center of the content area in the padded screenshot."""
+        """The center of the content area in the scaled screenshot."""
         facade = _make_android_facade(self.device, self.cs)
-        # Content area: x=[339..684], y=[0..768] in 1024x768 screenshot
-        # Center of content: x=511, y=384
-        x, y = facade._scale_coordinates(511, 384)
-        # (511 - 339) / 0.32 = 537.5 → 537, (384 - 0) / 0.32 = 1200
-        assert x == pytest.approx(537, abs=2)
-        assert y == 1200
-
-    def test_top_left_of_content(self) -> None:
-        """Top-left corner of the content area."""
+        # Target resolution is (345, 768) — nearly no padding
+        x, y = facade._scale_coordinates(172, 384)
+        assert x == pytest.approx(538, abs=5)
+        assert y == pytest.approx(1200, abs=5)
+
+    def test_near_top_left_of_content(self) -> None:
+        """Coordinate near top-left corner maps back close to origin."""
         facade = _make_android_facade(self.device, self.cs)
-        # Content starts at x=339 in the padded screenshot
-        x, y = facade._scale_coordinates(339, 0)
-        assert x == pytest.approx(0, abs=2)
-        assert y == 0
+        # Use (1, 2) instead of exact origin to avoid rounding-offset
+        # edge case that can produce small negative values.
+        x, y = facade._scale_coordinates(1, 2)
+        assert x == pytest.approx(3, abs=5)
+        assert y == pytest.approx(3, abs=5)
 
 
 class TestSquareDevice:
@@ -136,6 +153,8 @@ def test_device_to_screenshot_scaled_space(self) -> None:
             (1080, 2400), ScaledCoordinateSpace(width=1000, height=1000)
         )
         x, y = facade._scale_coordinates(540, 1200, from_agent=False)
-        # Forward scaling: (540 * 0.32 + 339, 1200 * 0.32 + 0) ≈ (512, 384)
-        assert x == pytest.approx(512, abs=2)
+        # Target resolution is (345, 768), no padding
+        # Forward scaling: factor = 768/2400 = 0.32
+        # x = 540 * 0.32 = 172.8 → 172, y = 1200 * 0.32 = 384
+        assert x == pytest.approx(172, abs=2)
         assert y == pytest.approx(384, abs=2)
diff --git a/tests/unit/utils/test_llm_image_utils.py b/tests/unit/utils/test_llm_image_utils.py
new file mode 100644
index 00000000..714d84c0
--- /dev/null
+++ b/tests/unit/utils/test_llm_image_utils.py
@@ -0,0 +1,145 @@
+"""Tests for LLM image utility functions."""
+
+import logging
+
+import pytest
+from PIL import Image
+
+from askui.utils.llm_image_utils import (
+    compute_contained_size,
+    compute_patch_optimized_size,
+    count_image_tokens,
+    resize_and_pad_image,
+    resize_image,
+)
+
+
+class TestCountImageTokens:
+    def test_exact_patches(self) -> None:
+        # 56x56 with patch_size=28 → 2x2 = 4 tokens
+        assert count_image_tokens(56, 56, patch_size=28) == 4
+
+    def test_single_patch(self) -> None:
+        assert count_image_tokens(28, 28, patch_size=28) == 1
+
+    def test_partial_patches_round_up(self) -> None:
+        # 30x30 with patch_size=28 → ceil(30/28) * ceil(30/28) = 2*2 = 4
+        assert count_image_tokens(30, 30, patch_size=28) == 4
+
+    def test_known_anthropic_value(self) -> None:
+        # 1568x1568 with patch_size=28 → 56*56 = 3136
+        assert count_image_tokens(1568, 1568, patch_size=28) == 3136
+
+    def test_rectangular(self) -> None:
+        # 1024x768 with patch_size=28 → ceil(1024/28)*ceil(768/28) = 37*28 = 1036
+        assert count_image_tokens(1024, 768, patch_size=28) == 37 * 28
+
+
+class TestComputePatchOptimizedSize:
+    def test_small_image_unchanged(self) -> None:
+        # A small image that fits within all constraints is returned as-is
+        w, h = compute_patch_optimized_size(200, 100)
+        assert w == 200
+        assert h == 100
+
+    def test_respects_max_edge(self) -> None:
+        w, h = compute_patch_optimized_size(3000, 2000, max_edge=1568)
+        assert w <= 1568
+        assert h <= 1568
+
+    def test_respects_max_tokens(self) -> None:
+        w, h = compute_patch_optimized_size(
+            1920, 1080, max_edge=1568, max_tokens=1568, patch_size=28
+        )
+        tokens = count_image_tokens(w, h, patch_size=28)
+        assert tokens <= 1568
+
+    def test_preserves_aspect_ratio(self) -> None:
+        w, h = compute_patch_optimized_size(1920, 1080)
+        original_ratio = 1920 / 1080
+        result_ratio = w / h
+        assert abs(original_ratio - result_ratio) / original_ratio < 0.02
+
+    def test_invalid_dimensions_raises(self) -> None:
+        with pytest.raises(ValueError, match="positive"):
+            compute_patch_optimized_size(0, 100)
+
+    def test_openai_params(self) -> None:
+        w, h = compute_patch_optimized_size(
+            1920, 1080, max_edge=2048, max_tokens=1536, patch_size=32
+        )
+        tokens = count_image_tokens(w, h, patch_size=32)
+        assert tokens <= 1536
+        assert w <= 2048
+        assert h <= 2048
+
+
+class TestComputeContainedSize:
+    def test_already_fits(self) -> None:
+        assert compute_contained_size(800, 600, 1024, 768) == (800, 600)
+
+    def test_exact_match(self) -> None:
+        assert compute_contained_size(1024, 768, 1024, 768) == (1024, 768)
+
+    def test_landscape_too_wide(self) -> None:
+        w, h = compute_contained_size(2048, 768, 1024, 768)
+        assert w <= 1024
+        assert h <= 768
+
+    def test_portrait_too_tall(self) -> None:
+        w, h = compute_contained_size(768, 2048, 1024, 768)
+        assert w <= 1024
+        assert h <= 768
+
+    def test_preserves_aspect_ratio(self) -> None:
+        w, h = compute_contained_size(1920, 1080, 1024, 768)
+        original_ratio = 1920 / 1080
+        result_ratio = w / h
+        assert abs(original_ratio - result_ratio) / original_ratio < 0.02
+
+    def test_invalid_dimensions_raises(self) -> None:
+        with pytest.raises(ValueError, match="positive"):
+            compute_contained_size(0, 100)
+
+
+class TestResizeImage:
+    def test_correct_dimensions(self) -> None:
+        img = Image.new("RGB", (1920, 1080))
+        result = resize_image(img, (1024, 576))
+        assert result.size == (1024, 576)
+
+    def test_no_op_when_same_size(self) -> None:
+        img = Image.new("RGB", (1024, 768))
+        result = resize_image(img, (1024, 768))
+        assert result is img  # Same object, no copy
+
+    def test_aspect_ratio_warning_logged(
+        self, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        img = Image.new("RGB", (1920, 1080))
+        with caplog.at_level(logging.WARNING):
+            resize_image(img, (1024, 768))
+        assert "Aspect ratio change" in caplog.text
+
+    def test_no_warning_when_ratio_preserved(
+        self, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        img = Image.new("RGB", (1920, 1080))
+        with caplog.at_level(logging.WARNING):
+            resize_image(img, (960, 540))
+        assert "Aspect ratio change" not in caplog.text
+
+
+class TestResizeAndPadImage:
+    def test_correct_dimensions(self) -> None:
+        img = Image.new("RGB", (1920, 1080))
+        result = resize_and_pad_image(img, (1024, 768))
+        assert result.size == (1024, 768)
+
+    def test_preserves_aspect_ratio_with_padding(self) -> None:
+        img = Image.new("RGB", (1080, 2400), color=(255, 0, 0))
+        result = resize_and_pad_image(img, (1024, 768))
+        assert result.size == (1024, 768)
+        # Check that some padding exists (black pixels at edges)
+        left_pixel = result.getpixel((0, 0))
+        assert left_pixel == (0, 0, 0)  # Black padding

From 3665cc42646e8c5749247c9a23c2b3fdd9b5964f Mon Sep 17 00:00:00 2001
From: philipph-askui <philipp.hallgarten@askui.com>
Date: Thu, 11 Jun 2026 14:53:00 +0200
Subject: [PATCH 4/6] refactor: clean up PR (composition, deduplication,
 exports)

---
 src/askui/model_providers/__init__.py         |  14 ++-
 .../model_providers/anthropic_vlm_provider.py |  16 +--
 .../model_providers/askui_vlm_provider.py     |  33 ++---
 .../openai_compatible_vlm_provider.py         |  11 ++
 .../model_providers/openai_vlm_provider.py    |  18 +--
 src/askui/models/shared/__init__.py           |   2 +-
 src/askui/tools/android/agent_os_facade.py    | 115 +++++++-----------
 src/askui/tools/computer_agent_os_facade.py   |  98 +++++----------
 src/askui/tools/coordinate_scaling_mixin.py   |  99 +++++++++++++++
 src/askui/tools/playwright/agent_os_facade.py |  73 +++--------
 src/askui/utils/llm_image_utils.py            |  30 +++++
 .../tools/test_agent_os_facade_coordinates.py |  28 ++---
 12 files changed, 271 insertions(+), 266 deletions(-)
 create mode 100644 src/askui/tools/coordinate_scaling_mixin.py

diff --git a/src/askui/model_providers/__init__.py b/src/askui/model_providers/__init__.py
index ae1f0d0d..9424577d 100644
--- a/src/askui/model_providers/__init__.py
+++ b/src/askui/model_providers/__init__.py
@@ -35,6 +35,13 @@
 from askui.model_providers.openai_image_qa_provider import OpenAIImageQAProvider
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
 from askui.model_providers.vlm_provider import VlmProvider
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
+from askui.models.shared.image_scaler import ImageScaler
 from askui.utils.model_pricing import ModelPricing
 
 __all__ = [
@@ -46,11 +53,16 @@
     "DetectionProvider",
     "GoogleImageQAProvider",
     "ImageQAProvider",
+    "ImageScaler",
     "ModelPricing",
+    "NormalizedCoordinateSpace",
     "OllamaImageQAProvider",
     "OllamaVlmProvider",
+    "OpenAICompatibleVlmProvider",
     "OpenAIImageQAProvider",
     "OpenAIVlmProvider",
-    "OpenAICompatibleVlmProvider",
+    "PixelCoordinateSpace",
+    "ScaledCoordinateSpace",
+    "VlmCoordinateSpace",
     "VlmProvider",
 ]
diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py
index 37ca9a5d..f094f22c 100644
--- a/src/askui/model_providers/anthropic_vlm_provider.py
+++ b/src/askui/model_providers/anthropic_vlm_provider.py
@@ -5,7 +5,6 @@
 from typing import Any
 
 from anthropic import Anthropic
-from PIL import Image
 from typing_extensions import override
 
 from askui.model_providers.vlm_provider import VlmProvider
@@ -18,24 +17,13 @@
 from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
-from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image
+from askui.utils.llm_image_utils import compute_patch_optimized_image
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
 _DEFAULT_MAX_IMAGE_EDGE = 1568
 
 
-def _anthropic_image_scaler(image: Image.Image, max_edge: int) -> Image.Image:
-    target = compute_patch_optimized_size(
-        image.width,
-        image.height,
-        max_edge=max_edge,
-        max_tokens=1568,
-        patch_size=28,
-    )
-    return resize_image(image, target)
-
-
 class AnthropicVlmProvider(VlmProvider):
     """VLM provider that routes requests directly to the Anthropic API.
 
@@ -138,7 +126,7 @@ def image_scaler(self) -> ImageScaler:
         if self._image_scaler_override is not None:
             return self._image_scaler_override
         max_edge = self._max_edge
-        return lambda image: _anthropic_image_scaler(image, max_edge)
+        return lambda image: compute_patch_optimized_image(image, max_edge=max_edge)
 
     @cached_property
     def _messages_api(self) -> AnthropicMessagesApi:
diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py
index c8990a6c..f402ee3c 100644
--- a/src/askui/model_providers/askui_vlm_provider.py
+++ b/src/askui/model_providers/askui_vlm_provider.py
@@ -5,7 +5,6 @@
 from typing import Any
 
 from anthropic import Anthropic
-from PIL import Image
 from typing_extensions import override
 
 from askui.model_providers.vlm_provider import VlmProvider
@@ -19,23 +18,12 @@
 from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
-from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image
+from askui.utils.llm_image_utils import compute_patch_optimized_image
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
 _DEFAULT_MAX_IMAGE_EDGE = 1568
 
 
-def _askui_image_scaler(image: Image.Image, max_edge: int) -> Image.Image:
-    target = compute_patch_optimized_size(
-        image.width,
-        image.height,
-        max_edge=max_edge,
-        max_tokens=1568,
-        patch_size=28,
-    )
-    return resize_image(image, target)
-
-
 class AskUIVlmProvider(VlmProvider):
     """VLM provider that routes requests through AskUI's hosted Anthropic proxy.
 
@@ -44,14 +32,13 @@ class AskUIVlmProvider(VlmProvider):
     on the first API call, not at construction time.
 
     Args:
-        workspace_id (str | None, optional): AskUI workspace ID. Reads
-            `ASKUI_WORKSPACE_ID` from the environment if not provided.
-        token (str | None, optional): AskUI API token. Reads `ASKUI_TOKEN`
-            from the environment if not provided.
-        model_id (str, optional): Claude model to use. Defaults to
-            `"claude-sonnet-4-6"`.
-        client (Anthropic | None, optional): Pre-configured Anthropic client.
-            If provided, `workspace_id` and `token` are ignored.
+        askui_settings (`AskUiInferenceApiSettings` | None, optional):
+            Connection settings (workspace ID, token, base URL).  Reads
+            from environment variables if not provided.
+        model_id (str | None, optional): Claude model to use. Defaults to
+            ``"claude-sonnet-4-6"``.
+        client (`Anthropic` | None, optional): Pre-configured Anthropic client.
+            If provided, ``askui_settings`` is only used for the base URL.
         image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
             callable. If ``None``, uses Anthropic-optimized patch-based scaling.
         max_image_edge (int | None, optional): Maximum edge length (in pixels)
@@ -65,8 +52,6 @@ class AskUIVlmProvider(VlmProvider):
 
         agent = ComputerAgent(settings=AgentSettings(
             vlm_provider=AskUIVlmProvider(
-                workspace_id="my-workspace",
-                token="my-token",
                 model_id="claude-opus-4-6-20260401",
             )
         ))
@@ -104,7 +89,7 @@ def image_scaler(self) -> ImageScaler:
         if self._image_scaler_override is not None:
             return self._image_scaler_override
         max_edge = self._max_edge
-        return lambda image: _askui_image_scaler(image, max_edge)
+        return lambda image: compute_patch_optimized_image(image, max_edge=max_edge)
 
     @cached_property
     def _messages_api(self) -> AnthropicMessagesApi:
diff --git a/src/askui/model_providers/openai_compatible_vlm_provider.py b/src/askui/model_providers/openai_compatible_vlm_provider.py
index a574913b..98b50627 100644
--- a/src/askui/model_providers/openai_compatible_vlm_provider.py
+++ b/src/askui/model_providers/openai_compatible_vlm_provider.py
@@ -4,8 +4,14 @@
 from openai import OpenAI
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
+from askui.models.shared.coordinate_space import (
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.image_scaler import ImageScaler
 
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
+
 
 class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
     """VLM provider for OpenAI-compatible APIs that require an exact endpoint URL.
@@ -21,6 +27,9 @@ class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
             (e.g. ``"https://my-host/v1/chat/completions"``).
         model_id (str): Model name expected by the deployment.
         api_key (str | None, optional): API key for the endpoint.
+        coordinate_space (`VlmCoordinateSpace`, optional): The coordinate
+            grid the model emits coordinates in.  Defaults to
+            `PixelCoordinateSpace()` (pixel coordinates).
         image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
             callable. If ``None``, inherits from `OpenAIVlmProvider`.
         max_image_edge (int | None, optional): Maximum edge length (in pixels)
@@ -48,6 +57,7 @@ def __init__(
         endpoint_url: str,
         model_id: str | None = None,
         api_key: str | None = None,
+        coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
         image_scaler: ImageScaler | None = None,
         max_image_edge: int | None = None,
     ) -> None:
@@ -65,6 +75,7 @@ def _rewrite_url(request: httpx.Request) -> None:
         super().__init__(
             model_id=model_id,
             client=client,
+            coordinate_space=coordinate_space,
             image_scaler=image_scaler,
             max_image_edge=max_image_edge,
         )
diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py
index 693451b2..bf25eb00 100644
--- a/src/askui/model_providers/openai_vlm_provider.py
+++ b/src/askui/model_providers/openai_vlm_provider.py
@@ -5,7 +5,6 @@
 from typing import Any
 
 from openai import OpenAI
-from PIL import Image
 from typing_extensions import override
 
 from askui.model_providers.vlm_provider import VlmProvider
@@ -22,7 +21,7 @@
 from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
-from askui.utils.llm_image_utils import compute_patch_optimized_size, resize_image
+from askui.utils.llm_image_utils import compute_patch_optimized_image
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "gpt-5.4"
@@ -30,17 +29,6 @@
 _DEFAULT_MAX_IMAGE_EDGE = 2048
 
 
-def _openai_image_scaler(image: Image.Image, max_edge: int) -> Image.Image:
-    target = compute_patch_optimized_size(
-        image.width,
-        image.height,
-        max_edge=max_edge,
-        max_tokens=1536,
-        patch_size=32,
-    )
-    return resize_image(image, target)
-
-
 class OpenAIVlmProvider(VlmProvider):
     """VLM provider for any OpenAI-compatible API.
 
@@ -138,7 +126,9 @@ def image_scaler(self) -> ImageScaler:
         if self._image_scaler_override is not None:
             return self._image_scaler_override
         max_edge = self._max_edge
-        return lambda image: _openai_image_scaler(image, max_edge)
+        return lambda image: compute_patch_optimized_image(
+            image, max_edge=max_edge, max_tokens=1536, patch_size=32
+        )
 
     @cached_property
     def _messages_api(self) -> OpenAIMessagesApi:
diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py
index cc225d6e..84bd3d22 100644
--- a/src/askui/models/shared/__init__.py
+++ b/src/askui/models/shared/__init__.py
@@ -24,8 +24,8 @@
     "NormalizedCoordinateSpace",
     "PixelCoordinateSpace",
     "ScaledCoordinateSpace",
-    "VlmCoordinateSpace",
     "ToolTags",
+    "VlmCoordinateSpace",
 ]
 
 if _PLAYWRIGHT_AVAILABLE:
diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py
index 55132efa..9fcfa25f 100644
--- a/src/askui/tools/android/agent_os_facade.py
+++ b/src/askui/tools/android/agent_os_facade.py
@@ -1,20 +1,31 @@
-from typing import List, Optional, Tuple
+from __future__ import annotations
 
-from PIL import Image
+from typing import TYPE_CHECKING
 
-from askui.models.shared.coordinate_space import VlmCoordinateSpace
-from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
-from askui.tools.android.uiautomator_hierarchy import UIElementCollection
-from askui.utils.image_utils import scale_coordinates
+from askui.tools.coordinate_scaling_mixin import CoordinateScaler
+
+if TYPE_CHECKING:
+    from PIL import Image
+
+    from askui.models.shared.coordinate_space import VlmCoordinateSpace
+    from askui.models.shared.image_scaler import ImageScaler
+    from askui.tools.android.uiautomator_hierarchy import UIElementCollection
 
 
 class AndroidAgentOsFacade(AndroidAgentOs):
-    """
-    Facade for AndroidAgentOs that adds coordinate scaling functionality.
-    It is used to scale the coordinates to the target resolution
-    and back to the real screen resolution.
+    """Facade for `AndroidAgentOs` that adds coordinate scaling.
+
+    Screenshots are scaled using the provider's image scaler so that the
+    AI model sees an optimally sized image.  Coordinate-based inputs
+    (``tap``, ``swipe``, ``drag_and_drop``) are scaled back up to the
+    real device resolution before being forwarded to the underlying agent OS.
+
+    Args:
+        agent_os (`AndroidAgentOs`): The real Android agent OS to wrap.
+        coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses.
+        image_scaler (`ImageScaler`): Callable to preprocess screenshots.
     """
 
     def __init__(
@@ -24,82 +35,42 @@ def __init__(
         image_scaler: ImageScaler,
     ) -> None:
         self._agent_os: AndroidAgentOs = agent_os
-        self._image_scaler = image_scaler
-        self._target_resolution: Optional[Tuple[int, int]] = None
-        self._coordinate_space: VlmCoordinateSpace = coordinate_space
-        self._real_screen_resolution: Optional[Tuple[int, int]] = None
+        self._scaler = CoordinateScaler(
+            coordinate_space=coordinate_space,
+            image_scaler=image_scaler,
+            fetch_real_resolution=lambda: self._agent_os.screenshot().size,
+            take_screenshot=lambda: self.screenshot(),
+        )
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
 
     def connect(self) -> None:
         self._agent_os.connect()
-        self._real_screen_resolution = self._agent_os.screenshot().size
+        self._scaler.real_screen_resolution = self._agent_os.screenshot().size
 
     def disconnect(self) -> None:
         self._agent_os.disconnect()
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def screenshot(self) -> Image.Image:
         screenshot = self._agent_os.screenshot()
-        self._real_screen_resolution = screenshot.size
-        scaled = self._image_scaler(screenshot)
-        self._target_resolution = scaled.size
-        return scaled
-
-    def _ensure_target_resolution(self) -> Tuple[int, int]:
-        if self._target_resolution is None:
-            self.screenshot()
-        assert self._target_resolution is not None  # noqa: S101
-        return self._target_resolution
-
-    def _scale_coordinates(
-        self,
-        x: float,
-        y: float,
-        from_agent: bool = True,
-    ) -> Tuple[int, int]:
-        if self._real_screen_resolution is None:
-            self._real_screen_resolution = self._agent_os.screenshot().size
-
-        target_resolution = self._ensure_target_resolution()
-
-        if from_agent:
-            if self._coordinate_space.maps_to_screenshot_pixels:
-                mapped_x, mapped_y = self._coordinate_space.map_to_target(
-                    x, y, target_resolution
-                )
-                return scale_coordinates(
-                    (mapped_x, mapped_y),
-                    self._real_screen_resolution,
-                    target_resolution,
-                    inverse=True,
-                )
-            return self._coordinate_space.map_to_target(
-                x, y, self._real_screen_resolution
-            )
-
-        return scale_coordinates(
-            (int(x), int(y)),
-            self._real_screen_resolution,
-            target_resolution,
-            inverse=False,
-        )
+        return self._scaler.scale_screenshot(screenshot)
 
     def tap(self, x: float, y: float) -> None:
-        x, y = self._scale_coordinates(x, y)
+        x, y = self._scaler.scale_coordinates(x, y)
         self._agent_os.tap(x, y)
 
     def swipe(
         self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000
     ) -> None:
-        x1, y1 = self._scale_coordinates(x1, y1)
-        x2, y2 = self._scale_coordinates(x2, y2)
+        x1, y1 = self._scaler.scale_coordinates(x1, y1)
+        x2, y2 = self._scaler.scale_coordinates(x2, y2)
         self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms)
 
     def drag_and_drop(
         self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000
     ) -> None:
-        x1, y1 = self._scale_coordinates(x1, y1)
-        x2, y2 = self._scale_coordinates(x2, y2)
+        x1, y1 = self._scaler.scale_coordinates(x1, y1)
+        x2, y2 = self._scaler.scale_coordinates(x2, y2)
         self._agent_os.drag_and_drop(x1, y1, x2, y2, duration_in_ms)
 
     def type(self, text: str) -> None:
@@ -109,7 +80,7 @@ def key_tap(self, key: ANDROID_KEY) -> None:
         self._agent_os.key_tap(key)
 
     def key_combination(
-        self, keys: List[ANDROID_KEY], duration_in_ms: int = 100
+        self, keys: list[ANDROID_KEY], duration_in_ms: int = 100
     ) -> None:
         self._agent_os.key_combination(keys, duration_in_ms)
 
@@ -121,27 +92,27 @@ def get_connected_displays(self) -> list[AndroidDisplay]:
 
     def set_display_by_index(self, display_index: int = 0) -> None:
         self._agent_os.set_display_by_index(display_index)
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def set_display_by_unique_id(self, display_unique_id: int) -> None:
         self._agent_os.set_display_by_unique_id(display_unique_id)
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def set_display_by_id(self, display_id: int) -> None:
         self._agent_os.set_display_by_id(display_id)
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def set_display_by_name(self, display_name: str) -> None:
         self._agent_os.set_display_by_name(display_name)
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def set_device_by_index(self, device_index: int = 0) -> None:
         self._agent_os.set_device_by_index(device_index)
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def set_device_by_serial_number(self, device_sn: str) -> None:
         self._agent_os.set_device_by_serial_number(device_sn)
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def get_connected_devices_serial_numbers(self) -> list[str]:
         return self._agent_os.get_connected_devices_serial_numbers()
@@ -165,7 +136,7 @@ def get_ui_elements(self) -> UIElementCollection:
             if element.center is None:
                 continue
             element.set_center(
-                self._scale_coordinates(
+                self._scaler.scale_coordinates(
                     x=element.center[0],
                     y=element.center[1],
                     from_agent=False,
diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py
index 63cc6dde..848f6a10 100644
--- a/src/askui/tools/computer_agent_os_facade.py
+++ b/src/askui/tools/computer_agent_os_facade.py
@@ -9,7 +9,6 @@
     AgentOs,
     Coordinate,
     Display,
-    DisplaySize,
     DisplaysListResponse,
     InputEvent,
     ModifierKey,
@@ -17,7 +16,7 @@
     PcKey,
 )
 from askui.tools.askui.askui_controller import RenderObjectStyle  # noqa: TC001
-from askui.utils.image_utils import scale_coordinates
+from askui.tools.coordinate_scaling_mixin import CoordinateScaler
 
 if TYPE_CHECKING:
     from askui.tools.askui.askui_ui_controller_grpc.generated import (
@@ -31,11 +30,17 @@
 
 
 class ComputerAgentOsFacade(AgentOs):
-    """
-    Facade for AgentOs that adds coordinate scaling functionality.
+    """Facade for `AgentOs` that adds coordinate scaling.
+
+    Screenshots are scaled using the provider's image scaler so that the
+    AI model sees an optimally sized image.  Coordinate-based inputs
+    are scaled back up to the real screen resolution before being forwarded
+    to the underlying agent OS.
 
-    This class is used to scale the coordinates to the target resolution
-    and back to the real screen resolution.
+    Args:
+        agent_os (`AgentOs`): The real agent OS to wrap.
+        coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses.
+        image_scaler (`ImageScaler`): Callable to preprocess screenshots.
     """
 
     def __init__(
@@ -45,42 +50,43 @@ def __init__(
         image_scaler: ImageScaler,
     ) -> None:
         self._agent_os = agent_os
-        self._image_scaler = image_scaler
-        self._target_resolution: tuple[int, int] | None = None
-        self._coordinate_space: VlmCoordinateSpace = coordinate_space
-        self._real_screen_resolution: DisplaySize | None = None
+        self._scaler = CoordinateScaler(
+            coordinate_space=coordinate_space,
+            image_scaler=image_scaler,
+            fetch_real_resolution=self._fetch_real_screen_resolution,
+            take_screenshot=lambda: self.screenshot(report=False),
+        )
         self.tags.append(ToolTags.SCALED_AGENT_OS.value)
 
     def connect(self) -> None:
         self._agent_os.connect()
-        self._real_screen_resolution = self._agent_os.retrieve_active_display().size
+        self._scaler.real_screen_resolution = self._fetch_real_screen_resolution()
 
     def disconnect(self) -> None:
         self._agent_os.disconnect()
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def screenshot(self, report: bool = True) -> Image.Image:
         screenshot = self._agent_os.screenshot(report=report)
-        self._real_screen_resolution = DisplaySize(
-            width=screenshot.width, height=screenshot.height
-        )
-        scaled = self._image_scaler(screenshot)
-        self._target_resolution = scaled.size
-        return scaled
+        return self._scaler.scale_screenshot(screenshot)
+
+    def _fetch_real_screen_resolution(self) -> tuple[int, int]:
+        display = self._agent_os.retrieve_active_display()
+        return display.size.width, display.size.height
 
     def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
-        scaled_x, scaled_y = self._scale_coordinates_back(x, y)
+        scaled_x, scaled_y = self._scaler.scale_coordinates(x, y)
         self._agent_os.mouse_move(scaled_x, scaled_y, duration)
 
     def get_mouse_position(self) -> Coordinate:
         mouse_position = self._agent_os.get_mouse_position()
-        scaled_x, scaled_y = self._scale_coordinates_back(
+        scaled_x, scaled_y = self._scaler.scale_coordinates(
             mouse_position.x, mouse_position.y, from_agent=False
         )
         return Coordinate(x=scaled_x, y=scaled_y)
 
     def set_mouse_position(self, x: float, y: float) -> None:
-        scaled_x, scaled_y = self._scale_coordinates_back(x, y)
+        scaled_x, scaled_y = self._scaler.scale_coordinates(x, y)
         self._agent_os.set_mouse_position(scaled_x, scaled_y)
 
     def type(self, text: str, typing_speed: int = 50) -> None:
@@ -124,7 +130,7 @@ def retrieve_active_display(self) -> Display:
 
     def set_display(self, display: int = 1) -> None:
         self._agent_os.set_display(display)
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def run_command(self, command: str, timeout_ms: int = 30000) -> None:
         self._agent_os.run_command(command, timeout_ms)
@@ -301,7 +307,7 @@ def get_file(self, path: str) -> Image.Image | str:
         """
         response = self._agent_os.get_file(path)
         if isinstance(response, Image.Image):
-            return self._image_scaler(response)
+            return self._scaler._image_scaler(response)  # don't clobber screen state
         return response
 
     def remove_virtual_displays(self) -> None:
@@ -309,48 +315,4 @@ def remove_virtual_displays(self) -> None:
         Remove virtual displays from the controller, leaving real displays only.
         """
         self._agent_os.remove_virtual_displays()
-        self._real_screen_resolution = None
-
-    def _ensure_target_resolution(self) -> tuple[int, int]:
-        if self._target_resolution is None:
-            self.screenshot(report=False)
-        assert self._target_resolution is not None  # noqa: S101
-        return self._target_resolution
-
-    def _scale_coordinates_back(
-        self,
-        x: float,
-        y: float,
-        from_agent: bool = True,
-        check_coordinates_in_bounds: bool = True,
-    ) -> tuple[int, int]:
-        if self._real_screen_resolution is None:
-            self._real_screen_resolution = self._agent_os.retrieve_active_display().size
-
-        target_resolution = self._ensure_target_resolution()
-        real_size = (
-            self._real_screen_resolution.width,
-            self._real_screen_resolution.height,
-        )
-
-        if from_agent:
-            if self._coordinate_space.maps_to_screenshot_pixels:
-                mapped_x, mapped_y = self._coordinate_space.map_to_target(
-                    x, y, target_resolution
-                )
-                return scale_coordinates(
-                    (mapped_x, mapped_y),
-                    real_size,
-                    target_resolution,
-                    inverse=True,
-                    check_coordinates_in_bounds=check_coordinates_in_bounds,
-                )
-            return self._coordinate_space.map_to_target(x, y, real_size)
-
-        return scale_coordinates(
-            (int(x), int(y)),
-            real_size,
-            target_resolution,
-            inverse=False,
-            check_coordinates_in_bounds=check_coordinates_in_bounds,
-        )
+        self._scaler.real_screen_resolution = None
diff --git a/src/askui/tools/coordinate_scaling_mixin.py b/src/askui/tools/coordinate_scaling_mixin.py
new file mode 100644
index 00000000..39d12194
--- /dev/null
+++ b/src/askui/tools/coordinate_scaling_mixin.py
@@ -0,0 +1,99 @@
+"""Coordinate scaling helper used by all agent OS facades."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from askui.utils.image_utils import scale_coordinates
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from PIL import Image
+
+    from askui.models.shared.coordinate_space import VlmCoordinateSpace
+    from askui.models.shared.image_scaler import ImageScaler
+
+
+class CoordinateScaler:
+    """Maps coordinates between model space and device space.
+
+    Each agent OS facade owns an instance and delegates scaling to it.
+
+    Args:
+        coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses.
+        image_scaler (`ImageScaler`): Callable to preprocess screenshots.
+        fetch_real_resolution (`Callable`): Callback that returns the real
+            ``(width, height)`` of the screen/device when it is not yet known.
+        take_screenshot (`Callable`): Callback that triggers a screenshot
+            so that ``target_resolution`` can be populated.
+    """
+
+    def __init__(
+        self,
+        coordinate_space: VlmCoordinateSpace,
+        image_scaler: ImageScaler,
+        fetch_real_resolution: Callable[[], tuple[int, int]],
+        take_screenshot: Callable[[], Image.Image],
+    ) -> None:
+        self._coordinate_space = coordinate_space
+        self._image_scaler = image_scaler
+        self._fetch_real_resolution = fetch_real_resolution
+        self._take_screenshot = take_screenshot
+        self.target_resolution: tuple[int, int] | None = None
+        self.real_screen_resolution: tuple[int, int] | None = None
+
+    def scale_screenshot(self, screenshot: Image.Image) -> Image.Image:
+        """Record real resolution, apply scaler, record target resolution."""
+        self.real_screen_resolution = screenshot.size
+        scaled = self._image_scaler(screenshot)
+        self.target_resolution = scaled.size
+        return scaled
+
+    def scale_coordinates(
+        self,
+        x: float,
+        y: float,
+        from_agent: bool = True,
+        check_coordinates_in_bounds: bool = True,
+    ) -> tuple[int, int]:
+        """Map coordinates between model space and device space.
+
+        When ``from_agent=True``, maps model-emitted coordinates to real
+        device pixels.  When ``from_agent=False``, maps device coordinates
+        to model space (e.g. for reporting element positions back to the model).
+        """
+        if self.real_screen_resolution is None:
+            self.real_screen_resolution = self._fetch_real_resolution()
+
+        target_resolution = self._ensure_target_resolution()
+
+        if from_agent:
+            if self._coordinate_space.maps_to_screenshot_pixels:
+                mapped_x, mapped_y = self._coordinate_space.map_to_target(
+                    x, y, target_resolution
+                )
+                return scale_coordinates(
+                    (mapped_x, mapped_y),
+                    self.real_screen_resolution,
+                    target_resolution,
+                    inverse=True,
+                    check_coordinates_in_bounds=check_coordinates_in_bounds,
+                )
+            return self._coordinate_space.map_to_target(
+                x, y, self.real_screen_resolution
+            )
+
+        return scale_coordinates(
+            (int(x), int(y)),
+            self.real_screen_resolution,
+            target_resolution,
+            inverse=False,
+            check_coordinates_in_bounds=check_coordinates_in_bounds,
+        )
+
+    def _ensure_target_resolution(self) -> tuple[int, int]:
+        if self.target_resolution is None:
+            self._take_screenshot()
+        assert self.target_resolution is not None  # noqa: S101
+        return self.target_resolution
diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py
index a286476c..62a77652 100644
--- a/src/askui/tools/playwright/agent_os_facade.py
+++ b/src/askui/tools/playwright/agent_os_facade.py
@@ -6,8 +6,8 @@
 from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.agent_os import Display, ModifierKey, PcKey
+from askui.tools.coordinate_scaling_mixin import CoordinateScaler
 from askui.tools.playwright.agent_os import PlaywrightAgentOs
-from askui.utils.image_utils import scale_coordinates
 
 
 class PlaywrightAgentOsFacade(PlaywrightAgentOs):
@@ -19,9 +19,9 @@ class PlaywrightAgentOsFacade(PlaywrightAgentOs):
     being forwarded to the underlying agent OS.
 
     Args:
-        agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap.
-        coordinate_space (VlmCoordinateSpace): Coordinate grid the model uses.
-        image_scaler (ImageScaler): Callable to preprocess screenshots.
+        agent_os (`PlaywrightAgentOs`): The real Playwright agent OS to wrap.
+        coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses.
+        image_scaler (`ImageScaler`): Callable to preprocess screenshots.
     """
 
     def __init__(
@@ -31,73 +31,30 @@ def __init__(
         image_scaler: ImageScaler,
     ) -> None:
         self._agent_os = agent_os
-        self._image_scaler = image_scaler
-        self._target_resolution: tuple[int, int] | None = None
-        self._coordinate_space: VlmCoordinateSpace = coordinate_space
-        self._real_screen_resolution: tuple[int, int] | None = None
+        self._scaler = CoordinateScaler(
+            coordinate_space=coordinate_space,
+            image_scaler=image_scaler,
+            fetch_real_resolution=lambda: self._agent_os.screenshot(report=False).size,
+            take_screenshot=lambda: self.screenshot(report=False),
+        )
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
 
     def connect(self) -> None:
         self._agent_os.connect()
-        self._real_screen_resolution = self._agent_os.screenshot(
-            report=False,
+        self._scaler.real_screen_resolution = self._agent_os.screenshot(
+            report=False
         ).size
 
     def disconnect(self) -> None:
         self._agent_os.disconnect()
-        self._real_screen_resolution = None
+        self._scaler.real_screen_resolution = None
 
     def screenshot(self, report: bool = True) -> Image.Image:
         screenshot = self._agent_os.screenshot(report=report)
-        self._real_screen_resolution = screenshot.size
-        scaled = self._image_scaler(screenshot)
-        self._target_resolution = scaled.size
-        return scaled
-
-    def _ensure_target_resolution(self) -> tuple[int, int]:
-        if self._target_resolution is None:
-            self.screenshot(report=False)
-        assert self._target_resolution is not None  # noqa: S101
-        return self._target_resolution
-
-    def _scale_coordinates(
-        self,
-        x: float,
-        y: float,
-        from_agent: bool = True,
-    ) -> tuple[int, int]:
-        if self._real_screen_resolution is None:
-            self._real_screen_resolution = self._agent_os.screenshot(
-                report=False,
-            ).size
-
-        target_resolution = self._ensure_target_resolution()
-
-        if from_agent:
-            if self._coordinate_space.maps_to_screenshot_pixels:
-                mapped_x, mapped_y = self._coordinate_space.map_to_target(
-                    x, y, target_resolution
-                )
-                return scale_coordinates(
-                    (mapped_x, mapped_y),
-                    self._real_screen_resolution,
-                    target_resolution,
-                    inverse=True,
-                )
-            return self._coordinate_space.map_to_target(
-                x, y, self._real_screen_resolution
-            )
-
-        return scale_coordinates(
-            (int(x), int(y)),
-            self._real_screen_resolution,
-            target_resolution,
-            inverse=False,
-        )
+        return self._scaler.scale_screenshot(screenshot)
 
     def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
-        scaled_x, scaled_y = self._scale_coordinates(x, y)
-        # scaled_x, scaled_y = x, y
+        scaled_x, scaled_y = self._scaler.scale_coordinates(x, y)
         self._agent_os.mouse_move(scaled_x, scaled_y, duration)
 
     def type(self, text: str, typing_speed: int = 50) -> None:
diff --git a/src/askui/utils/llm_image_utils.py b/src/askui/utils/llm_image_utils.py
index 688854bc..c74e4f1e 100644
--- a/src/askui/utils/llm_image_utils.py
+++ b/src/askui/utils/llm_image_utils.py
@@ -141,6 +141,36 @@ def resize_image(image: Image.Image, target_size: tuple[int, int]) -> Image.Imag
     return image.resize(target_size, Image.Resampling.LANCZOS)
 
 
+def compute_patch_optimized_image(
+    image: Image.Image,
+    max_edge: int = 1568,
+    max_tokens: int = 1568,
+    patch_size: int = 28,
+) -> Image.Image:
+    """Resize an image to its patch-optimized size.
+
+    Convenience wrapper that combines `compute_patch_optimized_size` and
+    `resize_image` into a single call.
+
+    Args:
+        image (Image.Image): Source image.
+        max_edge (int): Maximum allowed dimension (width or height).
+        max_tokens (int): Maximum allowed number of image tokens.
+        patch_size (int): Patch size used by the model.
+
+    Returns:
+        Image.Image: Resized image.
+    """
+    target = compute_patch_optimized_size(
+        image.width,
+        image.height,
+        max_edge=max_edge,
+        max_tokens=max_tokens,
+        patch_size=patch_size,
+    )
+    return resize_image(image, target)
+
+
 def resize_and_pad_image(
     image: Image.Image,
     target_size: tuple[int, int],
diff --git a/tests/unit/tools/test_agent_os_facade_coordinates.py b/tests/unit/tools/test_agent_os_facade_coordinates.py
index 3b9e1b89..bc0b2868 100644
--- a/tests/unit/tools/test_agent_os_facade_coordinates.py
+++ b/tests/unit/tools/test_agent_os_facade_coordinates.py
@@ -39,10 +39,10 @@ def _make_android_facade(
         coordinate_space=coordinate_space,
         image_scaler=_default_scaler,
     )
-    facade._real_screen_resolution = device_size
+    facade._scaler.real_screen_resolution = device_size
     # Set target resolution as the scaler would produce it
     scaled = _default_scaler(Image.new("RGB", device_size))
-    facade._target_resolution = scaled.size
+    facade._scaler.target_resolution = scaled.size
     return facade
 
 
@@ -58,29 +58,29 @@ class TestScaledCoordinateSpaceTallDevice:
 
     def test_center_tap(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x, y = facade._scale_coordinates(500, 500)
+        x, y = facade._scaler.scale_coordinates(500, 500)
         assert (x, y) == (540, 1200)
 
     def test_left_side_tap(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x, y = facade._scale_coordinates(200, 500)
+        x, y = facade._scaler.scale_coordinates(200, 500)
         assert (x, y) == (216, 1200)
 
     def test_swipe_across(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x1, y1 = facade._scale_coordinates(500, 500)
-        x2, y2 = facade._scale_coordinates(200, 500)
+        x1, y1 = facade._scaler.scale_coordinates(500, 500)
+        x2, y2 = facade._scaler.scale_coordinates(200, 500)
         assert (x1, y1) == (540, 1200)
         assert (x2, y2) == (216, 1200)
 
     def test_origin(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x, y = facade._scale_coordinates(0, 0)
+        x, y = facade._scaler.scale_coordinates(0, 0)
         assert (x, y) == (0, 0)
 
     def test_max_corner(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x, y = facade._scale_coordinates(1000, 1000)
+        x, y = facade._scaler.scale_coordinates(1000, 1000)
         assert (x, y) == (1080, 2400)
 
 
@@ -92,12 +92,12 @@ class TestNormalizedCoordinateSpaceTallDevice:
 
     def test_center_tap(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x, y = facade._scale_coordinates(0.5, 0.5)
+        x, y = facade._scaler.scale_coordinates(0.5, 0.5)
         assert (x, y) == (540, 1200)
 
     def test_left_side_tap(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x, y = facade._scale_coordinates(0.2, 0.5)
+        x, y = facade._scaler.scale_coordinates(0.2, 0.5)
         assert (x, y) == (216, 1200)
 
 
@@ -119,7 +119,7 @@ def test_center_of_content(self) -> None:
         """The center of the content area in the scaled screenshot."""
         facade = _make_android_facade(self.device, self.cs)
         # Target resolution is (345, 768) — nearly no padding
-        x, y = facade._scale_coordinates(172, 384)
+        x, y = facade._scaler.scale_coordinates(172, 384)
         assert x == pytest.approx(538, abs=5)
         assert y == pytest.approx(1200, abs=5)
 
@@ -128,7 +128,7 @@ def test_near_top_left_of_content(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
         # Use (1, 2) instead of exact origin to avoid rounding-offset
         # edge case that can produce small negative values.
-        x, y = facade._scale_coordinates(1, 2)
+        x, y = facade._scaler.scale_coordinates(1, 2)
         assert x == pytest.approx(3, abs=5)
         assert y == pytest.approx(3, abs=5)
 
@@ -141,7 +141,7 @@ class TestSquareDevice:
 
     def test_center(self) -> None:
         facade = _make_android_facade(self.device, self.cs)
-        x, y = facade._scale_coordinates(500, 500)
+        x, y = facade._scaler.scale_coordinates(500, 500)
         assert (x, y) == (512, 384)
 
 
@@ -152,7 +152,7 @@ def test_device_to_screenshot_scaled_space(self) -> None:
         facade = _make_android_facade(
             (1080, 2400), ScaledCoordinateSpace(width=1000, height=1000)
         )
-        x, y = facade._scale_coordinates(540, 1200, from_agent=False)
+        x, y = facade._scaler.scale_coordinates(540, 1200, from_agent=False)
         # Target resolution is (345, 768), no padding
         # Forward scaling: factor = 768/2400 = 0.32
         # x = 540 * 0.32 = 172.8 → 172, y = 1200 * 0.32 = 384

From cca155fac4ed501bfda9a5ac3649044de805c67c Mon Sep 17 00:00:00 2001
From: philipph-askui <philipp.hallgarten@askui.com>
Date: Thu, 11 Jun 2026 15:17:49 +0200
Subject: [PATCH 5/6] chore: fine-tune MAX_IMAGE_EDGE defaults and Kimi coordinate space

---
 src/askui/model_providers/anthropic_vlm_provider.py | 2 +-
 src/askui/model_providers/askui_vlm_provider.py     | 2 +-
 src/askui/model_providers/ollama_vlm_provider.py    | 3 +--
 src/askui/model_providers/openai_vlm_provider.py    | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py
index f094f22c..80168aa3 100644
--- a/src/askui/model_providers/anthropic_vlm_provider.py
+++ b/src/askui/model_providers/anthropic_vlm_provider.py
@@ -21,7 +21,7 @@
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
-_DEFAULT_MAX_IMAGE_EDGE = 1568
+_DEFAULT_MAX_IMAGE_EDGE = 1024
 
 
 class AnthropicVlmProvider(VlmProvider):
diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py
index f402ee3c..615a1af8 100644
--- a/src/askui/model_providers/askui_vlm_provider.py
+++ b/src/askui/model_providers/askui_vlm_provider.py
@@ -21,7 +21,7 @@
 from askui.utils.llm_image_utils import compute_patch_optimized_image
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
-_DEFAULT_MAX_IMAGE_EDGE = 1568
+_DEFAULT_MAX_IMAGE_EDGE = 1024
 
 
 class AskUIVlmProvider(VlmProvider):
diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py
index c53103ed..c313983f 100644
--- a/src/askui/model_providers/ollama_vlm_provider.py
+++ b/src/askui/model_providers/ollama_vlm_provider.py
@@ -5,7 +5,6 @@
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
 from askui.models.shared.coordinate_space import (
-    NormalizedCoordinateSpace,
     PixelCoordinateSpace,
     ScaledCoordinateSpace,
     VlmCoordinateSpace,
@@ -17,7 +16,7 @@
 
 _QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
 _HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
-_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace()
+_KIMI_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
 
 
 class OllamaVlmProvider(OpenAIVlmProvider):
diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py
index bf25eb00..e8d6b8f1 100644
--- a/src/askui/model_providers/openai_vlm_provider.py
+++ b/src/askui/model_providers/openai_vlm_provider.py
@@ -26,7 +26,7 @@
 
 _DEFAULT_MODEL_ID = "gpt-5.4"
 _DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
-_DEFAULT_MAX_IMAGE_EDGE = 2048
+_DEFAULT_MAX_IMAGE_EDGE = 1024
 
 
 class OpenAIVlmProvider(VlmProvider):

From d6415eb3df04207ac785e0bbf1e49a2df9e07408 Mon Sep 17 00:00:00 2001
From: philipph-askui <philipp.hallgarten@askui.com>
Date: Thu, 11 Jun 2026 15:21:54 +0200
Subject: [PATCH 6/6] fix: outdated coordinate space test for kimi

---
 tests/unit/model_providers/test_ollama_vlm_provider.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/unit/model_providers/test_ollama_vlm_provider.py b/tests/unit/model_providers/test_ollama_vlm_provider.py
index e3f78ef5..e4fe32d3 100644
--- a/tests/unit/model_providers/test_ollama_vlm_provider.py
+++ b/tests/unit/model_providers/test_ollama_vlm_provider.py
@@ -7,7 +7,6 @@
 from askui.model_providers.ollama_vlm_provider import OllamaVlmProvider
 from askui.models.shared.agent_message_param import MessageParam
 from askui.models.shared.coordinate_space import (
-    NormalizedCoordinateSpace,
     PixelCoordinateSpace,
     ScaledCoordinateSpace,
 )
@@ -68,11 +67,15 @@ def test_coordinate_space_auto_detects_qwen_case_insensitive(self) -> None:
 
     def test_coordinate_space_auto_detects_kimi(self) -> None:
         provider = OllamaVlmProvider(model_id="kimi-vl")
-        assert provider.coordinate_space == NormalizedCoordinateSpace()
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
 
     def test_coordinate_space_auto_detects_kimi_case_insensitive(self) -> None:
         provider = OllamaVlmProvider(model_id="Kimi-VL-A3B")
-        assert provider.coordinate_space == NormalizedCoordinateSpace()
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
 
     def test_coordinate_space_default_for_non_qwen(self) -> None:
         provider = OllamaVlmProvider(model_id="llava")