askui · philipph-askui · Jun 9, 2026 · Jun 10, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py
@@ -87,7 +87,6 @@ def __init__(
     ) -> None:
         reporter = CompositeReporter(reporters=reporters)
         self.os = PpadbAgentOs(device_identifier=device, reporter=reporter)
-        self.act_agent_os_facade = AndroidAgentOsFacade(self.os)
         super().__init__(
             reporter=reporter,
             retry=retry,
@@ -97,6 +96,11 @@ def __init__(
             callbacks=callbacks,
             truncation_strategy=truncation_strategy,
         )
+        self.act_agent_os_facade = AndroidAgentOsFacade(
+            self.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
+            image_scaler=self._vlm_provider.image_scaler,
+        )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with Android-specific settings
         self.act_settings = ActSettings(

diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py
@@ -130,7 +130,9 @@ def __init__(
             truncation_strategy=truncation_strategy,
         )
         self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
-            self.tools.os
+            self.tools.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
+            image_scaler=self._vlm_provider.image_scaler,
         )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with computer-specific settings

diff --git a/src/askui/model_providers/__init__.py b/src/askui/model_providers/__init__.py
@@ -35,6 +35,13 @@
 from askui.model_providers.openai_image_qa_provider import OpenAIImageQAProvider
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
 from askui.model_providers.vlm_provider import VlmProvider
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
+from askui.models.shared.image_scaler import ImageScaler
 from askui.utils.model_pricing import ModelPricing
 
 __all__ = [
@@ -46,11 +53,16 @@
     "DetectionProvider",
     "GoogleImageQAProvider",
     "ImageQAProvider",
+    "ImageScaler",
     "ModelPricing",
+    "NormalizedCoordinateSpace",
     "OllamaImageQAProvider",
     "OllamaVlmProvider",
+    "OpenAICompatibleVlmProvider",
     "OpenAIImageQAProvider",
     "OpenAIVlmProvider",
-    "OpenAICompatibleVlmProvider",
+    "PixelCoordinateSpace",
+    "ScaledCoordinateSpace",
+    "VlmCoordinateSpace",
     "VlmProvider",
 ]
diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py
@@ -14,11 +14,14 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
+from askui.utils.llm_image_utils import compute_patch_optimized_image
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
+_DEFAULT_MAX_IMAGE_EDGE = 1024
 
 
 class AnthropicVlmProvider(VlmProvider):
@@ -46,6 +49,11 @@ class AnthropicVlmProvider(VlmProvider):
             cost in USD per 1M output tokens.
         cache_write_cost_per_million_tokens (float | None, optional): Override
             cost in USD per 1M cache write input tokens.
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, uses Anthropic-optimized patch-based scaling.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Defaults to 1024.
         cache_read_cost_per_million_tokens (float | None, optional): Override
             cost in USD per 1M cache read input tokens.
 
@@ -70,6 +78,8 @@ def __init__(
         auth_token: str | None = None,
         model_id: str | None = None,
         client: Anthropic | None = None,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
         input_cost_per_million_tokens: float | None = None,
         output_cost_per_million_tokens: float | None = None,
         cache_write_cost_per_million_tokens: float | None = None,
@@ -78,6 +88,12 @@ def __init__(
         self._model_id_value = (
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
+        self._image_scaler_override = image_scaler
+        self._max_edge = (
+            max_image_edge
+            or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
+            or _DEFAULT_MAX_IMAGE_EDGE
+        )
         if client is not None:
             self.client = client
         else:
@@ -104,6 +120,14 @@ def model_id(self) -> str:
     def pricing(self) -> ModelPricing | None:
         return self._pricing
 
+    @property
+    @override
+    def image_scaler(self) -> ImageScaler:
+        if self._image_scaler_override is not None:
+            return self._image_scaler_override
+        max_edge = self._max_edge
+        return lambda image: compute_patch_optimized_image(image, max_edge=max_edge)
+
     @cached_property
     def _messages_api(self) -> AnthropicMessagesApi:
         """Lazily initialise the AnthropicMessagesApi on first use."""

diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py
@@ -15,10 +15,13 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.image_scaler import ImageScaler
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
+from askui.utils.llm_image_utils import compute_patch_optimized_image
 
 _DEFAULT_MODEL_ID = "claude-sonnet-4-6"
+_DEFAULT_MAX_IMAGE_EDGE = 1024
 
 
 class AskUIVlmProvider(VlmProvider):
@@ -29,23 +32,26 @@ class AskUIVlmProvider(VlmProvider):
     on the first API call, not at construction time.
 
     Args:
-        workspace_id (str | None, optional): AskUI workspace ID. Reads
-            `ASKUI_WORKSPACE_ID` from the environment if not provided.
-        token (str | None, optional): AskUI API token. Reads `ASKUI_TOKEN`
-            from the environment if not provided.
-        model_id (str, optional): Claude model to use. Defaults to
-            `"claude-sonnet-4-6"`.
-        client (Anthropic | None, optional): Pre-configured Anthropic client.
-            If provided, `workspace_id` and `token` are ignored.
+        askui_settings (`AskUiInferenceApiSettings` | None, optional):
+            Connection settings (workspace ID, token, base URL).  Reads
+            from environment variables if not provided.
+        model_id (str | None, optional): Claude model to use. Defaults to
+            ``"claude-sonnet-4-6"``.
+        client (`Anthropic` | None, optional): Pre-configured Anthropic client.
+            If provided, ``askui_settings`` is only used for the base URL.
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, uses Anthropic-optimized patch-based scaling.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Defaults to 1024.
+
     Example:
         ```python
         from askui import AgentSettings, ComputerAgent
         from askui.model_providers import AskUIVlmProvider
 
         agent = ComputerAgent(settings=AgentSettings(
             vlm_provider=AskUIVlmProvider(
-                workspace_id="my-workspace",
-                token="my-token",
                 model_id="claude-opus-4-6-20260401",
             )
         ))
@@ -57,18 +63,34 @@ def __init__(
         askui_settings: AskUiInferenceApiSettings | None = None,
         model_id: str | None = None,
         client: Anthropic | None = None,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
     ) -> None:
         self._askui_settings = askui_settings or AskUiInferenceApiSettings()
         self._model_id_value = (
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
         self._injected_client = client
+        self._image_scaler_override = image_scaler
+        self._max_edge = (
+            max_image_edge
+            or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
+            or _DEFAULT_MAX_IMAGE_EDGE
+        )
 
     @property
     @override
     def model_id(self) -> str:
         return self._model_id_value
 
+    @property
+    @override
+    def image_scaler(self) -> ImageScaler:
+        if self._image_scaler_override is not None:
+            return self._image_scaler_override
+        max_edge = self._max_edge
+        return lambda image: compute_patch_optimized_image(image, max_edge=max_edge)
+
     @cached_property
     def _messages_api(self) -> AnthropicMessagesApi:
         """Lazily initialise the AnthropicMessagesApi on first use."""

diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py
@@ -1,26 +1,51 @@
 """OllamaVlmProvider — VLM access via a local Ollama instance."""
 
 from openai import OpenAI
+from typing_extensions import override
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
+from askui.models.shared.coordinate_space import (
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
+from askui.models.shared.image_scaler import ImageScaler
 
 _DEFAULT_BASE_URL = "http://localhost:11434/v1"
 _DEFAULT_MODEL_ID = "qwen3.5"
 
+_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_KIMI_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+
 
 class OllamaVlmProvider(OpenAIVlmProvider):
     """VLM provider that routes requests to a local Ollama instance.
 
     Thin convenience wrapper around `OpenAIVlmProvider` with Ollama
     defaults (``base_url``, ``api_key``, ``model_id``).
 
+    Qwen, Holo and Kimi models are detected automatically from ``model_id``
+    (case-insensitive substring match) and their coordinate space is set to
+    ``ScaledCoordinateSpace(width=1000, height=1000)``.
+    Pass ``coordinate_space`` explicitly to override auto-detection.
+
     Args:
         model_id (str, optional): Ollama model to use. Defaults to
             ``"qwen3.5"``.
         base_url (str, optional): Base URL for the Ollama OpenAI-compatible
             API. Defaults to ``"http://localhost:11434/v1"``.
         client (`OpenAI` | None, optional): Pre-configured OpenAI client.
             If provided, ``base_url`` is ignored.
+        coordinate_space (VlmCoordinateSpace | None, optional): The coordinate
+            grid the model emits coordinates in.  ``None`` (the default)
+            enables auto-detection based on ``model_id``.
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, inherits from `OpenAIVlmProvider`.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Inherits the default from
+            `OpenAIVlmProvider` (2048).
 
     Example:
         ```python
@@ -40,10 +65,31 @@ def __init__(
         model_id: str = _DEFAULT_MODEL_ID,
         base_url: str = _DEFAULT_BASE_URL,
         client: OpenAI | None = None,
+        coordinate_space: VlmCoordinateSpace | None = None,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
     ) -> None:
+        self._coordinate_space_override = coordinate_space
         super().__init__(
             model_id=model_id,
             api_key="ollama",  # Ollama requires no auth; OpenAI SDK needs a value
             base_url=base_url,
             client=client,
+            coordinate_space=coordinate_space or PixelCoordinateSpace(),
+            image_scaler=image_scaler,
+            max_image_edge=max_image_edge,
         )
+
+    @property
+    @override
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        if self._coordinate_space_override is not None:
+            return self._coordinate_space_override
+        model_lower = self._model_id_value.lower()
+        if "qwen" in model_lower:
+            return _QWEN_COORDINATE_SPACE
+        if "holo" in model_lower:
+            return _HOLO_COORDINATE_SPACE
+        if "kimi" in model_lower:
+            return _KIMI_COORDINATE_SPACE
+        return self._coordinate_space
diff --git a/src/askui/model_providers/openai_compatible_vlm_provider.py b/src/askui/model_providers/openai_compatible_vlm_provider.py
@@ -4,6 +4,13 @@
 from openai import OpenAI
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
+from askui.models.shared.coordinate_space import (
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
+from askui.models.shared.image_scaler import ImageScaler
+
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
 
 
 class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
@@ -20,6 +27,15 @@ class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
             (e.g. ``"https://my-host/v1/chat/completions"``).
         model_id (str): Model name expected by the deployment.
         api_key (str | None, optional): API key for the endpoint.
+        coordinate_space (`VlmCoordinateSpace`, optional): The coordinate
+            grid the model emits coordinates in.  Defaults to
+            ``PixelCoordinateSpace()`` (pixel coordinates).
+        image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
+            callable. If ``None``, inherits from `OpenAIVlmProvider`.
+        max_image_edge (int | None, optional): Maximum edge length (in pixels)
+            for screenshots sent to the model.  Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
+            from the environment if not provided.  Inherits the default from
+            `OpenAIVlmProvider` (2048).
 
     Example:
         ```python
@@ -41,6 +57,9 @@ def __init__(
         endpoint_url: str,
         model_id: str | None = None,
         api_key: str | None = None,
+        coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
+        image_scaler: ImageScaler | None = None,
+        max_image_edge: int | None = None,
     ) -> None:
         def _rewrite_url(request: httpx.Request) -> None:
             request.url = httpx.URL(endpoint_url)
@@ -56,4 +75,7 @@ def _rewrite_url(request: httpx.Request) -> None:
         super().__init__(
             model_id=model_id,
             client=client,
+            coordinate_space=coordinate_space,
+            image_scaler=image_scaler,
+            max_image_edge=max_image_edge,
         )