diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 98b79143..29b96a15 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -87,7 +87,6 @@ def __init__( ) -> None: reporter = CompositeReporter(reporters=reporters) self.os = PpadbAgentOs(device_identifier=device, reporter=reporter) - self.act_agent_os_facade = AndroidAgentOsFacade(self.os) super().__init__( reporter=reporter, retry=retry, @@ -97,6 +96,11 @@ def __init__( callbacks=callbacks, truncation_strategy=truncation_strategy, ) + self.act_agent_os_facade = AndroidAgentOsFacade( + self.os, + coordinate_space=self._vlm_provider.coordinate_space, + image_scaler=self._vlm_provider.image_scaler, + ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with Android-specific settings self.act_settings = ActSettings( diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py index ad0a6627..7f121dbf 100644 --- a/src/askui/computer_agent.py +++ b/src/askui/computer_agent.py @@ -130,7 +130,9 @@ def __init__( truncation_strategy=truncation_strategy, ) self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade( - self.tools.os + self.tools.os, + coordinate_space=self._vlm_provider.coordinate_space, + image_scaler=self._vlm_provider.image_scaler, ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with computer-specific settings diff --git a/src/askui/model_providers/__init__.py b/src/askui/model_providers/__init__.py index ae1f0d0d..9424577d 100644 --- a/src/askui/model_providers/__init__.py +++ b/src/askui/model_providers/__init__.py @@ -35,6 +35,13 @@ from askui.model_providers.openai_image_qa_provider import OpenAIImageQAProvider from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider from askui.model_providers.vlm_provider import VlmProvider +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, 
+ ScaledCoordinateSpace, + VlmCoordinateSpace, +) +from askui.models.shared.image_scaler import ImageScaler from askui.utils.model_pricing import ModelPricing __all__ = [ @@ -46,11 +53,16 @@ "DetectionProvider", "GoogleImageQAProvider", "ImageQAProvider", + "ImageScaler", "ModelPricing", + "NormalizedCoordinateSpace", "OllamaImageQAProvider", "OllamaVlmProvider", + "OpenAICompatibleVlmProvider", "OpenAIImageQAProvider", "OpenAIVlmProvider", - "OpenAICompatibleVlmProvider", + "PixelCoordinateSpace", + "ScaledCoordinateSpace", + "VlmCoordinateSpace", "VlmProvider", ] diff --git a/src/askui/model_providers/anthropic_vlm_provider.py b/src/askui/model_providers/anthropic_vlm_provider.py index 9edd42b9..80168aa3 100644 --- a/src/askui/model_providers/anthropic_vlm_provider.py +++ b/src/askui/model_providers/anthropic_vlm_provider.py @@ -14,11 +14,14 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_patch_optimized_image from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "claude-sonnet-4-6" +_DEFAULT_MAX_IMAGE_EDGE = 1024 class AnthropicVlmProvider(VlmProvider): @@ -46,6 +49,11 @@ class AnthropicVlmProvider(VlmProvider): cost in USD per 1M output tokens. cache_write_cost_per_million_tokens (float | None, optional): Override cost in USD per 1M cache write input tokens. + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, uses Anthropic-optimized patch-based scaling. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Defaults to 1024. cache_read_cost_per_million_tokens (float | None, optional): Override cost in USD per 1M cache read input tokens. 
@@ -70,6 +78,8 @@ def __init__( auth_token: str | None = None, model_id: str | None = None, client: Anthropic | None = None, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, input_cost_per_million_tokens: float | None = None, output_cost_per_million_tokens: float | None = None, cache_write_cost_per_million_tokens: float | None = None, @@ -78,6 +88,12 @@ def __init__( self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) + self._image_scaler_override = image_scaler + self._max_edge = ( + max_image_edge + or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0")) + or _DEFAULT_MAX_IMAGE_EDGE + ) if client is not None: self.client = client else: @@ -104,6 +120,14 @@ def model_id(self) -> str: def pricing(self) -> ModelPricing | None: return self._pricing + @property + @override + def image_scaler(self) -> ImageScaler: + if self._image_scaler_override is not None: + return self._image_scaler_override + max_edge = self._max_edge + return lambda image: compute_patch_optimized_image(image, max_edge=max_edge) + @cached_property def _messages_api(self) -> AnthropicMessagesApi: """Lazily initialise the AnthropicMessagesApi on first use.""" diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py index d149deff..615a1af8 100644 --- a/src/askui/model_providers/askui_vlm_provider.py +++ b/src/askui/model_providers/askui_vlm_provider.py @@ -15,10 +15,13 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_patch_optimized_image _DEFAULT_MODEL_ID = "claude-sonnet-4-6" +_DEFAULT_MAX_IMAGE_EDGE = 1024 class AskUIVlmProvider(VlmProvider): @@ -29,14 +32,19 @@ class AskUIVlmProvider(VlmProvider): on the first API call, not at construction time. 
Args: - workspace_id (str | None, optional): AskUI workspace ID. Reads - `ASKUI_WORKSPACE_ID` from the environment if not provided. - token (str | None, optional): AskUI API token. Reads `ASKUI_TOKEN` - from the environment if not provided. - model_id (str, optional): Claude model to use. Defaults to - `"claude-sonnet-4-6"`. - client (Anthropic | None, optional): Pre-configured Anthropic client. - If provided, `workspace_id` and `token` are ignored. + askui_settings (`AskUiInferenceApiSettings` | None, optional): + Connection settings (workspace ID, token, base URL). Reads + from environment variables if not provided. + model_id (str | None, optional): Claude model to use. Defaults to + ``"claude-sonnet-4-6"``. + client (`Anthropic` | None, optional): Pre-configured Anthropic client. + If provided, ``askui_settings`` is only used for the base URL. + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, uses Anthropic-optimized patch-based scaling. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Defaults to 1024. 
+ Example: ```python from askui import AgentSettings, ComputerAgent @@ -44,8 +52,6 @@ class AskUIVlmProvider(VlmProvider): agent = ComputerAgent(settings=AgentSettings( vlm_provider=AskUIVlmProvider( - workspace_id="my-workspace", - token="my-token", model_id="claude-opus-4-6-20260401", ) )) @@ -57,18 +63,34 @@ def __init__( askui_settings: AskUiInferenceApiSettings | None = None, model_id: str | None = None, client: Anthropic | None = None, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, ) -> None: self._askui_settings = askui_settings or AskUiInferenceApiSettings() self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) self._injected_client = client + self._image_scaler_override = image_scaler + self._max_edge = ( + max_image_edge + or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0")) + or _DEFAULT_MAX_IMAGE_EDGE + ) @property @override def model_id(self) -> str: return self._model_id_value + @property + @override + def image_scaler(self) -> ImageScaler: + if self._image_scaler_override is not None: + return self._image_scaler_override + max_edge = self._max_edge + return lambda image: compute_patch_optimized_image(image, max_edge=max_edge) + @cached_property def _messages_api(self) -> AnthropicMessagesApi: """Lazily initialise the AnthropicMessagesApi on first use.""" diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py index e06fa408..c313983f 100644 --- a/src/askui/model_providers/ollama_vlm_provider.py +++ b/src/askui/model_providers/ollama_vlm_provider.py @@ -1,12 +1,23 @@ """OllamaVlmProvider — VLM access via a local Ollama instance.""" from openai import OpenAI +from typing_extensions import override from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) +from 
askui.models.shared.image_scaler import ImageScaler _DEFAULT_BASE_URL = "http://localhost:11434/v1" _DEFAULT_MODEL_ID = "qwen3.5" +_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) +_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) +_KIMI_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) + class OllamaVlmProvider(OpenAIVlmProvider): """VLM provider that routes requests to a local Ollama instance. @@ -14,6 +25,11 @@ class OllamaVlmProvider(OpenAIVlmProvider): Thin convenience wrapper around `OpenAIVlmProvider` with Ollama defaults (``base_url``, ``api_key``, ``model_id``). + Qwen and Holo models are automatically detected and their coordinate + space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``. + Kimi models are also detected and use the same scaled 1000x1000 grid. + Pass ``coordinate_space`` explicitly to override auto-detection. + Args: model_id (str, optional): Ollama model to use. Defaults to ``"qwen3.5"``. @@ -21,6 +37,15 @@ class OllamaVlmProvider(OpenAIVlmProvider): API. Defaults to ``"http://localhost:11434/v1"``. client (`OpenAI` | None, optional): Pre-configured OpenAI client. If provided, ``base_url`` is ignored. + coordinate_space (VlmCoordinateSpace | None, optional): The coordinate + grid the model emits coordinates in. ``None`` (the default) + enables auto-detection based on ``model_id``. + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, inherits from `OpenAIVlmProvider`. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Inherits the default from + `OpenAIVlmProvider` (1024). 
Example: ```python @@ -40,10 +65,31 @@ def __init__( model_id: str = _DEFAULT_MODEL_ID, base_url: str = _DEFAULT_BASE_URL, client: OpenAI | None = None, + coordinate_space: VlmCoordinateSpace | None = None, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, ) -> None: + self._coordinate_space_override = coordinate_space super().__init__( model_id=model_id, api_key="ollama", # Ollama requires no auth; OpenAI SDK needs a value base_url=base_url, client=client, + coordinate_space=coordinate_space or PixelCoordinateSpace(), + image_scaler=image_scaler, + max_image_edge=max_image_edge, ) + + @property + @override + def coordinate_space(self) -> VlmCoordinateSpace: + if self._coordinate_space_override is not None: + return self._coordinate_space_override + model_lower = self._model_id_value.lower() + if "qwen" in model_lower: + return _QWEN_COORDINATE_SPACE + if "holo" in model_lower: + return _HOLO_COORDINATE_SPACE + if "kimi" in model_lower: + return _KIMI_COORDINATE_SPACE + return self._coordinate_space diff --git a/src/askui/model_providers/openai_compatible_vlm_provider.py b/src/askui/model_providers/openai_compatible_vlm_provider.py index aae55c11..98b50627 100644 --- a/src/askui/model_providers/openai_compatible_vlm_provider.py +++ b/src/askui/model_providers/openai_compatible_vlm_provider.py @@ -4,6 +4,13 @@ from openai import OpenAI from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + VlmCoordinateSpace, +) +from askui.models.shared.image_scaler import ImageScaler + +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() class OpenAICompatibleVlmProvider(OpenAIVlmProvider): @@ -20,6 +27,15 @@ class OpenAICompatibleVlmProvider(OpenAIVlmProvider): (e.g. ``"https://my-host/v1/chat/completions"``). model_id (str): Model name expected by the deployment. api_key (str | None, optional): API key for the endpoint. 
+ coordinate_space (`VlmCoordinateSpace`, optional): The coordinate + grid the model emits coordinates in. Defaults to + `PixelCoordinateSpace()` (native pixel coordinates). + image_scaler (`ImageScaler` | None, optional): Custom image preprocessing + callable. If ``None``, inherits from `OpenAIVlmProvider`. + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Inherits the default from + `OpenAIVlmProvider` (1024). Example: ```python @@ -41,6 +57,9 @@ def __init__( endpoint_url: str, model_id: str | None = None, api_key: str | None = None, + coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, ) -> None: def _rewrite_url(request: httpx.Request) -> None: request.url = httpx.URL(endpoint_url) @@ -56,4 +75,7 @@ def _rewrite_url(request: httpx.Request) -> None: super().__init__( model_id=model_id, client=client, + coordinate_space=coordinate_space, + image_scaler=image_scaler, + max_image_edge=max_image_edge, ) diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py index 47475cc7..e8d6b8f1 100644 --- a/src/askui/model_providers/openai_vlm_provider.py +++ b/src/askui/model_providers/openai_vlm_provider.py @@ -14,11 +14,19 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + VlmCoordinateSpace, +) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_patch_optimized_image from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "gpt-5.4" +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() +_DEFAULT_MAX_IMAGE_EDGE = 1024 class 
OpenAIVlmProvider(VlmProvider): @@ -36,6 +44,12 @@ class OpenAIVlmProvider(VlmProvider): to the OpenAI API (``https://api.openai.com/v1``). client (`OpenAI` | None, optional): Pre-configured OpenAI client. If provided, ``api_key`` and ``base_url`` are ignored. + coordinate_space (VlmCoordinateSpace, optional): The coordinate grid + the model emits coordinates in. Defaults to the screenshot + resolution (native pixel coordinates). + max_image_edge (int | None, optional): Maximum edge length (in pixels) + for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` + from the environment if not provided. Defaults to 1024. Example: ```python @@ -57,6 +71,9 @@ def __init__( api_key: str | None = None, base_url: str | None = None, client: OpenAI | None = None, + coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE, + image_scaler: ImageScaler | None = None, + max_image_edge: int | None = None, input_cost_per_million_tokens: float | None = None, output_cost_per_million_tokens: float | None = None, cache_write_cost_per_million_tokens: float | None = None, @@ -65,6 +82,13 @@ def __init__( self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) + self._coordinate_space = coordinate_space + self._image_scaler_override = image_scaler + self._max_edge = ( + max_image_edge + or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0")) + or _DEFAULT_MAX_IMAGE_EDGE + ) if client is not None: self._client = client else: @@ -86,16 +110,37 @@ def __init__( def model_id(self) -> str: return self._model_id_value + @property + @override + def coordinate_space(self) -> VlmCoordinateSpace: + return self._coordinate_space + @property @override def pricing(self) -> ModelPricing | None: return self._pricing + @property + @override + def image_scaler(self) -> ImageScaler: + if self._image_scaler_override is not None: + return self._image_scaler_override + max_edge = self._max_edge + return lambda image: 
compute_patch_optimized_image( + image, max_edge=max_edge, max_tokens=1536, patch_size=32 + ) + @cached_property def _messages_api(self) -> OpenAIMessagesApi: """Lazily initialise the `OpenAIMessagesApi` on first use.""" return OpenAIMessagesApi(client=self._client) + @override + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: + """Append coordinate and resolution info to the system prompt.""" + coord_info = self.coordinate_space.build_prompt_section() + return SystemPrompt(prompt=f"{str(system)}\n\n{coord_info}") + @override def create_message( self, @@ -108,6 +153,8 @@ def create_message( temperature: float | None = None, provider_options: dict[str, Any] | None = None, ) -> MessageParam: + if system is not None: + system = self.augment_system_prompt(system) return self._messages_api.create_message( messages=messages, model_id=self._model_id_value, diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py index 1e98b972..5cea3284 100644 --- a/src/askui/model_providers/vlm_provider.py +++ b/src/askui/model_providers/vlm_provider.py @@ -3,15 +3,30 @@ from abc import ABC, abstractmethod from typing import Any +from PIL import Image + from askui.models.shared.agent_message_param import ( MessageParam, ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + VlmCoordinateSpace, +) +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.llm_image_utils import compute_contained_size, resize_image from askui.utils.model_pricing import ModelPricing +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() + + +def _default_image_scaler(image: Image.Image) -> Image.Image: + target = compute_contained_size(image.width, image.height) + return resize_image(image, target) + class VlmProvider(ABC): """Interface for Vision Language Model 
providers. @@ -44,6 +59,17 @@ class VlmProvider(ABC): def model_id(self) -> str: """The model identifier used by this provider.""" + @property + def coordinate_space(self) -> VlmCoordinateSpace: + """The coordinate space this model emits coordinates in. + + Returns a `VlmCoordinateSpace` describing the grid the model uses. + The default is `PixelCoordinateSpace` (native pixel coordinates). + Override in subclasses when the model uses a different grid + (e.g. ``ScaledCoordinateSpace(1000, 1000)`` for Qwen). + """ + return _DEFAULT_COORDINATE_SPACE + @property def pricing(self) -> ModelPricing | None: """Pricing information for this provider's model. @@ -53,6 +79,28 @@ def pricing(self) -> ModelPricing | None: """ return None + @property + def image_scaler(self) -> ImageScaler: + """Callable that preprocesses a screenshot before sending to the model. + + Override in subclasses for provider-specific sizing. + """ + return _default_image_scaler + + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: + """Hook for providers to augment the system prompt before sending. + + Called by ``create_message()`` implementations. The base + implementation returns the prompt unchanged. Override in + subclasses that need to inject provider-specific information + (e.g. coordinate bounds for non-Anthropic models). + + The original ``SystemPrompt`` object is **not** mutated — + implementations should create a new ``SystemPrompt`` wrapping + the augmented text. 
+ """ + return system + @abstractmethod def create_message( self, diff --git a/src/askui/models/anthropic/get_model.py b/src/askui/models/anthropic/get_model.py index 7bed5627..421126e6 100644 --- a/src/askui/models/anthropic/get_model.py +++ b/src/askui/models/anthropic/get_model.py @@ -20,7 +20,7 @@ from askui.models.types.response_schemas import ResponseSchema from askui.prompts.get_prompts import SYSTEM_PROMPT_GET from askui.utils.excel_utils import OfficeDocumentSource -from askui.utils.image_utils import scale_image_to_fit +from askui.utils.llm_image_utils import compute_contained_size, resize_image from askui.utils.pdf_utils import PdfSource from askui.utils.source_utils import Source @@ -78,10 +78,13 @@ def get( if response_schema is not None: error_msg = "Response schema is not yet supported for Anthropic" raise NotImplementedError(error_msg) - scaled_image = scale_image_to_fit( - source.root, - get_settings.resolution, + target_size = compute_contained_size( + source.root.width, + source.root.height, + get_settings.resolution.width, + get_settings.resolution.height, ) + scaled_image = resize_image(source.root, target_size) messages = built_messages_for_get_and_locate(scaled_image, query) message = self._messages_api.create_message( messages=messages, diff --git a/src/askui/models/askui/locate_models/anthropic_locate_model.py b/src/askui/models/askui/locate_models/anthropic_locate_model.py index d2b78c27..3856b8f7 100644 --- a/src/askui/models/askui/locate_models/anthropic_locate_model.py +++ b/src/askui/models/askui/locate_models/anthropic_locate_model.py @@ -20,8 +20,8 @@ from askui.utils.image_utils import ( ImageSource, scale_coordinates, - scale_image_to_fit, ) +from askui.utils.llm_image_utils import compute_contained_size, resize_image class AnthropicLocateModel(LocateModel): @@ -77,14 +77,17 @@ def locate( try: prompt = f"Click on {locator_serialized}" resolution = locate_settings.resolution - screen_width = resolution.width - screen_height = 
resolution.height - scaled_image = scale_image_to_fit( - image.root, - resolution, + target_size = compute_contained_size( + image.root.width, + image.root.height, + resolution.width, + resolution.height, ) + scaled_image = resize_image(image.root, target_size) messages = built_messages_for_get_and_locate(scaled_image, prompt) - system = build_system_prompt_locate(str(screen_width), str(screen_height)) + system = build_system_prompt_locate( + str(scaled_image.width), str(scaled_image.height) + ) message = self._messages_api.create_message( messages=messages, model_id=self._model_id, @@ -100,7 +103,7 @@ def locate( scale_coordinates( extract_click_coordinates(content_text.text), image.root.size, - resolution, + scaled_image.size, inverse=True, ) ] diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py index 4df27a7b..84bd3d22 100644 --- a/src/askui/models/shared/__init__.py +++ b/src/askui/models/shared/__init__.py @@ -1,5 +1,12 @@ from .android_base_tool import AndroidBaseTool from .computer_base_tool import ComputerBaseTool +from .coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) +from .image_scaler import ImageScaler from .tool_tags import ToolTags try: @@ -13,7 +20,12 @@ __all__ = [ "AndroidBaseTool", "ComputerBaseTool", + "ImageScaler", + "NormalizedCoordinateSpace", + "PixelCoordinateSpace", + "ScaledCoordinateSpace", "ToolTags", + "VlmCoordinateSpace", ] if _PLAYWRIGHT_AVAILABLE: diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py new file mode 100644 index 00000000..c2cd7c71 --- /dev/null +++ b/src/askui/models/shared/coordinate_space.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from pydantic import BaseModel, Field + + +def _common_prompt_lines() -> list[str]: + return ["* Coordinate origin is the top-left corner (0, 0)"] + + +class 
VlmCoordinateSpace(BaseModel, ABC): + """Abstract base for VLM coordinate conventions. + + Each subclass describes one coordinate grid a VLM may emit and knows + how to map those coordinates back to pixel space and how to render + the matching prompt section. + """ + + @property + def maps_to_screenshot_pixels(self) -> bool: + """Whether model coordinates are absolute pixels in the screenshot image. + + When ``True``, coordinates need padding-aware inverse scaling + (screenshot space to device space). When ``False``, coordinates + are in a normalised grid and map directly to device resolution. + """ + return False + + @abstractmethod + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + """Map model coordinates to pixel coordinates in *target_resolution*.""" + + @abstractmethod + def build_prompt_section(self) -> str: + """Build prompt text describing coordinate bounds for the model.""" + + +class PixelCoordinateSpace(VlmCoordinateSpace): + """Identity mapping -- coordinates already in pixel space. + + Used by Anthropic/Claude which emit coordinates matching the + screenshot resolution. + """ + + @property + def maps_to_screenshot_pixels(self) -> bool: + return True + + def map_to_target( + self, + x: float, + y: float, + target_resolution: tuple[int, int], # noqa: ARG002 + ) -> tuple[int, int]: + return int(x), int(y) + + def build_prompt_section(self) -> str: + lines = _common_prompt_lines() + lines.append( + "* Coordinates are in pixel space matching the screenshot dimensions" + ) + return "\n".join(lines) + + +class ScaledCoordinateSpace(VlmCoordinateSpace): + """Integer grid (e.g. 1000x1000 for Qwen). 
Linear scaling.""" + + width: int = Field(gt=0, description="Width of the coordinate grid") + height: int = Field(gt=0, description="Height of the coordinate grid") + + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + tw, th = target_resolution + return int(x * tw / self.width), int(y * th / self.height) + + def build_prompt_section(self) -> str: + lines = _common_prompt_lines() + lines.append( + f"* Emit coordinates in a {self.width}x{self.height} " + f"normalised grid: 0 <= x < {self.width}, " + f"0 <= y < {self.height}" + ) + return "\n".join(lines) + + +class NormalizedCoordinateSpace(VlmCoordinateSpace): + """0.0-1.0 float grid (Kimi). No fields.""" + + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + tw, th = target_resolution + return int(x * tw), int(y * th) + + def build_prompt_section(self) -> str: + lines = _common_prompt_lines() + lines.append( + "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0" + ) + return "\n".join(lines) diff --git a/src/askui/models/shared/image_scaler.py b/src/askui/models/shared/image_scaler.py new file mode 100644 index 00000000..3c579e81 --- /dev/null +++ b/src/askui/models/shared/image_scaler.py @@ -0,0 +1,8 @@ +"""Type alias for image scaling callables used by VLM providers.""" + +from collections.abc import Callable + +from PIL import Image + +ImageScaler = Callable[[Image.Image], Image.Image] +"""Callable that preprocesses a screenshot before sending to a model.""" diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py index f27d0eee..9fcfa25f 100644 --- a/src/askui/tools/android/agent_os_facade.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -1,74 +1,76 @@ -from typing import List, Optional, Tuple +from __future__ import annotations -from PIL import Image +from typing import TYPE_CHECKING from askui.models.shared.tool_tags import 
ToolTags from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay -from askui.tools.android.uiautomator_hierarchy import UIElementCollection -from askui.utils.image_utils import scale_coordinates, scale_image_to_fit +from askui.tools.coordinate_scaling_mixin import CoordinateScaler + +if TYPE_CHECKING: + from PIL import Image + + from askui.models.shared.coordinate_space import VlmCoordinateSpace + from askui.models.shared.image_scaler import ImageScaler + from askui.tools.android.uiautomator_hierarchy import UIElementCollection class AndroidAgentOsFacade(AndroidAgentOs): - """ - Facade for AndroidAgentOs that adds coordinate scaling functionality. - It is used to scale the coordinates to the target resolution - and back to the real screen resolution. + """Facade for `AndroidAgentOs` that adds coordinate scaling. + + Screenshots are scaled using the provider's image scaler so that the + AI model sees an optimally sized image. Coordinate-based inputs + (``tap``, ``swipe``, ``drag_and_drop``) are scaled back up to the + real device resolution before being forwarded to the underlying agent OS. + + Args: + agent_os (`AndroidAgentOs`): The real Android agent OS to wrap. + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. 
""" - def __init__(self, agent_os: AndroidAgentOs) -> None: + def __init__( + self, + agent_os: AndroidAgentOs, + coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, + ) -> None: self._agent_os: AndroidAgentOs = agent_os - self._target_resolution: Tuple[int, int] = (1024, 768) - self._real_screen_resolution: Optional[Tuple[int, int]] = None + self._scaler = CoordinateScaler( + coordinate_space=coordinate_space, + image_scaler=image_scaler, + fetch_real_resolution=lambda: self._agent_os.screenshot().size, + take_screenshot=lambda: self.screenshot(), + ) self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] def connect(self) -> None: self._agent_os.connect() - self._real_screen_resolution = self._agent_os.screenshot().size + self._scaler.real_screen_resolution = self._agent_os.screenshot().size def disconnect(self) -> None: self._agent_os.disconnect() - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def screenshot(self) -> Image.Image: screenshot = self._agent_os.screenshot() - self._real_screen_resolution = screenshot.size - return scale_image_to_fit( - screenshot, - self._target_resolution, - ) - - def _scale_coordinates( - self, - x: int, - y: int, - from_agent: bool = True, - ) -> Tuple[int, int]: - if self._real_screen_resolution is None: - self._real_screen_resolution = self._agent_os.screenshot().size - - return scale_coordinates( - (x, y), - self._real_screen_resolution, - self._target_resolution, - inverse=from_agent, - ) + return self._scaler.scale_screenshot(screenshot) - def tap(self, x: int, y: int) -> None: - x, y = self._scale_coordinates(x, y) + def tap(self, x: float, y: float) -> None: + x, y = self._scaler.scale_coordinates(x, y) self._agent_os.tap(x, y) def swipe( - self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000 + self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: - x1, y1 = self._scale_coordinates(x1, y1) - x2, y2 = 
self._scale_coordinates(x2, y2) + x1, y1 = self._scaler.scale_coordinates(x1, y1) + x2, y2 = self._scaler.scale_coordinates(x2, y2) self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms) def drag_and_drop( - self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000 + self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: - x1, y1 = self._scale_coordinates(x1, y1) - x2, y2 = self._scale_coordinates(x2, y2) + x1, y1 = self._scaler.scale_coordinates(x1, y1) + x2, y2 = self._scaler.scale_coordinates(x2, y2) self._agent_os.drag_and_drop(x1, y1, x2, y2, duration_in_ms) def type(self, text: str) -> None: @@ -78,7 +80,7 @@ def key_tap(self, key: ANDROID_KEY) -> None: self._agent_os.key_tap(key) def key_combination( - self, keys: List[ANDROID_KEY], duration_in_ms: int = 100 + self, keys: list[ANDROID_KEY], duration_in_ms: int = 100 ) -> None: self._agent_os.key_combination(keys, duration_in_ms) @@ -90,27 +92,27 @@ def get_connected_displays(self) -> list[AndroidDisplay]: def set_display_by_index(self, display_index: int = 0) -> None: self._agent_os.set_display_by_index(display_index) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_display_by_unique_id(self, display_unique_id: int) -> None: self._agent_os.set_display_by_unique_id(display_unique_id) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_display_by_id(self, display_id: int) -> None: self._agent_os.set_display_by_id(display_id) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_display_by_name(self, display_name: str) -> None: self._agent_os.set_display_by_name(display_name) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def set_device_by_index(self, device_index: int = 0) -> None: self._agent_os.set_device_by_index(device_index) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def 
set_device_by_serial_number(self, device_sn: str) -> None: self._agent_os.set_device_by_serial_number(device_sn) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def get_connected_devices_serial_numbers(self) -> list[str]: return self._agent_os.get_connected_devices_serial_numbers() @@ -134,7 +136,7 @@ def get_ui_elements(self) -> UIElementCollection: if element.center is None: continue element.set_center( - self._scale_coordinates( + self._scaler.scale_coordinates( x=element.center[0], y=element.center[1], from_agent=False, diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 28a1a8c5..848f6a10 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -2,12 +2,13 @@ from PIL import Image +from askui.models.shared.coordinate_space import VlmCoordinateSpace +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import ( AgentOs, Coordinate, Display, - DisplaySize, DisplaysListResponse, InputEvent, ModifierKey, @@ -15,7 +16,7 @@ PcKey, ) from askui.tools.askui.askui_controller import RenderObjectStyle # noqa: TC001 -from askui.utils.image_utils import scale_coordinates, scale_image_to_fit +from askui.tools.coordinate_scaling_mixin import CoordinateScaler if TYPE_CHECKING: from askui.tools.askui.askui_ui_controller_grpc.generated import ( @@ -29,47 +30,63 @@ class ComputerAgentOsFacade(AgentOs): - """ - Facade for AgentOs that adds coordinate scaling functionality. + """Facade for `AgentOs` that adds coordinate scaling. + + Screenshots are scaled using the provider's image scaler so that the + AI model sees an optimally sized image. Coordinate-based inputs + are scaled back up to the real screen resolution before being forwarded + to the underlying agent OS. 
- This class is used to scale the coordinates to the target resolution - and back to the real screen resolution. + Args: + agent_os (`AgentOs`): The real agent OS to wrap. + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. """ - def __init__(self, agent_os: AgentOs) -> None: + def __init__( + self, + agent_os: AgentOs, + coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, + ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = (1024, 768) - self._real_screen_resolution: DisplaySize | None = None + self._scaler = CoordinateScaler( + coordinate_space=coordinate_space, + image_scaler=image_scaler, + fetch_real_resolution=self._fetch_real_screen_resolution, + take_screenshot=lambda: self.screenshot(report=False), + ) self.tags.append(ToolTags.SCALED_AGENT_OS.value) def connect(self) -> None: self._agent_os.connect() - self._real_screen_resolution = self._agent_os.retrieve_active_display().size + self._scaler.real_screen_resolution = self._fetch_real_screen_resolution() def disconnect(self) -> None: self._agent_os.disconnect() - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def screenshot(self, report: bool = True) -> Image.Image: screenshot = self._agent_os.screenshot(report=report) - self._real_screen_resolution = DisplaySize( - width=screenshot.width, height=screenshot.height - ) - return scale_image_to_fit(screenshot, self._target_resolution) + return self._scaler.scale_screenshot(screenshot) + + def _fetch_real_screen_resolution(self) -> tuple[int, int]: + display = self._agent_os.retrieve_active_display() + return display.size.width, display.size.height - def mouse_move(self, x: int, y: int, duration: int = 500) -> None: - scaled_x, scaled_y = self._scale_coordinates_back(x, y) + def mouse_move(self, x: float, y: float, duration: int = 500) -> None: + scaled_x, scaled_y = 
self._scaler.scale_coordinates(x, y) self._agent_os.mouse_move(scaled_x, scaled_y, duration) def get_mouse_position(self) -> Coordinate: mouse_position = self._agent_os.get_mouse_position() - scaled_x, scaled_y = self._scale_coordinates_back( + scaled_x, scaled_y = self._scaler.scale_coordinates( mouse_position.x, mouse_position.y, from_agent=False ) return Coordinate(x=scaled_x, y=scaled_y) - def set_mouse_position(self, x: int, y: int) -> None: - scaled_x, scaled_y = self._scale_coordinates_back(x, y) + def set_mouse_position(self, x: float, y: float) -> None: + scaled_x, scaled_y = self._scaler.scale_coordinates(x, y) self._agent_os.set_mouse_position(scaled_x, scaled_y) def type(self, text: str, typing_speed: int = 50) -> None: @@ -113,7 +130,7 @@ def retrieve_active_display(self) -> Display: def set_display(self, display: int = 1) -> None: self._agent_os.set_display(display) - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def run_command(self, command: str, timeout_ms: int = 30000) -> None: self._agent_os.run_command(command, timeout_ms) @@ -290,7 +307,7 @@ def get_file(self, path: str) -> Image.Image | str: """ response = self._agent_os.get_file(path) if isinstance(response, Image.Image): - return scale_image_to_fit(response, self._target_resolution) + return self._scaler.scale_screenshot(response) return response def remove_virtual_displays(self) -> None: @@ -298,21 +315,4 @@ def remove_virtual_displays(self) -> None: Remove virtual displays from the controller, leaving real displays only. 
""" self._agent_os.remove_virtual_displays() - self._real_screen_resolution = None - - def _scale_coordinates_back( - self, - x: int, - y: int, - from_agent: bool = True, - check_coordinates_in_bounds: bool = True, - ) -> tuple[int, int]: - if self._real_screen_resolution is None: - self._real_screen_resolution = self._agent_os.retrieve_active_display().size - return scale_coordinates( - (x, y), - (self._real_screen_resolution.width, self._real_screen_resolution.height), - self._target_resolution, - inverse=from_agent, - check_coordinates_in_bounds=check_coordinates_in_bounds, - ) + self._scaler.real_screen_resolution = None diff --git a/src/askui/tools/coordinate_scaling_mixin.py b/src/askui/tools/coordinate_scaling_mixin.py new file mode 100644 index 00000000..39d12194 --- /dev/null +++ b/src/askui/tools/coordinate_scaling_mixin.py @@ -0,0 +1,99 @@ +"""Coordinate scaling helper used by all agent OS facades.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from askui.utils.image_utils import scale_coordinates + +if TYPE_CHECKING: + from collections.abc import Callable + + from PIL import Image + + from askui.models.shared.coordinate_space import VlmCoordinateSpace + from askui.models.shared.image_scaler import ImageScaler + + +class CoordinateScaler: + """Maps coordinates between model space and device space. + + Each agent OS facade owns an instance and delegates scaling to it. + + Args: + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. + fetch_real_resolution (`Callable`): Callback that returns the real + ``(width, height)`` of the screen/device when it is not yet known. + take_screenshot (`Callable`): Callback that triggers a screenshot + so that ``target_resolution`` can be populated. 
+ """ + + def __init__( + self, + coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, + fetch_real_resolution: Callable[[], tuple[int, int]], + take_screenshot: Callable[[], Image.Image], + ) -> None: + self._coordinate_space = coordinate_space + self._image_scaler = image_scaler + self._fetch_real_resolution = fetch_real_resolution + self._take_screenshot = take_screenshot + self.target_resolution: tuple[int, int] | None = None + self.real_screen_resolution: tuple[int, int] | None = None + + def scale_screenshot(self, screenshot: Image.Image) -> Image.Image: + """Record real resolution, apply scaler, record target resolution.""" + self.real_screen_resolution = screenshot.size + scaled = self._image_scaler(screenshot) + self.target_resolution = scaled.size + return scaled + + def scale_coordinates( + self, + x: float, + y: float, + from_agent: bool = True, + check_coordinates_in_bounds: bool = True, + ) -> tuple[int, int]: + """Map coordinates between model space and device space. + + When ``from_agent=True``, maps model-emitted coordinates to real + device pixels. When ``from_agent=False``, maps device coordinates + to model space (e.g. for reporting element positions back to the model). 
+ """ + if self.real_screen_resolution is None: + self.real_screen_resolution = self._fetch_real_resolution() + + target_resolution = self._ensure_target_resolution() + + if from_agent: + if self._coordinate_space.maps_to_screenshot_pixels: + mapped_x, mapped_y = self._coordinate_space.map_to_target( + x, y, target_resolution + ) + return scale_coordinates( + (mapped_x, mapped_y), + self.real_screen_resolution, + target_resolution, + inverse=True, + check_coordinates_in_bounds=check_coordinates_in_bounds, + ) + return self._coordinate_space.map_to_target( + x, y, self.real_screen_resolution + ) + + return scale_coordinates( + (int(x), int(y)), + self.real_screen_resolution, + target_resolution, + inverse=False, + check_coordinates_in_bounds=check_coordinates_in_bounds, + ) + + def _ensure_target_resolution(self) -> tuple[int, int]: + if self.target_resolution is None: + self._take_screenshot() + assert self.target_resolution is not None # noqa: S101 + return self.target_resolution diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py index 091ff804..62a77652 100644 --- a/src/askui/tools/playwright/agent_os_facade.py +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -2,65 +2,59 @@ from PIL import Image +from askui.models.shared.coordinate_space import VlmCoordinateSpace +from askui.models.shared.image_scaler import ImageScaler from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import Display, ModifierKey, PcKey +from askui.tools.coordinate_scaling_mixin import CoordinateScaler from askui.tools.playwright.agent_os import PlaywrightAgentOs -from askui.utils.image_utils import scale_coordinates, scale_image_to_fit class PlaywrightAgentOsFacade(PlaywrightAgentOs): """Facade for `PlaywrightAgentOs` that adds coordinate scaling. - Screenshots are scaled down to a fixed target resolution so that the - AI model always sees a consistent image size. 
Coordinate-based inputs + Screenshots are scaled using the provider's image scaler so that the + AI model sees an optimally sized image. Coordinate-based inputs (``mouse_move``) are scaled back up to the real page resolution before being forwarded to the underlying agent OS. Args: - agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap. + agent_os (`PlaywrightAgentOs`): The real Playwright agent OS to wrap. + coordinate_space (`VlmCoordinateSpace`): Coordinate grid the model uses. + image_scaler (`ImageScaler`): Callable to preprocess screenshots. """ - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__( + self, + agent_os: PlaywrightAgentOs, + coordinate_space: VlmCoordinateSpace, + image_scaler: ImageScaler, + ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = (1024, 768) - self._real_screen_resolution: tuple[int, int] | None = None + self._scaler = CoordinateScaler( + coordinate_space=coordinate_space, + image_scaler=image_scaler, + fetch_real_resolution=lambda: self._agent_os.screenshot(report=False).size, + take_screenshot=lambda: self.screenshot(report=False), + ) self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] def connect(self) -> None: self._agent_os.connect() - self._real_screen_resolution = self._agent_os.screenshot( - report=False, + self._scaler.real_screen_resolution = self._agent_os.screenshot( + report=False ).size def disconnect(self) -> None: self._agent_os.disconnect() - self._real_screen_resolution = None + self._scaler.real_screen_resolution = None def screenshot(self, report: bool = True) -> Image.Image: screenshot = self._agent_os.screenshot(report=report) - self._real_screen_resolution = screenshot.size - return scale_image_to_fit(screenshot, self._target_resolution) - - def _scale_coordinates( - self, - x: int, - y: int, - from_agent: bool = True, - ) -> tuple[int, int]: - if self._real_screen_resolution is None: - self._real_screen_resolution = 
self._agent_os.screenshot( - report=False, - ).size - return scale_coordinates( - (x, y), - self._real_screen_resolution, - self._target_resolution, - inverse=from_agent, - ) + return self._scaler.scale_screenshot(screenshot) - def mouse_move(self, x: int, y: int, duration: int = 500) -> None: - scaled_x, scaled_y = self._scale_coordinates(x, y) - # scaled_x, scaled_y = x, y + def mouse_move(self, x: float, y: float, duration: int = 500) -> None: + scaled_x, scaled_y = self._scaler.scale_coordinates(x, y) self._agent_os.mouse_move(scaled_x, scaled_y, duration) def type(self, text: str, typing_speed: int = 50) -> None: diff --git a/src/askui/tools/store/universal/load_image_tool.py b/src/askui/tools/store/universal/load_image_tool.py index 5a0512e9..b763f2ee 100644 --- a/src/askui/tools/store/universal/load_image_tool.py +++ b/src/askui/tools/store/universal/load_image_tool.py @@ -4,7 +4,7 @@ from PIL import Image from askui.models.shared.tools import Tool -from askui.utils.image_utils import scale_image_to_fit +from askui.utils.llm_image_utils import compute_contained_size, resize_image class LoadImageTool(Tool): @@ -116,7 +116,13 @@ def __call__(self, image_path: str = "") -> Tuple[str, Image.Image]: raise FileExistsError(error_msg) image = Image.open(absolute_image_path) - image = scale_image_to_fit(image, target_size=self._target_size) + target_size = compute_contained_size( + image.width, + image.height, + self._target_size[0], + self._target_size[1], + ) + image = resize_image(image, target_size) return ( f"Image was successfully loaded from {absolute_image_path}", diff --git a/src/askui/utils/llm_image_utils.py b/src/askui/utils/llm_image_utils.py new file mode 100644 index 00000000..c74e4f1e --- /dev/null +++ b/src/askui/utils/llm_image_utils.py @@ -0,0 +1,191 @@ +"""Image utilities for LLM vision model preprocessing. + +Functions for computing optimal image sizes based on patch-based token budgets +and resizing images for VLM consumption. 
+""" + +import logging +import math + +from PIL import Image + +logger = logging.getLogger(__name__) + + +def count_image_tokens(width: int, height: int, patch_size: int = 28) -> int: + """Count the number of tokens an image will consume in a patch-based VLM. + + Each non-overlapping ``patch_size x patch_size`` square maps to one token. + + Args: + width (int): Image width in pixels. + height (int): Image height in pixels. + patch_size (int): Side length of a single patch in pixels. + + Returns: + int: Number of image tokens. + """ + patches_w = math.ceil(width / patch_size) + patches_h = math.ceil(height / patch_size) + return patches_w * patches_h + + +def compute_patch_optimized_size( + width: int, + height: int, + max_edge: int = 1568, + max_tokens: int = 1568, + patch_size: int = 28, +) -> tuple[int, int]: + """Compute the largest aspect-preserving size within a patch-based token budget. + + Uses binary search to find the biggest scale factor such that: + - Neither dimension exceeds ``max_edge``. + - ``count_image_tokens(w, h, patch_size) <= max_tokens``. + + Args: + width (int): Original image width. + height (int): Original image height. + max_edge (int): Maximum allowed dimension (width or height). + max_tokens (int): Maximum allowed number of image tokens. + patch_size (int): Patch size used by the model. + + Returns: + tuple[int, int]: Target ``(width, height)``. 
+ """ + if width <= 0 or height <= 0: + error_msg = f"Image dimensions must be positive, got {width}x{height}" + raise ValueError(error_msg) + + # If already within all constraints, return as-is + if ( + width <= max_edge + and height <= max_edge + and count_image_tokens(width, height, patch_size) <= max_tokens + ): + return width, height + + # Clamp to max_edge first + scale = min(max_edge / width, max_edge / height, 1.0) + + # Binary search for largest scale that fits within token budget + lo, hi = 0.0, scale + for _ in range(50): + mid = (lo + hi) / 2 + w = max(1, int(width * mid)) + h = max(1, int(height * mid)) + if count_image_tokens(w, h, patch_size) <= max_tokens: + lo = mid + else: + hi = mid + + result_w = max(1, int(width * lo)) + result_h = max(1, int(height * lo)) + return result_w, result_h + + +def compute_contained_size( + width: int, + height: int, + max_width: int = 1024, + max_height: int = 768, +) -> tuple[int, int]: + """Compute the largest aspect-preserving size contained within max bounds. + + If the image already fits, returns its original dimensions. + + Args: + width (int): Original image width. + height (int): Original image height. + max_width (int): Maximum allowed width. + max_height (int): Maximum allowed height. + + Returns: + tuple[int, int]: Target ``(width, height)``. + """ + if width <= 0 or height <= 0: + error_msg = f"Image dimensions must be positive, got {width}x{height}" + raise ValueError(error_msg) + + if width <= max_width and height <= max_height: + return width, height + + scale = min(max_width / width, max_height / height) + return max(1, int(width * scale)), max(1, int(height * scale)) + + +def resize_image(image: Image.Image, target_size: tuple[int, int]) -> Image.Image: + """Resize an image to exact ``target_size`` using LANCZOS resampling. + + Logs a warning if the aspect ratio changes by more than 1%. + + Args: + image (Image.Image): Source image. + target_size (tuple[int, int]): Target ``(width, height)``. 
+ + Returns: + Image.Image: Resized image. + """ + if image.size == target_size: + return image + + src_ratio = image.width / image.height + dst_ratio = target_size[0] / target_size[1] + if abs(src_ratio - dst_ratio) / max(src_ratio, dst_ratio) > 0.01: + logger.warning( + "Aspect ratio change during resize: %.3f -> %.3f", + src_ratio, + dst_ratio, + ) + + return image.resize(target_size, Image.Resampling.LANCZOS) + + +def compute_patch_optimized_image( + image: Image.Image, + max_edge: int = 1568, + max_tokens: int = 1568, + patch_size: int = 28, +) -> Image.Image: + """Resize an image to its patch-optimized size. + + Convenience wrapper that combines `compute_patch_optimized_size` and + `resize_image` into a single call. + + Args: + image (Image.Image): Source image. + max_edge (int): Maximum allowed dimension (width or height). + max_tokens (int): Maximum allowed number of image tokens. + patch_size (int): Patch size used by the model. + + Returns: + Image.Image: Resized image. + """ + target = compute_patch_optimized_size( + image.width, + image.height, + max_edge=max_edge, + max_tokens=max_tokens, + patch_size=patch_size, + ) + return resize_image(image, target) + + +def resize_and_pad_image( + image: Image.Image, + target_size: tuple[int, int], +) -> Image.Image: + """Resize preserving aspect ratio, then center on a padded canvas. + + Equivalent to the legacy ``scale_image_to_fit`` behaviour. + + Args: + image (Image.Image): Source image. + target_size (tuple[int, int]): Canvas ``(width, height)``. + + Returns: + Image.Image: Image centered on a ``target_size`` canvas. 
+ """ + from askui.utils.image_utils import scale_image_to_fit + + return scale_image_to_fit(image, target_size) diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py index fe47c5f9..d1c94232 100644 --- a/src/askui/web_agent.py +++ b/src/askui/web_agent.py @@ -60,7 +60,6 @@ def __init__( ) -> None: reporter = CompositeReporter(reporters=reporters) self.os = PlaywrightAgentOs(reporter) - self.act_agent_os_facade = PlaywrightAgentOsFacade(self.os) super().__init__( reporter=reporter, retry=retry, @@ -70,6 +69,11 @@ def __init__( callbacks=callbacks, truncation_strategy=truncation_strategy, ) + self.act_agent_os_facade = PlaywrightAgentOsFacade( + self.os, + coordinate_space=self._vlm_provider.coordinate_space, + image_scaler=self._vlm_provider.image_scaler, + ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) self.act_settings = ActSettings( messages=MessageSettings( diff --git a/tests/unit/model_providers/test_ollama_vlm_provider.py b/tests/unit/model_providers/test_ollama_vlm_provider.py index 143e7c35..e4fe32d3 100644 --- a/tests/unit/model_providers/test_ollama_vlm_provider.py +++ b/tests/unit/model_providers/test_ollama_vlm_provider.py @@ -6,6 +6,10 @@ from askui.model_providers.ollama_vlm_provider import OllamaVlmProvider from askui.models.shared.agent_message_param import MessageParam +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + ScaledCoordinateSpace, +) class TestOllamaVlmProvider: @@ -48,3 +52,66 @@ def test_create_message_delegates_to_messages_api(self) -> None: mock_client.chat.completions.create.assert_called_once() assert result.role == "assistant" + + def test_coordinate_space_auto_detects_qwen(self) -> None: + provider = OllamaVlmProvider(model_id="qwen3.5") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_qwen_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Qwen2-VL") + assert 
provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_kimi(self) -> None: + provider = OllamaVlmProvider(model_id="kimi-vl") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_kimi_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Kimi-VL-A3B") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_default_for_non_qwen(self) -> None: + provider = OllamaVlmProvider(model_id="llava") + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_explicit_override(self) -> None: + provider = OllamaVlmProvider( + model_id="llava", + coordinate_space=ScaledCoordinateSpace(width=500, height=500), + ) + assert provider.coordinate_space == ScaledCoordinateSpace(width=500, height=500) + + def test_coordinate_space_explicit_override_takes_precedence(self) -> None: + provider = OllamaVlmProvider( + model_id="qwen3.5", + coordinate_space=ScaledCoordinateSpace(width=2000, height=2000), + ) + assert provider.coordinate_space == ScaledCoordinateSpace( + width=2000, height=2000 + ) + + def test_coordinate_space_explicit_pixel_overrides_qwen_auto_detect(self) -> None: + provider = OllamaVlmProvider( + model_id="qwen3.5", + coordinate_space=PixelCoordinateSpace(), + ) + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_auto_detects_holo(self) -> None: + provider = OllamaVlmProvider(model_id="holo3.1-35b-a3b") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_holo_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Holo-3.1-4B") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) diff --git 
a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py index d51ff74b..8fb787ab 100644 --- a/tests/unit/model_providers/test_openai_vlm_provider.py +++ b/tests/unit/model_providers/test_openai_vlm_provider.py @@ -3,9 +3,16 @@ from unittest.mock import MagicMock from openai import OpenAI +from PIL import Image from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider from askui.models.shared.agent_message_param import MessageParam +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, +) +from askui.models.shared.prompts import SystemPrompt class TestOpenAIVlmProvider: @@ -41,3 +48,146 @@ def test_create_message_delegates_to_messages_api(self) -> None: mock_client.chat.completions.create.assert_called_once() assert result.role == "assistant" + + def test_coordinate_space_defaults_to_pixel(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_passthrough(self) -> None: + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + coordinate_space=ScaledCoordinateSpace(width=1000, height=1000), + ) + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_augment_system_prompt_scaled_coordinate_space(self) -> None: + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + coordinate_space=ScaledCoordinateSpace(width=1000, height=1000), + ) + system = SystemPrompt(prompt="You are a helpful assistant.") + augmented = provider.augment_system_prompt(system) + + rendered = str(augmented) + assert "You are a helpful assistant." 
in rendered + assert "1000x1000 normalised grid" in rendered + + def test_augment_system_prompt_pixel_coordinate_space(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + system = SystemPrompt(prompt="Base prompt.") + augmented = provider.augment_system_prompt(system) + + rendered = str(augmented) + assert "normalised grid" not in rendered + assert "pixel space matching the screenshot dimensions" in rendered + + +class TestImageScaler: + def test_default_scaler_returns_valid_image(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + img = Image.new("RGB", (1920, 1080)) + scaled = provider.image_scaler(img) + assert scaled.width <= 2048 + assert scaled.height <= 2048 + + def test_custom_scaler_override(self) -> None: + def custom_scaler(image: Image.Image) -> Image.Image: + return image.resize((100, 100)) + + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + image_scaler=custom_scaler, + ) + img = Image.new("RGB", (1920, 1080)) + scaled = provider.image_scaler(img) + assert scaled.size == (100, 100) + + +class TestPixelCoordinateSpacePrompt: + def test_shows_pixel_space_description(self) -> None: + cs = PixelCoordinateSpace() + result = cs.build_prompt_section() + assert "pixel space matching the screenshot dimensions" in result + assert "normalised grid" not in result + + def test_includes_origin_info(self) -> None: + cs = PixelCoordinateSpace() + result = cs.build_prompt_section() + assert "top-left" in result + + +class TestScaledCoordinateSpacePrompt: + def test_shows_normalised_grid(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + result = cs.build_prompt_section() + assert "1000x1000 normalised grid" in result + assert "0 <= x < 1000" in result + assert "0 <= y < 1000" in result + + def test_includes_origin_info(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + result = cs.build_prompt_section() + assert "top-left" in result + + 
+class TestNormalizedCoordinateSpacePrompt: + def test_shows_normalised_floats(self) -> None: + cs = NormalizedCoordinateSpace() + result = cs.build_prompt_section() + assert "0.0 <= x <= 1.0" in result + assert "0.0 <= y <= 1.0" in result + assert "normalised floats" in result + + def test_includes_origin_info(self) -> None: + cs = NormalizedCoordinateSpace() + result = cs.build_prompt_section() + assert "top-left" in result + + +class TestMapsToScreenshotPixels: + def test_pixel_returns_true(self) -> None: + assert PixelCoordinateSpace().maps_to_screenshot_pixels is True + + def test_scaled_returns_false(self) -> None: + assert ( + ScaledCoordinateSpace(width=1000, height=1000).maps_to_screenshot_pixels + is False + ) + + def test_normalized_returns_false(self) -> None: + assert NormalizedCoordinateSpace().maps_to_screenshot_pixels is False + + +class TestMapToTarget: + def test_pixel_identity(self) -> None: + cs = PixelCoordinateSpace() + assert cs.map_to_target(512, 384, (1024, 768)) == (512, 384) + + def test_pixel_truncates_floats(self) -> None: + cs = PixelCoordinateSpace() + assert cs.map_to_target(512.7, 384.3, (1024, 768)) == (512, 384) + + def test_scaled_maps_correctly(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + assert cs.map_to_target(500, 500, (1024, 768)) == (512, 384) + + def test_scaled_zero(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + assert cs.map_to_target(0, 0, (1024, 768)) == (0, 0) + + def test_normalized_maps_correctly(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(0.5, 0.5, (1024, 768)) == (512, 384) + + def test_normalized_zero(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(0.0, 0.0, (1024, 768)) == (0, 0) + + def test_normalized_one(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(1.0, 1.0, (1024, 768)) == (1024, 768) diff --git a/tests/unit/tools/test_agent_os_facade_coordinates.py 
b/tests/unit/tools/test_agent_os_facade_coordinates.py
new file mode 100644
index 00000000..bc0b2868
--- /dev/null
+++ b/tests/unit/tools/test_agent_os_facade_coordinates.py
@@ -0,0 +1,184 @@
+"""Coordinate-mapping tests for the agent OS facades.
+
+Checks that non-pixel coordinate spaces (Qwen 0-1000, Kimi 0.0-1.0) map
+directly to device resolution, bypassing the padded screenshot space.
+"""
+
+from unittest.mock import MagicMock
+
+import pytest
+from PIL import Image
+
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+)
+from askui.tools.android.agent_os_facade import AndroidAgentOsFacade
+from askui.utils.llm_image_utils import compute_contained_size, resize_image
+
+
+def _default_scaler(image: Image.Image) -> Image.Image:
+    """Resize ``image`` like the default contained-size logic (1024x768 bounds)."""
+    contained = compute_contained_size(image.width, image.height, 1024, 768)
+    return resize_image(image, contained)
+
+
+def _make_android_facade(
+    device_size: tuple[int, int],
+    coordinate_space: PixelCoordinateSpace
+    | ScaledCoordinateSpace
+    | NormalizedCoordinateSpace,
+) -> AndroidAgentOsFacade:
+    """Build an AndroidAgentOsFacade around a mocked agent OS.
+
+    The mock's screenshot is a blank image of ``device_size``. The facade's
+    scaler is primed with the device resolution and with the size the default
+    scaler yields for a screenshot of that resolution.
+    """
+    agent_os = MagicMock()
+    agent_os.tags = []
+    agent_os.screenshot.return_value = Image.new("RGB", device_size)
+    android = AndroidAgentOsFacade(
+        agent_os,
+        coordinate_space=coordinate_space,
+        image_scaler=_default_scaler,
+    )
+    android._scaler.real_screen_resolution = device_size
+    # Target resolution exactly as the default scaler would produce it.
+    downscaled = _default_scaler(Image.new("RGB", device_size))
+    android._scaler.target_resolution = downscaled.size
+    return android
+
+
+class TestScaledCoordinateSpaceTallDevice:
+    """Qwen 0-1000 grid on a tall Android device (1080x2400).
+
+    Non-pixel coordinate spaces map directly to device resolution,
+    so no padding offset is involved.
+    """
+
+    device = (1080, 2400)
+    cs = ScaledCoordinateSpace(width=1000, height=1000)
+
+    def test_center_tap(self) -> None:
+        """The grid midpoint lands on the device midpoint."""
+        android = _make_android_facade(self.device, self.cs)
+        device_x, device_y = android._scaler.scale_coordinates(500, 500)
+        assert device_x == 540
+        assert device_y == 1200
+
+    def test_left_side_tap(self) -> None:
+        """Grid x=200 lands at device x=216."""
+        android = _make_android_facade(self.device, self.cs)
+        device_x, device_y = android._scaler.scale_coordinates(200, 500)
+        assert device_x == 216
+        assert device_y == 1200
+
+    def test_swipe_across(self) -> None:
+        """Both endpoints of a horizontal swipe map through one facade."""
+        android = _make_android_facade(self.device, self.cs)
+        start_x, start_y = android._scaler.scale_coordinates(500, 500)
+        end_x, end_y = android._scaler.scale_coordinates(200, 500)
+        assert start_x == 540
+        assert start_y == 1200
+        assert end_x == 216
+        assert end_y == 1200
+
+    def test_origin(self) -> None:
+        """The grid origin maps to the device origin."""
+        android = _make_android_facade(self.device, self.cs)
+        device_x, device_y = android._scaler.scale_coordinates(0, 0)
+        assert device_x == 0
+        assert device_y == 0
+
+    def test_max_corner(self) -> None:
+        """Grid (1000, 1000) maps to the full device resolution."""
+        android = _make_android_facade(self.device, self.cs)
+        device_x, device_y = android._scaler.scale_coordinates(1000, 1000)
+        assert device_x == 1080
+        assert device_y == 2400
+
+
+class TestNormalizedCoordinateSpaceTallDevice:
+    """Kimi 0.0-1.0 grid on a tall Android device (1080x2400)."""
+
+    device = (1080, 2400)
+    cs = NormalizedCoordinateSpace()
+
+    def test_center_tap(self) -> None:
+        """Normalized (0.5, 0.5) lands on the device midpoint."""
+        android = _make_android_facade(self.device, self.cs)
+        device_x, device_y = android._scaler.scale_coordinates(0.5, 0.5)
+        assert device_x == 540
+        assert device_y == 1200
+
+    def test_left_side_tap(self) -> None:
+        """Normalized x=0.2 lands at device x=216."""
+        android = _make_android_facade(self.device, self.cs)
+        device_x, device_y = android._scaler.scale_coordinates(0.2, 0.5)
+        assert device_x == 216
+        assert device_y == 1200
+
+
+class TestPixelCoordinateSpaceTallDevice:
+    """Claude pixel coordinates on a tall Android device (1080x2400).
+
+    With the no-padding scaler, a 1080x2400 device is scaled to
+    compute_contained_size(1080, 2400, 1024, 768) = (345, 768).
+    Pixel coordinates are in the (345, 768) screenshot space and go
+    through the padding-aware inverse scaling pipeline. Because the
+    image nearly fills the target (only ~2 px rounding slack), offsets
+    are close to zero but not exactly zero.
+    """
+
+    device = (1080, 2400)
+    cs = PixelCoordinateSpace()
+
+    def test_center_of_content(self) -> None:
+        """The center of the content area in the scaled screenshot."""
+        android = _make_android_facade(self.device, self.cs)
+        # Target resolution is (345, 768) — nearly no padding
+        device_x, device_y = android._scaler.scale_coordinates(172, 384)
+        assert device_x == pytest.approx(538, abs=5)
+        assert device_y == pytest.approx(1200, abs=5)
+
+    def test_near_top_left_of_content(self) -> None:
+        """Coordinate near top-left corner maps back close to origin."""
+        android = _make_android_facade(self.device, self.cs)
+        # Use (1, 2) instead of exact origin to avoid rounding-offset
+        # edge case that can produce small negative values.
+        device_x, device_y = android._scaler.scale_coordinates(1, 2)
+        assert device_x == pytest.approx(3, abs=5)
+        assert device_y == pytest.approx(3, abs=5)
+
+
+class TestSquareDevice:
+    """Verify no regression on a device with matching aspect ratio."""
+
+    device = (1024, 768)
+    cs = ScaledCoordinateSpace(width=1000, height=1000)
+
+    def test_center(self) -> None:
+        """The grid midpoint lands on the midpoint of the 1024x768 device."""
+        android = _make_android_facade(self.device, self.cs)
+        device_x, device_y = android._scaler.scale_coordinates(500, 500)
+        assert device_x == 512
+        assert device_y == 384
+
+
+class TestFromAgentFalse:
+    """from_agent=False always maps device → screenshot pixel space."""
+
+    def test_device_to_screenshot_scaled_space(self) -> None:
+        """Device (540, 1200) maps to roughly (172, 384) in screenshot pixels."""
+        grid = ScaledCoordinateSpace(width=1000, height=1000)
+        android = _make_android_facade((1080, 2400), grid)
+        shot_x, shot_y = android._scaler.scale_coordinates(
+            540, 1200, from_agent=False
+        )
+        # Target resolution is (345, 768), no padding
+        # Forward scaling: factor = 768/2400 = 0.32
+        # x = 540 * 0.32 = 172.8 → 172, y = 1200 * 0.32 = 384
+        assert shot_x == pytest.approx(172, abs=2)
+        assert shot_y == pytest.approx(384, abs=2)
diff --git a/tests/unit/utils/test_llm_image_utils.py b/tests/unit/utils/test_llm_image_utils.py
new file mode 100644
index
00000000..714d84c0
--- /dev/null
+++ b/tests/unit/utils/test_llm_image_utils.py
@@ -0,0 +1,153 @@
+"""Tests for LLM image utility functions."""
+
+import logging
+
+import pytest
+from PIL import Image
+
+from askui.utils.llm_image_utils import (
+    compute_contained_size,
+    compute_patch_optimized_size,
+    count_image_tokens,
+    resize_and_pad_image,
+    resize_image,
+)
+
+
+class TestCountImageTokens:
+    """``count_image_tokens`` on exact, partial and rectangular patch grids."""
+
+    def test_exact_patches(self) -> None:
+        # 56x56 with patch_size=28 → 2x2 = 4 tokens
+        tokens = count_image_tokens(56, 56, patch_size=28)
+        assert tokens == 4
+
+    def test_single_patch(self) -> None:
+        tokens = count_image_tokens(28, 28, patch_size=28)
+        assert tokens == 1
+
+    def test_partial_patches_round_up(self) -> None:
+        # 30x30 with patch_size=28 → ceil(30/28) * ceil(30/28) = 2*2 = 4
+        tokens = count_image_tokens(30, 30, patch_size=28)
+        assert tokens == 4
+
+    def test_known_anthropic_value(self) -> None:
+        # 1568x1568 with patch_size=28 → 56*56 = 3136
+        tokens = count_image_tokens(1568, 1568, patch_size=28)
+        assert tokens == 3136
+
+    def test_rectangular(self) -> None:
+        # 1024x768 with patch_size=28 → ceil(1024/28)*ceil(768/28) = 37*28 = 1036
+        tokens = count_image_tokens(1024, 768, patch_size=28)
+        assert tokens == 37 * 28
+
+
+class TestComputePatchOptimizedSize:
+    """``compute_patch_optimized_size`` under edge and token limits."""
+
+    def test_small_image_unchanged(self) -> None:
+        # A small image that fits within all constraints is returned as-is
+        width, height = compute_patch_optimized_size(200, 100)
+        assert (width, height) == (200, 100)
+
+    def test_respects_max_edge(self) -> None:
+        width, height = compute_patch_optimized_size(3000, 2000, max_edge=1568)
+        assert max(width, height) <= 1568
+
+    def test_respects_max_tokens(self) -> None:
+        width, height = compute_patch_optimized_size(
+            1920, 1080, max_edge=1568, max_tokens=1568, patch_size=28
+        )
+        assert count_image_tokens(width, height, patch_size=28) <= 1568
+
+    def test_preserves_aspect_ratio(self) -> None:
+        width, height = compute_patch_optimized_size(1920, 1080)
+        source_ratio = 1920 / 1080
+        relative_error = abs(source_ratio - width / height) / source_ratio
+        assert relative_error < 0.02
+
+    def test_invalid_dimensions_raises(self) -> None:
+        with pytest.raises(ValueError, match="positive"):
+            compute_patch_optimized_size(0, 100)
+
+    def test_openai_params(self) -> None:
+        width, height = compute_patch_optimized_size(
+            1920, 1080, max_edge=2048, max_tokens=1536, patch_size=32
+        )
+        assert count_image_tokens(width, height, patch_size=32) <= 1536
+        assert max(width, height) <= 2048
+
+
+class TestComputeContainedSize:
+    """``compute_contained_size`` for fitting and oversized inputs."""
+
+    def test_already_fits(self) -> None:
+        fitted = compute_contained_size(800, 600, 1024, 768)
+        assert fitted == (800, 600)
+
+    def test_exact_match(self) -> None:
+        fitted = compute_contained_size(1024, 768, 1024, 768)
+        assert fitted == (1024, 768)
+
+    def test_landscape_too_wide(self) -> None:
+        width, height = compute_contained_size(2048, 768, 1024, 768)
+        assert width <= 1024 and height <= 768
+
+    def test_portrait_too_tall(self) -> None:
+        width, height = compute_contained_size(768, 2048, 1024, 768)
+        assert width <= 1024 and height <= 768
+
+    def test_preserves_aspect_ratio(self) -> None:
+        width, height = compute_contained_size(1920, 1080, 1024, 768)
+        source_ratio = 1920 / 1080
+        relative_error = abs(source_ratio - width / height) / source_ratio
+        assert relative_error < 0.02
+
+    def test_invalid_dimensions_raises(self) -> None:
+        with pytest.raises(ValueError, match="positive"):
+            compute_contained_size(0, 100)
+
+
+class TestResizeImage:
+    """``resize_image`` output size, same-size identity and ratio warning."""
+
+    def test_correct_dimensions(self) -> None:
+        source = Image.new("RGB", (1920, 1080))
+        assert resize_image(source, (1024, 576)).size == (1024, 576)
+
+    def test_no_op_when_same_size(self) -> None:
+        source = Image.new("RGB", (1024, 768))
+        # Same object, no copy
+        assert resize_image(source, (1024, 768)) is source
+
+    def test_aspect_ratio_warning_logged(
+        self, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        source = Image.new("RGB", (1920, 1080))
+        with caplog.at_level(logging.WARNING):
+            resize_image(source, (1024, 768))
+        assert "Aspect ratio change" in caplog.text
+
+    def test_no_warning_when_ratio_preserved(
+        self, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        source = Image.new("RGB", (1920, 1080))
+        with caplog.at_level(logging.WARNING):
+            resize_image(source, (960, 540))
+        assert "Aspect ratio change" not in caplog.text
+
+
+class TestResizeAndPadImage:
+    """``resize_and_pad_image`` output size and padding colour."""
+
+    def test_correct_dimensions(self) -> None:
+        source = Image.new("RGB", (1920, 1080))
+        assert resize_and_pad_image(source, (1024, 768)).size == (1024, 768)
+
+    def test_preserves_aspect_ratio_with_padding(self) -> None:
+        red_source = Image.new("RGB", (1080, 2400), color=(255, 0, 0))
+        padded = resize_and_pad_image(red_source, (1024, 768))
+        assert padded.size == (1024, 768)
+        # Check that some padding exists (black pixels at edges)
+        corner_pixel = padded.getpixel((0, 0))
+        assert corner_pixel == (0, 0, 0)  # Black padding