Skip to content
6 changes: 5 additions & 1 deletion src/askui/android_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def __init__(
) -> None:
reporter = CompositeReporter(reporters=reporters)
self.os = PpadbAgentOs(device_identifier=device, reporter=reporter)
self.act_agent_os_facade = AndroidAgentOsFacade(self.os)
super().__init__(
reporter=reporter,
retry=retry,
Expand All @@ -97,6 +96,11 @@ def __init__(
callbacks=callbacks,
truncation_strategy=truncation_strategy,
)
self.act_agent_os_facade = AndroidAgentOsFacade(
self.os,
coordinate_space=self._vlm_provider.coordinate_space,
image_scaler=self._vlm_provider.image_scaler,
)
self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
# Override default act settings with Android-specific settings
self.act_settings = ActSettings(
Expand Down
4 changes: 3 additions & 1 deletion src/askui/computer_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ def __init__(
truncation_strategy=truncation_strategy,
)
self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
self.tools.os
self.tools.os,
coordinate_space=self._vlm_provider.coordinate_space,
image_scaler=self._vlm_provider.image_scaler,
)
self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
# Override default act settings with computer-specific settings
Expand Down
14 changes: 13 additions & 1 deletion src/askui/model_providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@
from askui.model_providers.openai_image_qa_provider import OpenAIImageQAProvider
from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
from askui.model_providers.vlm_provider import VlmProvider
from askui.models.shared.coordinate_space import (
NormalizedCoordinateSpace,
PixelCoordinateSpace,
ScaledCoordinateSpace,
VlmCoordinateSpace,
)
from askui.models.shared.image_scaler import ImageScaler
from askui.utils.model_pricing import ModelPricing

__all__ = [
Expand All @@ -46,11 +53,16 @@
"DetectionProvider",
"GoogleImageQAProvider",
"ImageQAProvider",
"ImageScaler",
"ModelPricing",
"NormalizedCoordinateSpace",
"OllamaImageQAProvider",
"OllamaVlmProvider",
"OpenAICompatibleVlmProvider",
"OpenAIImageQAProvider",
"OpenAIVlmProvider",
"OpenAICompatibleVlmProvider",
"PixelCoordinateSpace",
"ScaledCoordinateSpace",
"VlmCoordinateSpace",
"VlmProvider",
]
24 changes: 24 additions & 0 deletions src/askui/model_providers/anthropic_vlm_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,14 @@
ThinkingConfigParam,
ToolChoiceParam,
)
from askui.models.shared.image_scaler import ImageScaler
from askui.models.shared.prompts import SystemPrompt
from askui.models.shared.tools import ToolCollection
from askui.utils.llm_image_utils import compute_patch_optimized_image
from askui.utils.model_pricing import ModelPricing

_DEFAULT_MODEL_ID = "claude-sonnet-4-6"
_DEFAULT_MAX_IMAGE_EDGE = 1024


class AnthropicVlmProvider(VlmProvider):
Expand Down Expand Up @@ -46,6 +49,11 @@ class AnthropicVlmProvider(VlmProvider):
cost in USD per 1M output tokens.
cache_write_cost_per_million_tokens (float | None, optional): Override
cost in USD per 1M cache write input tokens.
image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
callable. If ``None``, uses Anthropic-optimized patch-based scaling.
max_image_edge (int | None, optional): Maximum edge length (in pixels)
for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
from the environment if not provided. Defaults to 1024.
cache_read_cost_per_million_tokens (float | None, optional): Override
cost in USD per 1M cache read input tokens.

Expand All @@ -70,6 +78,8 @@ def __init__(
auth_token: str | None = None,
model_id: str | None = None,
client: Anthropic | None = None,
image_scaler: ImageScaler | None = None,
max_image_edge: int | None = None,
input_cost_per_million_tokens: float | None = None,
output_cost_per_million_tokens: float | None = None,
cache_write_cost_per_million_tokens: float | None = None,
Expand All @@ -78,6 +88,12 @@ def __init__(
self._model_id_value = (
model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
)
self._image_scaler_override = image_scaler
self._max_edge = (
max_image_edge
or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
or _DEFAULT_MAX_IMAGE_EDGE
)
if client is not None:
self.client = client
else:
Expand All @@ -104,6 +120,14 @@ def model_id(self) -> str:
def pricing(self) -> ModelPricing | None:
return self._pricing

@property
@override
def image_scaler(self) -> ImageScaler:
    """Return the callable used to preprocess images for this provider.

    Returns:
        ImageScaler: The scaler supplied to the constructor when one was
            given. Otherwise a callable that forwards the image to
            `compute_patch_optimized_image` with ``max_edge`` set to the
            edge limit resolved at construction time.
    """
    custom_scaler = self._image_scaler_override
    if custom_scaler is None:
        # Bind the limit to a local so the returned callable does not
        # re-read the attribute on each invocation.
        edge_limit = self._max_edge
        return lambda image: compute_patch_optimized_image(
            image, max_edge=edge_limit
        )
    return custom_scaler

@cached_property
def _messages_api(self) -> AnthropicMessagesApi:
"""Lazily initialise the AnthropicMessagesApi on first use."""
Expand Down
42 changes: 32 additions & 10 deletions src/askui/model_providers/askui_vlm_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,13 @@
ThinkingConfigParam,
ToolChoiceParam,
)
from askui.models.shared.image_scaler import ImageScaler
from askui.models.shared.prompts import SystemPrompt
from askui.models.shared.tools import ToolCollection
from askui.utils.llm_image_utils import compute_patch_optimized_image

_DEFAULT_MODEL_ID = "claude-sonnet-4-6"
_DEFAULT_MAX_IMAGE_EDGE = 1024


class AskUIVlmProvider(VlmProvider):
Expand All @@ -29,23 +32,26 @@ class AskUIVlmProvider(VlmProvider):
on the first API call, not at construction time.

Args:
workspace_id (str | None, optional): AskUI workspace ID. Reads
`ASKUI_WORKSPACE_ID` from the environment if not provided.
token (str | None, optional): AskUI API token. Reads `ASKUI_TOKEN`
from the environment if not provided.
model_id (str, optional): Claude model to use. Defaults to
`"claude-sonnet-4-6"`.
client (Anthropic | None, optional): Pre-configured Anthropic client.
If provided, `workspace_id` and `token` are ignored.
askui_settings (`AskUiInferenceApiSettings` | None, optional):
Connection settings (workspace ID, token, base URL). Reads
from environment variables if not provided.
model_id (str | None, optional): Claude model to use. Defaults to
``"claude-sonnet-4-6"``.
client (`Anthropic` | None, optional): Pre-configured Anthropic client.
If provided, ``askui_settings`` is only used for the base URL.
image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
callable. If ``None``, uses Anthropic-optimized patch-based scaling.
max_image_edge (int | None, optional): Maximum edge length (in pixels)
for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
from the environment if not provided. Defaults to 1024.

Example:
```python
from askui import AgentSettings, ComputerAgent
from askui.model_providers import AskUIVlmProvider

agent = ComputerAgent(settings=AgentSettings(
vlm_provider=AskUIVlmProvider(
workspace_id="my-workspace",
token="my-token",
model_id="claude-opus-4-6-20260401",
)
))
Expand All @@ -57,18 +63,34 @@ def __init__(
askui_settings: AskUiInferenceApiSettings | None = None,
model_id: str | None = None,
client: Anthropic | None = None,
image_scaler: ImageScaler | None = None,
max_image_edge: int | None = None,
) -> None:
self._askui_settings = askui_settings or AskUiInferenceApiSettings()
self._model_id_value = (
model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
)
self._injected_client = client
self._image_scaler_override = image_scaler
self._max_edge = (
max_image_edge
or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
or _DEFAULT_MAX_IMAGE_EDGE
)

@property
@override
def model_id(self) -> str:
return self._model_id_value

@property
@override
def image_scaler(self) -> ImageScaler:
    """Return the image preprocessing callable for this provider.

    Returns:
        ImageScaler: The constructor-supplied scaler if one was passed;
            otherwise a callable that delegates to
            `compute_patch_optimized_image`, using the configured
            maximum edge as ``max_edge``.
    """
    # Captured up front so the fallback callable closes over a plain local.
    edge_limit = self._max_edge
    fallback: ImageScaler = lambda image: compute_patch_optimized_image(  # noqa: E731
        image, max_edge=edge_limit
    )
    return (
        self._image_scaler_override
        if self._image_scaler_override is not None
        else fallback
    )

@cached_property
def _messages_api(self) -> AnthropicMessagesApi:
"""Lazily initialise the AnthropicMessagesApi on first use."""
Expand Down
46 changes: 46 additions & 0 deletions src/askui/model_providers/ollama_vlm_provider.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,51 @@
"""OllamaVlmProvider — VLM access via a local Ollama instance."""

from openai import OpenAI
from typing_extensions import override

from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
from askui.models.shared.coordinate_space import (
PixelCoordinateSpace,
ScaledCoordinateSpace,
VlmCoordinateSpace,
)
from askui.models.shared.image_scaler import ImageScaler

_DEFAULT_BASE_URL = "http://localhost:11434/v1"
_DEFAULT_MODEL_ID = "qwen3.5"

_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
_KIMI_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)


class OllamaVlmProvider(OpenAIVlmProvider):
"""VLM provider that routes requests to a local Ollama instance.

Thin convenience wrapper around `OpenAIVlmProvider` with Ollama
defaults (``base_url``, ``api_key``, ``model_id``).

Qwen, Holo, and Kimi models are automatically detected and their coordinate
space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``.
Pass ``coordinate_space`` explicitly to override auto-detection.

Args:
model_id (str, optional): Ollama model to use. Defaults to
``"qwen3.5"``.
base_url (str, optional): Base URL for the Ollama OpenAI-compatible
API. Defaults to ``"http://localhost:11434/v1"``.
client (`OpenAI` | None, optional): Pre-configured OpenAI client.
If provided, ``base_url`` is ignored.
coordinate_space (VlmCoordinateSpace | None, optional): The coordinate
grid the model emits coordinates in. ``None`` (the default)
enables auto-detection based on ``model_id``.
image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
callable. If ``None``, inherits from `OpenAIVlmProvider`.
max_image_edge (int | None, optional): Maximum edge length (in pixels)
for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
from the environment if not provided. Inherits the default from
`OpenAIVlmProvider` (2048).

Example:
```python
Expand All @@ -40,10 +65,31 @@ def __init__(
model_id: str = _DEFAULT_MODEL_ID,
base_url: str = _DEFAULT_BASE_URL,
client: OpenAI | None = None,
coordinate_space: VlmCoordinateSpace | None = None,
image_scaler: ImageScaler | None = None,
max_image_edge: int | None = None,
) -> None:
self._coordinate_space_override = coordinate_space
super().__init__(
model_id=model_id,
api_key="ollama", # Ollama requires no auth; OpenAI SDK needs a value
base_url=base_url,
client=client,
coordinate_space=coordinate_space or PixelCoordinateSpace(),
image_scaler=image_scaler,
max_image_edge=max_image_edge,
)

@property
@override
def coordinate_space(self) -> VlmCoordinateSpace:
    """Return the coordinate space the configured model emits.

    Resolution order:
        1. The ``coordinate_space`` passed explicitly to the constructor.
        2. A model-family default, chosen by case-insensitive substring
           match of ``"qwen"``, ``"holo"``, then ``"kimi"`` against the
           model id.
        3. ``self._coordinate_space``.

    Returns:
        VlmCoordinateSpace: The resolved coordinate space.
    """
    explicit = self._coordinate_space_override
    if explicit is not None:
        return explicit

    model_name = self._model_id_value.lower()
    # Order matters: the first family marker found in the model id wins.
    family_defaults = (
        ("qwen", _QWEN_COORDINATE_SPACE),
        ("holo", _HOLO_COORDINATE_SPACE),
        ("kimi", _KIMI_COORDINATE_SPACE),
    )
    for marker, space in family_defaults:
        if marker in model_name:
            return space
    return self._coordinate_space
22 changes: 22 additions & 0 deletions src/askui/model_providers/openai_compatible_vlm_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
from openai import OpenAI

from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
from askui.models.shared.coordinate_space import (
PixelCoordinateSpace,
VlmCoordinateSpace,
)
from askui.models.shared.image_scaler import ImageScaler

_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()


class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
Expand All @@ -20,6 +27,15 @@ class OpenAICompatibleVlmProvider(OpenAIVlmProvider):
(e.g. ``"https://my-host/v1/chat/completions"``).
model_id (str): Model name expected by the deployment.
api_key (str | None, optional): API key for the endpoint.
coordinate_space (`VlmCoordinateSpace`, optional): The coordinate
grid the model emits coordinates in. Defaults to
``PixelCoordinateSpace()``.
image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
callable. If ``None``, inherits from `OpenAIVlmProvider`.
max_image_edge (int | None, optional): Maximum edge length (in pixels)
for screenshots sent to the model. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE``
from the environment if not provided. Inherits the default from
`OpenAIVlmProvider` (2048).

Example:
```python
Expand All @@ -41,6 +57,9 @@ def __init__(
endpoint_url: str,
model_id: str | None = None,
api_key: str | None = None,
coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
image_scaler: ImageScaler | None = None,
max_image_edge: int | None = None,
) -> None:
def _rewrite_url(request: httpx.Request) -> None:
request.url = httpx.URL(endpoint_url)
Expand All @@ -56,4 +75,7 @@ def _rewrite_url(request: httpx.Request) -> None:
super().__init__(
model_id=model_id,
client=client,
coordinate_space=coordinate_space,
image_scaler=image_scaler,
max_image_edge=max_image_edge,
)
Loading
Loading