diff --git a/pkg/create/templates.go b/pkg/create/templates.go index 699b97f..0627ad3 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -219,7 +219,7 @@ var Commands = map[string]map[string]DeployConfig{ TemplateYutoriComputerUse: { EntryPoint: "index.ts", NeedsEnvFile: true, - InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, + InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://www.yutori.com and list the team member names."}'`, }, TemplateTzafonComputerUse: { EntryPoint: "index.ts", @@ -271,7 +271,7 @@ var Commands = map[string]map[string]DeployConfig{ TemplateYutoriComputerUse: { EntryPoint: "main.py", NeedsEnvFile: true, - InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, + InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://www.yutori.com and list the team member names."}'`, }, TemplateTzafonComputerUse: { EntryPoint: "main.py", diff --git a/pkg/templates/python/yutori/README.md b/pkg/templates/python/yutori/README.md index 8ace295..afca270 100644 --- a/pkg/templates/python/yutori/README.md +++ b/pkg/templates/python/yutori/README.md @@ -21,6 +21,18 @@ kernel deploy main.py --env-file .env ## Usage +```bash +kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://www.yutori.com and list the team member names."}' +``` + +Optional payload fields: + +- `record_replay` (bool) — capture a video of the session (paid plans only). +- `kiosk` (bool) — launch the browser without address bar / tabs ([see below](#kiosk-mode)). +- `user_timezone` (IANA, e.g. `"America/New_York"`) and `user_location` (free text, e.g. `"New York, NY, US"`) — appended to the task message so the model has accurate temporal/locational grounding. 
+ +More involved example (Kanban drag-and-drop): + ```bash kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items."}' ``` diff --git a/pkg/templates/python/yutori/loop.py b/pkg/templates/python/yutori/loop.py index 447ebff..1a81c49 100644 --- a/pkg/templates/python/yutori/loop.py +++ b/pkg/templates/python/yutori/loop.py @@ -11,9 +11,14 @@ @see https://docs.yutori.com/reference/n1-5 """ +from __future__ import annotations + import copy import json +import platform +from datetime import datetime from typing import Any, Optional +from zoneinfo import ZoneInfo, ZoneInfoNotFoundError from kernel import Kernel from openai import OpenAI @@ -26,6 +31,8 @@ DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"] TOOL_SET = "browser_tools_core-20260403" +NAVIGATOR_COORDINATE_SCALE = 1000 + # Screenshot-trimming defaults mirror Yutori's reference loop: # https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py # Trimming is size-triggered — we only drop old screenshots when the payload @@ -42,12 +49,14 @@ async def sampling_loop( kernel: Kernel, session_id: str, max_completion_tokens: int = 4096, - max_iterations: int = 50, + max_iterations: int = 100, viewport_width: int = 1280, viewport_height: int = 800, kiosk_mode: bool = False, + user_timezone: str = "America/Los_Angeles", + user_location: str = "San Francisco, CA, US", ) -> dict[str, Any]: - """Run the n1 sampling loop until the model stops calling tools or max iterations.""" + """Run the n1.5 sampling loop until the model stops calling tools or max iterations.""" client = OpenAI( api_key=api_key, base_url="https://api.yutori.com/v1", @@ -57,7 +66,12 @@ async def sampling_loop( initial_screenshot 
= await computer_tool.screenshot() - user_content: list[dict[str, Any]] = [{"type": "text", "text": task}] + # Append location/timezone/current-date context to the task — mirrors Yutori's + # format_task_with_context helper and helps the model with date-sensitive + # judgments. https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/context.py + task_with_context = _format_task_with_context(task, user_timezone, user_location) + + user_content: list[dict[str, Any]] = [{"type": "text", "text": task_with_context}] if initial_screenshot.get("base64_image"): user_content.append({ "type": "image_url", @@ -171,8 +185,39 @@ async def sampling_loop( "content": result.get("output", "OK"), }) - if iteration >= max_iterations: - print("Max iterations reached") + # If the loop exhausted iterations, prompt the model for a final summary so + # the caller gets a usable answer instead of empty content. Mirrors Yutori's + # format_stop_and_summarize helper. + if iteration >= max_iterations and not final_answer: + print("Max iterations reached — requesting summary") + try: + final_screenshot = await computer_tool.screenshot() + stop_content: list[dict[str, Any]] = [ + {"type": "text", "text": _format_stop_and_summarize(task)} + ] + if final_screenshot.get("base64_image"): + stop_content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/webp;base64,{final_screenshot['base64_image']}" + }, + }) + conversation_messages.append({"role": "user", "content": stop_content}) + + summary_messages, _ = _trimmed_for_request(conversation_messages) + summary_response = client.chat.completions.create( + model=model, + messages=summary_messages, + max_completion_tokens=max_completion_tokens, + temperature=0.3, + extra_body={"tool_set": TOOL_SET, "disable_tools": DISABLED_TOOLS}, + ) + summary = summary_response.choices[0].message if summary_response.choices else None + if summary: + conversation_messages.append(summary.model_dump(exclude_none=True)) + 
final_answer = summary.content or None + except Exception as summary_error: + print(f"Stop-and-summarize call failed: {summary_error}") return { "messages": conversation_messages, @@ -180,6 +225,41 @@ async def sampling_loop( } +def _format_task_with_context(task: str, user_timezone: str, user_location: str) -> str: + """Append location, timezone, and current date/time to the task message.""" + for timezone_name in [user_timezone, "America/Los_Angeles", "UTC"]: + try: + tz = ZoneInfo(timezone_name) + tz_label = timezone_name + break + except (ZoneInfoNotFoundError, ValueError, OSError): + continue + else: + return task + + now = datetime.now(tz) + day_fmt = "%#d" if platform.system() == "Windows" else "%-d" + context = "\n".join([ + f"User's location: {user_location}", + f"User's timezone: {tz_label}", + f"Current Date: {now.strftime(f'%B {day_fmt}, %Y')}", + f"Current Time: {now.strftime('%H:%M:%S %Z')}", + f"Today is: {now.strftime('%A')}", + ]) + return f"{task}\n\n{context}" + + +def _format_stop_and_summarize(task: str) -> str: + return ( + f"Stop here. " + f"Summarize your current progress and list in detail all the findings " + f"relevant to the given task:\n{task}\n" + f"Provide URLs for all relevant results you find and return them in your response. " + f"If there is no specific URL for a result, " + f"cite the page URL that the information was found on." 
+ ) + + def _trimmed_for_request( messages: list[dict[str, Any]], ) -> tuple[list[dict[str, Any]], int]: @@ -263,17 +343,22 @@ def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: scaled = dict(action) if "coordinates" in scaled and scaled["coordinates"]: - coords = scaled["coordinates"] - scaled["coordinates"] = [ - round((coords[0] / 1000) * viewport_width), - round((coords[1] / 1000) * viewport_height), - ] + scaled["coordinates"] = _denormalize(scaled["coordinates"], viewport_width, viewport_height) if "start_coordinates" in scaled and scaled["start_coordinates"]: - coords = scaled["start_coordinates"] - scaled["start_coordinates"] = [ - round((coords[0] / 1000) * viewport_width), - round((coords[1] / 1000) * viewport_height), - ] + scaled["start_coordinates"] = _denormalize(scaled["start_coordinates"], viewport_width, viewport_height) return scaled + + +def _denormalize(coords: list[int] | tuple[int, int], width: int, height: int) -> list[int]: + """Map [0, 1000] coordinates to viewport pixels and clamp to [0, dim-1]. + + Clamping prevents a boundary value like 1000 from landing one pixel outside + the viewport on a 1280x800 display. 
+ """ + raw_x = round((coords[0] / NAVIGATOR_COORDINATE_SCALE) * width) + raw_y = round((coords[1] / NAVIGATOR_COORDINATE_SCALE) * height) + x = max(0, min(width - 1, raw_x)) + y = max(0, min(height - 1, raw_y)) + return [x, y] diff --git a/pkg/templates/python/yutori/main.py b/pkg/templates/python/yutori/main.py index 21543d9..94d13e1 100644 --- a/pkg/templates/python/yutori/main.py +++ b/pkg/templates/python/yutori/main.py @@ -6,10 +6,15 @@ from session import KernelBrowserSession -class QueryInput(TypedDict): - query: str +class _QueryInputOptional(TypedDict, total=False): record_replay: Optional[bool] kiosk: Optional[bool] + user_timezone: Optional[str] + user_location: Optional[str] + + +class QueryInput(_QueryInputOptional): + query: str class QueryOutput(TypedDict): @@ -37,6 +42,9 @@ async def cua_task( payload: An object containing: - query: The task/query string to process - record_replay: Optional boolean to enable video replay recording + - kiosk: Optional boolean to launch in kiosk mode + - user_timezone: Optional IANA tz (e.g. 
"America/New_York") + - user_location: Optional free-text location for model context Returns: A dictionary containing: @@ -57,16 +65,22 @@ async def cua_task( ) as session: print("Kernel browser live view url:", session.live_view_url) - loop_result = await sampling_loop( - model="n1.5-latest", - task=payload["query"], - api_key=str(api_key), - kernel=session.kernel, - session_id=str(session.session_id), - viewport_width=session.viewport_width, - viewport_height=session.viewport_height, - kiosk_mode=kiosk_mode, - ) + loop_kwargs: dict = { + "model": "n1.5-latest", + "task": payload["query"], + "api_key": str(api_key), + "kernel": session.kernel, + "session_id": str(session.session_id), + "viewport_width": session.viewport_width, + "viewport_height": session.viewport_height, + "kiosk_mode": kiosk_mode, + } + if payload.get("user_timezone"): + loop_kwargs["user_timezone"] = payload["user_timezone"] + if payload.get("user_location"): + loop_kwargs["user_location"] = payload["user_location"] + + loop_result = await sampling_loop(**loop_kwargs) final_answer = loop_result.get("final_answer") messages = loop_result.get("messages", []) @@ -74,7 +88,6 @@ async def cua_task( if final_answer: result = final_answer else: - # Extract last assistant message result = _extract_last_assistant_message(messages) return { diff --git a/pkg/templates/python/yutori/pyproject.toml b/pkg/templates/python/yutori/pyproject.toml index aafe423..87c17c7 100644 --- a/pkg/templates/python/yutori/pyproject.toml +++ b/pkg/templates/python/yutori/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "python-yutori-cua" version = "0.1.0" -description = "Kernel reference app for Yutori n1 Computer Use" +description = "Kernel reference app for Yutori n1.5 Computer Use" requires-python = ">=3.9" dependencies = [ "openai>=1.58.0", diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py index 8cdf098..d4debde 100644 --- 
a/pkg/templates/python/yutori/tools/computer.py +++ b/pkg/templates/python/yutori/tools/computer.py @@ -7,6 +7,8 @@ @see https://docs.yutori.com/reference/n1-5 """ +from __future__ import annotations + import asyncio import base64 import json @@ -22,6 +24,18 @@ SCREENSHOT_DELAY_S = 0.15 ACTION_DELAY_S = 0.3 +# n1.5 scroll `amount` is in "wheel units" where 1 unit ≈ 10% of the viewport +# height (~80px at 800px tall). Kernel's delta_y is a wheel-event repeat count +# where each tick is much smaller in practice, so we multiply. +SCROLL_NOTCHES_PER_AMOUNT = 4 + +# WebP quality for screenshots. Kernel returns PNGs, which are crisp and +# tolerate aggressive WebP compression with no visible degradation — matches +# Yutori SDK's DEFAULT_WEBP_QUALITY_FOR_PNG=30 (yutori-sdk-python/yutori/ +# navigator/images.py). Lower values cut payload size substantially on long +# multi-step trajectories. +WEBP_QUALITY = 30 + N15ActionType = Literal[ "left_click", "double_click", @@ -57,43 +71,87 @@ class N15Action(TypedDict, total=False): url: str -KEY_MAP = { - "Enter": "Return", - "Escape": "Escape", - "Backspace": "BackSpace", - "Tab": "Tab", - "Delete": "Delete", - "ArrowUp": "Up", - "ArrowDown": "Down", - "ArrowLeft": "Left", - "ArrowRight": "Right", - "Home": "Home", - "End": "End", - "PageUp": "Page_Up", - "PageDown": "Page_Down", - "F1": "F1", - "F2": "F2", - "F3": "F3", - "F4": "F4", - "F5": "F5", - "F6": "F6", - "F7": "F7", - "F8": "F8", - "F9": "F9", - "F10": "F10", - "F11": "F11", - "F12": "F12", +# n1.5 emits lowercase key names (e.g. `enter`, `ctrl+c`, `down down down enter`). +# Kernel's press_key expects XKeysym names (e.g. `Return`, `Ctrl`, `Page_Up`). +# Keys not in the map pass through unchanged (printable characters like `a`, +# `1`, `,` are already XKeysym). 
+# +# Sister implementation (Playwright target instead of XKeysym): +# https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/keys.py +KEY_MAP: dict[str, str] = { + # Modifiers + "ctrl": "Ctrl", + "control": "Ctrl", + "shift": "Shift", + "alt": "Alt", + "meta": "Super_L", + "command": "Super_L", + "cmd": "Super_L", + "super": "Super_L", + "option": "Alt", + # Enter + "enter": "Return", + "return": "Return", + # Navigation + "tab": "Tab", + "backspace": "BackSpace", + "delete": "Delete", + "escape": "Escape", + "esc": "Escape", + "space": "space", + # Arrows + "up": "Up", + "down": "Down", + "left": "Left", + "right": "Right", + "arrowup": "Up", + "arrowdown": "Down", + "arrowleft": "Left", + "arrowright": "Right", + # Page nav + "home": "Home", + "end": "End", + "pageup": "Page_Up", + "pagedown": "Page_Down", + # Function keys + **{f"f{i}": f"F{i}" for i in range(1, 13)}, + # Locks / special + "capslock": "Caps_Lock", + "numlock": "Num_Lock", + "scrolllock": "Scroll_Lock", + "insert": "Insert", + "pause": "Pause", + "printscreen": "Print", } -MODIFIER_MAP = { - "control": "ctrl", - "ctrl": "ctrl", - "alt": "alt", - "shift": "shift", - "meta": "super", - "command": "super", - "cmd": "super", -} + +def _map_token(token: str) -> str: + lower = token.strip().lower() + return KEY_MAP.get(lower, token.strip()) + + +def _normalize_url(url: str) -> str: + trimmed = url.strip() + if "://" in trimmed: + return trimmed + return f"https://{trimmed}" + + +def _parse_key_expression(expr: str) -> list[str]: + """Parse an n1.5 key expression into one Kernel combo per sequential press. + + Spaces separate sequential presses; '+' separates simultaneous tokens + within a press. 
Examples: + "enter" -> ["Return"] + "ctrl+c" -> ["Ctrl+c"] + "down down enter" -> ["Down", "Down", "Return"] + "ctrl+shift+t" -> ["Ctrl+Shift+t"] + """ + return [ + "+".join(_map_token(token) for token in combo.split("+")) + for combo in expr.strip().split() + if combo + ] class ComputerTool: @@ -145,7 +203,7 @@ async def _handle_click(self, action: N15Action, button: str, num_clicks: int) - "num_clicks": num_clicks, } if modifier: - kwargs["hold_keys"] = [self._map_key(modifier)] + kwargs["hold_keys"] = [_map_token(modifier)] self.kernel.browsers.computer.click_mouse(self.session_id, **kwargs) @@ -181,23 +239,25 @@ async def _handle_mouse_button(self, action: N15Action, click_type: str) -> Tool async def _handle_scroll(self, action: N15Action) -> ToolResult: coords = self._get_coordinates(action.get("coordinates")) direction = action.get("direction") - notches = max(action.get("amount", 3), 1) + amount = max(action.get("amount", 3), 1) if direction not in ("up", "down", "left", "right"): raise ToolError(f"Invalid scroll direction: {direction}") - # Backend (kernel-images) uses delta_x/delta_y as wheel-event repeat count (notches), not pixels. + # Yutori 1 unit ≈ 10% of viewport height; scale into Kernel wheel-event ticks. 
+ ticks = amount * SCROLL_NOTCHES_PER_AMOUNT + delta_x = 0 delta_y = 0 if direction == "up": - delta_y = -notches + delta_y = -ticks elif direction == "down": - delta_y = notches + delta_y = ticks elif direction == "left": - delta_x = -notches + delta_x = -ticks elif direction == "right": - delta_x = notches + delta_x = ticks modifier = action.get("modifier") scroll_kwargs: dict[str, Any] = { @@ -207,13 +267,13 @@ async def _handle_scroll(self, action: N15Action) -> ToolResult: "delta_y": delta_y, } if modifier: - scroll_kwargs["hold_keys"] = [self._map_key(modifier)] + scroll_kwargs["hold_keys"] = [_map_token(modifier)] self.kernel.browsers.computer.scroll(self.session_id, **scroll_kwargs) await asyncio.sleep(SCREENSHOT_DELAY_S) screenshot_result = await self.screenshot() - screenshot_result["output"] = f"Scrolled {notches} wheel unit(s) {direction}." + screenshot_result["output"] = f"Scrolled {amount} unit(s) {direction}." return screenshot_result async def _handle_type(self, action: N15Action) -> ToolResult: @@ -235,12 +295,11 @@ async def _handle_key_press(self, action: N15Action) -> ToolResult: if not key: raise ToolError("key is required for key_press action") - mapped_key = self._map_key(key) - - self.kernel.browsers.computer.press_key( - self.session_id, - keys=[mapped_key], - ) + # n1.5 supports sequential presses ("down down down enter") — issue each + # combo as its own press_key so they're seen as separate keystrokes. + combos = _parse_key_expression(key) + for combo in combos: + self.kernel.browsers.computer.press_key(self.session_id, keys=[combo]) await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() @@ -250,16 +309,17 @@ async def _handle_hold_key(self, action: N15Action) -> ToolResult: if not key: raise ToolError("key is required for hold_key action") - mapped_key = self._map_key(key) # Yutori emits `duration` in seconds; Kernel SDK's press_key takes ms. 
duration_s = action.get("duration") duration_ms = int(duration_s * 1000) if duration_s and duration_s > 0 else 1000 - self.kernel.browsers.computer.press_key( - self.session_id, - keys=[mapped_key], - duration=duration_ms, - ) + combos = _parse_key_expression(key) + for combo in combos: + self.kernel.browsers.computer.press_key( + self.session_id, + keys=[combo], + duration=duration_ms, + ) await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() @@ -295,7 +355,7 @@ async def _handle_refresh(self, action: N15Action) -> ToolResult: async def _handle_go_back(self, action: N15Action) -> ToolResult: self.kernel.browsers.computer.press_key( self.session_id, - keys=["alt+Left"], + keys=["Alt+Left"], ) await asyncio.sleep(1.5) return await self.screenshot() @@ -303,7 +363,7 @@ async def _handle_go_back(self, action: N15Action) -> ToolResult: async def _handle_go_forward(self, action: N15Action) -> ToolResult: self.kernel.browsers.computer.press_key( self.session_id, - keys=["alt+Right"], + keys=["Alt+Right"], ) await asyncio.sleep(1.5) return await self.screenshot() @@ -312,11 +372,12 @@ async def _handle_goto_url(self, action: N15Action) -> ToolResult: url = action.get("url") if not url: raise ToolError("url is required for goto_url action") + target_url = _normalize_url(url) if self.kiosk_mode: response = self.kernel.browsers.playwright.execute( self.session_id, - code=f"await page.goto({json.dumps(url)});", + code=f"await page.goto({json.dumps(target_url)});", timeout_sec=60, ) if not response.success: @@ -326,19 +387,19 @@ async def _handle_goto_url(self, action: N15Action) -> ToolResult: self.kernel.browsers.computer.press_key( self.session_id, - keys=["ctrl+l"], + keys=["Ctrl+l"], ) await asyncio.sleep(ACTION_DELAY_S) self.kernel.browsers.computer.press_key( self.session_id, - keys=["ctrl+a"], + keys=["Ctrl+a"], ) await asyncio.sleep(0.1) self.kernel.browsers.computer.type_text( self.session_id, - text=url, + text=target_url, delay=TYPING_DELAY_MS, ) 
await asyncio.sleep(ACTION_DELAY_S) @@ -358,7 +419,7 @@ async def screenshot(self) -> ToolResult: png_bytes = response.read() img = Image.open(BytesIO(png_bytes)) webp_buf = BytesIO() - img.save(webp_buf, "WEBP", quality=80) + img.save(webp_buf, "WEBP", quality=WEBP_QUALITY) base64_image = base64.b64encode(webp_buf.getvalue()).decode("utf-8") return {"base64_image": base64_image} except Exception as e: @@ -375,15 +436,3 @@ def _get_coordinates( raise ToolError(f"Invalid coordinates: {coords}") return {"x": int(x), "y": int(y)} - - def _map_key(self, key: str) -> str: - def map_part(part: str) -> str: - trimmed = part.strip() - lower = trimmed.lower() - if lower in MODIFIER_MAP: - return MODIFIER_MAP[lower] - return KEY_MAP.get(trimmed, trimmed) - - if "+" in key: - return "+".join(map_part(p) for p in key.split("+")) - return map_part(key) diff --git a/pkg/templates/typescript/yutori/README.md b/pkg/templates/typescript/yutori/README.md index de9113b..04be089 100644 --- a/pkg/templates/typescript/yutori/README.md +++ b/pkg/templates/typescript/yutori/README.md @@ -21,6 +21,18 @@ kernel deploy index.ts --env-file .env ## Usage +```bash +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://www.yutori.com and list the team member names."}' +``` + +Optional payload fields: + +- `record_replay` (bool) — capture a video of the session (paid plans only). +- `kiosk` (bool) — launch the browser without address bar / tabs ([see below](#kiosk-mode)). +- `user_timezone` (IANA, e.g. `"America/New_York"`) and `user_location` (free text, e.g. `"New York, NY, US"`) — appended to the task message so the model has accurate temporal/locational grounding. + +More involved example (Kanban drag-and-drop): + ```bash kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. 
You are done successfully when the items are dragged to Done. Do not click into the items."}' ``` diff --git a/pkg/templates/typescript/yutori/index.ts b/pkg/templates/typescript/yutori/index.ts index c38a1b5..215a930 100644 --- a/pkg/templates/typescript/yutori/index.ts +++ b/pkg/templates/typescript/yutori/index.ts @@ -1,4 +1,5 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; +import type OpenAI from 'openai'; import { samplingLoop } from './loop'; import { KernelBrowserSession } from './session'; @@ -10,6 +11,8 @@ interface QueryInput { query: string; record_replay?: boolean; kiosk?: boolean; + user_timezone?: string; + user_location?: string; } interface QueryOutput { @@ -55,6 +58,8 @@ app.action( viewportWidth: session.viewportWidth, viewportHeight: session.viewportHeight, kioskMode, + userTimezone: payload.user_timezone, + userLocation: payload.user_location, }); // Extract the result @@ -75,10 +80,10 @@ app.action( }, ); -function extractLastAssistantMessage(messages: { role: string; content: string | unknown[] }[]): string { +function extractLastAssistantMessage(messages: OpenAI.ChatCompletionMessageParam[]): string { for (let i = messages.length - 1; i >= 0; i--) { const msg = messages[i]; - if (msg.role === 'assistant' && typeof msg.content === 'string' && msg.content) { + if (msg && msg.role === 'assistant' && typeof msg.content === 'string' && msg.content) { return msg.content; } } diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts index 7db2e4c..2b2c8c7 100644 --- a/pkg/templates/typescript/yutori/loop.ts +++ b/pkg/templates/typescript/yutori/loop.ts @@ -21,6 +21,8 @@ import { ComputerTool, type N15Action, type ToolResult } from './tools/computer' const DISABLED_TOOLS = ['extract_elements', 'find', 'set_element_value', 'execute_js']; const TOOL_SET = 'browser_tools_core-20260403'; +const NAVIGATOR_COORDINATE_SCALE = 1000; + // Screenshot-trimming defaults mirror Yutori's reference loop: // 
https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py // Trimming is size-triggered — we only drop old screenshots when the payload @@ -44,9 +46,11 @@ interface SamplingLoopOptions { viewportWidth?: number; viewportHeight?: number; kioskMode?: boolean; + userTimezone?: string; + userLocation?: string; } -interface SamplingLoopResult { +export interface SamplingLoopResult { messages: OpenAI.ChatCompletionMessageParam[]; finalAnswer?: string; } @@ -58,10 +62,12 @@ export async function samplingLoop({ kernel, sessionId, maxCompletionTokens = 4096, - maxIterations = 50, + maxIterations = 100, viewportWidth = 1280, viewportHeight = 800, kioskMode = false, + userTimezone = 'America/Los_Angeles', + userLocation = 'San Francisco, CA, US', }: SamplingLoopOptions): Promise<SamplingLoopResult> { const client = new OpenAI({ apiKey, @@ -72,11 +78,16 @@ export async function samplingLoop({ const initialScreenshot = await computerTool.screenshot(); + // Append location/timezone/current-date context to the task — mirrors Yutori's + // format_task_with_context helper and helps the model with date-sensitive + // judgments. https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/context.py + const taskWithContext = formatTaskWithContext(task, userTimezone, userLocation); + const conversationMessages: OpenAI.ChatCompletionMessageParam[] = [ { role: 'user', content: [ - { type: 'text', text: task }, + { type: 'text', text: taskWithContext }, ...(initialScreenshot.base64Image ? 
[{ type: 'image_url' as const, @@ -129,8 +140,7 @@ export async function samplingLoop({ throw new Error('No choices in API response'); } - const choice = response.choices[0]; - const assistantMessage = choice.message; + const assistantMessage = response.choices[0]?.message; if (!assistantMessage) { throw new Error('No response from model'); } @@ -213,8 +223,41 @@ export async function samplingLoop({ } } - if (iteration >= maxIterations) { - console.log('Max iterations reached'); + // If the loop exhausted iterations, prompt the model for a final summary so + // the caller gets a usable answer instead of empty content. Mirrors Yutori's + // format_stop_and_summarize helper. + if (iteration >= maxIterations && !finalAnswer) { + console.log('Max iterations reached — requesting summary'); + try { + const finalScreenshot = await computerTool.screenshot(); + conversationMessages.push({ + role: 'user', + content: [ + { type: 'text', text: formatStopAndSummarize(task) }, + ...(finalScreenshot.base64Image + ? 
[{ + type: 'image_url' as const, + image_url: { url: `data:image/webp;base64,${finalScreenshot.base64Image}` }, + }] + : []), + ], + }); + const { messages: summaryMessages } = trimmedForRequest(conversationMessages); + const summaryResponse = await client.chat.completions.create({ + model, + messages: summaryMessages, + max_completion_tokens: maxCompletionTokens, + temperature: 0.3, + ...({ tool_set: TOOL_SET, disable_tools: DISABLED_TOOLS } satisfies YutoriExtras), + }); + const summary = summaryResponse.choices[0]?.message; + if (summary) { + conversationMessages.push(summary); + finalAnswer = summary.content || undefined; + } + } catch (error) { + console.error('Stop-and-summarize call failed:', error); + } } return { @@ -223,26 +266,83 @@ export async function samplingLoop({ }; } +function formatTaskWithContext(task: string, userTimezone: string, userLocation: string): string { + const now = new Date(); + const tzLabel = resolveTimezone(userTimezone); + const timeFormatter = new Intl.DateTimeFormat('en-US', { + timeZone: tzLabel, + hour12: false, + hour: '2-digit', + minute: '2-digit', + second: '2-digit', + timeZoneName: 'short', + }); + const dateFormatter = new Intl.DateTimeFormat('en-US', { + timeZone: tzLabel, + year: 'numeric', + month: 'long', + day: 'numeric', + }); + const weekdayFormatter = new Intl.DateTimeFormat('en-US', { timeZone: tzLabel, weekday: 'long' }); + + const context = [ + `User's location: ${userLocation}`, + `User's timezone: ${tzLabel}`, + `Current Date: ${dateFormatter.format(now)}`, + `Current Time: ${timeFormatter.format(now)}`, + `Today is: ${weekdayFormatter.format(now)}`, + ].join('\n'); + + return `${task}\n\n${context}`; +} + +function resolveTimezone(userTimezone: string): string { + for (const timeZone of [userTimezone, 'America/Los_Angeles', 'UTC']) { + try { + new Intl.DateTimeFormat('en-US', { timeZone }).format(new Date()); + return timeZone; + } catch { + // Try the next fallback. 
+ } + } + return 'UTC'; +} + +function formatStopAndSummarize(task: string): string { + return ( + `Stop here. ` + + `Summarize your current progress and list in detail all the findings ` + + `relevant to the given task:\n${task}\n` + + `Provide URLs for all relevant results you find and return them in your response. ` + + `If there is no specific URL for a result, ` + + `cite the page URL that the information was found on.` + ); +} + function scaleCoordinates(action: N15Action, viewportWidth: number, viewportHeight: number): N15Action { const scaled = { ...action }; if (scaled.coordinates) { - scaled.coordinates = [ - Math.round((scaled.coordinates[0] / 1000) * viewportWidth), - Math.round((scaled.coordinates[1] / 1000) * viewportHeight), - ]; + scaled.coordinates = denormalize(scaled.coordinates, viewportWidth, viewportHeight); } if (scaled.start_coordinates) { - scaled.start_coordinates = [ - Math.round((scaled.start_coordinates[0] / 1000) * viewportWidth), - Math.round((scaled.start_coordinates[1] / 1000) * viewportHeight), - ]; + scaled.start_coordinates = denormalize(scaled.start_coordinates, viewportWidth, viewportHeight); } return scaled; } +// Map [0, 1000] coordinates into viewport pixels and clamp to [0, dim-1] so a +// boundary value like 1000 doesn't land one pixel outside the viewport. 
+function denormalize(coords: [number, number], width: number, height: number): [number, number] { + const rawX = Math.round((coords[0] / NAVIGATOR_COORDINATE_SCALE) * width); + const rawY = Math.round((coords[1] / NAVIGATOR_COORDINATE_SCALE) * height); + const x = Math.max(0, Math.min(width - 1, rawX)); + const y = Math.max(0, Math.min(height - 1, rawY)); + return [x, y]; +} + interface ImagePart { type: 'image_url'; image_url: { url: string }; @@ -300,7 +400,7 @@ function trimmedForRequest( const imageIndices: number[] = []; for (let i = 0; i < trimmed.length; i++) { - if (messageHasImage(trimmed[i])) imageIndices.push(i); + if (messageHasImage(trimmed[i]!)) imageIndices.push(i); } if (imageIndices.length === 0) return { messages: trimmed, removed: 0 }; @@ -311,7 +411,7 @@ function trimmedForRequest( for (const idx of imageIndices) { if (size <= MAX_REQUEST_BYTES) break; if (protectedIdx.has(idx)) continue; - if (stripOneImage(trimmed[idx])) { + if (stripOneImage(trimmed[idx]!)) { removed++; size = estimateSize(trimmed); } @@ -319,11 +419,11 @@ function trimmedForRequest( // If still over, strip from the protected window too — but always keep the latest. 
if (size > MAX_REQUEST_BYTES) { - const lastIdx = imageIndices[imageIndices.length - 1]; + const lastIdx = imageIndices[imageIndices.length - 1]!; for (const idx of imageIndices) { if (size <= MAX_REQUEST_BYTES) break; if (idx === lastIdx) continue; - if (stripOneImage(trimmed[idx])) { + if (stripOneImage(trimmed[idx]!)) { removed++; size = estimateSize(trimmed); } diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts index c1dde02..a201e00 100644 --- a/pkg/templates/typescript/yutori/tools/computer.ts +++ b/pkg/templates/typescript/yutori/tools/computer.ts @@ -15,6 +15,18 @@ const TYPING_DELAY_MS = 12; const SCREENSHOT_DELAY_MS = 150; const ACTION_DELAY_MS = 300; +// n1.5 scroll `amount` is in "wheel units" where 1 unit ≈ 10% of the viewport +// height (~80px at 800px tall). Kernel's `delta_y` is a wheel-event repeat +// count where each tick is much smaller in practice, so we multiply. +const SCROLL_NOTCHES_PER_AMOUNT = 4; + +// WebP quality for screenshots. Kernel returns PNGs, which are crisp and +// tolerate aggressive WebP compression with no visible degradation — matches +// Yutori SDK's DEFAULT_WEBP_QUALITY_FOR_PNG=30 (yutori-sdk-python/yutori/ +// navigator/images.py). Lower values cut payload size substantially on long +// multi-step trajectories. +const WEBP_QUALITY = 30; + export interface ToolResult { base64Image?: string; output?: string; @@ -61,43 +73,85 @@ export interface N15Action { url?: string; } +// n1.5 emits lowercase key names (e.g. `enter`, `ctrl+c`, `down down down enter`). +// Kernel's press_key expects XKeysym names (e.g. `Return`, `Ctrl`, `Page_Up`). +// This map covers every key Yutori documents at +// https://docs.yutori.com/reference/n1-5#key-space — keys not in the map pass +// through unchanged (printable characters like `a`, `1`, `,` are already XKeysym). 
+// +// Sister implementation (Playwright target instead of XKeysym): +// https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/keys.py const KEY_MAP: Record = { - 'Enter': 'Return', - 'Escape': 'Escape', - 'Backspace': 'BackSpace', - 'Tab': 'Tab', - 'Delete': 'Delete', - 'ArrowUp': 'Up', - 'ArrowDown': 'Down', - 'ArrowLeft': 'Left', - 'ArrowRight': 'Right', - 'Home': 'Home', - 'End': 'End', - 'PageUp': 'Page_Up', - 'PageDown': 'Page_Down', - 'F1': 'F1', - 'F2': 'F2', - 'F3': 'F3', - 'F4': 'F4', - 'F5': 'F5', - 'F6': 'F6', - 'F7': 'F7', - 'F8': 'F8', - 'F9': 'F9', - 'F10': 'F10', - 'F11': 'F11', - 'F12': 'F12', + // Modifiers + ctrl: 'Ctrl', + control: 'Ctrl', + shift: 'Shift', + alt: 'Alt', + meta: 'Super_L', + command: 'Super_L', + cmd: 'Super_L', + super: 'Super_L', + option: 'Alt', + // Enter + enter: 'Return', + return: 'Return', + // Navigation + tab: 'Tab', + backspace: 'BackSpace', + delete: 'Delete', + escape: 'Escape', + esc: 'Escape', + space: 'space', + // Arrows + up: 'Up', + down: 'Down', + left: 'Left', + right: 'Right', + arrowup: 'Up', + arrowdown: 'Down', + arrowleft: 'Left', + arrowright: 'Right', + // Page nav + home: 'Home', + end: 'End', + pageup: 'Page_Up', + pagedown: 'Page_Down', + // Function keys + f1: 'F1', f2: 'F2', f3: 'F3', f4: 'F4', f5: 'F5', f6: 'F6', + f7: 'F7', f8: 'F8', f9: 'F9', f10: 'F10', f11: 'F11', f12: 'F12', + // Locks / special + capslock: 'Caps_Lock', + numlock: 'Num_Lock', + scrolllock: 'Scroll_Lock', + insert: 'Insert', + pause: 'Pause', + printscreen: 'Print', }; -const MODIFIER_MAP: Record = { - 'control': 'ctrl', - 'ctrl': 'ctrl', - 'alt': 'alt', - 'shift': 'shift', - 'meta': 'super', - 'command': 'super', - 'cmd': 'super', -}; +function mapToken(token: string): string { + const lower = token.trim().toLowerCase(); + return KEY_MAP[lower] ?? token.trim(); +} + +function normalizeUrl(url: string): string { + const trimmed = url.trim(); + return trimmed.includes('://') ? 
trimmed : `https://${trimmed}`; +} + +// Parse an n1.5 key expression into one Kernel combo string per sequential +// press. Spaces separate sequential presses; `+` separates simultaneous tokens +// within a press. Examples: +// "enter" -> ["Return"] +// "ctrl+c" -> ["Ctrl+c"] +// "down down enter" -> ["Down", "Down", "Return"] +// "ctrl+shift+t" -> ["Ctrl+Shift+t"] +function parseKeyExpression(expr: string): string[] { + return expr + .trim() + .split(/\s+/) + .filter(Boolean) + .map((combo) => combo.split('+').map(mapToken).join('+')); +} export class ComputerTool { private kernel: Kernel; @@ -161,7 +215,7 @@ export class ComputerTool { private async handleClick(action: N15Action, button: 'left' | 'right' | 'middle', numClicks: number): Promise { const coords = this.getCoordinates(action.coordinates); - const holdKeys = action.modifier ? [this.mapKey(action.modifier)] : undefined; + const holdKeys = action.modifier ? [mapToken(action.modifier)] : undefined; await this.kernel.browsers.computer.clickMouse(this.sessionId, { x: coords.x, @@ -205,31 +259,34 @@ export class ComputerTool { private async handleScroll(action: N15Action): Promise { const coords = this.getCoordinates(action.coordinates); const direction = action.direction; - const notches = Math.max(action.amount ?? 3, 1); + const amount = Math.max(action.amount ?? 3, 1); if (!direction || !['up', 'down', 'left', 'right'].includes(direction)) { throw new ToolError(`Invalid scroll direction: ${direction}`); } + // Yutori 1 unit ≈ 10% of viewport height; scale into Kernel wheel-event ticks. + const ticks = amount * SCROLL_NOTCHES_PER_AMOUNT; + let delta_x = 0; let delta_y = 0; switch (direction) { case 'up': - delta_y = -notches; + delta_y = -ticks; break; case 'down': - delta_y = notches; + delta_y = ticks; break; case 'left': - delta_x = -notches; + delta_x = -ticks; break; case 'right': - delta_x = notches; + delta_x = ticks; break; } - const holdKeys = action.modifier ? 
[this.mapKey(action.modifier)] : undefined; + const holdKeys = action.modifier ? [mapToken(action.modifier)] : undefined; await this.kernel.browsers.computer.scroll(this.sessionId, { x: coords.x, @@ -243,7 +300,7 @@ export class ComputerTool { const screenshotResult = await this.screenshot(); return { ...screenshotResult, - output: `Scrolled ${notches} wheel unit(s) ${direction}.`, + output: `Scrolled ${amount} unit(s) ${direction}.`, }; } @@ -268,11 +325,12 @@ export class ComputerTool { throw new ToolError('key is required for key_press action'); } - const mappedKey = this.mapKey(key); - - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: [mappedKey], - }); + // n1.5 supports sequential presses ("down down down enter") — issue each + // combo as its own pressKey so they're seen as separate keystrokes. + const combos = parseKeyExpression(key); + for (const combo of combos) { + await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: [combo] }); + } await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); @@ -284,14 +342,16 @@ export class ComputerTool { throw new ToolError('key is required for hold_key action'); } - const mappedKey = this.mapKey(key); // Yutori emits `duration` in seconds; Kernel SDK's pressKey takes ms. const durationMs = action.duration && action.duration > 0 ? 
Math.round(action.duration * 1000) : 1000; - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: [mappedKey], - duration: durationMs, - }); + const combos = parseKeyExpression(key); + for (const combo of combos) { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [combo], + duration: durationMs, + }); + } await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); @@ -328,7 +388,7 @@ export class ComputerTool { private async handleGoBack(): Promise { await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['alt+Left'], + keys: ['Alt+Left'], }); await this.sleep(1500); @@ -337,7 +397,7 @@ export class ComputerTool { private async handleGoForward(): Promise { await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['alt+Right'], + keys: ['Alt+Right'], }); await this.sleep(1500); @@ -349,10 +409,11 @@ export class ComputerTool { if (!url) { throw new ToolError('url is required for goto_url action'); } + const targetUrl = normalizeUrl(url); if (this.kioskMode) { const response = await this.kernel.browsers.playwright.execute(this.sessionId, { - code: `await page.goto(${JSON.stringify(url)});`, + code: `await page.goto(${JSON.stringify(targetUrl)});`, timeout_sec: 60, }); if (!response.success) { @@ -363,17 +424,17 @@ export class ComputerTool { } await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['ctrl+l'], + keys: ['Ctrl+l'], }); await this.sleep(ACTION_DELAY_MS); await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['ctrl+a'], + keys: ['Ctrl+a'], }); await this.sleep(100); await this.kernel.browsers.computer.typeText(this.sessionId, { - text: url, + text: targetUrl, delay: TYPING_DELAY_MS, }); await this.sleep(ACTION_DELAY_MS); @@ -392,7 +453,7 @@ export class ComputerTool { const blob = await response.blob(); const arrayBuffer = await blob.arrayBuffer(); const pngBuffer = Buffer.from(arrayBuffer); - const webpBuffer = await sharp(pngBuffer).webp({ 
quality: 80 }).toBuffer(); + const webpBuffer = await sharp(pngBuffer).webp({ quality: WEBP_QUALITY }).toBuffer(); return { base64Image: webpBuffer.toString('base64'), @@ -404,7 +465,7 @@ export class ComputerTool { private getCoordinates(coords?: [number, number]): { x: number; y: number } { if (!coords || coords.length !== 2) { - return { x: this.width / 2, y: this.height / 2 }; + return { x: Math.floor(this.width / 2), y: Math.floor(this.height / 2) }; } const [x, y] = coords; @@ -415,23 +476,7 @@ export class ComputerTool { return { x, y }; } - private mapKey(key: string): string { - const mapPart = (part: string): string => { - const trimmed = part.trim(); - const lower = trimmed.toLowerCase(); - if (MODIFIER_MAP[lower]) { - return MODIFIER_MAP[lower]; - } - return KEY_MAP[trimmed] || trimmed; - }; - - if (key.includes('+')) { - return key.split('+').map(mapPart).join('+'); - } - return mapPart(key); - } - private sleep(ms: number): Promise { - return new Promise(resolve => setTimeout(resolve, ms)); + return new Promise((resolve) => setTimeout(resolve, ms)); } } diff --git a/pkg/templates/typescript/yutori/tsconfig.json b/pkg/templates/typescript/yutori/tsconfig.json index 13616f5..b0a441d 100644 --- a/pkg/templates/typescript/yutori/tsconfig.json +++ b/pkg/templates/typescript/yutori/tsconfig.json @@ -1,9 +1,24 @@ { - "extends": "../tsconfig.base.json", "compilerOptions": { - "outDir": "./dist", - "rootDir": "." 
+ "lib": ["ESNext", "DOM"], + "target": "ESNext", + "module": "ESNext", + "moduleDetection": "force", + "jsx": "react-jsx", + "allowJs": true, + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "noEmit": true, + "esModuleInterop": true, + "strict": true, + "skipLibCheck": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true, + "noUnusedLocals": false, + "noUnusedParameters": false, + "noPropertyAccessFromIndexSignature": false }, - "include": ["./**/*.ts"], + "include": ["./**/*.ts", "./**/*.tsx"], "exclude": ["node_modules", "dist"] }