diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts index 281aa765f..f9e2d01cb 100644 --- a/packages/core/lib/v3/agent/GoogleCUAClient.ts +++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts @@ -576,10 +576,8 @@ export class GoogleCUAClient extends AgentClient { }); const screenshot = await this.captureScreenshot(); - const base64Data = screenshot.replace( - /^data:image\/png;base64,/, - "", - ); + const { mimeType, base64Data } = + this.parseScreenshotDataUrl(screenshot); // Create one function response for each computer use function call // Following Python SDK pattern: FunctionResponse with parts containing inline_data @@ -606,7 +604,7 @@ export class GoogleCUAClient extends AgentClient { parts: [ { inlineData: { - mimeType: "image/png", + mimeType, data: base64Data, }, }, @@ -979,6 +977,46 @@ export class GoogleCUAClient extends AgentClient { }; } + private normalizeScreenshotDataUrl(imageData: string): string { + const trimmedImageData = imageData.trim(); + if (/^data:[^;]+;base64,/i.test(trimmedImageData)) { + return trimmedImageData; + } + return `data:image/png;base64,${trimmedImageData}`; + } + + private parseScreenshotDataUrl(screenshot: string): { + mimeType: string; + base64Data: string; + } { + const trimmedScreenshot = screenshot.trim(); + const imageDataUrlMatch = trimmedScreenshot.match( + /^data:(image\/[a-zA-Z0-9.+-]+);base64,([\s\S]+)$/i, + ); + + if (imageDataUrlMatch) { + const mimeType = + imageDataUrlMatch[1].toLowerCase() === "image/jpg" + ? "image/jpeg" + : imageDataUrlMatch[1]; + return { + mimeType, + base64Data: imageDataUrlMatch[2], + }; + } + + const genericDataUrlMatch = trimmedScreenshot.match( + /^data:[^;]+;base64,([\s\S]+)$/i, + ); + + return { + mimeType: "image/png", + base64Data: genericDataUrlMatch + ? genericDataUrlMatch[1] + : trimmedScreenshot, + }; + } + async captureScreenshot(options?: { base64Image?: string; currentUrl?: string; @@ -990,14 +1028,14 @@ export class GoogleCUAClient extends AgentClient { // Use provided options if available if (options?.base64Image) { - return `data:image/png;base64,${options.base64Image}`; + return this.normalizeScreenshotDataUrl(options.base64Image); } // Use the screenshot provider if available if (this.screenshotProvider) { try { const base64Image = await this.screenshotProvider(); - return `data:image/png;base64,${base64Image}`; + return this.normalizeScreenshotDataUrl(base64Image); } catch (error) { console.error("Error capturing screenshot:", error); throw error; diff --git a/packages/core/tests/unit/google-cua-client.test.ts b/packages/core/tests/unit/google-cua-client.test.ts new file mode 100644 index 000000000..6e58368bf --- /dev/null +++ b/packages/core/tests/unit/google-cua-client.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it } from "vitest"; +import { GoogleCUAClient } from "../../lib/v3/agent/GoogleCUAClient.js"; + +type ParseScreenshotDataUrlFn = (screenshot: string) => { + mimeType: string; + base64Data: string; +}; + +const parseScreenshotDataUrl = ( + GoogleCUAClient.prototype as unknown as { + parseScreenshotDataUrl: ParseScreenshotDataUrlFn; + } +).parseScreenshotDataUrl; + +function createClient(): GoogleCUAClient { + return new GoogleCUAClient( + "google", + "google/gemini-2.5-computer-use-preview-10-2025", + "test instructions", + { apiKey: "test" }, + ); +} + +describe("GoogleCUAClient screenshot MIME handling", () => { + it("preserves image data URLs passed via captureScreenshot options", async () => { + const client = createClient(); + const jpegDataUrl = "data:image/jpeg;base64,abc123"; + + const screenshot = await client.captureScreenshot({ + base64Image: jpegDataUrl, + }); + + expect(screenshot).toBe(jpegDataUrl); + }); + + it("defaults raw base64 captureScreenshot options to PNG data URL", async () => { + const client = createClient(); + + const screenshot = await client.captureScreenshot({ + base64Image: "abc123", + }); + + expect(screenshot).toBe("data:image/png;base64,abc123"); + }); + + it("extracts JPEG mime type and base64 payload from data URLs", () => { + const client = createClient(); + + const parsed = parseScreenshotDataUrl.call( + client, + "data:image/jpg;base64,abc123", + ); + + expect(parsed).toEqual({ + mimeType: "image/jpeg", + base64Data: "abc123", + }); + }); + + it("falls back to PNG mime type for non-image data URLs", () => { + const client = createClient(); + + const parsed = parseScreenshotDataUrl.call( + client, + "data:application/octet-stream;base64,abc123", + ); + + expect(parsed).toEqual({ + mimeType: "image/png", + base64Data: "abc123", + }); + }); +});