diff --git a/src/code.gs b/src/code.gs index 2720145..a5558d6 100644 --- a/src/code.gs +++ b/src/code.gs @@ -33,6 +33,9 @@ const GenAIApp = (function () { const globalMetadata = {}; const addedVectorStores = {}; + const modelForVision = "gemini-3-pro-preview"; + let promptForVision = "Describe the images, transcribe any visible text, and summarize the visual context."; + const MAX_FILE_SIZE = 20 * 1024 * 1024; // 20MB in bytes /** @@ -112,12 +115,32 @@ const GenAIApp = (function () { const response = UrlFetchApp.fetch(imageInput); const blob = response.getBlob(); const base64Image = Utilities.base64Encode(blob.getBytes()); + let mimeType = blob.getContentType(); + if (!mimeType || !mimeType.startsWith("image/")) { + let pathname; + try { + pathname = new URL(imageInput).pathname.toLowerCase(); + } catch { + pathname = imageInput.split("?")[0].split("#")[0].toLowerCase(); + } + if (pathname.endsWith(".png")) { + mimeType = "image/png"; + } else if (pathname.endsWith(".jpg") || pathname.endsWith(".jpeg")) { + mimeType = "image/jpeg"; + } else if (pathname.endsWith(".webp")) { + mimeType = "image/webp"; + } else if (pathname.endsWith(".gif")) { + mimeType = "image/gif"; + } else { + throw new Error("Failed to identify a valid image MIME type. Please check the file format for Gemini."); + } + } contents.push({ role: "user", parts: [ { - inline_data: { - mime_type: blob.getContentType(), + inlineData: { + mimeType: mimeType, data: base64Image } } @@ -195,7 +218,7 @@ const GenAIApp = (function () { contents.push({ role: 'user', parts: [{ - inline_data: { - mime_type: fileInfo.mimeType, + inlineData: { + mimeType: fileInfo.mimeType, data: blobToBase64 } @@ -422,6 +445,13 @@ const GenAIApp = (function () { knowledgeLink = []; } + // Gemini does not support using images together with vector stores (RAG) yet. + // Images must be analyzed first and replaced with text before RAG processing. 
+ const ragCorpusIds = Object.keys(addedVectorStores); + if (ragCorpusIds.length > 0 && model.includes("gemini") && gcpProjectId) { + contents = this._convertImagesToText(contents); + } + let payload; if (model.includes("gemini")) { payload = this._buildGeminiPayload(advancedParametersObject); @@ -737,6 +767,98 @@ const GenAIApp = (function () { return payload; } + /** + * Replaces all image parts in a Gemini conversation with a text description + * generated by Gemini 3 Pro Preview (Vertex AI Vision). + * + * - Detects images (inlineData / fileData) across all messages + * - Sends them to Gemini Vision for analysis + * - Removes images from the conversation + * - Appends a new message containing the image analysis + * + * @param {Array} currentContents + * Gemini conversation contents. + * + * @returns {Array} + * Updated contents with images removed and a text analysis appended. + */ + this._convertImagesToText = function (currentContents) { + if (!currentContents || currentContents.length === 0) return currentContents; + + const hasImages = currentContents.some(c => { + const parts = Array.isArray(c.parts) ? c.parts : (c.parts ? [c.parts] : []); + return parts.some(p => p.inlineData || p.fileData); + }); + + if (!hasImages) return currentContents; + + if (verbose) { + console.log("[GenAIApp] - Images detected. Converting to text description..."); + } + + const imageParts = currentContents.flatMap(c => { + const parts = Array.isArray(c.parts) ? c.parts : (c.parts ? 
[c.parts] : []); + return parts.filter(p => p.inlineData || p.fileData); + }); + + const descriptionPayload = { + contents: [{ + role: "user", + parts: [ + ...imageParts, + { text: promptForVision } + ] + }], + generationConfig: { + temperature: 0.2, + maxOutputTokens: 2000 + } + }; + + const options = { + method: 'post', + contentType: 'application/json', + headers: { + 'Authorization': 'Bearer ' + ScriptApp.getOAuthToken() + }, + payload: JSON.stringify(descriptionPayload), + muteHttpExceptions: true + }; + + const endpoint = `https://aiplatform.googleapis.com/v1/projects/${gcpProjectId}/locations/global/publishers/google/models/${modelForVision}:generateContent`; + let description = "Image analysis returned no text."; + try { + const response = UrlFetchApp.fetch(endpoint, options); + const result = JSON.parse(response.getContentText()); + + if (result?.candidates?.[0]?.content?.parts?.[0]?.text) { + description = result.candidates[0].content.parts[0].text; + } else if (result?.parts?.[0]?.text) { + description = result.parts[0].text; + } else if (verbose) { console.log(`[GenAIApp] - Gemini Vision response contained no usable text (HTTP ${response.getResponseCode()}).`); } + } catch (error) { + console.log(`[GenAIApp] - Image analysis failed during Gemini Vision preprocessing: ${error}`); + } + + let newContents = JSON.parse(JSON.stringify(currentContents)); + newContents.forEach(c => { + const parts = Array.isArray(c.parts) ? c.parts : (c.parts ? [c.parts] : []); + c.parts = parts.filter(p => !p.inlineData && !p.fileData); + }); + + newContents = newContents.filter(c => { + const parts = Array.isArray(c.parts) ? c.parts : (c.parts ? [c.parts] : []); + return parts.length > 0; + }); + + newContents.push({ + role: "user", + parts: [{ text: `IMAGE ANALYSIS:\n${description}` }] + }); + + return newContents; + } + /** * Get a blob from a Google Drive file ID * @@ -2254,6 +2376,20 @@ const GenAIApp = (function () { */ setPrivateInstanceBaseUrl: function (baseUrl) { privateInstanceBaseUrl = baseUrl; + }, + + /** + * Sets the prompt used to describe images when using Gemini with RAG. 
+ * + * Gemini does not support combining images and vector stores directly. + * When RAG is enabled, images are first analyzed and replaced with text + * using this prompt before querying the Gemini vector store. + * + * @param {string} prompt The prompt to use for image description. + */ + setPromptForVision: function (prompt) { + promptForVision = prompt; } + } })(); \ No newline at end of file