diff --git a/apps/sim/app/api/tools/file/manage/route.ts b/apps/sim/app/api/tools/file/manage/route.ts index 367a5db8cfc..a29c47613fe 100644 --- a/apps/sim/app/api/tools/file/manage/route.ts +++ b/apps/sim/app/api/tools/file/manage/route.ts @@ -1,5 +1,4 @@ import { Buffer, isUtf8 } from 'buffer' -import type { Readable } from 'stream' import { AuditAction, AuditResourceType, recordAudit } from '@sim/audit' import { createLogger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' @@ -21,6 +20,16 @@ import { ShareValidationError, upsertFileShare, } from '@/lib/public-shares/share-manager' +import { + inflateEntryWithinCaps, + isSymlinkEntry, + MAX_ARCHIVE_BYTES as MAX_DECOMPRESS_ARCHIVE_BYTES, + MAX_ARCHIVE_ENTRIES as MAX_DECOMPRESS_ENTRIES, + MAX_ARCHIVE_ENTRY_BYTES as MAX_DECOMPRESS_ENTRY_BYTES, + MAX_ARCHIVE_TOTAL_BYTES as MAX_DECOMPRESS_TOTAL_BYTES, + readEntryUncompressedSize, + sanitizeArchiveEntryPath, +} from '@/lib/uploads/archive' import { ensureWorkspaceFileFolderPath } from '@/lib/uploads/contexts/workspace/workspace-file-folder-manager' import { fetchWorkspaceFileBuffer, @@ -199,102 +208,6 @@ const uniqueZipEntryName = (name: string, usedNames: Set): string => { return candidate } -/** Input archive download cap for the decompress operation. */ -const MAX_DECOMPRESS_ARCHIVE_BYTES = 100 * 1024 * 1024 -/** Maximum number of entries extracted from a single archive. */ -const MAX_DECOMPRESS_ENTRIES = 1000 -/** Maximum uncompressed size for any single archive entry. */ -const MAX_DECOMPRESS_ENTRY_BYTES = 100 * 1024 * 1024 -/** Maximum total uncompressed size across all entries, to bound zip-bomb expansion. */ -const MAX_DECOMPRESS_TOTAL_BYTES = 200 * 1024 * 1024 - -const S_IFMT = 0o170000 -const S_IFLNK = 0o120000 - -/** - * Read a zip entry's declared uncompressed size without materializing it. 
This - * value comes straight from the (attacker-controlled) ZIP metadata, so it is only - * usable as a cheap fast-reject for honestly-declared archives — never as the - * authoritative cap. {@link inflateEntryWithinCaps} enforces the real limit on the - * inflated byte stream. - */ -const readEntryUncompressedSize = (entry: JSZip.JSZipObject): number | undefined => { - const data = (entry as JSZip.JSZipObject & { _data?: { uncompressedSize?: number } })._data - const size = data?.uncompressedSize - return typeof size === 'number' && Number.isFinite(size) ? size : undefined -} - -type InflateResult = { ok: true; buffer: Buffer } | { ok: false; reason: 'entry' | 'total' } - -/** - * Inflate a single zip entry through a streaming counting sink, tearing the - * stream down the moment cumulative output would exceed the per-entry cap or the - * remaining total budget. The declared uncompressed size in the ZIP header is - * attacker-controlled and is NOT trusted here: a forged-small or absent size - * cannot cause the full (potentially gigabyte-scale) entry to be materialized in - * memory, because enforcement happens on the actual inflated bytes as they - * arrive. Peak memory is bounded by the cap plus one DEFLATE chunk. 
- */ -const inflateEntryWithinCaps = ( - entry: JSZip.JSZipObject, - remainingTotalBudget: number -): Promise => - new Promise((resolve, reject) => { - const chunks: Buffer[] = [] - let size = 0 - let settled = false - const stream = entry.nodeStream() as Readable - - const settle = (result: InflateResult) => { - if (settled) return - settled = true - stream.destroy() - resolve(result) - } - - stream.on('data', (chunk: Buffer) => { - size += chunk.length - if (size > MAX_DECOMPRESS_ENTRY_BYTES) { - settle({ ok: false, reason: 'entry' }) - return - } - if (size > remainingTotalBudget) { - settle({ ok: false, reason: 'total' }) - return - } - chunks.push(chunk) - }) - stream.on('end', () => settle({ ok: true, buffer: Buffer.concat(chunks, size) })) - stream.on('error', (error) => { - if (settled) return - settled = true - stream.destroy() - reject(error) - }) - }) - -/** True when a zip entry's unix mode marks it as a symlink (never extracted). */ -const isSymlinkEntry = (entry: JSZip.JSZipObject): boolean => { - const mode = (entry as JSZip.JSZipObject & { unixPermissions?: number | null }).unixPermissions - return typeof mode === 'number' && (mode & S_IFMT) === S_IFLNK -} - -/** - * Normalize a zip entry path into safe workspace folder segments, guarding against - * zip-slip. Returns null for traversal (`..`), so the entry is skipped rather than - * written outside its intended location. 
- */ -const sanitizeArchiveEntryPath = (rawPath: string): string[] | null => { - const segments = rawPath - .replace(/\\/g, '/') - .split('/') - .map((segment) => segment.trim()) - .filter((segment) => segment.length > 0 && segment !== '.') - - if (segments.length === 0 || segments.includes('..')) return null - return segments -} - const isLikelyTextBuffer = (buffer: Buffer): boolean => isUtf8(buffer) && !buffer.includes(0) /** diff --git a/apps/sim/lib/copilot/chat/payload.ts b/apps/sim/lib/copilot/chat/payload.ts index e718b33cce3..744d462aea0 100644 --- a/apps/sim/lib/copilot/chat/payload.ts +++ b/apps/sim/lib/copilot/chat/payload.ts @@ -7,13 +7,20 @@ import type { VfsSnapshotV1 } from '@/lib/copilot/generated/vfs-snapshot-v1' import { getExposedIntegrationTools } from '@/lib/copilot/integration-tools' import { getToolEntry } from '@/lib/copilot/tool-executor/router' import { getCopilotToolDescription } from '@/lib/copilot/tools/descriptions' +import { + type ChatUploadArchiveEntry, + listChatUploadArchiveEntries, +} from '@/lib/copilot/tools/handlers/upload-file-reader' import { encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' import { isE2BDocEnabled, isHosted } from '@/lib/core/config/env-flags' import { buildUserSkillTool } from '@/lib/mothership/skills' import { trackChatUpload } from '@/lib/uploads/contexts/workspace/workspace-file-manager' +import { isArchiveFileName } from '@/lib/uploads/utils/file-utils' import { stripVersionSuffix } from '@/tools/utils' const logger = createLogger('CopilotChatPayload') +/** Max archive entries listed inline in the upload context before truncating. 
*/ +const MAX_UPLOAD_TREE_ENTRIES = 50 const INTEGRATION_TOOL_SCHEMA_CACHE_TTL_MS = 5_000 const INTEGRATION_TOOL_SCHEMA_CACHE_MAX_ENTRIES = 500 @@ -297,15 +304,56 @@ export async function buildCopilotRequestPayload( } catch { encodedUploadName = displayName } - const lines = [ - `File "${displayName}" (${mediaType}, ${f.size} bytes) uploaded.`, - `Read with: read("uploads/${encodedUploadName}")`, - `To save permanently: materialize_file(fileName: "${displayName}")`, - ] - if (displayName.endsWith('.json')) { - lines.push( - `To import as a workflow: materialize_file(fileName: "${displayName}", operation: "import")` - ) + let lines: string[] + if (isArchiveFileName(displayName)) { + // An archive is presented as a virtual folder. Show a capped file tree + // up front so the agent sees the contents without a glob round-trip; + // degrade to a glob hint if the tree can't be built (never block send). + let entries: ChatUploadArchiveEntry[] | null = null + try { + entries = await listChatUploadArchiveEntries(displayName, chatId) + } catch (treeErr) { + logger.warn('Failed to build archive upload tree', { + filename, + chatId, + error: toError(treeErr).message, + }) + } + if (entries && entries.length > 0) { + const shown = entries.slice(0, MAX_UPLOAD_TREE_ENTRIES) + const treeLines = shown.map((entry) => ` ${entry.path}`) + if (entries.length > MAX_UPLOAD_TREE_ENTRIES) { + treeLines.push(` … and ${entries.length - MAX_UPLOAD_TREE_ENTRIES} more`) + } + lines = [ + `Archive "${displayName}" (${mediaType}, ${f.size} bytes) uploaded — ${ + entries.length + } file${entries.length === 1 ? 
'' : 's'}:`, + ...treeLines, + '', + `List entries with: glob("uploads/${encodedUploadName}/**")`, + `Read an entry with: read("uploads/${encodedUploadName}/")`, + `To save the archive permanently: materialize_file(fileName: "${displayName}")`, + ] + } else { + lines = [ + `Archive "${displayName}" (${mediaType}, ${f.size} bytes) uploaded.`, + `List entries with: glob("uploads/${encodedUploadName}/**")`, + `Read an entry with: read("uploads/${encodedUploadName}/")`, + `To save the archive permanently: materialize_file(fileName: "${displayName}")`, + ] + } + } else { + lines = [ + `File "${displayName}" (${mediaType}, ${f.size} bytes) uploaded.`, + `Read with: read("uploads/${encodedUploadName}")`, + `To save permanently: materialize_file(fileName: "${displayName}")`, + ] + if (displayName.endsWith('.json')) { + lines.push( + `To import as a workflow: materialize_file(fileName: "${displayName}", operation: "import")` + ) + } } uploadContexts.push({ type: 'uploaded_file', diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts index 065b287b99e..f3a8ec0b498 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts @@ -2,25 +2,47 @@ * @vitest-environment node */ +import { Buffer } from 'buffer' import { dbChainMock, dbChainMockFns, resetDbChainMock } from '@sim/testing' +import JSZip from 'jszip' import { beforeEach, describe, expect, it, vi } from 'vitest' vi.mock('@sim/db', () => dbChainMock) -const { mockReadFileRecord } = vi.hoisted(() => ({ - mockReadFileRecord: vi.fn(), -})) +const { mockReadFileRecord, mockRenderFileBuffer, mockFetchWorkspaceFileBuffer } = vi.hoisted( + () => ({ + mockReadFileRecord: vi.fn(), + // Echo the entry bytes back as text so a successful resolve is observable. 
+ mockRenderFileBuffer: vi.fn(async (buffer: Buffer) => ({ + content: buffer.toString('utf-8'), + totalLines: 1, + })), + mockFetchWorkspaceFileBuffer: vi.fn(), + }) +) vi.mock('@/lib/copilot/vfs/file-reader', () => ({ readFileRecord: mockReadFileRecord, + renderFileBuffer: mockRenderFileBuffer, +})) +vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({ + fetchWorkspaceFileBuffer: mockFetchWorkspaceFileBuffer, })) import { findMothershipUploadRowByChatAndName, + grepChatUploadPath, + listChatUploadArchiveEntries, listChatUploads, - readChatUpload, + readChatUploadPath, } from './upload-file-reader' +async function buildZip(files: Record): Promise { + const zip = new JSZip() + for (const [name, content] of Object.entries(files)) zip.file(name, content) + return Buffer.from(await zip.generateAsync({ type: 'uint8array' })) +} + const CHAT_ID = '11111111-1111-1111-1111-111111111111' const NOW = new Date('2026-05-05T00:00:00.000Z') @@ -117,6 +139,22 @@ describe('findMothershipUploadRowByChatAndName', () => { expect(result?.id).toBe('wf_3') }) + + it('resolves a literal-% name via its encoded glob form', async () => { + // Stored name has a literal `%`; glob/upload-context expose it double-encoded + // (`test%252A.zip`). The encoded-form fallback recovers the row. 
+ const row = makeRow({ + id: 'wf_pct', + displayName: 'test%2A.zip', + contentType: 'application/zip', + }) + mockOrderByThenLimit([]) + dbChainMockFns.orderBy.mockResolvedValueOnce([row] as never) + + const result = await findMothershipUploadRowByChatAndName(CHAT_ID, 'test%252A.zip') + + expect(result?.id).toBe('wf_pct') + }) }) describe('listChatUploads', () => { @@ -147,7 +185,7 @@ describe('listChatUploads', () => { }) }) -describe('readChatUpload', () => { +describe('readChatUploadPath (plain upload)', () => { beforeEach(() => { vi.clearAllMocks() resetDbChainMock() @@ -159,7 +197,7 @@ describe('readChatUpload', () => { mockOrderByThenLimit([row]) mockReadFileRecord.mockResolvedValueOnce({ content: 'PNGDATA', totalLines: 1 }) - const result = await readChatUpload('image (2).png', CHAT_ID) + const result = await readChatUploadPath('image (2).png', '', CHAT_ID) expect(result).toEqual({ content: 'PNGDATA', totalLines: 1 }) expect(mockReadFileRecord).toHaveBeenCalledWith( @@ -167,13 +205,148 @@ describe('readChatUpload', () => { ) }) + it('ignores a trailing habit suffix on a non-archive upload', async () => { + const row = makeRow({ id: 'wf_3', displayName: 'report.csv', contentType: 'text/csv' }) + mockOrderByThenLimit([row]) + mockReadFileRecord.mockResolvedValueOnce({ content: 'a,b', totalLines: 1 }) + + const result = await readChatUploadPath('report.csv', 'content', CHAT_ID) + + expect(result).toEqual({ content: 'a,b', totalLines: 1 }) + expect(mockReadFileRecord).toHaveBeenCalledWith(expect.objectContaining({ name: 'report.csv' })) + }) + it('returns null when no row matches', async () => { mockOrderByThenLimit([]) dbChainMockFns.orderBy.mockResolvedValueOnce([] as never) - const result = await readChatUpload('nope.png', CHAT_ID) + const result = await readChatUploadPath('nope.png', '', CHAT_ID) expect(result).toBeNull() expect(mockReadFileRecord).not.toHaveBeenCalled() }) }) + +describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => 
{ + beforeEach(() => { + vi.clearAllMocks() + resetDbChainMock() + }) + + it('lists archive entries as encoded VFS paths', async () => { + const buffer = await buildZip({ 'report.pdf': 'x', 'data/sheet.csv': 'a,b' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const entries = await listChatUploadArchiveEntries('bundle.zip', CHAT_ID) + + expect(entries?.map((e) => e.vfsPath).sort()).toEqual([ + 'uploads/bundle.zip/data/sheet.csv', + 'uploads/bundle.zip/report.pdf', + ]) + }) + + it('de-duplicates entries that collapse to one VFS key (NFC/NFD, ./ prefix)', async () => { + // "café.txt" stored twice (NFC precomposed + NFD decomposed) plus a + // ./-prefixed duplicate of a/b.txt — all collapse to the same VFS path, so + // only one of each must be listed (otherwise the second is unreachable). + const nfc = `caf\u00e9.txt` // precomposed e-acute + const nfd = `cafe\u0301.txt` // e + combining acute + expect(nfc).not.toBe(nfd) + const buffer = await buildZip({ + [nfc]: 'nfc', + [nfd]: 'nfd', + 'a/b.txt': 'first', + './a/b.txt': 'dup', + }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const entries = await listChatUploadArchiveEntries('bundle.zip', CHAT_ID) + const vfsPaths = entries?.map((e) => e.vfsPath) ?? 
[] + + expect(vfsPaths.filter((p) => p === 'uploads/bundle.zip/caf%C3%A9.txt')).toHaveLength(1) + expect(vfsPaths.filter((p) => p === 'uploads/bundle.zip/a/b.txt')).toHaveLength(1) + }) + + it('reads a nested entry by its exact path', async () => { + const buffer = await buildZip({ 'data/sheet.csv': 'a,b\n1,2' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const result = await readChatUploadPath('bundle.zip', 'data/sheet.csv', CHAT_ID) + + expect(result?.content).toBe('a,b\n1,2') + }) + + it('resolves a unicode (NFD) entry addressed by its NFC-encoded glob path', async () => { + // macOS-authored zip: entry name stored decomposed (e + combining acute). + const nfdName = `cafe\u0301.txt` // NFD: e + combining acute + const buffer = await buildZip({ [nfdName]: 'latte' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + // The agent reads back the encoded path glob produced (NFC, percent-encoded). + const result = await readChatUploadPath('bundle.zip', 'caf%C3%A9.txt', CHAT_ID) + + expect(result?.content).toBe('latte') + }) + + it('falls back to the manifest (with a note) when the entry is not found', async () => { + const buffer = await buildZip({ 'present.txt': 'x' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + // Covers the /content habit suffix and plain typos uniformly. 
+ const result = await readChatUploadPath('bundle.zip', 'content', CHAT_ID) + + expect(result?.content).toContain('Entry "content" not found in "bundle.zip"') + expect(result?.content).toContain('present.txt') + }) + + it('surfaces an archive error on a nested read instead of null', async () => { + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(Buffer.from('not a zip at all')) + + const result = await readChatUploadPath('bundle.zip', 'entry.txt', CHAT_ID) + + expect(result?.content).toContain('Not a valid .zip archive') + }) + + it('rejects an oversized archive WITHOUT downloading it', async () => { + mockOrderByThenLimit([ + makeRow({ + displayName: 'huge.zip', + contentType: 'application/zip', + size: 200 * 1024 * 1024, // 200MB > 100MB cap + }), + ]) + + const result = await readChatUploadPath('huge.zip', 'anything.txt', CHAT_ID) + + expect(result?.content).toContain('[Archive too large to read: huge.zip') + expect(mockFetchWorkspaceFileBuffer).not.toHaveBeenCalled() + }) + + it('returns the file-tree manifest for a bare archive read', async () => { + const buffer = await buildZip({ 'report.pdf': 'x', 'data/sheet.csv': 'a,b' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const result = await readChatUploadPath('bundle.zip', '', CHAT_ID) + + expect(result?.content).toContain('Archive "bundle.zip" — 2 files') + expect(result?.content).toContain('report.pdf') + expect(result?.content).toContain('data/sheet.csv') + }) + + it('refuses to grep a bare archive, guiding the agent to an entry', async () => { + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + + await expect(grepChatUploadPath('bundle.zip', '', CHAT_ID, 'pattern')).rejects.toThrow( + /Cannot grep an archive directly/ + ) + // The archive bytes 
are never downloaded or grepped as a binary blob. + expect(mockFetchWorkspaceFileBuffer).not.toHaveBeenCalled() + }) +}) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index 0e914229c8a..e0fa59e0e39 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -3,7 +3,11 @@ import { workspaceFiles } from '@sim/db/schema' import { createLogger } from '@sim/logger' import { toError } from '@sim/utils/errors' import { and, asc, desc, eq, isNull, or } from 'drizzle-orm' -import { type FileReadResult, readFileRecord } from '@/lib/copilot/vfs/file-reader' +import { + type FileReadResult, + readFileRecord, + renderFileBuffer, +} from '@/lib/copilot/vfs/file-reader' import { type GrepCountEntry, type GrepMatch, @@ -13,7 +17,21 @@ import { } from '@/lib/copilot/vfs/operations' import { decodeVfsSegment, encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' import { getServePathPrefix } from '@/lib/uploads' -import type { WorkspaceFileRecord } from '@/lib/uploads/contexts/workspace/workspace-file-manager' +import { + ArchiveError, + extractArchiveEntry, + listArchiveEntries, + MAX_ARCHIVE_BYTES, +} from '@/lib/uploads/archive' +import { + fetchWorkspaceFileBuffer, + type WorkspaceFileRecord, +} from '@/lib/uploads/contexts/workspace/workspace-file-manager' +import { + getFileExtension, + getMimeTypeFromExtension, + isArchiveFileName, +} from '@/lib/uploads/utils/file-utils' const logger = createLogger('UploadFileReader') @@ -38,6 +56,21 @@ function canonicalUploadKey(name: string): string { } } +/** + * Per-segment encode of a stored name (no decode first), so a name containing a + * literal `%` (e.g. `test%2A.zip`) round-trips: glob/upload-context expose it as + * `encodeVfsSegment(name)`, and matching that encoded form back recovers the row. 
+ * {@link canonicalUploadKey} can't, because it decodes the input first and a + * literal `%2A` is indistinguishable from an encoded `*`. + */ +function encodeUploadName(name: string): string { + try { + return encodeVfsSegment(name) + } catch { + return name.trim() + } +} + /** VFS-visible name. Coalesces to originalName for legacy rows that predate displayName. */ function vfsName(row: typeof workspaceFiles.$inferSelect): string { return row.displayName ?? row.originalName @@ -109,7 +142,15 @@ export async function findMothershipUploadRowByChatAndName( .orderBy(desc(workspaceFiles.uploadedAt), desc(workspaceFiles.id)) const segmentKey = canonicalUploadKey(fileName) - return allRows.find((r) => canonicalUploadKey(vfsName(r)) === segmentKey) ?? null + return ( + allRows.find((r) => { + const stored = vfsName(r) + // Canonical-key match handles visually-equivalent spellings (U+202F vs + // space); the encoded-form match handles literal `%` names that survive + // encode but not decode. + return canonicalUploadKey(stored) === segmentKey || encodeUploadName(stored) === fileName + }) ?? null + ) } /** @@ -140,21 +181,248 @@ export async function listChatUploads(chatId: string): Promise MAX_ARCHIVE_BYTES +} + +/** Placeholder for an archive too large to download and extract inline. */ +function archiveTooLargeResult(record: WorkspaceFileRecord): FileReadResult { + return { + content: `[Archive too large to read: ${record.name} (${Math.round( + record.size / 1024 / 1024 + )}MB, limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB)]`, + totalLines: 1, + } +} + +/** Decode each `/`-separated segment of a VFS entry path back to its real name. */ +function decodeEntryPath(raw: string): string { + return raw + .split('/') + .map((segment) => { + try { + return decodeVfsSegment(segment) + } catch { + return segment + } + }) + .join('/') +} + +/** Re-encode a real `/`-joined entry path into its VFS-safe per-segment form. 
*/ +function encodeEntryPath(path: string): string { + return path + .split('/') + .map((segment) => encodeVfsSegment(segment)) + .join('/') +} + +/** + * Canonical per-segment-encoded key for an archive entry path. Returns null for + * paths that cannot be encoded (empty/dot segments). */ -export async function readChatUpload( - filename: string, +function archiveEntryKey(path: string): string | null { + try { + return encodeEntryPath(path) + } catch { + return null + } +} + +/** + * De-duplicate raw entry paths by their canonical VFS key (first wins), so two + * entries that differ only in a form the VFS normalizes away (NFC vs NFD, U+202F + * vs space, collapsed whitespace) collapse to one listed path. This matches how + * {@link findArchiveEntryRawPath} resolves a read — first entry whose key matches + * — so every listed path is reachable and none is silently shadowed. + */ +function dedupeArchiveEntriesByKey(paths: string[]): string[] { + const seen = new Set() + const result: string[] = [] + for (const path of paths) { + const key = archiveEntryKey(path) ?? path + if (seen.has(key)) continue + seen.add(key) + result.push(path) + } + return result +} + +/** + * Resolve a requested entry path (percent-encoded as the agent received it from + * glob, or the raw display form from the manifest) to the archive's exact stored + * path. Matching is on the canonical key so the NFC + whitespace normalization + * `encodeVfsSegment` applies stays symmetric between the listed paths and the + * read request — otherwise a macOS-authored (NFD / U+202F) entry name would list + * but never resolve. Returns null when nothing matches. + */ +async function findArchiveEntryRawPath( + archiveBuffer: Buffer, + requestedEntryPath: string +): Promise { + const wantedKey = archiveEntryKey(decodeEntryPath(requestedEntryPath)) + if (!wantedKey) return null + const entries = await listArchiveEntries(archiveBuffer) + return entries.find((entry) => archiveEntryKey(entry) === wantedKey) ?? 
null +} + +/** A single entry within an uploaded archive, with both its real and VFS paths. */ +export interface ChatUploadArchiveEntry { + /** Real sanitized path inside the archive (e.g. `data/sheet.csv`). */ + path: string + /** VFS path the agent uses to read it (e.g. `uploads/archive.zip/data/sheet.csv`). */ + vfsPath: string +} + +/** + * List the entries of an uploaded archive as VFS paths. Returns null when + * `zipName` is not an archive upload in this chat; returns `[]` when the archive + * is unreadable or empty (logged) so the caller still surfaces the archive leaf. + */ +export async function listChatUploadArchiveEntries( + zipName: string, chatId: string +): Promise { + const row = await findMothershipUploadRowByChatAndName(chatId, zipName) + if (!row) return null + const record = toWorkspaceFileRecord(row) + if (!isArchiveUpload(record)) return null + if (exceedsArchiveReadCap(record)) { + logger.warn('Archive too large to list entries', { zipName, chatId, size: record.size }) + return [] + } + + const encodedZip = encodeUploadName(record.name) + try { + const buffer = await fetchWorkspaceFileBuffer(record, { maxBytes: MAX_ARCHIVE_BYTES }) + const entries = dedupeArchiveEntriesByKey(await listArchiveEntries(buffer)) + return entries.map((path) => ({ + path, + vfsPath: `uploads/${encodedZip}/${encodeEntryPath(path)}`, + })) + } catch (err) { + logger.warn('Failed to list archive entries', { + zipName, + chatId, + error: toError(err).message, + }) + return [] + } +} + +/** + * Render one archive entry from the archive buffer with the same extraction + * logic as a stored upload. Returns null when the entry is genuinely missing; + * returns a bracketed placeholder for any {@link ArchiveError} (invalid archive, + * too many entries, oversized entry) — matching {@link buildArchiveManifest} so a + * nested read surfaces the real reason instead of the VFS "Upload not found". 
+ */ +async function readArchiveEntry( + archiveBuffer: Buffer, + entryPath: string ): Promise { try { - const row = await findMothershipUploadRowByChatAndName(chatId, filename) + const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) + if (!rawPath) return null + const entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + if (!entryBuffer) return null + const ext = getFileExtension(rawPath) + return renderFileBuffer(entryBuffer, { + name: rawPath, + type: getMimeTypeFromExtension(ext), + ext, + }) + } catch (err) { + if (err instanceof ArchiveError) { + return { content: `[${err.message}]`, totalLines: 1 } + } + throw err + } +} + +/** + * Build a file-tree manifest for an archive (`read("uploads/x.zip")`), so the + * agent gets the contents instead of binary bytes. An optional `note` is + * prepended — used to tell the agent a requested entry was not found while still + * showing the valid paths. Returns a placeholder result when the archive is + * unreadable. + */ +async function buildArchiveManifest( + record: WorkspaceFileRecord, + archiveBuffer: Buffer, + note?: string +): Promise { + const encodedZip = encodeUploadName(record.name) + try { + const entries = dedupeArchiveEntriesByKey(await listArchiveEntries(archiveBuffer)) + const header = `Archive "${record.name}" — ${entries.length} file${ + entries.length === 1 ? '' : 's' + }. Read an entry with read("uploads/${encodedZip}/").` + const content = [...(note ? [note, ''] : []), header, '', ...entries].join('\n') + return { content, totalLines: content.split('\n').length } + } catch (err) { + if (err instanceof ArchiveError) { + return { content: `[${err.message}]`, totalLines: 1 } + } + throw err + } +} + +/** + * Read a chat upload addressed by its first path segment and an optional entry + * path, resolving the upload row exactly once. 
A plain upload renders directly + * (a trailing habit suffix like `/content` is ignored); an archive returns the + * addressed entry, or its file-tree manifest when no entry is given. Resolves + * names like {@link findMothershipUploadRowByChatAndName} so visually equivalent + * spellings (e.g. macOS U+202F vs ASCII space) still match. + */ +export async function readChatUploadPath( + firstSegment: string, + entryPath: string, + chatId: string +): Promise { + try { + const row = await findMothershipUploadRowByChatAndName(chatId, firstSegment) if (!row) return null - return readFileRecord(toWorkspaceFileRecord(row)) + const record = toWorkspaceFileRecord(row) + if (!isArchiveUpload(record)) { + return await readFileRecord(record) + } + if (exceedsArchiveReadCap(record)) { + return archiveTooLargeResult(record) + } + const archiveBuffer = await fetchWorkspaceFileBuffer(record, { maxBytes: MAX_ARCHIVE_BYTES }) + if (!entryPath) { + return await buildArchiveManifest(record, archiveBuffer) + } + const entry = await readArchiveEntry(archiveBuffer, entryPath) + if (entry) return entry + // Entry not found — show the manifest so the agent can pick a valid path. + // Handles a stray `/content` habit suffix (carried over from files/) and + // plain typos uniformly, without special-casing any segment name. + return await buildArchiveManifest( + record, + archiveBuffer, + `Entry "${decodeEntryPath(entryPath)}" not found in "${record.name}".` + ) } catch (err) { logger.warn('Failed to read chat upload', { - filename, + firstSegment, + entryPath, chatId, error: toError(err).message, }) @@ -163,30 +431,81 @@ export async function readChatUpload( } /** - * Grep the content of a single chat upload (`uploads/`), mirroring - * {@link WorkspaceVFS.grepFile} for the chat-scoped uploads namespace. Resolves - * the upload by name (raw or percent-encoded), reads its text per file type, and - * greps it. 
Throws {@link WorkspaceFileGrepError} when the upload is missing or - * has no searchable text (image/binary/too-large) so the caller surfaces the - * message verbatim. + * Grep a chat upload addressed by its first path segment and an optional entry + * path, resolving the upload row exactly once and mirroring + * {@link WorkspaceVFS.grepFile} for the chat-scoped namespace. An archive entry + * is grepped from the archive; otherwise the upload itself is grepped (a trailing + * habit suffix on a non-archive is ignored). Throws {@link WorkspaceFileGrepError} + * when the upload/entry is missing or has no searchable text so the caller + * surfaces the message verbatim. */ -export async function grepChatUpload( - filename: string, +export async function grepChatUploadPath( + firstSegment: string, + entryPath: string, chatId: string, pattern: string, options?: GrepOptions ): Promise { - const row = await findMothershipUploadRowByChatAndName(chatId, filename) + const row = await findMothershipUploadRowByChatAndName(chatId, firstSegment) if (!row) { throw new WorkspaceFileGrepError( - `Upload not found: "${filename}". Use glob("uploads/*") to list available uploads.` + `Upload not found: "${firstSegment}". 
Use glob("uploads/*") to list available uploads.` ) } const record = toWorkspaceFileRecord(row) + + if (entryPath && isArchiveUpload(record)) { + if (exceedsArchiveReadCap(record)) { + throw new WorkspaceFileGrepError( + `Archive too large to grep: "${record.name}" (limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB).` + ) + } + const archiveBuffer = await fetchWorkspaceFileBuffer(record, { maxBytes: MAX_ARCHIVE_BYTES }) + try { + const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) + if (!rawPath) { + throw new WorkspaceFileGrepError( + `Archive entry not found: "${decodeEntryPath(entryPath)}" in "${record.name}".` + ) + } + const entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + if (!entryBuffer) { + throw new WorkspaceFileGrepError( + `Archive entry not found: "${rawPath}" in "${record.name}".` + ) + } + const ext = getFileExtension(rawPath) + const result = await renderFileBuffer(entryBuffer, { + name: rawPath, + type: getMimeTypeFromExtension(ext), + ext, + }) + const uploadsPath = `uploads/${encodeUploadName(record.name)}/${encodeEntryPath(rawPath)}` + return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) + } catch (err) { + // Surface archive failures (invalid/too-many/oversized) as a grep error + // with the real reason rather than a generic internal failure. + if (err instanceof ArchiveError) { + throw new WorkspaceFileGrepError(err.message) + } + throw err + } + } + + // A bare archive has no searchable text of its own — guide the agent to target + // an entry (or read the archive to list them) rather than grepping its bytes. + if (isArchiveUpload(record)) { + throw new WorkspaceFileGrepError( + `Cannot grep an archive directly. Grep an entry (e.g. 
grep path: "uploads/${encodeUploadName( + record.name + )}/") or read("uploads/${encodeUploadName(record.name)}") to list its contents.` + ) + } + const result = await readFileRecord(record) if (!result) { - throw new WorkspaceFileGrepError(`Upload content not found for "${filename}".`) + throw new WorkspaceFileGrepError(`Upload content not found for "${firstSegment}".`) } - const uploadsPath = `uploads/${canonicalUploadKey(record.name)}` + const uploadsPath = `uploads/${encodeUploadName(record.name)}` return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) } diff --git a/apps/sim/lib/copilot/tools/handlers/vfs.test.ts b/apps/sim/lib/copilot/tools/handlers/vfs.test.ts index 72eea0cefb9..82416d350f0 100644 --- a/apps/sim/lib/copilot/tools/handlers/vfs.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/vfs.test.ts @@ -9,19 +9,24 @@ const { getOrMaterializeVFS } = vi.hoisted(() => ({ getOrMaterializeVFS: vi.fn(), })) -const { readChatUpload, listChatUploads, grepChatUpload } = vi.hoisted(() => ({ - readChatUpload: vi.fn(), - listChatUploads: vi.fn(), - grepChatUpload: vi.fn(), -})) +const { readChatUploadPath, listChatUploads, grepChatUploadPath, listChatUploadArchiveEntries } = + vi.hoisted(() => ({ + readChatUploadPath: vi.fn(), + listChatUploads: vi.fn(), + grepChatUploadPath: vi.fn(), + // Defaults to null (not an archive) so archive glob expansion is a no-op + // unless a test opts in. 
+ listChatUploadArchiveEntries: vi.fn().mockResolvedValue(null), + })) vi.mock('@/lib/copilot/vfs', () => ({ getOrMaterializeVFS, })) vi.mock('./upload-file-reader', () => ({ - readChatUpload, + readChatUploadPath, listChatUploads, - grepChatUpload, + grepChatUploadPath, + listChatUploadArchiveEntries, })) import { WorkspaceFileGrepError } from '@/lib/copilot/vfs/operations' @@ -305,7 +310,7 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { await executeVfsGrep({ pattern: 'secret' }, GREP_CTX_CHAT) - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() expect(vfs.grep).toHaveBeenCalledWith('secret', undefined, expect.any(Object)) }) @@ -316,11 +321,11 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { await executeVfsGrep({ pattern: 'secret', path: 'files/report.csv' }, GREP_CTX_CHAT) - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() }) - it('routes an explicit uploads/ path to grepChatUpload', async () => { - grepChatUpload.mockResolvedValue([{ path: 'uploads/report.json', line: 1, content: 'hit' }]) + it('routes an explicit uploads/ path to grepChatUploadPath', async () => { + grepChatUploadPath.mockResolvedValue([{ path: 'uploads/report.json', line: 1, content: 'hit' }]) const result = await executeVfsGrep( { pattern: 'hit', path: 'uploads/report.json' }, @@ -328,8 +333,9 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { ) expect(result.success).toBe(true) - expect(grepChatUpload).toHaveBeenCalledWith( + expect(grepChatUploadPath).toHaveBeenCalledWith( 'report.json', + '', 'chat-1', 'hit', expect.objectContaining({ maxResults: 50 }) @@ -342,7 +348,7 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { expect(result.success).toBe(false) expect(result.error).toContain('single upload') - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() }) 
it('errors when grepping uploads without chat context', async () => { @@ -350,11 +356,11 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { expect(result.success).toBe(false) expect(result.error).toContain('No chat context') - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() }) it('surfaces an upload-not-found grep error verbatim', async () => { - grepChatUpload.mockRejectedValue( + grepChatUploadPath.mockRejectedValue( new WorkspaceFileGrepError( 'Upload not found: "ghost.json". Use glob("uploads/*") to list available uploads.' ) @@ -382,26 +388,134 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { expect((broad.output as { files: string[] }).files).not.toContain('uploads/My%20Report.json') }) - it('reads an upload directly, tolerating a spurious /content suffix', async () => { + it('reads an upload directly, passing the first segment and any trailing suffix', async () => { const vfs = makeVfs() getOrMaterializeVFS.mockResolvedValue(vfs) - readChatUpload.mockResolvedValue({ content: 'hello upload', totalLines: 1 }) + readChatUploadPath.mockResolvedValue({ content: 'hello upload', totalLines: 1 }) const bare = await executeVfsRead({ path: 'uploads/report.csv' }, GREP_CTX_CHAT) expect(bare.success).toBe(true) - expect(readChatUpload).toHaveBeenLastCalledWith('report.csv', 'chat-1') + expect(readChatUploadPath).toHaveBeenLastCalledWith('report.csv', '', 'chat-1') - // The model adds /content out of habit (from files/) — it must still resolve. + // The model adds /content out of habit (from files/); the trailing segment is + // forwarded and ignored by readChatUploadPath for a non-archive upload. 
const withContent = await executeVfsRead({ path: 'uploads/report.csv/content' }, GREP_CTX_CHAT) expect(withContent.success).toBe(true) - expect(readChatUpload).toHaveBeenLastCalledWith('report.csv', 'chat-1') + expect(readChatUploadPath).toHaveBeenLastCalledWith('report.csv', 'content', 'chat-1') }) - it('tolerates a trailing /content on an uploads grep path', async () => { - grepChatUpload.mockResolvedValue([]) + it('forwards a trailing segment on an uploads grep path', async () => { + grepChatUploadPath.mockResolvedValue([]) await executeVfsGrep({ pattern: 'x', path: 'uploads/report.json/content' }, GREP_CTX_CHAT) - expect(grepChatUpload).toHaveBeenCalledWith('report.json', 'chat-1', 'x', expect.any(Object)) + expect(grepChatUploadPath).toHaveBeenCalledWith( + 'report.json', + 'content', + 'chat-1', + 'x', + expect.any(Object) + ) + }) +}) + +describe('vfs archive uploads (virtual folders)', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const ARCHIVE_ENTRIES = [ + { path: 'report.pdf', vfsPath: 'uploads/bundle.zip/report.pdf' }, + { path: 'data/sheet.csv', vfsPath: 'uploads/bundle.zip/data/sheet.csv' }, + ] + + it('expands a recursive archive glob (/**) into all entry paths', async () => { + const vfs = makeVfs() + getOrMaterializeVFS.mockResolvedValue(vfs) + listChatUploads.mockResolvedValue([{ name: 'bundle.zip' }]) + listChatUploadArchiveEntries.mockResolvedValue(ARCHIVE_ENTRIES) + + const result = await executeVfsGlob({ pattern: 'uploads/bundle.zip/**' }, GREP_CTX_CHAT) + + expect(listChatUploadArchiveEntries).toHaveBeenCalledWith('bundle.zip', 'chat-1') + expect((result.output as { files: string[] }).files).toEqual( + expect.arrayContaining([ + 'uploads/bundle.zip', + 'uploads/bundle.zip/report.pdf', + 'uploads/bundle.zip/data/sheet.csv', + ]) + ) + }) + + it('honors glob depth — /* is top-level only, /data/* scopes to data', async () => { + const vfs = makeVfs() + getOrMaterializeVFS.mockResolvedValue(vfs) + listChatUploads.mockResolvedValue([{ 
name: 'bundle.zip' }]) + listChatUploadArchiveEntries.mockResolvedValue(ARCHIVE_ENTRIES) + + const topLevel = await executeVfsGlob({ pattern: 'uploads/bundle.zip/*' }, GREP_CTX_CHAT) + const topFiles = (topLevel.output as { files: string[] }).files + expect(topFiles).toContain('uploads/bundle.zip/report.pdf') + expect(topFiles).not.toContain('uploads/bundle.zip/data/sheet.csv') + + const scoped = await executeVfsGlob({ pattern: 'uploads/bundle.zip/data/*' }, GREP_CTX_CHAT) + const scopedFiles = (scoped.output as { files: string[] }).files + expect(scopedFiles).toContain('uploads/bundle.zip/data/sheet.csv') + expect(scopedFiles).not.toContain('uploads/bundle.zip/report.pdf') + }) + + it('does not expand archives for the broad uploads/* glob', async () => { + const vfs = makeVfs() + getOrMaterializeVFS.mockResolvedValue(vfs) + listChatUploads.mockResolvedValue([{ name: 'bundle.zip' }]) + + await executeVfsGlob({ pattern: 'uploads/*' }, GREP_CTX_CHAT) + + expect(listChatUploadArchiveEntries).not.toHaveBeenCalled() + }) + + it('forwards a nested archive entry read to readChatUploadPath', async () => { + readChatUploadPath.mockResolvedValue({ content: 'a,b\n1,2', totalLines: 2 }) + + const result = await executeVfsRead( + { path: 'uploads/bundle.zip/data/sheet.csv' }, + GREP_CTX_CHAT + ) + + expect(result.success).toBe(true) + expect(readChatUploadPath).toHaveBeenCalledWith('bundle.zip', 'data/sheet.csv', 'chat-1') + }) + + it('forwards a bare archive read to readChatUploadPath with no entry', async () => { + readChatUploadPath.mockResolvedValue({ + content: 'Archive "bundle.zip" — 2 files:\nreport.pdf\ndata/sheet.csv', + totalLines: 3, + }) + + const result = await executeVfsRead({ path: 'uploads/bundle.zip' }, GREP_CTX_CHAT) + + expect(result.success).toBe(true) + expect(readChatUploadPath).toHaveBeenCalledWith('bundle.zip', '', 'chat-1') + expect((result.output as { content: string }).content).toContain('Archive "bundle.zip"') + }) + + it('forwards a nested archive 
entry grep to grepChatUploadPath', async () => { + grepChatUploadPath.mockResolvedValue([ + { path: 'uploads/bundle.zip/notes.txt', line: 1, content: 'hit' }, + ]) + + const result = await executeVfsGrep( + { pattern: 'hit', path: 'uploads/bundle.zip/notes.txt' }, + GREP_CTX_CHAT + ) + + expect(result.success).toBe(true) + expect(grepChatUploadPath).toHaveBeenCalledWith( + 'bundle.zip', + 'notes.txt', + 'chat-1', + 'hit', + expect.any(Object) + ) }) }) diff --git a/apps/sim/lib/copilot/tools/handlers/vfs.ts b/apps/sim/lib/copilot/tools/handlers/vfs.ts index ca1902692e1..38807ee5ca3 100644 --- a/apps/sim/lib/copilot/tools/handlers/vfs.ts +++ b/apps/sim/lib/copilot/tools/handlers/vfs.ts @@ -4,9 +4,14 @@ import { TOOL_RESULT_MAX_INLINE_CHARS } from '@/lib/copilot/constants' import type { ExecutionContext, ToolCallResult } from '@/lib/copilot/request/types' import { getOrMaterializeVFS } from '@/lib/copilot/vfs' import type { GrepCountEntry, GrepMatch } from '@/lib/copilot/vfs/operations' -import { WorkspaceFileGrepError } from '@/lib/copilot/vfs/operations' +import { matchesVfsGlob, WorkspaceFileGrepError } from '@/lib/copilot/vfs/operations' import { encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' -import { grepChatUpload, listChatUploads, readChatUpload } from './upload-file-reader' +import { + grepChatUploadPath, + listChatUploadArchiveEntries, + listChatUploads, + readChatUploadPath, +} from './upload-file-reader' const logger = createLogger('VfsTools') @@ -40,6 +45,21 @@ function isChatUploadGrepPath(path: string | undefined): path is string { return /^uploads(\/|$)/.test(path.replace(/^\/+/, '')) } +/** + * Extract the concrete archive segment a glob reaches into, e.g. `archive.zip` + * from `uploads/archive.zip/*`. Returns null for the broad `uploads/*` listing + * or when the first segment is itself a glob, so archives stay single leaves + * until the model globs inside one specifically. 
+ */ +function parseArchiveGlobSegment(pattern: string): string | null { + const rest = pattern.replace(/^\/+/, '').replace(/^uploads\//, '') + const firstSlash = rest.indexOf('/') + if (firstSlash === -1) return null + const segment = rest.slice(0, firstSlash) + if (!segment || /[*?[\]{}]/.test(segment)) return null + return segment +} + function serializedResultSize(value: unknown): number { try { return JSON.stringify(value).length @@ -104,20 +124,29 @@ export async function executeVfsGrep( if (!context.chatId) { return { success: false, error: 'No chat context available for uploads/' } } - // The upload is the first segment after uploads/; any trailing segment - // (e.g. a /content suffix) is ignored, mirroring the uploads read path. - const filename = rawPath + // The upload is the first segment after uploads/. A further segment is + // either an archive entry (uploads//) or a habit suffix + // (e.g. a /content suffix), both handled by grepChatUploadPath. + const uploadSegments = rawPath .replace(/^\/+/, '') .replace(/^uploads\/?/, '') - .split('/')[0] - if (!filename) { + .split('/') + const firstSegment = uploadSegments[0] + const entryPath = uploadSegments.slice(1).join('/') + if (!firstSegment) { return { success: false, error: 'Grep over chat uploads must target a single upload (e.g. path: "uploads/report.json"). Use glob("uploads/*") to list uploads.', } } - result = await grepChatUpload(filename, context.chatId, pattern, grepOptions) + result = await grepChatUploadPath( + firstSegment, + entryPath, + context.chatId, + pattern, + grepOptions + ) } else { const vfs = await getOrMaterializeVFS(workspaceId, context.userId) result = isWorkspaceFileGrepPath(rawPath) @@ -185,6 +214,21 @@ export async function executeVfsGlob( // upload resolver accepts both the encoded path and the raw display name. 
const uploadPaths = uploads.map((f) => `uploads/${encodeUploadSegment(f.name)}`) files = [...files, ...uploadPaths] + + // Expand a specific archive's entries when the glob reaches inside it + // (uploads//*). Broad uploads/* keeps archives as single leaves. Entry + // paths are filtered through the same matcher as the VFS map, so the glob's + // depth (`/*` vs `/**` vs `/data/*`) is honored rather than dumping all. + const archiveSegment = parseArchiveGlobSegment(pattern) + if (archiveSegment) { + const entries = await listChatUploadArchiveEntries(archiveSegment, context.chatId) + if (entries) { + const matched = entries + .map((entry) => entry.vfsPath) + .filter((vfsPath) => matchesVfsGlob(vfsPath, pattern)) + files = [...files, ...matched] + } + } } logger.debug('vfs_glob result', { pattern, fileCount: files.length }) @@ -243,8 +287,13 @@ export async function executeVfsRead( if (!context.chatId) { return { success: false, error: 'No chat context available for uploads/' } } - const filename = path.slice('uploads/'.length).split('/')[0] - const uploadResult = await readChatUpload(filename, context.chatId) + // The upload is the first segment after uploads/. A further segment is + // either an archive entry (uploads//) or a habit suffix + // (e.g. a /content suffix), both handled by readChatUploadPath. 
+ const uploadSegments = path.slice('uploads/'.length).split('/') + const firstSegment = uploadSegments[0] + const entryPath = uploadSegments.slice(1).join('/') + const uploadResult = await readChatUploadPath(firstSegment, entryPath, context.chatId) if (uploadResult) { const isAttachment = hasModelAttachment(uploadResult) if ( diff --git a/apps/sim/lib/copilot/vfs/file-reader.test.ts b/apps/sim/lib/copilot/vfs/file-reader.test.ts index f4326b32035..7674e9d7fc9 100644 --- a/apps/sim/lib/copilot/vfs/file-reader.test.ts +++ b/apps/sim/lib/copilot/vfs/file-reader.test.ts @@ -15,7 +15,7 @@ vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({ fetchWorkspaceFileBuffer, })) -import { readFileRecord } from '@/lib/copilot/vfs/file-reader' +import { readFileRecord, renderFileBuffer } from '@/lib/copilot/vfs/file-reader' const MAX_IMAGE_READ_BYTES = 5 * 1024 * 1024 @@ -62,4 +62,74 @@ describe('readFileRecord', () => { }, SHARP_TEST_TIMEOUT_MS ) + + it('returns the binary placeholder for an unrenderable type WITHOUT downloading', async () => { + fetchWorkspaceFileBuffer.mockClear() + const result = await readFileRecord({ + id: 'wf_bin', + workspaceId: 'ws_1', + name: 'archive.bin', + key: 'uploads/archive.bin', + path: '/api/files/serve/uploads%2Farchive.bin?context=mothership', + size: 4_000_000_000, // 4 GB — must never be fetched into memory + type: 'application/octet-stream', + uploadedBy: 'user_1', + uploadedAt: new Date(), + deletedAt: null, + storageContext: 'mothership', + }) + + expect(result?.content).toContain('[Binary file: archive.bin') + expect(fetchWorkspaceFileBuffer).not.toHaveBeenCalled() + }) +}) + +describe('renderFileBuffer', () => { + it('renders readable text content verbatim with line counts', async () => { + const buffer = Buffer.from('line one\nline two\nline three') + const result = await renderFileBuffer(buffer, { + name: 'notes.txt', + type: 'text/plain', + ext: 'txt', + }) + expect(result.content).toBe('line one\nline 
two\nline three') + expect(result.totalLines).toBe(3) + expect(result.attachment).toBeUndefined() + }) + + it('renders csv and json by content type', async () => { + const csv = await renderFileBuffer(Buffer.from('a,b\n1,2'), { + name: 'data.csv', + type: 'text/csv', + ext: 'csv', + }) + expect(csv.content).toBe('a,b\n1,2') + + const json = await renderFileBuffer(Buffer.from('{"k":1}'), { + name: 'config.json', + type: 'application/json', + ext: 'json', + }) + expect(json.content).toBe('{"k":1}') + }) + + it('returns a binary placeholder for unrenderable types', async () => { + const result = await renderFileBuffer(Buffer.from([0x00, 0x01, 0x02, 0x03]), { + name: 'blob.dat', + type: 'application/octet-stream', + ext: 'dat', + }) + expect(result.content).toContain('[Binary file: blob.dat') + expect(result.attachment).toBeUndefined() + }) + + it('rejects oversized text without returning content', async () => { + const big = Buffer.alloc(MAX_IMAGE_READ_BYTES + 1, 0x61) // > 5MB of 'a' + const result = await renderFileBuffer(big, { + name: 'huge.txt', + type: 'text/plain', + ext: 'txt', + }) + expect(result.content).toContain('[File too large to display inline: huge.txt') + }) }) diff --git a/apps/sim/lib/copilot/vfs/file-reader.ts b/apps/sim/lib/copilot/vfs/file-reader.ts index 26388d5621a..9d3d112c418 100644 --- a/apps/sim/lib/copilot/vfs/file-reader.ts +++ b/apps/sim/lib/copilot/vfs/file-reader.ts @@ -274,6 +274,141 @@ export interface FileReadResult { } } +/** Placeholder returned when a text file exceeds the inline read budget. */ +function textTooLargeResult(name: string, size: number): FileReadResult { + return { + content: `[File too large to display inline: ${name} (${size} bytes, limit ${MAX_TEXT_READ_BYTES})]`, + totalLines: 1, + } +} + +/** Placeholder returned when a parseable document exceeds the inline parse budget. 
*/ +function documentTooLargeResult(name: string, size: number): FileReadResult { + return { + content: `[Document too large to parse inline: ${name} (${size} bytes, limit ${MAX_PARSEABLE_READ_BYTES})]`, + totalLines: 1, + } +} + +/** Placeholder returned for a file whose bytes cannot be rendered as text. */ +function binaryPlaceholderResult(name: string, type: string, size: number): FileReadResult { + return { + content: `[Binary file: ${name} (${type}, ${size} bytes). Cannot display as text.]`, + totalLines: 1, + } +} + +/** True when a file is binary — not an image, not text, and not a parseable document. */ +function isBinaryFile(type: string, ext: string): boolean { + return !isImageFileType(type) && !isReadableType(type) && !PARSEABLE_EXTENSIONS.has(ext) +} + +/** + * Render an in-memory file buffer into a {@link FileReadResult} using the same + * image / text / parseable-document / binary logic as a stored upload. + * + * Pure aside from the optional `span`, which only carries read-path/outcome + * telemetry when called from {@link readFileRecord}; archive-entry reads pass no + * span. Size caps apply to the buffer length, so an inflated zip entry is bounded + * exactly like a stored file. + */ +export async function renderFileBuffer( + buffer: Buffer, + meta: { name: string; type: string; ext: string }, + span?: Span +): Promise { + const { name, type, ext } = meta + + if (isImageFileType(type)) { + span?.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Image) + const prepared = await prepareImageForVision(buffer, type) + if (!prepared) { + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ImageTooLarge) + return { + content: `[Image too large: ${name} (${(buffer.length / 1024 / 1024).toFixed(1)}MB, limit 5MB after resize/compression)]`, + totalLines: 1, + } + } + const sizeKb = (prepared.buffer.length / 1024).toFixed(1) + const resizeNote = prepared.resized ? 
', resized for vision' : '' + span?.setAttributes({ + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.ImagePrepared, + [TraceAttr.CopilotVfsReadOutputBytes]: prepared.buffer.length, + [TraceAttr.CopilotVfsReadOutputMediaType]: prepared.mediaType, + [TraceAttr.CopilotVfsReadImageResized]: prepared.resized, + }) + return { + content: `Image: ${name} (${sizeKb}KB, ${prepared.mediaType}${resizeNote})`, + totalLines: 1, + attachment: { + type: 'image', + name, + source: { + type: 'base64' as const, + media_type: prepared.mediaType, + data: prepared.buffer.toString('base64'), + }, + }, + } + } + + if (isReadableType(type)) { + span?.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Text) + if (buffer.length > MAX_TEXT_READ_BYTES) { + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.TextTooLarge) + return textTooLargeResult(name, buffer.length) + } + const content = buffer.toString('utf-8') + const lines = content.split('\n').length + span?.setAttributes({ + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.TextRead, + [TraceAttr.CopilotVfsReadOutputBytes]: buffer.length, + [TraceAttr.CopilotVfsReadOutputLines]: lines, + }) + return { content, totalLines: lines } + } + + if (PARSEABLE_EXTENSIONS.has(ext)) { + span?.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.ParseableDocument) + if (buffer.length > MAX_PARSEABLE_READ_BYTES) { + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.DocumentTooLarge) + return documentTooLargeResult(name, buffer.length) + } + try { + const { parseBuffer } = await import('@/lib/file-parsers') + const result = await parseBuffer(buffer, ext) + const content = result.content || '' + const lines = content.split('\n').length + span?.setAttributes({ + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.DocumentParsed, + [TraceAttr.CopilotVfsReadOutputBytes]: content.length, + [TraceAttr.CopilotVfsReadOutputLines]: lines, + }) + return { content, 
totalLines: lines } + } catch (parseErr) { + logger.warn('Failed to parse document', { + fileName: name, + ext, + error: toError(parseErr).message, + }) + span?.addEvent(TraceEvent.CopilotVfsParseFailed, { + [TraceAttr.ErrorMessage]: toError(parseErr).message.slice(0, 500), + }) + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ParseFailed) + return { + content: `[Could not parse ${name} (${type}, ${buffer.length} bytes)]`, + totalLines: 1, + } + } + } + + span?.setAttributes({ + [TraceAttr.CopilotVfsReadPath]: CopilotVfsReadPath.Binary, + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.BinaryPlaceholder, + }) + return binaryPlaceholderResult(name, type, buffer.length) +} + /** * Read and return the content of a workspace file record. * Handles images (base64 attachment), parseable documents (PDF, DOCX, etc.), @@ -298,111 +433,35 @@ export async function readFileRecord(record: WorkspaceFileRecord): Promise { try { - if (isImageFileType(record.type)) { - span.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Image) - const originalBuffer = await fetchWorkspaceFileBuffer(record) - const prepared = await prepareImageForVision(originalBuffer, record.type) - if (!prepared) { - span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ImageTooLarge) - return { - content: `[Image too large: ${record.name} (${(record.size / 1024 / 1024).toFixed(1)}MB, limit 5MB after resize/compression)]`, - totalLines: 1, - } - } - const sizeKb = (prepared.buffer.length / 1024).toFixed(1) - const resizeNote = prepared.resized ? 
', resized for vision' : '' - span.setAttributes({ - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.ImagePrepared, - [TraceAttr.CopilotVfsReadOutputBytes]: prepared.buffer.length, - [TraceAttr.CopilotVfsReadOutputMediaType]: prepared.mediaType, - [TraceAttr.CopilotVfsReadImageResized]: prepared.resized, - }) - return { - content: `Image: ${record.name} (${sizeKb}KB, ${prepared.mediaType}${resizeNote})`, - totalLines: 1, - attachment: { - type: 'image', - name: record.name, - source: { - type: 'base64' as const, - media_type: prepared.mediaType, - data: prepared.buffer.toString('base64'), - }, - }, - } - } - - if (isReadableType(record.type)) { + const ext = getExtension(record.name) + // Pre-fetch size guards: reject oversized text/parseable files without + // paying for the download. Images are always fetched (to sniff + resize). + if (isReadableType(record.type) && record.size > MAX_TEXT_READ_BYTES) { span.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Text) - if (record.size > MAX_TEXT_READ_BYTES) { - span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.TextTooLarge) - return { - content: `[File too large to display inline: ${record.name} (${record.size} bytes, limit ${MAX_TEXT_READ_BYTES})]`, - totalLines: 1, - } - } - - const buffer = await fetchWorkspaceFileBuffer(record) - const content = buffer.toString('utf-8') - const lines = content.split('\n').length - span.setAttributes({ - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.TextRead, - [TraceAttr.CopilotVfsReadOutputBytes]: buffer.length, - [TraceAttr.CopilotVfsReadOutputLines]: lines, - }) - return { content, totalLines: lines } + span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.TextTooLarge) + return textTooLargeResult(record.name, record.size) } - - const ext = getExtension(record.name) - if (PARSEABLE_EXTENSIONS.has(ext)) { + if ( + !isImageFileType(record.type) && + !isReadableType(record.type) && + 
PARSEABLE_EXTENSIONS.has(ext) && + record.size > MAX_PARSEABLE_READ_BYTES + ) { span.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.ParseableDocument) - if (record.size > MAX_PARSEABLE_READ_BYTES) { - span.setAttribute( - TraceAttr.CopilotVfsReadOutcome, - CopilotVfsReadOutcome.DocumentTooLarge - ) - return { - content: `[Document too large to parse inline: ${record.name} (${record.size} bytes, limit ${MAX_PARSEABLE_READ_BYTES})]`, - totalLines: 1, - } - } - const buffer = await fetchWorkspaceFileBuffer(record) - try { - const { parseBuffer } = await import('@/lib/file-parsers') - const result = await parseBuffer(buffer, ext) - const content = result.content || '' - const lines = content.split('\n').length - span.setAttributes({ - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.DocumentParsed, - [TraceAttr.CopilotVfsReadOutputBytes]: content.length, - [TraceAttr.CopilotVfsReadOutputLines]: lines, - }) - return { content, totalLines: lines } - } catch (parseErr) { - logger.warn('Failed to parse document', { - fileName: record.name, - ext, - error: toError(parseErr).message, - }) - span.addEvent(TraceEvent.CopilotVfsParseFailed, { - [TraceAttr.ErrorMessage]: toError(parseErr).message.slice(0, 500), - }) - span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ParseFailed) - return { - content: `[Could not parse ${record.name} (${record.type}, ${record.size} bytes)]`, - totalLines: 1, - } - } + span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.DocumentTooLarge) + return documentTooLargeResult(record.name, record.size) } - - span.setAttributes({ - [TraceAttr.CopilotVfsReadPath]: CopilotVfsReadPath.Binary, - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.BinaryPlaceholder, - }) - return { - content: `[Binary file: ${record.name} (${record.type}, ${record.size} bytes). 
Cannot display as text.]`, - totalLines: 1, + // Binary/unknown types never need the bytes — return the placeholder + // without paying for a download (workspace files can be multi-GB). + if (isBinaryFile(record.type, ext)) { + span.setAttributes({ + [TraceAttr.CopilotVfsReadPath]: CopilotVfsReadPath.Binary, + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.BinaryPlaceholder, + }) + return binaryPlaceholderResult(record.name, record.type, record.size) } + const buffer = await fetchWorkspaceFileBuffer(record) + return await renderFileBuffer(buffer, { name: record.name, type: record.type, ext }, span) } catch (err) { logger.warn('Failed to read workspace file', { fileName: record.name, diff --git a/apps/sim/lib/copilot/vfs/operations.ts b/apps/sim/lib/copilot/vfs/operations.ts index bd7208719b8..23dc4b4e168 100644 --- a/apps/sim/lib/copilot/vfs/operations.ts +++ b/apps/sim/lib/copilot/vfs/operations.ts @@ -94,6 +94,16 @@ const VFS_GLOB_OPTIONS: micromatch.Options = { noext: true, } +/** + * True when `filePath` matches the glob `pattern` under {@link VFS_GLOB_OPTIONS} + * (path-aware `*`/`?`, `**`, no brace/extglob). Exported so callers that build + * their own path list (e.g. virtual archive entries) filter it exactly the way + * {@link glob} filters the VFS map. + */ +export function matchesVfsGlob(filePath: string, pattern: string): boolean { + return micromatch.isMatch(filePath, pattern, VFS_GLOB_OPTIONS) +} + /** * Splits VFS text into lines for line-oriented grep. Strips a trailing CR so Windows-style * CRLF payloads still match patterns anchored at line end (`$`). 
diff --git a/apps/sim/lib/uploads/archive.test.ts b/apps/sim/lib/uploads/archive.test.ts new file mode 100644 index 00000000000..a3acc55d97b --- /dev/null +++ b/apps/sim/lib/uploads/archive.test.ts @@ -0,0 +1,123 @@ +/** + * @vitest-environment node + */ +import { Buffer } from 'buffer' +import JSZip from 'jszip' +import { describe, expect, it } from 'vitest' +import { + ArchiveError, + extractArchiveEntry, + listArchiveEntries, + MAX_ARCHIVE_ENTRIES, +} from '@/lib/uploads/archive' + +async function buildZip(files: Record): Promise { + const zip = new JSZip() + for (const [name, content] of Object.entries(files)) { + zip.file(name, content) + } + const arr = await zip.generateAsync({ type: 'uint8array' }) + return Buffer.from(arr) +} + +describe('listArchiveEntries', () => { + it('enumerates nested entries with sanitized joined paths', async () => { + const buffer = await buildZip({ + 'report.txt': 'hello', + 'data/sheet.csv': 'a,b\n1,2', + 'data/nested/deep.json': '{}', + }) + + const paths = (await listArchiveEntries(buffer)).sort() + + expect(paths).toEqual(['data/nested/deep.json', 'data/sheet.csv', 'report.txt']) + }) + + it('skips directory entries', async () => { + const zip = new JSZip() + zip.folder('emptydir') + zip.file('file.txt', 'x') + const buffer = Buffer.from(await zip.generateAsync({ type: 'uint8array' })) + + expect(await listArchiveEntries(buffer)).toEqual(['file.txt']) + }) + + it('never surfaces a path with a traversal segment or absolute root', async () => { + // JSZip itself strips leading `../`, keeping a contained basename; our guard + // additionally rejects any residual `..` (e.g. a Windows-style backslash path + // that JSZip stores verbatim) so nothing can escape the archive root. 
+    const buffer = await buildZip({
+      'safe.txt': 'ok',
+      '..\\evil.txt': 'evil',
+      'sub\\..\\..\\evil2.txt': 'evil',
+    })
+
+    const paths = await listArchiveEntries(buffer)
+
+    expect(paths).toContain('safe.txt')
+    expect(paths.some((p) => p.split('/').includes('..'))).toBe(false)
+    expect(paths.some((p) => p.startsWith('/'))).toBe(false)
+    expect(paths).not.toContain('evil.txt')
+    expect(paths).not.toContain('evil2.txt')
+  })
+
+  it('filters __MACOSX, .DS_Store and Thumbs.db noise', async () => {
+    const buffer = await buildZip({
+      'doc.txt': 'real',
+      '__MACOSX/._doc.txt': 'junk',
+      '.DS_Store': 'junk',
+      'sub/.DS_Store': 'junk',
+      'sub/Thumbs.db': 'junk',
+    })
+
+    expect(await listArchiveEntries(buffer)).toEqual(['doc.txt'])
+  })
+
+  it('rejects archives with too many entries', async () => {
+    const files: Record<string, string> = {}
+    for (let i = 0; i <= MAX_ARCHIVE_ENTRIES; i++) {
+      files[`f${i}.txt`] = 'x'
+    }
+    const buffer = await buildZip(files)
+
+    await expect(listArchiveEntries(buffer)).rejects.toMatchObject({
+      name: 'ArchiveError',
+      reason: 'too_many_entries',
+    })
+  })
+
+  it('throws ArchiveError invalid for non-zip buffers', async () => {
+    await expect(listArchiveEntries(Buffer.from('not a zip at all'))).rejects.toBeInstanceOf(
+      ArchiveError
+    )
+  })
+})
+
+describe('extractArchiveEntry', () => {
+  it('extracts a single entry by sanitized path', async () => {
+    const buffer = await buildZip({
+      'report.txt': 'the body',
+      'data/sheet.csv': 'a,b\n1,2',
+    })
+
+    const csv = await extractArchiveEntry(buffer, 'data/sheet.csv')
+    expect(csv?.toString('utf-8')).toBe('a,b\n1,2')
+
+    const txt = await extractArchiveEntry(buffer, 'report.txt')
+    expect(txt?.toString('utf-8')).toBe('the body')
+  })
+
+  it('returns null when the entry does not exist', async () => {
+    const buffer = await buildZip({ 'report.txt': 'x' })
+    expect(await extractArchiveEntry(buffer, 'missing.txt')).toBeNull()
+  })
+
+  it('does not resolve traversal paths', async () => {
+    const buffer = await buildZip({ '..\\evil.txt': 'evil', 'safe.txt': 'ok' })
+    // The traversal entry sanitizes to null, so it is unmatchable by any path.
+    expect(await extractArchiveEntry(buffer, '../evil.txt')).toBeNull()
+    expect(await extractArchiveEntry(buffer, '..\\evil.txt')).toBeNull()
+    expect(await extractArchiveEntry(buffer, 'evil.txt')).toBeNull()
+    expect((await extractArchiveEntry(buffer, 'safe.txt'))?.toString('utf-8')).toBe('ok')
+  })
+})
diff --git a/apps/sim/lib/uploads/archive.ts b/apps/sim/lib/uploads/archive.ts
new file mode 100644
index 00000000000..5fe4bf60d89
--- /dev/null
+++ b/apps/sim/lib/uploads/archive.ts
@@ -0,0 +1,213 @@
+import { Buffer } from 'buffer'
+import type { Readable } from 'stream'
+import JSZip from 'jszip'
+
+/**
+ * Shared, zip-bomb / zip-slip safe archive primitives.
+ *
+ * These were originally inlined in the file-manage decompress route; they are
+ * factored here so the copilot VFS can present an uploaded `.zip` as a virtual
+ * folder (list entries, extract one entry on read) using the exact same safety
+ * guarantees. The declared sizes in a ZIP header are attacker-controlled, so the
+ * real caps are always enforced on the inflated byte stream — never on metadata.
+ */
+
+/** Input archive download/size cap. */
+export const MAX_ARCHIVE_BYTES = 100 * 1024 * 1024
+/** Maximum number of entries enumerated/extracted from a single archive. */
+export const MAX_ARCHIVE_ENTRIES = 1000
+/** Maximum uncompressed size for any single archive entry. */
+export const MAX_ARCHIVE_ENTRY_BYTES = 100 * 1024 * 1024
+/** Maximum total uncompressed size across all entries, to bound zip-bomb expansion. */
+export const MAX_ARCHIVE_TOTAL_BYTES = 200 * 1024 * 1024
+
+const S_IFMT = 0o170000 // POSIX mask selecting the file-type bits of a unix mode
+const S_IFLNK = 0o120000 // file-type value that marks a symbolic link
+
+/** Reason a {@link ArchiveError} was raised, for mapping to a caller response. */
+export type ArchiveErrorReason =
+  | 'invalid'
+  | 'too_many_entries'
+  | 'entry_too_large'
+  | 'total_too_large'
+
+/** Raised for malformed archives and cap violations so callers can surface a clear message. */
+export class ArchiveError extends Error {
+  readonly reason: ArchiveErrorReason
+  readonly entryName?: string
+
+  constructor(reason: ArchiveErrorReason, message: string, entryName?: string) {
+    super(message)
+    this.name = 'ArchiveError'
+    this.reason = reason
+    this.entryName = entryName
+  }
+}
+
+/**
+ * Read a zip entry's declared uncompressed size without materializing it. This
+ * value comes straight from the (attacker-controlled) ZIP metadata, so it is only
+ * usable as a cheap fast-reject for honestly-declared archives — never as the
+ * authoritative cap. {@link inflateEntryWithinCaps} enforces the real limit on the
+ * inflated byte stream.
+ */
+export const readEntryUncompressedSize = (entry: JSZip.JSZipObject): number | undefined => {
+  const data = (entry as JSZip.JSZipObject & { _data?: { uncompressedSize?: number } })._data
+  const size = data?.uncompressedSize
+  return typeof size === 'number' && Number.isFinite(size) ? size : undefined
+}
+
+type InflateResult = { ok: true; buffer: Buffer } | { ok: false; reason: 'entry' | 'total' }
+
+/**
+ * Inflate a single zip entry through a streaming counting sink, tearing the
+ * stream down the moment cumulative output would exceed the per-entry cap or the
+ * remaining total budget. The declared uncompressed size in the ZIP header is
+ * attacker-controlled and is NOT trusted here: a forged-small or absent size
+ * cannot cause the full (potentially gigabyte-scale) entry to be materialized in
+ * memory, because enforcement happens on the actual inflated bytes as they
+ * arrive. Peak memory is bounded by the cap plus one DEFLATE chunk.
+ */
+export const inflateEntryWithinCaps = (
+  entry: JSZip.JSZipObject,
+  remainingTotalBudget: number
+): Promise<InflateResult> =>
+  new Promise<InflateResult>((resolve, reject) => {
+    const chunks: Buffer[] = []
+    let size = 0
+    let settled = false
+    const stream = entry.nodeStream() as Readable
+
+    const settle = (result: InflateResult) => {
+      if (settled) return
+      settled = true
+      stream.destroy()
+      resolve(result)
+    }
+
+    stream.on('data', (chunk: Buffer) => {
+      size += chunk.length
+      if (size > MAX_ARCHIVE_ENTRY_BYTES) {
+        settle({ ok: false, reason: 'entry' })
+        return
+      }
+      if (size > remainingTotalBudget) {
+        settle({ ok: false, reason: 'total' })
+        return
+      }
+      chunks.push(chunk)
+    })
+    stream.on('end', () => settle({ ok: true, buffer: Buffer.concat(chunks, size) }))
+    stream.on('error', (error) => {
+      if (settled) return
+      settled = true
+      stream.destroy()
+      reject(error)
+    })
+  })
+
+/** True when a zip entry's unix mode marks it as a symlink (never extracted). */
+export const isSymlinkEntry = (entry: JSZip.JSZipObject): boolean => {
+  const mode = (entry as JSZip.JSZipObject & { unixPermissions?: number | null }).unixPermissions
+  return typeof mode === 'number' && (mode & S_IFMT) === S_IFLNK
+}
+
+/**
+ * Normalize a zip entry path into safe path segments, guarding against zip-slip.
+ * Returns null for traversal (`..`) and empty paths. Backslashes count as separators
+ * and empty or `.` segments (e.g. from a leading slash) are dropped, so the result is
+ * always relative; a drive prefix such as `C:` is kept as an ordinary named segment.
+ */
+export const sanitizeArchiveEntryPath = (rawPath: string): string[] | null => {
+  const segments = rawPath
+    .replace(/\\/g, '/')
+    .split('/')
+    .map((segment) => segment.trim())
+    .filter((segment) => segment.length > 0 && segment !== '.')
+
+  if (segments.length === 0 || segments.includes('..')) return null
+  return segments
+}
+
+/** Filesystem cruft that should never surface as a readable archive entry. */
+const isArchiveNoiseEntry = (segments: string[]): boolean => {
+  if (segments[0] === '__MACOSX') return true
+  const leaf = segments[segments.length - 1]
+  return leaf === '.DS_Store' || leaf === 'Thumbs.db'
+}
+
+/**
+ * Parse an archive buffer, throwing {@link ArchiveError} with reason `invalid`
+ * when it is not a readable zip.
+ */
+async function loadArchive(buffer: Buffer): Promise<JSZip> {
+  try {
+    return await JSZip.loadAsync(buffer)
+  } catch {
+    throw new ArchiveError('invalid', 'Not a valid .zip archive')
+  }
+}
+
+/**
+ * Enumerate the safe, extractable entry paths of an archive WITHOUT inflating
+ * them, each a sanitized `/`-joined path (e.g. `data/sheet.csv`). Skips
+ * directories, symlinks, zip-slip paths, and filesystem noise (`__MACOSX/`,
+ * `.DS_Store`, `Thumbs.db`). Throws {@link ArchiveError} `too_many_entries` past
+ * {@link MAX_ARCHIVE_ENTRIES}.
+ *
+ * Paths are returned raw (not de-duplicated): two entries can collide only once
+ * projected into the VFS's canonical (NFC-encoded) form, so de-duplication
+ * belongs with the caller that owns that encoding (`listChatUploadArchiveEntries`).
+ */
+export async function listArchiveEntries(buffer: Buffer): Promise<string[]> {
+  const zip = await loadArchive(buffer)
+
+  const realEntries = Object.values(zip.files).filter(
+    (entry) => !entry.dir && !isSymlinkEntry(entry)
+  )
+  if (realEntries.length > MAX_ARCHIVE_ENTRIES) {
+    throw new ArchiveError(
+      'too_many_entries',
+      `Archive has too many entries. Maximum is ${MAX_ARCHIVE_ENTRIES}.`
+    )
+  }
+
+  const paths: string[] = []
+  for (const entry of realEntries) {
+    const segments = sanitizeArchiveEntryPath(entry.name)
+    if (!segments || isArchiveNoiseEntry(segments)) continue
+    paths.push(segments.join('/'))
+  }
+  return paths
+}
+
+/**
+ * Extract a single archive entry by its sanitized `/`-joined path, inflating
+ * within the per-entry cap. Returns `null` when no listable entry matches (noise
+ * entries are not readable). Throws {@link ArchiveError} `entry_too_large` past the cap.
+ */
+export async function extractArchiveEntry(
+  buffer: Buffer,
+  entryPath: string
+): Promise<Buffer | null> {
+  const zip = await loadArchive(buffer)
+
+  const match = Object.values(zip.files).find((entry) => {
+    if (entry.dir || isSymlinkEntry(entry)) return false
+    const segments = sanitizeArchiveEntryPath(entry.name)
+    return segments !== null && !isArchiveNoiseEntry(segments) && segments.join('/') === entryPath
+  })
+  if (!match) return null
+
+  const result = await inflateEntryWithinCaps(match, MAX_ARCHIVE_ENTRY_BYTES)
+  if (!result.ok) {
+    throw new ArchiveError(
+      'entry_too_large',
+      `Archive entry "${entryPath}" is too large to extract. Maximum is ${
+        MAX_ARCHIVE_ENTRY_BYTES / (1024 * 1024)
+      } MB per file.`,
+      entryPath
+    )
+  }
+  return result.buffer
+}
diff --git a/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts b/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts
index 8e091d8d8f2..ee37397c7d3 100644
--- a/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts
+++ b/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts
@@ -854,15 +854,21 @@ export async function getWorkspaceFile(
 }
 
 /**
- * Download workspace file content
+ * Download workspace file content. Pass `maxBytes` to cap the download on the
+ * actual byte stream (not just the client-declared `record.size`), so a stored
+ * object larger than its recorded size cannot be buffered fully into memory.
  */
-export async function fetchWorkspaceFileBuffer(fileRecord: WorkspaceFileRecord): Promise<Buffer> {
+export async function fetchWorkspaceFileBuffer(
+  fileRecord: WorkspaceFileRecord,
+  options: { maxBytes?: number } = {}
+): Promise<Buffer> {
   logger.info(`Downloading workspace file: ${fileRecord.name}`)
 
   try {
     const buffer = await downloadFile({
       key: fileRecord.key,
       context: fileRecord.storageContext ?? 'workspace',
+      maxBytes: options.maxBytes,
     })
     logger.info(
       `Successfully downloaded workspace file: ${fileRecord.name} (${buffer.length} bytes)`
diff --git a/apps/sim/lib/uploads/utils/file-utils.ts b/apps/sim/lib/uploads/utils/file-utils.ts
index 0fd254f2e25..dc7f5b779bd 100644
--- a/apps/sim/lib/uploads/utils/file-utils.ts
+++ b/apps/sim/lib/uploads/utils/file-utils.ts
@@ -1,7 +1,11 @@
 import type { Logger } from '@sim/logger'
 import { omit } from '@sim/utils/object'
 import type { StorageContext } from '@/lib/uploads'
-import { ACCEPTED_FILE_TYPES, SUPPORTED_DOCUMENT_EXTENSIONS } from '@/lib/uploads/utils/validation'
+import {
+  ACCEPTED_FILE_TYPES,
+  SUPPORTED_ARCHIVE_EXTENSIONS,
+  SUPPORTED_DOCUMENT_EXTENSIONS,
+} from '@/lib/uploads/utils/validation'
 import { isUuid } from '@/executor/constants'
 import type { UserFile } from '@/executor/types'
 
@@ -206,6 +210,16 @@ export function getFileExtension(filename: string): string {
   return lastDot !== -1 ? filename.slice(lastDot + 1).toLowerCase() : ''
 }
 
+const ARCHIVE_EXTENSIONS = new Set<string>(SUPPORTED_ARCHIVE_EXTENSIONS)
+
+/**
+ * True when a file name is a supported archive (zip). Detection is by extension
+ * so it is robust to the varied/empty MIME types browsers assign to archives.
+ */
+export function isArchiveFileName(filename: string): boolean {
+  return ARCHIVE_EXTENSIONS.has(getFileExtension(filename))
+}
+
 const EXTENSION_TO_MIME: Record<string, string> = {
   // Images
   jpg: 'image/jpeg',
diff --git a/apps/sim/lib/uploads/utils/validation.test.ts b/apps/sim/lib/uploads/utils/validation.test.ts
index 9d5d31ea1d6..c2b8daeb416 100644
--- a/apps/sim/lib/uploads/utils/validation.test.ts
+++ b/apps/sim/lib/uploads/utils/validation.test.ts
@@ -71,11 +71,20 @@ describe('validateAttachmentFileType', () => {
     expect(validateAttachmentFileType('config.json')).toBeNull()
   })
 
-  it('rejects executables and unknown extensions', () => {
+  it('accepts zip archives', () => {
+    expect(validateAttachmentFileType('mydata.zip')).toBeNull()
+    expect(validateAttachmentFileType('mydata~1782582468496.zip')).toBeNull()
+    expect(validateAttachmentFileType('UPPER.ZIP')).toBeNull()
+  })
+
+  it('rejects executables and other archive formats we do not extract', () => {
     expect(validateAttachmentFileType('virus.exe')?.code).toBe('UNSUPPORTED_FILE_TYPE')
     expect(validateAttachmentFileType('installer.msi')?.code).toBe('UNSUPPORTED_FILE_TYPE')
     expect(validateAttachmentFileType('archive.dmg')?.code).toBe('UNSUPPORTED_FILE_TYPE')
     expect(validateAttachmentFileType('binary.bin')?.code).toBe('UNSUPPORTED_FILE_TYPE')
+    expect(validateAttachmentFileType('bundle.tar')?.code).toBe('UNSUPPORTED_FILE_TYPE')
+    expect(validateAttachmentFileType('bundle.gz')?.code).toBe('UNSUPPORTED_FILE_TYPE')
+    expect(validateAttachmentFileType('bundle.rar')?.code).toBe('UNSUPPORTED_FILE_TYPE')
   })
 
   it('rejects files with no extension', () => {
diff --git a/apps/sim/lib/uploads/utils/validation.ts b/apps/sim/lib/uploads/utils/validation.ts
index b4e27684f63..757b76a7acd 100644
--- a/apps/sim/lib/uploads/utils/validation.ts
+++ b/apps/sim/lib/uploads/utils/validation.ts
@@ -95,6 +95,14 @@ export const SUPPORTED_AUDIO_EXTENSIONS = [
 
 export const SUPPORTED_VIDEO_EXTENSIONS = ['mp4', 'mov', 'avi', 'mkv', 'webm'] as const
 
+/**
+ * Archive formats accepted as chat attachments. A `.zip` is stored as a single
+ * object and presented to the agent as a virtual folder it can list and read
+ * entry-by-entry — extraction happens lazily on read in the copilot VFS, so the
+ * archive itself never needs a document parser here.
+ */
+export const SUPPORTED_ARCHIVE_EXTENSIONS = ['zip'] as const
+
 export const SUPPORTED_IMAGE_EXTENSIONS = [
   'png',
   'jpg',
@@ -207,10 +215,18 @@ const SUPPORTED_IMAGE_MIME_TYPES = [
   'image/vnd.microsoft.icon',
 ]
 
+const SUPPORTED_ARCHIVE_MIME_TYPES = [
+  'application/zip',
+  'application/x-zip-compressed',
+  'application/x-zip',
+]
+
 export const CHAT_ACCEPT_ATTRIBUTE = [
   ACCEPT_ATTRIBUTE,
   ...SUPPORTED_IMAGE_MIME_TYPES,
   ...SUPPORTED_IMAGE_EXTENSIONS.map((ext) => `.${ext}`),
+  ...SUPPORTED_ARCHIVE_MIME_TYPES,
+  ...SUPPORTED_ARCHIVE_EXTENSIONS.map((ext) => `.${ext}`),
 ].join(',')
 
 export interface FileValidationError {
@@ -226,14 +242,16 @@ export const SUPPORTED_ATTACHMENT_EXTENSIONS = Array.from(
     ...SUPPORTED_IMAGE_EXTENSIONS,
     ...SUPPORTED_AUDIO_EXTENSIONS,
     ...SUPPORTED_VIDEO_EXTENSIONS,
+    ...SUPPORTED_ARCHIVE_EXTENSIONS,
   ])
 ) as readonly string[]
 
 /**
  * Validate that a file's extension is allowed as a chat/mothership attachment.
  *
- * Permits documents, code, images, audio, and video — anything users would
- * reasonably attach to a chat message. Rejects executables and unknown types.
+ * Permits documents, code, images, audio, video, and zip archives — anything
+ * users would reasonably attach to a chat message. Rejects executables and
+ * unknown types.
  */
 export function validateAttachmentFileType(fileName: string): FileValidationError | null {
   const raw = extractExtension(fileName)
@@ -242,7 +260,7 @@ export function validateAttachmentFileType(fileName: string): FileValidationErro
   if (!SUPPORTED_ATTACHMENT_EXTENSIONS.includes(extension)) {
     return {
       code: 'UNSUPPORTED_FILE_TYPE',
-      message: `Unsupported file type${extension ? `: ${extension}` : ` for "${fileName}"`}. Supported types include documents, code, images, audio, and video.`,
+      message: `Unsupported file type${extension ? `: ${extension}` : ` for "${fileName}"`}. Supported types include documents, code, images, audio, video, and zip archives.`,
       supportedTypes: [...SUPPORTED_ATTACHMENT_EXTENSIONS],
     }
   }