fix(chat): cap archive read size, manifest-fallback on miss, dedupe entries

waleedlatif1 · waleedlatif1 · commit 1f729ef43eff · 2026-06-27T18:59:50.000-07:00
Address review findings on the zip-upload feature:
- guard archive list/read/grep on record.size > MAX_ARCHIVE_BYTES before
  downloading, so an oversized zip is never buffered into memory
- a not-found archive entry now returns the file-tree manifest with a note
  (handles a stray /content habit suffix and typos) instead of failing
- de-duplicate archive entries that sanitize to the same path (./a/b vs a/b)
diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts
@@ -252,14 +252,31 @@ describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => {
     expect(result?.content).toBe('latte')
   })
 
-  it('returns null for an entry that is not in the archive', async () => {
+  it('falls back to the manifest (with a note) when the entry is not found', async () => {
     const buffer = await buildZip({ 'present.txt': 'x' })
     mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })])
     mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer)
 
-    const result = await readChatUploadPath('bundle.zip', 'missing.txt', CHAT_ID)
+    // Covers the /content habit suffix and plain typos uniformly.
+    const result = await readChatUploadPath('bundle.zip', 'content', CHAT_ID)
 
-    expect(result).toBeNull()
+    expect(result?.content).toContain('Entry "content" not found in "bundle.zip"')
+    expect(result?.content).toContain('present.txt')
+  })
+
+  it('rejects an oversized archive WITHOUT downloading it', async () => {
+    mockOrderByThenLimit([
+      makeRow({
+        displayName: 'huge.zip',
+        contentType: 'application/zip',
+        size: 200 * 1024 * 1024, // 200MB > 100MB cap
+      }),
+    ])
+
+    const result = await readChatUploadPath('huge.zip', 'anything.txt', CHAT_ID)
+
+    expect(result?.content).toContain('[Archive too large to read: huge.zip')
+    expect(mockFetchWorkspaceFileBuffer).not.toHaveBeenCalled()
   })
 
   it('returns the file-tree manifest for a bare archive read', async () => {
diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts
@@ -17,7 +17,12 @@ import {
 } from '@/lib/copilot/vfs/operations'
 import { decodeVfsSegment, encodeVfsSegment } from '@/lib/copilot/vfs/path-utils'
 import { getServePathPrefix } from '@/lib/uploads'
-import { ArchiveError, extractArchiveEntry, listArchiveEntries } from '@/lib/uploads/archive'
+import {
+  ArchiveError,
+  extractArchiveEntry,
+  listArchiveEntries,
+  MAX_ARCHIVE_BYTES,
+} from '@/lib/uploads/archive'
 import {
   fetchWorkspaceFileBuffer,
   type WorkspaceFileRecord,
@@ -160,6 +165,26 @@ export function isArchiveUpload(record: WorkspaceFileRecord): boolean {
   return isArchiveFileName(record.name)
 }
 
+/**
+ * Reports whether the archive's recorded size is strictly above
+ * {@link MAX_ARCHIVE_BYTES} (the same cap the decompress tool applies on its own
+ * download path). Only `record.size` is consulted, so callers can check this
+ * before fetching and never buffer an oversized archive into memory.
+ */
+function exceedsArchiveReadCap(record: WorkspaceFileRecord): boolean {
+  return MAX_ARCHIVE_BYTES < record.size
+}
+
+/** Placeholder for an oversized archive; MB is rounded up so it never shows as within the limit. */
+function archiveTooLargeResult(record: WorkspaceFileRecord): FileReadResult {
+  return {
+    content: `[Archive too large to read: ${record.name} (${Math.ceil(
+      record.size / 1024 / 1024
+    )}MB, limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB)]`,
+    totalLines: 1,
+  }
+}
+
 /** Decode each `/`-separated segment of a VFS entry path back to its real name. */
 function decodeEntryPath(raw: string): string {
   return raw
@@ -233,6 +258,10 @@ export async function listChatUploadArchiveEntries(
   if (!row) return null
   const record = toWorkspaceFileRecord(row)
   if (!isArchiveUpload(record)) return null
+  if (exceedsArchiveReadCap(record)) {
+    logger.warn('Archive too large to list entries', { zipName, chatId, size: record.size })
+    return []
+  }
 
   const encodedZip = canonicalUploadKey(record.name)
   try {
@@ -282,21 +311,24 @@ async function readArchiveEntry(
 }
 
 /**
- * Build a file-tree manifest for a bare archive read (`read("uploads/x.zip")`),
- * so the agent gets the contents instead of binary bytes. Returns a placeholder
- * result when the archive is unreadable.
+ * Build a file-tree manifest for an archive (`read("uploads/x.zip")`), so the
+ * agent gets the contents instead of binary bytes. An optional `note` is
+ * prepended — used to tell the agent a requested entry was not found while still
+ * showing the valid paths. Returns a placeholder result when the archive is
+ * unreadable.
  */
 async function buildArchiveManifest(
   record: WorkspaceFileRecord,
-  archiveBuffer: Buffer
+  archiveBuffer: Buffer,
+  note?: string
 ): Promise<FileReadResult> {
   const encodedZip = canonicalUploadKey(record.name)
   try {
     const entries = await listArchiveEntries(archiveBuffer)
     const header = `Archive "${record.name}" — ${entries.length} file${
       entries.length === 1 ? '' : 's'
     }. Read an entry with read("uploads/${encodedZip}/<path>").`
-    const content = [header, '', ...entries].join('\n')
+    const content = [...(note ? [note, ''] : []), header, '', ...entries].join('\n')
     return { content, totalLines: content.split('\n').length }
   } catch (err) {
     if (err instanceof ArchiveError) {
@@ -326,10 +358,23 @@ export async function readChatUploadPath(
     if (!isArchiveUpload(record)) {
       return await readFileRecord(record)
     }
+    if (exceedsArchiveReadCap(record)) {
+      return archiveTooLargeResult(record)
+    }
     const archiveBuffer = await fetchWorkspaceFileBuffer(record)
-    return entryPath
-      ? await readArchiveEntry(archiveBuffer, entryPath)
-      : await buildArchiveManifest(record, archiveBuffer)
+    if (!entryPath) {
+      return await buildArchiveManifest(record, archiveBuffer)
+    }
+    const entry = await readArchiveEntry(archiveBuffer, entryPath)
+    if (entry) return entry
+    // Entry not found — show the manifest so the agent can pick a valid path.
+    // Handles a stray `/content` habit suffix (carried over from files/) and
+    // plain typos uniformly, without special-casing any segment name.
+    return await buildArchiveManifest(
+      record,
+      archiveBuffer,
+      `Entry "${decodeEntryPath(entryPath)}" not found in "${record.name}".`
+    )
   } catch (err) {
     logger.warn('Failed to read chat upload', {
       firstSegment,
@@ -366,6 +411,11 @@ export async function grepChatUploadPath(
   const record = toWorkspaceFileRecord(row)
 
   if (entryPath && isArchiveUpload(record)) {
+    if (exceedsArchiveReadCap(record)) {
+      throw new WorkspaceFileGrepError(
+        `Archive too large to grep: "${record.name}" (limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB).`
+      )
+    }
     const archiveBuffer = await fetchWorkspaceFileBuffer(record)
     const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath)
     if (!rawPath) {
diff --git a/apps/sim/lib/uploads/archive.test.ts b/apps/sim/lib/uploads/archive.test.ts
@@ -61,6 +61,17 @@ describe('listArchiveEntries', () => {
     expect(paths).not.toContain('evil2.txt')
   })
 
+  it('de-duplicates entries that sanitize to the same path', async () => {
+    const buffer = await buildZip({
+      'a/b.txt': 'first',
+      './a/b.txt': 'shadowed',
+    })
+
+    const paths = await listArchiveEntries(buffer)
+
+    expect(paths).toEqual(['a/b.txt'])
+  })
+
   it('filters __MACOSX, .DS_Store and Thumbs.db noise', async () => {
     const buffer = await buildZip({
       'doc.txt': 'real',
diff --git a/apps/sim/lib/uploads/archive.ts b/apps/sim/lib/uploads/archive.ts
@@ -152,8 +152,9 @@ async function loadArchive(buffer: Buffer): Promise<JSZip> {
  * Enumerate the safe, extractable entry paths of an archive WITHOUT inflating
  * them, each a sanitized `/`-joined path (e.g. `data/sheet.csv`). Skips
  * directories, symlinks, zip-slip paths, and filesystem noise (`__MACOSX/`,
- * `.DS_Store`, `Thumbs.db`). Throws {@link ArchiveError} `too_many_entries` past
- * {@link MAX_ARCHIVE_ENTRIES}.
+ * `.DS_Store`, `Thumbs.db`), and de-duplicates entries that sanitize to the same
+ * path (e.g. `./a/b` and `a/b`) since only the first is extractable by path.
+ * Throws {@link ArchiveError} `too_many_entries` past {@link MAX_ARCHIVE_ENTRIES}.
  */
 export async function listArchiveEntries(buffer: Buffer): Promise<string[]> {
   const zip = await loadArchive(buffer)
@@ -168,11 +169,15 @@ export async function listArchiveEntries(buffer: Buffer): Promise<string[]> {
     )
   }
 
+  const seen = new Set<string>()
   const paths: string[] = []
   for (const entry of realEntries) {
     const segments = sanitizeArchiveEntryPath(entry.name)
     if (!segments || isArchiveNoiseEntry(segments)) continue
-    paths.push(segments.join('/'))
+    const path = segments.join('/')
+    if (seen.has(path)) continue
+    seen.add(path)
+    paths.push(path)
   }
   return paths
 }