Skip to content

Commit 1f729ef

Browse files
committed
fix(chat): cap archive read size, manifest-fallback on miss, dedupe entries
Address review findings on the zip-upload feature: - guard archive list/read/grep on record.size > MAX_ARCHIVE_BYTES before downloading, so an oversized zip is never buffered into memory - a not-found archive entry now returns the file-tree manifest with a note (handles a stray /content habit suffix and typos) instead of failing - de-duplicate archive entries that sanitize to the same path (./a/b vs a/b)
1 parent a43b956 commit 1f729ef

4 files changed

Lines changed: 98 additions & 15 deletions

File tree

apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,31 @@ describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => {
252252
expect(result?.content).toBe('latte')
253253
})
254254

255-
it('returns null for an entry that is not in the archive', async () => {
255+
it('falls back to the manifest (with a note) when the entry is not found', async () => {
256256
const buffer = await buildZip({ 'present.txt': 'x' })
257257
mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })])
258258
mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer)
259259

260-
const result = await readChatUploadPath('bundle.zip', 'missing.txt', CHAT_ID)
260+
// Covers the /content habit suffix and plain typos uniformly.
261+
const result = await readChatUploadPath('bundle.zip', 'content', CHAT_ID)
261262

262-
expect(result).toBeNull()
263+
expect(result?.content).toContain('Entry "content" not found in "bundle.zip"')
264+
expect(result?.content).toContain('present.txt')
265+
})
266+
267+
it('rejects an oversized archive WITHOUT downloading it', async () => {
268+
mockOrderByThenLimit([
269+
makeRow({
270+
displayName: 'huge.zip',
271+
contentType: 'application/zip',
272+
size: 200 * 1024 * 1024, // 200MB > 100MB cap
273+
}),
274+
])
275+
276+
const result = await readChatUploadPath('huge.zip', 'anything.txt', CHAT_ID)
277+
278+
expect(result?.content).toContain('[Archive too large to read: huge.zip')
279+
expect(mockFetchWorkspaceFileBuffer).not.toHaveBeenCalled()
263280
})
264281

265282
it('returns the file-tree manifest for a bare archive read', async () => {

apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,12 @@ import {
1717
} from '@/lib/copilot/vfs/operations'
1818
import { decodeVfsSegment, encodeVfsSegment } from '@/lib/copilot/vfs/path-utils'
1919
import { getServePathPrefix } from '@/lib/uploads'
20-
import { ArchiveError, extractArchiveEntry, listArchiveEntries } from '@/lib/uploads/archive'
20+
import {
21+
ArchiveError,
22+
extractArchiveEntry,
23+
listArchiveEntries,
24+
MAX_ARCHIVE_BYTES,
25+
} from '@/lib/uploads/archive'
2126
import {
2227
fetchWorkspaceFileBuffer,
2328
type WorkspaceFileRecord,
@@ -160,6 +165,26 @@ export function isArchiveUpload(record: WorkspaceFileRecord): boolean {
160165
return isArchiveFileName(record.name)
161166
}
162167

168+
/**
169+
* True when an archive's stored size exceeds the read cap, so it must not be
170+
* downloaded + parsed inline. Checked against `record.size` BEFORE fetching so an
171+
* oversized archive never gets buffered into memory (the decompress tool applies
172+
* the same {@link MAX_ARCHIVE_BYTES} cap on its own download path).
173+
*/
174+
function exceedsArchiveReadCap(record: WorkspaceFileRecord): boolean {
175+
return record.size > MAX_ARCHIVE_BYTES
176+
}
177+
178+
/** Placeholder for an archive too large to download and extract inline. */
179+
function archiveTooLargeResult(record: WorkspaceFileRecord): FileReadResult {
180+
return {
181+
content: `[Archive too large to read: ${record.name} (${Math.round(
182+
record.size / 1024 / 1024
183+
)}MB, limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB)]`,
184+
totalLines: 1,
185+
}
186+
}
187+
163188
/** Decode each `/`-separated segment of a VFS entry path back to its real name. */
164189
function decodeEntryPath(raw: string): string {
165190
return raw
@@ -233,6 +258,10 @@ export async function listChatUploadArchiveEntries(
233258
if (!row) return null
234259
const record = toWorkspaceFileRecord(row)
235260
if (!isArchiveUpload(record)) return null
261+
if (exceedsArchiveReadCap(record)) {
262+
logger.warn('Archive too large to list entries', { zipName, chatId, size: record.size })
263+
return []
264+
}
236265

237266
const encodedZip = canonicalUploadKey(record.name)
238267
try {
@@ -282,21 +311,24 @@ async function readArchiveEntry(
282311
}
283312

284313
/**
285-
* Build a file-tree manifest for a bare archive read (`read("uploads/x.zip")`),
286-
* so the agent gets the contents instead of binary bytes. Returns a placeholder
287-
* result when the archive is unreadable.
314+
* Build a file-tree manifest for an archive (`read("uploads/x.zip")`), so the
315+
* agent gets the contents instead of binary bytes. An optional `note` is
316+
* prepended — used to tell the agent a requested entry was not found while still
317+
* showing the valid paths. Returns a placeholder result when the archive is
318+
* unreadable.
288319
*/
289320
async function buildArchiveManifest(
290321
record: WorkspaceFileRecord,
291-
archiveBuffer: Buffer
322+
archiveBuffer: Buffer,
323+
note?: string
292324
): Promise<FileReadResult> {
293325
const encodedZip = canonicalUploadKey(record.name)
294326
try {
295327
const entries = await listArchiveEntries(archiveBuffer)
296328
const header = `Archive "${record.name}" — ${entries.length} file${
297329
entries.length === 1 ? '' : 's'
298330
}. Read an entry with read("uploads/${encodedZip}/<path>").`
299-
const content = [header, '', ...entries].join('\n')
331+
const content = [...(note ? [note, ''] : []), header, '', ...entries].join('\n')
300332
return { content, totalLines: content.split('\n').length }
301333
} catch (err) {
302334
if (err instanceof ArchiveError) {
@@ -326,10 +358,23 @@ export async function readChatUploadPath(
326358
if (!isArchiveUpload(record)) {
327359
return await readFileRecord(record)
328360
}
361+
if (exceedsArchiveReadCap(record)) {
362+
return archiveTooLargeResult(record)
363+
}
329364
const archiveBuffer = await fetchWorkspaceFileBuffer(record)
330-
return entryPath
331-
? await readArchiveEntry(archiveBuffer, entryPath)
332-
: await buildArchiveManifest(record, archiveBuffer)
365+
if (!entryPath) {
366+
return await buildArchiveManifest(record, archiveBuffer)
367+
}
368+
const entry = await readArchiveEntry(archiveBuffer, entryPath)
369+
if (entry) return entry
370+
// Entry not found — show the manifest so the agent can pick a valid path.
371+
// Handles a stray `/content` habit suffix (carried over from files/) and
372+
// plain typos uniformly, without special-casing any segment name.
373+
return await buildArchiveManifest(
374+
record,
375+
archiveBuffer,
376+
`Entry "${decodeEntryPath(entryPath)}" not found in "${record.name}".`
377+
)
333378
} catch (err) {
334379
logger.warn('Failed to read chat upload', {
335380
firstSegment,
@@ -366,6 +411,11 @@ export async function grepChatUploadPath(
366411
const record = toWorkspaceFileRecord(row)
367412

368413
if (entryPath && isArchiveUpload(record)) {
414+
if (exceedsArchiveReadCap(record)) {
415+
throw new WorkspaceFileGrepError(
416+
`Archive too large to grep: "${record.name}" (limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB).`
417+
)
418+
}
369419
const archiveBuffer = await fetchWorkspaceFileBuffer(record)
370420
const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath)
371421
if (!rawPath) {

apps/sim/lib/uploads/archive.test.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,17 @@ describe('listArchiveEntries', () => {
6161
expect(paths).not.toContain('evil2.txt')
6262
})
6363

64+
it('de-duplicates entries that sanitize to the same path', async () => {
65+
const buffer = await buildZip({
66+
'a/b.txt': 'first',
67+
'./a/b.txt': 'shadowed',
68+
})
69+
70+
const paths = await listArchiveEntries(buffer)
71+
72+
expect(paths).toEqual(['a/b.txt'])
73+
})
74+
6475
it('filters __MACOSX, .DS_Store and Thumbs.db noise', async () => {
6576
const buffer = await buildZip({
6677
'doc.txt': 'real',

apps/sim/lib/uploads/archive.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,9 @@ async function loadArchive(buffer: Buffer): Promise<JSZip> {
152152
* Enumerate the safe, extractable entry paths of an archive WITHOUT inflating
153153
* them, each a sanitized `/`-joined path (e.g. `data/sheet.csv`). Skips
154154
* directories, symlinks, zip-slip paths, and filesystem noise (`__MACOSX/`,
155-
* `.DS_Store`, `Thumbs.db`). Throws {@link ArchiveError} `too_many_entries` past
156-
* {@link MAX_ARCHIVE_ENTRIES}.
155+
* `.DS_Store`, `Thumbs.db`), and de-duplicates entries that sanitize to the same
156+
* path (e.g. `./a/b` and `a/b`) since only the first is extractable by path.
157+
* Throws {@link ArchiveError} `too_many_entries` past {@link MAX_ARCHIVE_ENTRIES}.
157158
*/
158159
export async function listArchiveEntries(buffer: Buffer): Promise<string[]> {
159160
const zip = await loadArchive(buffer)
@@ -168,11 +169,15 @@ export async function listArchiveEntries(buffer: Buffer): Promise<string[]> {
168169
)
169170
}
170171

172+
const seen = new Set<string>()
171173
const paths: string[] = []
172174
for (const entry of realEntries) {
173175
const segments = sanitizeArchiveEntryPath(entry.name)
174176
if (!segments || isArchiveNoiseEntry(segments)) continue
175-
paths.push(segments.join('/'))
177+
const path = segments.join('/')
178+
if (seen.has(path)) continue
179+
seen.add(path)
180+
paths.push(path)
176181
}
177182
return paths
178183
}

0 commit comments

Comments
 (0)