Skip to content

Commit 74feedf

Browse files
committed
fix(chat): de-dup archive entries by VFS canonical key, not raw path
Greptile: dedup keyed on the raw sanitized path, but archive paths are exposed through VFS per-segment encoding that NFC-normalizes, so visually-identical NFC/NFD entries (e.g. café.txt) kept both while emitting one shared vfsPath — shadowing the second on read. Move dedup to listChatUploadArchiveEntries / buildArchiveManifest, keyed on the same encodeEntryPath the resolver matches on; listArchiveEntries now returns raw paths (dedup is a VFS-presentation concern).
1 parent 8eb5503 commit 74feedf

4 files changed

Lines changed: 51 additions & 21 deletions

File tree

apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,29 @@ describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => {
245245
])
246246
})
247247

248+
it('de-duplicates entries that collapse to one VFS key (NFC/NFD, ./ prefix)', async () => {
249+
// "café.txt" stored twice (NFC precomposed + NFD decomposed) plus a
250+
// ./-prefixed duplicate of a/b.txt — all collapse to the same VFS path, so
251+
// only one of each must be listed (otherwise the second is unreachable).
252+
const nfc = `caf\u00e9.txt` // precomposed e-acute
253+
const nfd = `cafe\u0301.txt` // e + combining acute
254+
expect(nfc).not.toBe(nfd)
255+
const buffer = await buildZip({
256+
[nfc]: 'nfc',
257+
[nfd]: 'nfd',
258+
'a/b.txt': 'first',
259+
'./a/b.txt': 'dup',
260+
})
261+
mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })])
262+
mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer)
263+
264+
const entries = await listChatUploadArchiveEntries('bundle.zip', CHAT_ID)
265+
const vfsPaths = entries?.map((e) => e.vfsPath) ?? []
266+
267+
expect(vfsPaths.filter((p) => p === 'uploads/bundle.zip/caf%C3%A9.txt')).toHaveLength(1)
268+
expect(vfsPaths.filter((p) => p === 'uploads/bundle.zip/a/b.txt')).toHaveLength(1)
269+
})
270+
248271
it('reads a nested entry by its exact path', async () => {
249272
const buffer = await buildZip({ 'data/sheet.csv': 'a,b\n1,2' })
250273
mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })])

apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,25 @@ function archiveEntryKey(path: string): string | null {
242242
}
243243
}
244244

245+
/**
246+
* De-duplicate raw entry paths by their canonical VFS key (first wins), so two
247+
* entries that differ only in a form the VFS normalizes away (NFC vs NFD, U+202F
248+
* vs space, collapsed whitespace) collapse to one listed path. This matches how
249+
* {@link findArchiveEntryRawPath} resolves a read — first entry whose key matches
250+
* — so every listed path is reachable and none is silently shadowed.
251+
*/
252+
function dedupeArchiveEntriesByKey(paths: string[]): string[] {
253+
const seen = new Set<string>()
254+
const result: string[] = []
255+
for (const path of paths) {
256+
const key = archiveEntryKey(path) ?? path
257+
if (seen.has(key)) continue
258+
seen.add(key)
259+
result.push(path)
260+
}
261+
return result
262+
}
263+
245264
/**
246265
* Resolve a requested entry path (percent-encoded as the agent received it from
247266
* glob, or the raw display form from the manifest) to the archive's exact stored
@@ -289,7 +308,7 @@ export async function listChatUploadArchiveEntries(
289308
const encodedZip = encodeUploadName(record.name)
290309
try {
291310
const buffer = await fetchWorkspaceFileBuffer(record)
292-
const entries = await listArchiveEntries(buffer)
311+
const entries = dedupeArchiveEntriesByKey(await listArchiveEntries(buffer))
293312
return entries.map((path) => ({
294313
path,
295314
vfsPath: `uploads/${encodedZip}/${encodeEntryPath(path)}`,
@@ -348,7 +367,7 @@ async function buildArchiveManifest(
348367
): Promise<FileReadResult> {
349368
const encodedZip = encodeUploadName(record.name)
350369
try {
351-
const entries = await listArchiveEntries(archiveBuffer)
370+
const entries = dedupeArchiveEntriesByKey(await listArchiveEntries(archiveBuffer))
352371
const header = `Archive "${record.name}" — ${entries.length} file${
353372
entries.length === 1 ? '' : 's'
354373
}. Read an entry with read("uploads/${encodedZip}/<path>").`

apps/sim/lib/uploads/archive.test.ts

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,6 @@ describe('listArchiveEntries', () => {
6161
expect(paths).not.toContain('evil2.txt')
6262
})
6363

64-
it('de-duplicates entries that sanitize to the same path', async () => {
65-
const buffer = await buildZip({
66-
'a/b.txt': 'first',
67-
'./a/b.txt': 'shadowed',
68-
})
69-
70-
const paths = await listArchiveEntries(buffer)
71-
72-
expect(paths).toEqual(['a/b.txt'])
73-
})
74-
7564
it('filters __MACOSX, .DS_Store and Thumbs.db noise', async () => {
7665
const buffer = await buildZip({
7766
'doc.txt': 'real',

apps/sim/lib/uploads/archive.ts

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,12 @@ async function loadArchive(buffer: Buffer): Promise<JSZip> {
152152
* Enumerate the safe, extractable entry paths of an archive WITHOUT inflating
153153
* them, each a sanitized `/`-joined path (e.g. `data/sheet.csv`). Skips
154154
* directories, symlinks, zip-slip paths, and filesystem noise (`__MACOSX/`,
155-
* `.DS_Store`, `Thumbs.db`), and de-duplicates entries that sanitize to the same
156-
* path (e.g. `./a/b` and `a/b`) since only the first is extractable by path.
157-
* Throws {@link ArchiveError} `too_many_entries` past {@link MAX_ARCHIVE_ENTRIES}.
155+
* `.DS_Store`, `Thumbs.db`). Throws {@link ArchiveError} `too_many_entries` past
156+
* {@link MAX_ARCHIVE_ENTRIES}.
157+
*
158+
* Paths are returned raw (not de-duplicated): two entries can collide only once
159+
* projected into the VFS's canonical (NFC-encoded) form, so de-duplication
160+
* belongs with the caller that owns that encoding (`listChatUploadArchiveEntries`).
158161
*/
159162
export async function listArchiveEntries(buffer: Buffer): Promise<string[]> {
160163
const zip = await loadArchive(buffer)
@@ -169,15 +172,11 @@ export async function listArchiveEntries(buffer: Buffer): Promise<string[]> {
169172
)
170173
}
171174

172-
const seen = new Set<string>()
173175
const paths: string[] = []
174176
for (const entry of realEntries) {
175177
const segments = sanitizeArchiveEntryPath(entry.name)
176178
if (!segments || isArchiveNoiseEntry(segments)) continue
177-
const path = segments.join('/')
178-
if (seen.has(path)) continue
179-
seen.add(path)
180-
paths.push(path)
179+
paths.push(segments.join('/'))
181180
}
182181
return paths
183182
}

0 commit comments

Comments
 (0)