Skip to content

Commit e2fffd2

Browse files
committed
fix(chunkers): exclude lookbehind from named-group rewrite
Tighten NAMED_GROUP_PREFIX with a negative lookahead, (?![=!]), so that lookbehind assertions such as (?<=<tag>) or (?<!<tag>) are not misidentified as named capture groups.
1 parent 236b948 commit e2fffd2

2 files changed

Lines changed: 16 additions & 1 deletion

File tree

apps/sim/lib/chunkers/regex-chunker.test.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,21 @@ describe('RegexChunker', () => {
227227
}
228228
)
229229

230+
it.concurrent('should preserve lookbehind whose body contains a > character', async () => {
231+
const chunker = new RegexChunker({
232+
pattern: '(?<=</section>)',
233+
chunkSize: 1024,
234+
strictBoundaries: true,
235+
})
236+
const text = '<section>one</section><section>two</section><section>three</section>'
237+
const chunks = await chunker.chunk(text)
238+
239+
expect(chunks).toHaveLength(3)
240+
expect(chunks[0].text).toBe('<section>one</section>')
241+
expect(chunks[1].text).toBe('<section>two</section>')
242+
expect(chunks[2].text).toBe('<section>three</section>')
243+
})
244+
230245
it.concurrent('should leave non-capturing groups and lookarounds intact', async () => {
231246
const chunker = new RegexChunker({
232247
pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)',

apps/sim/lib/chunkers/regex-chunker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const logger = createLogger('RegexChunker')
1515

1616
const MAX_PATTERN_LENGTH = 500
1717

18-
const NAMED_GROUP_PREFIX = /^\(\?<[^>]+>/
18+
// Matches the opener of a named capturing group, `(?<name>`, anchored at the
// start of the tested string. The `(?![=!])` lookahead rejects the lookbehind
// openers `(?<=` and `(?<!`, whose bodies may themselves contain a `>`.
const NAMED_GROUP_PREFIX = /^\(\?<(?![=!])[^>]+>/
1919

2020
/**
2121
* Converts unescaped capturing groups `(...)` and named capturing groups

0 commit comments

Comments
 (0)