From 288f983bf27d74c9d8755811c8f06532eab8bc34 Mon Sep 17 00:00:00 2001 From: Nate Date: Fri, 20 Feb 2026 23:55:42 -0500 Subject: [PATCH] perf(parser): fast-path name tokenization to avoid TextDecoder Skip intermediate number[] array, Uint8Array allocation, and TextDecoder.decode() for the 99%+ of PDF names that contain no #XX hex escapes. Build string directly via String.fromCharCode loop. 5-10% improvement on parsing benchmarks (CPU profile showed readName + TextDecoder.decode at ~20% of total parse time). --- src/parser/token-reader.test.ts | 21 +++++++++++++++++ src/parser/token-reader.ts | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/src/parser/token-reader.test.ts b/src/parser/token-reader.test.ts index e5a8ee0..1ddf083 100644 --- a/src/parser/token-reader.test.ts +++ b/src/parser/token-reader.test.ts @@ -323,6 +323,27 @@ describe("TokenReader", () => { expect(token).toMatchObject({ type: "name", value: "Test#5" }); }); + it("uses fast path for plain ASCII name", () => { + const r = reader("/Type"); + const token = r.nextToken(); + + expect(token).toMatchObject({ type: "name", value: "Type" }); + }); + + it("falls back to slow path when # is first char", () => { + const r = reader("/#48ello"); // #48 = 'H' + const token = r.nextToken(); + + expect(token).toMatchObject({ type: "name", value: "Hello" }); + }); + + it("falls back to slow path when # appears mid-name", () => { + const r = reader("/Type#20Name"); // #20 = space + const token = r.nextToken(); + + expect(token).toMatchObject({ type: "name", value: "Type Name" }); + }); + it("stops at whitespace", () => { const r = reader("/Type /Page"); diff --git a/src/parser/token-reader.ts b/src/parser/token-reader.ts index e338925..b506998 100644 --- a/src/parser/token-reader.ts +++ b/src/parser/token-reader.ts @@ -284,6 +284,48 @@ export class TokenReader { // Skip the leading / this.scanner.advance(); + const data = this.scanner.bytes; + const start = this.scanner.position; + let pos = start; + const len = data.length; + + // Fast path: scan for end of name, checking for # escapes + let hasEscape = false; + + while (pos < len) { + const byte = data[pos]; + + if (!isRegularChar(byte)) { + break; + } + + if (byte === CHAR_HASH) { + hasEscape = true; + break; + } + + pos++; + } + + if (!hasEscape) { + // Common case: pure ASCII name with no escapes. + // Build string directly from byte range — no intermediate array, + // no Uint8Array allocation, no TextDecoder. + this.scanner.moveTo(pos); + + let value = ""; + + for (let i = start; i < pos; i++) { + value += String.fromCharCode(data[i]); + } + + return { type: "name", value, position }; + } + + // Slow path: name contains # escapes, need byte-by-byte processing. + // Reset scanner to start and use the original accumulation approach. + this.scanner.moveTo(start); + const bytes: number[] = []; while (true) {