From ec8332fd0a59a62ca4e6966cc97d48aedaa707c0 Mon Sep 17 00:00:00 2001
From: Juliano Martinez
Date: Sun, 10 May 2026 15:40:57 -0700
Subject: [PATCH] preallocate scanner tokens

---
Note (ignored when the patch is applied):
`scan` now reserves capacity in the token list before appending `.stream_start`.
`initialTokenCapacity` returns null for inputs shorter than
`min_preallocated_input_bytes` (1024), so no reservation is made for them.
Otherwise it returns `input_len / token_capacity_input_divisor` (4), capped at
`max_initial_token_capacity` (8 * 1024 = 8192).
The new test pins three points of that mapping: 97 gives null, 30_036 gives
7509, and 1_000_000 gives 8192.

 src/scanner/scanner.zig | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/scanner/scanner.zig b/src/scanner/scanner.zig
index e451ba14..b4a90eab 100644
--- a/src/scanner/scanner.zig
+++ b/src/scanner/scanner.zig
@@ -27,6 +27,10 @@ const isMappingSeparatorAt = lex.isMappingSeparatorAt;
 const isSeparatedIndicatorAt = lex.isSeparatedIndicatorAt;
 const lineAt = lex.lineAt;
 
+const min_preallocated_input_bytes: usize = 1024;
+const token_capacity_input_divisor: usize = 4;
+const max_initial_token_capacity: usize = 8 * 1024;
+
 /// Tokenizes a YAML byte stream into lexical tokens.
 ///
 /// UTF-8 input is borrowed for the lifetime of the returned `TokenStream`.
@@ -40,6 +44,10 @@ pub fn scan(allocator: std.mem.Allocator, input: []const u8) Error!TokenStream {
     var tokens: std.ArrayList(Token) = .empty;
     defer tokens.deinit(arena_allocator);
 
+    if (initialTokenCapacity(utf8_input.len)) |capacity| {
+        try tokens.ensureTotalCapacity(arena_allocator, capacity);
+    }
+
     try tokens.append(arena_allocator, .stream_start);
 
     var scanner_state: Scanner = .{
@@ -414,6 +422,11 @@ fn flowDepth(scanner: anytype) usize {
     return scanner.square_depth + scanner.curly_depth;
 }
 
+fn initialTokenCapacity(input_len: usize) ?usize {
+    if (input_len < min_preallocated_input_bytes) return null;
+    return @min(max_initial_token_capacity, input_len / token_capacity_input_divisor);
+}
+
 fn colonStartsMappingValue(scanner: anytype) bool {
     const in_flow = flowDepth(scanner) != 0;
     if (lex.isMappingSeparatorAt(scanner.input, scanner.index, in_flow)) return true;
@@ -524,3 +537,9 @@ test "scanner indicator: compact sequence entry requires structural context afte
     try std.testing.expectEqual(@as(usize, 2), fake.index);
     try std.testing.expectEqual(@as(usize, 2), fake.tokens.items.len);
 }
+
+test "scanner token capacity estimate stays conservative" {
+    try std.testing.expectEqual(@as(?usize, null), initialTokenCapacity(97));
+    try std.testing.expectEqual(@as(?usize, 7509), initialTokenCapacity(30_036));
+    try std.testing.expectEqual(@as(?usize, 8192), initialTokenCapacity(1_000_000));
+}