From a5009b580bb08f5876be73de1b290f956f2cd795 Mon Sep 17 00:00:00 2001 From: Victor Guerra Veloso Date: Fri, 19 Jun 2026 16:45:05 -0400 Subject: [PATCH] Fix regression by reverting to character offsets rather than byte offsets --- .../gen/treesitterng/AbstractTreeSitterNgGenerator.java | 5 +++-- .../treesitterng/AbstractTreeSitterNgGeneratorTest.java | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java b/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java index e099aed8..5a9a1bcd 100644 --- a/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java +++ b/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java @@ -123,8 +123,9 @@ private static int calculateOffset(List<String> contentLines, TSPoint point) { for (int i = 0; i < startRow; i++) { // Each line in contentLines (except maybe the last) was terminated by LF (\n). // If the original was CRLF, the CR (\r) is still at the end of the line string. - // .getBytes().length + 1 correctly counts [LineContent] + [LF]. - offset += contentLines.get(i).getBytes(StandardCharsets.UTF_8).length + 1; + // offset must be a char (UTF-16 code unit) index, matching how consumers (e.g. + // VanillaDiffHtmlBuilder) walk the source, so use .length() and not byte length. 
+ offset += contentLines.get(i).length() + 1; } offset += startColumn; return offset; diff --git a/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java b/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java index bd9ae3c1..65ea5fa6 100644 --- a/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java +++ b/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java @@ -61,14 +61,17 @@ public void testCrlfOffsetConsistency() throws IOException { @Test public void testMultiByteOffsetConsistency() throws IOException { // Line 1: "# 🐍\n" - // '#' (1) + ' ' (1) + '🐍' (4 bytes in UTF-8) + '\n' (1) = 7 bytes total + // '#' (1) + ' ' (1) + '🐍' (2 UTF-16 chars, surrogate pair) + '\n' (1) = 5 chars total // Line 2: "x = 1" + // Offsets must be char-based (UTF-16 code units) to match how the rest of GumTree + // (e.g. AbstractJdtVisitor, VanillaDiffHtmlBuilder) indexes source text, not UTF-8 bytes. String content = "# 🐍\nx = 1"; TreeContext ctx = generator.generateFrom().string(content); Tree xAssignment = ctx.getRoot().getChild(1); assertEquals("expression_statement", xAssignment.getType().name); - assertEquals(7, xAssignment.getPos(), "Line 2 should start at byte offset 7 after a 4-byte emoji and LF"); + assertEquals(5, xAssignment.getPos(), + "Line 2 should start at char offset 5 after a surrogate-pair emoji and LF"); } @Test