From 3aa527d22a4b4a696c26e050b26602339fc5e9d5 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Wed, 18 Feb 2026 12:02:13 +0100 Subject: [PATCH 01/22] buffer offset is now compensated for surrogate pairs --- .../library/lang/json/internal/JsonValueReader.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 20c0283c34..31788fb782 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -487,6 +487,7 @@ private int getOffset() { try { assert posHandler != null; var internalPos = (int) posHandler.get(in); + return tracker.getOffsetAtBufferStart() + internalPos; } catch (IllegalArgumentException | SecurityException e) { @@ -1209,6 +1210,14 @@ public int read(char[] cbuf, int off, int len) throws IOException { // parameters are mapped one-to-one without mutations. var charsRead = in.read(cbuf, off, len); + // for every high surrogate we assume a low surrogate will follow, + // and we count only one of them for the character offset + for (int i = off; i < charsRead + off; i++) { + if (Character.isHighSurrogate(cbuf[i])) { + offset--; + } + } + // the next buffer[0] offset will be after this increment. // Note that `fillBuffer.limit == read.limit` limit = off + charsRead; From 977759a5375b17ccd336ee74f4d1ae6ff4ba4421 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Thu, 19 Feb 2026 12:05:19 +0100 Subject: [PATCH 02/22] added test with unicode surrogate pairs --- .../tests/library/lang/json/unicode.json | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json new file mode 100644 index 0000000000..8b6e44e45f --- /dev/null +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json @@ -0,0 +1,29 @@ +{ + /* €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€ */ + "glossary": { + "title": "exam€le glossary", + "line" : 2, + "GlossDiv": { + "line" : 5, + "title": "S", + "GlossList": { + "line" : 8, + /* 🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕 */ + "GlossEntry": { + "line" : 10, + "ID": "S🍕🍕L", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "line" : 17, + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": ["GML", "XML"] + }, + "GlossSee": "m🍕🍕🍕🍕up" + } + } + } + } +} \ No newline at end of file From 5432aea59c40b46df46ae391d75bb26538b2aa04 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Thu, 19 Feb 2026 12:05:49 +0100 Subject: [PATCH 03/22] initial throw at unicode resilient positions during JSON parsing --- .../lang/json/internal/JsonValueReader.java | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 5ff4463991..29bf2628ba 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1159,9 +1159,14 @@ public static class OriginTrackingReader extends FilterReader { private int offset = 0; // limit is always pointing to the amount of no-junk characters in the underlying buffer below buffer.length private int limit = 0; + // the codepoints array maps char offsets to codepoint offsets + private int[] codepoints = new int[1024]; protected OriginTrackingReader(Reader in) { super(in); + for (int i = 0; i < codepoints.length; i++) { + codepoints[i] = 0; + } } /* This private method from JsonReader must be mirrored by `read` @@ -1195,24 +1200,31 @@ private boolean fillBuffer(int minimum) throws IOException { } */ @Override public int read(char[] cbuf, int off, int len) throws IOException { + assert cbuf.length == codepoints.length; + // Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`. // Moreover, `fillBuffer.limit == reader.off` at the start of this method. // we know take the previous limit and add it to the // offset, to arrive at the new `pos=0` of `buffer[0]`, // rewinding `off` characters which were reused from the previous buffer - // with System.arraycopy. - offset += limit - off; + // with System.arraycopy. The codepoints array maps char offsets to codepoint offsets. + offset += (limit != 0 ? codepoints[limit - 1] : 0) - off; // make sure we are only a facade for the real reader. // parameters are mapped one-to-one without mutations. 
var charsRead = in.read(cbuf, off, len); + // shift the remaining characters to the left + System.arraycopy(codepoints, codepoints.length - off, codepoints, 0, off); + // for every high surrogate we assume a low surrogate will follow, // and we count only one of them for the character offset - for (int i = off; i < charsRead + off; i++) { - if (Character.isHighSurrogate(cbuf[i])) { - offset--; + int shift = 0; + for (int i = 0; i < charsRead + off; i++) { + codepoints[i] = i - shift; + if (Character.isHighSurrogate(cbuf[i])) { + shift++; } } @@ -1224,8 +1236,21 @@ public int read(char[] cbuf, int off, int len) throws IOException { return charsRead; } + /** + * @return the codepoint offset (from the start of the streaming content) + * for the start of the last buffered content (cbuf[0]) + */ public int getOffsetAtBufferStart() { return offset; } + + /** + * @return the codepoint offset (from the start of the streaming content) + * for the character at char position `pos` in the last buffered content. + */ + public int getOffsetAtBufferPos(int pos) { + assert pos < limit; + return offset + codepoints[pos]; + } } } From fd57b975ebd73f3ec82867cdddaca6d42552ffd6 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Thu, 19 Feb 2026 12:33:19 +0100 Subject: [PATCH 04/22] minor improvements --- .../lang/json/internal/JsonValueReader.java | 17 ++++++++--------- .../tests/library/lang/json/JSONIOTests.rsc | 2 +- .../rascal/tests/library/lang/json/unicode.json | 8 ++++---- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 29bf2628ba..788f371d80 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -486,7 +486,7 @@ private int getOffset() { assert posHandler != null; var internalPos = (int) posHandler.get(in); - return tracker.getOffsetAtBufferStart() + internalPos; + return tracker.getOffsetAtBufferPos(internalPos); } catch (IllegalArgumentException | SecurityException e) { // we stop trying to track positions if it fails so hard, @@ -615,7 +615,7 @@ private IValue visitStringAsAbstractData(Type type) throws IOException { private IValue visitObjectAsAbstractData(Type type) throws IOException { Set alternatives = null; - int startPos = getOffset() - 1; + int startPos = Math.max(getOffset() - 1, 0); int startLine = getLine(); int startCol = getCol() - 1; @@ -735,7 +735,7 @@ else if (!explicitDataTypes && "_type".equals(label)) { } } - int endPos = getOffset() - 1; + int endPos = Math.max(getOffset() - 1, 0); assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins; int endLine = getLine(); @@ -803,7 +803,7 @@ public IValue visitNode(Type type) throws IOException { return inferNullValue(nulls, type); } - int startPos = getOffset() - 1; + int startPos = Math.max(getOffset() - 1, 0); int startLine = getLine(); int startCol = getCol() - 1; @@ -840,7 +840,7 @@ public IValue visitNode(Type type) throws IOException { } } - int endPos = getOffset() - 1; 
+ int endPos = Math.max(getOffset() - 1, 0); int endLine = getLine(); int endCol = getCol() - 1; @@ -923,7 +923,7 @@ public IValue visitList(Type type) throws IOException { } in.endArray(); - getOffset(); + return w.done(); } @@ -1209,7 +1209,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { // offset, to arrive at the new `pos=0` of `buffer[0]`, // rewinding `off` characters which were reused from the previous buffer // with System.arraycopy. The codepoints array maps char offsets to codepoint offsets. - offset += (limit != 0 ? codepoints[limit - 1] : 0) - off; + offset += (limit != 0 ? codepoints[limit - 1] + 1 : 0) - off; // make sure we are only a facade for the real reader. // parameters are mapped one-to-one without mutations. @@ -1249,8 +1249,7 @@ public int getOffsetAtBufferStart() { * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { - assert pos < limit; - return offset + codepoints[pos]; + return offset + codepoints[pos == 0 ? 
0 : Math.min(limit - 1, pos)]; } } } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index 65602a9e73..0b8191eb9a 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -109,7 +109,7 @@ bool originTest(loc example) { } test bool originTracking() { - files = [ l | loc l <- |std:///lang/rascal/tests/library/lang/json|.ls, l.extension == "json"]; + files = [ l | loc l <- |std:///lang/rascal/tests/library/lang/json|.ls, l.extension == "json", bprintln(l)]; return (true | it && originTest(example) | loc example <- files); } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json index 8b6e44e45f..d03da2401a 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json @@ -1,7 +1,7 @@ { - /* €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€ */ + /* 🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕 */ "glossary": { - "title": "exam€le glossary", + "title": "example glossary", "line" : 2, "GlossDiv": { "line" : 5, @@ -10,9 +10,9 @@ "line" : 8, /* 🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕 */ "GlossEntry": { - "line" : 10, + "l🍕ne" : 10, "ID": "S🍕🍕L", - "SortAs": "SGML", + "SortAs": "S🍕ML", "GlossTerm": "Standard Generalized Markup Language", "Acronym": "SGML", "Abbrev": "ISO 8879:1986", From 6c64095eca61cb43b4aa0cde9c780935ca1cdad3 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Thu, 19 Feb 2026 12:47:01 +0100 Subject: [PATCH 05/22] minor comment --- .../library/lang/json/internal/JsonValueReader.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 788f371d80..8861933165 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1160,7 +1160,7 @@ public static class OriginTrackingReader extends FilterReader { // limit is always pointing to the amount of no-junk characters in the underlying buffer below buffer.length private int limit = 0; // the codepoints array maps char offsets to codepoint offsets - private int[] codepoints = new int[1024]; + private int[] codepoints = null; protected OriginTrackingReader(Reader in) { super(in); @@ -1200,7 +1200,10 @@ private boolean fillBuffer(int minimum) throws IOException { } */ @Override public int read(char[] cbuf, int off, int len) throws IOException { - assert cbuf.length == codepoints.length; + if (codepoints == null) { + codepoints = new int[cbuf.length]; + assert codepoints[0] == 0; + } // Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`. // Moreover, `fillBuffer.limit == reader.off` at the start of this method. @@ -1226,6 +1229,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { if (Character.isHighSurrogate(cbuf[i])) { shift++; } + // do not assume the low surrogate is in the current buffer yet (boundary condition) } // the next buffer[0] offset will be after this increment. From f1b25c82b15d5e11339da0bf58bb11c215661cf7 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Fri, 20 Feb 2026 10:56:58 +0100 Subject: [PATCH 06/22] working to get unicode columns right --- .../lang/json/internal/JsonValueReader.java | 33 ++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 8861933165..8fee1ffe36 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -524,7 +524,10 @@ private int getCol() { } try { - return ((int) posHandler.get(in)) - ((int) lineStartHandler.get(in)); + assert posHandler != null; + var internalPos = (int) posHandler.get(in); + + return tracker.getColumnAtBufferPos(internalPos); } catch (IllegalArgumentException | SecurityException e) { // stop trying to recover the positions @@ -1161,12 +1164,13 @@ public static class OriginTrackingReader extends FilterReader { private int limit = 0; // the codepoints array maps char offsets to codepoint offsets private int[] codepoints = null; - + // the codepoint position of the current beginning of the line + private int[] columns = null; + // the current column position + private int column = 0; + protected OriginTrackingReader(Reader in) { super(in); - for (int i = 0; i < codepoints.length; i++) { - codepoints[i] = 0; - } } /* This private method from JsonReader must be mirrored by `read` @@ -1205,6 +1209,10 @@ public int read(char[] cbuf, int off, int len) throws IOException { assert codepoints[0] == 0; } + if (columns == null) { + columns = new int[cbuf.length]; + } + // Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`. // Moreover, `fillBuffer.limit == reader.off` at the start of this method. 
@@ -1220,16 +1228,27 @@ public int read(char[] cbuf, int off, int len) throws IOException { // shift the remaining characters to the left System.arraycopy(codepoints, codepoints.length - off, codepoints, 0, off); + System.arraycopy(columns, codepoints.length - off, columns, 0, off); // for every high surrogate we assume a low surrogate will follow, // and we count only one of them for the character offset int shift = 0; + int columnShift = 0; for (int i = 0; i < charsRead + off; i++) { codepoints[i] = i - shift; if (Character.isHighSurrogate(cbuf[i])) { shift++; + columnShift++; } // do not assume the low surrogate is in the current buffer yet (boundary condition) + // TODO add code for recording that we saw a high surrogate here at the end of the buffer + + columns[i] = column - columnShift; + if (cbuf[i] == '\n') { + column = 0; + columnShift = 0; + } + column++; } // the next buffer[0] offset will be after this increment. @@ -1255,5 +1274,9 @@ public int getOffsetAtBufferStart() { public int getOffsetAtBufferPos(int pos) { return offset + codepoints[pos == 0 ? 0 : Math.min(limit - 1, pos)]; } + + public int getColumnAtBufferPos(int pos) { + return pos >= columns.length ? column : columns[pos]; + } } } From 39a0ef1e73b7db23c86f8bcd806fb8437ae1cc83 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Fri, 20 Feb 2026 11:11:08 +0100 Subject: [PATCH 07/22] minor fix --- .../lang/json/internal/JsonValueReader.java | 49 +++++++++++++------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 8fee1ffe36..863350c4cc 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -498,16 +498,17 @@ private int getOffset() { private int getLine() { if (stopTracking) { - return 0; + return 1; } try { - return (int) lineHandler.get(in) + 1; + var internalPos = (int) posHandler.get(in); + return tracker.getLineAtBufferPos(internalPos); } catch (IllegalArgumentException | SecurityException e) { // stop trying to recover the positions stopTracking = true; - return 0; + return 1; } } @@ -1166,8 +1167,12 @@ public static class OriginTrackingReader extends FilterReader { private int[] codepoints = null; // the codepoint position of the current beginning of the line private int[] columns = null; + private int[] lines = null; + // the current column position private int column = 0; + // the current line + private int line = 1; protected OriginTrackingReader(Reader in) { super(in); @@ -1204,14 +1209,7 @@ private boolean fillBuffer(int minimum) throws IOException { } */ @Override public int read(char[] cbuf, int off, int len) throws IOException { - if (codepoints == null) { - codepoints = new int[cbuf.length]; - assert codepoints[0] == 0; - } - - if (columns == null) { - columns = new int[cbuf.length]; - } + initializeBuffers(cbuf); // Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`. // Moreover, `fillBuffer.limit == reader.off` at the start of this method. 
@@ -1228,7 +1226,8 @@ public int read(char[] cbuf, int off, int len) throws IOException { // shift the remaining characters to the left System.arraycopy(codepoints, codepoints.length - off, codepoints, 0, off); - System.arraycopy(columns, codepoints.length - off, columns, 0, off); + System.arraycopy(columns, columns.length - off, columns, 0, off); + System.arraycopy(lines, lines.length - off, lines, 0, off); // for every high surrogate we assume a low surrogate will follow, // and we count only one of them for the character offset @@ -1244,7 +1243,9 @@ public int read(char[] cbuf, int off, int len) throws IOException { // TODO add code for recording that we saw a high surrogate here at the end of the buffer columns[i] = column - columnShift; + lines[i] = line; if (cbuf[i] == '\n') { + line++; column = 0; columnShift = 0; } @@ -1259,6 +1260,20 @@ public int read(char[] cbuf, int off, int len) throws IOException { return charsRead; } + private void initializeBuffers(char[] cbuf) { + if (codepoints == null) { + codepoints = new int[cbuf.length]; + } + + if (columns == null) { + columns = new int[cbuf.length]; + } + + if (lines == null) { + lines = new int[cbuf.length]; + } + } + /** * @return the codepoint offset (from the start of the streaming content) * for the start of the last buffered content (cbuf[0]) @@ -1272,11 +1287,15 @@ public int getOffsetAtBufferStart() { * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { - return offset + codepoints[pos == 0 ? 0 : Math.min(limit - 1, pos)]; + return pos >= limit ? offset : codepoints[pos]; } public int getColumnAtBufferPos(int pos) { - return pos >= columns.length ? column : columns[pos]; - } + return pos >= limit ? column : columns[pos]; + } + + public int getLineAtBufferPos(int pos) { + return pos >= limit ? line : lines[pos]; + } } } From d21bc29e3e9bd006afad94595b261895e5d1cfd1 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Fri, 20 Feb 2026 11:17:33 +0100 Subject: [PATCH 08/22] fixed line markup in unicode example for testing --- .../lang/rascal/tests/library/lang/json/JSONIOTests.rsc | 4 ++++ .../lang/rascal/tests/library/lang/json/unicode.json | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index 0b8191eb9a..a91d54006b 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -100,8 +100,12 @@ bool originTest(loc example) { for ( <- poss) { assert content[p.offset] == "{"; // all nodes start with a { assert content[p.offset + p.length - 1] == "}"; // all nodes end with a } + println(p.begin.line); + println(line); assert p.begin.line == line; + println(lines[p.begin.line - 1][p.begin.column]); assert lines[p.begin.line - 1][p.begin.column] == "{"; + println(lines[p.end.line - 1][p.end.column - 1]); assert lines[p.end.line - 1][p.end.column - 1] == "}"; } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json index d03da2401a..a3df2a4508 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json @@ -7,17 +7,17 @@ "line" : 5, "title": "S", "GlossList": { - "line" : 8, + "line" : 9, /* 🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕 */ "GlossEntry": { - "l🍕ne" : 10, + "line" : 12, "ID": "S🍕🍕L", "SortAs": "S🍕ML", "GlossTerm": "Standard Generalized Markup Language", "Acronym": "SGML", "Abbrev": "ISO 8879:1986", "GlossDef": { - "line" : 17, + "line" : 19, "para": "A meta-markup language, used to create markup languages such as DocBook.", "GlossSeeAlso": ["GML", "XML"] }, From 
e1be9a14ef2395bb83ac638b672082697b15cbe1 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Sun, 22 Feb 2026 11:31:12 +0100 Subject: [PATCH 09/22] gettin the off-by-ones under control --- .../lang/json/internal/JsonValueReader.java | 62 +++++++++++-------- .../tests/library/lang/json/JSONIOTests.rsc | 4 -- .../tests/library/lang/json/unicode.json | 4 +- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 863350c4cc..ee37b2cbb5 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1161,19 +1161,21 @@ public IValue read(Reader in, Type expected) throws IOException { public static class OriginTrackingReader extends FilterReader { // offset is always pointing at the point in the file where JsonReader.pos == 0 private int offset = 0; - // limit is always pointing to the amount of no-junk characters in the underlying buffer below buffer.length + // limit is always pointing to the amount of no-junk chars in the underlying buffer below buffer.length private int limit = 0; // the codepoints array maps char offsets to codepoint offsets private int[] codepoints = null; - // the codepoint position of the current beginning of the line + // columns maps char offsets to codepoint column positions private int[] columns = null; + // lines maps char offsets to codepoint line numbers private int[] lines = null; // the current column position private int column = 0; // the current line private int line = 1; - + // the current amount of high surrogates counted in this buffer + protected OriginTrackingReader(Reader in) { super(in); } @@ -1216,40 +1218,39 @@ public int read(char[] cbuf, int off, int len) throws IOException { // we know take the previous limit and add it to the // offset, to arrive at the new `pos=0` of `buffer[0]`, - 
// rewinding `off` characters which were reused from the previous buffer - // with System.arraycopy. The codepoints array maps char offsets to codepoint offsets. offset += (limit != 0 ? codepoints[limit - 1] + 1 : 0) - off; // make sure we are only a facade for the real reader. // parameters are mapped one-to-one without mutations. var charsRead = in.read(cbuf, off, len); - // shift the remaining characters to the left - System.arraycopy(codepoints, codepoints.length - off, codepoints, 0, off); - System.arraycopy(columns, columns.length - off, columns, 0, off); - System.arraycopy(lines, lines.length - off, lines, 0, off); + shiftRemaindersLeft(off); - // for every high surrogate we assume a low surrogate will follow, - // and we count only one of them for the character offset - int shift = 0; - int columnShift = 0; - for (int i = 0; i < charsRead + off; i++) { + // reconstruct the character shift from the start of the new buffer to the offset we kept: + int shift = off == 0 ? 0 : off - codepoints[off - 1]; + int columnShift = off == 0 ? 0 : off - (columns[off - 1] - columns[0]); + + for (int i = off; i < charsRead + off; i++) { codepoints[i] = i - shift; if (Character.isHighSurrogate(cbuf[i])) { + // for every high surrogate we assume a low surrogate will follow, + // and we count only one of them for the character offset by increasing `shift` shift++; columnShift++; + // do not assume the low surrogate is in the current buffer yet (boundary condition) } - // do not assume the low surrogate is in the current buffer yet (boundary condition) - // TODO add code for recording that we saw a high surrogate here at the end of the buffer - + columns[i] = column - columnShift; lines[i] = line; + if (cbuf[i] == '\n') { line++; column = 0; columnShift = 0; } - column++; + else { + column++; + } } // the next buffer[0] offset will be after this increment. 
@@ -1260,18 +1261,28 @@ public int read(char[] cbuf, int off, int len) throws IOException { return charsRead; } + private void shiftRemaindersLeft(int off) { + if (off > 0) { + System.arraycopy(codepoints, codepoints.length - off, codepoints, 0, off); + System.arraycopy(columns, columns.length - off, columns, 0, off); + System.arraycopy(lines, lines.length - off, lines, 0, off); + } + } + private void initializeBuffers(char[] cbuf) { if (codepoints == null) { - codepoints = new int[cbuf.length]; - } + assert columns == null; + assert lines == null; - if (columns == null) { + codepoints = new int[cbuf.length]; columns = new int[cbuf.length]; - } - - if (lines == null) { lines = new int[cbuf.length]; } + + // nothing else changed in the mean time, especially not the length of cbuf. + assert codepoints.length == cbuf.length; + assert columns.length == cbuf.length; + assert lines.length == cbuf.length; } /** @@ -1282,12 +1293,13 @@ public int getOffsetAtBufferStart() { return offset; } + int prevPos = -1; /** * @return the codepoint offset (from the start of the streaming content) * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { - return pos >= limit ? offset : codepoints[pos]; + return pos >= limit ? 
offset : offset + codepoints[pos]; } public int getColumnAtBufferPos(int pos) { diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index a91d54006b..0b8191eb9a 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -100,12 +100,8 @@ bool originTest(loc example) { for ( <- poss) { assert content[p.offset] == "{"; // all nodes start with a { assert content[p.offset + p.length - 1] == "}"; // all nodes end with a } - println(p.begin.line); - println(line); assert p.begin.line == line; - println(lines[p.begin.line - 1][p.begin.column]); assert lines[p.begin.line - 1][p.begin.column] == "{"; - println(lines[p.end.line - 1][p.end.column - 1]); assert lines[p.end.line - 1][p.end.column - 1] == "}"; } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json index a3df2a4508..ff8557d334 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json @@ -2,9 +2,9 @@ /* 🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕 */ "glossary": { "title": "example glossary", - "line" : 2, + "line" : 3, "GlossDiv": { - "line" : 5, + "line" : 6, "title": "S", "GlossList": { "line" : 9, From d842105fa8442fd211eff6b1a022bbc6cb230694 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Mon, 23 Feb 2026 11:35:27 +0100 Subject: [PATCH 10/22] cleanup, refactoring and documentation, plus corrections --- .../lang/json/internal/JsonValueReader.java | 77 +++++++++++++------ 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index ee37b2cbb5..6516d056c2 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1159,15 +1159,22 @@ public IValue read(Reader in, Type expected) throws IOException { * current position in the buffer (the private field `pos` of JsonReader). */ public static class OriginTrackingReader extends FilterReader { - // offset is always pointing at the point in the file where JsonReader.pos == 0 + // TODO: some of these fields may be derived from one another for speed or space reasons. + + // offset is a codepoint counter which always represents the point in the file where JsonReader.pos == 0 private int offset = 0; + // shift is the amount of high surrogate pairs encountered so far + private int shift = 0; + // columnShift is the amount of high surrogate pairs encountered on the current line + private int columnShift = 0; // limit is always pointing to the amount of no-junk chars in the underlying buffer below buffer.length private int limit = 0; - // the codepoints array maps char offsets to codepoint offsets + // the codepoints array maps char offsets to codepoint offsets. 
private int[] codepoints = null; // columns maps char offsets to codepoint column positions private int[] columns = null; // lines maps char offsets to codepoint line numbers + // TODO: lines may be superfluous if GsonReader can compute line numbers accurately for Unicode content already private int[] lines = null; // the current column position @@ -1213,25 +1220,49 @@ private boolean fillBuffer(int minimum) throws IOException { public int read(char[] cbuf, int off, int len) throws IOException { initializeBuffers(cbuf); - // Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`. - // Moreover, `fillBuffer.limit == reader.off` at the start of this method. - - // we know take the previous limit and add it to the - // offset, to arrive at the new `pos=0` of `buffer[0]`, - offset += (limit != 0 ? codepoints[limit - 1] + 1 : 0) - off; + // `codepoints[limit - 1] - 1` is the offset of the last character read with the previous call to read. + // So the new offset starts there. We look back `off` chars because of possible left-overs before the limit. + offset += (limit != 0 ? codepoints[limit - off - 1] + 1 : 0); // make sure we are only a facade for the real reader. // parameters are mapped one-to-one without mutations. var charsRead = in.read(cbuf, off, len); + // now we simulate exactly what JsonReader does to `cbuf` on our administration of surrogate pairs: shiftRemaindersLeft(off); - // reconstruct the character shift from the start of the new buffer to the offset we kept: - int shift = off == 0 ? 0 : off - codepoints[off - 1]; - int columnShift = off == 0 ? 0 : off - (columns[off - 1] - columns[0]); + // the next buffer[0] offset will be after this increment. 
+ // Note that `fillBuffer.limit == read.limit` + limit = off + charsRead; - for (int i = off; i < charsRead + off; i++) { + // and then we can fill our administrsation of surrogate pairs quickly + precomputeSurrogatePairCompensation(cbuf, off, limit); + + // and return only the number of characters read. + return charsRead; + } + + /** + * The cbuf char buffer may contain "surrogate pairs", where two int16 chars represent + * one unicode codepoint. We want to count in codepoints so we store here for every + * character in cbuf what it's codepoint offset is, what its codepoint column is + * and what its codepoint line is. + * + * Later when the JSONValueReader needs to know "current positions", this OriginTrackerReader + * will have the answers stored in its buffers. + * + * @param cbuf + * @param off + * @param charsRead + */ + private void precomputeSurrogatePairCompensation(char[] cbuf, int off, int limit) { + // NB we assume here that the remainder of the content pos..limit has already been shifted to cbuf[0]; + // So codepoints[0..off], columns[0..off] and lines[0..off] have been filled already. + for (int i = off; i < limit; i++) { codepoints[i] = i - shift; + columns[i] = column - columnShift; + lines[i] = line; + if (Character.isHighSurrogate(cbuf[i])) { // for every high surrogate we assume a low surrogate will follow, // and we count only one of them for the character offset by increasing `shift` @@ -1239,11 +1270,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { columnShift++; // do not assume the low surrogate is in the current buffer yet (boundary condition) } - - columns[i] = column - columnShift; - lines[i] = line; - - if (cbuf[i] == '\n') { + else if (cbuf[i] == '\n') { line++; column = 0; columnShift = 0; @@ -1252,13 +1279,6 @@ public int read(char[] cbuf, int off, int len) throws IOException { column++; } } - - // the next buffer[0] offset will be after this increment. 
- // Note that `fillBuffer.limit == read.limit` - limit = off + charsRead; - - // and return only the number of characters read. - return charsRead; } private void shiftRemaindersLeft(int off) { @@ -1293,7 +1313,6 @@ public int getOffsetAtBufferStart() { return offset; } - int prevPos = -1; /** * @return the codepoint offset (from the start of the streaming content) * for the character at char position `pos` in the last buffered content. @@ -1302,10 +1321,18 @@ public int getOffsetAtBufferPos(int pos) { return pos >= limit ? offset : offset + codepoints[pos]; } + /** + * @return the codepoint column (from the start of the current line) + * for the character at char position `pos` in the last buffered content. + */ public int getColumnAtBufferPos(int pos) { return pos >= limit ? column : columns[pos]; } + /** + * @return the codepoint line (from the start of the entire content) + * for the character at char position `pos` in the last buffered content. + */ public int getLineAtBufferPos(int pos) { return pos >= limit ? line : lines[pos]; } From 31f1a8ae19bc0895bfea1b09fabb5286c072ecf3 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Mon, 23 Feb 2026 11:42:29 +0100 Subject: [PATCH 11/22] cleanup --- .../library/lang/json/internal/JsonValueReader.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 6516d056c2..8d2dbf8a25 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1169,7 +1169,7 @@ public static class OriginTrackingReader extends FilterReader { private int columnShift = 0; // limit is always pointing to the amount of no-junk chars in the underlying buffer below buffer.length private int limit = 0; - // the codepoints array maps char offsets to codepoint offsets. 
+ // the codepoints array maps char offsets to the number of codepoints since the start of the buffer private int[] codepoints = null; // columns maps char offsets to codepoint column positions private int[] columns = null; @@ -1222,9 +1222,9 @@ public int read(char[] cbuf, int off, int len) throws IOException { // `codepoints[limit - 1] - 1` is the offset of the last character read with the previous call to read. // So the new offset starts there. We look back `off` chars because of possible left-overs before the limit. - offset += (limit != 0 ? codepoints[limit - off - 1] + 1 : 0); + offset += (limit == 0 ? 0 : codepoints[limit - off - 1] + 1); - // make sure we are only a facade for the real reader. + // make sure we are only a transparant facade for the real reader. // parameters are mapped one-to-one without mutations. var charsRead = in.read(cbuf, off, len); @@ -1235,7 +1235,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { // Note that `fillBuffer.limit == read.limit` limit = off + charsRead; - // and then we can fill our administrsation of surrogate pairs quickly + // and then we can fill our administration of surrogate pairs quickly precomputeSurrogatePairCompensation(cbuf, off, limit); // and return only the number of characters read. From 09bbbb35c121100bfd1ed8a1fbb8e179c49244c8 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Mon, 23 Feb 2026 12:19:00 +0100 Subject: [PATCH 12/22] working on another bug --- .../lang/json/internal/JsonValueReader.java | 29 ++++++++----------- .../tests/library/lang/json/JSONIOTests.rsc | 2 +- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 8d2dbf8a25..32944cef68 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -807,11 +807,12 @@ public IValue visitNode(Type type) throws IOException { return inferNullValue(nulls, type); } - int startPos = Math.max(getOffset() - 1, 0); + in.beginObject(); + + int startPos = Math.max(getOffset(), 0); int startLine = getLine(); - int startCol = getCol() - 1; + int startCol = getCol(); - in.beginObject(); Map kws = new HashMap<>(); Map args = new HashMap<>(); @@ -844,11 +845,13 @@ public IValue visitNode(Type type) throws IOException { } } - int endPos = Math.max(getOffset() - 1, 0); + + in.endObject(); + + int endPos = Math.max(getOffset(), 0); int endLine = getLine(); int endCol = getCol() - 1; - in.endObject(); if (trackOrigins && !stopTracking) { kws.put(kws.containsKey("src") ? "rascal-src" : "src", @@ -1222,7 +1225,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { // `codepoints[limit - 1] - 1` is the offset of the last character read with the previous call to read. // So the new offset starts there. We look back `off` chars because of possible left-overs before the limit. - offset += (limit == 0 ? 0 : codepoints[limit - off - 1] + 1); + offset += (limit == 0 ? 0 : codepoints[limit - off - 1] + 1) ; // make sure we are only a transparant facade for the real reader. // parameters are mapped one-to-one without mutations. 
@@ -1305,20 +1308,12 @@ private void initializeBuffers(char[] cbuf) { assert lines.length == cbuf.length; } - /** - * @return the codepoint offset (from the start of the streaming content) - * for the start of the last buffered content (cbuf[0]) - */ - public int getOffsetAtBufferStart() { - return offset; - } - /** * @return the codepoint offset (from the start of the streaming content) * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { - return pos >= limit ? offset : offset + codepoints[pos]; + return (pos >= limit) ? offset + codepoints[pos - 1] + 1 : (offset + codepoints[pos]); } /** @@ -1326,7 +1321,7 @@ public int getOffsetAtBufferPos(int pos) { * for the character at char position `pos` in the last buffered content. */ public int getColumnAtBufferPos(int pos) { - return pos >= limit ? column : columns[pos]; + return (pos >= limit) ? column : columns[pos]; } /** @@ -1334,7 +1329,7 @@ public int getColumnAtBufferPos(int pos) { * for the character at char position `pos` in the last buffered content. */ public int getLineAtBufferPos(int pos) { - return pos >= limit ? line : lines[pos]; + return (pos >= limit) ? 
line : lines[pos]; } } } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index 0b8191eb9a..24220d9276 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -111,7 +111,7 @@ bool originTest(loc example) { test bool originTracking() { files = [ l | loc l <- |std:///lang/rascal/tests/library/lang/json|.ls, l.extension == "json", bprintln(l)]; - return (true | it && originTest(example) | loc example <- files); + return (true | it && originTest(example) | loc example <- files, bprintln(example)); } value numNormalizer(int i) = i % maxLong when abs(i) > maxLong; From eb402e302c38c7d1b76f41ea23c9a1483217653c Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Mon, 23 Feb 2026 12:21:03 +0100 Subject: [PATCH 13/22] fixed boundary condition for getOffset --- .../rascalmpl/library/lang/json/internal/JsonValueReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 32944cef68..259281ad86 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1313,7 +1313,7 @@ private void initializeBuffers(char[] cbuf) { * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { - return (pos >= limit) ? offset + codepoints[pos - 1] + 1 : (offset + codepoints[pos]); + return (pos >= limit) ? (offset + codepoints[pos - 1] + 1) : (offset + codepoints[pos]); } /** From df0a5c427b6cd39b117849c37fcdb396ba6d1412 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Tue, 24 Feb 2026 10:12:14 +0100 Subject: [PATCH 14/22] fixed all tests --- .../lang/json/internal/JsonValueReader.java | 35 ++++++++++--------- .../tests/library/lang/json/JSONIOTests.rsc | 3 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 259281ad86..af8d542ff0 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -485,8 +485,10 @@ private int getOffset() { try { assert posHandler != null; var internalPos = (int) posHandler.get(in); - - return tracker.getOffsetAtBufferPos(internalPos); + // System.err.println(" pos: " + internalPos); + var o = tracker.getOffsetAtBufferPos(internalPos); + // System.err.println("offset: " + o); + return o; } catch (IllegalArgumentException | SecurityException e) { // we stop trying to track positions if it fails so hard, @@ -619,12 +621,13 @@ private IValue visitStringAsAbstractData(Type type) throws IOException { private IValue visitObjectAsAbstractData(Type type) throws IOException { Set alternatives = null; - int startPos = Math.max(getOffset() - 1, 0); + int startPos = Math.max(getOffset() - 1 /* pos cursor is at { */, 0); int startLine = getLine(); int startCol = getCol() - 1; in.beginObject(); - + + // use explicit information in the JSON to select and filter constructors from the TypeStore // we expect always to have the field _constructor before _type. 
if (explicitConstructorNames || explicitDataTypes) { @@ -739,14 +742,14 @@ else if (!explicitDataTypes && "_type".equals(label)) { } } + int endPos = Math.max(getOffset() - 1, 0); assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins; - int endLine = getLine(); int endCol = getCol() - 1; in.endObject(); - + for (int i = 0; i < args.length; i++) { if (args[i] == null) { throw parseErrorHere( @@ -807,13 +810,12 @@ public IValue visitNode(Type type) throws IOException { return inferNullValue(nulls, type); } - in.beginObject(); - - int startPos = Math.max(getOffset(), 0); + int startPos = Math.max(getOffset() - 1, 0); int startLine = getLine(); - int startCol = getCol(); + int startCol = getCol() - 1; + + in.beginObject(); - Map kws = new HashMap<>(); Map args = new HashMap<>(); @@ -845,13 +847,11 @@ public IValue visitNode(Type type) throws IOException { } } - - in.endObject(); - - int endPos = Math.max(getOffset(), 0); + int endPos = Math.max(getOffset() - 1, 0); int endLine = getLine(); int endCol = getCol() - 1; + in.endObject(); if (trackOrigins && !stopTracking) { kws.put(kws.containsKey("src") ? "rascal-src" : "src", @@ -917,7 +917,7 @@ public IValue visitList(Type type) throws IOException { } IListWriter w = vf.listWriter(); - getOffset(); + in.beginArray(); while (in.hasNext()) { // here we pass label from the higher context @@ -1225,7 +1225,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { // `codepoints[limit - 1] - 1` is the offset of the last character read with the previous call to read. // So the new offset starts there. We look back `off` chars because of possible left-overs before the limit. - offset += (limit == 0 ? 0 : codepoints[limit - off - 1] + 1) ; + offset += (limit == 0 ? 0 : codepoints[Math.max(0, limit - off - 1)] + 1) ; // make sure we are only a transparant facade for the real reader. // parameters are mapped one-to-one without mutations. 
@@ -1313,6 +1313,7 @@ private void initializeBuffers(char[] cbuf) { * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { + // System.err.println("limit: " + pos); return (pos >= limit) ? (offset + codepoints[pos - 1] + 1) : (offset + codepoints[pos]); } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index 24220d9276..47a61e5529 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -98,7 +98,8 @@ bool originTest(loc example) { poss = [ | /node x := ex2, x.line?]; // every node has a .src field, otherwise this fails with an exception for ( <- poss) { - assert content[p.offset] == "{"; // all nodes start with a { + println(p); + assert content[p.offset] == "{" : "content(): "; // all nodes start with a { assert content[p.offset + p.length - 1] == "}"; // all nodes end with a } assert p.begin.line == line; assert lines[p.begin.line - 1][p.begin.column] == "{"; From 38e625a30fa89233a9652fdee5739a8b736bbbdd Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Tue, 24 Feb 2026 10:18:52 +0100 Subject: [PATCH 15/22] added new failing tests for boundary conditions with unicode origins --- .../tests/library/lang/json/JSONIOTests.rsc | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index 47a61e5529..5c35a5cfc3 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -358,4 +358,37 @@ bool jsonVerifyOriginCorrectAcrossBufferBoundaries(int sSize) { } return true; -} \ No newline at end of file +} + +test bool jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries() { + /* twice just before and after the 1024 buffer size of JsonReader */ + for (int sSize <- [1000..1025] + [2000..2050]) { + // we try different shapes across the boundary, where also the high surrogate and + // the low surrogate will end up on either side of the buffer limit + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, false, false); + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, true, false); + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, false, true); + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, true, true); + } + return true; +} + +bool jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(int sSize, bool offbyoneChar, bool switchBackAndForth) { + ref = v1(x=123456789); + refExpected = asJSON(ref); + + t1 = [v1(s="a<}>🍕a<}><}>"), ref]; + writeJSON(|memory:///test.json|, t1); + + //s this throws exceptions and asserts if there are bugs with the + // origin tracker. 
In particular it triggers #2633 + v = readJSON(#list[X],|memory:///test.json|, trackOrigins=true); + + // checking the last element + if (refExpected != readFile(v[1].src)) { + println("Failed for : != "); + return false; + } + + return true; +} From b0ce362ac218b467e94aa75ba80b8b2530b0600c Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Tue, 24 Feb 2026 10:38:50 +0100 Subject: [PATCH 16/22] fixed specific unicode offset issues --- .../lang/json/internal/JsonValueReader.java | 3 +++ .../tests/library/lang/json/JSONIOTests.rsc | 20 +++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index af8d542ff0..7f89aa8d4b 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1226,6 +1226,9 @@ public int read(char[] cbuf, int off, int len) throws IOException { // `codepoints[limit - 1] - 1` is the offset of the last character read with the previous call to read. // So the new offset starts there. We look back `off` chars because of possible left-overs before the limit. offset += (limit == 0 ? 0 : codepoints[Math.max(0, limit - off - 1)] + 1) ; + // the accumlated shift is present in the previous offset, so we start from scratch now. + shift = 0; + columnShift = 0; // make sure we are only a transparant facade for the real reader. // parameters are mapped one-to-one without mutations. 
diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index 5c35a5cfc3..06bd9ea9ac 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -98,21 +98,20 @@ bool originTest(loc example) { poss = [ | /node x := ex2, x.line?]; // every node has a .src field, otherwise this fails with an exception for ( <- poss) { - println(p); - assert content[p.offset] == "{" : "content(): "; // all nodes start with a { - assert content[p.offset + p.length - 1] == "}"; // all nodes end with a } - assert p.begin.line == line; - assert lines[p.begin.line - 1][p.begin.column] == "{"; - assert lines[p.end.line - 1][p.end.column - 1] == "}"; + assert content[p.offset] == "{" : "

"; // all nodes start with a { + assert content[p.offset + p.length - 1] == "}" : "

"; // all nodes end with a } + assert p.begin.line == line : "

"; + assert lines[p.begin.line - 1][p.begin.column] == "{" : "

"; + assert lines[p.end.line - 1][p.end.column - 1] == "}" : "

"; } return true; } test bool originTracking() { - files = [ l | loc l <- |std:///lang/rascal/tests/library/lang/json|.ls, l.extension == "json", bprintln(l)]; + files = [ l | loc l <- |std:///lang/rascal/tests/library/lang/json|.ls, l.extension == "json"]; - return (true | it && originTest(example) | loc example <- files, bprintln(example)); + return (true | it && originTest(example) | loc example <- files); } value numNormalizer(int i) = i % maxLong when abs(i) > maxLong; @@ -376,8 +375,9 @@ test bool jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries() { bool jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(int sSize, bool offbyoneChar, bool switchBackAndForth) { ref = v1(x=123456789); refExpected = asJSON(ref); - + t1 = [v1(s="a<}>🍕a<}><}>"), ref]; + writeJSON(|memory:///test.json|, t1); //s this throws exceptions and asserts if there are bugs with the @@ -386,7 +386,7 @@ bool jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(int sSize, bool offbyo // checking the last element if (refExpected != readFile(v[1].src)) { - println("Failed for : != "); + println("Failed for : != , offbyoneChar: , switchBackAndForth: "); return false; } From 0c29e5a870b5bc03a69abadda114517074923a3f Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Tue, 24 Feb 2026 10:40:05 +0100 Subject: [PATCH 17/22] cleanup debug code --- .../library/lang/json/internal/JsonValueReader.java | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 7f89aa8d4b..c0c35fb4cc 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -83,8 +83,6 @@ public class JsonValueReader { private final IRascalMonitor monitor; private final ISourceLocation src; private VarHandle posHandler; - private VarHandle lineHandler; - private VarHandle lineStartHandler; /* options */ private ThreadLocal format; @@ -96,7 +94,6 @@ public class JsonValueReader { private IFunction parsers; private Map nulls = Collections.emptyMap(); - private final class ExpectedTypeDispatcher implements ITypeVisitor { private final JsonReader in; private final OriginTrackingReader tracker; @@ -485,10 +482,7 @@ private int getOffset() { try { assert posHandler != null; var internalPos = (int) posHandler.get(in); - // System.err.println(" pos: " + internalPos); - var o = tracker.getOffsetAtBufferPos(internalPos); - // System.err.println("offset: " + o); - return o; + return tracker.getOffsetAtBufferPos(internalPos); } catch (IllegalArgumentException | SecurityException e) { // we stop trying to track positions if it fails so hard, @@ -1316,7 +1310,6 @@ private void initializeBuffers(char[] cbuf) { * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { - // System.err.println("limit: " + pos); return (pos >= limit) ? (offset + codepoints[pos - 1] + 1) : (offset + codepoints[pos]); } From 9d146ddf52cfc8fcc33bbe6c5c4698b6b06e0bd9 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Tue, 24 Feb 2026 10:40:52 +0100 Subject: [PATCH 18/22] cleanup unused handler code --- .../rascalmpl/library/lang/json/internal/JsonValueReader.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index c0c35fb4cc..43f6c1f1c4 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -982,9 +982,7 @@ public JsonValueReader(IValueFactory vf, TypeStore store, IRascalMonitor monitor var lookup = MethodHandles.lookup(); var privateLookup = MethodHandles.privateLookupIn(JsonReader.class, lookup); this.posHandler = privateLookup.findVarHandle(JsonReader.class, "pos", int.class); - this.lineHandler = privateLookup.findVarHandle(JsonReader.class, "lineNumber", int.class); - this.lineStartHandler = privateLookup.findVarHandle(JsonReader.class, "lineStart", int.class); - + if (posHandler == null) { stopTracking = true; } From 0f7162ea894a8c3fec433439ee00ebcdf7e411a1 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Tue, 24 Feb 2026 10:56:42 +0100 Subject: [PATCH 19/22] added rationale in comment to explain use of offset buffers --- .../library/lang/json/internal/JsonValueReader.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 43f6c1f1c4..d000488fa5 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1287,6 +1287,19 @@ private void shiftRemaindersLeft(int off) { } } + /* + * We keep our own buffers of int offsets instead of reference or copy of the cbuf: + * * a quick and dirty reference to cbuf won't do because the GsonReader client uses System.arrayCopy to + * overwrite the buffer before we can get to it. + * * a copy/clone of the buffer could work to have access to the previous version. However, + * we would still need to loop over it and compute the offsets. So that adds copying the entire buffer + * at every `read` to the load. + * * Also the GsonReader does not guarantee that `pos` grows (strictly) monotically, which means we + * sometimes might have to look back and only having the last offset at the parsing frontier is not sufficient. + * * the current solution requires more memory, but it is faster while streaming. + * * the line buffer is strictly not required, but beats the reflective access we need to + * _two_ private variables in GsonReader. + */ private void initializeBuffers(char[] cbuf) { if (codepoints == null) { assert columns == null; From c88eb12d24163465cfad8929cc13ac305e2d9066 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Tue, 24 Feb 2026 13:27:22 +0100 Subject: [PATCH 20/22] better field names, removed need for comments --- .../lang/json/internal/JsonValueReader.java | 134 +++++++----------- 1 file changed, 50 insertions(+), 84 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index d000488fa5..6fa1ce9dcc 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1151,76 +1151,42 @@ public IValue read(Reader in, Type expected) throws IOException { * just enough information, together with internal private fields of JsonReader, to compute Rascal-required * offsets. We get only the character offset in the file, at the start of each streamed buffer contents. * That should be just enough information to recompute the actual offset of every Json element, using the - * current position in the buffer (the private field `pos` of JsonReader). + * current position in the buffer as stored in {@link JsonReader#pos} (private). + * + * See the body of {@link JsonReader#fillBuffer(int minimum)} for the contract that we must satisfy and + * the preconditions we are given at every call to {@link #read(char[], int, int)}. */ public static class OriginTrackingReader extends FilterReader { - // TODO: some of these fields may be derived from one another for speed or space reasons. 
- - // offset is a codepoint counter which always represents the point in the file where JsonReader.pos == 0 - private int offset = 0; - // shift is the amount of high surrogate pairs encountered so far - private int shift = 0; - // columnShift is the amount of high surrogate pairs encountered on the current line - private int columnShift = 0; - // limit is always pointing to the amount of no-junk chars in the underlying buffer below buffer.length - private int limit = 0; - // the codepoints array maps char offsets to the number of codepoints since the start of the buffer - private int[] codepoints = null; - // columns maps char offsets to codepoint column positions - private int[] columns = null; - // lines maps char offsets to codepoint line numbers - // TODO: lines may be superfluous if GsonReader can compute line numbers accurately for Unicode content already - private int[] lines = null; - - // the current column position - private int column = 0; - // the current line - private int line = 1; - // the current amount of high surrogates counted in this buffer - + private int codepointOffset = 0; + private int codepointColumn = 0; + private int codepointLine = 1; + + private int surrogatePairs = 0; + private int surrogatePairsThisLine = 0; + + private int prevBufferLimit = 0; + + private int[] charPosToCodepoints = null; + private int[] charPosToCodepointColumns = null; + private int[] charPosToLines = null; + protected OriginTrackingReader(Reader in) { super(in); } - /* This private method from JsonReader must be mirrored by `read` - private boolean fillBuffer(int minimum) throws IOException { - char[] buffer = this.buffer; - lineStart -= pos; - if (limit != pos) { - limit -= pos; - System.arraycopy(buffer, pos, buffer, 0, limit); - } else { - limit = 0; - } - - pos = 0; - int total; - while ((total = in.read(buffer, limit, buffer.length - limit)) != -1) { - limit += total; - - // if this is the first read, consume an optional byte order mark (BOM) if it exists 
- if (lineNumber == 0 && lineStart == 0 && limit > 0 && buffer[0] == '\ufeff') { - pos++; - lineStart++; - minimum++; - } - - if (limit >= minimum) { - return true; - } - } - return false; - } */ @Override public int read(char[] cbuf, int off, int len) throws IOException { initializeBuffers(cbuf); // `codepoints[limit - 1] - 1` is the offset of the last character read with the previous call to read. // So the new offset starts there. We look back `off` chars because of possible left-overs before the limit. - offset += (limit == 0 ? 0 : codepoints[Math.max(0, limit - off - 1)] + 1) ; - // the accumlated shift is present in the previous offset, so we start from scratch now. - shift = 0; - columnShift = 0; + codepointOffset += (prevBufferLimit == 0 + ? 0 + : charPosToCodepoints[Math.max(0, prevBufferLimit - off - 1)] + 1); + + // the accumlated shift is present in the previous codepointOffset, so we start from scratch now. + surrogatePairs = 0; + surrogatePairsThisLine = 0; // make sure we are only a transparant facade for the real reader. // parameters are mapped one-to-one without mutations. @@ -1231,10 +1197,10 @@ public int read(char[] cbuf, int off, int len) throws IOException { // the next buffer[0] offset will be after this increment. // Note that `fillBuffer.limit == read.limit` - limit = off + charsRead; + prevBufferLimit = off + charsRead; // and then we can fill our administration of surrogate pairs quickly - precomputeSurrogatePairCompensation(cbuf, off, limit); + precomputeSurrogatePairCompensation(cbuf, off, prevBufferLimit); // and return only the number of characters read. return charsRead; @@ -1257,33 +1223,33 @@ private void precomputeSurrogatePairCompensation(char[] cbuf, int off, int limit // NB we assume here that the remainder of the content pos..limit has already been shifted to cbuf[0]; // So codepoints[0..off], columns[0..off] and lines[0..off] have been filled already. 
for (int i = off; i < limit; i++) { - codepoints[i] = i - shift; - columns[i] = column - columnShift; - lines[i] = line; + charPosToCodepoints[i] = i - surrogatePairs; + charPosToCodepointColumns[i] = codepointColumn - surrogatePairsThisLine; + charPosToLines[i] = codepointLine; if (Character.isHighSurrogate(cbuf[i])) { // for every high surrogate we assume a low surrogate will follow, // and we count only one of them for the character offset by increasing `shift` - shift++; - columnShift++; + surrogatePairs++; + surrogatePairsThisLine++; // do not assume the low surrogate is in the current buffer yet (boundary condition) } else if (cbuf[i] == '\n') { - line++; - column = 0; - columnShift = 0; + codepointLine++; + codepointColumn = 0; + surrogatePairsThisLine = 0; } else { - column++; + codepointColumn++; } } } private void shiftRemaindersLeft(int off) { if (off > 0) { - System.arraycopy(codepoints, codepoints.length - off, codepoints, 0, off); - System.arraycopy(columns, columns.length - off, columns, 0, off); - System.arraycopy(lines, lines.length - off, lines, 0, off); + System.arraycopy(charPosToCodepoints, charPosToCodepoints.length - off, charPosToCodepoints, 0, off); + System.arraycopy(charPosToCodepointColumns, charPosToCodepointColumns.length - off, charPosToCodepointColumns, 0, off); + System.arraycopy(charPosToLines, charPosToLines.length - off, charPosToLines, 0, off); } } @@ -1301,19 +1267,19 @@ private void shiftRemaindersLeft(int off) { * _two_ private variables in GsonReader. 
*/ private void initializeBuffers(char[] cbuf) { - if (codepoints == null) { - assert columns == null; - assert lines == null; + if (charPosToCodepoints == null) { + assert charPosToCodepointColumns == null; + assert charPosToLines == null; - codepoints = new int[cbuf.length]; - columns = new int[cbuf.length]; - lines = new int[cbuf.length]; + charPosToCodepoints = new int[cbuf.length]; + charPosToCodepointColumns = new int[cbuf.length]; + charPosToLines = new int[cbuf.length]; } // nothing else changed in the mean time, especially not the length of cbuf. - assert codepoints.length == cbuf.length; - assert columns.length == cbuf.length; - assert lines.length == cbuf.length; + assert charPosToCodepoints.length == cbuf.length; + assert charPosToCodepointColumns.length == cbuf.length; + assert charPosToLines.length == cbuf.length; } /** @@ -1321,7 +1287,7 @@ private void initializeBuffers(char[] cbuf) { * for the character at char position `pos` in the last buffered content. */ public int getOffsetAtBufferPos(int pos) { - return (pos >= limit) ? (offset + codepoints[pos - 1] + 1) : (offset + codepoints[pos]); + return (pos >= prevBufferLimit) ? (codepointOffset + charPosToCodepoints[pos - 1] + 1) : (codepointOffset + charPosToCodepoints[pos]); } /** @@ -1329,7 +1295,7 @@ public int getOffsetAtBufferPos(int pos) { * for the character at char position `pos` in the last buffered content. */ public int getColumnAtBufferPos(int pos) { - return (pos >= limit) ? column : columns[pos]; + return (pos >= prevBufferLimit) ? codepointColumn : charPosToCodepointColumns[pos]; } /** @@ -1337,7 +1303,7 @@ public int getColumnAtBufferPos(int pos) { * for the character at char position `pos` in the last buffered content. */ public int getLineAtBufferPos(int pos) { - return (pos >= limit) ? line : lines[pos]; + return (pos >= prevBufferLimit) ? codepointLine : charPosToLines[pos]; } } } From 7c2170c2094c5f683c32fbfc7e3873eb95e1f74d Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Tue, 24 Feb 2026 13:33:54 +0100 Subject: [PATCH 21/22] comments --- .../lang/json/internal/JsonValueReader.java | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 6fa1ce9dcc..a24f84c238 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1178,13 +1178,14 @@ protected OriginTrackingReader(Reader in) { public int read(char[] cbuf, int off, int len) throws IOException { initializeBuffers(cbuf); - // `codepoints[limit - 1] - 1` is the offset of the last character read with the previous call to read. - // So the new offset starts there. We look back `off` chars because of possible left-overs before the limit. + // `charPosToCodepoints[prevBufferLimit - 1] - 1` is the offset of the last character read with the previous call to read. + // So the new codepointOffset starts there. We look back `off` chars because of possible left-overs before the limit. codepointOffset += (prevBufferLimit == 0 ? 0 : charPosToCodepoints[Math.max(0, prevBufferLimit - off - 1)] + 1); - // the accumlated shift is present in the previous codepointOffset, so we start from scratch now. + // The accumulated surrogatePairs is included in the codepointOffset and codepointColumn counters, + // so we start from scratch again. surrogatePairs = 0; surrogatePairsThisLine = 0; @@ -1192,15 +1193,16 @@ public int read(char[] cbuf, int off, int len) throws IOException { // parameters are mapped one-to-one without mutations. var charsRead = in.read(cbuf, off, len); - // now we simulate exactly what JsonReader does to `cbuf` on our administration of surrogate pairs: + // Now we simulate exactly what JsonReader does to `cbuf` on our administration of surrogate pairs. 
+ // It DOES happen that {@see GsonValueReader} asks for `charPosToCodepoints[0]`. shiftRemaindersLeft(off); - // the next buffer[0] offset will be after this increment. - // Note that `fillBuffer.limit == read.limit` + // The next buffer[0] offset will be right after this increment. + // Note that `GsonReader::fillBuffer.limit == this.prevBufferLimit` prevBufferLimit = off + charsRead; // and then we can fill our administration of surrogate pairs quickly - precomputeSurrogatePairCompensation(cbuf, off, prevBufferLimit); + precomputeSurrogatePairCounts(cbuf, off, prevBufferLimit); // and return only the number of characters read. return charsRead; @@ -1215,11 +1217,11 @@ public int read(char[] cbuf, int off, int len) throws IOException { * Later when the JSONValueReader needs to know "current positions", this OriginTrackerReader * will have the answers stored in its buffers. * - * @param cbuf - * @param off - * @param charsRead + * @param cbuf buffer to detect surrogate pairs in + * @param off where we left off the last time + * @param limit until which index the buffer is filled */ - private void precomputeSurrogatePairCompensation(char[] cbuf, int off, int limit) { + private void precomputeSurrogatePairCounts(char[] cbuf, int off, int limit) { // NB we assume here that the remainder of the content pos..limit has already been shifted to cbuf[0]; // So codepoints[0..off], columns[0..off] and lines[0..off] have been filled already. 
for (int i = off; i < limit; i++) { @@ -1228,11 +1230,11 @@ private void precomputeSurrogatePairCompensation(char[] cbuf, int off, int limit charPosToLines[i] = codepointLine; if (Character.isHighSurrogate(cbuf[i])) { - // for every high surrogate we assume a low surrogate will follow, + // For every high surrogate we assume a low surrogate will follow, // and we count only one of them for the character offset by increasing `shift` surrogatePairs++; surrogatePairsThisLine++; - // do not assume the low surrogate is in the current buffer yet (boundary condition) + // Do not assume the low surrogate is in the current buffer yet (boundary condition) } else if (cbuf[i] == '\n') { codepointLine++; From cebf2f979a83652105b0102b19d3530fe697105d Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Tue, 24 Feb 2026 13:34:35 +0100 Subject: [PATCH 22/22] comments --- .../rascalmpl/library/lang/json/internal/JsonValueReader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index a24f84c238..82352cb1c8 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -1256,8 +1256,8 @@ private void shiftRemaindersLeft(int off) { } /* - * We keep our own buffers of int offsets instead of reference or copy of the cbuf: - * * a quick and dirty reference to cbuf won't do because the GsonReader client uses System.arrayCopy to + * We keep our own buffers of int offsets instead of a reference or a copy of the cbuf: + * * a quick and dirty reference to cbuf won't do because the {@see GsonReader} client uses System.arrayCopy to * overwrite the buffer before we can get to it. * * a copy/clone of the buffer could work to have access to the previous version. However, * we would still need to loop over it and compute the offsets. 
So that adds copying the entire buffer