Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
3aa527d
buffer offset is now compensated for surrogate pairs
jurgenvinju Feb 18, 2026
505e233
Merge branch 'main' into fix/json-unicode-offsets
jurgenvinju Feb 18, 2026
977759a
added test with unicode surrogate pairs
jurgenvinju Feb 19, 2026
5432aea
initial throw at unicode resilient positions during JSON parsing
jurgenvinju Feb 19, 2026
fd57b97
minor improvements
jurgenvinju Feb 19, 2026
6c64095
minor comment
jurgenvinju Feb 19, 2026
f1b25c8
working to get unicode columns right
jurgenvinju Feb 20, 2026
39a0ef1
minor fix
jurgenvinju Feb 20, 2026
d21bc29
fixed line markup in unicode example for testing
jurgenvinju Feb 20, 2026
e1be9a1
gettin the off-by-ones under control
jurgenvinju Feb 22, 2026
d842105
cleanup, refactoring and documentation, plus corrections
jurgenvinju Feb 23, 2026
31f1a8a
cleanup
jurgenvinju Feb 23, 2026
09bbbb3
working on another bug
jurgenvinju Feb 23, 2026
eb402e3
fixed boundary condition for getOffset
jurgenvinju Feb 23, 2026
df0a5c4
fixed all tests
jurgenvinju Feb 24, 2026
38e625a
added new failing tests for boundary conditions with unicode origins
jurgenvinju Feb 24, 2026
b0ce362
fixed specific unicode offset issues
jurgenvinju Feb 24, 2026
0c29e5a
cleanup debug code
jurgenvinju Feb 24, 2026
9d146dd
cleanup unused handler code
jurgenvinju Feb 24, 2026
0f7162e
added rationale in comment to explain use of offset buffers
jurgenvinju Feb 24, 2026
c88eb12
better field names, removed need for comments
jurgenvinju Feb 24, 2026
7c2170c
comments
jurgenvinju Feb 24, 2026
cebf2f9
comments
jurgenvinju Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
227 changes: 158 additions & 69 deletions src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ public class JsonValueReader {
private final IRascalMonitor monitor;
private final ISourceLocation src;
private VarHandle posHandler;
private VarHandle lineHandler;
private VarHandle lineStartHandler;

/* options */
private ThreadLocal<SimpleDateFormat> format;
Expand All @@ -96,7 +94,6 @@ public class JsonValueReader {
private IFunction parsers;
private Map<Type, IValue> nulls = Collections.emptyMap();


private final class ExpectedTypeDispatcher implements ITypeVisitor<IValue, IOException> {
private final JsonReader in;
private final OriginTrackingReader tracker;
Expand Down Expand Up @@ -485,7 +482,7 @@ private int getOffset() {
try {
assert posHandler != null;
var internalPos = (int) posHandler.get(in);
return tracker.getOffsetAtBufferStart() + internalPos;
return tracker.getOffsetAtBufferPos(internalPos);
}
catch (IllegalArgumentException | SecurityException e) {
// we stop trying to track positions if it fails so hard,
Expand All @@ -497,16 +494,17 @@ private int getOffset() {

private int getLine() {
if (stopTracking) {
return 0;
return 1;
}

try {
return (int) lineHandler.get(in) + 1;
var internalPos = (int) posHandler.get(in);
return tracker.getLineAtBufferPos(internalPos);
}
catch (IllegalArgumentException | SecurityException e) {
// stop trying to recover the positions
stopTracking = true;
return 0;
return 1;
}
}

Expand All @@ -523,7 +521,10 @@ private int getCol() {
}

try {
return ((int) posHandler.get(in)) - ((int) lineStartHandler.get(in));
assert posHandler != null;
var internalPos = (int) posHandler.get(in);

return tracker.getColumnAtBufferPos(internalPos);
}
catch (IllegalArgumentException | SecurityException e) {
// stop trying to recover the positions
Expand Down Expand Up @@ -614,12 +615,13 @@ private IValue visitStringAsAbstractData(Type type) throws IOException {
private IValue visitObjectAsAbstractData(Type type) throws IOException {
Set<Type> alternatives = null;

int startPos = getOffset() - 1;
int startPos = Math.max(getOffset() - 1 /* pos cursor is at { */, 0);
int startLine = getLine();
int startCol = getCol() - 1;

in.beginObject();



// use explicit information in the JSON to select and filter constructors from the TypeStore
// we expect always to have the field _constructor before _type.
if (explicitConstructorNames || explicitDataTypes) {
Expand Down Expand Up @@ -734,14 +736,14 @@ else if (!explicitDataTypes && "_type".equals(label)) {
}
}

int endPos = getOffset() - 1;
assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins;

int endPos = Math.max(getOffset() - 1, 0);
assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins;
int endLine = getLine();
int endCol = getCol() - 1;

in.endObject();

for (int i = 0; i < args.length; i++) {
if (args[i] == null) {
throw parseErrorHere(
Expand Down Expand Up @@ -802,12 +804,12 @@ public IValue visitNode(Type type) throws IOException {
return inferNullValue(nulls, type);
}

int startPos = getOffset() - 1;
int startPos = Math.max(getOffset() - 1, 0);
int startLine = getLine();
int startCol = getCol() - 1;

in.beginObject();

Map<String, IValue> kws = new HashMap<>();
Map<String, IValue> args = new HashMap<>();

Expand Down Expand Up @@ -839,7 +841,7 @@ public IValue visitNode(Type type) throws IOException {
}
}

int endPos = getOffset() - 1;
int endPos = Math.max(getOffset() - 1, 0);
int endLine = getLine();
int endCol = getCol() - 1;

Expand Down Expand Up @@ -909,7 +911,7 @@ public IValue visitList(Type type) throws IOException {
}

IListWriter w = vf.listWriter();
getOffset();

in.beginArray();
while (in.hasNext()) {
// here we pass label from the higher context
Expand All @@ -922,7 +924,7 @@ public IValue visitList(Type type) throws IOException {
}

in.endArray();
getOffset();

return w.done();
}

Expand Down Expand Up @@ -980,9 +982,7 @@ public JsonValueReader(IValueFactory vf, TypeStore store, IRascalMonitor monitor
var lookup = MethodHandles.lookup();
var privateLookup = MethodHandles.privateLookupIn(JsonReader.class, lookup);
this.posHandler = privateLookup.findVarHandle(JsonReader.class, "pos", int.class);
this.lineHandler = privateLookup.findVarHandle(JsonReader.class, "lineNumber", int.class);
this.lineStartHandler = privateLookup.findVarHandle(JsonReader.class, "lineStart", int.class);


if (posHandler == null) {
stopTracking = true;
}
Expand Down Expand Up @@ -1151,72 +1151,161 @@ public IValue read(Reader in, Type expected) throws IOException {
* just enough information, together with internal private fields of JsonReader, to compute Rascal-required
* offsets. We get only the character offset in the file, at the start of each streamed buffer contents.
* That should be just enough information to recompute the actual offset of every Json element, using the
* current position in the buffer (the private field `pos` of JsonReader).
* current position in the buffer as stored in {@link JsonReader#pos} (private).
*
* See the body of {@link JsonReader#fillBuffer(int minimum)} for the contract that we must satisfy and
* the preconditions we are given at every call to {@link #read(char[], int, int)}.
*/
public static class OriginTrackingReader extends FilterReader {
// offset is always pointing at the point in the file where JsonReader.pos == 0
private int offset = 0;
// limit is always pointing to the amount of no-junk characters in the underlying buffer below buffer.length
private int limit = 0;

private int codepointOffset = 0;
private int codepointColumn = 0;
private int codepointLine = 1;

private int surrogatePairs = 0;
private int surrogatePairsThisLine = 0;

private int prevBufferLimit = 0;

private int[] charPosToCodepoints = null;
private int[] charPosToCodepointColumns = null;
private int[] charPosToLines = null;

protected OriginTrackingReader(Reader in) {
super(in);
}

/* This private method from JsonReader must be mirrored by `read`
private boolean fillBuffer(int minimum) throws IOException {
char[] buffer = this.buffer;
lineStart -= pos;
if (limit != pos) {
limit -= pos;
System.arraycopy(buffer, pos, buffer, 0, limit);
} else {
limit = 0;
}

pos = 0;
int total;
while ((total = in.read(buffer, limit, buffer.length - limit)) != -1) {
limit += total;

// if this is the first read, consume an optional byte order mark (BOM) if it exists
if (lineNumber == 0 && lineStart == 0 && limit > 0 && buffer[0] == '\ufeff') {
pos++;
lineStart++;
minimum++;
}

if (limit >= minimum) {
return true;
}
}
return false;
} */
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
// Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`.
// Moreover, `fillBuffer.limit == reader.off` at the start of this method.
initializeBuffers(cbuf);

// we know take the previous limit and add it to the
// offset, to arrive at the new `pos=0` of `buffer[0]`,
// rewinding `off` characters which were reused from the previous buffer
// with System.arraycopy.
offset += limit - off;
// `codepoints[prevBufferLimit - 1] - 1` is the offset of the last character read with the previous call to read.
// So the new codepointOffset starts there. We look back `off` chars because of possible left-overs before the limit.
codepointOffset += (prevBufferLimit == 0
? 0
: charPosToCodepoints[Math.max(0, prevBufferLimit - off - 1)] + 1);

// make sure we are only a facade for the real reader.
// The accumlated surrogatePairs is included in the codepointOffset and codepointColumn counters,
// so we start from scratch again.
surrogatePairs = 0;
surrogatePairsThisLine = 0;

// make sure we are only a transparant facade for the real reader.
// parameters are mapped one-to-one without mutations.
var charsRead = in.read(cbuf, off, len);

// the next buffer[0] offset will be after this increment.
// Note that `fillBuffer.limit == read.limit`
limit = off + charsRead;
// Now we simulate exactly what JsonReader does to `cbuf` on our administration of surrogate pairs.
// It DOES happen that {@see GsonValueReader} asks for `charPosToCodepoints[0]`.
shiftRemaindersLeft(off);

// The next buffer[0] offset will be right after this increment.
// Note that `GsonReader::fillBuffer.limit == this.prevBufferLimit`
prevBufferLimit = off + charsRead;

// and then we can fill our administration of surrogate pairs quickly
precomputeSurrogatePairCounts(cbuf, off, prevBufferLimit);

// and return only the number of characters read.
return charsRead;
}

public int getOffsetAtBufferStart() {
return offset;
/**
* The cbuf char buffer may contain "surrogate pairs", where two int16 chars represent
* one unicode codepoint. We want to count in codepoints so we store here for every
* character in cbuf what it's codepoint offset is, what its codepoint column is
* and what its codepoint line is.
*
* Later when the JSONValueReader needs to know "current positions", this OriginTrackerReader
* will have the answers stored in its buffers.
*
* @param cbuf buffer to detect surrogate pairs in
* @param off where we left off the last time
* @param limit until which index the buffer is filled
*/
private void precomputeSurrogatePairCounts(char[] cbuf, int off, int limit) {
// NB we assume here that the remainder of the content pos..limit has already been shifted to cbuf[0];
// So codepoints[0..off], columns[0..off] and lines[0..off] have been filled already.
for (int i = off; i < limit; i++) {
charPosToCodepoints[i] = i - surrogatePairs;
charPosToCodepointColumns[i] = codepointColumn - surrogatePairsThisLine;
charPosToLines[i] = codepointLine;

if (Character.isHighSurrogate(cbuf[i])) {
// For every high surrogate we assume a low surrogate will follow,
// and we count only one of them for the character offset by increasing `shift`
surrogatePairs++;
surrogatePairsThisLine++;
// Do not assume the low surrogate is in the current buffer yet (boundary condition)
}
else if (cbuf[i] == '\n') {
codepointLine++;
codepointColumn = 0;
surrogatePairsThisLine = 0;
}
else {
codepointColumn++;
}
}
}

private void shiftRemaindersLeft(int off) {
if (off > 0) {
System.arraycopy(charPosToCodepoints, charPosToCodepoints.length - off, charPosToCodepoints, 0, off);
System.arraycopy(charPosToCodepointColumns, charPosToCodepointColumns.length - off, charPosToCodepointColumns, 0, off);
System.arraycopy(charPosToLines, charPosToLines.length - off, charPosToLines, 0, off);
}
}

/*
* We keep our own buffers of int offsets instead of a reference or a copy of the cbuf:
* * a quick and dirty reference to cbuf won't do because the {@see GsonReader} client uses System.arrayCopy to
* overwrite the buffer before we can get to it.
* * a copy/clone of the buffer could work to have access to the previous version. However,
* we would still need to loop over it and compute the offsets. So that adds copying the entire buffer
* at every `read` to the load.
* * Also the GsonReader does not guarantee that `pos` grows (strictly) monotically, which means we
* sometimes might have to look back and only having the last offset at the parsing frontier is not sufficient.
* * the current solution requires more memory, but it is faster while streaming.
* * the line buffer is strictly not required, but beats the reflective access we need to
* _two_ private variables in GsonReader.
*/
private void initializeBuffers(char[] cbuf) {
if (charPosToCodepoints == null) {
assert charPosToCodepointColumns == null;
assert charPosToLines == null;

charPosToCodepoints = new int[cbuf.length];
charPosToCodepointColumns = new int[cbuf.length];
charPosToLines = new int[cbuf.length];
}

// nothing else changed in the mean time, especially not the length of cbuf.
assert charPosToCodepoints.length == cbuf.length;
assert charPosToCodepointColumns.length == cbuf.length;
assert charPosToLines.length == cbuf.length;
}

/**
* @return the codepoint offset (from the start of the streaming content)
* for the character at char position `pos` in the last buffered content.
*/
public int getOffsetAtBufferPos(int pos) {
return (pos >= prevBufferLimit) ? (codepointOffset + charPosToCodepoints[pos - 1] + 1) : (codepointOffset + charPosToCodepoints[pos]);
}

/**
* @return the codepoint column (from the start of the current line)
* for the character at char position `pos` in the last buffered content.
*/
public int getColumnAtBufferPos(int pos) {
return (pos >= prevBufferLimit) ? codepointColumn : charPosToCodepointColumns[pos];
}

/**
* @return the codepoint line (from the start of the entire content)
* for the character at char position `pos` in the last buffered content.
*/
public int getLineAtBufferPos(int pos) {
return (pos >= prevBufferLimit) ? codepointLine : charPosToLines[pos];
}
}
}
Loading
Loading