diff --git a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java index 02e2af3857..82352cb1c8 100644 --- a/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java +++ b/src/org/rascalmpl/library/lang/json/internal/JsonValueReader.java @@ -83,8 +83,6 @@ public class JsonValueReader { private final IRascalMonitor monitor; private final ISourceLocation src; private VarHandle posHandler; - private VarHandle lineHandler; - private VarHandle lineStartHandler; /* options */ private ThreadLocal format; @@ -96,7 +94,6 @@ public class JsonValueReader { private IFunction parsers; private Map nulls = Collections.emptyMap(); - private final class ExpectedTypeDispatcher implements ITypeVisitor { private final JsonReader in; private final OriginTrackingReader tracker; @@ -485,7 +482,7 @@ private int getOffset() { try { assert posHandler != null; var internalPos = (int) posHandler.get(in); - return tracker.getOffsetAtBufferStart() + internalPos; + return tracker.getOffsetAtBufferPos(internalPos); } catch (IllegalArgumentException | SecurityException e) { // we stop trying to track positions if it fails so hard, @@ -497,16 +494,17 @@ private int getOffset() { private int getLine() { if (stopTracking) { - return 0; + return 1; } try { - return (int) lineHandler.get(in) + 1; + var internalPos = (int) posHandler.get(in); + return tracker.getLineAtBufferPos(internalPos); } catch (IllegalArgumentException | SecurityException e) { // stop trying to recover the positions stopTracking = true; - return 0; + return 1; } } @@ -523,7 +521,10 @@ private int getCol() { } try { - return ((int) posHandler.get(in)) - ((int) lineStartHandler.get(in)); + assert posHandler != null; + var internalPos = (int) posHandler.get(in); + + return tracker.getColumnAtBufferPos(internalPos); } catch (IllegalArgumentException | SecurityException e) { // stop trying to recover the positions @@ -614,12 +615,13 @@ private 
IValue visitStringAsAbstractData(Type type) throws IOException { private IValue visitObjectAsAbstractData(Type type) throws IOException { Set alternatives = null; - int startPos = getOffset() - 1; + int startPos = Math.max(getOffset() - 1 /* pos cursor is at { */, 0); int startLine = getLine(); int startCol = getCol() - 1; in.beginObject(); - + + // use explicit information in the JSON to select and filter constructors from the TypeStore // we expect always to have the field _constructor before _type. if (explicitConstructorNames || explicitDataTypes) { @@ -734,14 +736,14 @@ else if (!explicitDataTypes && "_type".equals(label)) { } } - int endPos = getOffset() - 1; - assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins; + int endPos = Math.max(getOffset() - 1, 0); + assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins; int endLine = getLine(); int endCol = getCol() - 1; in.endObject(); - + for (int i = 0; i < args.length; i++) { if (args[i] == null) { throw parseErrorHere( @@ -802,12 +804,12 @@ public IValue visitNode(Type type) throws IOException { return inferNullValue(nulls, type); } - int startPos = getOffset() - 1; + int startPos = Math.max(getOffset() - 1, 0); int startLine = getLine(); int startCol = getCol() - 1; in.beginObject(); - + Map kws = new HashMap<>(); Map args = new HashMap<>(); @@ -839,7 +841,7 @@ public IValue visitNode(Type type) throws IOException { } } - int endPos = getOffset() - 1; + int endPos = Math.max(getOffset() - 1, 0); int endLine = getLine(); int endCol = getCol() - 1; @@ -909,7 +911,7 @@ public IValue visitList(Type type) throws IOException { } IListWriter w = vf.listWriter(); - getOffset(); + in.beginArray(); while (in.hasNext()) { // here we pass label from the higher context @@ -922,7 +924,7 @@ public IValue visitList(Type type) throws IOException { } in.endArray(); - 
getOffset(); + return w.done(); } @@ -980,9 +982,7 @@ public JsonValueReader(IValueFactory vf, TypeStore store, IRascalMonitor monitor var lookup = MethodHandles.lookup(); var privateLookup = MethodHandles.privateLookupIn(JsonReader.class, lookup); this.posHandler = privateLookup.findVarHandle(JsonReader.class, "pos", int.class); - this.lineHandler = privateLookup.findVarHandle(JsonReader.class, "lineNumber", int.class); - this.lineStartHandler = privateLookup.findVarHandle(JsonReader.class, "lineStart", int.class); - + if (posHandler == null) { stopTracking = true; } @@ -1151,72 +1151,161 @@ public IValue read(Reader in, Type expected) throws IOException { * just enough information, together with internal private fields of JsonReader, to compute Rascal-required * offsets. We get only the character offset in the file, at the start of each streamed buffer contents. * That should be just enough information to recompute the actual offset of every Json element, using the - * current position in the buffer (the private field `pos` of JsonReader). + * current position in the buffer as stored in {@link JsonReader#pos} (private). + * + * See the body of {@link JsonReader#fillBuffer(int minimum)} for the contract that we must satisfy and + * the preconditions we are given at every call to {@link #read(char[], int, int)}. 
*/ public static class OriginTrackingReader extends FilterReader { - // offset is always pointing at the point in the file where JsonReader.pos == 0 - private int offset = 0; - // limit is always pointing to the amount of no-junk characters in the underlying buffer below buffer.length - private int limit = 0; - + private int codepointOffset = 0; + private int codepointColumn = 0; + private int codepointLine = 1; + + private int surrogatePairs = 0; + private int surrogatePairsThisLine = 0; + + private int prevBufferLimit = 0; + + private int[] charPosToCodepoints = null; + private int[] charPosToCodepointColumns = null; + private int[] charPosToLines = null; + protected OriginTrackingReader(Reader in) { super(in); } - /* This private method from JsonReader must be mirrored by `read` - private boolean fillBuffer(int minimum) throws IOException { - char[] buffer = this.buffer; - lineStart -= pos; - if (limit != pos) { - limit -= pos; - System.arraycopy(buffer, pos, buffer, 0, limit); - } else { - limit = 0; - } - - pos = 0; - int total; - while ((total = in.read(buffer, limit, buffer.length - limit)) != -1) { - limit += total; - - // if this is the first read, consume an optional byte order mark (BOM) if it exists - if (lineNumber == 0 && lineStart == 0 && limit > 0 && buffer[0] == '\ufeff') { - pos++; - lineStart++; - minimum++; - } - - if (limit >= minimum) { - return true; - } - } - return false; - } */ @Override public int read(char[] cbuf, int off, int len) throws IOException { - // Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`. - // Moreover, `fillBuffer.limit == reader.off` at the start of this method. + initializeBuffers(cbuf); - // we know take the previous limit and add it to the - // offset, to arrive at the new `pos=0` of `buffer[0]`, - // rewinding `off` characters which were reused from the previous buffer - // with System.arraycopy. 
- offset += limit - off; + // `codepoints[prevBufferLimit - 1] - 1` is the offset of the last character read with the previous call to read. + // So the new codepointOffset starts there. We look back `off` chars because of possible left-overs before the limit. + codepointOffset += (prevBufferLimit == 0 + ? 0 + : charPosToCodepoints[Math.max(0, prevBufferLimit - off - 1)] + 1); - // make sure we are only a facade for the real reader. + // The accumulated surrogatePairs is included in the codepointOffset and codepointColumn counters, + // so we start from scratch again. + surrogatePairs = 0; + surrogatePairsThisLine = 0; + + // make sure we are only a transparent facade for the real reader. // parameters are mapped one-to-one without mutations. var charsRead = in.read(cbuf, off, len); - // the next buffer[0] offset will be after this increment. - // Note that `fillBuffer.limit == read.limit` - limit = off + charsRead; + // Now we simulate exactly what JsonReader does to `cbuf` on our administration of surrogate pairs. + // It DOES happen that {@see GsonValueReader} asks for `charPosToCodepoints[0]`. + shiftRemaindersLeft(off); + + // The next buffer[0] offset will be right after this increment. + // Note that `GsonReader::fillBuffer.limit == this.prevBufferLimit` + prevBufferLimit = off + charsRead; + + // and then we can fill our administration of surrogate pairs quickly + precomputeSurrogatePairCounts(cbuf, off, prevBufferLimit); // and return only the number of characters read. return charsRead; } - public int getOffsetAtBufferStart() { - return offset; + /** + * The cbuf char buffer may contain "surrogate pairs", where two int16 chars represent + * one unicode codepoint. We want to count in codepoints so we store here for every + * character in cbuf what its codepoint offset is, what its codepoint column is + * and what its codepoint line is. 
+ * + * Later when the JsonValueReader needs to know "current positions", this OriginTrackingReader + * will have the answers stored in its buffers. + * + * @param cbuf buffer to detect surrogate pairs in + * @param off where we left off the last time + * @param limit until which index the buffer is filled + */ + private void precomputeSurrogatePairCounts(char[] cbuf, int off, int limit) { + // NB we assume here that the remainder of the content pos..limit has already been shifted to cbuf[0]; + // So codepoints[0..off], columns[0..off] and lines[0..off] have been filled already. + for (int i = off; i < limit; i++) { + charPosToCodepoints[i] = i - surrogatePairs; + charPosToCodepointColumns[i] = codepointColumn - surrogatePairsThisLine; + charPosToLines[i] = codepointLine; + + if (Character.isHighSurrogate(cbuf[i])) { + // For every high surrogate we assume a low surrogate will follow, + // and we count only one of them for the character offset by increasing `shift` + surrogatePairs++; + surrogatePairsThisLine++; + // Do not assume the low surrogate is in the current buffer yet (boundary condition) + } + else if (cbuf[i] == '\n') { + codepointLine++; + codepointColumn = 0; + surrogatePairsThisLine = 0; + } + else { + codepointColumn++; + } + } + } + + private void shiftRemaindersLeft(int off) { + if (off > 0) { + System.arraycopy(charPosToCodepoints, charPosToCodepoints.length - off, charPosToCodepoints, 0, off); + System.arraycopy(charPosToCodepointColumns, charPosToCodepointColumns.length - off, charPosToCodepointColumns, 0, off); + System.arraycopy(charPosToLines, charPosToLines.length - off, charPosToLines, 0, off); + } } + + /* + * We keep our own buffers of int offsets instead of a reference or a copy of the cbuf: + * * a quick and dirty reference to cbuf won't do because the {@see GsonReader} client uses System.arrayCopy to + * overwrite the buffer before we can get to it. + * * a copy/clone of the buffer could work to have access to the previous version. 
However, + * we would still need to loop over it and compute the offsets. So that adds copying the entire buffer + * at every `read` to the load. + * * Also the GsonReader does not guarantee that `pos` grows (strictly) monotonically, which means we + * sometimes might have to look back and only having the last offset at the parsing frontier is not sufficient. + * * the current solution requires more memory, but it is faster while streaming. + * * the line buffer is strictly not required, but beats the reflective access we need to + * _two_ private variables in GsonReader. + */ + private void initializeBuffers(char[] cbuf) { + if (charPosToCodepoints == null) { + assert charPosToCodepointColumns == null; + assert charPosToLines == null; + + charPosToCodepoints = new int[cbuf.length]; + charPosToCodepointColumns = new int[cbuf.length]; + charPosToLines = new int[cbuf.length]; + } + + // nothing else changed in the meantime, especially not the length of cbuf. + assert charPosToCodepoints.length == cbuf.length; + assert charPosToCodepointColumns.length == cbuf.length; + assert charPosToLines.length == cbuf.length; + } + + /** + * @return the codepoint offset (from the start of the streaming content) + * for the character at char position `pos` in the last buffered content. + */ + public int getOffsetAtBufferPos(int pos) { + return (pos >= prevBufferLimit) ? (codepointOffset + charPosToCodepoints[pos - 1] + 1) : (codepointOffset + charPosToCodepoints[pos]); + } + + /** + * @return the codepoint column (from the start of the current line) + * for the character at char position `pos` in the last buffered content. + */ + public int getColumnAtBufferPos(int pos) { + return (pos >= prevBufferLimit) ? codepointColumn : charPosToCodepointColumns[pos]; + } + + /** + * @return the codepoint line (from the start of the entire content) + * for the character at char position `pos` in the last buffered content. 
+ */ + public int getLineAtBufferPos(int pos) { + return (pos >= prevBufferLimit) ? codepointLine : charPosToLines[pos]; + } } } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc index 65602a9e73..06bd9ea9ac 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/JSONIOTests.rsc @@ -98,11 +98,11 @@ bool originTest(loc example) { poss = [ | /node x := ex2, x.line?]; // every node has a .src field, otherwise this fails with an exception for ( <- poss) { - assert content[p.offset] == "{"; // all nodes start with a { - assert content[p.offset + p.length - 1] == "}"; // all nodes end with a } - assert p.begin.line == line; - assert lines[p.begin.line - 1][p.begin.column] == "{"; - assert lines[p.end.line - 1][p.end.column - 1] == "}"; + assert content[p.offset] == "{" : "

"; // all nodes start with a { + assert content[p.offset + p.length - 1] == "}" : "

"; // all nodes end with a } + assert p.begin.line == line : "

"; + assert lines[p.begin.line - 1][p.begin.column] == "{" : "

"; + assert lines[p.end.line - 1][p.end.column - 1] == "}" : "

"; } return true; @@ -357,4 +357,38 @@ bool jsonVerifyOriginCorrectAcrossBufferBoundaries(int sSize) { } return true; -} \ No newline at end of file +} + +test bool jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries() { + /* twice just before and after the 1024 buffer size of JsonReader */ + for (int sSize <- [1000..1025] + [2000..2050]) { + // we try different shapes across the boundary, where also the high surrogate and + // the low surrogate will end up on either side of the buffer limit + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, false, false); + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, true, false); + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, false, true); + jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(sSize, true, true); + } + return true; +} + +bool jsonUnicodeVerifyOriginCorrectAcrossBufferBoundaries(int sSize, bool offbyoneChar, bool switchBackAndForth) { + ref = v1(x=123456789); + refExpected = asJSON(ref); + + t1 = [v1(s="a<}>🍕a<}><}>"), ref]; + + writeJSON(|memory:///test.json|, t1); + + //s this throws exceptions and asserts if there are bugs with the + // origin tracker. 
In particular it triggers #2633 + v = readJSON(#list[X],|memory:///test.json|, trackOrigins=true); + + // checking the last element + if (refExpected != readFile(v[1].src)) { + println("Failed for : != , offbyoneChar: , switchBackAndForth: "); + return false; + } + + return true; +} diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json new file mode 100644 index 0000000000..ff8557d334 --- /dev/null +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/lang/json/unicode.json @@ -0,0 +1,29 @@ +{ + /* 🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕 */ + "glossary": { + "title": "example glossary", + "line" : 3, + "GlossDiv": { + "line" : 6, + "title": "S", + "GlossList": { + "line" : 9, + /* 🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕🍕 */ + "GlossEntry": { + "line" : 12, + "ID": "S🍕🍕L", + "SortAs": "S🍕ML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "line" : 19, + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": ["GML", "XML"] + }, + "GlossSee": "m🍕🍕🍕🍕up" + } + } + } + } +} \ No newline at end of file