Skip to content
62 changes: 62 additions & 0 deletions sjsonnet/src-js/sjsonnet/CharSWAR.scala
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,66 @@ object CharSWAR {
}
false
}

/** Scalar scan returning position of first escape char, or -1 if none. */
def findFirstEscapeChar(arr: Array[Byte], from: Int, to: Int): Int = {
var i = from
while (i < to) {
val b = arr(i) & 0xff
if (b < 32 || b == '"' || b == '\\') return i
i += 1
}
-1
Comment thread
He-Pin marked this conversation as resolved.
}

/** Scalar scan for char[] returning position of first escape char, or -1 if none. */
def findFirstEscapeCharChar(arr: Array[Char], from: Int, to: Int): Int = {
var i = from
while (i < to) {
val c = arr(i)
if (c < 32 || c == '"' || c == '\\') return i
i += 1
}
-1
}

/**
* Returns true if all characters in the string are ASCII (< 0x80). Scalar fallback for Scala.js.
*/
def isAllAscii(s: String): Boolean = {
var i = 0
val len = s.length
while (i < len) {
if (s.charAt(i) >= 0x80) return false
i += 1
}
true
}

/**
* Compare two strings by Unicode codepoint values. Scalar fallback for Scala.js. Uses
* equal-char-skip fast path with deferred surrogate check.
*/
def compareStrings(s1: String, s2: String): Int = {
if (s1 eq s2) return 0
val n1 = s1.length
val n2 = s2.length
val minLen = math.min(n1, n2)
var i = 0
while (i < minLen) {
val c1 = s1.charAt(i)
val c2 = s2.charAt(i)
if (c1 == c2) {
i += 1
} else if (!Character.isSurrogate(c1) && !Character.isSurrogate(c2)) {
return c1 - c2
} else {
val cp1 = Character.codePointAt(s1, i)
val cp2 = Character.codePointAt(s2, i)
if (cp1 != cp2) return Integer.compare(cp1, cp2)
i += Character.charCount(cp1)
}
}
Integer.compare(n1, n2)
}
}
187 changes: 180 additions & 7 deletions sjsonnet/src-jvm/sjsonnet/CharSWAR.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,15 @@
import java.nio.charset.StandardCharsets;

/**
* SWAR (SIMD Within A Register) escape-char scanner for JSON string rendering.
* SWAR (SIMD Within A Register) utilities for JSON string rendering and string comparison.
*
* <p>Detects characters requiring JSON escaping: control chars ({@code < 32}),
* double-quote ({@code '"'}), and backslash ({@code '\\'}).
* <p>Provides:
* <ul>
* <li>Escape-char scanning: detects/locates chars requiring JSON escaping
* (control chars, double-quote, backslash).</li>
* <li>String comparison: codepoint-correct comparison with array-based inner loop
* that the JIT can auto-vectorize to SIMD instructions.</li>
* </ul>
*
* <p>For strings above a threshold length, converts to ISO-8859-1 bytes and
* processes 8 bytes at a time using {@link VarHandle} bulk reads + Hacker's
Expand All @@ -23,7 +28,7 @@
*
* @see <a href="https://richardstartin.github.io/posts/finding-bytes">Finding Bytes in Arrays</a>
*/
final class CharSWAR {
public final class CharSWAR {
private CharSWAR() {}

// VarHandle for reading longs from byte[] — replaces sun.misc.Unsafe.
Expand Down Expand Up @@ -57,7 +62,7 @@ private CharSWAR() {}
* Check if any char in {@code str} needs JSON string escaping.
* Scan-first API: call on the String before copying to the output buffer.
*/
static boolean hasEscapeChar(String str) {
public static boolean hasEscapeChar(String str) {
int len = str.length();
if (len < SWAR_THRESHOLD) {
return hasEscapeCharScalar(str, len);
Expand All @@ -75,14 +80,14 @@ static boolean hasEscapeChar(String str) {
* UTF-8 multi-byte sequences never produce bytes matching '"', '\\', or &lt; 0x20,
* so this is safe for scanning UTF-8 encoded data.
*/
static boolean hasEscapeChar(byte[] arr, int from, int to) {
public static boolean hasEscapeChar(byte[] arr, int from, int to) {
return hasEscapeCharSWAR(arr, from, to);
}

/**
* Check if any char in {@code arr[from..to)} needs JSON string escaping.
*/
static boolean hasEscapeChar(char[] arr, int from, int to) {
public static boolean hasEscapeChar(char[] arr, int from, int to) {
for (int i = from; i < to; i++) {
char c = arr[i];
if (c < 32 || c == '"' || c == '\\') return true;
Expand Down Expand Up @@ -138,4 +143,172 @@ private static boolean hasEscapeCharScalar(String s, int len) {
}
return false;
}

// =========================================================================
// findFirstEscapeChar — position-returning SWAR scan for chunked rendering
// =========================================================================

/**
* Find the index of the first byte in {@code arr[from..to)} that needs JSON
* string escaping. Returns {@code -1} if no escape char is found.
*
* <p>Uses SWAR to scan 8 bytes per iteration, then pinpoints the exact byte
* within a matched 8-byte word via scalar fallback.
*/
public static int findFirstEscapeChar(byte[] arr, int from, int to) {
int i = from;
int limit = to - 7;
while (i < limit) {
long word = (long) LONG_VIEW.get(arr, i);
if (swarHasMatch(word)) {
// Pinpoint exact byte within the matched 8-byte word
for (int j = i; j < i + 8; j++) {
int b = arr[j] & 0xFF;
if (b < 32 || b == '"' || b == '\\') return j;
}
}
i += 8;
}
// Tail: remaining 0-7 bytes
while (i < to) {
int b = arr[i] & 0xFF;
if (b < 32 || b == '"' || b == '\\') return i;
i++;
}
return -1;
}

/**
* Find the index of the first char in {@code arr[from..to)} that needs JSON
* string escaping. Returns {@code -1} if no escape char is found.
* Scalar scan on char[] — used by char-based chunked rendering.
*/
public static int findFirstEscapeCharChar(char[] arr, int from, int to) {
for (int i = from; i < to; i++) {
char c = arr[i];
if (c < 32 || c == '"' || c == '\\') return i;
}
return -1;
}

// =========================================================================
// isAllAscii — check if all chars are ASCII (< 0x80)
// =========================================================================

/**
* Returns true if all characters in the string are ASCII (&lt; 0x80).
* Uses ISO-8859-1 encoding + SWAR for long strings. For ASCII-only strings,
* codepoint operations can be replaced with direct char indexing.
*/
public static boolean isAllAscii(String s) {
int len = s.length();
for (int i = 0; i < len; i++) {
if (s.charAt(i) >= 0x80) return false;
}
return true;
}

// =========================================================================
// compareStrings — JIT-vectorizable codepoint-correct string comparison
// =========================================================================

/** Reusable char buffers for string comparison (one per thread). */
private static final int CMP_BUF_SIZE = 32768;
private static final ThreadLocal<char[]> CMP_BUF1 =
ThreadLocal.withInitial(() -> new char[CMP_BUF_SIZE]);
private static final ThreadLocal<char[]> CMP_BUF2 =
ThreadLocal.withInitial(() -> new char[CMP_BUF_SIZE]);

/** Below this length, scalar charAt comparison is faster than getChars + array loop. */
private static final int CMP_THRESHOLD = 16;

/**
* Compare two strings by Unicode codepoint values. Equivalent to
* {@code Util.compareStringsByCodepoint} but uses bulk {@code getChars} +
* tight array loop so the JIT can auto-vectorize the comparison to SIMD
* instructions (AVX2/SSE on x86, NEON on ARM).
*
* <p>Surrogate checks are deferred to the mismatch point (O(1) instead of
* O(n)), which is correct because equal chars — even surrogates — can be
* skipped without affecting ordering.
*/
public static int compareStrings(String s1, String s2) {
if (s1 == s2) return 0;
int n1 = s1.length(), n2 = s2.length();
int minLen = Math.min(n1, n2);

// Short strings or strings exceeding buffer: scalar path
if (minLen < CMP_THRESHOLD || n1 > CMP_BUF_SIZE || n2 > CMP_BUF_SIZE) {
return compareStringsScalar(s1, n1, s2, n2);
}

// Bulk-copy to char arrays — eliminates String.charAt() virtual dispatch,
// enabling the JIT to auto-vectorize the comparison loop.
char[] c1 = CMP_BUF1.get();
char[] c2 = CMP_BUF2.get();
s1.getChars(0, n1, c1, 0);
s2.getChars(0, n2, c2, 0);

// Tight comparison loop — the simple c1[i] != c2[i] pattern is what
// the C2 JIT compiler recognizes and vectorizes.
int i = 0;
while (i < minLen) {
if (c1[i] != c2[i]) {
char a = c1[i], b = c2[i];
if (!Character.isSurrogate(a) && !Character.isSurrogate(b)) {
return a - b;
}
// Back up if we landed on a low surrogate that's part of a pair
int pos = i;
if (pos > 0 && Character.isLowSurrogate(a) && Character.isHighSurrogate(c1[pos - 1])) {
pos--;
}
return compareCodepointsFrom(c1, n1, c2, n2, pos);
}
i++;
}
return Integer.compare(n1, n2);
}

/**
* Scalar codepoint comparison for short strings or overflow.
* Uses the equal-char-skip fast path (no surrogate check on matching chars).
*/
private static int compareStringsScalar(String s1, int n1, String s2, int n2) {
int minLen = Math.min(n1, n2);
int i = 0;
while (i < minLen) {
char c1 = s1.charAt(i);
char c2 = s2.charAt(i);
if (c1 == c2) {
i++;
} else if (!Character.isSurrogate(c1) && !Character.isSurrogate(c2)) {
return c1 - c2;
} else {
int cp1 = Character.codePointAt(s1, i);
int cp2 = Character.codePointAt(s2, i);
if (cp1 != cp2) return Integer.compare(cp1, cp2);
i += Character.charCount(cp1);
}
}
return Integer.compare(n1, n2);
}

/**
* Codepoint-level comparison from a given position in char arrays.
* Used as fallback when a mismatch involves surrogate chars.
*/
private static int compareCodepointsFrom(char[] c1, int n1, char[] c2, int n2, int from) {
int i1 = from, i2 = from;
while (i1 < n1 && i2 < n2) {
int cp1 = Character.codePointAt(c1, i1);
int cp2 = Character.codePointAt(c2, i2);
if (cp1 != cp2) return Integer.compare(cp1, cp2);
i1 += Character.charCount(cp1);
i2 += Character.charCount(cp2);
}
if (i1 < n1) return 1;
if (i2 < n2) return -1;
return 0;
}
}
Loading
Loading