From 28401771ae1b691ecc695810ef01e8f1de6f424e Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Mon, 1 Jun 2026 12:47:06 -0400 Subject: [PATCH 1/9] Make the JSON parse loop iterative As opposed to a recursive loop. We do this by keeping a stack of frames (very similar to how the stack of values was already stored). Each frame represents the state of a container. Since there are only 2 in JSON, it doesn't have to get too complex. --- ext/json/ext/parser/parser.c | 460 +++++++++++++++++++++++++---------- 1 file changed, 329 insertions(+), 131 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index d0482f68..0bffe4b5 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -309,6 +309,137 @@ static void rvalue_stack_eagerly_release(VALUE handle) } } +/* frame stack */ + +// Iterative (non-recursive) parsing keeps an explicit stack of the containers +// currently being built, instead of relying on the C call stack. Each frame +// only needs enough bookkeeping to close its container: which kind it is, the +// rvalue_stack position where its children start (so we know how many to pop), +// and the cursor at its opening brace (used to rewind for duplicate key errors). +// Frames hold no VALUEs, so this stack needs no GC marking; it reuses the same +// stack-allocated-with-heap-spill strategy as the rvalue_stack so that it's +// freed even if parsing raises. +// +// The lifecycle helpers below (grow/push/peek/pop/spill/free/eagerly_release and +// the rb_data_type_t) deliberately mirror their rvalue_stack counterparts -- the +// element type and the absence of a mark function are the only real differences. +// Keep the two in sync: a fix to the spill/release or HAVE_RUBY_TYPED_EMBEDDABLE +// handling in one almost certainly belongs in the other. 
+#define JSON_FRAME_STACK_INITIAL_CAPA 32 + +enum json_frame_type { + JSON_FRAME_ARRAY, + JSON_FRAME_OBJECT, +}; + +typedef struct json_frame_struct { + enum json_frame_type type; + long stack_head; // rvalue_stack->head when this container opened + const char *start_cursor; // object frames only (the '{'); NULL for arrays +} json_frame; + +typedef struct json_frame_stack_struct { + enum rvalue_stack_type type; // shared with rvalue_stack: is ptr stack- or heap-allocated + long capa; + long head; + json_frame *ptr; +} json_frame_stack; + +static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref); + +static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *handle, json_frame_stack **stack_ref) +{ + long required = stack->capa * 2; + + if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { + stack = json_frame_stack_spill(stack, handle, stack_ref); + } else { + REALLOC_N(stack->ptr, json_frame, required); + stack->capa = required; + } + return stack; +} + +static void json_frame_stack_push(json_frame_stack *stack, json_frame frame, VALUE *handle, json_frame_stack **stack_ref) +{ + if (RB_UNLIKELY(stack->head >= stack->capa)) { + stack = json_frame_stack_grow(stack, handle, stack_ref); + } + stack->ptr[stack->head] = frame; + stack->head++; +} + +static inline json_frame *json_frame_stack_peek(json_frame_stack *stack) +{ + return &stack->ptr[stack->head - 1]; +} + +static inline void json_frame_stack_pop(json_frame_stack *stack) +{ + stack->head--; +} + +static void json_frame_stack_free_buffer(json_frame_stack *stack) +{ + ruby_xfree(stack->ptr); + stack->ptr = NULL; +} + +static void json_frame_stack_free(void *ptr) +{ + json_frame_stack *stack = (json_frame_stack *)ptr; + if (stack) { + json_frame_stack_free_buffer(stack); +#ifndef HAVE_RUBY_TYPED_EMBEDDABLE + ruby_xfree(stack); +#endif + } +} + +static size_t json_frame_stack_memsize(const void *ptr) +{ + const json_frame_stack 
*stack = (const json_frame_stack *)ptr; + return sizeof(json_frame_stack) + sizeof(json_frame) * stack->capa; +} + +static const rb_data_type_t JSON_Parser_frame_stack_type = { + .wrap_struct_name = "JSON::Ext::Parser/frame_stack", + .function = { + .dmark = NULL, + .dfree = json_frame_stack_free, + .dsize = json_frame_stack_memsize, + }, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, +}; + +// Move a stack-allocated frame stack onto the heap, doubling its capacity. +// Stores the new TypedData wrapper in *handle, points *stack_ref at the +// heap-backed struct, and returns that struct. Only called by +// json_frame_stack_grow while the stack is still RVALUE_STACK_STACK_ALLOCATED. +static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref) +{ + json_frame_stack *stack; + *handle = TypedData_Make_Struct(0, json_frame_stack, &JSON_Parser_frame_stack_type, stack); + + // Allocate and fill the heap buffer before copying any field into the + // wrapped struct. ALLOC_N can raise; if the struct already carried + // old_stack->ptr (a buffer it does not own), json_frame_stack_free would + // later pass that pointer to ruby_xfree. Until the buffer exists the + // struct stays as TypedData_Make_Struct left it (zero-filled, ptr NULL). + // NOTE(review): rvalue_stack_spill presumably shares the original + // ordering -- confirm and mirror this change there. + long capa = old_stack->capa << 1; + json_frame *buffer = ALLOC_N(json_frame, capa); + MEMCPY(buffer, old_stack->ptr, json_frame, old_stack->head); + + // Nothing below can raise, so the struct is never observable half-built. + MEMCPY(stack, old_stack, json_frame_stack, 1); + stack->capa = capa; + stack->ptr = buffer; + stack->type = RVALUE_STACK_HEAP_ALLOCATED; + *stack_ref = stack; + return stack; +} + +static void json_frame_stack_eagerly_release(VALUE handle) +{ + if (handle) { + json_frame_stack *stack; + TypedData_Get_Struct(handle, json_frame_stack, &JSON_Parser_frame_stack_type, stack); +#ifdef HAVE_RUBY_TYPED_EMBEDDABLE + json_frame_stack_free_buffer(stack); +#else + json_frame_stack_free(stack); + RTYPEDDATA_DATA(handle) = NULL; +#endif + } +} + static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) { int len = 1; @@ -357,10 +488,12 @@ typedef struct JSON_ParserStruct { typedef struct JSON_ParserStateStruct { VALUE *stack_handle; + VALUE *frame_stack_handle; const char *start; const char *cursor; const char *end; rvalue_stack *stack; + json_frame_stack *frames; rvalue_cache name_cache; int in_array; int current_nesting; @@ -1247,166 +1380,231 @@ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_Par return json_parse_number(state, config, true, start); } +// Parse an arbitrary JSON value iteratively (without recursing per nesting +// level). 
The loop alternates between two modes: +// +// parsing_value == true: read one complete value at the cursor. Scalars are +// pushed straight onto the rvalue_stack; an opening '[' or '{' pushes a +// frame describing the new container and keeps parsing_value true to read +// its first element/value. +// +// parsing_value == false: a value has just landed on the rvalue_stack, so we +// consult the innermost open container (the top frame) to decide what +// comes next: a ',' (parse another element/value), or a closing bracket +// (pop the frame, bulk-build the container, push the result, and stay in +// this mode so the parent is consulted next). With no frames left, that +// value is the whole document and we return it. +// +// Because every value -- scalar, key, or freshly-closed container -- is pushed +// onto the shared rvalue_stack, the "attach the finished child to its parent" +// step happens implicitly: closing a container pops its children and pushes its +// single result back, right after its parent's earlier children. 
static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) { - json_eat_whitespace(state); + bool parsing_value = true; - switch (peek(state)) { - case 'n': - if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) { - state->cursor += 4; - return json_push_value(state, config, Qnil); - } + while (true) { + if (parsing_value) { + json_eat_whitespace(state); - raise_parse_error("unexpected token %s", state); - break; - case 't': - if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) { - state->cursor += 4; - return json_push_value(state, config, Qtrue); - } + switch (peek(state)) { + case 'n': + if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) { + state->cursor += 4; + json_push_value(state, config, Qnil); + parsing_value = false; + break; + } - raise_parse_error("unexpected token %s", state); - break; - case 'f': - // Note: memcmp with a small power of two compile to an integer comparison - if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) { - state->cursor += 5; - return json_push_value(state, config, Qfalse); - } + raise_parse_error("unexpected token %s", state); + case 't': + if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) { + state->cursor += 4; + json_push_value(state, config, Qtrue); + parsing_value = false; + break; + } - raise_parse_error("unexpected token %s", state); - break; - case 'N': - // Note: memcmp with a small power of two compile to an integer comparison - if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) { - state->cursor += 3; - return json_push_value(state, config, CNaN); - } + raise_parse_error("unexpected token %s", state); + case 'f': + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) { + state->cursor += 5; + json_push_value(state, config, Qfalse); + parsing_value = false; + break; + } - 
raise_parse_error("unexpected token %s", state); - break; - case 'I': - if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) { - state->cursor += 8; - return json_push_value(state, config, CInfinity); - } + raise_parse_error("unexpected token %s", state); + case 'N': + // Note: memcmp with a small power of two compile to an integer comparison + if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) { + state->cursor += 3; + json_push_value(state, config, CNaN); + parsing_value = false; + break; + } - raise_parse_error("unexpected token %s", state); - break; - case '-': { - // Note: memcmp with a small power of two compile to an integer comparison - if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) { - if (config->allow_nan) { - state->cursor += 9; - return json_push_value(state, config, CMinusInfinity); - } else { raise_parse_error("unexpected token %s", state); + case 'I': + if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) { + state->cursor += 8; + json_push_value(state, config, CInfinity); + parsing_value = false; + break; + } + + raise_parse_error("unexpected token %s", state); + case '-': { + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) { + if (config->allow_nan) { + state->cursor += 9; + json_push_value(state, config, CMinusInfinity); + parsing_value = false; + break; + } else { + raise_parse_error("unexpected token %s", state); + } + } + json_push_value(state, config, json_parse_negative_number(state, config)); + parsing_value = false; + break; } - } - return json_push_value(state, config, json_parse_negative_number(state, config)); - break; - } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - return json_push_value(state, config, json_parse_positive_number(state, 
config)); - break; - case '"': { - // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} - return json_parse_string(state, config, false); - break; - } - case '[': { - state->cursor++; - json_eat_whitespace(state); - long stack_head = state->stack->head; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + json_push_value(state, config, json_parse_positive_number(state, config)); + parsing_value = false; + break; + case '"': { + // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} + json_parse_string(state, config, false); + parsing_value = false; + break; + } + case '[': { + state->cursor++; + json_eat_whitespace(state); + long stack_head = state->stack->head; - if (peek(state) == ']') { - state->cursor++; - return json_push_value(state, config, json_decode_array(state, config, 0)); - } else { - state->current_nesting++; - if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { - rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + if (peek(state) == ']') { + state->cursor++; + json_push_value(state, config, json_decode_array(state, config, 0)); + parsing_value = false; + break; + } + + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } + state->in_array++; + + json_frame frame = { .type = JSON_FRAME_ARRAY, .stack_head = stack_head, .start_cursor = NULL }; + json_frame_stack_push(state->frames, frame, state->frame_stack_handle, &state->frames); + // Keep parsing_value true to read the first element. 
+ break; } - state->in_array++; - json_parse_any(state, config); + case '{': { + const char *object_start_cursor = state->cursor; + + state->cursor++; + json_eat_whitespace(state); + long stack_head = state->stack->head; + + if (peek(state) == '}') { + state->cursor++; + json_push_value(state, config, json_decode_object(state, config, 0)); + parsing_value = false; + break; + } + + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } + + if (peek(state) != '"') { + raise_parse_error("expected object key, got %s", state); + } + json_parse_string(state, config, true); + + json_eat_whitespace(state); + if (peek(state) != ':') { + raise_parse_error("expected ':' after object key", state); + } + state->cursor++; + + json_frame frame = { .type = JSON_FRAME_OBJECT, .stack_head = stack_head, .start_cursor = object_start_cursor }; + json_frame_stack_push(state->frames, frame, state->frame_stack_handle, &state->frames); + // Keep parsing_value true to read the first value. + break; + } + + case 0: + raise_parse_error("unexpected end of input", state); + + default: + raise_parse_error("unexpected character: %s", state); + } + } else { + json_frame_stack *frames = state->frames; + if (frames->head == 0) { + // The completed value is the entire document. + return *rvalue_stack_peek(state->stack, 1); } - while (true) { + json_frame *frame = json_frame_stack_peek(frames); + + if (frame->type == JSON_FRAME_ARRAY) { json_eat_whitespace(state); const char next_char = peek(state); + if (next_char == ']') { + state->cursor++; + long count = state->stack->head - frame->stack_head; + state->current_nesting--; + state->in_array--; + json_frame_stack_pop(frames); + json_push_value(state, config, json_decode_array(state, config, count)); + // parsing_value stays false: consult the parent next. 
+ continue; + } + if (RB_LIKELY(next_char == ',')) { state->cursor++; if (config->allow_trailing_comma) { json_eat_whitespace(state); if (peek(state) == ']') { + // Trailing comma: re-consult the frame to close it. continue; } } - json_parse_any(state, config); + parsing_value = true; continue; } - if (next_char == ']') { - state->cursor++; - long count = state->stack->head - stack_head; - state->current_nesting--; - state->in_array--; - return json_push_value(state, config, json_decode_array(state, config, count)); - } - raise_parse_error("expected ',' or ']' after array value", state); - } - break; - } - case '{': { - const char *object_start_cursor = state->cursor; - - state->cursor++; - json_eat_whitespace(state); - long stack_head = state->stack->head; - - if (peek(state) == '}') { - state->cursor++; - return json_push_value(state, config, json_decode_object(state, config, 0)); } else { - state->current_nesting++; - if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { - rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); - } - - if (peek(state) != '"') { - raise_parse_error("expected object key, got %s", state); - } - json_parse_string(state, config, true); - - json_eat_whitespace(state); - if (peek(state) != ':') { - raise_parse_error("expected ':' after object key", state); - } - state->cursor++; - - json_parse_any(state, config); - } - - while (true) { json_eat_whitespace(state); const char next_char = peek(state); + if (next_char == '}') { state->cursor++; state->current_nesting--; - size_t count = state->stack->head - stack_head; + size_t count = state->stack->head - frame->stack_head; // Temporary rewind cursor in case an error is raised const char *final_cursor = state->cursor; - state->cursor = object_start_cursor; + state->cursor = frame->start_cursor; VALUE object = json_decode_object(state, config, count); state->cursor = final_cursor; - return json_push_value(state, config, object); + 
json_frame_stack_pop(frames); + json_push_value(state, config, object); + // parsing_value stays false: consult the parent next. + continue; } if (next_char == ',') { @@ -1415,6 +1613,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) if (config->allow_trailing_comma) { if (peek(state) == '}') { + // Trailing comma: re-consult the frame to close it. continue; } } @@ -1430,27 +1629,14 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } state->cursor++; - json_parse_any(state, config); - + parsing_value = true; continue; } raise_parse_error("expected ',' or '}' after object value, got: %s", state); } - break; } - - case 0: - raise_parse_error("unexpected end of input", state); - break; - - default: - raise_parse_error("unexpected character: %s", state); - break; } - - raise_parse_error("unreachable: %s", state); - return Qundef; } static void json_ensure_eof(JSON_ParserState *state) @@ -1622,18 +1808,28 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) .capa = RVALUE_STACK_INITIAL_CAPA, }; + json_frame frame_stack_buffer[JSON_FRAME_STACK_INITIAL_CAPA]; + json_frame_stack frames = { + .type = RVALUE_STACK_STACK_ALLOCATED, + .ptr = frame_stack_buffer, + .capa = JSON_FRAME_STACK_INITIAL_CAPA, + }; + long len; const char *start; RSTRING_GETMEM(Vsource, start, len); VALUE stack_handle = 0; + VALUE frame_stack_handle = 0; JSON_ParserState _state = { .start = start, .cursor = start, .end = start + len, .stack = &stack, .stack_handle = &stack_handle, + .frames = &frames, + .frame_stack_handle = &frame_stack_handle, }; JSON_ParserState *state = &_state; @@ -1642,7 +1838,9 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) // This may be skipped in case of exception, but // it won't cause a leak. 
rvalue_stack_eagerly_release(stack_handle); + json_frame_stack_eagerly_release(frame_stack_handle); RB_GC_GUARD(stack_handle); + RB_GC_GUARD(frame_stack_handle); RB_GC_GUARD(Vsource); json_ensure_eof(state); From ba2f0342df57af2dfe9920e6f35433fc2a7befd0 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Mon, 1 Jun 2026 13:43:40 -0400 Subject: [PATCH 2/9] JSON iterative parsing phases Each frame in the iterative parser now holds an enum describing its "phase", in order to support suspending parsing. --- ext/json/ext/parser/parser.c | 475 +++++++++++++++++++---------------- 1 file changed, 255 insertions(+), 220 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 0bffe4b5..f5aa27c5 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -315,27 +315,42 @@ static void rvalue_stack_eagerly_release(VALUE handle) // currently being built, instead of relying on the C call stack. Each frame // only needs enough bookkeeping to close its container: which kind it is, the // rvalue_stack position where its children start (so we know how many to pop), -// and the cursor at its opening brace (used to rewind for duplicate key errors). -// Frames hold no VALUEs, so this stack needs no GC marking; it reuses the same -// stack-allocated-with-heap-spill strategy as the rvalue_stack so that it's -// freed even if parsing raises. +// and the cursor at its opening brace (used to rewind for duplicate key +// errors). Frames hold no VALUEs, so this stack needs no GC marking; it reuses +// the same stack-allocated-with-heap-spill strategy as the rvalue_stack so that +// it's freed even if parsing raises. // -// The lifecycle helpers below (grow/push/peek/pop/spill/free/eagerly_release and -// the rb_data_type_t) deliberately mirror their rvalue_stack counterparts -- the -// element type and the absence of a mark function are the only real differences. 
-// Keep the two in sync: a fix to the spill/release or HAVE_RUBY_TYPED_EMBEDDABLE -// handling in one almost certainly belongs in the other. +// The lifecycle helpers below (grow/push/peek/pop/spill/free/eagerly_release +// and the rb_data_type_t) deliberately mirror their rvalue_stack counterparts +// -- the element type and the absence of a mark function are the only real +// differences. Keep the two in sync: a fix to the spill/release or +// HAVE_RUBY_TYPED_EMBEDDABLE handling in one almost certainly belongs in the +// other. #define JSON_FRAME_STACK_INITIAL_CAPA 32 enum json_frame_type { + JSON_FRAME_ROOT, JSON_FRAME_ARRAY, JSON_FRAME_OBJECT, }; +// Where a frame is within its container's grammar. This is the entirety of the +// parser's "what to do next" state: json_parse_any dispatches on the top +// frame's phase and holds no resume state in C locals, so a parse can stop at +// any value boundary and be resumed purely from the (persistable) frame stack. +enum json_frame_phase { + JSON_PHASE_VALUE, // expecting a value (document root, array element, or object value after ':') + JSON_PHASE_COMMA, // after a value: expecting ',' or the closing ']' / '}' + JSON_PHASE_KEY, // object only: expecting a '"' key (after '{' or ',') + JSON_PHASE_COLON, // object only: after a key, expecting ':' + JSON_PHASE_DONE, // root only: the document value has been parsed +}; + typedef struct json_frame_struct { enum json_frame_type type; + enum json_frame_phase phase; long stack_head; // rvalue_stack->head when this container opened - const char *start_cursor; // object frames only (the '{'); NULL for arrays + const char *start_cursor; // object frames only (the '{'); NULL otherwise } json_frame; typedef struct json_frame_stack_struct { @@ -1380,261 +1395,280 @@ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_Par return json_parse_number(state, config, true, start); } -// Parse an arbitrary JSON value iteratively (without recursing per nesting -// 
level). The loop alternates between two modes: -// -// parsing_value == true: read one complete value at the cursor. Scalars are -// pushed straight onto the rvalue_stack; an opening '[' or '{' pushes a -// frame describing the new container and keeps parsing_value true to read -// its first element/value. -// -// parsing_value == false: a value has just landed on the rvalue_stack, so we -// consult the innermost open container (the top frame) to decide what -// comes next: a ',' (parse another element/value), or a closing bracket -// (pop the frame, bulk-build the container, push the result, and stay in -// this mode so the parent is consulted next). With no frames left, that -// value is the whole document and we return it. -// -// Because every value -- scalar, key, or freshly-closed container -- is pushed -// onto the shared rvalue_stack, the "attach the finished child to its parent" -// step happens implicitly: closing a container pops its children and pushes its -// single result back, right after its parent's earlier children. -static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +// How many values (array elements, or interleaved object keys+values) have been +// pushed onto the rvalue stack since this container opened. Used to size the +// bulk decode on close, and to tell the first key/colon from later ones. +static inline long json_frame_entry_count(const json_frame *frame, const rvalue_stack *stack) +{ + return stack->head - frame->stack_head; +} + +// A complete value now sits on top of the rvalue stack. Advance the frame that +// was waiting for it: the root document is done, or the enclosing container +// moves on to expecting a ',' or its closing bracket. The caller passes the +// frame it already has in hand -- the one that was expecting the value -- which +// after a container close is the freshly re-exposed parent. 
+static inline void json_value_completed(json_frame *frame) +{ + frame->phase = (frame->type == JSON_FRAME_ROOT) ? JSON_PHASE_DONE : JSON_PHASE_COMMA; +} + +// Seed the frame stack with the root frame, establishing the invariant that +// json_parse_any always has a top frame to dispatch on (so the stack is never +// empty mid-parse). +static void json_parse_begin(JSON_ParserState *state) { - bool parsing_value = true; + json_frame root_frame = { .type = JSON_FRAME_ROOT, .phase = JSON_PHASE_VALUE }; + json_frame_stack_push(state->frames, root_frame, state->frame_stack_handle, &state->frames); +} +// Parse an arbitrary JSON value iteratively. This is a state machine driven +// entirely by the top frame's phase so it can stop at any value boundary and +// resume purely from the frame stack. A JSON_FRAME_ROOT frame sits at the +// bottom of the stack, so the stack is never empty mid-parse and the document +// itself is just another frame whose value, once parsed, leaves its phase DONE. +static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +{ while (true) { - if (parsing_value) { - json_eat_whitespace(state); - - switch (peek(state)) { - case 'n': - if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) { - state->cursor += 4; - json_push_value(state, config, Qnil); - parsing_value = false; - break; - } + json_frame *frame = json_frame_stack_peek(state->frames); - raise_parse_error("unexpected token %s", state); - case 't': - if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) { - state->cursor += 4; - json_push_value(state, config, Qtrue); - parsing_value = false; - break; - } + switch (frame->phase) { + case JSON_PHASE_DONE: + // The root document value is parsed; it is the lone survivor on + // the rvalue stack. 
+ return *rvalue_stack_peek(state->stack, 1); - raise_parse_error("unexpected token %s", state); - case 'f': - // Note: memcmp with a small power of two compile to an integer comparison - if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) { - state->cursor += 5; - json_push_value(state, config, Qfalse); - parsing_value = false; - break; - } + case JSON_PHASE_VALUE: + json_eat_whitespace(state); - raise_parse_error("unexpected token %s", state); - case 'N': - // Note: memcmp with a small power of two compile to an integer comparison - if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) { - state->cursor += 3; - json_push_value(state, config, CNaN); - parsing_value = false; - break; - } + switch (peek(state)) { + case 'n': + if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) { + state->cursor += 4; + json_push_value(state, config, Qnil); + json_value_completed(frame); + break; + } - raise_parse_error("unexpected token %s", state); - case 'I': - if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) { - state->cursor += 8; - json_push_value(state, config, CInfinity); - parsing_value = false; + raise_parse_error("unexpected token %s", state); + case 't': + if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) { + state->cursor += 4; + json_push_value(state, config, Qtrue); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case 'f': + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) { + state->cursor += 5; + json_push_value(state, config, Qfalse); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case 'N': + // Note: memcmp with a small power of two compile to an integer comparison + if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 
0)) { + state->cursor += 3; + json_push_value(state, config, CNaN); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case 'I': + if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) { + state->cursor += 8; + json_push_value(state, config, CInfinity); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case '-': { + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) { + if (config->allow_nan) { + state->cursor += 9; + json_push_value(state, config, CMinusInfinity); + json_value_completed(frame); + break; + } else { + raise_parse_error("unexpected token %s", state); + } + } + json_push_value(state, config, json_parse_negative_number(state, config)); + json_value_completed(frame); break; } + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + json_push_value(state, config, json_parse_positive_number(state, config)); + json_value_completed(frame); + break; + case '"': + // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} + json_parse_string(state, config, false); + json_value_completed(frame); + break; + case '[': { + state->cursor++; + json_eat_whitespace(state); + long stack_head = state->stack->head; - raise_parse_error("unexpected token %s", state); - case '-': { - // Note: memcmp with a small power of two compile to an integer comparison - if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) { - if (config->allow_nan) { - state->cursor += 9; - json_push_value(state, config, CMinusInfinity); - parsing_value = false; + if (peek(state) == ']') { + state->cursor++; + json_push_value(state, config, json_decode_array(state, config, 0)); + json_value_completed(frame); break; - } else { - raise_parse_error("unexpected token %s", state); } - } - json_push_value(state, 
config, json_parse_negative_number(state, config)); - parsing_value = false; - break; - } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - json_push_value(state, config, json_parse_positive_number(state, config)); - parsing_value = false; - break; - case '"': { - // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} - json_parse_string(state, config, false); - parsing_value = false; - break; - } - case '[': { - state->cursor++; - json_eat_whitespace(state); - long stack_head = state->stack->head; - if (peek(state) == ']') { - state->cursor++; - json_push_value(state, config, json_decode_array(state, config, 0)); - parsing_value = false; + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } + state->in_array++; + + json_frame array_frame = { .type = JSON_FRAME_ARRAY, .phase = JSON_PHASE_VALUE, .stack_head = stack_head, .start_cursor = NULL }; + json_frame_stack_push(state->frames, array_frame, state->frame_stack_handle, &state->frames); + // Phase stays VALUE: the next iteration reads the first element. break; } + case '{': { + const char *object_start_cursor = state->cursor; - state->current_nesting++; - if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { - rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); - } - state->in_array++; + state->cursor++; + json_eat_whitespace(state); + long stack_head = state->stack->head; - json_frame frame = { .type = JSON_FRAME_ARRAY, .stack_head = stack_head, .start_cursor = NULL }; - json_frame_stack_push(state->frames, frame, state->frame_stack_handle, &state->frames); - // Keep parsing_value true to read the first element. 
- break; - } - case '{': { - const char *object_start_cursor = state->cursor; + if (peek(state) == '}') { + state->cursor++; + json_push_value(state, config, json_decode_object(state, config, 0)); + json_value_completed(frame); + break; + } - state->cursor++; - json_eat_whitespace(state); - long stack_head = state->stack->head; + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } - if (peek(state) == '}') { - state->cursor++; - json_push_value(state, config, json_decode_object(state, config, 0)); - parsing_value = false; + json_frame object_frame = { .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_KEY, .stack_head = stack_head, .start_cursor = object_start_cursor }; + json_frame_stack_push(state->frames, object_frame, state->frame_stack_handle, &state->frames); + // Phase KEY: the next iteration reads the first key. break; } - state->current_nesting++; - if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { - rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); - } + case 0: + raise_parse_error("unexpected end of input", state); + + default: + raise_parse_error("unexpected character: %s", state); + } + break; - if (peek(state) != '"') { + case JSON_PHASE_KEY: + json_eat_whitespace(state); + if (RB_UNLIKELY(peek(state) != '"')) { + // The message differs for the first key vs. a key after a + // ',': the first is the only one reached with nothing pushed + // for this object yet. 
+ if (json_frame_entry_count(frame, state->stack) == 0) { raise_parse_error("expected object key, got %s", state); + } else { + raise_parse_error("expected object key, got: %s", state); } - json_parse_string(state, config, true); + } + json_parse_string(state, config, true); + frame->phase = JSON_PHASE_COLON; + break; - json_eat_whitespace(state); - if (peek(state) != ':') { + case JSON_PHASE_COLON: + json_eat_whitespace(state); + if (RB_UNLIKELY(peek(state) != ':')) { + // First colon (only the first pair's key is pushed, nothing + // else) vs. a later one. + if (json_frame_entry_count(frame, state->stack) == 1) { raise_parse_error("expected ':' after object key", state); + } else { + raise_parse_error("expected ':' after object key, got: %s", state); } - state->cursor++; - - json_frame frame = { .type = JSON_FRAME_OBJECT, .stack_head = stack_head, .start_cursor = object_start_cursor }; - json_frame_stack_push(state->frames, frame, state->frame_stack_handle, &state->frames); - // Keep parsing_value true to read the first value. - break; } + state->cursor++; + frame->phase = JSON_PHASE_VALUE; + break; - case 0: - raise_parse_error("unexpected end of input", state); - - default: - raise_parse_error("unexpected character: %s", state); - } - } else { - json_frame_stack *frames = state->frames; - if (frames->head == 0) { - // The completed value is the entire document. 
- return *rvalue_stack_peek(state->stack, 1); - } - - json_frame *frame = json_frame_stack_peek(frames); - - if (frame->type == JSON_FRAME_ARRAY) { + case JSON_PHASE_COMMA: json_eat_whitespace(state); - const char next_char = peek(state); + if (frame->type == JSON_FRAME_ARRAY) { + const char next_char = peek(state); - if (next_char == ']') { - state->cursor++; - long count = state->stack->head - frame->stack_head; - state->current_nesting--; - state->in_array--; - json_frame_stack_pop(frames); - json_push_value(state, config, json_decode_array(state, config, count)); - // parsing_value stays false: consult the parent next. - continue; - } + if (next_char == ']') { + state->cursor++; + long count = json_frame_entry_count(frame, state->stack); + state->current_nesting--; + state->in_array--; + json_frame_stack_pop(state->frames); + json_push_value(state, config, json_decode_array(state, config, count)); + json_value_completed(json_frame_stack_peek(state->frames)); + break; + } - if (RB_LIKELY(next_char == ',')) { - state->cursor++; - if (config->allow_trailing_comma) { - json_eat_whitespace(state); - if (peek(state) == ']') { - // Trailing comma: re-consult the frame to close it. - continue; + if (RB_LIKELY(next_char == ',')) { + state->cursor++; + if (config->allow_trailing_comma) { + json_eat_whitespace(state); + if (peek(state) == ']') { + // Trailing comma: stay in COMMA to close on the next iteration. 
+ break; + } } + frame->phase = JSON_PHASE_VALUE; + break; } - parsing_value = true; - continue; - } - - raise_parse_error("expected ',' or ']' after array value", state); - } else { - json_eat_whitespace(state); - const char next_char = peek(state); + raise_parse_error("expected ',' or ']' after array value", state); + } else { + const char next_char = peek(state); - if (next_char == '}') { - state->cursor++; - state->current_nesting--; - size_t count = state->stack->head - frame->stack_head; - - // Temporary rewind cursor in case an error is raised - const char *final_cursor = state->cursor; - state->cursor = frame->start_cursor; - VALUE object = json_decode_object(state, config, count); - state->cursor = final_cursor; - - json_frame_stack_pop(frames); - json_push_value(state, config, object); - // parsing_value stays false: consult the parent next. - continue; - } + if (next_char == '}') { + state->cursor++; + state->current_nesting--; + size_t count = json_frame_entry_count(frame, state->stack); + + // Temporary rewind cursor in case an error is raised + const char *final_cursor = state->cursor; + state->cursor = frame->start_cursor; + VALUE object = json_decode_object(state, config, count); + state->cursor = final_cursor; + + json_frame_stack_pop(state->frames); + json_push_value(state, config, object); + json_value_completed(json_frame_stack_peek(state->frames)); + break; + } - if (next_char == ',') { - state->cursor++; - json_eat_whitespace(state); + if (next_char == ',') { + state->cursor++; + json_eat_whitespace(state); - if (config->allow_trailing_comma) { - if (peek(state) == '}') { - // Trailing comma: re-consult the frame to close it. - continue; + if (config->allow_trailing_comma) { + if (peek(state) == '}') { + // Trailing comma: stay in COMMA to close on the next iteration. 
+ break; + } } - } - if (RB_UNLIKELY(peek(state) != '"')) { - raise_parse_error("expected object key, got: %s", state); + frame->phase = JSON_PHASE_KEY; + break; } - json_parse_string(state, config, true); - json_eat_whitespace(state); - if (RB_UNLIKELY(peek(state) != ':')) { - raise_parse_error("expected ':' after object key, got: %s", state); - } - state->cursor++; - - parsing_value = true; - continue; + raise_parse_error("expected ',' or '}' after object value, got: %s", state); } - - raise_parse_error("expected ',' or '}' after object value, got: %s", state); - } } } } @@ -1833,6 +1867,7 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) }; JSON_ParserState *state = &_state; + json_parse_begin(state); VALUE result = json_parse_any(state, config); // This may be skipped in case of exception, but From 09cc5afa8cd348ef700c71385063eb73051f243b Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 2 Jun 2026 09:19:09 +0200 Subject: [PATCH 3/9] parser.c: Split JSON_PHASE_COMMA into JSON_PHASE_ARRAY_COMMA and JSON_PHASE_OBJECT_COMMA Allows removing one conditional. --- ext/json/ext/parser/parser.c | 163 ++++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 69 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index f5aa27c5..bb61d8f0 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -340,9 +340,10 @@ enum json_frame_type { // any value boundary and be resumed purely from the (persistable) frame stack.
enum json_frame_phase { JSON_PHASE_VALUE, // expecting a value (document root, array element, or object value after ':') - JSON_PHASE_COMMA, // after a value: expecting ',' or the closing ']' / '}' - JSON_PHASE_KEY, // object only: expecting a '"' key (after '{' or ',') - JSON_PHASE_COLON, // object only: after a key, expecting ':' + JSON_PHASE_ARRAY_COMMA, // after a value: expecting ',' or the closing ']' + JSON_PHASE_OBJECT_KEY, // expecting a '"' key (after '{' or ',') + JSON_PHASE_OBJECT_COMMA, // after a value: expecting ',' or the closing '}' + JSON_PHASE_OBJECT_COLON, // object only: after a key, expecting ':' JSON_PHASE_DONE, // root only: the document value has been parsed }; @@ -1410,7 +1411,18 @@ static inline long json_frame_entry_count(const json_frame *frame, const rvalue_ // after a container close is the freshly re-exposed parent. static inline void json_value_completed(json_frame *frame) { - frame->phase = (frame->type == JSON_FRAME_ROOT) ? JSON_PHASE_DONE : JSON_PHASE_COMMA; + // TODO: consider a lookup table? + switch (frame->type) { + case JSON_FRAME_ROOT: + frame->phase = JSON_PHASE_DONE; + break; + case JSON_FRAME_ARRAY: + frame->phase = JSON_PHASE_ARRAY_COMMA; + break; + case JSON_FRAME_OBJECT: + frame->phase = JSON_PHASE_OBJECT_COMMA; + break; + } } // Seed the frame stack with the root frame, establishing the invariant that @@ -1433,12 +1445,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) json_frame *frame = json_frame_stack_peek(state->frames); switch (frame->phase) { - case JSON_PHASE_DONE: + case JSON_PHASE_DONE: { // The root document value is parsed; it is the lone survivor on // the rvalue stack. 
return *rvalue_stack_peek(state->stack, 1); + } - case JSON_PHASE_VALUE: + case JSON_PHASE_VALUE: { json_eat_whitespace(state); switch (peek(state)) { @@ -1556,7 +1569,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); } - json_frame object_frame = { .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_KEY, .stack_head = stack_head, .start_cursor = object_start_cursor }; + json_frame object_frame = { .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, .stack_head = stack_head, .start_cursor = object_start_cursor }; json_frame_stack_push(state->frames, object_frame, state->frame_stack_handle, &state->frames); // Phase KEY: the next iteration reads the first key. break; @@ -1569,10 +1582,17 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) raise_parse_error("unexpected character: %s", state); } break; + } + + case JSON_PHASE_OBJECT_KEY: { + JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); - case JSON_PHASE_KEY: json_eat_whitespace(state); - if (RB_UNLIKELY(peek(state) != '"')) { + + if (RB_LIKELY(peek(state) == '"')) { + json_parse_string(state, config, true); + frame->phase = JSON_PHASE_OBJECT_COLON; + } else { // The message differs for the first key vs. a key after a // ',': the first is the only one reached with nothing pushed // for this object yet. 
@@ -1582,13 +1602,18 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) raise_parse_error("expected object key, got: %s", state); } } - json_parse_string(state, config, true); - frame->phase = JSON_PHASE_COLON; break; + } + + case JSON_PHASE_OBJECT_COLON: { + JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); - case JSON_PHASE_COLON: json_eat_whitespace(state); - if (RB_UNLIKELY(peek(state) != ':')) { + + if (RB_LIKELY(peek(state) == ':')) { + state->cursor++; + frame->phase = JSON_PHASE_VALUE; + } else { // First colon (only the first pair's key is pushed, nothing // else) vs. a later one. if (json_frame_entry_count(frame, state->stack) == 1) { @@ -1597,78 +1622,78 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) raise_parse_error("expected ':' after object key, got: %s", state); } } - state->cursor++; - frame->phase = JSON_PHASE_VALUE; break; + } - case JSON_PHASE_COMMA: - json_eat_whitespace(state); + case JSON_PHASE_ARRAY_COMMA: { + JSON_ASSERT(frame->type == JSON_FRAME_ARRAY); - if (frame->type == JSON_FRAME_ARRAY) { - const char next_char = peek(state); + json_eat_whitespace(state); - if (next_char == ']') { - state->cursor++; - long count = json_frame_entry_count(frame, state->stack); - state->current_nesting--; - state->in_array--; - json_frame_stack_pop(state->frames); - json_push_value(state, config, json_decode_array(state, config, count)); - json_value_completed(json_frame_stack_peek(state->frames)); - break; - } + const char next_char = peek(state); - if (RB_LIKELY(next_char == ',')) { - state->cursor++; - if (config->allow_trailing_comma) { - json_eat_whitespace(state); - if (peek(state) == ']') { - // Trailing comma: stay in COMMA to close on the next iteration. - break; - } + if (RB_LIKELY(next_char == ',')) { + state->cursor++; + if (config->allow_trailing_comma) { + json_eat_whitespace(state); + if (peek(state) == ']') { + // Trailing comma: stay in COMMA to close on the next iteration. 
+ break; } - frame->phase = JSON_PHASE_VALUE; - break; } - - raise_parse_error("expected ',' or ']' after array value", state); + frame->phase = JSON_PHASE_VALUE; + } else if (next_char == ']') { + state->cursor++; + long count = json_frame_entry_count(frame, state->stack); + state->current_nesting--; + state->in_array--; + json_frame_stack_pop(state->frames); + json_push_value(state, config, json_decode_array(state, config, count)); + json_value_completed(json_frame_stack_peek(state->frames)); } else { - const char next_char = peek(state); + raise_parse_error("expected ',' or ']' after array value", state); + } + break; + } - if (next_char == '}') { - state->cursor++; - state->current_nesting--; - size_t count = json_frame_entry_count(frame, state->stack); - - // Temporary rewind cursor in case an error is raised - const char *final_cursor = state->cursor; - state->cursor = frame->start_cursor; - VALUE object = json_decode_object(state, config, count); - state->cursor = final_cursor; - - json_frame_stack_pop(state->frames); - json_push_value(state, config, object); - json_value_completed(json_frame_stack_peek(state->frames)); - break; - } + case JSON_PHASE_OBJECT_COMMA: { + JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); - if (next_char == ',') { - state->cursor++; - json_eat_whitespace(state); + json_eat_whitespace(state); + const char next_char = peek(state); - if (config->allow_trailing_comma) { - if (peek(state) == '}') { - // Trailing comma: stay in COMMA to close on the next iteration. - break; - } - } + if (RB_LIKELY(next_char == ',')) { + state->cursor++; + json_eat_whitespace(state); - frame->phase = JSON_PHASE_KEY; - break; + if (config->allow_trailing_comma) { + if (peek(state) == '}') { + // Trailing comma: stay in COMMA to close on the next iteration. 
+ break; + } } + frame->phase = JSON_PHASE_OBJECT_KEY; + break; + } else if (next_char == '}') { + state->cursor++; + state->current_nesting--; + size_t count = json_frame_entry_count(frame, state->stack); + + // Temporary rewind cursor in case an error is raised + const char *final_cursor = state->cursor; + state->cursor = frame->start_cursor; + VALUE object = json_decode_object(state, config, count); + state->cursor = final_cursor; + + json_frame_stack_pop(state->frames); + json_push_value(state, config, object); + json_value_completed(json_frame_stack_peek(state->frames)); + break; + } else { raise_parse_error("expected ',' or '}' after object value, got: %s", state); } + } } } } From 956fdba2fd59b72d22c22373c8e7fb2c387e5072 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 2 Jun 2026 09:33:37 +0200 Subject: [PATCH 4/9] json_parse_any: introduce computed gotos Saves having to go through the dispatch loop again. --- ext/json/ext/parser/parser.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index bb61d8f0..5784ae84 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -376,13 +376,15 @@ static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *h return stack; } -static void json_frame_stack_push(json_frame_stack *stack, json_frame frame, VALUE *handle, json_frame_stack **stack_ref) +static json_frame *json_frame_stack_push(json_frame_stack *stack, json_frame frame, VALUE *handle, json_frame_stack **stack_ref) { if (RB_UNLIKELY(stack->head >= stack->capa)) { stack = json_frame_stack_grow(stack, handle, stack_ref); } stack->ptr[stack->head] = frame; + json_frame *frame_ptr = &stack->ptr[stack->head]; stack->head++; + return frame_ptr; } static inline json_frame *json_frame_stack_peek(json_frame_stack *stack) @@ -1452,6 +1454,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } 
case JSON_PHASE_VALUE: { + JSON_PHASE_VALUE: json_eat_whitespace(state); switch (peek(state)) { @@ -1545,9 +1548,10 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } state->in_array++; - json_frame array_frame = { .type = JSON_FRAME_ARRAY, .phase = JSON_PHASE_VALUE, .stack_head = stack_head, .start_cursor = NULL }; - json_frame_stack_push(state->frames, array_frame, state->frame_stack_handle, &state->frames); // Phase stays VALUE: the next iteration reads the first element. + json_frame array_frame = { .type = JSON_FRAME_ARRAY, .phase = JSON_PHASE_VALUE, .stack_head = stack_head, .start_cursor = NULL }; + frame = json_frame_stack_push(state->frames, array_frame, state->frame_stack_handle, &state->frames); + goto JSON_PHASE_VALUE; break; } case '{': { @@ -1570,8 +1574,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } json_frame object_frame = { .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, .stack_head = stack_head, .start_cursor = object_start_cursor }; - json_frame_stack_push(state->frames, object_frame, state->frame_stack_handle, &state->frames); + frame = json_frame_stack_push(state->frames, object_frame, state->frame_stack_handle, &state->frames); // Phase KEY: the next iteration reads the first key. + goto JSON_PHASE_OBJECT_KEY; break; } @@ -1585,6 +1590,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } case JSON_PHASE_OBJECT_KEY: { + JSON_PHASE_OBJECT_KEY: JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); json_eat_whitespace(state); @@ -1592,6 +1598,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) if (RB_LIKELY(peek(state) == '"')) { json_parse_string(state, config, true); frame->phase = JSON_PHASE_OBJECT_COLON; + goto JSON_PHASE_OBJECT_COLON; } else { // The message differs for the first key vs. 
a key after a // ',': the first is the only one reached with nothing pushed @@ -1606,6 +1613,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } case JSON_PHASE_OBJECT_COLON: { + JSON_PHASE_OBJECT_COLON: JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); json_eat_whitespace(state); @@ -1613,6 +1621,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) if (RB_LIKELY(peek(state) == ':')) { state->cursor++; frame->phase = JSON_PHASE_VALUE; + goto JSON_PHASE_VALUE; } else { // First colon (only the first pair's key is pushed, nothing // else) vs. a later one. @@ -1642,6 +1651,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } } frame->phase = JSON_PHASE_VALUE; + goto JSON_PHASE_VALUE; } else if (next_char == ']') { state->cursor++; long count = json_frame_entry_count(frame, state->stack); @@ -1674,6 +1684,8 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } frame->phase = JSON_PHASE_OBJECT_KEY; + goto JSON_PHASE_OBJECT_KEY; + break; } else if (next_char == '}') { state->cursor++; From f92199a7dbd5c15a1fc7045eae556ab64d6ca7c8 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 2 Jun 2026 09:47:30 +0200 Subject: [PATCH 5/9] parser.c: Refactor json_frame_stack_push Take fewer arguments so it's easier to read.
--- ext/json/ext/parser/parser.c | 101 +++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 5784ae84..17436fe7 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -361,6 +361,40 @@ typedef struct json_frame_stack_struct { json_frame *ptr; } json_frame_stack; +enum duplicate_key_action { + JSON_DEPRECATED = 0, + JSON_IGNORE, + JSON_RAISE, +}; + +typedef struct JSON_ParserStruct { + VALUE on_load_proc; + VALUE decimal_class; + ID decimal_method_id; + enum duplicate_key_action on_duplicate_key; + int max_nesting; + bool allow_nan; + bool allow_trailing_comma; + bool allow_control_characters; + bool allow_invalid_escape; + bool symbolize_names; + bool freeze; +} JSON_ParserConfig; + +typedef struct JSON_ParserStateStruct { + VALUE *stack_handle; + VALUE *frame_stack_handle; + const char *start; + const char *cursor; + const char *end; + rvalue_stack *stack; + json_frame_stack *frames; + rvalue_cache name_cache; + int in_array; + int current_nesting; + unsigned int emitted_deprecations; +} JSON_ParserState; + static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref); static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *handle, json_frame_stack **stack_ref) @@ -376,14 +410,15 @@ static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *h return stack; } -static json_frame *json_frame_stack_push(json_frame_stack *stack, json_frame frame, VALUE *handle, json_frame_stack **stack_ref) +static json_frame *json_frame_stack_push(JSON_ParserState *state, json_frame frame) { + json_frame_stack *stack = state->frames; if (RB_UNLIKELY(stack->head >= stack->capa)) { - stack = json_frame_stack_grow(stack, handle, stack_ref); + stack = json_frame_stack_grow(stack, state->frame_stack_handle, &state->frames); } - stack->ptr[stack->head] = 
frame; - json_frame *frame_ptr = &stack->ptr[stack->head]; - stack->head++; + + json_frame *frame_ptr = &stack->ptr[stack->head++]; + *frame_ptr = frame; return frame_ptr; } @@ -484,40 +519,6 @@ static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) return len; } -enum duplicate_key_action { - JSON_DEPRECATED = 0, - JSON_IGNORE, - JSON_RAISE, -}; - -typedef struct JSON_ParserStruct { - VALUE on_load_proc; - VALUE decimal_class; - ID decimal_method_id; - enum duplicate_key_action on_duplicate_key; - int max_nesting; - bool allow_nan; - bool allow_trailing_comma; - bool allow_control_characters; - bool allow_invalid_escape; - bool symbolize_names; - bool freeze; -} JSON_ParserConfig; - -typedef struct JSON_ParserStateStruct { - VALUE *stack_handle; - VALUE *frame_stack_handle; - const char *start; - const char *cursor; - const char *end; - rvalue_stack *stack; - json_frame_stack *frames; - rvalue_cache name_cache; - int in_array; - int current_nesting; - unsigned int emitted_deprecations; -} JSON_ParserState; - static inline size_t rest(JSON_ParserState *state) { return state->end - state->cursor; } @@ -1432,8 +1433,10 @@ static inline void json_value_completed(json_frame *frame) // empty mid-parse). static void json_parse_begin(JSON_ParserState *state) { - json_frame root_frame = { .type = JSON_FRAME_ROOT, .phase = JSON_PHASE_VALUE }; - json_frame_stack_push(state->frames, root_frame, state->frame_stack_handle, &state->frames); + json_frame_stack_push(state, (json_frame){ + .type = JSON_FRAME_ROOT, + .phase = JSON_PHASE_VALUE, + }); } // Parse an arbitrary JSON value iteratively. This is a state machine driven @@ -1549,8 +1552,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->in_array++; // Phase stays VALUE: the next iteration reads the first element. 
- json_frame array_frame = { .type = JSON_FRAME_ARRAY, .phase = JSON_PHASE_VALUE, .stack_head = stack_head, .start_cursor = NULL }; - frame = json_frame_stack_push(state->frames, array_frame, state->frame_stack_handle, &state->frames); + frame = json_frame_stack_push(state, (json_frame){ + .type = JSON_FRAME_ARRAY, + .phase = JSON_PHASE_VALUE, + .stack_head = stack_head, + .start_cursor = NULL, + }); goto JSON_PHASE_VALUE; break; } @@ -1573,9 +1580,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); } - json_frame object_frame = { .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, .stack_head = stack_head, .start_cursor = object_start_cursor }; - frame = json_frame_stack_push(state->frames, object_frame, state->frame_stack_handle, &state->frames); // Phase KEY: the next iteration reads the first key. + frame = json_frame_stack_push(state, (json_frame){ + .type = JSON_FRAME_OBJECT, + .phase = JSON_PHASE_OBJECT_KEY, + .stack_head = stack_head, + .start_cursor = object_start_cursor, + }); goto JSON_PHASE_OBJECT_KEY; break; } From fd2f8489c0e2bdbadd10e5b172364ce69a4087f4 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 3 Jun 2026 09:34:45 +0200 Subject: [PATCH 6/9] json_frame_stack: use ruby_sized_xfree --- ext/json/ext/parser/parser.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 17436fe7..f5063071 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -404,7 +404,7 @@ static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *h if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = json_frame_stack_spill(stack, handle, stack_ref); } else { - REALLOC_N(stack->ptr, json_frame, required); + JSON_SIZED_REALLOC_N(stack->ptr, json_frame, required, stack->capa); stack->capa = required; } return stack; @@ -434,7 
+434,7 @@ static inline void json_frame_stack_pop(json_frame_stack *stack) static void json_frame_stack_free_buffer(json_frame_stack *stack) { - ruby_xfree(stack->ptr); + JSON_SIZED_FREE_N(stack->ptr, stack->capa); stack->ptr = NULL; } @@ -444,7 +444,7 @@ static void json_frame_stack_free(void *ptr) if (stack) { json_frame_stack_free_buffer(stack); #ifndef HAVE_RUBY_TYPED_EMBEDDABLE - ruby_xfree(stack); + JSON_SIZED_FREE(stack); #endif } } From e1bc6c36220c9c842136f44ce48afa94d17004fa Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 3 Jun 2026 22:27:15 +0200 Subject: [PATCH 7/9] Get rid of json_parse_begin --- ext/json/ext/parser/parser.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index f5063071..099640b4 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1428,17 +1428,6 @@ static inline void json_value_completed(json_frame *frame) } } -// Seed the frame stack with the root frame, establishing the invariant that -// json_parse_any always has a top frame to dispatch on (so the stack is never -// empty mid-parse). -static void json_parse_begin(JSON_ParserState *state) -{ - json_frame_stack_push(state, (json_frame){ - .type = JSON_FRAME_ROOT, - .phase = JSON_PHASE_VALUE, - }); -} - // Parse an arbitrary JSON value iteratively. This is a state machine driven // entirely by the top frame's phase so it can stop at any value boundary and // resume purely from the frame stack. A JSON_FRAME_ROOT frame sits at the @@ -1890,11 +1879,19 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) .capa = RVALUE_STACK_INITIAL_CAPA, }; + // Seed the frame stack with the root frame, establishing the invariant that + // json_parse_any always has a top frame to dispatch on (so the stack is never + // empty mid-parse). 
json_frame frame_stack_buffer[JSON_FRAME_STACK_INITIAL_CAPA]; + frame_stack_buffer[0] = (json_frame){ + .type = JSON_FRAME_ROOT, + .phase = JSON_PHASE_VALUE, + }; json_frame_stack frames = { .type = RVALUE_STACK_STACK_ALLOCATED, .ptr = frame_stack_buffer, .capa = JSON_FRAME_STACK_INITIAL_CAPA, + .head = 1, }; long len; @@ -1915,7 +1912,6 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) }; JSON_ParserState *state = &_state; - json_parse_begin(state); VALUE result = json_parse_any(state, config); // This may be skipped in case of exception, but From b369764fe545456b04fe252269c4fa339c70cdc7 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 3 Jun 2026 22:53:48 +0200 Subject: [PATCH 8/9] Cleanup json_frame_stack_push calls --- ext/json/ext/parser/parser.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 099640b4..00eb2d20 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1525,7 +1525,6 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case '[': { state->cursor++; json_eat_whitespace(state); - long stack_head = state->stack->head; if (peek(state) == ']') { state->cursor++; @@ -1544,8 +1543,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) frame = json_frame_stack_push(state, (json_frame){ .type = JSON_FRAME_ARRAY, .phase = JSON_PHASE_VALUE, - .stack_head = stack_head, - .start_cursor = NULL, + .stack_head = state->stack->head, }); goto JSON_PHASE_VALUE; break; @@ -1555,7 +1553,6 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; json_eat_whitespace(state); - long stack_head = state->stack->head; if (peek(state) == '}') { state->cursor++; @@ -1573,7 +1570,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) frame = json_frame_stack_push(state, (json_frame){ .type = 
JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, - .stack_head = stack_head, + .stack_head = state->stack->head, .start_cursor = object_start_cursor, }); goto JSON_PHASE_OBJECT_KEY; From a46ea82a58cbc3f180d06dd18a49b55f071fa9aa Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 3 Jun 2026 22:57:54 +0200 Subject: [PATCH 9/9] Rename JSON_ParserState.stack into .value_stack With the introduction of the frames stack, the naming became confusing. --- ext/json/ext/parser/parser.c | 46 ++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 00eb2d20..0b0bfd71 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -350,7 +350,7 @@ enum json_frame_phase { typedef struct json_frame_struct { enum json_frame_type type; enum json_frame_phase phase; - long stack_head; // rvalue_stack->head when this container opened + long value_stack_head; // rvalue_stack->head when this container opened const char *start_cursor; // object frames only (the '{'); NULL otherwise } json_frame; @@ -382,12 +382,12 @@ typedef struct JSON_ParserStruct { } JSON_ParserConfig; typedef struct JSON_ParserStateStruct { - VALUE *stack_handle; + VALUE *value_stack_handle; VALUE *frame_stack_handle; const char *start; const char *cursor; const char *end; - rvalue_stack *stack; + rvalue_stack *value_stack; json_frame_stack *frames; rvalue_cache name_cache; int in_array; @@ -1042,8 +1042,8 @@ static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantis static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count) { - VALUE array = rb_ary_new_from_values(count, rvalue_stack_peek(state->stack, count)); - rvalue_stack_pop(state->stack, count); + VALUE array = rb_ary_new_from_values(count, rvalue_stack_peek(state->value_stack, count)); + rvalue_stack_pop(state->value_stack, count); if (config->freeze) { 
RB_OBJ_FREEZE(array); @@ -1097,7 +1097,7 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi { size_t entries_count = count / 2; VALUE object = rb_hash_new_capa(entries_count); - const VALUE *pairs = rvalue_stack_peek(state->stack, count); + const VALUE *pairs = rvalue_stack_peek(state->value_stack, count); rb_hash_bulk_insert(count, pairs, object); if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) { @@ -1118,7 +1118,7 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi } } - rvalue_stack_pop(state->stack, count); + rvalue_stack_pop(state->value_stack, count); if (config->freeze) { RB_OBJ_FREEZE(object); @@ -1132,7 +1132,7 @@ static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig * if (RB_UNLIKELY(config->on_load_proc)) { value = rb_proc_call_with_block(config->on_load_proc, 1, &value, Qnil); } - rvalue_stack_push(state->stack, value, state->stack_handle, &state->stack); + rvalue_stack_push(state->value_stack, value, state->value_stack_handle, &state->value_stack); return value; } @@ -1402,9 +1402,9 @@ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_Par // How many values (array elements, or interleaved object keys+values) have been // pushed onto the rvalue stack since this container opened. Used to size the // bulk decode on close, and to tell the first key/colon from later ones. -static inline long json_frame_entry_count(const json_frame *frame, const rvalue_stack *stack) +static inline long json_frame_entry_count(const json_frame *frame, const rvalue_stack *value_stack) { - return stack->head - frame->stack_head; + return value_stack->head - frame->value_stack_head; } // A complete value now sits on top of the rvalue stack. 
Advance the frame that @@ -1442,7 +1442,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_DONE: { // The root document value is parsed; it is the lone survivor on // the rvalue stack. - return *rvalue_stack_peek(state->stack, 1); + return *rvalue_stack_peek(state->value_stack, 1); } case JSON_PHASE_VALUE: { @@ -1543,7 +1543,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) frame = json_frame_stack_push(state, (json_frame){ .type = JSON_FRAME_ARRAY, .phase = JSON_PHASE_VALUE, - .stack_head = state->stack->head, + .value_stack_head = state->value_stack->head, }); goto JSON_PHASE_VALUE; break; @@ -1570,7 +1570,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) frame = json_frame_stack_push(state, (json_frame){ .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, - .stack_head = state->stack->head, + .value_stack_head = state->value_stack->head, .start_cursor = object_start_cursor, }); goto JSON_PHASE_OBJECT_KEY; @@ -1600,7 +1600,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) // The message differs for the first key vs. a key after a // ',': the first is the only one reached with nothing pushed // for this object yet. - if (json_frame_entry_count(frame, state->stack) == 0) { + if (json_frame_entry_count(frame, state->value_stack) == 0) { raise_parse_error("expected object key, got %s", state); } else { raise_parse_error("expected object key, got: %s", state); @@ -1622,7 +1622,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } else { // First colon (only the first pair's key is pushed, nothing // else) vs. a later one. 
- if (json_frame_entry_count(frame, state->stack) == 1) { + if (json_frame_entry_count(frame, state->value_stack) == 1) { raise_parse_error("expected ':' after object key", state); } else { raise_parse_error("expected ':' after object key, got: %s", state); @@ -1651,7 +1651,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) goto JSON_PHASE_VALUE; } else if (next_char == ']') { state->cursor++; - long count = json_frame_entry_count(frame, state->stack); + long count = json_frame_entry_count(frame, state->value_stack); state->current_nesting--; state->in_array--; json_frame_stack_pop(state->frames); @@ -1687,7 +1687,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } else if (next_char == '}') { state->cursor++; state->current_nesting--; - size_t count = json_frame_entry_count(frame, state->stack); + size_t count = json_frame_entry_count(frame, state->value_stack); // Temporary rewind cursor in case an error is raised const char *final_cursor = state->cursor; @@ -1870,7 +1870,7 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) } VALUE rvalue_stack_buffer[RVALUE_STACK_INITIAL_CAPA]; - rvalue_stack stack = { + rvalue_stack value_stack = { .type = RVALUE_STACK_STACK_ALLOCATED, .ptr = rvalue_stack_buffer, .capa = RVALUE_STACK_INITIAL_CAPA, @@ -1896,14 +1896,14 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) RSTRING_GETMEM(Vsource, start, len); - VALUE stack_handle = 0; + VALUE value_stack_handle = 0; VALUE frame_stack_handle = 0; JSON_ParserState _state = { .start = start, .cursor = start, .end = start + len, - .stack = &stack, - .stack_handle = &stack_handle, + .value_stack = &value_stack, + .value_stack_handle = &value_stack_handle, .frames = &frames, .frame_stack_handle = &frame_stack_handle, }; @@ -1913,9 +1913,9 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) // This may be skipped in case of exception, but // it won't cause a leak. 
- rvalue_stack_eagerly_release(stack_handle); + rvalue_stack_eagerly_release(value_stack_handle); json_frame_stack_eagerly_release(frame_stack_handle); - RB_GC_GUARD(stack_handle); + RB_GC_GUARD(value_stack_handle); RB_GC_GUARD(frame_stack_handle); RB_GC_GUARD(Vsource); json_ensure_eof(state);