From 2739ed929e61b61e9348725e046c88720624b274 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 15:09:34 +0000 Subject: [PATCH 1/6] Run the RBS parser on JRuby via WebAssembly JRuby cannot load the MRI C extension, so on JRuby RBS now runs the parser inside WebAssembly (Chicory, a pure-Java runtime) and rebuilds the AST in pure Ruby. `lib/rbs.rb` branches on RUBY_ENGINE. WebAssembly ABI (wasm/rbs_wasm.c): - rbs_wasm_parse_signature / _parse_type / _parse_method_type parse a character range of a source buffer and leave the serialized AST (or, on a parse error, an error blob) in linear memory for the host to read via rbs_wasm_result_ptr / _len. Ruby side (lib/rbs/wasm, loaded only on JRuby): - Runtime loads rbs_parser.wasm into Chicory, wires up WASI, and drives the parse functions. - Parser implements RBS::Parser._parse_signature/_parse_type/ _parse_method_type on top of the runtime and RBS::WASM::Deserializer, raising RBS::ParsingError on failure just like the C extension. _lex, _parse_type_params and the inline annotation entries are not supported yet. - Location is a pure-Ruby implementation of the primitives behind RBS::Location (the C extension's legacy_location.c), so rbs/location_aux.rb works unchanged. Packaging and CI: - `rake wasm:jruby_setup` assembles lib/rbs/wasm/ (the .wasm plus the Chicory jars from Maven Central); the gemspec ships them in the `java` platform gem and skips the C extension there. - A JRuby CI job parses the whole bundled corpus and runs test/rbs/wasm/jruby_parser_test.rb. Verified that JRuby and CRuby produce byte-identical ASTs across the entire bundled corpus (core + stdlib + sig). 
https://claude.ai/code/session_01LTveMt3NLbYHEboXuzAKpA --- .github/workflows/jruby.yml | 63 ++++++++ .gitignore | 4 + Rakefile | 36 ++++- Steepfile | 7 + lib/rbs.rb | 9 +- lib/rbs/wasm/location.rb | 61 ++++++++ lib/rbs/wasm/parser.rb | 81 ++++++++++ lib/rbs/wasm/runtime.rb | 150 +++++++++++++++++++ rbs.gemspec | 13 +- test/rbs/wasm/jruby_parser_test.rb | 71 +++++++++ wasm/README.md | 59 ++++---- wasm/rbs_wasm.c | 233 ++++++++++++++++++++++++----- 12 files changed, 722 insertions(+), 65 deletions(-) create mode 100644 .github/workflows/jruby.yml create mode 100644 lib/rbs/wasm/location.rb create mode 100644 lib/rbs/wasm/parser.rb create mode 100644 lib/rbs/wasm/runtime.rb create mode 100644 test/rbs/wasm/jruby_parser_test.rb diff --git a/.github/workflows/jruby.yml b/.github/workflows/jruby.yml new file mode 100644 index 000000000..619de747a --- /dev/null +++ b/.github/workflows/jruby.yml @@ -0,0 +1,63 @@ +name: JRuby + +on: + push: + branches: + - master + pull_request: + paths: + - ".github/workflows/jruby.yml" + - "include/**" + - "src/**" + - "wasm/**" + - "lib/rbs/wasm/**" + - "lib/rbs.rb" + - "Rakefile" + +permissions: + contents: read + +env: + # Keep in sync with .github/workflows/wasm.yml. + WASI_SDK_VERSION: "33" + WASI_SDK_RELEASE: "33.0" + +jobs: + test: + name: jruby + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - run: git fetch --depth=1 origin +refs/tags/*:refs/tags/* + + # Build the .wasm and fetch the Chicory jars with CRuby + the WASI SDK, + # then run RBS itself on JRuby against those artifacts. 
+ - name: Set up Ruby (to assemble the WebAssembly runtime) + uses: ruby/setup-ruby@v1 + with: + ruby-version: ruby + bundler: none + - name: Update rubygems & bundler + run: gem update --system + - name: Install gems + run: | + bundle config set --local without libs:profilers + bundle install --jobs 4 --retry 3 + - name: Install the WASI SDK + run: | + url="https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-${WASI_SDK_VERSION}/wasi-sdk-${WASI_SDK_RELEASE}-x86_64-linux.tar.gz" + mkdir -p "$HOME/wasi-sdk" + curl -sSL "$url" | tar xz --strip-components=1 -C "$HOME/wasi-sdk" + echo "WASI_SDK_PATH=$HOME/wasi-sdk" >> "$GITHUB_ENV" + - name: Assemble the JRuby runtime (rbs_parser.wasm + Chicory jars) + run: bundle exec rake wasm:jruby_setup + + - name: Set up JRuby + uses: ruby/setup-ruby@v1 + with: + ruby-version: jruby + bundler: none + - name: Install runtime and test gems + run: gem install prism test-unit --no-document + - name: Run RBS's parser on JRuby + run: jruby -Ilib -Itest test/rbs/wasm/jruby_parser_test.rb diff --git a/.gitignore b/.gitignore index e7e523cd4..93c5b4a5c 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ rust/ruby-rbs/vendor/rbs/ # Compiled WebAssembly module (built by rake wasm:build) wasm/*.wasm + +# JRuby runtime artifacts (assembled by rake wasm:jruby_setup, bundled in the JRuby gem) +lib/rbs/wasm/*.wasm +lib/rbs/wasm/jars/ diff --git a/Rakefile b/Rakefile index 9cf0e66cc..9eb6cd592 100644 --- a/Rakefile +++ b/Rakefile @@ -609,16 +609,46 @@ namespace :wasm do task :check => :build do wasmtime = ENV["WASMTIME"] || "wasmtime" - # `rbs_wasm_selftest` parses a small fixed signature and returns 0 on + # `rbs_wasm_selftest` parses a small fixed signature and returns 1 on # success. `--invoke` prints the return value to stdout. output = IO.popen([wasmtime, "run", "--invoke", "rbs_wasm_selftest", WASM_OUTPUT], err: File::NULL, &:read).to_s.strip - if output == "0" + if output == "1" puts "WebAssembly selftest passed." 
else - raise "WebAssembly selftest failed: rbs_wasm_selftest returned #{output.inspect} (expected \"0\")" + raise "WebAssembly selftest failed: rbs_wasm_selftest returned #{output.inspect} (expected \"1\")" end end + + # Where the runtime looks for the module and jars by default (see + # RBS::WASM::Runtime). These are build artifacts, bundled into the JRuby gem. + JRUBY_WASM_DIR = File.expand_path("lib/rbs/wasm", __dir__) + CHICORY_VERSION = ENV.fetch("CHICORY_VERSION", "1.7.5") + CHICORY_JARS = %w[wasm runtime log wasi].freeze + + desc "Download the Chicory jars the JRuby runtime needs into lib/rbs/wasm/jars" + task :vendor_jars do + require "open-uri" + require "fileutils" + + jars_dir = File.join(JRUBY_WASM_DIR, "jars") + FileUtils.mkdir_p(jars_dir) + + CHICORY_JARS.each do |name| + url = "https://repo1.maven.org/maven2/com/dylibso/chicory/#{name}/#{CHICORY_VERSION}/#{name}-#{CHICORY_VERSION}.jar" + dest = File.join(jars_dir, "#{name}.jar") + puts "Downloading #{url}" + URI.open(url) { |io| File.binwrite(dest, io.read) } # steep:ignore + end + + puts "Vendored Chicory #{CHICORY_VERSION} into #{jars_dir}" + end + + desc "Assemble everything the JRuby gem needs: the .wasm and the Chicory jars" + task :jruby_setup => [:build, :vendor_jars] do + cp WASM_OUTPUT, File.join(JRUBY_WASM_DIR, "rbs_parser.wasm") + puts "JRuby runtime is ready under #{JRUBY_WASM_DIR}" + end end namespace :rust do diff --git a/Steepfile b/Steepfile index b00e4542b..d1bc69f40 100644 --- a/Steepfile +++ b/Steepfile @@ -6,6 +6,13 @@ target :lib do ignore( "lib/rbs/test", # "lib/rbs/test.rb" + + # JRuby-only implementations of RBS::Location and RBS::Parser. Like the C + # extension, these implement interfaces already described in sig/, and + # runtime.rb is Java interop, so they are not type-checked here. 
+ "lib/rbs/wasm/location.rb", + "lib/rbs/wasm/runtime.rb", + "lib/rbs/wasm/parser.rb", ) library "pathname", "json", "logger", "monitor", "tsort", "uri", 'dbm', 'pstore', 'singleton', 'shellwords', 'fileutils', 'find', 'digest', 'prettyprint', 'yaml', "psych", "securerandom" diff --git a/lib/rbs.rb b/lib/rbs.rb index bbc8c8382..e59fd95e4 100644 --- a/lib/rbs.rb +++ b/lib/rbs.rb @@ -69,7 +69,14 @@ require "rbs/type_alias_regularity" require "rbs/collection" -require "rbs_extension" +if RUBY_ENGINE == "jruby" + # JRuby cannot load the MRI C extension. Run the parser in WebAssembly and + # provide pure-Ruby implementations of RBS::Location and RBS::Parser instead. + require "rbs/wasm/location" + require "rbs/wasm/parser" +else + require "rbs_extension" +end require "rbs/parser_aux" require "rbs/location_aux" diff --git a/lib/rbs/wasm/location.rb b/lib/rbs/wasm/location.rb new file mode 100644 index 000000000..6f89d6927 --- /dev/null +++ b/lib/rbs/wasm/location.rb @@ -0,0 +1,61 @@ +# frozen_string_literal: true + +module RBS + # Pure-Ruby implementation of the primitives that back RBS::Location. + # + # On CRuby these come from the C extension (ext/rbs_extension/legacy_location.c). + # JRuby loads this instead, before rbs/location_aux.rb layers the public API on + # top, so RBS::Location behaves identically without the native extension. + class Location + attr_reader :buffer + + def initialize(buffer, start_pos, end_pos) + @buffer = buffer + @start_pos = start_pos + @end_pos = end_pos + @required_children = {} #: Hash[Symbol, [ Integer, Integer ]] + @optional_children = {} #: Hash[Symbol, [ Integer, Integer ]?] 
+ end + + def _start_pos + @start_pos + end + + def _end_pos + @end_pos + end + + def _add_required_child(name, start_pos, end_pos) + @required_children[name] = [start_pos, end_pos] + end + + def _add_optional_child(name, start_pos, end_pos) + @optional_children[name] = [start_pos, end_pos] + end + + def _add_optional_no_child(name) + @optional_children[name] = nil + end + + def _required_keys + @required_children.keys + end + + def _optional_keys + @optional_children.keys + end + + def [](name) + if (range = @required_children[name]) + return Location.new(@buffer, range[0], range[1]) + end + + if @optional_children.key?(name) + range = @optional_children[name] + return range && Location.new(@buffer, range[0], range[1]) + end + + nil + end + end +end diff --git a/lib/rbs/wasm/parser.rb b/lib/rbs/wasm/parser.rb new file mode 100644 index 000000000..b7a238f59 --- /dev/null +++ b/lib/rbs/wasm/parser.rb @@ -0,0 +1,81 @@ +# frozen_string_literal: true + +require_relative "runtime" +require_relative "deserializer" + +module RBS + # WebAssembly-backed implementation of the parser primitives. + # + # On CRuby these come from the C extension (ext/rbs_extension/main.c). JRuby + # loads this instead: it runs the parser inside WebAssembly, then rebuilds the + # AST with RBS::WASM::Deserializer. rbs/parser_aux.rb layers the public + # RBS::Parser API on top, exactly as it does for the C extension. 
+ class Parser + class << self + def _parse_signature(buffer, start_pos, end_pos) + success, bytes = WASM::Runtime.instance.parse_signature(buffer.content, start_pos, end_pos) + raise_parsing_error(buffer, bytes) unless success + + WASM::Deserializer.deserialize(bytes, buffer) + end + + def _parse_type(buffer, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) + success, bytes = WASM::Runtime.instance.parse_type(buffer.content, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) + raise_parsing_error(buffer, bytes) unless success + + deserialize_or_nil(bytes, buffer) + end + + def _parse_method_type(buffer, start_pos, end_pos, variables, require_eof) + success, bytes = WASM::Runtime.instance.parse_method_type(buffer.content, start_pos, end_pos, variables, require_eof) + raise_parsing_error(buffer, bytes) unless success + + deserialize_or_nil(bytes, buffer) + end + + def _parse_type_params(buffer, start_pos, end_pos, module_type_params) + raise NotImplementedError, "RBS::Parser._parse_type_params is not yet supported on #{RUBY_ENGINE}" + end + + def _lex(buffer, end_pos) + raise NotImplementedError, "RBS::Parser._lex is not yet supported on #{RUBY_ENGINE}" + end + + def _parse_inline_leading_annotation(buffer, start_pos, end_pos, variables) + raise NotImplementedError, "RBS::Parser._parse_inline_leading_annotation is not yet supported on #{RUBY_ENGINE}" + end + + def _parse_inline_trailing_annotation(buffer, start_pos, end_pos, variables) + raise NotImplementedError, "RBS::Parser._parse_inline_trailing_annotation is not yet supported on #{RUBY_ENGINE}" + end + + private + + # An empty result means the parser reached EOF immediately (`nil`). + def deserialize_or_nil(bytes, buffer) + bytes.empty? ? 
nil : WASM::Deserializer.deserialize(bytes, buffer) + end + + # Decodes the error blob written by set_error_result (rbs_wasm.c) and raises + # the same error the C extension would (see raise_error in main.c). + def raise_parsing_error(buffer, blob) + start_char, end_char, syntax_error = blob.unpack("l, 0, "expected to find bundled RBS files" + + paths.each do |path| + source = File.read(path, encoding: "UTF-8") + _buffer, _directives, declarations = RBS::Parser.parse_signature(source) + assert_not_nil declarations, "failed to parse #{path}" + end + end + + def test_parse_signature_structure + _buffer, _directives, declarations = RBS::Parser.parse_signature(<<~RBS) + class Foo < Bar + attr_reader name: String + def greet: (String name) -> String + end + RBS + + decl = declarations[0] + assert_instance_of RBS::AST::Declarations::Class, decl + assert_equal "Foo", decl.name.to_s + assert_equal "Bar", decl.super_class&.name&.to_s + assert_equal [:name, :greet], decl.members.map { |member| member.respond_to?(:name) ? 
member.name : nil } + assert_equal 1, decl.location.start_line + end + + def test_parse_type + assert_equal "Hash[Symbol, Array[Integer]]", RBS::Parser.parse_type("Hash[Symbol, Array[Integer]]").to_s + assert_equal "^(Integer, ?String) { () -> void } -> bool", RBS::Parser.parse_type("^(Integer, ?String) { () -> void } -> bool").to_s + assert_equal "A | B", RBS::Parser.parse_type("A | B", variables: [:A, :B]).to_s + end + + def test_parse_method_type + assert_equal "[T] (T, ?Integer) { (T) -> void } -> T", RBS::Parser.parse_method_type("[T] (T, ?Integer) { (T) -> void } -> T").to_s + end + + def test_parse_error_raises_parsing_error + error = assert_raises(RBS::ParsingError) do + RBS::Parser.parse_signature("class 123 Broken end") + end + assert_not_nil error.location + assert_equal "tINTEGER", error.token_type + end + end + end +end diff --git a/wasm/README.md b/wasm/README.md index 1fd81d452..e2df4fb72 100644 --- a/wasm/README.md +++ b/wasm/README.md @@ -5,10 +5,12 @@ on the Ruby C API, so it can be compiled to WebAssembly as-is. This directory holds the small entry-point shim ([`rbs_wasm.c`](rbs_wasm.c)) that exposes a stable ABI to a WebAssembly host. -The motivating use case is running RBS on Ruby implementations that cannot load -the MRI C extension (notably JRuby): the host loads `rbs_parser.wasm`, runs the -parser over a source buffer, and reads the result back out — no native build per -platform required. +This is how RBS runs on Ruby implementations that cannot load the MRI C +extension (notably JRuby): the host loads `rbs_parser.wasm`, runs the parser over +a source buffer, and reads the serialized AST back out. The Ruby side then +rebuilds `RBS::AST` objects with `RBS::WASM::Deserializer` — no native build per +platform required. See [`lib/rbs/wasm`](../lib/rbs/wasm) and +[`docs/wasm_serialization.md`](../docs/wasm_serialization.md). 
## Building @@ -17,16 +19,9 @@ The build needs the [WASI SDK](https://github.com/WebAssembly/wasi-sdk/releases) ```console $ export WASI_SDK_PATH=/path/to/wasi-sdk -$ rake wasm:build -Built .../wasm/rbs_parser.wasm -``` - -To also run the smoke test you need [wasmtime](https://wasmtime.dev/) (or another -WASI runtime, via the `WASMTIME` environment variable): - -```console -$ rake wasm:check -WebAssembly selftest passed. +$ rake wasm:build # compile rbs_parser.wasm +$ rake wasm:check # also smoke-test it (needs wasmtime) +$ rake wasm:jruby_setup # assemble lib/rbs/wasm/ for JRuby (wasm + Chicory jars) ``` The compiled `rbs_parser.wasm` is a build artifact and is not checked in. @@ -36,14 +31,28 @@ The compiled `rbs_parser.wasm` is a build artifact and is not checked in. The module is built as a "reactor": it has no `main`, and the host calls `_initialize` once before invoking any export. -| Export | Signature | Description | -| -------------------------- | --------------------- | ------------------------------------------------------------------------ | -| `rbs_wasm_alloc` | `(i32) -> i32` | Allocate N bytes in linear memory and return the offset. | -| `rbs_wasm_free` | `(i32) -> ()` | Free a region returned by `rbs_wasm_alloc`. | -| `rbs_wasm_parse_signature` | `(i32 ptr, i32 len) -> i32` | Parse the UTF-8 source at `ptr`/`len`. Returns 0 on success, 1 on error. | -| `rbs_wasm_selftest` | `() -> i32` | Parse a small fixed signature. Returns 0 on success, 1 otherwise. | - -This is the foundation step: it proves the parser builds and runs under -WebAssembly. Subsequent steps add a compact serialization of the parsed AST so -the host can reconstruct `RBS::AST` objects, and wire the module into RBS on -JRuby through a JVM WebAssembly runtime. +Memory management and results: + +| Export | Signature | Description | +| --- | --- | --- | +| `rbs_wasm_alloc` | `(i32) -> i32` | Allocate N bytes in linear memory, return the offset. 
| +| `rbs_wasm_free` | `(i32) -> ()` | Free a region from `rbs_wasm_alloc`. | +| `rbs_wasm_result_ptr` | `() -> i32` | Offset of the most recent result. | +| `rbs_wasm_result_len` | `() -> i32` | Length of the most recent result. | + +Parsing — each takes the whole buffer (`ptr`/`len`) plus the character range to +parse (`start`/`end`), and returns `1` on success or `0` on a parse error. On +success the result is the serialized AST; on error it is an error blob (start/end +positions, syntax flag, token type, message). Type/method-type parsing also takes +a buffer of newline-separated type-variable names (`vars`/`vars_len`, with +`vars_len < 0` meaning "none"): + +| Export | Signature | +| --- | --- | +| `rbs_wasm_parse_signature` | `(ptr, len, start, end) -> i32` | +| `rbs_wasm_parse_type` | `(ptr, len, start, end, vars, vars_len, require_eof, void_allowed, self_allowed, classish_allowed) -> i32` | +| `rbs_wasm_parse_method_type` | `(ptr, len, start, end, vars, vars_len, require_eof) -> i32` | +| `rbs_wasm_selftest` | `() -> i32` (parses a fixed sample; `1` on success) | + +For type and method-type parsing, a successful result of length 0 means the input +was empty (`nil`). diff --git a/wasm/rbs_wasm.c b/wasm/rbs_wasm.c index 41a35051f..f6ca1b75c 100644 --- a/wasm/rbs_wasm.c +++ b/wasm/rbs_wasm.c @@ -3,34 +3,49 @@ * * WebAssembly entry points for the RBS parser. * - * The RBS parser in `src/` is plain, self-contained C with no dependency on - * the Ruby C API. This file exposes a small, stable ABI so that the parser can - * be driven from a WebAssembly host (for example, a JVM-based runtime running - * under JRuby). + * The parser in `src/` is plain, self-contained C with no dependency on the + * Ruby C API, so it compiles to WebAssembly as-is. This file exposes a small, + * stable ABI so the parser can be driven from a WebAssembly host (a JVM-based + * runtime running under JRuby). 
* - * This module is built as a "reactor" (`-mexec-model=reactor`): it has no - * `main`, and the host is expected to call `_initialize` once before invoking - * any of the exported functions below. + * The flow is: the host writes a UTF-8 source string into linear memory + * (`rbs_wasm_alloc`), calls one of the `rbs_wasm_parse_*` functions, and reads + * the result back out (`rbs_wasm_result_ptr` / `rbs_wasm_result_len`). On + * success the result is the serialized AST (see `rbs_serialize_node` and + * `docs/wasm_serialization.md`); on a parse error it is an error blob (see + * `set_error_result`). `RBS::WASM` on the Ruby side decodes both. * - * For now this only proves the toolchain end to end: it can allocate memory in - * the linear address space, run the parser over a source buffer, and report - * whether parsing succeeded. Serializing the resulting AST back to the host is - * handled in a later step. + * Built as a "reactor": no `main`, and the host calls `_initialize` once before + * invoking any export. */ +#include #include #include -#include #include "rbs/parser.h" +#include "rbs/serialize.h" #include "rbs/string.h" #include "rbs/util/rbs_encoding.h" +// The result of the most recent parse, living in linear memory until the next +// call replaces it. WebAssembly is little-endian, so the multi-byte integers +// written below match the little-endian format the Ruby decoder expects. +static char *result_buffer = NULL; +static int32_t result_length = 0; + +// Replace the current result with a fresh `length`-byte buffer and return a +// pointer to it for the caller to fill in. +static char *allocate_result(size_t length) { + free(result_buffer); + result_buffer = (char *) malloc(length == 0 ? 1 : length); + result_length = (int32_t) length; + return result_buffer; +} + /** - * Allocate `size` bytes in the module's linear memory and return the offset. 
- * - * The host uses this to reserve a region it can write an input string into - * before calling one of the parse entry points. + * Allocate `size` bytes in linear memory and return the offset. The host uses + * this to reserve space for an input string before calling a parse function. */ __attribute__((export_name("rbs_wasm_alloc"))) void *rbs_wasm_alloc(size_t size) { return malloc(size); @@ -43,47 +58,195 @@ __attribute__((export_name("rbs_wasm_free"))) void rbs_wasm_free(void *ptr) { free(ptr); } +/** + * Offset of the most recent parse result in linear memory. + */ +__attribute__((export_name("rbs_wasm_result_ptr"))) +int32_t +rbs_wasm_result_ptr(void) { + return (int32_t) (intptr_t) result_buffer; +} + +/** + * Length, in bytes, of the most recent parse result. + */ +__attribute__((export_name("rbs_wasm_result_len"))) +int32_t +rbs_wasm_result_len(void) { + return result_length; +} + +// Encode the parser's error into the result buffer: +// +// [i32 start_char][i32 end_char][u8 syntax_error] +// [u32 token_type_len][token_type bytes][u32 message_len][message bytes] +// +// Always returns 0, the failure status for the parse functions. +static int set_error_result(rbs_parser_t *parser) { + rbs_error_t *error = parser->error; + const char *token_type = rbs_token_type_str(error->token.type); + const char *message = error->message; + uint32_t token_type_len = (uint32_t) strlen(token_type); + uint32_t message_len = (uint32_t) strlen(message); + + int32_t start_char = error->token.range.start.char_pos; + int32_t end_char = error->token.range.end.char_pos; + uint8_t syntax_error = error->syntax_error ? 
1 : 0; + + size_t total = 4 + 4 + 1 + 4 + token_type_len + 4 + message_len; + char *p = allocate_result(total); + + memcpy(p, &start_char, 4); + p += 4; + memcpy(p, &end_char, 4); + p += 4; + *p++ = (char) syntax_error; + memcpy(p, &token_type_len, 4); + p += 4; + memcpy(p, token_type, token_type_len); + p += token_type_len; + memcpy(p, &message_len, 4); + p += 4; + memcpy(p, message, message_len); + + return 0; +} + +static int set_serialized_result(rbs_parser_t *parser, rbs_node_t *node) { + rbs_string_t bytes = rbs_serialize_node(parser->allocator, &parser->constant_pool, node); + size_t length = rbs_string_len(bytes); + memcpy(allocate_result(length), bytes.start, length); + return 1; +} + +// Declare type variables from a buffer of newline-separated names. A negative +// length means "no variables given" (the parser keeps its default table). +static void declare_variables(rbs_parser_t *parser, const char *variables, int variables_length) { + if (variables_length < 0) return; + + rbs_parser_push_typevar_table(parser, true); + + const char *cursor = variables; + const char *end = variables + variables_length; + const char *name_start = cursor; + + while (cursor <= end) { + if (cursor == end || *cursor == '\n') { + size_t name_length = (size_t) (cursor - name_start); + if (name_length > 0) { + uint8_t *copied = (uint8_t *) malloc(name_length); + memcpy(copied, name_start, name_length); + rbs_constant_id_t id = rbs_constant_pool_insert_owned(&parser->constant_pool, copied, name_length); + (void) rbs_parser_insert_typevar(parser, id); + } + name_start = cursor + 1; + } + cursor++; + } +} + /** * Parse an RBS signature from a UTF-8 source buffer. * - * @param source Offset of the source buffer in linear memory. - * @param length Length of the source buffer, in bytes. - * @return 0 if parsing succeeded, 1 if a parse error occurred. 
+ * `source`/`length` is the whole buffer content; `start_pos`/`end_pos` are the + * character range within it to parse, so reported locations are absolute (this + * mirrors RBS::Parser._parse_signature). + * + * @return 1 on success (result is the serialized AST), 0 on a parse error + * (result is an error blob). */ -__attribute__((export_name("rbs_wasm_parse_signature"))) int rbs_wasm_parse_signature(const char *source, int length) { +__attribute__((export_name("rbs_wasm_parse_signature"))) int rbs_wasm_parse_signature(const char *source, int length, int start_pos, int end_pos) { rbs_string_t string = rbs_string_new(source, source + length); - const rbs_encoding_t *encoding = RBS_ENCODING_UTF_8_ENTRY; - rbs_parser_t *parser = rbs_parser_new(string, encoding, 0, length); + rbs_parser_t *parser = rbs_parser_new(string, RBS_ENCODING_UTF_8_ENTRY, start_pos, end_pos); rbs_signature_t *signature = NULL; - bool ok = rbs_parse_signature(parser, &signature); + rbs_parse_signature(parser, &signature); - int result = (ok && parser->error == NULL) ? 0 : 1; + int status; + if (parser->error == NULL) { + status = set_serialized_result(parser, (rbs_node_t *) signature); + } else { + status = set_error_result(parser); + } rbs_parser_free(parser); + return status; +} + +/** + * Parse a single RBS type. + * + * @param variables Newline-separated type variable names (length < 0 for none). + * @return 1 on success, 0 on a parse error. On success with an empty result + * (`rbs_wasm_result_len` == 0), the input was empty (`nil`). 
+ */ +__attribute__((export_name("rbs_wasm_parse_type"))) int rbs_wasm_parse_type(const char *source, int length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof, int void_allowed, int self_allowed, int classish_allowed) { + rbs_string_t string = rbs_string_new(source, source + length); + rbs_parser_t *parser = rbs_parser_new(string, RBS_ENCODING_UTF_8_ENTRY, start_pos, end_pos); + declare_variables(parser, variables, variables_length); + + int status; + if (parser->next_token.type == pEOF) { + allocate_result(0); + status = 1; + } else { + rbs_node_t *type = NULL; + rbs_parse_type(parser, &type, void_allowed != 0, self_allowed != 0, classish_allowed != 0); + + if (parser->error == NULL && require_eof) { + rbs_parser_advance(parser); + if (parser->current_token.type != pEOF) { + rbs_parser_set_error(parser, parser->current_token, true, "expected a token `%s`", rbs_token_type_str(pEOF)); + } + } - return result; + status = parser->error == NULL ? set_serialized_result(parser, type) : set_error_result(parser); + } + + rbs_parser_free(parser); + return status; } /** - * Parse a small, fixed RBS document. + * Parse a single RBS method type. * - * This exercises the whole parser path inside WebAssembly without the host - * having to write anything into linear memory, which makes it convenient as a - * build smoke test (`wasmtime run --invoke rbs_wasm_selftest rbs_parser.wasm`). + * @param variables Newline-separated type variable names (length < 0 for none). + * @return 1 on success, 0 on a parse error. On success with an empty result, + * the input was empty (`nil`). 
+ */ +__attribute__((export_name("rbs_wasm_parse_method_type"))) int rbs_wasm_parse_method_type(const char *source, int length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof) { + rbs_string_t string = rbs_string_new(source, source + length); + rbs_parser_t *parser = rbs_parser_new(string, RBS_ENCODING_UTF_8_ENTRY, start_pos, end_pos); + declare_variables(parser, variables, variables_length); + + int status; + if (parser->next_token.type == pEOF) { + allocate_result(0); + status = 1; + } else { + rbs_method_type_t *method_type = NULL; + rbs_parse_method_type(parser, &method_type, require_eof != 0, true); + + status = parser->error == NULL ? set_serialized_result(parser, (rbs_node_t *) method_type) : set_error_result(parser); + } + + rbs_parser_free(parser); + return status; +} + +/** + * Parse a small, fixed RBS document, used as a build smoke test + * (`wasmtime run --invoke rbs_wasm_selftest rbs_parser.wasm`). * - * @return 0 if the sample parsed successfully, 1 otherwise. + * @return 1 if the sample parsed successfully, 0 otherwise. */ __attribute__((export_name("rbs_wasm_selftest"))) int rbs_wasm_selftest(void) { static const char source[] = "class User\n" " attr_reader name: String\n" " def initialize: (String name) -> void\n" - "end\n" - "\n" - "module Authentication\n" - " def authenticate: (String, String) -> bool\n" "end\n"; - return rbs_wasm_parse_signature(source, (int) (sizeof(source) - 1)); + int length = (int) (sizeof(source) - 1); + return rbs_wasm_parse_signature(source, length, 0, length); } From 0f62191550a30553f151bec857768e32591cf0d2 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 15:51:05 +0000 Subject: [PATCH 2/6] Skip JRuby-incompatible tests like TruffleRuby Add `omit_on_jruby!` (class- and instance-level), mirroring `omit_on_truffle_ruby!`, for tests that depend on the C extension or on parser features not yet wired through the WebAssembly bridge. 
- parser_test: omit `test__lex` and `test_parse_type_params` on JRuby (those primitives raise NotImplementedError there for now). - serialization_test: omit the class on JRuby; its round-trip is driven by the C extension's `_parse_*_to_bytes`. - jruby_parser_test: qualify Test::Unit as ::Test::Unit so the file also loads under the full suite, where RBS::Test would otherwise shadow it. https://claude.ai/code/session_01LTveMt3NLbYHEboXuzAKpA --- test/rbs/parser_test.rb | 4 ++++ test/rbs/wasm/jruby_parser_test.rb | 3 ++- test/rbs/wasm/serialization_test.rb | 4 ++++ test/test_helper.rb | 20 ++++++++++++++++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/test/rbs/parser_test.rb b/test/rbs/parser_test.rb index 9977f7f07..a745a266f 100644 --- a/test/rbs/parser_test.rb +++ b/test/rbs/parser_test.rb @@ -857,6 +857,8 @@ def test_proc__untyped_function end def test_parse_type_params + omit_on_jruby! "RBS::Parser.parse_type_params is not yet wired through the WebAssembly parser" + RBS::Parser.parse_type_params(buffer("[T]")).tap do |params| assert_equal 1, params.size assert_equal :T, params[0].name @@ -1000,6 +1002,8 @@ def test_parse_type_params end def test__lex + omit_on_jruby! "RBS::Parser._lex is not yet wired through the WebAssembly parser" + content = <<~RBS # LineComment class Foo[T < Integer] < Bar # Comment diff --git a/test/rbs/wasm/jruby_parser_test.rb b/test/rbs/wasm/jruby_parser_test.rb index cf994b250..ae72daa08 100644 --- a/test/rbs/wasm/jruby_parser_test.rb +++ b/test/rbs/wasm/jruby_parser_test.rb @@ -15,7 +15,8 @@ module WASM # # Nested modules (rather than `class RBS::WASM::JRubyParserTest`) so the file # also loads on CRuby, where RBS::WASM is otherwise absent and the test omits. - class JRubyParserTest < Test::Unit::TestCase + # Test::Unit is fully qualified because RBS::Test exists and would shadow it. 
+ class JRubyParserTest < ::Test::Unit::TestCase ROOT = File.expand_path("../../..", __dir__) def setup diff --git a/test/rbs/wasm/serialization_test.rb b/test/rbs/wasm/serialization_test.rb index ac2ab10ae..ccd4b99a0 100644 --- a/test/rbs/wasm/serialization_test.rb +++ b/test/rbs/wasm/serialization_test.rb @@ -13,6 +13,10 @@ # is what gives us confidence that the same format, produced inside WebAssembly, # will rebuild correct objects on JRuby. class RBS::WASM::SerializationTest < Test::Unit::TestCase + # The round-trip is driven by the C extension's `_parse_*_to_bytes`, which only + # exists on CRuby. JRuby's end-to-end coverage lives in jruby_parser_test.rb. + omit_on_jruby! "Uses the C extension's _parse_*_to_bytes helpers" + ROOT = File.expand_path("../../..", __dir__) def buffer(source) diff --git a/test/test_helper.rb b/test/test_helper.rb index 9749d615e..56bf68e6b 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -57,6 +57,26 @@ def self.omit_on_truffle_ruby!(reason = "Not supported on TruffleRuby") def omit_on_truffle_ruby!(reason = "Not supported on TruffleRuby") omit(reason) if RUBY_ENGINE == "truffleruby" end + + # Omit *all* test cases defined in this class when running on JRuby. + # + # On JRuby the parser runs in WebAssembly (see lib/rbs/wasm). Use this at the + # class body level for features that depend on the C extension or on APIs JRuby + # does not implement. + def self.omit_on_jruby!(reason = "Not supported on JRuby") + return unless RUBY_ENGINE == "jruby" + + setup { omit(reason) } + end + + # Omit the running test case when running on JRuby. + # + # Use it inside a test method when only a few cases of an otherwise supported + # class fail on JRuby (e.g. those exercising parser features not yet wired + # through the WebAssembly bridge, such as `lex` or `parse_type_params`). 
+ def omit_on_jruby!(reason = "Not supported on JRuby") + omit(reason) if RUBY_ENGINE == "jruby" + end end module TestHelper From 78d47af2d3db886bd5d780997a9467770bd66225 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 20:59:07 +0000 Subject: [PATCH 3/6] Honor the source encoding in the WebAssembly parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The WASM parser was hardcoded to UTF-8. Pass the buffer's Ruby encoding name through the ABI and resolve it with rbs_encoding_find (falling back to UTF-8), so non-UTF-8 sources (EUC-JP, Windows-31J, ...) lex correctly — matching the C extension, which uses the buffer's encoding. Verified that an EUC-JP signature produces byte-identical locations and a correctly-encoded comment string on JRuby and CRuby, and that the UTF-8 corpus digest is unchanged. https://claude.ai/code/session_01LTveMt3NLbYHEboXuzAKpA --- lib/rbs/wasm/parser.rb | 9 ++++++--- lib/rbs/wasm/runtime.rb | 33 +++++++++++++++++++-------------- wasm/README.md | 19 ++++++++++--------- wasm/rbs_wasm.c | 33 ++++++++++++++++++++++----------- 4 files changed, 57 insertions(+), 37 deletions(-) diff --git a/lib/rbs/wasm/parser.rb b/lib/rbs/wasm/parser.rb index b7a238f59..d436920c9 100644 --- a/lib/rbs/wasm/parser.rb +++ b/lib/rbs/wasm/parser.rb @@ -13,21 +13,24 @@ module RBS class Parser class << self def _parse_signature(buffer, start_pos, end_pos) - success, bytes = WASM::Runtime.instance.parse_signature(buffer.content, start_pos, end_pos) + encoding = buffer.content.encoding.name + success, bytes = WASM::Runtime.instance.parse_signature(buffer.content, encoding, start_pos, end_pos) raise_parsing_error(buffer, bytes) unless success WASM::Deserializer.deserialize(bytes, buffer) end def _parse_type(buffer, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) - success, bytes = WASM::Runtime.instance.parse_type(buffer.content, start_pos, end_pos, variables, require_eof, 
void_allowed, self_allowed, classish_allowed) + encoding = buffer.content.encoding.name + success, bytes = WASM::Runtime.instance.parse_type(buffer.content, encoding, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) raise_parsing_error(buffer, bytes) unless success deserialize_or_nil(bytes, buffer) end def _parse_method_type(buffer, start_pos, end_pos, variables, require_eof) - success, bytes = WASM::Runtime.instance.parse_method_type(buffer.content, start_pos, end_pos, variables, require_eof) + encoding = buffer.content.encoding.name + success, bytes = WASM::Runtime.instance.parse_method_type(buffer.content, encoding, start_pos, end_pos, variables, require_eof) raise_parsing_error(buffer, bytes) unless success deserialize_or_nil(bytes, buffer) diff --git a/lib/rbs/wasm/runtime.rb b/lib/rbs/wasm/runtime.rb index 725be1115..bf11b59ce 100644 --- a/lib/rbs/wasm/runtime.rb +++ b/lib/rbs/wasm/runtime.rb @@ -51,43 +51,48 @@ def initialize # `bytes` is the serialized AST, otherwise it is the error blob (see # set_error_result in rbs_wasm.c). 
- def parse_signature(content, start_pos, end_pos) - run(content) { |ptr, len| @parse_signature.apply(ptr, len, start_pos, end_pos)[0] } + def parse_signature(content, encoding, start_pos, end_pos) + run(content, encoding) { |ptr, len, enc_ptr, enc_len| @parse_signature.apply(ptr, len, enc_ptr, enc_len, start_pos, end_pos)[0] } end - def parse_type(content, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) + def parse_type(content, encoding, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) with_variables(variables) do |vars_ptr, vars_len| - run(content) do |ptr, len| - @parse_type.apply(ptr, len, start_pos, end_pos, vars_ptr, vars_len, bool(require_eof), bool(void_allowed), bool(self_allowed), bool(classish_allowed))[0] + run(content, encoding) do |ptr, len, enc_ptr, enc_len| + @parse_type.apply(ptr, len, enc_ptr, enc_len, start_pos, end_pos, vars_ptr, vars_len, bool(require_eof), bool(void_allowed), bool(self_allowed), bool(classish_allowed))[0] end end end - def parse_method_type(content, start_pos, end_pos, variables, require_eof) + def parse_method_type(content, encoding, start_pos, end_pos, variables, require_eof) with_variables(variables) do |vars_ptr, vars_len| - run(content) do |ptr, len| - @parse_method_type.apply(ptr, len, start_pos, end_pos, vars_ptr, vars_len, bool(require_eof))[0] + run(content, encoding) do |ptr, len, enc_ptr, enc_len| + @parse_method_type.apply(ptr, len, enc_ptr, enc_len, start_pos, end_pos, vars_ptr, vars_len, bool(require_eof))[0] end end end private - # Copies `source` into linear memory, yields its pointer/length to the block - # (which invokes the parser and returns its status), then reads the result - # back out. Serialized through the monitor because the module keeps its - # result in a single shared location. 
- def run(source) + # Copies `source` and its encoding name into linear memory, yields their + # pointers/lengths to the block (which invokes the parser and returns its + # status), then reads the result back out. Serialized through the monitor + # because the module keeps its result in a single shared location. + def run(source, encoding) synchronize do bytes = source.b length = bytes.bytesize + name = encoding.to_s.b + name_length = name.bytesize source_ptr = @alloc.apply(length)[0] + name_ptr = @alloc.apply(name_length)[0] begin @memory.write(source_ptr, bytes.to_java_bytes) - status = yield(source_ptr, length) + @memory.write(name_ptr, name.to_java_bytes) unless name_length.zero? + status = yield(source_ptr, length, name_ptr, name_length) [status == 1, read_result] ensure @free.apply(source_ptr) + @free.apply(name_ptr) end end end diff --git a/wasm/README.md b/wasm/README.md index e2df4fb72..71e513a63 100644 --- a/wasm/README.md +++ b/wasm/README.md @@ -40,18 +40,19 @@ Memory management and results: | `rbs_wasm_result_ptr` | `() -> i32` | Offset of the most recent result. | | `rbs_wasm_result_len` | `() -> i32` | Length of the most recent result. | -Parsing — each takes the whole buffer (`ptr`/`len`) plus the character range to -parse (`start`/`end`), and returns `1` on success or `0` on a parse error. On -success the result is the serialized AST; on error it is an error blob (start/end -positions, syntax flag, token type, message). Type/method-type parsing also takes -a buffer of newline-separated type-variable names (`vars`/`vars_len`, with -`vars_len < 0` meaning "none"): +Parsing — each takes the whole buffer (`ptr`/`len`), its Ruby encoding name +(`enc`/`enc_len`, e.g. `"UTF-8"` or `"EUC-JP"`; falls back to UTF-8 when empty or +unknown), and the character range to parse (`start`/`end`). Each returns `1` on +success or `0` on a parse error. 
On success the result is the serialized AST; on +error it is an error blob (start/end positions, syntax flag, token type, +message). Type/method-type parsing also takes a buffer of newline-separated +type-variable names (`vars`/`vars_len`, with `vars_len < 0` meaning "none"): | Export | Signature | | --- | --- | -| `rbs_wasm_parse_signature` | `(ptr, len, start, end) -> i32` | -| `rbs_wasm_parse_type` | `(ptr, len, start, end, vars, vars_len, require_eof, void_allowed, self_allowed, classish_allowed) -> i32` | -| `rbs_wasm_parse_method_type` | `(ptr, len, start, end, vars, vars_len, require_eof) -> i32` | +| `rbs_wasm_parse_signature` | `(ptr, len, enc, enc_len, start, end) -> i32` | +| `rbs_wasm_parse_type` | `(ptr, len, enc, enc_len, start, end, vars, vars_len, require_eof, void_allowed, self_allowed, classish_allowed) -> i32` | +| `rbs_wasm_parse_method_type` | `(ptr, len, enc, enc_len, start, end, vars, vars_len, require_eof) -> i32` | | `rbs_wasm_selftest` | `() -> i32` (parses a fixed sample; `1` on success) | For type and method-type parsing, a successful result of length 0 means the input diff --git a/wasm/rbs_wasm.c b/wasm/rbs_wasm.c index f6ca1b75c..ff7e74759 100644 --- a/wasm/rbs_wasm.c +++ b/wasm/rbs_wasm.c @@ -119,6 +119,16 @@ static int set_serialized_result(rbs_parser_t *parser, rbs_node_t *node) { return 1; } +// Resolve a Ruby encoding name (e.g. "UTF-8", "EUC-JP") to an rbs encoding, +// falling back to UTF-8 when none is given or the name is not recognised. +static const rbs_encoding_t *resolve_encoding(const char *name, int name_length) { + if (name_length > 0) { + const rbs_encoding_t *encoding = rbs_encoding_find((const uint8_t *) name, (const uint8_t *) (name + name_length)); + if (encoding != NULL) return encoding; + } + return RBS_ENCODING_UTF_8_ENTRY; +} + // Declare type variables from a buffer of newline-separated names. A negative // length means "no variables given" (the parser keeps its default table). 
static void declare_variables(rbs_parser_t *parser, const char *variables, int variables_length) { @@ -146,18 +156,19 @@ static void declare_variables(rbs_parser_t *parser, const char *variables, int v } /** - * Parse an RBS signature from a UTF-8 source buffer. + * Parse an RBS signature from a source buffer. * - * `source`/`length` is the whole buffer content; `start_pos`/`end_pos` are the - * character range within it to parse, so reported locations are absolute (this - * mirrors RBS::Parser._parse_signature). + * `source`/`length` is the whole buffer content; `encoding`/`encoding_length` is + * its Ruby encoding name; `start_pos`/`end_pos` are the character range within it + * to parse, so reported locations are absolute (this mirrors + * RBS::Parser._parse_signature). * * @return 1 on success (result is the serialized AST), 0 on a parse error * (result is an error blob). */ -__attribute__((export_name("rbs_wasm_parse_signature"))) int rbs_wasm_parse_signature(const char *source, int length, int start_pos, int end_pos) { +__attribute__((export_name("rbs_wasm_parse_signature"))) int rbs_wasm_parse_signature(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos) { rbs_string_t string = rbs_string_new(source, source + length); - rbs_parser_t *parser = rbs_parser_new(string, RBS_ENCODING_UTF_8_ENTRY, start_pos, end_pos); + rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); rbs_signature_t *signature = NULL; rbs_parse_signature(parser, &signature); @@ -180,9 +191,9 @@ __attribute__((export_name("rbs_wasm_parse_signature"))) int rbs_wasm_parse_sign * @return 1 on success, 0 on a parse error. On success with an empty result * (`rbs_wasm_result_len` == 0), the input was empty (`nil`). 
*/ -__attribute__((export_name("rbs_wasm_parse_type"))) int rbs_wasm_parse_type(const char *source, int length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof, int void_allowed, int self_allowed, int classish_allowed) { +__attribute__((export_name("rbs_wasm_parse_type"))) int rbs_wasm_parse_type(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof, int void_allowed, int self_allowed, int classish_allowed) { rbs_string_t string = rbs_string_new(source, source + length); - rbs_parser_t *parser = rbs_parser_new(string, RBS_ENCODING_UTF_8_ENTRY, start_pos, end_pos); + rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); declare_variables(parser, variables, variables_length); int status; @@ -214,9 +225,9 @@ __attribute__((export_name("rbs_wasm_parse_type"))) int rbs_wasm_parse_type(cons * @return 1 on success, 0 on a parse error. On success with an empty result, * the input was empty (`nil`). 
*/ -__attribute__((export_name("rbs_wasm_parse_method_type"))) int rbs_wasm_parse_method_type(const char *source, int length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof) { +__attribute__((export_name("rbs_wasm_parse_method_type"))) int rbs_wasm_parse_method_type(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof) { rbs_string_t string = rbs_string_new(source, source + length); - rbs_parser_t *parser = rbs_parser_new(string, RBS_ENCODING_UTF_8_ENTRY, start_pos, end_pos); + rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); declare_variables(parser, variables, variables_length); int status; @@ -248,5 +259,5 @@ __attribute__((export_name("rbs_wasm_selftest"))) int rbs_wasm_selftest(void) { "end\n"; int length = (int) (sizeof(source) - 1); - return rbs_wasm_parse_signature(source, length, 0, length); + return rbs_wasm_parse_signature(source, length, "UTF-8", 5, 0, length); } From 7ce3d944298d5cb5576841819b80090eb9c2475c Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 01:42:09 +0000 Subject: [PATCH 4/6] Fix WebAssembly parser hang and argument validation on JRuby parser_test surfaced two behaviors the WebAssembly path didn't match: - A reversed `byte_range` (e.g. 1..0) made the lexer loop forever inside WebAssembly, hanging the host. RBS::Parser now validates the position range (matching validate_position_range in the C extension) and raises ArgumentError, and the wasm shim guards against invalid ranges so a stray caller can never wedge the VM. - `variables:` that is not nil or an Array of Symbols now raises TypeError, matching declare_type_variables in the C extension (it used to raise NoMethodError). 
With these, test/rbs/parser_test.rb passes on JRuby (only `_lex` and `parse_type_params`, which aren't wired through the bridge yet, are omitted), so the JRuby CI job now runs it too. https://claude.ai/code/session_01LTveMt3NLbYHEboXuzAKpA --- .github/workflows/jruby.yml | 4 +++- lib/rbs/wasm/parser.rb | 33 +++++++++++++++++++++++++++++++++ wasm/rbs_wasm.c | 22 ++++++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/.github/workflows/jruby.yml b/.github/workflows/jruby.yml index 619de747a..651e33236 100644 --- a/.github/workflows/jruby.yml +++ b/.github/workflows/jruby.yml @@ -60,4 +60,6 @@ jobs: - name: Install runtime and test gems run: gem install prism test-unit --no-document - name: Run RBS's parser on JRuby - run: jruby -Ilib -Itest test/rbs/wasm/jruby_parser_test.rb + run: | + jruby -Ilib -Itest test/rbs/wasm/jruby_parser_test.rb + jruby -Ilib -Itest test/rbs/parser_test.rb diff --git a/lib/rbs/wasm/parser.rb b/lib/rbs/wasm/parser.rb index d436920c9..dcb0363fd 100644 --- a/lib/rbs/wasm/parser.rb +++ b/lib/rbs/wasm/parser.rb @@ -13,6 +13,7 @@ module RBS class Parser class << self def _parse_signature(buffer, start_pos, end_pos) + validate_position_range(start_pos, end_pos) encoding = buffer.content.encoding.name success, bytes = WASM::Runtime.instance.parse_signature(buffer.content, encoding, start_pos, end_pos) raise_parsing_error(buffer, bytes) unless success @@ -21,6 +22,8 @@ def _parse_signature(buffer, start_pos, end_pos) end def _parse_type(buffer, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) + validate_position_range(start_pos, end_pos) + validate_variables(variables) encoding = buffer.content.encoding.name success, bytes = WASM::Runtime.instance.parse_type(buffer.content, encoding, start_pos, end_pos, variables, require_eof, void_allowed, self_allowed, classish_allowed) raise_parsing_error(buffer, bytes) unless success @@ -29,6 +32,8 @@ def _parse_type(buffer, start_pos, 
end_pos, variables, require_eof, void_allowed end def _parse_method_type(buffer, start_pos, end_pos, variables, require_eof) + validate_position_range(start_pos, end_pos) + validate_variables(variables) encoding = buffer.content.encoding.name success, bytes = WASM::Runtime.instance.parse_method_type(buffer.content, encoding, start_pos, end_pos, variables, require_eof) raise_parsing_error(buffer, bytes) unless success @@ -54,6 +59,34 @@ def _parse_inline_trailing_annotation(buffer, start_pos, end_pos, variables) private + # Reject negative or reversed ranges before handing them to the parser, + # matching validate_position_range in the C extension (main.c). A reversed + # range would otherwise make the lexer loop forever inside WebAssembly. + def validate_position_range(start_pos, end_pos) + if start_pos < 0 || end_pos < 0 + raise ArgumentError, "negative position range: #{start_pos}...#{end_pos}" + end + if start_pos > end_pos + raise ArgumentError, "invalid position range: #{start_pos}...#{end_pos}" + end + end + + # Reject anything that is not nil or an Array of Symbols, matching + # declare_type_variables in the C extension (main.c). + def validate_variables(variables) + return if variables.nil? + + unless variables.is_a?(Array) + raise TypeError, "wrong argument type #{variables.class} (must be an Array of Symbols or nil)" + end + + variables.each do |variable| + unless variable.is_a?(Symbol) + raise TypeError, "Type variables Array contains invalid value #{variable.inspect} of type #{variable.class} (must be an Array of Symbols or nil)" + end + end + end + # An empty result means the parser reached EOF immediately (`nil`). def deserialize_or_nil(bytes, buffer) bytes.empty? ? 
nil : WASM::Deserializer.deserialize(bytes, buffer) diff --git a/wasm/rbs_wasm.c b/wasm/rbs_wasm.c index ff7e74759..408e310ba 100644 --- a/wasm/rbs_wasm.c +++ b/wasm/rbs_wasm.c @@ -119,6 +119,13 @@ static int set_serialized_result(rbs_parser_t *parser, rbs_node_t *node) { return 1; } +// A reversed or out-of-bounds range would make the lexer loop forever, which +// would hang the whole host. Hosts are expected to validate too (RBS::Parser +// raises on bad ranges), but guard here so a stray caller can never wedge the VM. +static bool range_is_valid(int start_pos, int end_pos, int length) { + return start_pos >= 0 && end_pos >= 0 && start_pos <= end_pos && end_pos <= length; +} + // Resolve a Ruby encoding name (e.g. "UTF-8", "EUC-JP") to an rbs encoding, // falling back to UTF-8 when none is given or the name is not recognised. static const rbs_encoding_t *resolve_encoding(const char *name, int name_length) { @@ -167,6 +174,11 @@ static void declare_variables(rbs_parser_t *parser, const char *variables, int v * (result is an error blob). */ __attribute__((export_name("rbs_wasm_parse_signature"))) int rbs_wasm_parse_signature(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos) { + if (!range_is_valid(start_pos, end_pos, length)) { + allocate_result(0); + return 0; + } + rbs_string_t string = rbs_string_new(source, source + length); rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); @@ -192,6 +204,11 @@ __attribute__((export_name("rbs_wasm_parse_signature"))) int rbs_wasm_parse_sign * (`rbs_wasm_result_len` == 0), the input was empty (`nil`). 
*/ __attribute__((export_name("rbs_wasm_parse_type"))) int rbs_wasm_parse_type(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof, int void_allowed, int self_allowed, int classish_allowed) { + if (!range_is_valid(start_pos, end_pos, length)) { + allocate_result(0); + return 0; + } + rbs_string_t string = rbs_string_new(source, source + length); rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); declare_variables(parser, variables, variables_length); @@ -226,6 +243,11 @@ __attribute__((export_name("rbs_wasm_parse_type"))) int rbs_wasm_parse_type(cons * the input was empty (`nil`). */ __attribute__((export_name("rbs_wasm_parse_method_type"))) int rbs_wasm_parse_method_type(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, const char *variables, int variables_length, int require_eof) { + if (!range_is_valid(start_pos, end_pos, length)) { + allocate_result(0); + return 0; + } + rbs_string_t string = rbs_string_new(source, source + length); rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); declare_variables(parser, variables, variables_length); From 4213dfdcc4df64cebe4e2fc8e57010a5ef7edd01 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 03:07:12 +0000 Subject: [PATCH 5/6] Compile the WebAssembly parser ahead of time on JRuby Drive the module through Chicory's AOT compiler (wasm -> JVM bytecode) instead of the interpreter. On the bundled corpus this cuts a full parse pass from ~18.2s to ~2.1s (about 9x), while producing byte-identical ASTs. - runtime.rb uses MachineFactoryCompiler when its jars are present and falls back to the interpreter otherwise (so the parser still works with only the base Chicory jars). 
- rake wasm:vendor_jars now also fetches the Chicory `compiler` jar and the ow2 ASM libraries it needs (pinned via ASM_VERSION), bundled into the JRuby gem. Compilation happens at runtime, once per process (~0.4s), against whichever Chicory runtime is loaded, so the compiler and runtime versions can never drift apart. https://claude.ai/code/session_01LTveMt3NLbYHEboXuzAKpA --- Rakefile | 22 ++++++++++++++-------- lib/rbs/wasm/runtime.rb | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/Rakefile b/Rakefile index 9eb6cd592..9b46e77b3 100644 --- a/Rakefile +++ b/Rakefile @@ -624,9 +624,14 @@ namespace :wasm do # RBS::WASM::Runtime). These are build artifacts, bundled into the JRuby gem. JRUBY_WASM_DIR = File.expand_path("lib/rbs/wasm", __dir__) CHICORY_VERSION = ENV.fetch("CHICORY_VERSION", "1.7.5") - CHICORY_JARS = %w[wasm runtime log wasi].freeze - - desc "Download the Chicory jars the JRuby runtime needs into lib/rbs/wasm/jars" + # `compiler` is Chicory's AOT compiler (wasm -> JVM bytecode); the asm* jars + # are the ow2 ASM libraries it depends on. Keep ASM_VERSION in sync with what + # the pinned Chicory release declares. 
+ CHICORY_JARS = %w[wasm runtime log wasi compiler].freeze + ASM_VERSION = ENV.fetch("ASM_VERSION", "9.9.1") + ASM_JARS = %w[asm asm-tree asm-util asm-commons asm-analysis].freeze + + desc "Download the Chicory and ASM jars the JRuby runtime needs into lib/rbs/wasm/jars" task :vendor_jars do require "open-uri" require "fileutils" @@ -634,14 +639,15 @@ namespace :wasm do jars_dir = File.join(JRUBY_WASM_DIR, "jars") FileUtils.mkdir_p(jars_dir) - CHICORY_JARS.each do |name| - url = "https://repo1.maven.org/maven2/com/dylibso/chicory/#{name}/#{CHICORY_VERSION}/#{name}-#{CHICORY_VERSION}.jar" - dest = File.join(jars_dir, "#{name}.jar") + downloads = CHICORY_JARS.map { |name| ["#{name}.jar", "https://repo1.maven.org/maven2/com/dylibso/chicory/#{name}/#{CHICORY_VERSION}/#{name}-#{CHICORY_VERSION}.jar"] } + downloads += ASM_JARS.map { |name| ["#{name}.jar", "https://repo1.maven.org/maven2/org/ow2/asm/#{name}/#{ASM_VERSION}/#{name}-#{ASM_VERSION}.jar"] } + + downloads.each do |filename, url| puts "Downloading #{url}" - URI.open(url) { |io| File.binwrite(dest, io.read) } # steep:ignore + URI.open(url) { |io| File.binwrite(File.join(jars_dir, filename), io.read) } # steep:ignore end - puts "Vendored Chicory #{CHICORY_VERSION} into #{jars_dir}" + puts "Vendored Chicory #{CHICORY_VERSION} + ASM #{ASM_VERSION} into #{jars_dir}" end desc "Assemble everything the JRuby gem needs: the .wasm and the Chicory jars" diff --git a/lib/rbs/wasm/runtime.rb b/lib/rbs/wasm/runtime.rb index bf11b59ce..d418fbb47 100644 --- a/lib/rbs/wasm/runtime.rb +++ b/lib/rbs/wasm/runtime.rb @@ -16,8 +16,15 @@ class Runtime include MonitorMixin # The Chicory jars the runtime needs at load time. + # Jars Chicory needs to load and run the module. JARS = %w[wasm runtime log wasi].freeze + # Jars for Chicory's ahead-of-time compiler (wasm -> JVM bytecode), which + # runs the parser ~8x faster than the interpreter. Optional: the runtime + # falls back to the interpreter when they are absent. 
asm* are the ow2 ASM + # libraries the compiler depends on. + OPTIONAL_JARS = %w[compiler asm asm-tree asm-util asm-commons asm-analysis].freeze + class << self def instance @instance ||= new @@ -142,13 +149,34 @@ def build_instance wasi = wasi_preview1.builder.with_options(wasi_options.builder.build).build imports = import_values.builder.add_function(wasi.to_host_functions).build - wasm = instance_class.builder(wasm_module).with_import_values(imports).build + builder = instance_class.builder(wasm_module).with_import_values(imports) + if (factory = machine_factory(wasm_module)) + builder = builder.with_machine_factory(factory) + end + + wasm = builder.build wasm.export("_initialize").apply wasm end + # Chicory's AOT compiler when its jars are present, otherwise nil (the + # builder then uses the interpreter). + def machine_factory(wasm_module) + Java::ComDylibsoChicoryCompiler::MachineFactoryCompiler.compile(wasm_module) + rescue NameError + nil + end + def load_jars - JARS.each { |name| require File.join(self.class.jars_dir, "#{name}.jar") } + JARS.each { |name| require jar_path(name) } + OPTIONAL_JARS.each do |name| + path = jar_path(name) + require path if File.exist?(path) + end + end + + def jar_path(name) + File.join(self.class.jars_dir, "#{name}.jar") end end end From 27890237d7687c3222a73eee4e958855c3ef0c55 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 03:23:15 +0000 Subject: [PATCH 6/6] Wire lex, type params and inline annotations through the WebAssembly parser These were the last RBS::Parser primitives still raising NotImplementedError on JRuby. With them in place, test/rbs/parser_test.rb passes on JRuby with no omissions (42 tests, same as CRuby). - serialize.c gains rbs_serialize_node_list for the bare node list that parse_type_params returns; rbs_wasm.c adds parse_type_params, the two inline annotation entries, and lex (a stream of [type, start, end] records with no leading count, read until exhausted).
- The deserializer gains deserialize_node_list and deserialize_tokens; the parser implements _parse_type_params, _lex and the inline annotation methods on top of them, with the same range/variable validation as the others. - The omit_on_jruby! markers on test__lex and test_parse_type_params are gone, and jruby_parser_test covers lex and type params directly. Also harden the AOT fallback: machine_factory now rescues LinkageError too, so an incompatible ASM jar set degrades to the interpreter instead of crashing. https://claude.ai/code/session_01LTveMt3NLbYHEboXuzAKpA --- include/rbs/serialize.h | 6 ++ lib/rbs/wasm/deserializer.rb | 33 ++++++- lib/rbs/wasm/parser.rb | 28 +++++- lib/rbs/wasm/runtime.rb | 40 +++++++- sig/wasm/deserializer.rbs | 12 ++- src/serialize.c | 12 +++ templates/include/rbs/serialize.h.erb | 6 ++ templates/src/serialize.c.erb | 12 +++ test/rbs/parser_test.rb | 4 - test/rbs/wasm/jruby_parser_test.rb | 13 +++ wasm/rbs_wasm.c | 126 ++++++++++++++++++++++++++ 11 files changed, 275 insertions(+), 17 deletions(-) diff --git a/include/rbs/serialize.h b/include/rbs/serialize.h index e55b7b64c..387f051bf 100644 --- a/include/rbs/serialize.h +++ b/include/rbs/serialize.h @@ -30,4 +30,10 @@ */ rbs_string_t rbs_serialize_node(rbs_allocator_t *allocator, rbs_constant_pool_t *constant_pool, rbs_node_t *node); +/** + * Like rbs_serialize_node, but for a bare node list (e.g. the result of + * rbs_parse_type_params). Decoded by RBS::WASM::Deserializer.deserialize_node_list. + */ +rbs_string_t rbs_serialize_node_list(rbs_allocator_t *allocator, rbs_constant_pool_t *constant_pool, rbs_node_list_t *list); + #endif diff --git a/lib/rbs/wasm/deserializer.rb b/lib/rbs/wasm/deserializer.rb index 4b19331a1..29e2fd53b 100644 --- a/lib/rbs/wasm/deserializer.rb +++ b/lib/rbs/wasm/deserializer.rb @@ -19,6 +19,18 @@ def self.deserialize(bytes, buffer) new(bytes, buffer).read_node end + # Deserialize a bare node list (rbs_serialize_node_list), e.g. 
the result + # of RBS::Parser._parse_type_params. + def self.deserialize_node_list(bytes, buffer) + new(bytes, buffer).read_node_list + end + + # Deserialize the token stream produced by rbs_wasm_lex into the + # [type, location] pairs RBS::Parser._lex returns. + def self.deserialize_tokens(bytes, buffer) + new(bytes, buffer).read_tokens + end + def initialize(bytes, buffer) @bytes = bytes @buffer = buffer @@ -50,6 +62,23 @@ def read_node end end + def read_node_list + Array.new(read_count) { read_node } + end + + # The lex stream has no leading count: read records until the buffer is + # exhausted. Each is a token type name followed by its character range. + def read_tokens + tokens = [] #: Array[[ Symbol, Location ]] + until @pos >= @bytes.bytesize + type = read_string(Encoding::UTF_8).to_sym + start_char = read_i32 + end_char = read_i32 + tokens << [type, RBS::Location.new(@buffer, start_char, end_char)] + end + tokens + end + private def read_struct(entry) @@ -87,10 +116,6 @@ def read_field(reader) end end - def read_node_list - Array.new(read_count) { read_node } - end - def read_hash hash = {} #: Hash[untyped, untyped] read_count.times do diff --git a/lib/rbs/wasm/parser.rb b/lib/rbs/wasm/parser.rb index dcb0363fd..c46aedfa0 100644 --- a/lib/rbs/wasm/parser.rb +++ b/lib/rbs/wasm/parser.rb @@ -42,19 +42,39 @@ def _parse_method_type(buffer, start_pos, end_pos, variables, require_eof) end def _parse_type_params(buffer, start_pos, end_pos, module_type_params) - raise NotImplementedError, "RBS::Parser._parse_type_params is not yet supported on #{RUBY_ENGINE}" + validate_position_range(start_pos, end_pos) + encoding = buffer.content.encoding.name + success, bytes = WASM::Runtime.instance.parse_type_params(buffer.content, encoding, start_pos, end_pos, module_type_params) + raise_parsing_error(buffer, bytes) unless success + + bytes.empty? ? 
nil : WASM::Deserializer.deserialize_node_list(bytes, buffer) end def _lex(buffer, end_pos) - raise NotImplementedError, "RBS::Parser._lex is not yet supported on #{RUBY_ENGINE}" + encoding = buffer.content.encoding.name + _success, bytes = WASM::Runtime.instance.lex(buffer.content, encoding, end_pos) + + WASM::Deserializer.deserialize_tokens(bytes, buffer) end def _parse_inline_leading_annotation(buffer, start_pos, end_pos, variables) - raise NotImplementedError, "RBS::Parser._parse_inline_leading_annotation is not yet supported on #{RUBY_ENGINE}" + validate_position_range(start_pos, end_pos) + validate_variables(variables) + encoding = buffer.content.encoding.name + success, bytes = WASM::Runtime.instance.parse_inline_leading_annotation(buffer.content, encoding, start_pos, end_pos, variables) + raise_parsing_error(buffer, bytes) unless success + + deserialize_or_nil(bytes, buffer) end def _parse_inline_trailing_annotation(buffer, start_pos, end_pos, variables) - raise NotImplementedError, "RBS::Parser._parse_inline_trailing_annotation is not yet supported on #{RUBY_ENGINE}" + validate_position_range(start_pos, end_pos) + validate_variables(variables) + encoding = buffer.content.encoding.name + success, bytes = WASM::Runtime.instance.parse_inline_trailing_annotation(buffer.content, encoding, start_pos, end_pos, variables) + raise_parsing_error(buffer, bytes) unless success + + deserialize_or_nil(bytes, buffer) end private diff --git a/lib/rbs/wasm/runtime.rb b/lib/rbs/wasm/runtime.rb index d418fbb47..5065b85e4 100644 --- a/lib/rbs/wasm/runtime.rb +++ b/lib/rbs/wasm/runtime.rb @@ -51,6 +51,10 @@ def initialize @parse_signature = @wasm.export("rbs_wasm_parse_signature") @parse_type = @wasm.export("rbs_wasm_parse_type") @parse_method_type = @wasm.export("rbs_wasm_parse_method_type") + @parse_type_params = @wasm.export("rbs_wasm_parse_type_params") + @parse_inline_leading_annotation = @wasm.export("rbs_wasm_parse_inline_leading_annotation") + 
@parse_inline_trailing_annotation = @wasm.export("rbs_wasm_parse_inline_trailing_annotation") + @lex = @wasm.export("rbs_wasm_lex") end # `content` is the whole buffer; `start_pos`/`end_pos` are the character @@ -78,6 +82,34 @@ def parse_method_type(content, encoding, start_pos, end_pos, variables, require_ end end + def parse_type_params(content, encoding, start_pos, end_pos, module_type_params) + run(content, encoding) do |ptr, len, enc_ptr, enc_len| + @parse_type_params.apply(ptr, len, enc_ptr, enc_len, start_pos, end_pos, bool(module_type_params))[0] + end + end + + def parse_inline_leading_annotation(content, encoding, start_pos, end_pos, variables) + with_variables(variables) do |vars_ptr, vars_len| + run(content, encoding) do |ptr, len, enc_ptr, enc_len| + @parse_inline_leading_annotation.apply(ptr, len, enc_ptr, enc_len, start_pos, end_pos, vars_ptr, vars_len)[0] + end + end + end + + def parse_inline_trailing_annotation(content, encoding, start_pos, end_pos, variables) + with_variables(variables) do |vars_ptr, vars_len| + run(content, encoding) do |ptr, len, enc_ptr, enc_len| + @parse_inline_trailing_annotation.apply(ptr, len, enc_ptr, enc_len, start_pos, end_pos, vars_ptr, vars_len)[0] + end + end + end + + def lex(content, encoding, end_pos) + run(content, encoding) do |ptr, len, enc_ptr, enc_len| + @lex.apply(ptr, len, enc_ptr, enc_len, end_pos)[0] + end + end + private # Copies `source` and its encoding name into linear memory, yields their @@ -159,11 +191,13 @@ def build_instance wasm end - # Chicory's AOT compiler when its jars are present, otherwise nil (the - # builder then uses the interpreter). + # Chicory's AOT compiler when its jars are present and usable, otherwise nil + # (the builder then uses the interpreter). NameError covers a missing + # compiler class; LinkageError covers an incompatible/missing ASM (so a bad + # jar set degrades to the interpreter instead of crashing). 
def machine_factory(wasm_module) Java::ComDylibsoChicoryCompiler::MachineFactoryCompiler.compile(wasm_module) - rescue NameError + rescue NameError, Java::JavaLang::LinkageError nil end diff --git a/sig/wasm/deserializer.rbs b/sig/wasm/deserializer.rbs index 4e04e43c2..33cffbe5c 100644 --- a/sig/wasm/deserializer.rbs +++ b/sig/wasm/deserializer.rbs @@ -17,19 +17,27 @@ module RBS # `[directives, declarations]` to match RBS::Parser._parse_signature. def self.deserialize: (String bytes, Buffer buffer) -> untyped + # Deserialize a bare node list (RBS::Parser._parse_type_params). + def self.deserialize_node_list: (String bytes, Buffer buffer) -> Array[untyped] + + # Deserialize the token stream from rbs_wasm_lex (RBS::Parser._lex). + def self.deserialize_tokens: (String bytes, Buffer buffer) -> Array[[ Symbol, Location ]] + def initialize: (String bytes, Buffer buffer) -> void # Reads the next node and returns the reconstructed Ruby value. def read_node: () -> untyped + def read_node_list: () -> Array[untyped] + + def read_tokens: () -> Array[[ Symbol, Location ]] + private def read_struct: (Array[untyped] entry) -> untyped def read_field: (untyped reader) -> untyped - def read_node_list: () -> Array[untyped] - def read_hash: () -> Hash[untyped, untyped] def read_count: () -> Integer diff --git a/src/serialize.c b/src/serialize.c index 77162f074..064c6701b 100644 --- a/src/serialize.c +++ b/src/serialize.c @@ -944,3 +944,15 @@ rbs_string_t rbs_serialize_node(rbs_allocator_t *allocator, rbs_constant_pool_t return rbs_buffer_to_string(&state.buffer); } + +rbs_string_t rbs_serialize_node_list(rbs_allocator_t *allocator, rbs_constant_pool_t *constant_pool, rbs_node_list_t *list) { + rbs_serialize_state state = { + .allocator = allocator, + .constant_pool = constant_pool, + }; + rbs_buffer_init(allocator, &state.buffer); + + w_node_list(&state, list); + + return rbs_buffer_to_string(&state.buffer); +} diff --git a/templates/include/rbs/serialize.h.erb 
b/templates/include/rbs/serialize.h.erb index c9794f62e..572a5f616 100644 --- a/templates/include/rbs/serialize.h.erb +++ b/templates/include/rbs/serialize.h.erb @@ -23,4 +23,10 @@ */ rbs_string_t rbs_serialize_node(rbs_allocator_t *allocator, rbs_constant_pool_t *constant_pool, rbs_node_t *node); +/** + * Like rbs_serialize_node, but for a bare node list (e.g. the result of + * rbs_parse_type_params). Decoded by RBS::WASM::Deserializer.deserialize_node_list. + */ +rbs_string_t rbs_serialize_node_list(rbs_allocator_t *allocator, rbs_constant_pool_t *constant_pool, rbs_node_list_t *list); + #endif diff --git a/templates/src/serialize.c.erb b/templates/src/serialize.c.erb index b0c4aceaf..c72d2fecc 100644 --- a/templates/src/serialize.c.erb +++ b/templates/src/serialize.c.erb @@ -219,3 +219,15 @@ rbs_string_t rbs_serialize_node(rbs_allocator_t *allocator, rbs_constant_pool_t return rbs_buffer_to_string(&state.buffer); } + +rbs_string_t rbs_serialize_node_list(rbs_allocator_t *allocator, rbs_constant_pool_t *constant_pool, rbs_node_list_t *list) { + rbs_serialize_state state = { + .allocator = allocator, + .constant_pool = constant_pool, + }; + rbs_buffer_init(allocator, &state.buffer); + + w_node_list(&state, list); + + return rbs_buffer_to_string(&state.buffer); +} diff --git a/test/rbs/parser_test.rb b/test/rbs/parser_test.rb index a745a266f..9977f7f07 100644 --- a/test/rbs/parser_test.rb +++ b/test/rbs/parser_test.rb @@ -857,8 +857,6 @@ def test_proc__untyped_function end def test_parse_type_params - omit_on_jruby! "RBS::Parser.parse_type_params is not yet wired through the WebAssembly parser" - RBS::Parser.parse_type_params(buffer("[T]")).tap do |params| assert_equal 1, params.size assert_equal :T, params[0].name @@ -1002,8 +1000,6 @@ def test_parse_type_params end def test__lex - omit_on_jruby! 
"RBS::Parser._lex is not yet wired through the WebAssembly parser" - content = <<~RBS # LineComment class Foo[T < Integer] < Bar # Comment diff --git a/test/rbs/wasm/jruby_parser_test.rb b/test/rbs/wasm/jruby_parser_test.rb index ae72daa08..c66a3e4e5 100644 --- a/test/rbs/wasm/jruby_parser_test.rb +++ b/test/rbs/wasm/jruby_parser_test.rb @@ -60,6 +60,19 @@ def test_parse_method_type assert_equal "[T] (T, ?Integer) { (T) -> void } -> T", RBS::Parser.parse_method_type("[T] (T, ?Integer) { (T) -> void } -> T").to_s end + def test_parse_type_params + params = RBS::Parser.parse_type_params("[T < Comparable, unchecked out U]") + assert_equal [:T, :U], params.map(&:name) + assert_equal "Comparable", params[0].upper_bound&.name&.to_s + assert_equal :covariant, params[1].variance + end + + def test_lex + types = RBS::Parser.lex("class Foo\nend\n").value.map(&:type) + assert_include types, :kCLASS + assert_equal :pEOF, types.last + end + def test_parse_error_raises_parsing_error error = assert_raises(RBS::ParsingError) do RBS::Parser.parse_signature("class 123 Broken end") diff --git a/wasm/rbs_wasm.c b/wasm/rbs_wasm.c index 408e310ba..14697d43f 100644 --- a/wasm/rbs_wasm.c +++ b/wasm/rbs_wasm.c @@ -26,6 +26,7 @@ #include "rbs/parser.h" #include "rbs/serialize.h" #include "rbs/string.h" +#include "rbs/util/rbs_buffer.h" #include "rbs/util/rbs_encoding.h" // The result of the most recent parse, living in linear memory until the next @@ -267,6 +268,131 @@ __attribute__((export_name("rbs_wasm_parse_method_type"))) int rbs_wasm_parse_me return status; } +/** + * Parse a type parameter list (e.g. `[T < Comparable]`). On success the result + * is a serialized node list; an empty result means the input was empty (`nil`). 
+ */ +__attribute__((export_name("rbs_wasm_parse_type_params"))) int rbs_wasm_parse_type_params(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, int module_type_params) { + if (!range_is_valid(start_pos, end_pos, length)) { + allocate_result(0); + return 0; + } + + rbs_string_t string = rbs_string_new(source, source + length); + rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); + + int status; + if (parser->next_token.type == pEOF) { + allocate_result(0); + status = 1; + } else { + rbs_node_list_t *params = NULL; + rbs_parse_type_params(parser, module_type_params != 0, &params); + + if (parser->error == NULL) { + rbs_string_t bytes = rbs_serialize_node_list(parser->allocator, &parser->constant_pool, params); + size_t n = rbs_string_len(bytes); + memcpy(allocate_result(n), bytes.start, n); + status = 1; + } else { + status = set_error_result(parser); + } + } + + rbs_parser_free(parser); + return status; +} + +// Shared body for the leading/trailing inline annotation parsers. +static int parse_inline_annotation(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, const char *variables, int variables_length, bool leading) { + if (!range_is_valid(start_pos, end_pos, length)) { + allocate_result(0); + return 0; + } + + rbs_string_t string = rbs_string_new(source, source + length); + rbs_parser_t *parser = rbs_parser_new(string, resolve_encoding(encoding, encoding_length), start_pos, end_pos); + declare_variables(parser, variables, variables_length); + + rbs_ast_ruby_annotations_t *annotation = NULL; + bool success = leading ?
rbs_parse_inline_leading_annotation(parser, &annotation) : rbs_parse_inline_trailing_annotation(parser, &annotation); + + int status; + if (parser->error != NULL) { + status = set_error_result(parser); + } else if (!success || annotation == NULL) { + allocate_result(0); + status = 1; + } else { + status = set_serialized_result(parser, (rbs_node_t *) annotation); + } + + rbs_parser_free(parser); + return status; +} + +/** + * Parse an inline leading annotation. On success the result is a serialized + * node; an empty result means there was no annotation (`nil`). + */ +__attribute__((export_name("rbs_wasm_parse_inline_leading_annotation"))) int rbs_wasm_parse_inline_leading_annotation(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, const char *variables, int variables_length) { + return parse_inline_annotation(source, length, encoding, encoding_length, start_pos, end_pos, variables, variables_length, true); +} + +/** + * Parse an inline trailing annotation. See rbs_wasm_parse_inline_leading_annotation. + */ +__attribute__((export_name("rbs_wasm_parse_inline_trailing_annotation"))) int rbs_wasm_parse_inline_trailing_annotation(const char *source, int length, const char *encoding, int encoding_length, int start_pos, int end_pos, const char *variables, int variables_length) { + return parse_inline_annotation(source, length, encoding, encoding_length, start_pos, end_pos, variables, variables_length, false); +} + +static void w_lex_u32(rbs_allocator_t *allocator, rbs_buffer_t *buffer, uint32_t value) { + unsigned char bytes[4] = { + (unsigned char) (value & 0xff), + (unsigned char) ((value >> 8) & 0xff), + (unsigned char) ((value >> 16) & 0xff), + (unsigned char) ((value >> 24) & 0xff), + }; + rbs_buffer_append_string(allocator, buffer, (const char *) bytes, 4); +} + +/** + * Lex the source into tokens. 
The result is a sequence of records, with no + * leading count (the host reads until the buffer is exhausted): + * + * [u32 type_name_len][type_name bytes][i32 start_char][i32 end_char] + * + * The final token is always pEOF, mirroring RBS::Parser._lex. + * + * @return 1 always (lexing does not report parse errors here). + */ +__attribute__((export_name("rbs_wasm_lex"))) int rbs_wasm_lex(const char *source, int length, const char *encoding, int encoding_length, int end_pos) { + rbs_allocator_t *allocator = rbs_allocator_init(); + rbs_lexer_t *lexer = rbs_lexer_new(allocator, rbs_string_new(source, source + length), resolve_encoding(encoding, encoding_length), 0, end_pos); + + rbs_buffer_t buffer; + rbs_buffer_init(allocator, &buffer); + + rbs_token_t token = NullToken; + while (token.type != pEOF) { + token = rbs_lexer_next_token(lexer); + + const char *type_name = rbs_token_type_str(token.type); + uint32_t type_name_length = (uint32_t) strlen(type_name); + w_lex_u32(allocator, &buffer, type_name_length); + rbs_buffer_append_string(allocator, &buffer, type_name, type_name_length); + w_lex_u32(allocator, &buffer, (uint32_t) token.range.start.char_pos); + w_lex_u32(allocator, &buffer, (uint32_t) token.range.end.char_pos); + } + + rbs_string_t bytes = rbs_buffer_to_string(&buffer); + size_t n = rbs_string_len(bytes); + memcpy(allocate_result(n), bytes.start, n); + + rbs_allocator_free(allocator); + return 1; +} + /** * Parse a small, fixed RBS document, used as a build smoke test * (`wasmtime run --invoke rbs_wasm_selftest rbs_parser.wasm`).