From daf52ca72a04ee359208b90c4379e3805e0f7203 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Wed, 9 Jul 2025 05:49:47 +0000 Subject: [PATCH 1/2] Stop emitting K".." and K"..." in lexer Unfortunately, the sequences `..` and `...` do not always refer to the `..` operator or the `...` syntax. There are two and a half cases where they don't: 1. After `@` in macrocall, where they are both regular identifiers 2. In `import ...A` where the dots specify the level 3. `:(...)` treats `...` as quoted identifier Case 1 was handled in a previous commit by lexing these as identifiers after `@`. However, as a result of case 2, it is problematic to tokenize these dots together; we essentially have to untokenize them in the import parser. It is also infeasible to change the lexer to have special context-sensitive lexing in `import`, because there could be arbitrary interpolations, `@eval import A, $(f(x..y)), ..b`, so deciding whether a particular `..` after import refers to the operator or a level specifier requires the parser. Currently the parser handles this by splitting the obtained tokens again in the import parser, but this is undesirable, because it invalidates the invariant that the tokens produced by the lexer correspond to the non-terminals of the final parse tree. This PR attempts to address this by only ever having the lexer emit `K"."` and having the parser decide which case it refers to. The new non-terminal `K"dots"` handles the identifier cases (ordinary `..` and quoted `:(...)`). K"..." is now exclusively used for splat/slurp, and is no longer used in its non-terminal form for case 3. 
--- src/integration/expr.jl | 3 ++ src/julia/julia_parse_stream.jl | 23 ++++++---- src/julia/kinds.jl | 7 ++-- src/julia/parser.jl | 74 ++++++++++++++++++++------------- src/julia/tokenize.jl | 20 ++++----- test/expr.jl | 2 + test/parser.jl | 10 ++--- test/tokenize.jl | 15 +++---- 8 files changed, 88 insertions(+), 66 deletions(-) diff --git a/src/integration/expr.jl b/src/integration/expr.jl index 038bad9a..05e9e769 100644 --- a/src/integration/expr.jl +++ b/src/integration/expr.jl @@ -338,6 +338,9 @@ end return adjust_macro_name!(retexpr.args[1], k) elseif k == K"?" retexpr.head = :if + elseif k == K"dots" + n = numeric_flags(flags(nodehead)) + return n == 2 ? :(..) : :(...) elseif k == K"op=" && length(args) == 3 lhs = args[1] op = args[2] diff --git a/src/julia/julia_parse_stream.jl b/src/julia/julia_parse_stream.jl index 87ad0386..80e2ed8e 100644 --- a/src/julia/julia_parse_stream.jl +++ b/src/julia/julia_parse_stream.jl @@ -137,8 +137,8 @@ function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true) is_postfix_op_call(head) && (str = str*"-post") k = kind(head) - # Handle numeric flags for nrow/ncat nodes - if k in KSet"nrow ncat typed_ncat" + # Handle numeric flags for nodes that take them + if k in KSet"nrow ncat typed_ncat dots" n = numeric_flags(head) n != 0 && (str = str*"-"*string(n)) else @@ -307,7 +307,12 @@ function peek_dotted_op_token(ps, allow_whitespace=false) isdotted = kind(t) == K"." if isdotted t2 = peek_token(ps, 2) - if !is_operator(t2) || (!allow_whitespace && preceding_whitespace(t2)) + if (!allow_whitespace && preceding_whitespace(t2)) + isdotted = false + elseif !is_operator(t2) + isdotted = false + elseif kind(t2) == K"." && peek(ps, 3) == K"." 
+ # Treat `..` as dotted K".", unless there's another dot after isdotted = false else t = t2 @@ -316,13 +321,13 @@ function peek_dotted_op_token(ps, allow_whitespace=false) return (isdotted, t) end -function bump_dotted(ps, isdot, flags=EMPTY_FLAGS; emit_dot_node=false, remap_kind=K"None") +function bump_dotted(ps, isdot, t, flags=EMPTY_FLAGS; emit_dot_node=false, remap_kind=K"None") if isdot - if emit_dot_node - dotmark = position(ps) - bump(ps, TRIVIA_FLAG) # TODO: NOTATION_FLAG - else - bump(ps, TRIVIA_FLAG) # TODO: NOTATION_FLAG + dotmark = position(ps) + bump(ps, TRIVIA_FLAG) + if kind(t) == K"." + bump(ps, TRIVIA_FLAG) + return emit(ps, dotmark, K"dots", set_numeric_flags(2)) end end pos = bump(ps, flags, remap_kind=remap_kind) diff --git a/src/julia/kinds.jl b/src/julia/kinds.jl index 19a00eb2..010ffb9b 100644 --- a/src/julia/kinds.jl +++ b/src/julia/kinds.jl @@ -278,8 +278,6 @@ register_kinds!(JuliaSyntax, 0, [ "ErrorInvalidOperator" "Error**" - "..." - # Level 1 "BEGIN_ASSIGNMENTS" "BEGIN_SYNTACTIC_ASSIGNMENTS" @@ -774,7 +772,6 @@ register_kinds!(JuliaSyntax, 0, [ # Level 8 "BEGIN_COLON" ":" - ".." "…" "⁝" "⋮" @@ -1033,6 +1030,10 @@ register_kinds!(JuliaSyntax, 0, [ "typed_ncat" "row" "nrow" + # splat/slurp + "..." + # ../... as a identifier + "dots" # Comprehensions "generator" "filter" diff --git a/src/julia/parser.jl b/src/julia/parser.jl index 2abed160..e347a5d0 100644 --- a/src/julia/parser.jl +++ b/src/julia/parser.jl @@ -371,7 +371,7 @@ function parse_RtoL(ps::ParseState, down, is_op, self) down(ps) isdot, tk = peek_dotted_op_token(ps) if is_op(tk) - bump_dotted(ps, isdot, remap_kind=K"Identifier") + bump_dotted(ps, isdot, tk, remap_kind=K"Identifier") self(ps) emit(ps, mark, isdot ? 
K"dotcall" : K"call", INFIX_FLAG) end @@ -598,7 +598,7 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where { # a .~ b ==> (dotcall-i a ~ b) # [a ~ b c] ==> (hcat (call-i a ~ b) c) # [a~b] ==> (vect (call-i a ~ b)) - bump_dotted(ps, isdot, remap_kind=K"Identifier") + bump_dotted(ps, isdot, t, remap_kind=K"Identifier") bump_trivia(ps) parse_assignment(ps, down) emit(ps, mark, isdot ? K"dotcall" : K"call", INFIX_FLAG) @@ -617,7 +617,7 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where { (-1, K"Identifier", EMPTY_FLAGS), # op (1, K"=", TRIVIA_FLAG)) else - bump_dotted(ps, isdot, TRIVIA_FLAG) + bump_dotted(ps, isdot, t, TRIVIA_FLAG) end bump_trivia(ps) # Syntax Edition TODO: We'd like to call `down` here when @@ -743,7 +743,7 @@ function parse_arrow(ps::ParseState) # x <--> y ==> (call-i x <--> y) # x .--> y ==> (dotcall-i x --> y) # x -->₁ y ==> (call-i x -->₁ y) - bump_dotted(ps, isdot, remap_kind=K"Identifier") + bump_dotted(ps, isdot, t, remap_kind=K"Identifier") parse_arrow(ps) emit(ps, mark, isdot ? K"dotcall" : K"call", INFIX_FLAG) end @@ -771,7 +771,7 @@ function parse_lazy_cond(ps::ParseState, down, is_op, self) (isdot, t) = peek_dotted_op_token(ps) k = kind(t) if is_op(k) - bump_dotted(ps, isdot, TRIVIA_FLAG) + bump_dotted(ps, isdot, t, TRIVIA_FLAG) self(ps) emit(ps, mark, isdot ? 
dotted(k) : k, flags(t)) if isdot @@ -819,7 +819,7 @@ function parse_comparison(ps::ParseState, subtype_comparison=false) while ((isdot, t) = peek_dotted_op_token(ps); is_prec_comparison(t)) n_comparisons += 1 op_dotted = isdot - op_pos = bump_dotted(ps, isdot, emit_dot_node=true, remap_kind=K"Identifier") + op_pos = bump_dotted(ps, isdot, t, emit_dot_node=true, remap_kind=K"Identifier") parse_pipe_lt(ps) end if n_comparisons == 1 @@ -873,15 +873,16 @@ end function parse_range(ps::ParseState) mark = position(ps) parse_invalid_ops(ps) + (initial_dot, initial_tok) = peek_dotted_op_token(ps) initial_kind = kind(initial_tok) - if initial_kind != K":" && is_prec_colon(initial_kind) - # a..b ==> (call-i a .. b) + if initial_kind != K":" && (is_prec_colon(initial_kind) || (initial_dot && initial_kind == K".")) + # a..b ==> (call-i a (dots-2) b) # a … b ==> (call-i a … b) # a .… b ==> (dotcall-i a … b) - bump_dotted(ps, initial_dot, remap_kind=K"Identifier") + bump_dotted(ps, initial_dot, initial_tok, remap_kind=K"Identifier") parse_invalid_ops(ps) - emit(ps, mark, initial_dot ? K"dotcall" : K"call", INFIX_FLAG) + emit(ps, mark, (initial_dot && initial_kind != K".") ? K"dotcall" : K"call", INFIX_FLAG) elseif initial_kind == K":" && ps.range_colon_enabled # a ? b : c:d ==> (? a b (call-i c : d)) n_colons = 0 @@ -948,8 +949,10 @@ function parse_range(ps::ParseState) # x... ==> (... x) # x:y... ==> (... (call-i x : y)) # x..y... ==> (... (call-i x .. y)) # flisp parser fails here - if peek(ps) == K"..." + if peek(ps) == K"." && peek(ps, 2) == K"." && peek(ps, 3) == K"." bump(ps, TRIVIA_FLAG) + bump(ps, TRIVIA_FLAG) # second dot + bump(ps, TRIVIA_FLAG) # third dot emit(ps, mark, K"...") end end @@ -965,7 +968,7 @@ function parse_invalid_ops(ps::ParseState) parse_expr(ps) while ((isdot, t) = peek_dotted_op_token(ps); kind(t) in KSet"ErrorInvalidOperator Error**") bump_trivia(ps) - bump_dotted(ps, isdot) + bump_dotted(ps, isdot, t) parse_expr(ps) emit(ps, mark, isdot ? 
K"dotcall" : K"call", INFIX_FLAG) end @@ -1006,7 +1009,7 @@ function parse_with_chains(ps::ParseState, down, is_op, chain_ops) # [x+y + z] ==> (vect (call-i x + y z)) break end - bump_dotted(ps, isdot, remap_kind=K"Identifier") + bump_dotted(ps, isdot, t, remap_kind=K"Identifier") down(ps) if kind(t) in chain_ops && !is_suffixed(t) && !isdot # a + b + c ==> (call-i a + b c) @@ -1258,7 +1261,7 @@ function parse_unary(ps::ParseState) # # (The flisp parser only considers commas before `;` and thus gets this # last case wrong) - op_pos = bump_dotted(ps, op_dotted, emit_dot_node=true, remap_kind=K"Identifier") + op_pos = bump_dotted(ps, op_dotted, op_t, emit_dot_node=true, remap_kind=K"Identifier") space_before_paren = preceding_whitespace(t2) if space_before_paren @@ -1352,12 +1355,12 @@ function parse_unary(ps::ParseState) # -0x1 ==> (call-pre - 0x01) # - 2 ==> (call-pre - 2) # .-2 ==> (dotcall-pre - 2) - op_pos = bump_dotted(ps, op_dotted, remap_kind=K"Identifier") + op_pos = bump_dotted(ps, op_dotted, op_t, remap_kind=K"Identifier") else # /x ==> (call-pre (error /) x) # +₁ x ==> (call-pre (error +₁) x) # .<: x ==> (dotcall-pre (error (. <:)) x) - bump_dotted(ps, op_dotted, emit_dot_node=true, remap_kind=K"Identifier") + bump_dotted(ps, op_dotted, op_t, emit_dot_node=true, remap_kind=K"Identifier") op_pos = emit(ps, mark, K"error", error="not a unary operator") end parse_unary(ps) @@ -1388,7 +1391,7 @@ end function parse_factor_with_initial_ex(ps::ParseState, mark) parse_decl_with_initial_ex(ps, mark) if ((isdot, t) = peek_dotted_op_token(ps); is_prec_power(kind(t))) - bump_dotted(ps, isdot, remap_kind=K"Identifier") + bump_dotted(ps, isdot, t, remap_kind=K"Identifier") parse_factor_after(ps) emit(ps, mark, isdot ? K"dotcall" : K"call", INFIX_FLAG) end @@ -2476,11 +2479,11 @@ function parse_import_atsym(ps::ParseState, allow_quotes=true) end end b = peek_behind(ps, pos) - if warn_parens && b.orig_kind != K".." 
+ if warn_parens && b.kind != K"dots" emit_diagnostic(ps, mark, warning="parentheses are not required here") end ok = (b.is_leaf && (b.kind == K"Identifier" || is_operator(b.kind))) || - (!b.is_leaf && b.kind in KSet"$ var") + (!b.is_leaf && (b.kind in KSet"$ var" || b.kind == K"dots")) if !ok emit(ps, mark, K"error", error="expected identifier") end @@ -2589,10 +2592,6 @@ function parse_import_path(ps::ParseState) end if k == K"." bump(ps) - elseif k == K".." - bump_split(ps, (1,K".",EMPTY_FLAGS), (1,K".",EMPTY_FLAGS)) - elseif k == K"..." - bump_split(ps, (1,K".",EMPTY_FLAGS), (1,K".",EMPTY_FLAGS), (1,K".",EMPTY_FLAGS)) else break end @@ -2611,6 +2610,17 @@ function parse_import_path(ps::ParseState) # import A.⋆.f ==> (import (importpath A ⋆ f)) next_tok = peek_token(ps, 2) if is_operator(kind(next_tok)) + if kind(next_tok) == K"." && peek(ps, 3) == K"." + # Import the .. operator + # import A... ==> (import (importpath A (dots-2))) + bump_disallowed_space(ps) + bump(ps, TRIVIA_FLAG) + dotmark = position(ps) + bump(ps, TRIVIA_FLAG) + bump(ps, TRIVIA_FLAG) + emit(ps, dotmark, K"dots", set_numeric_flags(2)) + continue + end if preceding_whitespace(t) # Whitespace in import path allowed but discouraged # import A .== ==> (import (importpath A ==)) @@ -2623,10 +2633,6 @@ function parse_import_path(ps::ParseState) end bump(ps, TRIVIA_FLAG) parse_import_atsym(ps) - elseif k == K"..." - # Import the .. operator - # import A... ==> (import (importpath A ..)) - bump_split(ps, (1,K".",TRIVIA_FLAG), (2,K"..",EMPTY_FLAGS)) elseif k in KSet"NewlineWs ; , : EndMarker" # import A; B ==> (import (importpath A)) break @@ -3496,6 +3502,16 @@ function parse_atom(ps::ParseState, check_identifiers=true, has_unary_prefix=fal # . ==> (error .) emit(ps, mark, K"error", error="invalid identifier") end + elseif kind(leading_tok) == K"." && peek(ps, 2) == K"." && peek(ps, 3) == K"." + # ... 
+ bump(ps, TRIVIA_FLAG) + bump(ps, TRIVIA_FLAG) + bump(ps, TRIVIA_FLAG) + emit(ps, mark, K"dots", set_numeric_flags(3)) + if check_identifiers + # ... ==> (error ...) + emit(ps, mark, K"error", error="invalid identifier") + end elseif is_error(leading_kind) # Errors for bad tokens are emitted in validate_tokens() rather than # here. @@ -3583,9 +3599,9 @@ function parse_atom(ps::ParseState, check_identifiers=true, has_unary_prefix=fal @label is_operator # + ==> + # .+ ==> (. +) - bump_dotted(ps, leading_dot, emit_dot_node=true, remap_kind= + bump_dotted(ps, leading_dot, leading_tok, emit_dot_node=true, remap_kind= is_syntactic_operator(leading_kind) ? leading_kind : K"Identifier") - if check_identifiers && !is_valid_identifier(leading_kind) + if check_identifiers && !(is_valid_identifier(leading_kind) || (leading_dot && leading_kind == K".")) # += ==> (error (op= +)) # ? ==> (error ?) # .+= ==> (error (. (op= +))) diff --git a/src/julia/tokenize.jl b/src/julia/tokenize.jl index 2bd0f56d..37e40109 100644 --- a/src/julia/tokenize.jl +++ b/src/julia/tokenize.jl @@ -153,7 +153,6 @@ end function optakessuffix(k) (K"BEGIN_OPS" <= k <= K"END_OPS") && !( - k == K"..." || K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" || k == K"?" || k == K"<:" || @@ -165,7 +164,6 @@ function optakessuffix(k) k == K"≔" || k == K"⩴" || k == K":" || - k == K".." || k == K"$" || k == K"::" || k == K"where" || @@ -987,7 +985,7 @@ function lex_digit(l::Lexer, kind) pc,ppc = dpeekchar(l) if pc == '.' if ppc == '.' - # Number followed by K".." or K"..." + # Number followed by K"." return emit(l, kind) elseif kind === K"Float" # If we enter the function with kind == K"Float" then a '.' has been parsed. 
@@ -1166,23 +1164,19 @@ function lex_backslash(l::Lexer) end function lex_dot(l::Lexer) - if accept(l, '.') + if l.last_token == K"@" if accept(l, '.') - l.last_token == K"@" && return emit(l, K"Identifier") - return emit(l, K"...") - else - if is_dottable_operator_start_char(peekchar(l)) + if !accept(l, '.') && is_dottable_operator_start_char(peekchar(l)) readchar(l) return emit(l, K"ErrorInvalidOperator") - else - l.last_token == K"@" && return emit(l, K"Identifier") - return emit(l, K"..") end end - elseif Base.isdigit(peekchar(l)) + # Emit `.`, `..` and `...` as identifiers after `@` + emit(l, K"Identifier") + elseif l.last_token != K"." && Base.isdigit(peekchar(l)) + # Only start a numeric constant if the previous token wasn't a dot return lex_digit(l, K"Float") else - l.last_token == K"@" && return emit(l, K"Identifier") return emit(l, K".") end end diff --git a/test/expr.jl b/test/expr.jl index d7547848..dde93e34 100644 --- a/test/expr.jl +++ b/test/expr.jl @@ -14,6 +14,8 @@ @test parseatom(":(a)") == QuoteNode(:a) @test parseatom(":(:a)") == Expr(:quote, QuoteNode(:a)) @test parseatom(":(1+2)") == Expr(:quote, Expr(:call, :+, 1, 2)) + @test parseatom(":...") == QuoteNode(Symbol("...")) + @test parseatom(":(...)") == QuoteNode(Symbol("...")) # Compatibility hack for VERSION >= v"1.4" # https://github.com/JuliaLang/julia/pull/34077 @test parseatom(":true") == Expr(:quote, true) diff --git a/test/parser.jl b/test/parser.jl index 64ecc8ea..804d20a4 100644 --- a/test/parser.jl +++ b/test/parser.jl @@ -141,14 +141,14 @@ tests = [ "1:\n2" => "(call-i 1 : (error))" ], JuliaSyntax.parse_range => [ - "a..b" => "(call-i a .. b)" + "a..b" => "(call-i a (dots-2) b)" "a … b" => "(call-i a … b)" "a .… b" => "(dotcall-i a … b)" "[1 :a]" => "(hcat 1 (quote-: a))" "[1 2:3 :a]" => "(hcat 1 (call-i 2 : 3) (quote-: a))" "x..." => "(... x)" "x:y..." => "(... (call-i x : y))" - "x..y..." => "(... (call-i x .. y))" + "x..y..." => "(... 
(call-i x (dots-2) y))" ], JuliaSyntax.parse_invalid_ops => [ "a--b" => "(call-i a (ErrorInvalidOperator) b)" @@ -719,7 +719,7 @@ tests = [ "import A.:(+)" => "(import (importpath A (quote-: (parens +))))" "import A.==" => "(import (importpath A ==))" "import A.⋆.f" => "(import (importpath A ⋆ f))" - "import A..." => "(import (importpath A ..))" + "import A..." => "(import (importpath A (dots-2)))" "import A; B" => "(import (importpath A))" # Colons not allowed first in import paths # but are allowed in trailing components (#473) @@ -816,7 +816,7 @@ tests = [ "&&" => "(error &&)" "||" => "(error ||)" "." => "(error .)" - "..." => "(error ...)" + "..." => "(error (dots-3))" "+=" => "(error +=)" "-=" => "(error -=)" "*=" => "(error *=)" @@ -1143,7 +1143,7 @@ parsestmt_with_kind_tests = [ "a → b" => "(call-i a::Identifier →::Identifier b::Identifier)" "a < b < c" => "(comparison a::Identifier <::Identifier b::Identifier <::Identifier c::Identifier)" "a .<: b"=> "(dotcall-i a::Identifier <:::Identifier b::Identifier)" - "a .. b" => "(call-i a::Identifier ..::Identifier b::Identifier)" + "a .. 
b" => "(call-i a::Identifier (dots-2) b::Identifier)" "a : b" => "(call-i a::Identifier :::Identifier b::Identifier)" "-2^x" => "(call-pre -::Identifier (call-i 2::Integer ^::Identifier x::Identifier))" "-(2)" => "(call-pre -::Identifier (parens 2::Integer))" diff --git a/test/tokenize.jl b/test/tokenize.jl index 50891520..ab3800c9 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -155,7 +155,7 @@ end # testset end # testset @testset "issue 5, '..'" begin - @test kind.(collect(tokenize("1.23..3.21"))) == [K"Float",K"..",K"Float",K"EndMarker"] + @test kind.(collect(tokenize("1.23..3.21"))) == [K"Float",K".",K".",K"Float",K"EndMarker"] end @testset "issue 17, >>" begin @@ -712,10 +712,10 @@ end @test toks("1.#") == ["1."=>K"Float", "#"=>K"Comment"] # ellipses - @test toks("1..") == ["1"=>K"Integer", ".."=>K".."] - @test toks("1...") == ["1"=>K"Integer", "..."=>K"..."] - @test toks(".1..") == [".1"=>K"Float", ".."=>K".."] - @test toks("0x01..") == ["0x01"=>K"HexInt", ".."=>K".."] + @test toks("1..") == ["1"=>K"Integer", "."=>K".", "."=>K"."] + @test toks("1...") == ["1"=>K"Integer", "."=>K".", "."=>K".", "."=>K"."] + @test toks(".1..") == [".1"=>K"Float", "."=>K".", "."=>K"."] + @test toks("0x01..") == ["0x01"=>K"HexInt", "."=>K".", "."=>K"."] # Dotted operators and other dotted suffixes @test toks("1234 .+1") == ["1234"=>K"Integer", " "=>K"Whitespace", "."=>K".", "+"=>K"+", "1"=>K"Integer"] @@ -876,8 +876,9 @@ end @test toks("--") == ["--"=>K"ErrorInvalidOperator"] @test toks("1**2") == ["1"=>K"Integer", "**"=>K"Error**", "2"=>K"Integer"] @test toks("a<---b") == ["a"=>K"Identifier", "<---"=>K"ErrorInvalidOperator", "b"=>K"Identifier"] - @test toks("a..+b") == ["a"=>K"Identifier", "..+"=>K"ErrorInvalidOperator", "b"=>K"Identifier"] - @test toks("a..−b") == ["a"=>K"Identifier", "..−"=>K"ErrorInvalidOperator", "b"=>K"Identifier"] + # These used to test for invalid operators ..+ and ..−, but now .. 
is tokenized as two dots + @test toks("a..+b") == ["a"=>K"Identifier", "."=>K".", "."=>K".", "+"=>K"+", "b"=>K"Identifier"] + @test toks("a..−b") == ["a"=>K"Identifier", "."=>K".", "."=>K".", "−"=>K"-", "b"=>K"Identifier"] end @testset "hat suffix" begin From 4f953412a396d16d6cd4f28bb04143d0a17a6706 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 10 Jul 2025 20:28:16 +0000 Subject: [PATCH 2/2] Remove separate syntax heads for each operator This replaces all the specialized operator heads by a single K"Operator" head that encodes the precedence level in its flags (except for operators that are also used for non-operator purposes). The operators are already K"Identifier" in the final parse tree. There is very little reason to spend all of the extra effort separating them into separate heads only to undo this later. Moreover, I think it's actively misleading, because it makes people think that they can query things about an operator by looking at the head, which doesn't work for suffixed operators. Additionally, this removes the `op=` token, replacing it by two tokens, one K"Operator" with a special precedence level and one `=`. This then removes the last use of `bump_split` (since this PR is on top of #573). As a free bonus this prepares us for having compound assignment syntax for suffixed operators, which was infeasible in the flisp parser. That syntax change is not part of this PR but would be trivial (this PR makes it an explicit error). 
Fixes #334 --- docs/src/api.md | 1 - src/JuliaSyntax.jl | 7 +- src/core/parse_stream.jl | 7 +- src/integration/expr.jl | 36 +- src/julia/julia_parse_stream.jl | 60 +-- src/julia/kinds.jl | 902 ++++++-------------------------- src/julia/parser.jl | 149 +++--- src/julia/tokenize.jl | 306 +++++------ test/parser.jl | 50 +- test/parser_api.jl | 6 +- test/tokenize.jl | 91 ++-- 11 files changed, 541 insertions(+), 1074 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 5dfbec6e..b2440f01 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -101,7 +101,6 @@ JuliaSyntax.is_infix_op_call JuliaSyntax.is_prefix_op_call JuliaSyntax.is_postfix_op_call JuliaSyntax.is_dotted -JuliaSyntax.is_suffixed JuliaSyntax.is_decorated JuliaSyntax.numeric_flags ``` diff --git a/src/JuliaSyntax.jl b/src/JuliaSyntax.jl index da5861c0..0f930224 100644 --- a/src/JuliaSyntax.jl +++ b/src/JuliaSyntax.jl @@ -41,7 +41,11 @@ export SourceFile @_public source_line_range # Expression predicates, kinds and flags -export @K_str, kind +export @K_str, kind, PrecedenceLevel, PREC_NONE, PREC_ASSIGNMENT, + PREC_PAIRARROW, PREC_CONDITIONAL, PREC_ARROW, PREC_LAZYOR, PREC_LAZYAND, + PREC_COMPARISON, PREC_PIPE_LT, PREC_PIPE_GT, PREC_COLON, PREC_PLUS, + PREC_BITSHIFT, PREC_TIMES, PREC_RATIONAL, PREC_POWER, PREC_DECL, + PREC_WHERE, PREC_DOT, PREC_QUOTE, PREC_UNICODE_OPS, PREC_COMPOUND_ASSIGN, generic_operators_by_level @_public Kind @_public flags, @@ -53,7 +57,6 @@ export @K_str, kind is_prefix_op_call, is_postfix_op_call, is_dotted, - is_suffixed, is_decorated, numeric_flags, has_flags, diff --git a/src/core/parse_stream.jl b/src/core/parse_stream.jl index da4d70cc..7ba09901 100644 --- a/src/core/parse_stream.jl +++ b/src/core/parse_stream.jl @@ -45,7 +45,7 @@ kind(head::SyntaxHead) = head.kind Return the flag bits of a syntactic construct. 
Prefer to query these with the predicates `is_trivia`, `is_prefix_call`, `is_infix_op_call`, -`is_prefix_op_call`, `is_postfix_op_call`, `is_dotted`, `is_suffixed`, +`is_prefix_op_call`, `is_postfix_op_call`, `is_dotted`, `is_decorated`. Or extract numeric portion of the flags with `numeric_flags`. @@ -376,7 +376,10 @@ function _buffer_lookahead_tokens(lexer, lookahead) was_whitespace = is_whitespace(k) had_whitespace |= was_whitespace f = EMPTY_FLAGS - raw.suffix && (f |= SUFFIXED_FLAG) + if k == K"Operator" && raw.op_precedence != Tokenize.PREC_NONE + # Store operator precedence in numeric flags + f |= set_numeric_flags(Int(raw.op_precedence)) + end push!(lookahead, SyntaxToken(SyntaxHead(k, f), k, had_whitespace, raw.endbyte + 2)) token_count += 1 diff --git a/src/integration/expr.jl b/src/integration/expr.jl index 05e9e769..52f9e22f 100644 --- a/src/integration/expr.jl +++ b/src/integration/expr.jl @@ -341,20 +341,28 @@ end elseif k == K"dots" n = numeric_flags(flags(nodehead)) return n == 2 ? :(..) : :(...) - elseif k == K"op=" && length(args) == 3 - lhs = args[1] - op = args[2] - rhs = args[3] - headstr = string(args[2], '=') - retexpr.head = Symbol(headstr) - retexpr.args = Any[lhs, rhs] - elseif k == K".op=" && length(args) == 3 - lhs = args[1] - op = args[2] - rhs = args[3] - headstr = '.' * string(args[2], '=') - retexpr.head = Symbol(headstr) - retexpr.args = Any[lhs, rhs] + elseif k == K"op=" + if length(args) == 3 + lhs = args[1] + op = args[2] + rhs = args[3] + headstr = string(args[2], '=') + retexpr.head = Symbol(headstr) + retexpr.args = Any[lhs, rhs] + elseif length(args) == 1 + return Symbol(string(args[1], '=')) + end + elseif k == K".op=" + if length(args) == 3 + lhs = args[1] + op = args[2] + rhs = args[3] + headstr = '.' 
* string(args[2], '=') + retexpr.head = Symbol(headstr) + retexpr.args = Any[lhs, rhs] + else + return Symbol(string('.', args[1], '=')) + end elseif k == K"macrocall" if length(args) >= 2 a2 = args[2] diff --git a/src/julia/julia_parse_stream.jl b/src/julia/julia_parse_stream.jl index 80e2ed8e..21d4728f 100644 --- a/src/julia/julia_parse_stream.jl +++ b/src/julia/julia_parse_stream.jl @@ -1,7 +1,3 @@ -# Token flags - may be set for operator kinded tokens -# Operator has a suffix -const SUFFIXED_FLAG = RawFlags(1<<2) - # Set for K"call", K"dotcall" or any syntactic operator heads # Distinguish various syntaxes which are mapped to K"call" const PREFIX_CALL_FLAG = RawFlags(0<<3) @@ -110,15 +106,6 @@ Return true for postfix operator calls such as the `'ᵀ` call node parsed from """ is_postfix_op_call(x) = call_type_flags(x) == POSTFIX_OP_FLAG - -""" - is_suffixed(x) - -Return true for operators which have suffixes, such as `+₁` -""" -is_suffixed(x) = has_flags(x, SUFFIXED_FLAG) - - """ numeric_flags(x) @@ -164,7 +151,6 @@ function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true) str *= "-," end end - is_suffixed(head) && (str = str*"-suf") end str end @@ -262,45 +248,6 @@ function validate_tokens(stream::ParseStream) sort!(stream.diagnostics, by=first_byte) end -""" - bump_split(stream, token_spec1, [token_spec2 ...]) - -Bump the next token, splitting it into several pieces - -Tokens are defined by a number of `token_spec` of shape `(nbyte, kind, flags)`. -If all `nbyte` are positive, the sum must equal the token length. If one -`nbyte` is negative, that token is given `tok_len + nbyte` bytes and the sum of -all `nbyte` must equal zero. - -This is a hack which helps resolves the occasional lexing ambiguity. For -example -* Whether .+ should be a single token or the composite (. +) which is used for - standalone operators. -* Whether ... is splatting (most of the time) or three . tokens in import paths - -TODO: Are these the only cases? 
Can we replace this general utility with a -simpler one which only splits preceding dots? -""" -function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N} - tok = stream.lookahead[stream.lookahead_index] - stream.lookahead_index += 1 - start_b = _next_byte(stream) - toklen = tok.next_byte - start_b - prev_b = start_b - for (i, (nbyte, k, f)) in enumerate(split_spec) - h = SyntaxHead(k, f) - actual_nbyte = nbyte < 0 ? (toklen + nbyte) : nbyte - orig_k = k == K"." ? K"." : kind(tok) - node = RawGreenNode(h, actual_nbyte, orig_k) - push!(stream.output, node) - prev_b += actual_nbyte - stream.next_byte += actual_nbyte - end - @assert tok.next_byte == prev_b - stream.peek_count = 0 - return position(stream) -end - function peek_dotted_op_token(ps, allow_whitespace=false) # Peek the next token, but if it is a dot, peek the next one as well t = peek_token(ps) @@ -318,7 +265,12 @@ function peek_dotted_op_token(ps, allow_whitespace=false) t = t2 end end - return (isdotted, t) + isassign = false + if !allow_whitespace && is_operator(t) + t3 = peek_token(ps, 2+isdotted) + isassign = kind(t3) == K"=" && !preceding_whitespace(t3) + end + return (isdotted, isassign, t) end function bump_dotted(ps, isdot, t, flags=EMPTY_FLAGS; emit_dot_node=false, remap_kind=K"None") diff --git a/src/julia/kinds.jl b/src/julia/kinds.jl index 010ffb9b..3aa17ef9 100644 --- a/src/julia/kinds.jl +++ b/src/julia/kinds.jl @@ -181,7 +181,6 @@ Return the `Kind` of `x`. 
""" kind(k::Kind) = k - #------------------------------------------------------------------------------- # Kinds used by JuliaSyntax register_kinds!(JuliaSyntax, 0, [ @@ -193,6 +192,7 @@ register_kinds!(JuliaSyntax, 0, [ # Identifiers "BEGIN_IDENTIFIERS" "Identifier" + "Operator" "Placeholder" # Used for empty catch variables, and all-underscore identifiers in lowering "END_IDENTIFIERS" @@ -278,725 +278,56 @@ register_kinds!(JuliaSyntax, 0, [ "ErrorInvalidOperator" "Error**" - # Level 1 + # Various operators that have special parsing rules and thus get explicit heads. + # All other operators (including suffixed versions of these) are K"Operator". "BEGIN_ASSIGNMENTS" - "BEGIN_SYNTACTIC_ASSIGNMENTS" "=" ".=" - "op=" # Updating assignment operator ( $= %= &= *= += -= //= /= <<= >>= >>>= \= ^= |= ÷= ⊻= ) - ".op=" ":=" - "END_SYNTACTIC_ASSIGNMENTS" "~" "≔" "⩴" "≕" + # Compound assignments + "op=" + ".op=" "END_ASSIGNMENTS" - - "BEGIN_PAIRARROW" - "=>" - "END_PAIRARROW" - - # Level 2 - "BEGIN_CONDITIONAL" - "?" 
- "END_CONDITIONAL" - - # Level 3 - "BEGIN_ARROW" - "-->" - "<--" - "<-->" - "←" - "→" - "↔" - "↚" - "↛" - "↞" - "↠" - "↢" - "↣" - "↤" - "↦" - "↮" - "⇎" - "⇍" - "⇏" - "⇐" - "⇒" - "⇔" - "⇴" - "⇶" - "⇷" - "⇸" - "⇹" - "⇺" - "⇻" - "⇼" - "⇽" - "⇾" - "⇿" - "⟵" - "⟶" - "⟷" - "⟹" - "⟺" - "⟻" - "⟼" - "⟽" - "⟾" - "⟿" - "⤀" - "⤁" - "⤂" - "⤃" - "⤄" - "⤅" - "⤆" - "⤇" - "⤌" - "⤍" - "⤎" - "⤏" - "⤐" - "⤑" - "⤔" - "⤕" - "⤖" - "⤗" - "⤘" - "⤝" - "⤞" - "⤟" - "⤠" - "⥄" - "⥅" - "⥆" - "⥇" - "⥈" - "⥊" - "⥋" - "⥎" - "⥐" - "⥒" - "⥓" - "⥖" - "⥗" - "⥚" - "⥛" - "⥞" - "⥟" - "⥢" - "⥤" - "⥦" - "⥧" - "⥨" - "⥩" - "⥪" - "⥫" - "⥬" - "⥭" - "⥰" - "⧴" - "⬱" - "⬰" - "⬲" - "⬳" - "⬴" - "⬵" - "⬶" - "⬷" - "⬸" - "⬹" - "⬺" - "⬻" - "⬼" - "⬽" - "⬾" - "⬿" - "⭀" - "⭁" - "⭂" - "⭃" - "⥷" - "⭄" - "⥺" - "⭇" - "⭈" - "⭉" - "⭊" - "⭋" - "⭌" - "←" - "→" - "⇜" - "⇝" - "↜" - "↝" - "↩" - "↪" - "↫" - "↬" - "↼" - "↽" - "⇀" - "⇁" - "⇄" - "⇆" - "⇇" - "⇉" - "⇋" - "⇌" - "⇚" - "⇛" - "⇠" - "⇢" - "↷" - "↶" - "↺" - "↻" - "🢲" - "END_ARROW" - - # Level 4 - "BEGIN_LAZYOR" - "||" - ".||" - "END_LAZYOR" - - # Level 5 - "BEGIN_LAZYAND" - "&&" - ".&&" - "END_LAZYAND" - - # Level 6 - "BEGIN_COMPARISON" - "<:" - ">:" - ">" - "<" - ">=" - "≥" - "<=" - "≤" - "==" - "===" - "≡" - "!=" - "≠" - "!==" - "≢" - "∈" - "in" - "isa" - "∉" - "∋" - "∌" - "⊆" - "⊈" - "⊂" - "⊄" - "⊊" - "∝" - "∊" - "∍" - "∥" - "∦" - "∷" - "∺" - "∻" - "∽" - "∾" - "≁" - "≃" - "≂" - "≄" - "≅" - "≆" - "≇" - "≈" - "≉" - "≊" - "≋" - "≌" - "≍" - "≎" - "≐" - "≑" - "≒" - "≓" - "≖" - "≗" - "≘" - "≙" - "≚" - "≛" - "≜" - "≝" - "≞" - "≟" - "≣" - "≦" - "≧" - "≨" - "≩" - "≪" - "≫" - "≬" - "≭" - "≮" - "≯" - "≰" - "≱" - "≲" - "≳" - "≴" - "≵" - "≶" - "≷" - "≸" - "≹" - "≺" - "≻" - "≼" - "≽" - "≾" - "≿" - "⊀" - "⊁" - "⊃" - "⊅" - "⊇" - "⊉" - "⊋" - "⊏" - "⊐" - "⊑" - "⊒" - "⊜" - "⊩" - "⊬" - "⊮" - "⊰" - "⊱" - "⊲" - "⊳" - "⊴" - "⊵" - "⊶" - "⊷" - "⋍" - "⋐" - "⋑" - "⋕" - "⋖" - "⋗" - "⋘" - "⋙" - "⋚" - "⋛" - "⋜" - "⋝" - "⋞" - "⋟" - "⋠" - "⋡" - "⋢" - "⋣" - "⋤" - "⋥" - "⋦" - "⋧" - "⋨" - "⋩" - "⋪" - "⋫" - 
"⋬" - "⋭" - "⋲" - "⋳" - "⋴" - "⋵" - "⋶" - "⋷" - "⋸" - "⋹" - "⋺" - "⋻" - "⋼" - "⋽" - "⋾" - "⋿" - "⟈" - "⟉" - "⟒" - "⦷" - "⧀" - "⧁" - "⧡" - "⧣" - "⧤" - "⧥" - "⩦" - "⩧" - "⩪" - "⩫" - "⩬" - "⩭" - "⩮" - "⩯" - "⩰" - "⩱" - "⩲" - "⩳" - "⩵" - "⩶" - "⩷" - "⩸" - "⩹" - "⩺" - "⩻" - "⩼" - "⩽" - "⩾" - "⩿" - "⪀" - "⪁" - "⪂" - "⪃" - "⪄" - "⪅" - "⪆" - "⪇" - "⪈" - "⪉" - "⪊" - "⪋" - "⪌" - "⪍" - "⪎" - "⪏" - "⪐" - "⪑" - "⪒" - "⪓" - "⪔" - "⪕" - "⪖" - "⪗" - "⪘" - "⪙" - "⪚" - "⪛" - "⪜" - "⪝" - "⪞" - "⪟" - "⪠" - "⪡" - "⪢" - "⪣" - "⪤" - "⪥" - "⪦" - "⪧" - "⪨" - "⪩" - "⪪" - "⪫" - "⪬" - "⪭" - "⪮" - "⪯" - "⪰" - "⪱" - "⪲" - "⪳" - "⪴" - "⪵" - "⪶" - "⪷" - "⪸" - "⪹" - "⪺" - "⪻" - "⪼" - "⪽" - "⪾" - "⪿" - "⫀" - "⫁" - "⫂" - "⫃" - "⫄" - "⫅" - "⫆" - "⫇" - "⫈" - "⫉" - "⫊" - "⫋" - "⫌" - "⫍" - "⫎" - "⫏" - "⫐" - "⫑" - "⫒" - "⫓" - "⫔" - "⫕" - "⫖" - "⫗" - "⫘" - "⫙" - "⫷" - "⫸" - "⫹" - "⫺" - "⊢" - "⊣" - "⟂" - # ⫪,⫫ see https://github.com/JuliaLang/julia/issues/39350 - "⫪" - "⫫" - "END_COMPARISON" - - # Level 7 - "BEGIN_PIPE" - "<|" - "|>" - "END_PIPE" - - # Level 8 - "BEGIN_COLON" - ":" - "…" - "⁝" - "⋮" - "⋱" - "⋰" - "⋯" - "END_COLON" - - # Level 9 - "BEGIN_PLUS" - "\$" - "+" - "-" # also used for "−" - "++" - "⊕" - "⊖" - "⊞" - "⊟" - "|" - "∪" - "∨" - "⊔" - "±" - "∓" - "∔" - "∸" - "≏" - "⊎" - "⊻" - "⊽" - "⋎" - "⋓" - "⟇" - "⧺" - "⧻" - "⨈" - "⨢" - "⨣" - "⨤" - "⨥" - "⨦" - "⨧" - "⨨" - "⨩" - "⨪" - "⨫" - "⨬" - "⨭" - "⨮" - "⨹" - "⨺" - "⩁" - "⩂" - "⩅" - "⩊" - "⩌" - "⩏" - "⩐" - "⩒" - "⩔" - "⩖" - "⩗" - "⩛" - "⩝" - "⩡" - "⩢" - "⩣" - "¦" - "END_PLUS" - - # Level 10 - "BEGIN_TIMES" - "*" - "/" - "÷" - "%" - "⋅" # also used for lookalikes "·" and "·" - "∘" - "×" - "\\" - "&" - "∩" - "∧" - "⊗" - "⊘" - "⊙" - "⊚" - "⊛" - "⊠" - "⊡" - "⊓" - "∗" - "∙" - "∤" - "⅋" - "≀" - "⊼" - "⋄" - "⋆" - "⋇" - "⋉" - "⋊" - "⋋" - "⋌" - "⋏" - "⋒" - "⟑" - "⦸" - "⦼" - "⦾" - "⦿" - "⧶" - "⧷" - "⨇" - "⨰" - "⨱" - "⨲" - "⨳" - "⨴" - "⨵" - "⨶" - "⨷" - "⨸" - "⨻" - "⨼" - "⨽" - "⩀" - "⩃" - "⩄" - "⩋" - "⩍" - "⩎" - "⩑" - "⩓" - "⩕" - "⩘" - "⩚" - "⩜" - "⩞" - "⩟" - 
"⩠" - "⫛" - "⊍" - "▷" - "⨝" - "⟕" - "⟖" - "⟗" - "⌿" - "⨟" - "END_TIMES" - - # Level 11 - "BEGIN_RATIONAL" - "//" - "END_RATIONAL" - - # Level 12 - "BEGIN_BITSHIFTS" - "<<" - ">>" - ">>>" - "END_BITSHIFTS" - - # Level 13 - "BEGIN_POWER" - "^" - "↑" - "↓" - "⇵" - "⟰" - "⟱" - "⤈" - "⤉" - "⤊" - "⤋" - "⤒" - "⤓" - "⥉" - "⥌" - "⥍" - "⥏" - "⥑" - "⥔" - "⥕" - "⥘" - "⥙" - "⥜" - "⥝" - "⥠" - "⥡" - "⥣" - "⥥" - "⥮" - "⥯" - "↑" - "↓" - "END_POWER" - - # Level 14 - "BEGIN_DECL" - "::" - "END_DECL" - - # Level 15 - "BEGIN_WHERE" - "where" - "END_WHERE" - - # Level 16 - "BEGIN_DOT" - "." - "END_DOT" - - "!" - "'" - ".'" - "->" - - "BEGIN_UNICODE_OPS" - "¬" - "√" - "∛" - "∜" - "END_UNICODE_OPS" + "?" # ternary operator + "||" # not an operator call + ".||" # dotted of above (not emitted by lexer) + "&&" # not an operator call + ".&&" # dotted of above (not emitted by lexer) + "<:" # subtype syntax + ">:" # supertype syntax + "::" # field type syntax + "." # various dot syntax + ".." # .. operator (not emitted by lexer) + "in" # iteration syntax + "isa" + "where" + "!" # syntactic unary + "'" # special postfix parsing + ".'" # special postfix parsing + "->" # syntactic arrow + "-->" # syntactic arrow + ":" # used for quoting + "+" # used in numeric constants + "++" # special chaining syntax + "*" # special chaining syntax + "<" # recovery path for :< + ">" # recovery path for :> + "\$" # interpolation + "-" # negated constants + "&" # syntactic unary + "∈" # iteration syntax + # all syntactic unary + "⋆" + "±" + "∓" + "¬" + "√" + "∛" + "∜" "END_OPS" # 2. 
Nonterminals which are exposed in the AST, but where the surface @@ -1074,6 +405,109 @@ register_kinds!(JuliaSyntax, 0, [ "END_ERRORS" ]) +@enum PrecedenceLevel begin + PREC_NONE + PREC_ASSIGNMENT + PREC_PAIRARROW + PREC_CONDITIONAL + PREC_ARROW + PREC_LAZYOR + PREC_LAZYAND + PREC_COMPARISON + PREC_PIPE_LT + PREC_PIPE_GT + PREC_COLON + PREC_PLUS + PREC_BITSHIFT + PREC_TIMES + PREC_RATIONAL + PREC_POWER + PREC_DECL + PREC_WHERE + PREC_DOT + PREC_QUOTE + PREC_UNICODE_OPS + # Special precendence to only allow compound assignment for designated operators, for + # compatibility with flisp + PREC_COMPOUND_ASSIGN +end + +const generic_operators_by_level = Dict{PrecedenceLevel, Vector{Char}}( + PREC_ASSIGNMENT => Char[#= = .= := ~ ≔ ⩴ ≕ =#], + PREC_PAIRARROW => Char[#= => =#], + PREC_CONDITIONAL => Char[#= ? =#], + PREC_ARROW => + [#= -> --> <-- <--> =# + '←', '→', '↔', '↚', '↛', '↞', '↠', '↢', + '↣', '↤', '↦', '↮', '⇎', '⇍', '⇏', '⇐', '⇒', '⇔', '⇴', + '⇶', '⇷', '⇸', '⇹', '⇺', '⇻', '⇼', '⇽', '⇾', '⇿', '⟵', + '⟶', '⟷', '⟹', '⟺', '⟻', '⟼', '⟽', '⟾', '⟿', '⤀', '⤁', + '⤂', '⤃', '⤄', '⤅', '⤆', '⤇', '⤌', '⤍', '⤎', '⤏', '⤐', '⤑', + '⤔', '⤕', '⤖', '⤗', '⤘', '⤝', '⤞', '⤟', '⤠', '⥄', '⥅', '⥆', + '⥇', '⥈', '⥊', '⥋', '⥎', '⥐', '⥒', '⥓', '⥖', '⥗', '⥚', '⥛', + '⥞', '⥟', '⥢', '⥤', '⥦', '⥧', '⥨', '⥩', '⥪', '⥫', '⥬', '⥭', + '⥰', '⧴', '⬱', '⬰', '⬲', '⬳', '⬴', '⬵', '⬶', '⬷', '⬸', '⬹', + '⬺', '⬻', '⬼', '⬽', '⬾', '⬿', '⭀', '⭁', '⭂', '⭃', '⥷', '⭄', + '⥺', '⭇', '⭈', '⭉', '⭊', '⭋', '⭌', '←', '→', '⇜', '⇝', '↜', '↝', + '↩', '↪', '↫', '↬', '↼', '↽', '⇀', '⇁', '⇄', '⇆', '⇇', '⇉', '⇋', + '⇌', '⇚', '⇛', '⇠', '⇢', '↷', '↶', '↺', '↻', '🢲'], + PREC_LAZYOR => Char[#= || =#], + PREC_LAZYAND => Char[#= && =#], + PREC_COMPARISON => + [#= <: >: in isa < > ∈ == != !== =# + '≥', '≤', '≡', '≠', '≢', '∉', '∋', + '∌', '⊆', '⊈', '⊂', '⊄', '⊊', '∝', '∊', '∍', '∥', '∦', + '∷', '∺', '∻', '∽', '∾', '≁', '≃', '≂', '≄', '≅', '≆', + '≇', '≈', '≉', '≊', '≋', '≌', '≍', '≎', '≐', '≑', '≒', + '≓', '≖', '≗', '≘', '≙', '≚', 
'≛', '≜', '≝', '≞', '≟', + '≣', '≦', '≧', '≨', '≩', '≪', '≫', '≬', '≭', '≮', '≯', + '≰', '≱', '≲', '≳', '≴', '≵', '≶', '≷', '≸', '≹', '≺', + '≻', '≼', '≽', '≾', '≿', '⊀', '⊁', '⊃', '⊅', '⊇', '⊉', + '⊋', '⊏', '⊐', '⊑', '⊒', '⊜', '⊩', '⊬', '⊮', '⊰', '⊱', + '⊲', '⊳', '⊴', '⊵', '⊶', '⊷', '⋍', '⋐', '⋑', '⋕', '⋖', + '⋗', '⋘', '⋙', '⋚', '⋛', '⋜', '⋝', '⋞', '⋟', '⋠', '⋡', + '⋢', '⋣', '⋤', '⋥', '⋦', '⋧', '⋨', '⋩', '⋪', '⋫', '⋬', + '⋭', '⋲', '⋳', '⋴', '⋵', '⋶', '⋷', '⋸', '⋹', '⋺', '⋻', + '⋼', '⋽', '⋾', '⋿', '⟈', '⟉', '⟒', '⦷', '⧀', '⧁', '⧡', + '⧣', '⧤', '⧥', '⩦', '⩧', '⩪', '⩫', '⩬', '⩭', '⩮', '⩯', + '⩰', '⩱', '⩲', '⩳', '⩵', '⩶', '⩷', '⩸', '⩹', '⩺', '⩻', + '⩼', '⩽', '⩾', '⩿', '⪀', '⪁', '⪂', '⪃', '⪄', '⪅', '⪆', '⪇', + '⪈', '⪉', '⪊', '⪋', '⪌', '⪍', '⪎', '⪏', '⪐', '⪑', '⪒', '⪓', + '⪔', '⪕', '⪖', '⪗', '⪘', '⪙', '⪚', '⪛', '⪜', '⪝', '⪞', '⪟', + '⪠', '⪡', '⪢', '⪣', '⪤', '⪥', '⪦', '⪧', '⪨', '⪩', '⪪', + '⪫', '⪬', '⪭', '⪮', '⪯', '⪰', '⪱', '⪲', '⪳', '⪴', '⪵', + '⪶', '⪷', '⪸', '⪹', '⪺', '⪻', '⪼', '⪽', '⪾', '⪿', '⫀', + '⫁', '⫂', '⫃', '⫄', '⫅', '⫆', '⫇', '⫈', '⫉', '⫊', '⫋', + '⫌', '⫍', '⫎', '⫏', '⫐', '⫑', '⫒', '⫓', '⫔', '⫕', '⫖', + '⫗', '⫘', '⫙', '⫷', '⫸', '⫹', '⫺', '⊢', '⊣', '⟂', '⫪', '⫫'], + PREC_PIPE_LT => Char[#= <| =#], + PREC_PIPE_GT => Char[#= |> =#], + PREC_COLON => [ #= : .. 
=# '…', '⁝', '⋮', '⋱', '⋰', '⋯'], + PREC_PLUS => + [ #= + - ± ∓ ++ =# + '⊕', '⊖', '⊞', '⊟', '|', '∪', '∨', + '⊔', '±', '∓', '∔', '∸', '≏', '⊎', '⊻', '⊽', '⋎', '⋓', '⟇', '⧺', + '⧻', '⨈', '⨢', '⨣', '⨤', '⨥', '⨦', '⨧', '⨨', '⨩', '⨪', '⨫', '⨬', '⨭', + '⨮', '⨹', '⨺', '⩁', '⩂', '⩅', '⩊', '⩌', '⩏', '⩐', '⩒', '⩔', '⩖', '⩗', + '⩛', '⩝', '⩡', '⩢', '⩣', '¦'], + PREC_TIMES => + [ #= * ⋆ & =# + '/', '÷', '%', '⋅', '·', '·', '∘', '×', '\\', '∩', '∧', '⊗', + '⊘', '⊙', '⊚', '⊛', '⊠', '⊡', '⊓', '∗', '∙', '∤', '⅋', '≀', '⊼', '⋄', '⋆', + '⋇', '⋉', '⋊', '⋋', '⋌', '⋏', '⋒', '⟑', '⦸', '⦼', '⦾', '⦿', '⧶', '⧷', + '⨇', '⨰', '⨱', '⨲', '⨳', '⨴', '⨵', '⨶', '⨷', '⨸', '⨻', '⨼', '⨽', '⩀', + '⩃', '⩄', '⩋', '⩍', '⩎', '⩑', '⩓', '⩕', '⩘', '⩚', '⩜', '⩞', '⩟', '⩠', + '⫛', '⊍', '▷', '⨝', '⟕', '⟖', '⟗', '⌿', '⨟', + '\u00b7', # '·' Middle Dot + '\u0387' # '·' Greek Ano Teleia + ], + PREC_RATIONAL => Char[#= // =#], + PREC_BITSHIFT => Char[#= << >> >>> =#], + PREC_POWER => ['^', '↑', '↓', '⇵', '⟰', '⟱', '⤈', '⤉', '⤊', '⤋', '⤒', '⤓', '⥉', + '⥌', '⥍', '⥏', '⥑', '⥔', '⥕', '⥘', '⥙', '⥜', '⥝', '⥠', '⥡', '⥣', '⥥', + '⥮', '⥯', '↑', '↓'], +) + #------------------------------------------------------------------------------- const _nonunique_kind_names = Set([ K"Comment" @@ -1157,7 +591,7 @@ is_keyword(k::Kind) = K"BEGIN_KEYWORDS" <= k <= K"END_KEYWORDS" is_block_continuation_keyword(k::Kind) = K"BEGIN_BLOCK_CONTINUATION_KEYWORDS" <= k <= K"END_BLOCK_CONTINUATION_KEYWORDS" is_literal(k::Kind) = K"BEGIN_LITERAL" <= k <= K"END_LITERAL" is_number(k::Kind) = K"BEGIN_NUMBERS" <= k <= K"END_NUMBERS" -is_operator(k::Kind) = K"BEGIN_OPS" <= k <= K"END_OPS" +is_operator(k::Kind) = k == K"Operator" || K"BEGIN_OPS" <= k <= K"END_OPS" is_word_operator(k::Kind) = (k == K"in" || k == K"isa" || k == K"where") is_identifier(x) = is_identifier(kind(x)) @@ -1172,28 +606,30 @@ is_word_operator(x) = is_word_operator(kind(x)) # Predicates for operator precedence # FIXME: Review how precedence depends on dottedness, eg # 
https://github.com/JuliaLang/julia/pull/36725 + + is_prec_assignment(x) = K"BEGIN_ASSIGNMENTS" <= kind(x) <= K"END_ASSIGNMENTS" -is_prec_pair(x) = K"BEGIN_PAIRARROW" <= kind(x) <= K"END_PAIRARROW" -is_prec_conditional(x) = K"BEGIN_CONDITIONAL" <= kind(x) <= K"END_CONDITIONAL" -is_prec_arrow(x) = K"BEGIN_ARROW" <= kind(x) <= K"END_ARROW" -is_prec_lazy_or(x) = K"BEGIN_LAZYOR" <= kind(x) <= K"END_LAZYOR" -is_prec_lazy_and(x) = K"BEGIN_LAZYAND" <= kind(x) <= K"END_LAZYAND" -is_prec_comparison(x) = K"BEGIN_COMPARISON" <= kind(x) <= K"END_COMPARISON" -is_prec_pipe(x) = K"BEGIN_PIPE" <= kind(x) <= K"END_PIPE" -is_prec_colon(x) = K"BEGIN_COLON" <= kind(x) <= K"END_COLON" -is_prec_plus(x) = K"BEGIN_PLUS" <= kind(x) <= K"END_PLUS" -is_prec_bitshift(x) = K"BEGIN_BITSHIFTS" <= kind(x) <= K"END_BITSHIFTS" -is_prec_times(x) = K"BEGIN_TIMES" <= kind(x) <= K"END_TIMES" -is_prec_rational(x) = K"BEGIN_RATIONAL" <= kind(x) <= K"END_RATIONAL" -is_prec_power(x) = K"BEGIN_POWER" <= kind(x) <= K"END_POWER" -is_prec_decl(x) = K"BEGIN_DECL" <= kind(x) <= K"END_DECL" -is_prec_where(x) = K"BEGIN_WHERE" <= kind(x) <= K"END_WHERE" -is_prec_dot(x) = K"BEGIN_DOT" <= kind(x) <= K"END_DOT" -is_prec_unicode_ops(x) = K"BEGIN_UNICODE_OPS" <= kind(x) <= K"END_UNICODE_OPS" -is_prec_pipe_lt(x) = kind(x) == K"<|" -is_prec_pipe_gt(x) = kind(x) == K"|>" +is_prec_pair(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_PAIRARROW)) +is_prec_conditional(x) = kind(x) == K"?" 
+is_prec_arrow(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_ARROW)) || kind(x) == K"-->" +is_prec_lazy_or(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_LAZYOR)) || kind(x) in KSet"||" +is_prec_lazy_and(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_LAZYAND)) || kind(x) in KSet"&&" +is_prec_comparison(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_COMPARISON)) || kind(x) in KSet"<: >: in isa < > ∈" +is_prec_pipe_lt(x) = kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_PIPE_LT) +is_prec_pipe_gt(x) = kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_PIPE_GT) +is_prec_pipe(x) = is_prec_pipe_lt(x) || is_prec_pipe_gt(x) +is_prec_colon(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_COLON)) || kind(x) == K".." +is_prec_plus(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_PLUS)) || kind(x) in KSet"+ - ± ∓" +is_prec_bitshift(x) = kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_BITSHIFT) +is_prec_times(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_TIMES)) || kind(x) in KSet"* ⋆ &" +is_prec_rational(x) = kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_RATIONAL) +is_prec_power(x) = kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_POWER) +is_prec_decl(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_DECL)) || kind(x) == K"::" +is_prec_where(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_WHERE)) || kind(x) == K"where" +is_prec_dot(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_DOT)) || kind(x) == K"." 
+is_prec_quote(x) = (kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_QUOTE)) || kind(x) == K"'" is_syntax_kind(x) = K"BEGIN_SYNTAX_KINDS"<= kind(x) <= K"END_SYNTAX_KINDS" -is_syntactic_assignment(x) = K"BEGIN_SYNTACTIC_ASSIGNMENTS" <= kind(x) <= K"END_SYNTACTIC_ASSIGNMENTS" +is_prec_compound_assign(x) = kind(x) == K"Operator" && numeric_flags(head(x)) == Int(PREC_COMPOUND_ASSIGN) function is_string_delim(x) kind(x) in (K"\"", K"\"\"\"") @@ -1217,5 +653,5 @@ function is_syntactic_operator(x) # in the parser? The lexer itself usually disallows such tokens, so it's # not clear whether we need to handle them. (Though note `.->` is a # token...) - return k in KSet"&& || . ... ->" || is_syntactic_assignment(k) + return k in KSet"&& || . ... -> = :=" end diff --git a/src/julia/parser.jl b/src/julia/parser.jl index e347a5d0..201cdcb0 100644 --- a/src/julia/parser.jl +++ b/src/julia/parser.jl @@ -101,10 +101,6 @@ function bump_glue(ps::ParseState, args...; kws...) bump_glue(ps.stream, args...; kws...) end -function bump_split(ps::ParseState, args...; kws...) - bump_split(ps.stream, args...; kws...) -end - function reset_node!(ps::ParseState, args...; kws...) reset_node!(ps.stream, args...; kws...) end @@ -221,9 +217,7 @@ end # # All these take either a raw kind or a token. -function is_plain_equals(t) - kind(t) == K"=" && !is_suffixed(t) -end +is_plain_equals(t) = kind(t) == K"=" function is_closing_token(ps::ParseState, k) k = kind(k) @@ -274,8 +268,10 @@ function is_block_form(k) abstract primitive struct try module" end -function is_syntactic_unary_op(k) - kind(k) in KSet"$ & ::" +function is_syntactic_unary_op(x) + # $, & and :: are syntactic unary operators + k = kind(x) + return k in KSet":: $ &" end function is_type_operator(t, isdot) @@ -284,20 +280,14 @@ end function is_unary_op(t, isdot) k = kind(t) - !is_suffixed(t) && ( - (k in KSet"<: >:" && !isdot) || - k in KSet"+ - ! 
~ ¬ √ ∛ ∜ ⋆ ± ∓" # dotop allowed - ) + (k in KSet"<: >:" && !isdot) || + k in KSet"+ - ! ~ ¬ √ ∛ ∜ ⋆ ± ∓" # dotop allowed end # Operators that are both unary and binary function is_both_unary_and_binary(t, isdot) k = kind(t) - # Preventing is_suffixed here makes this consistent with the flisp parser. - # But is this by design or happenstance? - !is_suffixed(t) && ( - k in KSet"+ - ⋆ ± ∓" || (k in KSet"$ & ~" && !isdot) - ) + k in KSet"+ - ⋆ ± ∓" || (k in KSet"$ & ~" && !isdot) end function is_string_macro_suffix(k) @@ -353,8 +343,8 @@ function parse_LtoR(ps::ParseState, down, is_op) mark = position(ps) down(ps) while true - isdot, tk = peek_dotted_op_token(ps) - is_op(tk) || break + isdot, isassign, tk = peek_dotted_op_token(ps) + (is_op(tk) && !isassign) || break isdot && bump(ps, TRIVIA_FLAG) # TODO: NOTATION_FLAG bump(ps, remap_kind=K"Identifier") down(ps) @@ -369,8 +359,8 @@ end function parse_RtoL(ps::ParseState, down, is_op, self) mark = position(ps) down(ps) - isdot, tk = peek_dotted_op_token(ps) - if is_op(tk) + isdot, isassign, tk = peek_dotted_op_token(ps) + if is_op(tk) && !isassign bump_dotted(ps, isdot, tk, remap_kind=K"Identifier") self(ps) emit(ps, mark, isdot ? 
K"dotcall" : K"call", INFIX_FLAG) @@ -581,11 +571,13 @@ function parse_assignment(ps::ParseState, down) end function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where {T} # where => specialize on `down` - isdot, t = peek_dotted_op_token(ps) + isdot, is_compound_assignment, t = peek_dotted_op_token(ps) k = kind(t) - if !is_prec_assignment(k) + + if !is_prec_assignment(t) && !is_compound_assignment return end + if k == K"~" if ps.space_sensitive && preceding_whitespace(t) && !preceding_whitespace(peek_token(ps, 2)) # Unary ~ in space sensitive context is not assignment precedence @@ -608,14 +600,18 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where { # a += b ==> (+= a b) # a .= b ==> (.= a b) is_short_form_func = k == K"=" && !isdot && was_eventually_call(ps) - if k == K"op=" + if is_compound_assignment # x += y ==> (op= x + y) # x .+= y ==> (.op= x + y) bump_trivia(ps) - isdot && bump(ps, TRIVIA_FLAG) # TODO: NOTATION_FLAG - bump_split(ps, - (-1, K"Identifier", EMPTY_FLAGS), # op - (1, K"=", TRIVIA_FLAG)) + opmark = position(ps) + bump_dotted(ps, isdot, t, remap_kind=K"Identifier") + if is_compound_assignment && !is_prec_compound_assign(t) + emit(ps, opmark, K"error", + error="Compound assignment is not allowed for this operator") + end + bump(ps, TRIVIA_FLAG) # bump the = + k = K"op=" # Set k for the emit below else bump_dotted(ps, isdot, t, TRIVIA_FLAG) end @@ -730,10 +726,10 @@ end function parse_arrow(ps::ParseState) mark = position(ps) parse_or(ps) - isdot, t = peek_dotted_op_token(ps) + isdot, isassign, t = peek_dotted_op_token(ps) k = kind(t) - if is_prec_arrow(k) - if kind(t) == K"-->" && !isdot && !is_suffixed(t) + if is_prec_arrow(t) + if kind(t) == K"-->" && !isdot # x --> y ==> (--> x y) # The only syntactic arrow bump(ps, TRIVIA_FLAG) parse_arrow(ps) @@ -768,9 +764,9 @@ end function parse_lazy_cond(ps::ParseState, down, is_op, self) mark = position(ps) down(ps) - (isdot, t) = peek_dotted_op_token(ps) + 
(isdot, isassign, t) = peek_dotted_op_token(ps) k = kind(t) - if is_op(k) + if is_op(t) bump_dotted(ps, isdot, t, TRIVIA_FLAG) self(ps) emit(ps, mark, isdot ? dotted(k) : k, flags(t)) @@ -815,8 +811,8 @@ function parse_comparison(ps::ParseState, subtype_comparison=false) n_comparisons = 0 op_pos = NO_POSITION op_dotted = false - (initial_dot, initial_tok) = peek_dotted_op_token(ps) - while ((isdot, t) = peek_dotted_op_token(ps); is_prec_comparison(t)) + (initial_dot, initial_isassign, initial_tok) = peek_dotted_op_token(ps) + while ((isdot, isassign, t) = peek_dotted_op_token(ps); is_prec_comparison(t)) n_comparisons += 1 op_dotted = isdot op_pos = bump_dotted(ps, isdot, t, emit_dot_node=true, remap_kind=K"Identifier") @@ -874,9 +870,9 @@ function parse_range(ps::ParseState) mark = position(ps) parse_invalid_ops(ps) - (initial_dot, initial_tok) = peek_dotted_op_token(ps) + (initial_dot, initial_isassign, initial_tok) = peek_dotted_op_token(ps) initial_kind = kind(initial_tok) - if initial_kind != K":" && (is_prec_colon(initial_kind) || (initial_dot && initial_kind == K".")) + if initial_kind != K":" && (is_prec_colon(initial_tok) || (initial_dot && initial_kind == K".")) # a..b ==> (call-i a (dots-2) b) # a … b ==> (call-i a … b) # a .… b ==> (dotcall-i a … b) @@ -966,7 +962,7 @@ end function parse_invalid_ops(ps::ParseState) mark = position(ps) parse_expr(ps) - while ((isdot, t) = peek_dotted_op_token(ps); kind(t) in KSet"ErrorInvalidOperator Error**") + while ((isdot, isassign, t) = peek_dotted_op_token(ps); kind(t) in KSet"ErrorInvalidOperator Error**") bump_trivia(ps) bump_dotted(ps, isdot, t) parse_expr(ps) @@ -996,7 +992,7 @@ end function parse_with_chains(ps::ParseState, down, is_op, chain_ops) mark = position(ps) down(ps) - while ((isdot, t) = peek_dotted_op_token(ps); is_op(kind(t))) + while ((isdot, isassign, t) = peek_dotted_op_token(ps); is_op(t) && !isassign) if ps.space_sensitive && preceding_whitespace(t) && is_both_unary_and_binary(t, isdot) && 
!preceding_whitespace(peek_token(ps, 2)) @@ -1011,7 +1007,7 @@ function parse_with_chains(ps::ParseState, down, is_op, chain_ops) end bump_dotted(ps, isdot, t, remap_kind=K"Identifier") down(ps) - if kind(t) in chain_ops && !is_suffixed(t) && !isdot + if kind(t) in chain_ops && !isdot # a + b + c ==> (call-i a + b c) # a + b .+ c ==> (dotcall-i (call-i a + b) + c) parse_chain(ps, down, kind(t)) @@ -1027,8 +1023,8 @@ end # flisp: parse-chain function parse_chain(ps::ParseState, down, op_kind) while true - isdot, t = peek_dotted_op_token(ps) - if kind(t) != op_kind || is_suffixed(t) || isdot + isdot, isassign, t = peek_dotted_op_token(ps) + if kind(t) != op_kind || isdot break end if ps.space_sensitive && preceding_whitespace(t) && @@ -1145,7 +1141,8 @@ function parse_juxtapose(ps::ParseState) is_syntactic_unary_op(prev_k) || is_initial_reserved_word(ps, prev_k) ))) && (!is_operator(k) || is_radical_op(k)) && - !is_closing_token(ps, k) + !is_closing_token(ps, k) && k != K"..." && + k != K"ErrorInvalidOperator" && k != K"Error**" if prev_k == K"string" || is_string_delim(t) bump_invisible(ps, K"error", TRIVIA_FLAG, error="cannot juxtapose string literal") @@ -1194,7 +1191,7 @@ end function parse_unary(ps::ParseState) mark = position(ps) bump_trivia(ps) - (op_dotted, op_t) = peek_dotted_op_token(ps) + (op_dotted, op_isassign, op_t) = peek_dotted_op_token(ps) op_k = kind(op_t) if ( !is_operator(op_k) || @@ -1212,12 +1209,12 @@ function parse_unary(ps::ParseState) end t2 = peek_token(ps, 2+op_dotted) k2 = kind(t2) - if op_k in KSet"- +" && !is_suffixed(op_t) && !op_dotted + if op_k in KSet"- +" && !op_dotted if !preceding_whitespace(t2) && (k2 in KSet"Integer Float Float32" || (op_k == K"+" && k2 in KSet"BinInt HexInt OctInt")) - k3 = peek(ps, 3) - if is_prec_power(k3) || k3 in KSet"[ {" + t3 = peek_token(ps, 3) + if is_prec_power(t3) || kind(t3) in KSet"[ {" # `[`, `{` (issue #18851) and `^` have higher precedence than # unary negation # -2^x ==> (call-pre - (call-i 2 ^ 
x)) @@ -1390,7 +1387,7 @@ end # flisp: parse-factor-with-initial-ex function parse_factor_with_initial_ex(ps::ParseState, mark) parse_decl_with_initial_ex(ps, mark) - if ((isdot, t) = peek_dotted_op_token(ps); is_prec_power(kind(t))) + if ((isdot, isassign, t) = peek_dotted_op_token(ps); is_prec_power(t) && !isassign) bump_dotted(ps, isdot, t, remap_kind=K"Identifier") parse_factor_after(ps) emit(ps, mark, isdot ? K"dotcall" : K"call", INFIX_FLAG) @@ -1462,7 +1459,7 @@ end # flisp: parse-unary-prefix function parse_unary_prefix(ps::ParseState, has_unary_prefix=false) mark = position(ps) - (isdot, t) = peek_dotted_op_token(ps) + (isdot, isassign, t) = peek_dotted_op_token(ps) k = kind(t) if is_syntactic_unary_op(k) && !isdot k2 = peek(ps, 2) @@ -1754,7 +1751,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false) maybe_strmac_1 = true emit(ps, mark, K".") end - elseif k == K"'" && !preceding_whitespace(t) + elseif is_prec_quote(t) && !preceding_whitespace(t) # f' ==> (call-post f ') # f'ᵀ ==> (call-post f 'ᵀ) bump(ps, remap_kind=K"Identifier") @@ -2149,15 +2146,28 @@ end function parse_global_local_const_vars(ps) mark = position(ps) n_commas = parse_comma(ps, false) - (isdot, t) = peek_dotted_op_token(ps) - if is_prec_assignment(t) + (isdot, isassign, t) = peek_dotted_op_token(ps) + + # Check if we have operator followed by = + is_compound_assignment = false + if is_operator(kind(t)) && !is_prec_assignment(t) + # Look ahead to see if next token is = + # For dotted operators like .+, we need to check token 3 + peek_pos = isdot ? 
3 : 2 + t2 = peek_token(ps, peek_pos) + if kind(t2) == K"=" && !preceding_whitespace(t2) + is_compound_assignment = true + end + end + + if is_prec_assignment(t) || is_compound_assignment if n_commas >= 1 # const x,y = 1,2 ==> (const (= (tuple x y) (tuple 1 2))) emit(ps, mark, K"tuple") end # const x = 1 ==> (const (= x 1)) # global x ~ 1 ==> (global (call-i x ~ 1)) - # global x += 1 ==> (global (+= x 1)) + # global x += 1 ==> (global (op= x + 1)) parse_assignment_with_initial_ex(ps, mark, parse_comma) else # global x,y ==> (global x y) @@ -2235,7 +2245,7 @@ function parse_function_signature(ps::ParseState, is_function::Bool) # function (:)() end ==> (function (call (parens :)) (block)) # function (x::T)() end ==> (function (call (parens (::-i x T))) (block)) # function (::T)() end ==> (function (call (parens (::-pre T))) (block)) - # function (:*=(f))() end ==> (function (call (parens (call (quote-: *=) f))) (block)) + # function (:*=(f))() end ==> (function (call (parens (call (quote-: (op= *)) f))) (block)) emit(ps, mark, K"parens", PARENS_FLAG) end end @@ -3088,13 +3098,13 @@ function parse_paren(ps::ParseState, check_identifiers=true, has_unary_prefix=fa @check peek(ps) == K"(" bump(ps, TRIVIA_FLAG) # K"(" after_paren_mark = position(ps) - (isdot, tok) = peek_dotted_op_token(ps) + (isdot, isassign, tok) = peek_dotted_op_token(ps) k = kind(tok) if k == K")" # () ==> (tuple-p) bump(ps, TRIVIA_FLAG) emit(ps, mark, K"tuple", PARENS_FLAG) - elseif is_syntactic_operator(k) + elseif is_syntactic_operator(k) || isassign # allow :(=) etc in unchecked contexts, eg quotes # :(=) ==> (quote-: (parens =)) parse_atom(ps, check_identifiers) @@ -3492,7 +3502,7 @@ end function parse_atom(ps::ParseState, check_identifiers=true, has_unary_prefix=false) bump_trivia(ps) mark = position(ps) - (leading_dot, leading_tok) = peek_dotted_op_token(ps) + (leading_dot, leading_isassign, leading_tok) = peek_dotted_op_token(ps) leading_kind = kind(leading_tok) # todo: Reorder to put most 
likely tokens first? if leading_dot @@ -3599,16 +3609,27 @@ function parse_atom(ps::ParseState, check_identifiers=true, has_unary_prefix=fal @label is_operator # + ==> + # .+ ==> (. +) - bump_dotted(ps, leading_dot, leading_tok, emit_dot_node=true, remap_kind= + is_compound_assignment = !is_prec_assignment(leading_tok) && leading_isassign + bump_dotted(ps, leading_dot, leading_tok, emit_dot_node=!is_compound_assignment, remap_kind= is_syntactic_operator(leading_kind) ? leading_kind : K"Identifier") + + # Check if this is a compound assignment operator pattern + if is_compound_assignment + bump(ps, TRIVIA_FLAG) # consume the = but mark as trivia + emit(ps, mark, leading_dot ? K".op=" : K"op=") + if check_identifiers + # += ==> (error (op= +)) + # .+= ==> (error (. (op= +))) + emit(ps, mark, K"error", error="invalid identifier") + end + # Quoted syntactic operators are allowed + # :+= ==> (quote-: (op= +)) + return + end + if check_identifiers && !(is_valid_identifier(leading_kind) || (leading_dot && leading_kind == K".")) - # += ==> (error (op= +)) # ? ==> (error ?) - # .+= ==> (error (. 
(op= +))) emit(ps, mark, K"error", error="invalid identifier") - else - # Quoted syntactic operators allowed - # :+= ==> (quote-: (op= +)) end elseif is_keyword(leading_kind) if leading_kind == K"var" && (t = peek_token(ps,2); diff --git a/src/julia/tokenize.jl b/src/julia/tokenize.jl index 37e40109..093c9713 100644 --- a/src/julia/tokenize.jl +++ b/src/julia/tokenize.jl @@ -2,10 +2,15 @@ module Tokenize export tokenize, untokenize -using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @callsite_inline +using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @callsite_inline, + generic_operators_by_level, PrecedenceLevel, PREC_NONE, PREC_ASSIGNMENT, + PREC_PAIRARROW, PREC_CONDITIONAL, PREC_ARROW, PREC_LAZYOR, PREC_LAZYAND, + PREC_COMPARISON, PREC_PIPE_LT, PREC_PIPE_GT, PREC_COLON, PREC_PLUS, + PREC_BITSHIFT, PREC_TIMES, PREC_RATIONAL, PREC_POWER, PREC_DECL, + PREC_WHERE, PREC_DOT, PREC_QUOTE, PREC_UNICODE_OPS, PREC_COMPOUND_ASSIGN import ..JuliaSyntax: kind, - is_literal, is_contextual_keyword, is_word_operator + is_literal, is_contextual_keyword, is_word_operator, is_operator #------------------------------------------------------------------------------- # Character-based predicates for tokenization @@ -72,32 +77,6 @@ end readchar(io::IO) = eof(io) ? EOF_CHAR : read(io, Char) -# Some unicode operators are normalized by the tokenizer into their equivalent -# kinds. See also normalize_identifier() -const _ops_with_unicode_aliases = [ - # \minus '−' is normalized into K"-", - '−' => K"-" - # Lookalikes which are normalized into K"⋅", - # https://github.com/JuliaLang/julia/pull/25157, - '\u00b7' => K"⋅" # '·' Middle Dot,, - '\u0387' => K"⋅" # '·' Greek Ano Teleia,, -] - -function _nondot_symbolic_operator_kinds() - op_range = reinterpret(UInt16, K"BEGIN_OPS"):reinterpret(UInt16, K"END_OPS") - setdiff(reinterpret.(Kind, op_range), [ - K"ErrorInvalidOperator" - K"Error**" - K"..." - K"." 
- K"where" - K"isa" - K"in" - K".'" - K"op=" - ]) -end - function _char_in_set_expr(varname, firstchars) codes = sort!(UInt32.(unique(firstchars))) terms = [] @@ -121,15 +100,14 @@ end if c == EOF_CHAR || !isvalid(c) return false end - u = UInt32(c) - return $(_char_in_set_expr(:u, - append!(first.(string.(_nondot_symbolic_operator_kinds())), - first.(_ops_with_unicode_aliases)))) + # Check if character is known operator char or in our unicode ops dictionary + return c in ('!', '#', '$', '%', '&', '*', '+', '-', '−', '/', ':', '<', '=', '>', '?', '@', '\\', '^', '|', '~', '÷', '⊻') || + haskey(_unicode_ops, c) end # Checks whether a Char is an operator which can be prefixed with a dot `.` function is_dottable_operator_start_char(c) - return c != '?' && c != '$' && c != ':' && c != '\'' && is_operator_start_char(c) + return c != '?' && c != '$' && c != ':' && c != '\'' && c != '#' && c != '@' && is_operator_start_char(c) end @eval function isopsuffix(c::Char) @@ -151,7 +129,8 @@ end end function optakessuffix(k) - (K"BEGIN_OPS" <= k <= K"END_OPS") && + # Most operators can take suffix except for specific ones + is_operator(k) && !( K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" || k == K"?" 
|| @@ -161,8 +140,6 @@ function optakessuffix(k) k == K"||" || k == K"in" || k == K"isa" || - k == K"≔" || - k == K"⩴" || k == K":" || k == K"$" || k == K"::" || @@ -176,14 +153,16 @@ function optakessuffix(k) end const _unicode_ops = let - ks = _nondot_symbolic_operator_kinds() - ss = string.(ks) + # Map single-character unicode operators to their precedence levels + ops = Dict{Char, PrecedenceLevel}() - ops = Dict{Char, Kind}([first(s)=>k for (k,s) in zip(ks,ss) - if length(s) == 1 && !isascii(s[1])]) - for ck in _ops_with_unicode_aliases - push!(ops, ck) + # Add operators from generic_operators_by_level + for (prec, chars) in generic_operators_by_level + for c in chars + ops[c] = prec + end end + ops end @@ -195,12 +174,12 @@ struct RawToken # Offsets into a string or buffer startbyte::Int # The byte where the token start in the buffer endbyte::Int # The byte where the token ended in the buffer - suffix::Bool + op_precedence::PrecedenceLevel # If K"Operator", the operator's precedence level end function RawToken(kind::Kind, startbyte::Int, endbyte::Int) - RawToken(kind, startbyte, endbyte, false) + RawToken(kind, startbyte, endbyte, PREC_NONE) end -RawToken() = RawToken(K"error", 0, 0, false) +RawToken() = RawToken(K"error", 0, 0, PREC_NONE) const EMPTY_TOKEN = RawToken() @@ -425,17 +404,33 @@ end Returns a `RawToken` of kind `kind` with contents `str` and starts a new `RawToken`. 
""" -function emit(l::Lexer, kind::Kind, maybe_op=true) - suffix = false - if optakessuffix(kind) && maybe_op +function emit(l::Lexer, kind::Kind) + tok = RawToken(kind, startpos(l), position(l) - 1, PREC_NONE) + + l.last_token = kind + return tok +end + +function emit_operator(l::Lexer, kind::Kind, precedence::PrecedenceLevel, take_suffix=optakessuffix(kind)) + if take_suffix while isopsuffix(peekchar(l)) readchar(l) - suffix = true + kind = K"Operator" end end + tok = RawToken(kind, startpos(l), position(l) - 1, precedence) - tok = RawToken(kind, startpos(l), position(l) - 1, suffix) + l.last_token = kind + return tok +end +""" + emit(l::Lexer, kind::Kind) + +Returns a `RawToken` of kind `kind` with contents `str` and starts a new `RawToken`. +""" +function emit_trivia(l::Lexer, kind::Kind) + tok = RawToken(kind, startpos(l), position(l) - 1, PREC_NONE) l.last_token = kind return tok end @@ -448,9 +443,9 @@ Returns the next `RawToken`. function next_token(l::Lexer, start = true) start && start_token!(l) if !isempty(l.string_states) - lex_string_chunk(l) + return lex_string_chunk(l) else - _next_token(l, readchar(l)) + return _next_token(l, readchar(l)) end end @@ -523,18 +518,44 @@ function _next_token(l::Lexer, c) return lex_plus(l); elseif c == '-' return lex_minus(l); - elseif c == '−' # \minus '−' treated as hyphen '-' - return emit(l, accept(l, '=') ? 
K"op=" : K"-") elseif c == '`' return lex_backtick(l); + elseif c == '−' # \minus '−' treated as hyphen '-' + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) + else + return emit_operator(l, K"-", PREC_PLUS) + end + elseif c == '∈' + return emit_operator(l, K"∈", PREC_COMPARISON) + elseif c == '⋆' + return emit_operator(l, K"⋆", PREC_TIMES) + elseif c == '±' + return emit_operator(l, K"±", PREC_PLUS) + elseif c == '∓' + return emit_operator(l, K"∓", PREC_PLUS) + elseif c == '¬' + return emit(l, K"¬") + elseif c == '√' + return emit(l, K"√") + elseif c == '∛' + return emit(l, K"∛") + elseif c == '∜' + return emit(l, K"∜") + elseif c == '≔' + return emit_operator(l, K"≔", PREC_ASSIGNMENT) + elseif c == '⩴' + return emit_operator(l, K"⩴", PREC_ASSIGNMENT) + elseif c == '≕' + return emit_operator(l, K"≕", PREC_ASSIGNMENT) + elseif haskey(_unicode_ops, c) + return emit_operator(l, K"Operator", _unicode_ops[c]) elseif is_identifier_start_char(c) return lex_identifier(l, c) elseif isdigit(c) return lex_digit(l, K"Integer") - elseif (k = get(_unicode_ops, c, K"None")) != K"None" - return emit(l, k) else - emit(l, + return emit(l, !isvalid(c) ? K"ErrorInvalidUTF8" : is_invisible_char(c) ? K"ErrorInvisibleChar" : is_identifier_char(c) ? K"ErrorIdentifierStart" : @@ -640,7 +661,7 @@ function lex_string_chunk(l) K"\"\"\"" : K"```") else return emit(l, state.delim == '"' ? K"\"" : - state.delim == '`' ? K"`" : K"'", false) + state.delim == '`' ? K"`" : K"'") end end # Read a chunk of string characters @@ -739,7 +760,7 @@ function lex_whitespace(l::Lexer, c) end c = readchar(l) end - return emit(l, k) + return emit_trivia(l, k) end function lex_comment(l::Lexer) @@ -748,7 +769,8 @@ function lex_comment(l::Lexer) while true pc, ppc = dpeekchar(l) if pc == '\n' || (pc == '\r' && ppc == '\n') || pc == EOF_CHAR - return emit(l, valid ? K"Comment" : K"ErrorInvalidUTF8") + return valid ? 
emit_trivia(l, K"Comment") : + emit(l, K"ErrorInvalidUTF8") end valid &= isvalid(pc) readchar(l) @@ -780,7 +802,7 @@ function lex_comment(l::Lexer) outk = !valid ? K"ErrorInvalidUTF8" : bidi_state != init_bidi_state ? K"ErrorBidiFormatting" : K"Comment" - return emit(l, outk) + return valid ? emit_trivia(l, outk) : emit(l, outk) end end end @@ -793,54 +815,57 @@ end function lex_greater(l::Lexer) if accept(l, '>') if accept(l, '>') - if accept(l, '=') - return emit(l, K"op=") - else # >>>?, ? not a = - return emit(l, K">>>") + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) # >>>= + else + return emit_operator(l, K"Operator", PREC_BITSHIFT) # >>> end - elseif accept(l, '=') - return emit(l, K"op=") else - return emit(l, K">>") + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) # >>= + else + return emit_operator(l, K"Operator", PREC_BITSHIFT) # >> + end end elseif accept(l, '=') - return emit(l, K">=") + return emit_operator(l, K"Operator", PREC_COMPARISON) # >= elseif accept(l, ':') return emit(l, K">:") else - return emit(l, K">") + return emit_operator(l, K">", PREC_COMPARISON) end end # Lex a less char, a '<' has been consumed function lex_less(l::Lexer) if accept(l, '<') - if accept(l, '=') - return emit(l, K"op=") - else # '<') - return emit(l, K"<-->") + return emit_operator(l, K"Operator", PREC_ARROW) # <--> elseif accept(l, '-') return emit(l, K"ErrorInvalidOperator") else - return emit(l, K"<--") + return emit_operator(l, K"Operator", PREC_ARROW) # <-- end end else - return emit(l, K"<") + return emit_operator(l, K"<", PREC_COMPARISON) end end @@ -848,15 +873,12 @@ end # An '=' char has been consumed function lex_equal(l::Lexer) if accept(l, '=') - if accept(l, '=') - emit(l, K"===") - else - emit(l, K"==") - end + accept(l, '=') + return emit_operator(l, K"Operator", PREC_COMPARISON) # ==, === elseif accept(l, '>') - emit(l, K"=>") + return emit_operator(l, K"Operator", PREC_PAIRARROW) 
else - emit(l, K"=") + return emit(l, K"=") end end @@ -867,16 +889,16 @@ function lex_colon(l::Lexer) elseif accept(l, '=') return emit(l, K":=") else - return emit(l, K":") + return emit_operator(l, K":", PREC_COLON) end end function lex_exclaim(l::Lexer) if accept(l, '=') if accept(l, '=') - return emit(l, K"!==") + return emit_operator(l, K"Operator", PREC_COMPARISON) # !== else - return emit(l, K"!=") + return emit_operator(l, K"Operator", PREC_COMPARISON) # != end else return emit(l, K"!") @@ -884,84 +906,82 @@ function lex_exclaim(l::Lexer) end function lex_percent(l::Lexer) - if accept(l, '=') - return emit(l, K"op=") - else - return emit(l, K"%") - end + return emit_operator(l, K"Operator", peekchar(l) == '=' ? PREC_COMPOUND_ASSIGN : PREC_TIMES) end function lex_bar(l::Lexer) - if accept(l, '=') - return emit(l, K"op=") - elseif accept(l, '>') - return emit(l, K"|>") + if accept(l, '>') + return emit_operator(l, K"Operator", PREC_PIPE_GT) # |> elseif accept(l, '|') return emit(l, K"||") else - emit(l, K"|") + return emit_operator(l, K"Operator", peekchar(l) == '=' ? PREC_COMPOUND_ASSIGN : PREC_PLUS) end end function lex_plus(l::Lexer) if accept(l, '+') - return emit(l, K"++") - elseif accept(l, '=') - return emit(l, K"op=") + return emit_operator(l, K"++", PREC_PLUS) end - return emit(l, K"+") + # Check if followed by = for compound assignment + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) + end + return emit_operator(l, K"+", PREC_PLUS) end function lex_minus(l::Lexer) if accept(l, '-') if accept(l, '>') - return emit(l, K"-->") + return emit_operator(l, K"-->", PREC_ARROW) else return emit(l, K"ErrorInvalidOperator") # "--" is an invalid operator end elseif l.last_token != K"." 
&& accept(l, '>') - return emit(l, K"->") - elseif accept(l, '=') - return emit(l, K"op=") + return emit_operator(l, K"->", PREC_ARROW) end - return emit(l, K"-") + # Check if followed by = for compound assignment + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) + end + return emit_operator(l, K"-", PREC_PLUS) end function lex_star(l::Lexer) if accept(l, '*') return emit(l, K"Error**") # "**" is an invalid operator use ^ - elseif accept(l, '=') - return emit(l, K"op=") end - return emit(l, K"*") + # Check if followed by = for compound assignment + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) + end + return emit_operator(l, K"*", PREC_TIMES) end function lex_circumflex(l::Lexer) - if accept(l, '=') - return emit(l, K"op=") + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) end - return emit(l, K"^") + return emit_operator(l, K"Operator", PREC_POWER) # ^ end function lex_division(l::Lexer) - if accept(l, '=') - return emit(l, K"op=") + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) end - return emit(l, K"÷") + return emit_operator(l, K"Operator", PREC_TIMES) # / end function lex_dollar(l::Lexer) - if accept(l, '=') - return emit(l, K"op=") + # Check if followed by = for compound assignment + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) end - return emit(l, K"$") + return emit_operator(l, K"$", PREC_PLUS) end function lex_xor(l::Lexer) - if accept(l, '=') - return emit(l, K"op=") - end - return emit(l, K"⊻") + return emit_operator(l, K"Operator", peekchar(l) == '=' ? PREC_COMPOUND_ASSIGN : PREC_PLUS) end function accept_number(l::Lexer, f::F) where {F} @@ -1096,20 +1116,21 @@ function lex_prime(l) is_literal(l.last_token) # FIXME ^ This doesn't cover all cases - probably needs involvement # from the parser state. 
- return emit(l, K"'") + return emit_operator(l, K"'", PREC_QUOTE) else push!(l.string_states, StringState(false, true, '\'', 0)) - return emit(l, K"'", false) + return emit(l, K"'") end end function lex_amper(l::Lexer) if accept(l, '&') return emit(l, K"&&") - elseif accept(l, '=') - return emit(l, K"op=") else - return emit(l, K"&") + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) + end + return emit_operator(l, K"&", PREC_TIMES) end end @@ -1143,24 +1164,16 @@ end # Parse a token starting with a forward slash. # A '/' has been consumed function lex_forwardslash(l::Lexer) - if accept(l, '/') - if accept(l, '=') - return emit(l, K"op=") - else - return emit(l, K"//") - end - elseif accept(l, '=') - return emit(l, K"op=") - else - return emit(l, K"/") - end + prec = accept(l, '/') ? PREC_RATIONAL : PREC_TIMES + return emit_operator(l, K"Operator", peekchar(l) == '=' ? PREC_COMPOUND_ASSIGN : prec) # // or / end function lex_backslash(l::Lexer) - if accept(l, '=') - return emit(l, K"op=") + # Check if followed by = for compound assignment + if peekchar(l) == '=' + return emit_operator(l, K"Operator", PREC_COMPOUND_ASSIGN) # \ before = end - return emit(l, K"\\") + return emit_operator(l, K"Operator", PREC_TIMES) end function lex_dot(l::Lexer) @@ -1231,11 +1244,12 @@ function lex_identifier(l::Lexer, c) end if n > MAX_KW_LENGTH - emit(l, K"Identifier") + return emit(l, K"Identifier") elseif h == _true_hash || h == _false_hash - emit(l, K"Bool") + return emit(l, K"Bool") else - emit(l, get(_kw_hash, h, K"Identifier")) + k = get(_kw_hash, h, K"Identifier") + return emit(l, k) end end diff --git a/test/parser.jl b/test/parser.jl index 804d20a4..e54c8bc8 100644 --- a/test/parser.jl +++ b/test/parser.jl @@ -76,6 +76,8 @@ tests = [ "f(x) where S where U = 1" => "(function-= (where (where (call f x) S) U) 1)" "(f(x)::T) where S = 1" => "(function-= (where (parens (::-i (call f x) T)) S) 1)" "f(x) = 1 = 2" => "(function-= (call f x) (= 1 
2))" # Should be a warning! + # Bad assignment with suffixed op + ((v = v"1.12",), "a +₁= b") => "(op= a (error +₁) b)" ], JuliaSyntax.parse_pair => [ "a => b" => "(call-i a => b)" @@ -617,7 +619,7 @@ tests = [ "function (::g(x))() end" => "(function (call (parens (::-pre (call g x)))) (block))" "function (f::T{g(i)})() end" => "(function (call (parens (::-i f (curly T (call g i))))) (block))" "function (::T)() end" => "(function (call (parens (::-pre T))) (block))" - "function (:*=(f))() end" => "(function (call (parens (call (quote-: *=) f))) (block))" + "function (:*=(f))() end" => "(function (call (parens (call (quote-: (op= *)) f))) (block))" "function begin() end" => "(function (call (error begin)) (block))" "function f() end" => "(function (call f) (block))" "function type() end" => "(function (call type) (block))" @@ -817,24 +819,24 @@ tests = [ "||" => "(error ||)" "." => "(error .)" "..." => "(error (dots-3))" - "+=" => "(error +=)" - "-=" => "(error -=)" - "*=" => "(error *=)" - "/=" => "(error /=)" - "//=" => "(error //=)" - "|=" => "(error |=)" - "^=" => "(error ^=)" - "÷=" => "(error ÷=)" - "%=" => "(error %=)" - "<<=" => "(error <<=)" - ">>=" => "(error >>=)" - ">>>="=> "(error >>>=)" - "\\=" => "(error \\=)" - "&=" => "(error &=)" - ":=" => "(error :=)" - "\$=" => "(error \$=)" - "⊻=" => "(error ⊻=)" - ".+=" => "(error (. 
+=))" + "+=" => "(error (op= +))" + "-=" => "(error (op= -))" + "*=" => "(error (op= *))" + "/=" => "(error (op= /))" + "//=" => "(error (op= //))" + "|=" => "(error (op= |))" + "^=" => "(error (op= ^))" + "÷=" => "(error (op= ÷))" + "%=" => "(error (op= %))" + "<<=" => "(error (op= <<))" + ">>=" => "(error (op= >>))" + ">>>="=> "(error (op= >>>))" + "\\=" => "(error (op= \\))" + "&=" => "(error (op= &))" + ":=" => "(error :=)" # Assignment operator, not `:`-update + "\$=" => "(error (op= \$))" + "⊻=" => "(error (op= ⊻))" + ".+=" => "(error (.op= +))" # Normal operators "+" => "+" # Assignment-precedence operators which can be used as identifiers @@ -843,8 +845,8 @@ tests = [ "⩴" => "⩴" "≕" => "≕" # Quoted syntactic operators allowed - ":+=" => "(quote-: +=)" - ":.+=" => "(quote-: (. +=))" + ":+=" => "(quote-: (op= +))" + ":.+=" => "(quote-: (.op= +))" ":.=" => "(quote-: (. =))" ":.&&" => "(quote-: (. &&))" # Special symbols quoted @@ -1116,7 +1118,7 @@ parsestmt_test_specs = [ # detecting raw vs non-raw strings. The old parser was tightly coupled to # the lexer and the parser state was used to disambiguate these cases. "x in' '" => "(call-i x in (char (error)))" - "x in'``\$" => "(call-i x in (call-i (juxtapose (char '`' (error-t)) (cmdstring-r (error-t))) \$ (error)))" + "x in'``\$" => "(wrapper (call-i x in (juxtapose (char '`' (error-t)) (cmdstring-r (error-t)))) (error-t \$))" "var\"#\"`str`" => "(juxtapose (var # (error-t)) (cmdstring-r \"str\"))" "var\"#\"\"str\"" => "(juxtapose (var # (error-t)) (error-t) (string \"str\"))" @@ -1165,8 +1167,8 @@ parsestmt_with_kind_tests = [ "a += b" => "(op= a::Identifier +::Identifier b::Identifier)" "a .+= b" => "(.op= a::Identifier +::Identifier b::Identifier)" "a >>= b" => "(op= a::Identifier >>::Identifier b::Identifier)" - ":+=" => "(quote-: +=::op=)" - ":.+=" => "(quote-: (. 
+=::op=))" + ":+=" => "(quote-: (op= +::Identifier))" + ":.+=" => "(quote-: (.op= +::Identifier))" ] @testset "parser `Kind` remapping" begin diff --git a/test/parser_api.jl b/test/parser_api.jl index 10a09d3a..2496ed82 100644 --- a/test/parser_api.jl +++ b/test/parser_api.jl @@ -214,8 +214,10 @@ tokensplit(str; kws...) = [kind(tok) => untokenize(tok, str) for tok in tokenize K"Integer" => "1", ] - # A predicate based on flags() - @test JuliaSyntax.is_suffixed(tokenize("+₁")[1]) + # +₁ is tokenized as a single identifier token (subscripts are valid in identifiers) + tokens = tokenize("+₁") + @test length(tokens) == 1 # Just the identifier, endmarker is not included in tokenize() + @test kind(tokens[1]) == K"Identifier" # Buffer interface @test tokenize(Vector{UInt8}("a + b")) == tokenize("a + b") diff --git a/test/tokenize.jl b/test/tokenize.jl index ab3800c9..cea7927d 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -129,11 +129,11 @@ end # testset K"NewlineWs",K"Comment", - K"NewlineWs",K"Integer",K"%",K"Integer", + K"NewlineWs",K"Integer",K"Operator",K"Integer", - K"NewlineWs",K"Identifier",K"'",K"/",K"Identifier",K"'", + K"NewlineWs",K"Identifier",K"'",K"Operator",K"Identifier",K"'", - K"NewlineWs",K"Identifier",K".",K"'",K"\\",K"Identifier",K".",K"'", + K"NewlineWs",K"Identifier",K".",K"'",K"Operator",K"Identifier",K".",K"'", K"NewlineWs",K"`",K"CmdString",K"`", @@ -175,18 +175,28 @@ end end @testset "test added operators" begin - @test tok("1+=2", 2).kind == K"op=" - @test tok("1-=2", 2).kind == K"op=" - @test tok("1*=2", 2).kind == K"op=" - @test tok("1^=2", 2).kind == K"op=" - @test tok("1÷=2", 2).kind == K"op=" - @test tok("1\\=2", 2).kind == K"op=" - @test tok("1\$=2", 2).kind == K"op=" - @test tok("1⊻=2", 2).kind == K"op=" + # Compound assignments now emit separate operator and = tokens + # Operators emit as K"Operator" when followed by = + @test tok("1+=2", 2).kind == K"Operator" # + before = + @test tok("1+=2", 3).kind == K"=" + @test 
tok("1-=2", 2).kind == K"Operator" # - before = + @test tok("1-=2", 3).kind == K"=" + @test tok("1*=2", 2).kind == K"Operator" # * before = + @test tok("1*=2", 3).kind == K"=" + @test tok("1^=2", 2).kind == K"Operator" # ^ before = + @test tok("1^=2", 3).kind == K"=" + @test tok("1÷=2", 2).kind == K"Operator" # ÷ before = + @test tok("1÷=2", 3).kind == K"=" + @test tok("1\\=2", 2).kind == K"Operator" # \ before = + @test tok("1\\=2", 3).kind == K"=" + @test tok("1\$=2", 2).kind == K"Operator" # $ before = + @test tok("1\$=2", 3).kind == K"=" + @test tok("1⊻=2", 2).kind == K"Operator" # ⊻ before = + @test tok("1⊻=2", 3).kind == K"=" @test tok("1:=2", 2).kind == K":=" @test tok("1-->2", 2).kind == K"-->" - @test tok("1<--2", 2).kind == K"<--" - @test tok("1<-->2", 2).kind == K"<-->" + @test tok("1<--2", 2).kind == K"Operator" + @test tok("1<-->2", 2).kind == K"Operator" @test tok("1>:2", 2).kind == K">:" end @@ -584,9 +594,9 @@ end end @testset "modifying function names (!) followed by operator" begin - @test toks("a!=b") == ["a"=>K"Identifier", "!="=>K"!=", "b"=>K"Identifier"] - @test toks("a!!=b") == ["a!"=>K"Identifier", "!="=>K"!=", "b"=>K"Identifier"] - @test toks("!=b") == ["!="=>K"!=", "b"=>K"Identifier"] + @test toks("a!=b") == ["a"=>K"Identifier", "!="=>K"Operator", "b"=>K"Identifier"] + @test toks("a!!=b") == ["a!"=>K"Identifier", "!="=>K"Operator", "b"=>K"Identifier"] + @test toks("!=b") == ["!="=>K"Operator", "b"=>K"Identifier"] end @testset "integer literals" begin @@ -725,11 +735,11 @@ end "f"=>K"Identifier", "("=>K"(", "a"=>K"Identifier", ")"=>K")"] @test toks("1234.0 .f(a)") == ["1234.0"=>K"Float", " "=>K"Whitespace", "."=>K".", "f"=>K"Identifier", "("=>K"(", "a"=>K"Identifier", ")"=>K")"] - @test toks("1f0./1") == ["1f0"=>K"Float32", "."=>K".", "/"=>K"/", "1"=>K"Integer"] + @test toks("1f0./1") == ["1f0"=>K"Float32", "."=>K".", "/"=>K"Operator", "1"=>K"Integer"] # Dotted operators after numeric constants are ok - @test toks("1e1.⫪") == 
["1e1"=>K"Float", "."=>K".", "⫪"=>K"⫪"] - @test toks("1.1.⫪") == ["1.1"=>K"Float", "."=>K".", "⫪"=>K"⫪"] + @test toks("1e1.⫪") == ["1e1"=>K"Float", "."=>K".", "⫪"=>K"Operator"] + @test toks("1.1.⫪") == ["1.1"=>K"Float", "."=>K".", "⫪"=>K"Operator"] @test toks("1e1.−") == ["1e1"=>K"Float", "."=>K".", "−"=>K"-"] @test toks("1.1.−") == ["1.1"=>K"Float", "."=>K".", "−"=>K"-"] # Non-dottable operators are not ok @@ -739,8 +749,8 @@ end # Ambiguous dotted operators @test toks("1.+") == ["1."=>K"ErrorAmbiguousNumericConstant", "+"=>K"+"] @test toks("1.+ ") == ["1."=>K"ErrorAmbiguousNumericConstant", "+"=>K"+", " "=>K"Whitespace"] - @test toks("1.⤋") == ["1."=>K"ErrorAmbiguousNumericConstant", "⤋"=>K"⤋"] - @test toks("1.⫪") == ["1."=>K"ErrorAmbiguousNumericConstant", "⫪"=>K"⫪"] + @test toks("1.⤋") == ["1."=>K"ErrorAmbiguousNumericConstant", "⤋"=>K"Operator"] + @test toks("1.⫪") == ["1."=>K"ErrorAmbiguousNumericConstant", "⫪"=>K"Operator"] # non-dottable ops are the exception @test toks("1.:") == ["1."=>K"Float", ":"=>K":"] @test toks("1.\$") == ["1."=>K"Float", "\$"=>K"$"] @@ -793,9 +803,24 @@ end @test length(collect(tokenize(io))) == 4 end +function _nondot_symbolic_operator_kinds() + op_range = reinterpret(UInt16, K"BEGIN_OPS"):reinterpret(UInt16, K"END_OPS") + setdiff(reinterpret.(Kind, op_range), [ + K"ErrorInvalidOperator" + K"Error**" + K"." + K".." 
+ K"where" + K"isa" + K"in" + K".'" + K"op=" + ]) +end + @testset "dotted and suffixed operators" begin -for opkind in Tokenize._nondot_symbolic_operator_kinds() +for opkind in _nondot_symbolic_operator_kinds() op = string(opkind) strs = [ 1 => [ # unary @@ -853,19 +878,21 @@ end @testset "Normalization of Unicode symbols" begin # https://github.com/JuliaLang/julia/pull/25157 - @test tok("\u00b7").kind == K"⋅" - @test tok("\u0387").kind == K"⋅" - @test toks(".\u00b7") == ["."=>K".", "\u00b7"=>K"⋅"] - @test toks(".\u0387") == ["."=>K".", "\u0387"=>K"⋅"] + @test tok("\u00b7").kind == K"Operator" + @test tok("\u0387").kind == K"Operator" + @test toks(".\u00b7") == ["."=>K".", "\u00b7"=>K"Operator"] + @test toks(".\u0387") == ["."=>K".", "\u0387"=>K"Operator"] # https://github.com/JuliaLang/julia/pull/40948 @test tok("−").kind == K"-" - @test tok("−=").kind == K"op=" + # −= now emits separate tokens + @test tok("−=").kind == K"Operator" # − before = + @test tok("−=", 2).kind == K"=" @test toks(".−") == ["."=>K".", "−"=>K"-"] end @testset "perp" begin - @test tok("1 ⟂ 2", 3).kind==K"⟂" + @test tok("1 ⟂ 2", 3).kind==K"Operator" end @testset "outer" begin @@ -894,7 +921,7 @@ end @testset "circ arrow right op" begin s = "↻" - @test collect(tokenize(s))[1].kind == K"↻" + @test collect(tokenize(s))[1].kind == K"Operator" end @testset "invalid float" begin @@ -918,8 +945,8 @@ end raw"<|" raw"|>" raw": .. 
… ⁝ ⋮ ⋱ ⋰ ⋯" - raw"$ + - ¦ | ⊕ ⊖ ⊞ ⊟ ++ ∪ ∨ ⊔ ± ∓ ∔ ∸ ≏ ⊎ ⊻ ⊽ ⋎ ⋓ ⧺ ⧻ ⨈ ⨢ ⨣ ⨤ ⨥ ⨦ ⨧ ⨨ ⨩ ⨪ ⨫ ⨬ ⨭ ⨮ ⨹ ⨺ ⩁ ⩂ ⩅ ⩊ ⩌ ⩏ ⩐ ⩒ ⩔ ⩖ ⩗ ⩛ ⩝ ⩡ ⩢ ⩣" - raw"* / ⌿ ÷ % & ⋅ ∘ × \ ∩ ∧ ⊗ ⊘ ⊙ ⊚ ⊛ ⊠ ⊡ ⊓ ∗ ∙ ∤ ⅋ ≀ ⊼ ⋄ ⋆ ⋇ ⋉ ⋊ ⋋ ⋌ ⋏ ⋒ ⟑ ⦸ ⦼ ⦾ ⦿ ⧶ ⧷ ⨇ ⨰ ⨱ ⨲ ⨳ ⨴ ⨵ ⨶ ⨷ ⨸ ⨻ ⨼ ⨽ ⩀ ⩃ ⩄ ⩋ ⩍ ⩎ ⩑ ⩓ ⩕ ⩘ ⩚ ⩜ ⩞ ⩟ ⩠ ⫛ ⊍ ▷ ⨝ ⟕ ⟖ ⟗" + raw"$ + - | ⊕ ⊖ ⊞ ⊟ ++ ∪ ∨ ⊔ ± ∓ ∔ ∸ ≏ ⊎ ⊻ ⊽ ⋎ ⋓ ⧺ ⧻ ⨈ ⨢ ⨣ ⨤ ⨥ ⨦ ⨧ ⨨ ⨩ ⨪ ⨫ ⨬ ⨭ ⨮ ⨹ ⨺ ⩁ ⩂ ⩅ ⩊ ⩌ ⩏ ⩐ ⩒ ⩔ ⩖ ⩗ ⩛ ⩝ ⩡ ⩢ ⩣" + raw"* / ÷ % & ⋅ ∘ × \ ∩ ∧ ⊗ ⊘ ⊙ ⊚ ⊛ ⊠ ⊡ ⊓ ∗ ∙ ∤ ⅋ ≀ ⊼ ⋄ ⋆ ⋇ ⋉ ⋊ ⋋ ⋌ ⋏ ⋒ ⟑ ⦸ ⦼ ⦾ ⦿ ⧶ ⧷ ⨇ ⨰ ⨱ ⨲ ⨳ ⨴ ⨵ ⨶ ⨷ ⨸ ⨻ ⨼ ⨽ ⩀ ⩃ ⩄ ⩋ ⩍ ⩎ ⩑ ⩓ ⩕ ⩘ ⩚ ⩜ ⩞ ⩟ ⩠ ⫛ ⊍ ▷ ⨝ ⟕ ⟖ ⟗" raw"//" raw"<< >> >>>" raw"^ ↑ ↓ ⇵ ⟰ ⟱ ⤈ ⤉ ⤊ ⤋ ⤒ ⤓ ⥉ ⥌ ⥍ ⥏ ⥑ ⥔ ⥕ ⥘ ⥙ ⥜ ⥝ ⥠ ⥡ ⥣ ⥥ ⥮ ⥯ ↑ ↓" @@ -927,7 +954,7 @@ end raw"." ] if VERSION >= v"1.6.0" - push!(ops, raw"<-- <-->") + push!(ops, raw"<-- <--> ¦ ⌿") end if VERSION >= v"1.7.0" append!(ops, [