From 45f9ce87edd65ac1c2bd13c29c249e2b27b4baba Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 9 Jan 2026 17:44:26 +0100 Subject: [PATCH 1/3] gh-143054: Disallow non-top-level Cut for now The behaviour of Cut in nested parentheses, Repeat, Opt, and similar is somewhat chaotic. Apparently even the academic papers on PEG aren't as clear as they could be. And it doesn't really matter. Python only uses top-level cuts. When that changes, we can clarify as much as necessary (and even change the implementation to make sense for what we'll need). Document that this is deliberately unspecified, and add a test to make sure any decision is deliberate, tested and documented. --- Doc/reference/grammar.rst | 13 ++++++-- .../test_grammar_validator.py | 18 +++++++++- Lib/test/test_peg_generator/test_pegen.py | 24 ++++++++++++++ Tools/peg_generator/pegen/validator.py | 33 ++++++++++++++++++- 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/Doc/reference/grammar.rst b/Doc/reference/grammar.rst index 1037feb691f6bc..0ce8e42ddf3b0c 100644 --- a/Doc/reference/grammar.rst +++ b/Doc/reference/grammar.rst @@ -12,8 +12,17 @@ The notation used here is the same as in the preceding docs, and is described in the :ref:`notation <notation>` section, except for an extra complication: -* ``~`` ("cut"): commit to the current alternative and fail the rule - even if this fails to parse +* ``~`` ("cut"): commit to the current alternative; fail the rule + if the alternative fails to parse + + Python mainly uses cuts for optimizations or improved error + messages. They often appear to be useless in the listing below. + + .. see gh-143054, and CutValidator in the source, if you want to change this: + + Cuts currently don't appear inside parentheses, brackets, lookaheads + and similar. + Their behavior in these contexts is deliberately left unspecified. .. 
literalinclude:: ../../Grammar/python.gram :language: peg diff --git a/Lib/test/test_peg_generator/test_grammar_validator.py b/Lib/test/test_peg_generator/test_grammar_validator.py index c7f20e1de802ce..857aced8ae5dcf 100644 --- a/Lib/test/test_peg_generator/test_grammar_validator.py +++ b/Lib/test/test_peg_generator/test_grammar_validator.py @@ -4,7 +4,8 @@ test_tools.skip_if_missing("peg_generator") with test_tools.imports_under_tool("peg_generator"): from pegen.grammar_parser import GeneratedParser as GrammarParser - from pegen.validator import SubRuleValidator, ValidationError, RaiseRuleValidator + from pegen.validator import SubRuleValidator, ValidationError + from pegen.validator import RaiseRuleValidator, CutValidator from pegen.testutil import parse_string from pegen.grammar import Grammar @@ -59,3 +60,18 @@ def test_raising_valid_rule(self) -> None: with self.assertRaises(ValidationError): for rule_name, rule in grammar.rules.items(): validator.validate_rule(rule_name, rule) + + def test_cut_validator(self) -> None: + grammar_source = """ + star: (OP ~ OP)* + plus: (OP ~ OP)+ + bracket: [OP ~ OP] + gather: OP.(OP ~ OP)+ + nested: [OP | NAME ~ OP] + """ + grammar: Grammar = parse_string(grammar_source, GrammarParser) + validator = CutValidator(grammar) + for rule_name, rule in grammar.rules.items(): + with self.subTest(rule_name): + with self.assertRaises(ValidationError): + validator.validate_rule(rule_name, rule) diff --git a/Lib/test/test_peg_generator/test_pegen.py b/Lib/test/test_peg_generator/test_pegen.py index d03ba07975a616..f39fcc2e0d8daf 100644 --- a/Lib/test/test_peg_generator/test_pegen.py +++ b/Lib/test/test_peg_generator/test_pegen.py @@ -755,6 +755,30 @@ def test_cut(self) -> None: ], ) + def test_cut_is_local_in_rule(self) -> None: + grammar = """ + start: + | inner + | 'x' { "ok" } + inner: + | 'x' ~ 'y' + | 'x' + """ + parser_class = make_parser(grammar) + node = parse_string("x", parser_class) + self.assertEqual(node, 'ok') + + def 
test_cut_is_local_in_parens(self) -> None: + # we currently don't guarantee this behavior, see gh-143054 + grammar = """ + start: + | ('x' ~ 'y' | 'x') + | 'x' { "ok" } + """ + parser_class = make_parser(grammar) + node = parse_string("x", parser_class) + self.assertEqual(node, 'ok') + def test_dangling_reference(self) -> None: grammar = """ start: foo ENDMARKER diff --git a/Tools/peg_generator/pegen/validator.py b/Tools/peg_generator/pegen/validator.py index 635eb398b41808..db62b91fcda100 100644 --- a/Tools/peg_generator/pegen/validator.py +++ b/Tools/peg_generator/pegen/validator.py @@ -1,5 +1,5 @@ from pegen import grammar -from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule +from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule, Cut, Repeat, Opt, NamedItem class ValidationError(Exception): @@ -44,6 +44,37 @@ def visit_Alt(self, node: Alt) -> None: ) +class CutValidator(GrammarValidator): + """Fail if Cut is not directly in a rule. + + For simplicity, we currently document that a Cut affects alternatives + of the *rule* it is in. + However, the implementation makes cuts local to enclosing Rhs + (e.g. parenthesized list of choices). + Additionally, in academic papers about PEG, repeats and optional items + are "desugared" to choices with an empty alternative, and thus contain + a Cut's effect. + + Please update documentation and tests when adding this cut, + then get rid of this validator. + + See gh-143054. + """ + + def visit(self, node: Any, parents: tuple[Any] = ()) -> None: + super().visit(node, parents=(*parents, node)) + + def visit_Cut(self, node: Alt, parents: tuple[Any] = ()) -> None: + parent_types = [type(p).__name__ for p in parents] + if parent_types != ['Rule', 'Rhs', 'Alt', 'NamedItem', 'Cut']: + raise ValidationError( + f"Rule {self.rulename!r} contains cut that's not on the " + "top level. " + "The intended semantics of such cases need " + "to be clarified; see the CutValidator docstring." 
+ f"\nThe cut is inside: {parent_types}" + ) + def validate_grammar(the_grammar: grammar.Grammar) -> None: for validator_cls in GrammarValidator.__subclasses__(): validator = validator_cls(the_grammar) From 638e724b5aeaa329c269126f6534ab0a832de523 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 9 Jan 2026 17:59:09 +0100 Subject: [PATCH 2/3] Remove unused imports --- Tools/peg_generator/pegen/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/peg_generator/pegen/validator.py b/Tools/peg_generator/pegen/validator.py index db62b91fcda100..95839a4200b8d2 100644 --- a/Tools/peg_generator/pegen/validator.py +++ b/Tools/peg_generator/pegen/validator.py @@ -1,5 +1,5 @@ from pegen import grammar -from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule, Cut, Repeat, Opt, NamedItem +from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule class ValidationError(Exception): From 0214bb3a2199a3882fc0f9a106a38516bb1166bb Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 9 Jan 2026 18:00:05 +0100 Subject: [PATCH 3/3] Update typing --- Tools/peg_generator/pegen/validator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Tools/peg_generator/pegen/validator.py b/Tools/peg_generator/pegen/validator.py index 95839a4200b8d2..5e2bc238a1e966 100644 --- a/Tools/peg_generator/pegen/validator.py +++ b/Tools/peg_generator/pegen/validator.py @@ -1,3 +1,5 @@ +from typing import Any + from pegen import grammar from pegen.grammar import Alt, GrammarVisitor, Rhs, Rule @@ -61,10 +63,10 @@ class CutValidator(GrammarValidator): See gh-143054. """ - def visit(self, node: Any, parents: tuple[Any] = ()) -> None: + def visit(self, node: Any, parents: tuple[Any, ...] = ()) -> None: super().visit(node, parents=(*parents, node)) - def visit_Cut(self, node: Alt, parents: tuple[Any] = ()) -> None: + def visit_Cut(self, node: Alt, parents: tuple[Any, ...] 
= ()) -> None: parent_types = [type(p).__name__ for p in parents] if parent_types != ['Rule', 'Rhs', 'Alt', 'NamedItem', 'Cut']: raise ValidationError(