From cc3ef8322fc3f877b9e620f62ab2e22f9d3e75c2 Mon Sep 17 00:00:00 2001 From: Ivan Levkivskyi Date: Sat, 18 Apr 2026 13:48:34 +0100 Subject: [PATCH 1/3] Use parallel parsing at all stages --- mypy/build.py | 101 ++++++++++++++++++---------------- mypy/build_worker/worker.py | 11 +++- mypy/checkstrformat.py | 3 +- mypy/nativeparse.py | 29 +++------- mypy/parse.py | 59 +++++++++++--------- mypy/stubgen.py | 5 +- mypy/test/test_nativeparse.py | 26 ++++++++- mypy/test/testparse.py | 4 +- 8 files changed, 135 insertions(+), 103 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index ef481ed8f444..121b261e28fd 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -121,7 +121,6 @@ ImportFrom, MypyFile, OverloadedFuncDef, - ParseError, SymbolTable, ) from mypy.options import OPTIONS_AFFECTING_CACHE_NO_PLATFORM @@ -168,7 +167,7 @@ from mypy.modules_state import modules_state from mypy.nodes import Expression from mypy.options import Options -from mypy.parse import load_from_raw, parse, report_parse_error +from mypy.parse import load_from_raw, parse from mypy.plugin import ChainedPlugin, Plugin, ReportConfigContext from mypy.plugins.default import DefaultPlugin from mypy.renaming import LimitedVariableRenameVisitor, VariableRenameVisitor @@ -999,13 +998,18 @@ def dump_stats(self) -> None: # Call print once so that we don't get a mess in parallel mode. print("\n".join(lines) + "\n\n", end="") - def parse_all(self, states: list[State]) -> None: - """Parse multiple files in parallel (if possible) and compute dependencies.""" + def parse_all(self, states: list[State], post_parse: bool = True) -> None: + """Parse multiple files in parallel (if possible) and compute dependencies. + + If post_parse is False, skip the last step (used when parsing unchanged files + that need to be re-checked due to stale dependencies). + """ if not self.options.native_parser: # Old parser cannot be parallelized. 
for state in states: state.parse_file() - self.post_parse_all(states) + if post_parse: + self.post_parse_all(states) return sequential_states = [] @@ -1020,7 +1024,8 @@ def parse_all(self, states: list[State]) -> None: continue parallel_states.append(state) self.parse_parallel(sequential_states, parallel_states) - self.post_parse_all(states) + if post_parse: + self.post_parse_all(states) def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None: """Perform parallel parsing of states. @@ -1030,7 +1035,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S parallelized efficiently. """ futures = [] - parallel_parsed_states = {} + parallel_parsed_states = [] # Use at least --num-workers if specified by user. available_threads = max(get_available_threads(), self.options.num_workers) # Overhead from trying to parallelize (small) blocking portion of @@ -1048,7 +1053,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S if ignore_errors: self.errors.ignored_files.add(state.xpath) futures.append(executor.submit(state.parse_file_inner, state.source or "")) - parallel_parsed_states[state.id] = state + parallel_parsed_states.append(state) else: self.log(f"Using cached AST for {state.xpath} ({state.id})") state.tree, state.early_errors = self.ast_cache[state.id] @@ -1058,21 +1063,27 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S state.parse_file() for fut in wait(futures).done: - state_id, parse_errors = fut.result() - # New parser reports errors lazily, add them if any. 
- if parse_errors: - state = parallel_parsed_states[state_id] - with state.wrap_context(): - self.errors.set_file(state.xpath, state.id, options=state.options) - for error in parse_errors: - report_parse_error(error, self.errors) - if self.errors.is_blockers(): - self.log("Bailing due to parse errors") - self.errors.raise_error() + fut.result() + for state in parallel_parsed_states: + # New parser returns serialized trees that need to be de-serialized. + with state.wrap_context(): + assert state.tree is not None + if state.tree.raw_data: + state.tree = load_from_raw( + state.xpath, + state.id, + state.tree.raw_data, + self.errors, + state.options, + imports_only=bool(self.workers), + ) + if self.errors.is_blockers(): + self.log("Bailing due to parse errors") + self.errors.raise_error() for state in parallel_states: assert state.tree is not None - if state.id in parallel_parsed_states: + if state in parallel_parsed_states: state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) state.semantic_analysis_pass1() self.ast_cache[state.id] = (state.tree, state.early_errors) @@ -1208,31 +1219,18 @@ def parse_file( source: str, options: Options, raw_data: FileRawData | None = None, - ) -> tuple[MypyFile, list[ParseError]]: + ) -> MypyFile: """Parse the source of a file with the given name. Raise CompileError if there is a parse error. """ - imports_only = False file_exists = self.fscache.exists(path) - if self.workers and file_exists: - # Currently, we can use the native parser only for actual files. - imports_only = True t0 = time.time() - parse_errors: list[ParseError] = [] if raw_data: # If possible, deserialize from known binary data instead of parsing from scratch. 
tree = load_from_raw(path, id, raw_data, self.errors, options) else: - tree, parse_errors = parse( - source, - path, - id, - self.errors, - options=options, - file_exists=file_exists, - imports_only=imports_only, - ) + tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists) tree._fullname = id if self.stats_enabled: with self.stats_lock: @@ -1242,7 +1240,7 @@ def parse_file( stubs_parsed=int(tree.is_stub), parse_time=time.time() - t0, ) - return tree, parse_errors + return tree def load_fine_grained_deps(self, id: str) -> dict[str, set[str]]: t0 = time.time() @@ -3089,15 +3087,12 @@ def get_source(self) -> str: self.time_spent_us += time_spent_us(t0) return source - def parse_file_inner( - self, source: str, raw_data: FileRawData | None = None - ) -> tuple[str, list[ParseError]]: + def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None: t0 = time_ref() - self.tree, parse_errors = self.manager.parse_file( + self.tree = self.manager.parse_file( self.id, self.xpath, source, options=self.options, raw_data=raw_data ) self.time_spent_us += time_spent_us(t0) - return self.id, parse_errors def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = None) -> None: """Parse file and run first pass of semantic analysis. @@ -3120,10 +3115,20 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = self.manager.errors.ignored_files.add(self.xpath) with self.wrap_context(): manager.errors.set_file(self.xpath, self.id, options=self.options) - _, parse_errors = self.parse_file_inner(source, raw_data) - for error in parse_errors: - # New parser reports errors lazily. - report_parse_error(error, manager.errors) + self.parse_file_inner(source, raw_data) + tree: MypyFile | None = self.tree + assert tree is not None + # New parser returns serialized trees that need to be de-serialized. 
+ if tree.raw_data is not None: + assert raw_data is None + self.tree = load_from_raw( + self.xpath, + self.id, + tree.raw_data, + manager.errors, + self.options, + imports_only=bool(self.manager.workers), + ) if manager.errors.is_blockers(): manager.log("Bailing due to parse errors") manager.errors.raise_error() @@ -4631,9 +4636,9 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None: # Re-generate import errors in case this module was loaded from the cache. if graph[id].meta: graph[id].verify_dependencies(suppressed_only=True) - # We may already have parsed the module, or not. - # If the former, parse_file() is a no-op. - graph[id].parse_file() + # We may already have parsed the modules, or not. + # If the former, parse_file() is a no-op. + manager.parse_all([graph[id] for id in stale], post_parse=False) if "typing" in scc: # For historical reasons we need to manually add typing aliases # for built-in generic collections, see docstring of diff --git a/mypy/build_worker/worker.py b/mypy/build_worker/worker.py index 6742bd6fde6f..2139ad130637 100644 --- a/mypy/build_worker/worker.py +++ b/mypy/build_worker/worker.py @@ -239,14 +239,23 @@ def load_states( mod_data: dict[str, tuple[bytes, FileRawData | None]], ) -> None: """Re-create full state of an SCC as it would have been in coordinator.""" + needs_parse = [] for id in scc.mod_ids: state = graph[id] # Re-clone options since we don't send them, it is usually faster than deserializing. state.options = state.options.clone_for_module(state.id) suppressed_deps_opts, raw_data = mod_data[id] - state.parse_file(raw_data=raw_data) + if raw_data is not None: + state.parse_file(raw_data=raw_data) + else: + needs_parse.append(state) # Set data that is needed to be written to cache meta. state.known_suppressed_deps_opts = suppressed_deps_opts + # Perform actual parsing in parallel (but we don't need to compute dependencies). 
+ if needs_parse: + manager.parse_all(needs_parse, post_parse=False) + for id in scc.mod_ids: + state = graph[id] assert state.tree is not None import_lines = {imp.line for imp in state.tree.imports} state.imports_ignored = { diff --git a/mypy/checkstrformat.py b/mypy/checkstrformat.py index e985aa352abd..e96af007e29c 100644 --- a/mypy/checkstrformat.py +++ b/mypy/checkstrformat.py @@ -581,13 +581,14 @@ def apply_field_accessors( temp_errors = Errors(self.chk.options) dummy = DUMMY_FIELD_NAME + spec.field[len(spec.key) :] - temp_ast, _ = parse( + temp_ast = parse( dummy, fnam="", module=None, options=self.chk.options, errors=temp_errors, file_exists=False, + eager=True, ) if temp_errors.is_errors(): self.msg.fail( diff --git a/mypy/nativeparse.py b/mypy/nativeparse.py index f08268cfe1ca..fd90d85fa355 100644 --- a/mypy/nativeparse.py +++ b/mypy/nativeparse.py @@ -190,12 +190,15 @@ def add_error( def native_parse( - filename: str, options: Options, skip_function_bodies: bool = False, imports_only: bool = False + filename: str, options: Options, skip_function_bodies: bool = False ) -> tuple[MypyFile, list[ParseError], TypeIgnores]: """Parse a Python file using the native Rust-based parser. Return (MypyFile, errors, type_ignores). + The returned tree is empty with actual serialized data stored in `raw_data` + attribute. Use read_statements() and/or deserialize_imports() to de-serialize. 
+ The caller should set these additional attributes on the returned MypyFile: - ignored_lines: dict of type ignore comments (from the TypeIgnores return value) - is_stub: whether the file is a .pyi stub @@ -210,26 +213,12 @@ def native_parse( b, errors, ignores, import_bytes, is_partial_package, uses_template_strings = ( parse_to_binary_ast(filename, options, skip_function_bodies) ) - data = ReadBuffer(b) - n = read_int(data) - state = State(options) - if imports_only: - defs = [] - else: - defs = read_statements(state, data, n) - - imports = deserialize_imports(import_bytes) - - node = MypyFile(defs, imports) + node = MypyFile([], []) node.path = filename - node.is_partial_stub_package = is_partial_package - if imports_only: - node.raw_data = FileRawData( - b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings - ) - node.uses_template_strings = uses_template_strings - all_errors = errors + state.errors - return node, all_errors, ignores + node.raw_data = FileRawData( + b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings + ) + return node, errors, ignores def expect_end_tag(data: ReadBuffer) -> None: diff --git a/mypy/parse.py b/mypy/parse.py index bd8e4ad5dcd3..d2626737b8c4 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -18,14 +18,15 @@ def parse( errors: Errors, options: Options, file_exists: bool, - imports_only: bool = False, -) -> tuple[MypyFile, list[ParseError]]: + eager: bool = False, +) -> MypyFile: """Parse a source file, without doing any semantic analysis. - Return the parse tree. If errors is not provided, raise ParseError - on failure. Otherwise, use the errors object to report parse errors. - + Return the parse tree, use the errors object to report parse errors. The python_version (major, minor) option determines the Python syntax variant. + + New parser returns empty tree with serialized data. To get the full tree and + the parse errors, use eager=True. 
""" if options.native_parser: # Native parser only works with actual files on disk @@ -36,45 +37,43 @@ def parse( ignore_errors = options.ignore_errors or fnam in errors.ignored_files # If errors are ignored, we can drop many function bodies to speed up type checking. strip_function_bodies = ignore_errors and not options.preserve_asts - tree, parse_errors, type_ignores = mypy.nativeparse.native_parse( - fnam, - options, - skip_function_bodies=strip_function_bodies, - imports_only=imports_only, + tree, _, _ = mypy.nativeparse.native_parse( + fnam, options, skip_function_bodies=strip_function_bodies ) - # Convert type ignores list to dict - tree.ignored_lines = dict(type_ignores) # Set is_stub based on file extension tree.is_stub = fnam.endswith(".pyi") - # Note: tree.imports is populated directly by native_parse with deserialized + # Note: tree.imports is populated directly by load_from_raw() with deserialized # import metadata, so we don't need to collect imports via AST traversal - return tree, parse_errors + if eager and tree.raw_data is not None: + tree = load_from_raw(fnam, module, tree.raw_data, errors, options) + return tree # Fall through to fastparse for non-existent files - assert not imports_only if options.transform_source is not None: source = options.transform_source(source) import mypy.fastparse - tree = mypy.fastparse.parse(source, fnam=fnam, module=module, errors=errors, options=options) - return tree, [] + return mypy.fastparse.parse(source, fnam=fnam, module=module, errors=errors, options=options) def load_from_raw( - fnam: str, module: str | None, raw_data: FileRawData, errors: Errors, options: Options + fnam: str, + module: str | None, + raw_data: FileRawData, + errors: Errors, + options: Options, + imports_only: bool = False, ) -> MypyFile: - """Load AST from parsed binary data. - - This essentially replicates parse() above but expects FileRawData instead of actually - parsing the source code in the file. 
- """ + """Load AST from parsed binary data and report stored errors.""" from mypy.nativeparse import State, deserialize_imports, read_statements - # This part mimics the logic in native_parse(). - data = ReadBuffer(raw_data.defs) - n = read_int(data) state = State(options) - defs = read_statements(state, data, n) + if imports_only: + defs = [] + else: + data = ReadBuffer(raw_data.defs) + n = read_int(data) + defs = read_statements(state, data, n) imports = deserialize_imports(raw_data.imports) tree = MypyFile(defs, imports) @@ -83,6 +82,8 @@ def load_from_raw( tree.is_partial_stub_package = raw_data.is_partial_stub_package tree.uses_template_strings = raw_data.uses_template_strings tree.is_stub = fnam.endswith(".pyi") + if module is not None: + tree._fullname = module # Report parse errors, this replicates the logic in parse(). all_errors = raw_data.raw_errors + state.errors @@ -90,6 +91,10 @@ def load_from_raw( for error in all_errors: # Note we never raise in this function, so it should not be called in coordinator. report_parse_error(error, errors) + if imports_only: + # Preserve raw data when only de-serializing imports, it will be sent to + # the parallel workers. + tree.raw_data = raw_data return tree diff --git a/mypy/stubgen.py b/mypy/stubgen.py index 38bd1f228e6e..9c682ba4b820 100755 --- a/mypy/stubgen.py +++ b/mypy/stubgen.py @@ -1744,17 +1744,16 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None: data = f.read() source = mypy.util.decode_python_encoding(data) errors = Errors(mypy_options) - mod.ast, errs = mypy.parse.parse( + mod.ast = mypy.parse.parse( source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, file_exists=True, + eager=True, ) mod.ast._fullname = mod.module - for err in errs: - mypy.parse.report_parse_error(err, errors) if errors.is_blockers(): # Syntax error! 
for m in errors.new_messages(): diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py index 94be60e328b7..f9a18ea992c2 100644 --- a/mypy/test/test_nativeparse.py +++ b/mypy/test/test_nativeparse.py @@ -12,6 +12,8 @@ import unittest from collections.abc import Iterator +from librt.internal import ReadBuffer + from mypy import defaults, nodes from mypy.cache import ( END_TAG, @@ -21,6 +23,7 @@ LITERAL_NONE, LITERAL_STR, LOCATION, + read_int, ) from mypy.config_parser import parse_mypy_comments from mypy.errors import CompileError @@ -33,7 +36,13 @@ # If the experimental ast_serialize module isn't installed, the following import will fail # and we won't run any native parser tests. try: - from mypy.nativeparse import native_parse, parse_to_binary_ast + from mypy.nativeparse import ( + State, + deserialize_imports, + native_parse, + parse_to_binary_ast, + read_statements, + ) has_nativeparse = True except ImportError: @@ -90,6 +99,7 @@ def test_parser(testcase: DataDrivenTestCase) -> None: try: with temp_source(source) as fnam: node, errors, type_ignores = native_parse(fnam, options, skip_function_bodies) + errors += load_tree(node, options) node.path = "main" a = node.str_with_options(options).split("\n") a = [format_error(err) for err in errors] + a @@ -113,6 +123,18 @@ def format_ignore(ignore: tuple[int, list[str]]) -> str: return f"ignore: {line} [{', '.join(codes)}]" +def load_tree(node: MypyFile, options: Options) -> list[ParseError]: + """Deserialize full AST from serialized raw data.""" + assert node.raw_data is not None + state = State(options) + data = ReadBuffer(node.raw_data.defs) + n = read_int(data) + node.defs = read_statements(state, data, n) + node.imports = deserialize_imports(node.raw_data.imports) + node.raw_data = None + return state.errors + + def test_parser_imports(testcase: DataDrivenTestCase) -> None: """Perform a single native parser imports test case. 
@@ -128,7 +150,7 @@ def test_parser_imports(testcase: DataDrivenTestCase) -> None: try: with temp_source(source) as fnam: node, errors, type_ignores = native_parse(fnam, options) - + errors += load_tree(node, options) # Extract and format reachable imports a = format_reachable_imports(node) a = [format_error(err) for err in errors] + a diff --git a/mypy/test/testparse.py b/mypy/test/testparse.py index 09177126426d..6d00f5b5710f 100644 --- a/mypy/test/testparse.py +++ b/mypy/test/testparse.py @@ -60,13 +60,14 @@ def test_parser(testcase: DataDrivenTestCase) -> None: try: errors = Errors(options) - n, _ = parse( + n = parse( bytes(source, "ascii"), fnam="main", module="__main__", errors=errors, options=options, file_exists=False, + eager=True, ) if errors.is_errors(): errors.raise_error() @@ -108,6 +109,7 @@ def test_parse_error(testcase: DataDrivenTestCase) -> None: errors=errors, options=options, file_exists=False, + eager=True, ) if errors.is_errors(): errors.raise_error() From 283d0e494d25fb5aaaf72115a34d176ac8466763 Mon Sep 17 00:00:00 2001 From: Ivan Levkivskyi Date: Sat, 18 Apr 2026 14:36:44 +0100 Subject: [PATCH 2/3] Fast path for single-file SCCs --- mypy/build.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mypy/build.py b/mypy/build.py index 121b261e28fd..abc60d24d2ae 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1023,7 +1023,12 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: sequential_states.append(state) continue parallel_states.append(state) - self.parse_parallel(sequential_states, parallel_states) + if len(parallel_states) > 1: + self.parse_parallel(sequential_states, parallel_states) + else: + # Avoid using executor when there is no parallelism. 
+ for state in states: + state.parse_file() if post_parse: self.post_parse_all(states) From 002cbd9b314b7ea7fc923a042a1ef2694a48289a Mon Sep 17 00:00:00 2001 From: Ivan Levkivskyi Date: Sat, 18 Apr 2026 14:42:38 +0100 Subject: [PATCH 3/3] Work around mypyc --- mypy/build.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index abc60d24d2ae..9a478c549d4b 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -3106,7 +3106,8 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = modules in any way. Logic here should be kept in sync with BuildManager.parse_all(). """ self.needs_parse = False - if self.tree is not None: + tree = self.tree + if tree is not None: # The file was already parsed. return @@ -3121,15 +3122,14 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = with self.wrap_context(): manager.errors.set_file(self.xpath, self.id, options=self.options) self.parse_file_inner(source, raw_data) - tree: MypyFile | None = self.tree - assert tree is not None + assert self.tree is not None # New parser returns serialized trees that need to be de-serialized. - if tree.raw_data is not None: + if self.tree.raw_data is not None: assert raw_data is None self.tree = load_from_raw( self.xpath, self.id, - tree.raw_data, + self.tree.raw_data, manager.errors, self.options, imports_only=bool(self.manager.workers),