Empty file added __init__.py
Empty file added backends/nxp/__init__.py
25 changes: 18 additions & 7 deletions backends/nxp/quantizer/utils.py
@@ -1,5 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# Copyright 2024-2025 NXP
# Copyright 2024-2026 NXP
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -10,7 +10,7 @@
import itertools
from collections import OrderedDict
from collections.abc import Iterable
from typing import Any, Dict, List, Tuple, Type
from typing import Any, Callable, Dict, List, Tuple, Type

import torch
from executorch.backends.nxp.aten_passes.fuse_batch_norm_with_linear_pass import (
@@ -30,8 +30,10 @@
check_subgraphs_connected,
SourcePartition,
)

from torchao.quantization.pt2e import (
move_exported_model_to_eval,
move_exported_model_to_train,
ObserverOrFakeQuantize,
)
from torchao.quantization.pt2e.quantize_pt2e import (
@@ -176,16 +178,17 @@ def calibrate_and_quantize(
calibration_inputs: Iterable[tuple[torch.Tensor, ...]],
quantizer: Quantizer,
is_qat: bool = False,
train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
) -> fx.GraphModule:
"""Quantize the provided model.

    :param model: ATen model (or its GraphModule representation) to quantize.
:param calibration_inputs: Either a tuple of calibration input tensors where each element corresponds to a model
input. Or an iterator over such tuples.
:param calibration_inputs: An iterator over tuples of calibration input tensors where each tensor corresponds to a
model input.
:param quantizer: Quantizer to use.
    :param is_qat: Whether quantization is done using Quantization Aware Training (QAT) or not.
                   Note: In QAT mode without a `train_fn`, training is not performed. Only
                   calibration (in eval mode) is done.
    :param train_fn: Optional training function called on the prepared module during QAT.
                     When provided, it replaces the calibration loop.
:return: Quantized GraphModule.
"""

@@ -195,12 +198,20 @@
if is_qat:
m = prepare_qat_pt2e(model, quantizer)
m = AddSimulatedLinearBatchNormFusionQATPass()(m).graph_module

if train_fn:
m = move_exported_model_to_train(m)
train_fn(m)

m = move_exported_model_to_eval(m)
m = RemoveSimulatedLinearBatchNormFusionQATPass()(m).graph_module
m = FuseBatchNormWithLinearPass()(m).graph_module
else:
m = prepare_pt2e(model, quantizer)

for data in calibration_inputs:
m(*data)
    # Calibration is skipped only when QAT training is performed via `train_fn`.
    if not (is_qat and train_fn):
        for data in calibration_inputs:
            m(*data)

if is_qat:
m = RemoveSimulatedLinearBatchNormFusionQATPass()(m).graph_module
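For reviewers, a minimal sketch of how the new `train_fn` hook might be used; `exported_model`, `my_quantizer`, and `train_loader` are hypothetical stand-ins, not part of this PR:

```python
import torch

from executorch.backends.nxp.quantizer.utils import calibrate_and_quantize


def my_train_fn(m: torch.fx.GraphModule) -> None:
    # Stand-in fine-tuning loop; `train_loader` is a hypothetical DataLoader.
    optimizer = torch.optim.SGD(m.parameters(), lr=1e-3)
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(m(inputs), labels)
        loss.backward()
        optimizer.step()


quantized = calibrate_and_quantize(
    exported_model,  # hypothetical exported ATen GraphModule
    calibration_inputs=[],  # unused here: QAT with a train_fn skips calibration
    quantizer=my_quantizer,  # hypothetical PT2E quantizer
    is_qat=True,
    train_fn=my_train_fn,
)
```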
Empty file.
@@ -21,7 +21,7 @@

NSYS_PATH = pathlib.Path(shutil.which("nsys"))
NSYS_CONFIG_PATH = os.path.join(
PROJECT_DIR, "backends", "nxp", "tests_models", "neutron-imxrt700.ini"
PROJECT_DIR, "backends", "nxp", "tests", "neutron-imxrt700.ini"
)
NSYS_FIRMWARE_PATH = os.path.join(
os.path.dirname(eiq_neutron_sdk.__file__),
@@ -12,6 +12,6 @@

logger.debug("Importing from executorch-integration")
except ImportError:
import executorch.backends.nxp.tests_models.config as test_config # noqa F401
import executorch.backends.nxp.tests.config as test_config # noqa F401

logger.debug("Importing from executorch")
@@ -8,7 +8,7 @@
import pathlib
import shutil

from executorch.backends.nxp.tests_models.outputs_dir_importer import outputs_dir
from executorch.backends.nxp.tests.outputs_dir_importer import outputs_dir


def pytest_addoption(parser):
@@ -14,8 +14,8 @@
import numpy as np
import torch
from executorch.backends.nxp.backend.ir.converter.conversion import translator
from executorch.backends.nxp.tests_models.calibration_dataset import CalibrationDataset
from executorch.backends.nxp.tests_models.model_input_spec import ModelInputSpec
from executorch.backends.nxp.tests.calibration_dataset import CalibrationDataset
from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec
from torch import Tensor


161 changes: 121 additions & 40 deletions backends/nxp/tests/executorch_pipeline.py
@@ -8,9 +8,10 @@
import re
from dataclasses import dataclass
from functools import partial
from typing import Callable
from typing import Callable, Iterable

import eiq_neutron_sdk
import numpy as np
import torch

from executorch import exir
@@ -28,7 +29,6 @@
RemoveIOQuantOpsPass,
)
from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner

from executorch.backends.nxp.nxp_backend import (
core_aten_ops_exception_list,
generate_neutron_compile_spec,
@@ -42,7 +42,7 @@
ExecutorchProgramManager,
to_edge_transform_and_lower,
)
from torch import nn
from torch import memory_format, nn
from torch.export import export
from torchao.quantization.pt2e.quantizer import Quantizer

@@ -52,7 +52,9 @@
@dataclass
class ModelInputSpec:
shape: tuple[int, ...]
type: np.dtype = np.float32
dtype: torch.dtype = torch.float32
dim_order: memory_format = torch.contiguous_format


def handle_kernel_selection(model_name: str = ""):
@@ -81,11 +83,11 @@ def handle_kernel_selection(model_name: str = ""):


def get_random_calibration_inputs(
input_spec: tuple[ModelInputSpec, ...]
input_spec: Iterable[ModelInputSpec], num_samples: int = 4
) -> list[tuple[torch.Tensor, ...]]:
return [
tuple([torch.randn(spec.shape, dtype=spec.dtype) for spec in input_spec])
for _ in range(4)
for _ in range(num_samples)
]
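As a quick illustration of the new `num_samples` parameter (shapes made up):

```python
# Two random calibration samples for a hypothetical two-input model.
calib = get_random_calibration_inputs(
    (ModelInputSpec(shape=(1, 8)), ModelInputSpec(shape=(1, 4))),
    num_samples=2,
)
assert len(calib) == 2 and calib[0][0].shape == (1, 8)
```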


@@ -94,35 +96,91 @@ def _get_default_quantizer(target_spec: NeutronTargetSpec, use_qat: bool) -> Qua


def to_model_input_spec(
input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]]
    input_spec: Iterable[ModelInputSpec] | tuple[int, ...] | list[tuple[int, ...]]
) -> tuple[ModelInputSpec, ...]:
if isinstance(input_spec, tuple) and all(
isinstance(spec, ModelInputSpec) for spec in input_spec
):
return input_spec

elif isinstance(input_spec, tuple) and all(
isinstance(spec, int) for spec in input_spec
):
return (ModelInputSpec(input_spec),)

elif isinstance(input_spec, list) and all(
isinstance(input_shape, tuple) for input_shape in input_spec
):
return tuple([ModelInputSpec(spec) for spec in input_spec])
else:
raise TypeError(f"Unsupported type {type(input_spec)}")
match input_spec:
case tuple() | list() if all(
isinstance(spec, ModelInputSpec) for spec in input_spec
):
return tuple(input_spec)
case tuple() if all(isinstance(spec, int) for spec in input_spec):
return (ModelInputSpec(input_spec),)
case list() if all(
isinstance(input_shape, tuple) for input_shape in input_spec
):
return tuple(ModelInputSpec(spec) for spec in input_spec)
case _:
raise TypeError(f"Unsupported type {type(input_spec)}")
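For readers skimming the refactor, the three accepted `input_spec` forms normalize like this (illustrative shapes):

```python
to_model_input_spec((1, 10))  # single input shape -> (ModelInputSpec((1, 10)),)
to_model_input_spec([(1, 10), (4,)])  # one shape tuple per model input
to_model_input_spec((ModelInputSpec((1, 10)),))  # already-normalized specs pass through
```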


GetCalibrationInputsFn = Callable[
[tuple[ModelInputSpec, ...]], Iterable[tuple[torch.Tensor, ...]]
]


def get_calibration_inputs_fn_from_dataset_dir(dataset_dir: str) -> GetCalibrationInputsFn:
def _nested(
input_spec: tuple[ModelInputSpec, ...]
) -> Iterable[tuple[torch.Tensor, ...]]:
data = sorted(os.listdir(dataset_dir))
inputs_needed = len(input_spec)

for path in data:
path = os.path.join(dataset_dir, path)
files = []

if os.path.isdir(path):
files = [os.path.join(path, x) for x in sorted(os.listdir(path))]
else:
files.append(path)

            input_data = []
            for idx, file in enumerate(files):
                if len(input_data) == inputs_needed:
                    break

                # `ModelInputSpec.dtype` is a torch.dtype; map it to the matching
                # numpy dtype before reading the raw file.
                np_dtype = torch.empty(0, dtype=input_spec[idx].dtype).numpy().dtype
                tensor = np.fromfile(file, dtype=np_dtype).reshape(
                    input_spec[idx].shape
                )
                input_data.append(torch.from_numpy(tensor))

if len(input_data) < inputs_needed:
continue

yield tuple(input_data)

return _nested
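As I read the loader, it expects one raw `ndarray.tofile` dump per model input: a single file per sample, or one subdirectory per sample for multi-input models. A hypothetical call (directory name and shape are stand-ins, and this assumes the dtype handling above):

```python
spec = (ModelInputSpec(shape=(1, 3, 32, 32)),)
load_inputs = get_calibration_inputs_fn_from_dataset_dir("./calib_data")
for sample in load_inputs(spec):
    assert sample[0].shape == (1, 3, 32, 32)
```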


def _get_example_input(
input_spec: tuple[ModelInputSpec, ...]
) -> tuple[torch.Tensor, ...]:
example_input = []
for spec in input_spec:
match spec.dim_order:
case torch.contiguous_format:
sample = torch.ones(spec.shape, dtype=spec.dtype)
case torch.channels_last:
sample = torch.ones(spec.shape, dtype=spec.dtype).to(
memory_format=torch.channels_last
)
case _:
raise ValueError(f"Unsupported dim_order: {spec.dim_order}")
# noinspection PyUnboundLocalVariable
example_input.append(sample)

return tuple(example_input)
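A channels-last spec flows through `_get_example_input` as follows (illustrative shape):

```python
spec = ModelInputSpec(
    shape=(1, 3, 224, 224),
    dtype=torch.float32,
    dim_order=torch.channels_last,
)
(example,) = _get_example_input((spec,))
assert example.is_contiguous(memory_format=torch.channels_last)
```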


def to_quantized_edge_program(
model: torch.nn.Module,
input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]],
input_spec: Iterable[ModelInputSpec] | tuple[int, ...] | list[tuple[int, ...]],
    operators_not_to_delegate: list[str] | None = None,
get_calibration_inputs_fn: Callable[
[tuple[ModelInputSpec, ...]], list[tuple[torch.Tensor, ...]]
] = get_random_calibration_inputs,
get_calibration_inputs_fn: GetCalibrationInputsFn = get_random_calibration_inputs,
target: str = "imxrt700",
use_qat: bool = False,
train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
remove_quant_io_ops: bool = False,
custom_delegation_options: CustomDelegationOptions = CustomDelegationOptions(), # noqa B008
get_quantizer_fn: Callable[[], Quantizer] | None = None,
@@ -131,15 +189,16 @@ def to_quantized_edge_program(
fetch_constants_to_sram: bool = False,
dump_kernel_selection_code: bool = False,
use_new_flow_neutron_c: bool = False,
    delegate_to_npu: bool = True,
) -> EdgeProgramManager:
_neutron_target_spec = NeutronTargetSpec(target)
if get_quantizer_fn is None:
get_quantizer_fn = partial(
_get_default_quantizer, _neutron_target_spec, use_qat
)

calibration_inputs = get_calibration_inputs_fn(to_model_input_spec(input_spec))
example_input = calibration_inputs[0]
input_spec = to_model_input_spec(input_spec)
calibration_inputs = get_calibration_inputs_fn(input_spec)
example_input = _get_example_input(input_spec)

# Make sure the model is in the evaluation mode.
model.eval()
@@ -151,6 +210,7 @@
calibration_inputs=calibration_inputs,
quantizer=get_quantizer_fn(),
is_qat=use_qat,
train_fn=train_fn,
)

# List of operators to not decompose during the lowering.
@@ -166,15 +226,18 @@
post_quant_state_dict = (
exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None
)
partitioners = [
NeutronPartitioner(
compile_spec,
_neutron_target_spec,
custom_delegation_options,
post_quant_state_dict,
preserve_ops=preserve_ops,
)
]
if delegate_to_npu:
partitioners = [
NeutronPartitioner(
compile_spec,
_neutron_target_spec,
custom_delegation_options,
post_quant_state_dict,
preserve_ops=preserve_ops,
)
]
else:
partitioners = []

edge_program_manager = to_edge_transform_and_lower(
export(exir_program_aten__module_quant, example_input, strict=True),
@@ -203,15 +266,33 @@

def to_quantized_executorch_program(
model: torch.nn.Module,
input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]],
input_spec: Iterable[ModelInputSpec] | tuple[int, ...] | list[tuple[int, ...]],
use_qat: bool = False,
train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
use_neutron_for_format_conversion: bool = True,
dataset_dir: str | None = None,
    delegate_to_npu: bool = True,
use_new_flow_neutron_c: bool = False,
) -> ExecutorchProgramManager:
    if dataset_dir:
        # Extract calibration data from a directory instead of generating random data.
        calibration_kwargs = {
            "get_calibration_inputs_fn": get_calibration_inputs_fn_from_dataset_dir(
                dataset_dir
            )
        }
    else:
        calibration_kwargs = {}  # Fall back to the default parameter value.

edge_program_manager = to_quantized_edge_program(
model,
input_spec,
use_qat=use_qat,
train_fn=train_fn,
use_neutron_for_format_conversion=use_neutron_for_format_conversion,
delegate_to_npu=delegate_to_npu,
use_new_flow_neutron_c=use_new_flow_neutron_c,
        **calibration_kwargs,
)

return edge_program_manager.to_executorch(
@@ -221,7 +302,7 @@ def to_quantized_executorch_program(

def to_edge_program(
model: nn.Module,
input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]],
input_spec: Iterable[ModelInputSpec] | tuple[int, ...] | list[tuple[int, ...]],
) -> EdgeProgramManager:
calibration_inputs = get_random_calibration_inputs(to_model_input_spec(input_spec))

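Putting the new knobs together, a sketch of an end-to-end call; `model` and `my_train_fn` are stand-ins (see the QAT sketch above):

```python
program = to_quantized_executorch_program(
    model,  # hypothetical nn.Module
    (1, 3, 224, 224),  # single-input shape shorthand
    use_qat=True,
    train_fn=my_train_fn,  # QAT training replaces calibration
    dataset_dir=None,  # or a directory of raw calibration dumps
    delegate_to_npu=False,  # skip the NeutronPartitioner entirely
)
```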
Empty file.
@@ -16,7 +16,7 @@ def test_aot_example__mobilenet_v2():
"""Test that mobilenet can be lowered to Neutron backend via `aot_neutron_compile.py` and all ops are delegated."""

# Find the executorch root directory (4 levels up from this test file)
Contributor review comment: update to 5 levels up

executorch_root = Path(__file__).parent.parent.parent.parent
executorch_root = Path(__file__).parent.parent.parent.parent.parent
assert executorch_root.exists(), f"Executorch root not found at {executorch_root}"

# Run the compilation script as a module (like run_aot_example.sh does)