From 17d38f285f4df4c3a52228f224aaf21286f91a3f Mon Sep 17 00:00:00 2001
From: Yue Sun <yuesu@microsoft.com>
Date: Thu, 25 Jun 2026 12:48:33 +0800
Subject: [PATCH 1/2] test(quantize): use fake model for image-segmentation e2e
 calibration

The image-segmentation quantize e2e test exported and calibrated the real
nvidia/segformer-b0-finetuned-ade-512-512 model, whose heavy backbone caused
random hangs on QNN hosts. Replace it with a tiny FP32 conv model that has
identical segmentation I/O (pixel_values [batch, 3, H, W] -> logits
[batch, num_labels, H/4, W/4]) and random weights, built via the new
_build_fake_segmentation_onnx() helper.

The ImageSegmentationDataset path is still fully exercised: the test keeps
--model-name so the real image processor and dataset samples feed real
pixel_values into calibration; only the heavy model is swapped out.
---
 tests/e2e/test_quantize_e2e.py | 86 +++++++++++++++++++++++++++++++---
 1 file changed, 80 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/test_quantize_e2e.py b/tests/e2e/test_quantize_e2e.py
index d95288714..3452f08ad 100644
--- a/tests/e2e/test_quantize_e2e.py
+++ b/tests/e2e/test_quantize_e2e.py
@@ -62,6 +62,72 @@ def _build_tiny_onnx(path: Path, *, with_metadata: bool = True) -> None:
     onnx.save(model, str(path))
 
 
+# Image-segmentation I/O contract shared by HF semantic-segmentation exports
+# (e.g. nvidia/segformer-*-ade-*): pixel_values [batch, 3, height, width] ->
+# logits [batch, num_labels, height/4, width/4]. 150 = ADE20K class count.
+_SEG_NUM_CHANNELS = 3
+_SEG_NUM_LABELS = 150
+
+
+def _build_fake_segmentation_onnx(path: Path) -> None:
+    """Build a tiny FP32 ONNX with segmentation-identical I/O and random weights.
+
+    Replaces a real HF segmentation export (heavy backbone that can randomly
+    hang on QNN hosts during calibration) with a small conv stack that keeps the
+    same input/output contract: ``pixel_values`` [batch, 3, height, width] ->
+    ``logits`` [batch, num_labels, height/4, width/4]. Spatial dims stay dynamic
+    so the model accepts both calibration inputs (e.g. 512x512) and the test's
+    degenerate 1x1 inference probe. Two stride-2 convs reproduce the /4 logits
+    resolution; a 1x1 conv acts as the classifier head.
+    """
+    rng = np.random.default_rng(1234)
+    pixel_values = onnx.helper.make_tensor_value_info(
+        "pixel_values",
+        onnx.TensorProto.FLOAT,
+        ["batch_size", _SEG_NUM_CHANNELS, "height", "width"],
+    )
+    logits = onnx.helper.make_tensor_value_info(
+        "logits",
+        onnx.TensorProto.FLOAT,
+        ["batch_size", _SEG_NUM_LABELS, "height_out", "width_out"],
+    )
+
+    def _w(shape: tuple[int, ...], name: str) -> onnx.TensorProto:
+        return onnx.numpy_helper.from_array(
+            (rng.standard_normal(shape) * 0.1).astype(np.float32), name
+        )
+
+    w1 = _w((8, _SEG_NUM_CHANNELS, 3, 3), "seg_W1")
+    b1 = _w((8,), "seg_B1")
+    w2 = _w((16, 8, 3, 3), "seg_W2")
+    b2 = _w((16,), "seg_B2")
+    w3 = _w((_SEG_NUM_LABELS, 16, 1, 1), "seg_W3")
+    b3 = _w((_SEG_NUM_LABELS,), "seg_B3")
+    nodes = [
+        onnx.helper.make_node(
+            "Conv", ["pixel_values", "seg_W1", "seg_B1"], ["c1"],
+            name="Conv_1", kernel_shape=[3, 3], strides=[2, 2], pads=[1, 1, 1, 1],
+        ),
+        onnx.helper.make_node("Relu", ["c1"], ["r1"], name="Relu_1"),
+        onnx.helper.make_node(
+            "Conv", ["r1", "seg_W2", "seg_B2"], ["c2"],
+            name="Conv_2", kernel_shape=[3, 3], strides=[2, 2], pads=[1, 1, 1, 1],
+        ),
+        onnx.helper.make_node("Relu", ["c2"], ["r2"], name="Relu_2"),
+        onnx.helper.make_node(
+            "Conv", ["r2", "seg_W3", "seg_B3"], ["logits"],
+            name="Classifier", kernel_shape=[1, 1], strides=[1, 1], pads=[0, 0, 0, 0],
+        ),
+    ]
+    graph = onnx.helper.make_graph(
+        nodes, "fake_segmentation", [pixel_values], [logits], [w1, b1, w2, b2, w3, b3]
+    )
+    model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 17)])
+    model.ir_version = 8
+    onnx.checker.check_model(model)
+    onnx.save(model, str(path))
+
+
 @pytest.fixture(scope="session")
 def tiny_onnx(tmp_path_factory: pytest.TempPathFactory) -> Path:
     d = tmp_path_factory.mktemp("tiny_quant")
@@ -157,12 +223,20 @@ def onnx_objdet() -> Path:
 
 
 @pytest.fixture(scope="session")
-def onnx_imgseg() -> Path:
-    return _export_hf_to_onnx(
-        "nvidia/segformer-b0-finetuned-ade-512-512",
-        "image-segmentation",
-        "segformer_b0",
-    )
+def onnx_imgseg(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """Fake segmentation ONNX standing in for a real HF export.
+
+    The real ``nvidia/segformer-b0-finetuned-ade-512-512`` export ran as the
+    calibration model here, but its heavy backbone caused random hangs on QNN
+    hosts. This builds a tiny model with identical segmentation I/O instead, so
+    calibration still exercises the ImageSegmentationDataset path without running
+    a large model. The dataset itself (image processor + samples) is still loaded
+    from the real ``--model-name`` in the test.
+    """
+    d = tmp_path_factory.mktemp("fake_imgseg")
+    p = d / "model.onnx"
+    _build_fake_segmentation_onnx(p)
+    return p
 
 
 @pytest.fixture(scope="session")

From e856655b4b068bc01ae0cc6513cd4abdce84039b Mon Sep 17 00:00:00 2001
From: Yue Sun <yuesu@microsoft.com>
Date: Thu, 25 Jun 2026 14:45:13 +0800
Subject: [PATCH 2/2] test(quantize): move fake segmentation model into
 create_test_models

Address review feedback: relocate the fake image-segmentation model builder
out of the e2e test and into the shared tests/fixtures/create_test_models.py
as create_fake_segmentation_model(), matching that module's existing
create_*_model() -> ModelProto convention and wiring it into main(). Add
tests/fixtures/__init__.py so the helper is importable (consistent with
tests/assets). The quantize e2e fixture now imports and saves the shared model.

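No functional change to the test: the I/O contract (pixel_values
[batch, 3, H, W] -> logits [batch, num_labels, H/4, W/4]) and the
seeded-random weights (same seed, shapes and draw order) are unchanged.
Only model metadata differs: the graph is now named "FakeSegmentation"
(was "fake_segmentation") and producer_name is set.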
---
 tests/e2e/test_quantize_e2e.py       |  78 ++-----------------
 tests/fixtures/__init__.py           |   4 +
 tests/fixtures/create_test_models.py | 110 +++++++++++++++++++++++++++
 3 files changed, 121 insertions(+), 71 deletions(-)
 create mode 100644 tests/fixtures/__init__.py

diff --git a/tests/e2e/test_quantize_e2e.py b/tests/e2e/test_quantize_e2e.py
index 3452f08ad..faa08ef98 100644
--- a/tests/e2e/test_quantize_e2e.py
+++ b/tests/e2e/test_quantize_e2e.py
@@ -21,6 +21,7 @@
 import onnxruntime as ort
 import pytest
 
+from tests.fixtures.create_test_models import create_fake_segmentation_model
 from winml.modelkit.commands.quantize import quantize as quantize_cmd
 
 
@@ -62,72 +63,6 @@ def _build_tiny_onnx(path: Path, *, with_metadata: bool = True) -> None:
     onnx.save(model, str(path))
 
 
-# Image-segmentation I/O contract shared by HF semantic-segmentation exports
-# (e.g. nvidia/segformer-*-ade-*): pixel_values [batch, 3, height, width] ->
-# logits [batch, num_labels, height/4, width/4]. 150 = ADE20K class count.
-_SEG_NUM_CHANNELS = 3
-_SEG_NUM_LABELS = 150
-
-
-def _build_fake_segmentation_onnx(path: Path) -> None:
-    """Build a tiny FP32 ONNX with segmentation-identical I/O and random weights.
-
-    Replaces a real HF segmentation export (heavy backbone that can randomly
-    hang on QNN hosts during calibration) with a small conv stack that keeps the
-    same input/output contract: ``pixel_values`` [batch, 3, height, width] ->
-    ``logits`` [batch, num_labels, height/4, width/4]. Spatial dims stay dynamic
-    so the model accepts both calibration inputs (e.g. 512x512) and the test's
-    degenerate 1x1 inference probe. Two stride-2 convs reproduce the /4 logits
-    resolution; a 1x1 conv acts as the classifier head.
-    """
-    rng = np.random.default_rng(1234)
-    pixel_values = onnx.helper.make_tensor_value_info(
-        "pixel_values",
-        onnx.TensorProto.FLOAT,
-        ["batch_size", _SEG_NUM_CHANNELS, "height", "width"],
-    )
-    logits = onnx.helper.make_tensor_value_info(
-        "logits",
-        onnx.TensorProto.FLOAT,
-        ["batch_size", _SEG_NUM_LABELS, "height_out", "width_out"],
-    )
-
-    def _w(shape: tuple[int, ...], name: str) -> onnx.TensorProto:
-        return onnx.numpy_helper.from_array(
-            (rng.standard_normal(shape) * 0.1).astype(np.float32), name
-        )
-
-    w1 = _w((8, _SEG_NUM_CHANNELS, 3, 3), "seg_W1")
-    b1 = _w((8,), "seg_B1")
-    w2 = _w((16, 8, 3, 3), "seg_W2")
-    b2 = _w((16,), "seg_B2")
-    w3 = _w((_SEG_NUM_LABELS, 16, 1, 1), "seg_W3")
-    b3 = _w((_SEG_NUM_LABELS,), "seg_B3")
-    nodes = [
-        onnx.helper.make_node(
-            "Conv", ["pixel_values", "seg_W1", "seg_B1"], ["c1"],
-            name="Conv_1", kernel_shape=[3, 3], strides=[2, 2], pads=[1, 1, 1, 1],
-        ),
-        onnx.helper.make_node("Relu", ["c1"], ["r1"], name="Relu_1"),
-        onnx.helper.make_node(
-            "Conv", ["r1", "seg_W2", "seg_B2"], ["c2"],
-            name="Conv_2", kernel_shape=[3, 3], strides=[2, 2], pads=[1, 1, 1, 1],
-        ),
-        onnx.helper.make_node("Relu", ["c2"], ["r2"], name="Relu_2"),
-        onnx.helper.make_node(
-            "Conv", ["r2", "seg_W3", "seg_B3"], ["logits"],
-            name="Classifier", kernel_shape=[1, 1], strides=[1, 1], pads=[0, 0, 0, 0],
-        ),
-    ]
-    graph = onnx.helper.make_graph(
-        nodes, "fake_segmentation", [pixel_values], [logits], [w1, b1, w2, b2, w3, b3]
-    )
-    model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 17)])
-    model.ir_version = 8
-    onnx.checker.check_model(model)
-    onnx.save(model, str(path))
-
-
 @pytest.fixture(scope="session")
 def tiny_onnx(tmp_path_factory: pytest.TempPathFactory) -> Path:
     d = tmp_path_factory.mktemp("tiny_quant")
@@ -228,14 +163,15 @@ def onnx_imgseg(tmp_path_factory: pytest.TempPathFactory) -> Path:
 
     The real ``nvidia/segformer-b0-finetuned-ade-512-512`` export ran as the
     calibration model here, but its heavy backbone caused random hangs on QNN
-    hosts. This builds a tiny model with identical segmentation I/O instead, so
-    calibration still exercises the ImageSegmentationDataset path without running
-    a large model. The dataset itself (image processor + samples) is still loaded
-    from the real ``--model-name`` in the test.
+    hosts. ``create_fake_segmentation_model`` builds a tiny model with identical
+    segmentation I/O instead, so calibration still exercises the
+    ImageSegmentationDataset path without running a large model. The dataset
+    itself (image processor + samples) is still loaded from the real
+    ``--model-name`` in the test.
     """
     d = tmp_path_factory.mktemp("fake_imgseg")
     p = d / "model.onnx"
-    _build_fake_segmentation_onnx(p)
+    onnx.save(create_fake_segmentation_model(), str(p))
     return p
 
 
diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py
new file mode 100644
index 000000000..862c45ce3
--- /dev/null
+++ b/tests/fixtures/__init__.py
@@ -0,0 +1,4 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
diff --git a/tests/fixtures/create_test_models.py b/tests/fixtures/create_test_models.py
index 343836dd8..087eebb66 100644
--- a/tests/fixtures/create_test_models.py
+++ b/tests/fixtures/create_test_models.py
@@ -15,6 +15,7 @@
 
 from pathlib import Path
 
+import numpy as np
 import onnx
 from onnx import TensorProto, helper
 
@@ -201,6 +202,109 @@ def create_multi_op_model() -> onnx.ModelProto:
     return model
 
 
+# Image-segmentation I/O contract shared by HF semantic-segmentation exports
+# (e.g. nvidia/segformer-*-ade-*): pixel_values [batch, 3, height, width] ->
+# logits [batch, num_labels, height/4, width/4]. 150 = ADE20K class count.
+SEG_NUM_CHANNELS = 3
+SEG_NUM_LABELS = 150
+
+
+def create_fake_segmentation_model() -> onnx.ModelProto:
+    """Create a tiny FP32 semantic-segmentation model with random weights.
+
+    Stands in for a real HuggingFace semantic-segmentation export (e.g.
+    ``nvidia/segformer-b0-finetuned-ade-512-512``) whose heavy backbone can
+    randomly hang on QNN hosts during quantization calibration. It keeps the
+    same I/O contract so calibration datasets and the quantizer treat it
+    identically to the real model:
+
+    - Input:  ``pixel_values`` [batch, 3, height, width] (FLOAT)
+    - Output: ``logits`` [batch, num_labels, height/4, width/4] (FLOAT)
+
+    Two stride-2 convs reproduce the ``/4`` logits resolution; a 1x1 conv acts
+    as the classifier head. Spatial dims stay dynamic so the model accepts both
+    calibration inputs (e.g. 512x512) and a degenerate 1x1 inference probe.
+    Weights are seeded-random so regeneration stays deterministic.
+    """
+    rng = np.random.default_rng(1234)
+
+    pixel_values = helper.make_tensor_value_info(
+        "pixel_values",
+        TensorProto.FLOAT,
+        ["batch_size", SEG_NUM_CHANNELS, "height", "width"],
+    )
+    logits = helper.make_tensor_value_info(
+        "logits",
+        TensorProto.FLOAT,
+        ["batch_size", SEG_NUM_LABELS, "height_out", "width_out"],
+    )
+
+    def _weight(shape: tuple[int, ...], name: str) -> onnx.TensorProto:
+        return onnx.numpy_helper.from_array(
+            (rng.standard_normal(shape) * 0.1).astype(np.float32), name
+        )
+
+    w1 = _weight((8, SEG_NUM_CHANNELS, 3, 3), "seg_W1")
+    b1 = _weight((8,), "seg_B1")
+    w2 = _weight((16, 8, 3, 3), "seg_W2")
+    b2 = _weight((16,), "seg_B2")
+    w3 = _weight((SEG_NUM_LABELS, 16, 1, 1), "seg_W3")
+    b3 = _weight((SEG_NUM_LABELS,), "seg_B3")
+
+    nodes = [
+        helper.make_node(
+            "Conv",
+            ["pixel_values", "seg_W1", "seg_B1"],
+            ["c1"],
+            name="Conv_1",
+            kernel_shape=[3, 3],
+            strides=[2, 2],
+            pads=[1, 1, 1, 1],
+        ),
+        helper.make_node("Relu", ["c1"], ["r1"], name="Relu_1"),
+        helper.make_node(
+            "Conv",
+            ["r1", "seg_W2", "seg_B2"],
+            ["c2"],
+            name="Conv_2",
+            kernel_shape=[3, 3],
+            strides=[2, 2],
+            pads=[1, 1, 1, 1],
+        ),
+        helper.make_node("Relu", ["c2"], ["r2"], name="Relu_2"),
+        helper.make_node(
+            "Conv",
+            ["r2", "seg_W3", "seg_B3"],
+            ["logits"],
+            name="Classifier",
+            kernel_shape=[1, 1],
+            strides=[1, 1],
+            pads=[0, 0, 0, 0],
+        ),
+    ]
+
+    graph = helper.make_graph(
+        nodes=nodes,
+        name="FakeSegmentation",
+        inputs=[pixel_values],
+        outputs=[logits],
+        initializer=[w1, b1, w2, b2, w3, b3],
+    )
+
+    model = helper.make_model(
+        graph,
+        opset_imports=[helper.make_opsetid("", 17)],
+        producer_name="WinML CLI Test Fixture Generator",
+    )
+    # Match the quantize e2e fixtures (ir_version 8) so onnxruntime's quantizer
+    # loads it identically to the other tiny models in that suite.
+    model.ir_version = 8
+
+    onnx.checker.check_model(model)
+
+    return model
+
+
 def main() -> None:
     """Generate all test fixture models."""
     fixtures_dir = Path(__file__).parent
@@ -220,6 +324,12 @@ def main() -> None:
     onnx.save(multi_op, str(multi_op_path))
     print(f"✓ Created {multi_op_path}")
 
+    # Generate fake_segmentation.onnx
+    fake_segmentation = create_fake_segmentation_model()
+    fake_segmentation_path = fixtures_dir / "fake_segmentation.onnx"
+    onnx.save(fake_segmentation, str(fake_segmentation_path))
+    print(f"✓ Created {fake_segmentation_path}")
+
     print("\nAll test fixtures generated successfully!")