 # LICENSE file in the root directory of this source tree.


+import functools
+
 import torch
 from torch import nn
 from torch.distributed.fsdp import MixedPrecisionPolicy
 from torch.distributed.tensor.placement_types import Replicate, Shard
 from torch.testing._internal.distributed.fake_pg import FakeStore
+from torch.utils.checkpoint import create_selective_checkpoint_contexts

 from autoparallel.api import AutoParallel


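+# Selective activation checkpointing (SAC) policy: keep the attention outputs
+# from the checkpointed region and recompute everything else in backward.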
+def policy_fn(ctx, op, *args, **kwargs):
+    if (
+        op == torch.ops.aten._scaled_dot_product_flash_attention.default
+        or op == torch.ops.aten._scaled_dot_product_efficient_attention.default
+    ):
+        # NOTE: we can't save nondeterministic_seeded ops; the run-with-rng wrapper is not traceable yet
+        return torch.utils.checkpoint.CheckpointPolicy.PREFER_SAVE
+    return torch.utils.checkpoint.CheckpointPolicy.PREFER_RECOMPUTE
+
+
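+# create_selective_checkpoint_contexts builds the context-manager pair that
+# torch.utils.checkpoint uses to apply policy_fn during forward and recompute.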
+context_fn = functools.partial(create_selective_checkpoint_contexts, policy_fn)
+
+
 class Block(nn.Module):
     def __init__(self, nheads, dim1, dim2):
         super().__init__()
@@ -48,7 +64,7 @@ def _compute_attention(self, x):

     def forward(self, x):
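+        # context_fn applies the SAC policy above inside this checkpointed region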
         o = torch.utils.checkpoint.checkpoint(
-            self._compute_attention, x, use_reentrant=False
+            self._compute_attention, x, use_reentrant=False, context_fn=context_fn
         )

         o0 = o + x
@@ -103,7 +119,6 @@ def input_fn():

 mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
 # mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16)
-# mp_policy = None

 with AutoParallel(model, input_fn, mesh, mp_policy, compile=True) as autop:
     assert any(n.meta.get("nn_module_stack") for n in autop.gm.graph.nodes)
@@ -128,16 +143,34 @@ def input_fn():
 out = parallel_mod(*x)
 out.backward(torch.randn_like(out))

-print("All good!")
+# Validate that the recompute tags recorded in the traced graph match policy_fn
+seqs = set()
+for n in autop.gm.graph.nodes:
+    if "checkpoint" in n.meta.get(
+        "stack_trace", ""
+    ):  # placeholders don't have a stack trace
+        is_bwd = n.meta.get("partitioner_tag", "") == "is_backward"
+        if not is_bwd:
+            if "getitem" in str(n.target):
+                # getitem nodes are tagged the same as their parent
+                expected = policy_fn(None, n.args[0].target, (), ())
+            else:
+                expected = policy_fn(None, n.target, (), ())
+            actual = n.meta.get("recompute")
+            # NOTE: this assert only supports policy_fns that depend on the op alone
+            assert actual == expected
+            seqs.add(n.meta["seq_nr"])
+        else:
+            # the forward counterpart should already have populated seqs
+            assert n.meta["seq_nr"] in seqs

 mm_nodes = autop.gm.graph.find_nodes(
     op="call_function", target=torch.ops.aten.mm.default
 )

-# assert (
-#     mm_nodes[0].meta.get("recompute")
-#     == torch.utils.checkpoint.CheckpointPolicy.PREFER_RECOMPUTE
-# )
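+# mm ops don't match the SDPA check in policy_fn, so they fall through to
+# PREFER_RECOMPUTE; the first mm node's recompute tag should reflect that.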
+assert (
+    mm_nodes[0].meta.get("recompute")
+    == torch.utils.checkpoint.CheckpointPolicy.PREFER_RECOMPUTE
+)

-# TODO: change this assert once we fix AC
-assert mm_nodes[0].meta.get("recompute") is None
+print("All good!")