Commit b5eb863

get DS3 running forward, OOM at backward
1 parent 1dbcdfe commit b5eb863

File tree

autoparallel/api.py
examples/example_ds3.py

2 files changed (+21, -4 lines)

autoparallel/api.py

Lines changed: 2 additions & 4 deletions

@@ -281,10 +281,8 @@ def build_model_graph(self):
         # we basically want to remove noops in here
         prev = torch._inductor.config.pattern_matcher
         torch._inductor.config.pattern_matcher = False
-        try:
-            gm = joint_graph_passes(gm)
-        finally:
-            torch._inductor.config.pattern_matcher = prev
+        gm = joint_graph_passes(gm)
+        torch._inductor.config.pattern_matcher = prev
         remove_assert_ops(gm.graph)
         gm.graph.eliminate_dead_code()
         gm.recompile()
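
The removed try/finally guaranteed that torch._inductor.config.pattern_matcher was restored even if joint_graph_passes raised; the new version restores it only on the success path. A minimal sketch of an equivalent, exception-safe toggle written as a context manager (illustrative only, assuming torch._inductor is importable; not part of the autoparallel API):

import contextlib

import torch._inductor.config as inductor_config


@contextlib.contextmanager
def pattern_matcher_disabled():
    # Save the current flag, disable the inductor pattern matcher, and
    # restore the previous value even if the wrapped code raises.
    prev = inductor_config.pattern_matcher
    inductor_config.pattern_matcher = False
    try:
        yield
    finally:
        inductor_config.pattern_matcher = prev


# Hypothetical usage around the pass from the diff:
# with pattern_matcher_disabled():
#     gm = joint_graph_passes(gm)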

examples/example_ds3.py

Lines changed: 19 additions & 0 deletions

@@ -851,3 +851,22 @@ def input_fn():
 autop.add_output_constraints([x_sharding])

 sharding_placement = autop.optimize_placement()
+parallel_mod = autop.apply_placement(sharding_placement)
+
+# run weight init on our sharded DTensor params
+parallel_mod.to_empty(device="cuda")
+parallel_mod.init_weights(init_std=0.02, buffer_device="cuda")  # maybe not correct value
+
+# # now let's run it
+x = (
+    torch.randn(
+        # 0,
+        # args.vocab_size,
+        (bs // mesh.shape[0], seqlen, dim),
+        device=torch.device("cuda"),
+        dtype=torch.bfloat16
+    ),
+)
+out = parallel_mod(*x)
+out.backward(torch.randn_like(out))
+print("All good!")
