 
 
 class ApplyShardingInterpreter(torch.fx.Interpreter):
-    def __init__(self, module, sharding_placement, decomp_table=None):
+    def __init__(self, module, sharding_placement):
         super().__init__(module, garbage_collect_values=True, graph=None)
         self.sharding_placement = sharding_placement
-        if decomp_table is None:
-            decomp_table = {}
-        self.decomp_table = decomp_table
         param_placement_order = {}
         if _ENABLE_ORDERED_SHARDING_OPTIMIZATION:
             param_placement_order = compute_optimal_placement_order_for_parameters(
@@ -170,36 +167,11 @@ def call_function(self, target, args, kwargs):
         # TODO: see if we can remove this contiguous properly
         new_args[0] = new_args[0].contiguous()
 
-        if target in self.decomp_table:
-            new_target = self.decomp_table[target]
-            out = super().call_function(new_target, tuple(new_args), kwargs)
-            # NOTE: is there a canonical way of handling this?
-            if out is not NotImplemented:
-                out = tree_map_only(DTensor, lambda x: x.to_local(), out)
-                return out
         out = super().call_function(target, tuple(new_args), kwargs)
         out = tree_map_only(DTensor, lambda x: x.to_local(), out)
         return out
 
 
-class ApplyDecompInterpreter(torch.fx.Interpreter):
-    def __init__(self, module, decomp_table=None):
-        super().__init__(module, garbage_collect_values=True, graph=None)
-        if decomp_table is None:
-            decomp_table = {}
-        self.decomp_table = decomp_table
-
-    def call_function(self, target, args, kwargs):
-        if target in self.decomp_table:
-            new_target = self.decomp_table[target]
-            out = super().call_function(new_target, args, kwargs)
-            # NOTE: is there a canonical way of handling this?
-            if out is not NotImplemented:
-                return out
-        out = super().call_function(target, args, kwargs)
-        return out
-
-
 def shard_node_given_placements(node, sharding_placement, *, meta: bool):
     # TODO: not sure if we actually guarantee sharding_placement has ever
     # input node lol
@@ -264,16 +236,18 @@ def apply_sharding_to_model(gm, sharding_placement, params_spec, buffers_spec):
 
     decomp_table = _get_inductor_decomp_table()
     # run with DTensor to apply the collectives given the graph
-    interp = ApplyShardingInterpreter(gm, sharding_placement, decomp_table)
+    interp = ApplyShardingInterpreter(gm, sharding_placement)
 
     # TODO: make_fx here is suspicious in case of dynamic shapes
     with fx_traceback.preserve_node_meta():
         parallel_gm0 = make_fx(interp.run)(*local_args)
 
     cleanup_graph(parallel_gm0)
-    interp2 = ApplyDecompInterpreter(parallel_gm0, decomp_table)
+    interp2 = torch.fx.Interpreter(parallel_gm0)
     with fx_traceback.preserve_node_meta():
-        parallel_gm = make_fx(interp2.run)(*local_args)
+        parallel_gm = make_fx(interp2.run, decomposition_table=decomp_table)(
+            *local_args
+        )
     cleanup_graph(parallel_gm)
 
     # Copy descriptors over to new graph
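The net effect of the diff above is that decompositions are no longer applied op-by-op inside custom interpreters; the sharded graph is re-executed through a plain `torch.fx.Interpreter`, and `make_fx` applies the decomposition table while tracing that execution. A minimal sketch of that pattern follows; it is illustrative only: `retrace_with_decomps` is a made-up helper name, and `core_aten_decompositions()` stands in for the repo-internal `_get_inductor_decomp_table()`.

```python
# Illustrative sketch, not code from this PR: retrace a GraphModule through a
# plain torch.fx.Interpreter and let make_fx apply a decomposition table,
# instead of decomposing ops manually inside a custom Interpreter subclass.
import torch
from torch._decomp import core_aten_decompositions  # stand-in decomp table
from torch.fx.experimental.proxy_tensor import make_fx


def retrace_with_decomps(gm: torch.fx.GraphModule, *example_args):
    # Interpreter.run re-executes the traced graph node by node; make_fx
    # traces that execution and substitutes decompositions from the table.
    interp = torch.fx.Interpreter(gm)
    return make_fx(interp.run, decomposition_table=core_aten_decompositions())(
        *example_args
    )


if __name__ == "__main__":
    gm = make_fx(lambda x: torch.nn.functional.layer_norm(x, [8]))(torch.randn(2, 8))
    retrace_with_decomps(gm, torch.randn(2, 8)).print_readable()
```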