
Commit 59c4d17

Hack around missing dtypes in compute estimation and handle grouped_mm cases with invalid strides
The grouped_mm case should be handled in the sharding propagation, and those cases should just be removed, I think.
1 parent 9f9f00a commit 59c4d17

File tree

1 file changed (+16, -9 lines)


autoparallel/compute_estimation.py

Lines changed: 16 additions & 9 deletions
@@ -147,12 +147,13 @@ def _get_device_tflops(dtype):
             f"Unsupported device: {device_name}. Supported devices: {[limit.name for limit in DEVICE_LIMITS]}"
         )
 
-    if dtype not in device_limit.gemm_tflops:
-        raise ValueError(
-            f"Dtype {dtype} not supported on {device_limit.name}. Supported dtypes: {list(device_limit.gemm_tflops.keys())}"
-        )
+    # TODO: add proper support for int64 etc
+    # if dtype not in device_limit.gemm_tflops:
+    #     raise ValueError(
+    #         f"Dtype {dtype} not supported on {device_limit.name}. Supported dtypes: {list(device_limit.gemm_tflops.keys())}"
+    #     )
 
-    return device_limit.gemm_tflops[dtype]
+    return device_limit.gemm_tflops.get(dtype, 1)
 
 
 def _get_sharded_shape(spec):
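
For illustration only (not part of the commit), a minimal standalone sketch of the dtype fallback above, assuming a made-up gemm_tflops table; toy_gemm_tflops and get_tflops are hypothetical names and the TFLOPS values are placeholders:

import torch

# Illustrative stand-in for device_limit.gemm_tflops; the numbers are placeholders.
toy_gemm_tflops = {torch.float32: 20.0, torch.bfloat16: 300.0}

def get_tflops(dtype):
    # Mirrors gemm_tflops.get(dtype, 1): unknown dtypes (e.g. int64) fall back to a
    # nominal 1 TFLOPS instead of raising ValueError.
    return toy_gemm_tflops.get(dtype, 1)

print(get_tflops(torch.bfloat16))  # 300.0 -> value from the table
print(get_tflops(torch.int64))     # 1     -> nominal fallback for a missing dtype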
@@ -205,10 +206,16 @@ def estimate_strategy_runtime_cost(node, strategy):
 
     # TODO: maybe cache the flop_counter to avoid recreating it
     # all the time
-    with FlopCounterMode(display=False) as flop_counter:
-        node.target(*args, **kwargs)
-
-    flops = flop_counter.get_total_flops()
+    try:
+        with FlopCounterMode(display=False) as flop_counter:
+            node.target(*args, **kwargs)
+
+        flops = flop_counter.get_total_flops()
+    except RuntimeError as exc:
+        if node.target == torch.ops.aten._grouped_mm.default:
+            flops = float("inf")
+        else:
+            raise exc
 
     # TODO: fix this
     dtype = strategy.input_specs[0].tensor_meta.dtype
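
Similarly, a hedged sketch of the grouped_mm workaround above; estimate_flops is a hypothetical helper rather than the function in this file, and the final call uses torch.mm as a stand-in for a well-behaved op:

import torch
from torch.utils.flop_counter import FlopCounterMode

def estimate_flops(target, *args, **kwargs):
    # Hypothetical helper mirroring the try/except added to estimate_strategy_runtime_cost.
    try:
        with FlopCounterMode(display=False) as flop_counter:
            target(*args, **kwargs)
        return flop_counter.get_total_flops()
    except RuntimeError:
        # _grouped_mm can fail for some sharded inputs (invalid strides); treat that
        # strategy as infinitely expensive so it is never selected, instead of
        # aborting the whole estimation.
        if target == torch.ops.aten._grouped_mm.default:
            return float("inf")
        raise

# A well-formed op is counted normally: 2 * 64 * 32 * 16 FLOPs for this matmul.
print(estimate_flops(torch.mm, torch.randn(64, 32), torch.randn(32, 16)))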
