Commit 0c2bb76

update block_size args to granularity

1 parent b516304 commit 0c2bb76

5 files changed: +62 −52 lines changed

test/quantization/quantize_/workflows/int8/test_int8_tensor.py

Lines changed: 22 additions & 9 deletions
@@ -17,6 +17,7 @@
     Int8WeightOnlyConfig,
     quantize_,
 )
+from torchao.quantization.granularity import PerRow, PerTensor
 from torchao.quantization.utils import compute_error
 from torchao.testing.utils import TorchAOIntegrationTestCase
 
@@ -160,24 +161,35 @@ def test_slice(self, config, device, dtype):
     @common_utils.parametrize(
         "config",
         [
-            Int8DynamicActivationInt8WeightConfig(version=2),
-            Int8WeightOnlyConfig(version=2),
+            Int8DynamicActivationInt8WeightConfig,
+            Int8WeightOnlyConfig,
         ],
     )
-    def test_index_select(self, config):
+    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    def test_index_select(self, config, granularity):
        """test that `x_0 = x[0]` works when `x` is a 2D quantized tensor."""
         N, K = 256, 512
         x = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
         linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device="cuda")
         linear.weight.data = x
+
+        config = config(version=2, granularity=granularity)
         quantize_(linear, config)
 
         x_int8 = linear.weight
         x_int8_0 = x_int8[0]
+
+        # Test dequantization consistency
         torch.testing.assert_close(
             x_int8.dequantize()[0], x_int8_0.dequantize(), atol=0, rtol=0
         )
 
+        # Test block_size granularity
+        if isinstance(granularity, PerRow):
+            self.assertEqual(x_int8.block_size, [1, K])
+        elif isinstance(granularity, PerTensor):
+            self.assertEqual(x_int8.block_size, [N, K])
+
     @common_utils.parametrize(
         "config",
         [
@@ -187,16 +199,17 @@ def test_index_select(self, config):
     )
     def test_dequantization_accuracy(self, config):
         """Test dequantization accuracy separately"""
-        test_data = torch.tensor([[1.0, -1.0]], dtype=torch.bfloat16, device="cuda")
-        linear = torch.nn.Linear(2, 1, bias=False, dtype=torch.bfloat16, device="cuda")
-        linear.weight.data = test_data
+        linear = torch.nn.Linear(
+            256, 512, bias=False, dtype=torch.bfloat16, device="cuda"
+        )
+        weight_fp = copy.deepcopy(linear.weight)
         quantize_(linear, config)
 
         tensor = linear.weight
         dequantized = tensor.dequantize()
-        self.assertEqual(dequantized.shape, test_data.shape)
-        assert compute_error(dequantized, test_data) > 20, (
-            f"Dequantization error is too high to get a SQNR of {compute_error(dequantized, test_data)}"
+        self.assertEqual(dequantized.shape, weight_fp.shape)
+        assert compute_error(dequantized, weight_fp) > 20, (
+            f"Dequantization error is too high to get a SQNR of {compute_error(dequantized, weight_fp)}"
         )
 
     @common_utils.parametrize(
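For context, a minimal sketch of the workflow this test now exercises (assuming a CUDA device and the version-2 config path): the caller picks a granularity and the quantized tensor derives its own block_size from it.

import torch
from torchao.quantization import Int8WeightOnlyConfig, quantize_
from torchao.quantization.granularity import PerRow

N, K = 256, 512
linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device="cuda")

# PerRow() yields one scale per output channel, i.e. block_size [1, K];
# PerTensor() would instead yield a single scale, i.e. block_size [N, K].
quantize_(linear, Int8WeightOnlyConfig(version=2, granularity=PerRow()))
assert linear.weight.block_size == [1, K]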

torchao/float8/inference.py

Lines changed: 7 additions & 18 deletions
@@ -140,18 +140,7 @@ def _slice_scale_for_dimension(
     """
     aten = torch.ops.aten
 
-    # Per-tensor quantization (scalar scale)
-    if scale.numel() == 1:
-        return scale
-
-    # Per-row quantization (1D scale)
-    if scale.ndim == 1:
-        if dim == 0:
-            return aten.slice.Tensor(scale, 0, start, end, step)
-        else:
-            return scale
-
-    # Block-wise quantization (2D scale)
+    # Unsupported case for now, this would be 1 scale per data element
     if scale.shape == data_shape:
         return aten.slice.Tensor(scale, dim, start, end, step)
 
@@ -169,12 +158,6 @@ def _slice_scale_for_dimension(
         # Slice away as normal
         return aten.slice.Tensor(scale, dim, start, end, step)
     else:
-        # Error on Step > 1
-        if step > 1:
-            raise NotImplementedError(
-                "Slicing with step > 1 is not implemented for scale tensors."
-            )
-
         # There is blocking in this dimension
         # Calculate which scale elements correspond to the sliced data
         scale_start = start // block_size_for_dim if start is not None else None
@@ -184,6 +167,12 @@ def _slice_scale_for_dimension(
             else None
         )
 
+        # Error on Step > 1
+        if step > 1:
+            raise NotImplementedError(
+                "Slicing with step > 1 is not implemented for scale tensors."
+            )
+
         return aten.slice.Tensor(scale, dim, scale_start, scale_end, 1)
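The reordering matters because `scale_start`/`scale_end` are only meaningful in the blocked branch, so the `step > 1` guard now sits next to the slice it protects. A toy illustration of the index arithmetic (hypothetical values; the ceil-division formula for `scale_end` is assumed from the elided lines):

block = 4                                # block_size_for_dim
start, end = 2, 6                        # data slice [2:6] over a dim of length 8
scale_start = start // block             # 0
scale_end = (end + block - 1) // block   # 2, i.e. ceil(6 / 4)
# data[2:6] touches both blocks, so the sliced tensor keeps scale[0:2]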
torchao/quantization/quant_api.py

Lines changed: 18 additions & 16 deletions
@@ -1346,13 +1346,16 @@ class Int8WeightOnlyConfig(AOBaseConfig):
     Configuration for applying int8 weight-only symmetric per-channel quantization to linear layers.
 
     Args:
-        group_size: Optional[int] = None - Controls the granularity of quantization. If None, applies per-channel quantization.
-            Otherwise, applies per-group quantization with the specified group size.
+        group_size (version 1) - Controls the granularity of quantization.
+            If None, applies per-channel quantization. Otherwise, applies per-group quantization with the specified group size.
+        granularity (version 2) - Quantization granularity.
+            PerRow() for per-channel quantization, PerTensor() for per-tensor quantization.
         set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
             for better performance with this quantization scheme.
     """
 
     group_size: Optional[int] = None
+    granularity: Optional[Union[PerRow, PerTensor]] = PerRow()
     set_inductor_config: bool = True
     version: int = 1
 
@@ -1387,11 +1390,7 @@ def _int8_weight_only_quantize_tensor(weight, config):
         )
     else:
         assert config.version == 2, f"Unexpected version: {config.version}"
-        group_size = config.group_size
-        if group_size is None:
-            group_size = weight.shape[-1]
-        block_size = tuple([1 for x in range(weight.dim() - 1)] + [group_size])
-        new_weight = Int8Tensor.from_hp(weight, block_size=block_size)
+        new_weight = Int8Tensor.from_hp(weight, granularity=config.granularity)
         return new_weight
 
 
@@ -1572,17 +1571,17 @@ def _int8_dynamic_activation_int8_weight_quantize_tensor(weight, config):
     else:
         input_quant_func = _int8_asymm_per_token_quant
 
-    if isinstance(config.granularity, PerTensor):
-        # Tensor granularity
-        block_size = weight.shape
-    else:
-        # Per row granularity
-        block_size = tuple([1 for _ in range(weight.dim() - 1)] + [weight.shape[-1]])
-
     if config.version == 1:
         warnings.warn(
             "Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details"
         )
+        if isinstance(config.granularity, PerTensor):
+            block_size = weight.shape
+        else:
+            block_size = tuple(
+                [1 for _ in range(weight.dim() - 1)] + [weight.shape[-1]]
+            )
+
         quantized_weight = to_affine_quantized_intx(
             weight,
             mapping_type,
@@ -1602,10 +1601,13 @@ def _int8_dynamic_activation_int8_weight_quantize_tensor(weight, config):
         )
 
     assert config.version == 2, f"Unexpected version: {config.version}"
+    # Compute block_size from granularity for activation quantization kwargs
+    block_size = get_block_size(weight.shape, config.granularity)
+
     quantized_weight = Int8Tensor.from_hp(
         weight,
-        block_size,
-        act_quant_kwargs=QuantizeTensorToInt8Kwargs(block_size=block_size),
+        granularity=config.granularity,
+        act_quant_kwargs=QuantizeTensorToInt8Kwargs(block_size=list(block_size)),
     )
 
     return quantized_weight
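The helper doing the work here is `get_block_size`, which turns a granularity into a block shape. A quick sketch of the mapping (expected values taken from the test assertions above):

from torchao.quantization.granularity import PerRow, PerTensor
from torchao.quantization.utils import get_block_size

shape = (256, 512)                         # (N, K) weight
print(get_block_size(shape, PerRow()))     # (1, 512): one scale per row
print(get_block_size(shape, PerTensor()))  # (256, 512): one scale overall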

torchao/quantization/quantize_/common/quantize_tensor_kwargs.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def _choose_quant_func_and_quantize_tensor(
     elif isinstance(quant_kwargs, QuantizeTensorToInt8Kwargs):
         return Int8Tensor.from_hp(
             tensor,
-            quant_kwargs.block_size,
+            granularity=quant_kwargs.granularity,
             act_quant_kwargs=quant_kwargs,
         )
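With `granularity` carried on the kwargs, dynamically quantized activations route through the same `from_hp` path as weights. Roughly (a sketch, assuming the module path from this commit and the `PerRow()` default shown in the next file):

import torch
from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
    Int8Tensor,
    QuantizeTensorToInt8Kwargs,
)

act = torch.randn(16, 512, dtype=torch.bfloat16)
kwargs = QuantizeTensorToInt8Kwargs(block_size=[1, 512])  # granularity defaults to PerRow()
act_int8 = Int8Tensor.from_hp(act, granularity=kwargs.granularity)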

torchao/quantization/quantize_/workflows/int8/int8_tensor.py

Lines changed: 14 additions & 8 deletions
@@ -14,6 +14,7 @@
     _slice_scale_for_dimension,
 )
 from torchao.kernel import int_scaled_matmul
+from torchao.quantization.granularity import PerRow
 from torchao.quantization.quant_primitives import (
     MappingType,
     _maybe_expand_scale_to_tensor_shape,
@@ -24,6 +25,7 @@
     QuantizeTensorKwargs,
     _choose_quant_func_and_quantize_tensor,
 )
+from torchao.quantization.utils import get_block_size
 from torchao.utils import TorchAOBaseTensor, fill_defaults
 
 __all__ = ["Int8Tensor", "QuantizeTensorToInt8Kwargs"]
@@ -37,10 +39,12 @@ class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
 
     Args:
         block_size (list[int]): block size for quantization granularity
+        granularity: the granularity for the Tensor, currently either PerRow() or PerTensor()
         # TODO: Static quantization support using `static_scale`, `static_zero_point`
     """
 
     block_size: list[int]
+    granularity = PerRow()
 
 
 class Int8Tensor(TorchAOBaseTensor):
@@ -101,26 +105,28 @@ def __repr__(self):
     @classmethod
     def from_hp(
         cls,
-        w: torch.Tensor,
-        block_size: list[int],
+        w_hp: torch.Tensor,
+        granularity=PerRow(),
         act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
     ):
-        if w.dim() not in [2, 3] or len(block_size) != w.dim():
+        block_size = list(get_block_size(w_hp.shape, granularity))
+
+        if w_hp.dim() not in [2, 3] or len(block_size) != w_hp.dim():
             raise ValueError("Expected 2D or 3D tensor with same block_size length")
 
         scale, zero_point = choose_qparams_affine(
-            input=w,
+            input=w_hp,
             mapping_type=MappingType.SYMMETRIC,
             block_size=block_size,
             target_dtype=torch.int8,
             quant_min=-128,
             quant_max=127,
-            scale_dtype=w.dtype,
+            scale_dtype=w_hp.dtype,
             zero_point_dtype=torch.int8,
         )
 
         int_data = quantize_affine(
-            w,
+            w_hp,
             block_size=block_size,
             scale=scale,
             zero_point=zero_point,
@@ -132,7 +138,7 @@ def from_hp(
             scale,
             block_size,
             act_quant_kwargs=act_quant_kwargs,
-            dtype=w.dtype,
+            dtype=w_hp.dtype,
         )
 
     def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
@@ -290,7 +296,7 @@ def _(func, types, args, kwargs):
         Int8Tensor(
             selected_qdata,
             selected_scale,
-            [selected_qdata.shape[-1]],
+            self.block_size[1:],
             self.act_quant_kwargs,
             self.dtype,
         ),
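The final hunk changes what `x[i]` reports as its block_size: instead of hardcoding the row length, the selected tensor inherits the parent's block_size minus the indexed leading dimension, which stays correct if blocking along the last dimension is ever finer than a full row. A sketch (hypothetical shapes):

# 2D parent quantized PerRow over a [256, 512] weight: block_size [1, 512]
parent_block_size = [1, 512]
print(parent_block_size[1:])   # [512], same as the old [shape[-1]] here

# hypothetical grouped case with 32-wide blocks: block_size [1, 32]
grouped_block_size = [1, 32]
print(grouped_block_size[1:])  # [32]; the old hardcoded [shape[-1]] would say [512]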
