Commit 1cfd3b5

Introduce SINQ calibration-free quantization algorithm (#3156)
* feat: SINQ quantization algorithm
* update sinq algorithm
* update sinq ops and add unit test
* update device to cpu in SINQ test
* fix scale dtype, device
* update device to direct override
* add qmin, qmax args similar to HQQ
1 parent: 4903c55 · commit: 1cfd3b5

File tree

2 files changed: +116 additions, −0 deletions

test/quantization/test_quant_primitives.py

Lines changed: 40 additions & 0 deletions
@@ -15,6 +15,7 @@
     MappingType,
     ZeroPointDomain,
     _choose_qparams_affine_tinygemm,
+    _choose_qparams_and_quantize_scale_only_sinq,
     _choose_scale_float8,
     _fake_quantize_affine,
     _fake_quantize_affine_cachemask,
@@ -823,6 +824,45 @@ def test_maybe_expand_scale_to_tensor_shape(self):
         self.assertEqual(new_scale5.shape, torch.Size([3, 2, 8]))
         self.assertEqual(new_scale5.unique(dim=-1).shape, torch.Size([3, 2, 2]))

+    def test_choose_qparams_and_quantize_scale_only_sinq(self):
+        """Test that SINQ quantization produces valid outputs and a finite reconstruction."""
+        torch.manual_seed(self.SEED)
+        input = torch.randn(128, 256, dtype=torch.float32)
+        group_size = 64
+        qmin = -(2 ** (4 - 1))
+        qmax = 2 ** (4 - 1) - 1
+
+        # Run SINQ
+        qdata, scale_row, scale_col = _choose_qparams_and_quantize_scale_only_sinq(
+            input,
+            group_size=group_size,
+            qmin=qmin,
+            qmax=qmax,
+            niter=20,
+        )
+
+        # Check the quantized weights are valid
+        self.assertEqual(qdata.dtype, torch.int8)
+        self.assertEqual(qdata.shape, input.shape)
+        self.assertTrue((qdata >= qmin).all() and (qdata <= qmax).all())
+
+        # Check the scale factors are valid
+        num_groups = input.shape[1] // group_size
+        self.assertEqual(scale_row.shape, (input.shape[0], num_groups))
+        self.assertEqual(scale_col.shape, (input.shape[1],))
+        self.assertTrue((scale_row > 0).all() and (scale_col > 0).all())
+
+        # Check that dequantization with the two scale factors produces finite values
+        qdata_fp32 = qdata.to(torch.float32)
+        qdata_reshaped = qdata_fp32.reshape(-1, group_size)
+        scale_row_expanded = scale_row.reshape(-1, 1)
+        scale_col_reshaped = scale_col.reshape(num_groups, group_size)
+        scale_col_expanded = scale_col_reshaped.repeat(input.shape[0], 1)
+        reconstructed = (
+            qdata_reshaped * scale_row_expanded * scale_col_expanded
+        ).reshape(input.shape)
+        self.assertFalse(torch.isnan(reconstructed).any())
+
     def test_float8_blockwise_scaling(self):
         M, K = 512, 1024
         hp_tensor = torch.randn(M, K, dtype=torch.float)
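For reference, a minimal standalone usage sketch of the new primitive (illustrative only, not part of this commit). It assumes the function is imported from torchao.quantization.quant_primitives, where the diff below adds it to __all__; the 4-bit qmin/qmax values match the function defaults, and the dequantization step mirrors the reconstruction in the test above.

import torch

from torchao.quantization.quant_primitives import (
    _choose_qparams_and_quantize_scale_only_sinq,
)

# Quantize a weight matrix to signed 4-bit values with SINQ's dual scales.
weight = torch.randn(128, 256, dtype=torch.float32)
group_size = 64
qdata, scale_row, scale_col = _choose_qparams_and_quantize_scale_only_sinq(
    weight,
    qmin=-8,  # signed 4-bit range, same as the function defaults
    qmax=7,
    group_size=group_size,
    niter=20,
)

# Dequantize: qdata is tiled into [rows * num_groups, group_size] blocks; each
# block is scaled by its per-tile row factor and the per-column factors,
# exactly as the test above reconstructs it.
num_groups = weight.shape[1] // group_size
w_hat = (
    qdata.to(torch.float32).reshape(-1, group_size)
    * scale_row.reshape(-1, 1)
    * scale_col.reshape(num_groups, group_size).repeat(weight.shape[0], 1)
).reshape(weight.shape)
print((w_hat - weight).abs().mean())  # mean absolute reconstruction error

Because the returned row scale already folds the per-group RTN scale together with the Sinkhorn row factor, dequantization needs only the two returned scale tensors.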

torchao/quantization/quant_primitives.py

Lines changed: 76 additions & 0 deletions
@@ -33,6 +33,7 @@
     "_choose_qparams_affine_floatx",
     "_choose_qparams_and_quantize_affine_hqq",
     "_choose_qparams_and_quantize_scale_only_hqq",
+    "_choose_qparams_and_quantize_scale_only_sinq",
     "_choose_qparams_and_quantize_affine_qqq",
     "_choose_scale_float8",
     "_choose_qparams_gguf",
@@ -2219,6 +2220,81 @@ def round_stoch(x: torch.Tensor) -> torch.Tensor:
     return qdata, scale


+def _choose_qparams_and_quantize_scale_only_sinq(
+    tensor: torch.Tensor,
+    qmin: int = -(2 ** (4 - 1)),
+    qmax: int = 2 ** (4 - 1) - 1,
+    group_size: int = 64,
+    niter: int = 20,
+    compute_dtype: torch.dtype = torch.float16,
+) -> tuple:
+    """
+    SINQ: Sinkhorn-Normalized Quantization (https://www.arxiv.org/abs/2509.22944)
+
+    Iteratively normalizes row and column standard deviations to minimize
+    matrix imbalance before quantization with dual scales.
+
+    Args:
+        tensor: Input weight tensor
+        qmin: Minimum quantized value (default: -8, i.e. signed 4-bit)
+        qmax: Maximum quantized value (default: 7, i.e. signed 4-bit)
+        group_size: Quantization group size (default: 64)
+        niter: Number of Sinkhorn iterations (default: 20)
+        compute_dtype: Target compute dtype (default: torch.float16)
+
+    Returns:
+        Tuple of (qdata, scale_row, scale_col)
+    """
+    if group_size is not None:
+        assert _is_divisible(tensor.numel(), group_size), (
+            f"group_size must divide tensor elements. shape: {tensor.shape}, group_size: {group_size}"
+        )
+
+    W = tensor.to(dtype=compute_dtype)
+    shape = W.shape
+
+    # Reshape for 1D tiling
+    W = W.reshape(-1, group_size)  # [N*num_groups, group_size]
+
+    # Algorithm 1: Sinkhorn normalization
+    q_min = min(W.std(dim=0).min().item(), W.std(dim=1).min().item())
+    q_min = max(q_min, 1e-8)
+
+    W_hat = W.clone()
+    scale_col_sinkhorn = torch.ones(W.shape[1], device=W.device, dtype=compute_dtype)
+    scale_row_sinkhorn = torch.ones(W.shape[0], device=W.device, dtype=compute_dtype)
+
+    for _ in range(niter):
+        # Normalize columns (dim=0)
+        q_col = W_hat.std(dim=0) / q_min
+        q_col = torch.clamp(q_col, min=1e-8)
+        W_hat = W_hat / q_col.unsqueeze(0)
+        scale_col_sinkhorn = scale_col_sinkhorn * q_col
+
+        # Normalize rows (dim=1)
+        q_row = W_hat.std(dim=1) / q_min
+        q_row = torch.clamp(q_row, min=1e-8)
+        W_hat = W_hat / q_row.unsqueeze(1)
+        scale_row_sinkhorn = scale_row_sinkhorn * q_row
+
+    # INT8 symmetric quantization
+    # TODO: Consider a custom bitwidth for SIMD acceleration (e.g. vadd4)
+    scale_s = (W_hat.abs().amax(dim=1, keepdim=True) / float(qmax)).clamp_min(1e-8)
+    # TODO: Find a better rounding strategy, e.g. stochastic rounding
+    Q = _Round.apply(W_hat / scale_s).clamp(qmin, qmax)
+    # TODO: PERF-test the scale factor dtype (FP16 vs. INT8).
+    # Although FP16 has higher accuracy, FP16×INT8 cannot be computed
+    # on Tensor Cores directly and requires INT8-to-FP16 conversion ops.
+    qdata = Q.view(shape).contiguous().to(torch.int8)
+
+    # Combine the RTN scale with the row Sinkhorn factor
+    scale_row = (
+        (scale_s.view(-1) * scale_row_sinkhorn).view(shape[0], -1).to(compute_dtype)
+    )
+    num_groups = shape[1] // group_size
+    scale_col = scale_col_sinkhorn.repeat(num_groups)[: shape[1]].to(compute_dtype)
+
+    return qdata, scale_row, scale_col
+
+
 def _choose_qparams_affine_floatx(
     tensor: torch.Tensor, ebits: int, mbits: int
 ) -> torch.Tensor:
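As a side note on the Sinkhorn loop above, the following standalone sketch (illustrative only, not part of the commit) shows its effect in isolation. The imbalance metric here is an ad-hoc proxy, defined as the ratio between the largest and smallest per-row/per-column standard deviation; the alternating normalization drives it toward 1, which is what conditions the subsequent per-group round-to-nearest step.

import torch

def std_imbalance(m: torch.Tensor) -> float:
    # Ad-hoc imbalance proxy: spread between the largest and smallest
    # per-column / per-row standard deviation.
    stds = torch.cat([m.std(dim=0), m.std(dim=1)])
    return (stds.max() / stds.min()).item()

torch.manual_seed(0)
# Deliberately give the columns very different scales.
W = torch.randn(64, 64) * torch.logspace(-1, 1, steps=64)

# Same normalization target and update rule as in the function above.
target = max(min(W.std(dim=0).min().item(), W.std(dim=1).min().item()), 1e-8)
W_hat = W.clone()
print(f"imbalance before: {std_imbalance(W_hat):8.2f}")
for _ in range(20):
    W_hat = W_hat / (W_hat.std(dim=0) / target).clamp(min=1e-8).unsqueeze(0)
    W_hat = W_hat / (W_hat.std(dim=1) / target).clamp(min=1e-8).unsqueeze(1)
print(f"imbalance after:  {std_imbalance(W_hat):8.2f}")

After the loop, every row and column of W_hat has roughly the same standard deviation, so a single per-group scale_s captures the remaining dynamic range and the accumulated row/column factors are returned as the two dequantization scales.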
