Weighted quantile; nan-policy; everything mostly works

cakedev0 · cakedev0 · commit fa789fc2d199 · 2025-10-23T07:37:19.000+02:00
diff --git a/src/array_api_extra/_delegation.py b/src/array_api_extra/_delegation.py
@@ -904,6 +904,7 @@ def quantile(
     axis: int | None = None,
     method: str = "linear",
     keepdims: bool = False,
+    nan_policy: str = "propagate",
     *,
     weights: Array | None = None,
     xp: ModuleType | None = None,
@@ -1051,16 +1052,22 @@ def quantile(
        "Sample quantiles in statistical packages,"
        The American Statistician, 50(4), pp. 361-365, 1996
     """
-    methods = {"linear", "inverted_cdf", "averaged_inverted_cdf"}
+    if xp is None:
+        xp = array_namespace(a)
+    if is_pydata_sparse_namespace(xp):
+        raise ValueError('no supported')
 
+    methods = {"linear", "inverted_cdf", "averaged_inverted_cdf"}
     if method not in methods:
         msg = f"`method` must be one of {methods}"
         raise ValueError(msg)
+    nan_policies = {"propagate", "omit"}
+    if nan_policy not in nan_policies:
+        msg = f"`nan_policy` must be one of {nan_policies}"
+        raise ValueError(msg)
     if keepdims not in {True, False}:
         msg = "If specified, `keepdims` must be True or False."
         raise ValueError(msg)
-    if xp is None:
-        xp = array_namespace(a)
 
     a = xp.asarray(a)
     if not xp.isdtype(a.dtype, ("integral", "real floating")):
@@ -1071,15 +1078,31 @@ def quantile(
         raise ValueError(msg)
     ndim = a.ndim
     if ndim < 1:
-        msg = "`a` must be at least 1-dimensional"
+        msg = "`a` must be at least 1-dimensional."
         raise TypeError(msg)
     if axis is not None and ((axis >= ndim) or (axis < -ndim)):
         msg = "`axis` is not compatible with the dimension of `a`."
         raise ValueError(msg)
-
-    # Array API states: Mixed integer and floating-point type promotion rules
-    # are not specified because behavior varies between implementations.
-    # We chose to align with numpy (see docstring):
+    if weights is None:
+        if nan_policy != "propagate":
+            msg = ""
+            raise ValueError(msg)
+    else:
+        if ndim > 2:
+            msg = "When weights are provided, dimension of `a` must be 1 or 2."
+            raise ValueError(msg)
+        if a.shape != weights.shape:
+            if axis is None:
+                msg = "Axis must be specified when shapes of `a` and ̀ weights` differ."
+                raise TypeError(msg)
+            if weights.shape != eager_shape(a, axis):
+                msg = "Shape of weights must be consistent with shape of a along specified axis."
+                raise ValueError(msg)
+        if axis is None and ndim == 2:
+            msg = "When weights are provided, axis must be specified when `a` is 2d"
+            raise ValueError(msg)
+
+    # Align result dtype with what numpy does:
     dtype = xp.result_type(
         xp.float64 if xp.isdtype(a.dtype, "integral") else a,
         xp.asarray(q),
@@ -1088,20 +1111,25 @@ def quantile(
     device = get_device(a)
     a = xp.asarray(a, dtype=dtype, device=device)
     q = xp.asarray(q, dtype=dtype, device=device)
+    # TODO: cast weights here? Assert weights are on the same device as `a`?
 
     if xp.any((q > 1) | (q < 0) | xp.isnan(q)):
         msg = "`q` values must be in the range [0, 1]"
         raise ValueError(msg)
 
     # Delegate where possible.
-    if is_numpy_namespace(xp):
+    if is_numpy_namespace(xp) and nan_policy == "propagate":
         return xp.quantile(a, q, axis=axis, method=method, keepdims=keepdims, weights=weights)
     # No delegation for dask: I couldn't make it work
-    basic_case = method == "linear" and weights is None
+    basic_case = method == "linear" and weights is None and nan_policy == "propagate"
     if (basic_case and is_jax_namespace(xp)) or is_cupy_namespace(xp):
         return xp.quantile(a, q, axis=axis, method=method, keepdims=keepdims)
     if basic_case and is_torch_namespace(xp):
         return xp.quantile(a, q, dim=axis, interpolation=method, keepdim=keepdims)
 
+    # XXX: I'm not sure we want to support dask, it seems uterly slow...
     # Otherwise call our implementation (will sort data)
-    return _quantile.quantile(a, q, axis=axis, method=method, keepdims=keepdims, xp=xp)
+    return _quantile.quantile(
+        a, q, axis=axis, method=method, keepdims=keepdims,
+        nan_policy=nan_policy, weights=weights, xp=xp
+    )
diff --git a/src/array_api_extra/_lib/_quantile.py b/src/array_api_extra/_lib/_quantile.py
@@ -4,7 +4,7 @@
 
 from ._utils._compat import device as get_device
 from ._utils._helpers import eager_shape
-from ._utils._typing import Array
+from ._utils._typing import Array, Device
 
 
 def quantile(  # numpydoc ignore=PR01,RT01
@@ -14,6 +14,7 @@ def quantile(  # numpydoc ignore=PR01,RT01
     method: str = "linear",
     axis: int | None = None,
     keepdims: bool = False,
+    nan_policy: str = "propagate",
     *,
     weights: Array | None = None,
     xp: ModuleType,
@@ -43,43 +44,49 @@ def quantile(  # numpydoc ignore=PR01,RT01
         a = xp.full(tuple(a_shape), xp.nan, dtype=a.dtype, device=device)
 
     if weights is None:
-        res = _quantile(a, q, float(n), axis, method, xp)
+        res = _quantile(a, q, n, axis, method, xp)
+        if not axis_none:
+            res = xp.moveaxis(res, axis, 0)
     else:
+        weights = xp.asarray(weights, dtype=xp.float64, device=device)
         average = method == 'averaged_inverted_cdf'
-        res = _weighted_quantile(a, q, weights, n, axis, average, xp)
-    # to support weights, the main thing would be to
-    # argsort a, and then use it to sort a and w.
-    # The hard part will be dealing with 0-weights and NaNs
-    # But maybe a proper use of searchsorted + left/right side will work?
+        res = _weighted_quantile(
+            a, q, weights, n, axis, average,
+            nan_policy=nan_policy, xp=xp, device=device
+        )
 
     # reshaping to conform to doc/other libs' behavior
     if axis_none:
         if keepdims:
             res = xp.reshape(res, q.shape + (1,) * a_ndim)
-    else:
-        res = xp.moveaxis(res, axis, 0)
-        if keepdims:
-            a_shape[axis] = 1
-            res = xp.reshape(res, q.shape + tuple(a_shape))
+    elif keepdims:
+        a_shape[axis] = 1
+        res = xp.reshape(res, q.shape + tuple(a_shape))
 
     return res[0, ...] if q_scalar else res
 
 
 def _quantile(  # numpydoc ignore=GL08
-    a: Array, q: Array, n: float, axis: int, method: str, xp: ModuleType
+    a: Array, q: Array, n: int, axis: int, method: str, xp: ModuleType
 ) -> Array:
     a = xp.sort(a, axis=axis, stable=False)
+    mask_nan = xp.any(xp.isnan(a), axis=axis, keepdims=True)
+    if xp.any(mask_nan):
+        # propogate NaNs:
+        mask = xp.repeat(mask_nan, n, axis=axis)
+        a = xp.where(mask, xp.nan, a)
+        del mask
 
     if method == "linear":
-        m = 1 - q       
+        m = 1 - q
     else: # method is "inverted_cdf" or "averaged_inverted_cdf"
         m = 0
 
-    jg = q * n + m - 1
+    jg = q * float(n) + m - 1
 
     j = jg // 1
-    j = xp.clip(j, 0.0, n - 1)
-    jp1 = xp.clip(j + 1, 0.0, n - 1)
+    j = xp.clip(j, 0.0, float(n - 1))
+    jp1 = xp.clip(j + 1, 0.0, float(n - 1))
     # `̀j` and `jp1` are 1d arrays
 
     g = jg % 1
@@ -88,7 +95,7 @@ def _quantile(  # numpydoc ignore=GL08
     elif method == 'averaged_inverted_cdf':
         g = (1 + xp.astype((g > 0), jg.dtype)) / 2
 
-    g = xp.where(j < 0, 0, g)  # equivalent to g[j < 0] = 0, but works with strictest
+    g = xp.where(j < 0, 0, g)  # equivalent to g[j < 0] = 0, but works with readonly
     new_g_shape = [1] * a.ndim
     new_g_shape[axis] = g.shape[0]
     g = xp.reshape(g, tuple(new_g_shape))
@@ -98,37 +105,55 @@ def _quantile(  # numpydoc ignore=GL08
     )
 
 
-def _weighted_quantile(a: Array, q: Array, weights: Array, n: int, axis, average: bool, xp: ModuleType):
+def _weighted_quantile(
+    a: Array, q: Array, weights: Array, n: int, axis: int, average: bool, nan_policy: str,
+    xp: ModuleType, device: Device
+) -> Array:
+    """
+    a is expected to be 1d or 2d.
+    """
+    kwargs = dict(n=n, average=average, nan_policy=nan_policy, xp=xp, device=device)
     a = xp.moveaxis(a, axis, -1)
+    if weights.ndim > 1:
+        weights = xp.moveaxis(weights, axis, -1)
     sorter = xp.argsort(a, axis=-1, stable=False)
-    a = xp.take_along_axis(a, sorter, axis=-1)
 
     if a.ndim == 1:
-        return _weighted_quantile_sorted_1d(a, q, weights, n, )
+        x = xp.take(a, sorter)
+        w = xp.take(weights, sorter)
+        return _weighted_quantile_sorted_1d(x, q, w, **kwargs)
 
     d, = eager_shape(a, axis=0)
-    res = xp.empty((q.shape[0], d))
+    res = []
     for idx in range(d):
         w = weights if weights.ndim == 1 else weights[idx, ...]
         w = xp.take(w, sorter[idx, ...])
-        res[..., idx] = _weighted_quantile_sorted_1d(a[idx, ...], q, w, n, average)
+        x = xp.take(a[idx, ...], sorter[idx, ...])
+        res.append(_weighted_quantile_sorted_1d(x, q, w, **kwargs))
+    res = xp.stack(res, axis=1)
     return res
 
 
-def _weighted_quantile_sorted_1d(a, q, w, n, average: bool, xp: ModuleType):
-    cw = xp.cumsum(w)
+def _weighted_quantile_sorted_1d(
+    x: Array, q: Array, w: Array, n: int, average: bool, nan_policy: str,
+    xp: ModuleType, device: Device
+) -> Array:
+    if nan_policy == "omit":
+        w = xp.where(xp.isnan(x), 0., w)
+    elif xp.any(xp.isnan(x)):
+        return xp.full(q.shape, xp.nan, dtype=x.dtype, device=device)
+    cw = xp.cumulative_sum(w)
     t = cw[-1] * q
-    i = xp.searchsorted(cw, t)
+    i = xp.searchsorted(cw, t, side='left')
     j = xp.searchsorted(cw, t, side='right')
-    i = xp.minimum(i, float(n - 1))
-    j = xp.minimum(j, float(n - 1))
+    i = xp.clip(i, 0, n - 1)
+    j = xp.clip(j, 0, n - 1)
 
     # Ignore leading `weights=0` observations when `q=0`
     # see https://github.com/scikit-learn/scikit-learn/pull/20528
-    i = xp.where(q == 0., j, i)   
+    i = xp.where(q == 0., j, i)
     if average:
         # Ignore trailing `weights=0` observations when `q=1`
         j = xp.where(q == 1., i, j)
-        return (xp.take(a, i) + xp.take(a, j)) / 2
-    else:
-        return xp.take(a, i)
+        return (xp.take(x, i) + xp.take(x, j)) / 2
+    return xp.take(x, i)
diff --git a/tests/test_funcs.py b/tests/test_funcs.py
@@ -1558,18 +1558,70 @@ def test_shape(self, xp: ModuleType):
         assert quantile(a, q, axis=1, keepdims=True).shape == (2, 3, 1, 5)
         assert quantile(a, q, axis=2, keepdims=True).shape == (2, 3, 4, 1)
 
-    def test_against_numpy(self, xp: ModuleType):
+    @pytest.mark.parametrize("keepdims", [True, False])
+    def test_against_numpy(self, xp: ModuleType, keepdims: bool):
         rng = np.random.default_rng()
         a_np = rng.random((3, 4, 5))
         q_np = rng.random(2)
         a = xp.asarray(a_np)
         q = xp.asarray(q_np)
-        for keepdims in [False, True]:
-            for axis in [None, *range(a.ndim)]:
-                actual = quantile(a, q, axis=axis, keepdims=keepdims)
-                expected = np.quantile(a_np, q_np, axis=axis, keepdims=keepdims)
-                expected = xp.asarray(expected, dtype=xp.float64)
-                xp_assert_close(actual, expected, atol=1e-12)
+        for axis in [None, *range(a.ndim)]:
+            actual = quantile(a, q, axis=axis, keepdims=keepdims)
+            expected = np.quantile(a_np, q_np, axis=axis, keepdims=keepdims)
+            expected = xp.asarray(expected)
+            xp_assert_close(actual, expected, atol=1e-12)
+
+    @pytest.mark.parametrize("keepdims", [True, False])
+    @pytest.mark.parametrize("nan_policy", ["omit", "no_nans", "propagate"])#, #["omit"])#["no_nans", "propagate"])
+    @pytest.mark.parametrize("q_np", [0.5, 0., 1., np.linspace(0, 1, num=11)])
+    def test_weighted_against_numpy(self, xp: ModuleType, keepdims: bool, q_np: Array | float, nan_policy: str):
+        rng = np.random.default_rng()
+        n, d = 10, 20
+        a_np = rng.random((n, d))
+        kwargs = dict(keepdims=keepdims)
+        mask_nan = np.zeros((n, d), dtype=bool)
+        if nan_policy != "no_nans":
+            # from 0% to 100% of NaNs:
+            mask_nan = rng.random((n, d)) < rng.random((n, 1))
+            # don't put nans in the first row:
+            mask_nan[:] = False
+            a_np[mask_nan] = np.nan
+            kwargs['nan_policy'] = nan_policy
+
+        a = xp.asarray(a_np)
+        q = xp.asarray(np.copy(q_np))
+        m = 'inverted_cdf'
+
+        np_quantile = np.quantile
+        if nan_policy == "omit":
+            np_quantile = np.nanquantile
+
+        for w_np, axis in [
+            (rng.random(n), 0),
+            (rng.random(d), 1),
+            (rng.integers(0, 2, n), 0),
+            (rng.integers(0, 2, d), 1),
+            (rng.integers(0, 2, (n, d)), 0),
+            (rng.integers(0, 2, (n, d)), 1),
+        ]:
+            print(w_np)
+            with warnings.catch_warnings(record=True) as warning:
+                warnings.filterwarnings("always", "invalid value encountered in divide", RuntimeWarning)
+                warnings.filterwarnings("ignore", "All-NaN slice encountered", RuntimeWarning)
+                try:
+                    expected = np_quantile(a_np, q_np, axis=axis, method=m, weights=w_np, keepdims=keepdims)
+                except IndexError:
+                    print('index error')
+                    continue
+                if warning:  # this means some weights sum was 0, in this case we skip calling xpx.quantile
+                    print('warning')
+                    continue
+            expected = xp.asarray(expected)
+            print("not skiped")
+
+            w = xp.asarray(w_np)
+            actual = quantile(a, q, axis=axis, method=m, weights=w, **kwargs)
+            xp_assert_close(actual, expected, atol=1e-12)
 
     def test_2d_axis(self, xp: ModuleType):
         x = xp.asarray([[1, 2, 3], [4, 5, 6]])
@@ -1605,8 +1657,6 @@ def test_edge_cases(self, xp: ModuleType):
 
     def test_invalid_q(self, xp: ModuleType):
         x = xp.asarray([1, 2, 3, 4, 5])
-        _ = quantile(x, 1.0)
-        # ^ FIXME: here just to make this test fail for sparse backend
         # q > 1 should raise
         with pytest.raises(
             ValueError, match=r"`q` values must be in the range \[0, 1\]"