Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
1f8c628
init commit kendall spearman ordinal cats
pandeconscious Oct 23, 2025
906f1e4
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Oct 27, 2025
497dc7e
series test update and fixes
pandeconscious Oct 27, 2025
583aca6
cat desc longer in tests
pandeconscious Oct 27, 2025
e069810
testing frame corr
pandeconscious Oct 27, 2025
b90726f
pre commit fixes v2
pandeconscious Oct 27, 2025
65a506c
cleanup
pandeconscious Oct 27, 2025
ab3b8b9
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 4, 2025
e93ed83
test import scipy fix
pandeconscious Nov 4, 2025
ec4d97e
rst sorting autofix
pandeconscious Nov 4, 2025
ebfc3b0
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 4, 2025
8cfacef
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 5, 2025
7ef7fb2
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 12, 2025
588808a
refactor
pandeconscious Nov 12, 2025
c484552
fix dtype for duplicates
pandeconscious Nov 12, 2025
216475c
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 16, 2025
e997747
clean up
pandeconscious Nov 16, 2025
4184167
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 16, 2025
8bcd3dc
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 18, 2025
2673281
clean up
pandeconscious Nov 18, 2025
ff48847
import fix
pandeconscious Nov 18, 2025
1c69e29
test tranform ordered cat func
pandeconscious Nov 18, 2025
8b26a7d
tests and mypy fixes
pandeconscious Nov 18, 2025
a625520
type check fix
pandeconscious Nov 18, 2025
259424e
addressing review comments
pandeconscious Nov 18, 2025
f141e6a
Merge branch 'main' into ordered_cat_corr
pandeconscious Nov 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ Other enhancements
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes preserve ``NA`` values and return ``UInt64`` dtype where appropriate instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`)
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@
treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
Expand Down Expand Up @@ -11679,6 +11680,10 @@ def corr(
data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()

if method in ("spearman", "kendall"):
data = transform_ord_cat_cols_to_coded_cols(data)

mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
Expand Down Expand Up @@ -11968,6 +11973,8 @@ def corrwith(
correl = num / dom

elif method in ["kendall", "spearman"] or callable(method):
left = transform_ord_cat_cols_to_coded_cols(left)
right = transform_ord_cat_cols_to_coded_cols(right)

def c(x):
return nanops.nancorr(x[0], x[1], method=method)
Expand Down
32 changes: 32 additions & 0 deletions pandas/core/methods/corr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Module for correlation related implementation
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas.core.dtypes.dtypes import CategoricalDtype

if TYPE_CHECKING:
from pandas import DataFrame


def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame:
    """
    Swap every ordered categorical column for its integer category codes.

    Parameters
    ----------
    df : DataFrame
        Frame to convert. Columns are addressed by position, so frames with
        duplicate column labels are handled as well.

    Returns
    -------
    DataFrame
        ``df`` itself when it holds no ordered categorical column; otherwise a
        shallow copy in which each ordered categorical column is replaced by
        its codes, with the code ``-1`` turned into ``NaN``.
    """
    ordered_positions = [
        pos
        for pos, col_dtype in enumerate(df.dtypes)
        if isinstance(col_dtype, CategoricalDtype) and col_dtype.ordered
    ]
    if not ordered_positions:
        # Nothing to convert: hand the input back without copying.
        return df

    # Shallow copy so the replacement columns are set on a new frame
    # instead of on the caller's one.
    coded = df.copy(deep=False)
    for pos in ordered_positions:
        codes = coded._ixs(pos, axis=1).cat.codes
        coded._iset_item(pos, codes.replace(-1, np.nan))
    return coded
6 changes: 6 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2684,6 +2684,12 @@ def corr(
if len(this) == 0:
return np.nan

if method in ("spearman", "kendall"):
if this.dtype == "category" and this.cat.ordered:
this = this.cat.codes.replace(-1, np.nan)
if other.dtype == "category" and other.cat.ordered:
other = other.cat.codes.replace(-1, np.nan)

this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

Expand Down
104 changes: 104 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from itertools import combinations

import numpy as np
import pytest

Expand Down Expand Up @@ -252,6 +254,61 @@ def test_corr_numeric_only(self, meth, numeric_only):
with pytest.raises(ValueError, match="could not convert string to float"):
df.corr(meth, numeric_only=numeric_only)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@td.skip_if_no("scipy")
def test_corr_rank_ordered_categorical(self, method):
    # Each off-diagonal entry of DataFrame.corr on ordered categoricals
    # must match Series.corr computed on the same pair of columns.
    four_levels = ["low", "m", "h", "vh"]
    frame = DataFrame(
        {
            "ord_cat": pd.Categorical(
                ["low", "m", "h", "vh"],
                categories=four_levels,
                ordered=True,
            ),
            # Column with a missing value and a shorter category list.
            "ord_cat_none": pd.Categorical(
                ["low", "m", "h", None],
                categories=["low", "m", "h"],
                ordered=True,
            ),
            "ord_cat_shuff": pd.Categorical(
                ["m", "h", "vh", "low"],
                categories=four_levels,
                ordered=True,
            ),
        }
    )

    matrix = frame.corr(method=method)
    for left, right in combinations(frame.columns, r=2):
        expected = frame[left].corr(frame[right], method=method)
        tm.assert_almost_equal(matrix[left][right], expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@td.skip_if_no("scipy")
def test_corr_rank_ordered_categorical_duplicate_columns(self, method):
    # With repeated column labels, DataFrame.corr must still agree with
    # Series.corr for every pair of column positions.
    reversed_dtype = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
    frame = DataFrame(
        {
            "a": pd.array([1, 2, 3, 4], dtype=reversed_dtype),
            "b": pd.array([4, 3, 2, 1], dtype=reversed_dtype),
            "c": [4, 3, 2, 1],
            "d": [10, 20, 30, 40],
            "e": [100, 200, 300, 400],
        }
    )
    # Relabel so that positions 0/1 and 2/3 share a name.
    frame.columns = ["a", "a", "c", "c", "e"]

    matrix = frame.corr(method=method)
    n_cols = len(frame.columns)
    for i, j in combinations(range(n_cols), r=2):
        expected = frame.iloc[:, i].corr(frame.iloc[:, j], method=method)
        tm.assert_almost_equal(matrix.iloc[i, j], expected)


class TestDataFrameCorrWith:
@pytest.mark.parametrize(
Expand Down Expand Up @@ -493,3 +550,50 @@ def test_cov_with_missing_values(self):
result2 = df.dropna().cov()
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(self, method):
    # DataFrame.corrwith on frames holding ordered categoricals must give,
    # per column, the same value as Series.corr on the matching columns.
    pytest.importorskip("scipy")

    def ordered(values, categories):
        # Build an ordered categorical Series from values and categories.
        return Series(pd.Categorical(values, categories=categories, ordered=True))

    four_levels = ["low", "m", "h", "vh"]
    left = DataFrame(
        {
            "a": ordered(["low", "m", "h", "vh"], four_levels),
            "b": ordered(["low", "m", "h", None], ["low", "m", "h"]),
            "c": Series([0, 1, 2, 3]),
            "d": Series([2.0, 3.0, 4.5, 6.5]),
        }
    )
    right = DataFrame(
        {
            "a": Series([2.0, 3.0, 4.5, np.nan]),
            "b": ordered(["m", "h", "vh", "low"], four_levels),
            "c": Series([2, 3, 0, 1]),
            "d": Series([2.0, 3.0, 4.5, 6.5]),
        }
    )

    result = left.corrwith(right, method=method)
    for label in left.columns:
        expected = left[label].corr(right[label], method=method)
        tm.assert_almost_equal(result.get(label), expected)
147 changes: 147 additions & 0 deletions pandas/tests/methods/corr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
Tests for core/methods/corr.py
"""

import numpy as np
import pytest

from pandas import (
Categorical,
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols


@pytest.mark.parametrize(
("input_df", "expected_df"),
[
pytest.param(
# 1) Simple: two ordered categorical columns (with and without None)
DataFrame(
{
"ord_cat": Series(
Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"ord_cat_none": Series(
Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
)
),
}
),
DataFrame(
{
# codes: low=0, m=1, h=2, vh=3
"ord_cat": Series([0, 1, 2, 3], dtype="int8"),
# codes: low=0, m=1, h=2, None -> NaN
"ord_cat_none": Series([0, 1.0, 2.0, np.nan]),
}
),
id="ordered-categoricals-basic",
),
pytest.param(
# 2) Mixed dtypes: only the ordered categorical should change
DataFrame(
{
"ordered": Series(
Categorical(
["a", "c", "b"],
categories=["a", "b", "c"],
ordered=True,
)
),
"unordered": Series(Categorical(["x", "y", "x"], ordered=False)),
"num": Series([10, 20, 30]),
"text": Series(["u", "v", "w"]),
}
),
DataFrame(
{
# codes: a=0, c=2, b=1
"ordered": Series([0, 2, 1], dtype="int8"),
# unordered categorical should be untouched (still categorical)
"unordered": Series(Categorical(["x", "y", "x"], ordered=False)),
"num": Series([10, 20, 30]),
"text": Series(["u", "v", "w"]),
}
),
id="mixed-types-only-ordered-changes",
),
pytest.param(
# 3 Duplicate column names: first 'dup' is ordered categorical,
# second 'dup' is non-categorical
DataFrame(
{
"dup_1": Series(
Categorical(
["low", "m", "h"],
categories=["low", "m", "h"],
ordered=True,
)
),
"dup_2": Series([5, 6, 7]), # duplicate name, later column
}
),
DataFrame(
{
# After transform: position 0 (ordered cat) becomes codes [0,1,2],
# position 1 remains untouched numbers [5,6,7].
"dup_1": Series([0, 1, 2], dtype="int8"),
"dup_2": Series([5, 6, 7]),
}
),
id="duplicate-names-ordered-first",
),
pytest.param(
# 4 Duplicate column names: first 'dup' is non-categorical,
# second 'dup' is ordered categorical, third 'dup' is ordered categorical
DataFrame(
{
"dup_1": Series(["a", "b", "c"]), # non-categorical (object)
"dup_2": Series(
Categorical(
["p", "q", None],
categories=["p", "q"],
ordered=True,
)
),
"dup_3": Series(
Categorical(
["low", "m", "h"],
categories=["low", "m", "h"],
ordered=True,
)
),
}
),
DataFrame(
{
# First stays object; second turns into codes [0, 1, NaN]
# and third changes into codes [0, 1, 2]
"dup_1": Series(["a", "b", "c"]),
"dup_2": Series([0.0, 1.0, np.nan]),
"dup_3": Series([0, 1, 2], dtype="int8"),
}
),
id="duplicate-names-ordered-and-non-categorical-and-none",
),
],
)
def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this test is necessary; your other tests are sufficient.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this function could also be useful for things other than correlation, since it is a specific type of transformation. Correlation is just one use case for converting to these codes, so it seems to me that the function should be tested for what it is supposed to do regardless of its use in correlation. Please let me know what you think.

# duplicate columns creation for dup columns
if "dup_1" in input_df.columns:
input_df.columns = ["dup" for _ in range(len(input_df.columns))]
expected_df.columns = ["dup" for _ in range(len(expected_df.columns))]

out_df = transform_ord_cat_cols_to_coded_cols(input_df)
assert list(out_df.columns) == list(expected_df.columns)
for i, col in enumerate(out_df.columns):
tm.assert_series_equal(out_df.iloc[:, i], expected_df.iloc[:, i])
44 changes: 44 additions & 0 deletions pandas/tests/series/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,47 @@ def test_corr_callable_method(self, datetime_series):
df = pd.DataFrame([s1, s2])
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@pytest.mark.parametrize(
"cat_series",
[
Series(
pd.Categorical( # ordered cat series
["low", "medium", "high"],
categories=["low", "medium", "high"],
ordered=True,
)
),
Series(
pd.Categorical( # ordered cat series with NA
["low", "medium", "high", None],
categories=["low", "medium", "high"],
ordered=True,
)
),
],
)
@pytest.mark.parametrize(
"other_series",
[
Series( # other cat ordered series
pd.Categorical(
["m", "l", "h"],
categories=["l", "m", "h"],
ordered=True,
)
),
# other non cat series
Series([2, 1, 3]),
],
)
def test_corr_rank_ordered_categorical(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is pretty long, to the point where it's unclear what its intent is. Maybe it's worth breaking up into a few tests? Or adding parameterization?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

self,
method,
cat_series,
other_series,
):
expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5}
corr_calc = cat_series.corr(other_series, method=method)
tm.assert_almost_equal(corr_calc, expected_corr[method])
Loading