From 1f8c6280cb9203eb28b0378b76569ddf591afe44 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Thu, 23 Oct 2025 10:46:48 +0000 Subject: [PATCH 01/21] init commit kendall spearman ordinal cats --- pandas/core/frame.py | 27 +++++++++++++++++- pandas/core/series.py | 6 ++++ pandas/tests/series/methods/test_cov_corr.py | 29 +++++++++++++++++++- test_corr.py | 23 ++++++++++++++++ 4 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 test_corr.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d4ff1bc4f35ac..44d13c6e81641 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11633,6 +11633,11 @@ def corr( data = self._get_numeric_data() if numeric_only else self cols = data.columns idx = cols.copy() + + if method in ("spearman", "kendall"): + data = data._transform_ord_cat_cols_to_coded_cols() + + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) if method == "pearson": @@ -11926,7 +11931,8 @@ def corrwith( correl = num / dom elif method in ["kendall", "spearman"] or callable(method): - + left = left._convert_ordered_cat_to_code() + right = right._convert_ordered_cat_to_code() def c(x): return nanops.nancorr(x[0], x[1], method=method) @@ -11957,6 +11963,25 @@ def c(x): return correl + def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame: + """ + any ordered categorical columns are transformed to the respectice caregorical codes + other columns remain untouched + """ + categ = self.select_dtypes("category") + if len(categ.columns) == 0: + return self + + cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns + + if len(cols_convert) > 0: + data = self.copy(deep=False) + data[cols_convert] = data[cols_convert].transform( + lambda x: x.cat.codes.replace(-1, np.nan) + ) + return data + return self + # ---------------------------------------------------------------------- # ndarray-like stats methods diff --git a/pandas/core/series.py b/pandas/core/series.py index f3aaee26fe470..dd7d6a513ecf2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2687,6 +2687,12 @@ def corr( this, other = self.align(other, join="inner") if len(this) == 0: return np.nan + + if method in ("spearman", "kendall"): + if this.dtype == "category" and this.cat.ordered: + this = this.cat.codes.replace(-1, np.nan) + if other.dtype == "category" and other.cat.ordered: + other = other.cat.codes.replace(-1, np.nan) this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False) other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 7a4d48fb76940..322bbfdd884c7 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -11,7 +11,6 @@ ) import pandas._testing as tm - class TestSeriesCov: def test_cov(self, datetime_series): # full overlap @@ -184,3 +183,31 @@ def test_corr_callable_method(self, datetime_series): df = pd.DataFrame([s1, s2]) expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) + + @pytest.mark.parametrize("method", ["kendall", "spearman"]) + def test_corr_rank_ordered_categorical(self, method,): + stats = pytest.importorskip("scipy.stats") + method_scipy_func = { + "kendall": stats.kendalltau, + "spearman": stats.spearmanr + } + ser_ord_cat = pd.Series( pd.Categorical( + ["low", "m", "h", "vh"], + categories=["low", "m", "h", "vh"], ordered=True + )) + ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan) + ser_ord_int = pd.Series([0, 1, 2, 3]) + ser_ord_float = pd.Series([2.0, 3.0, 4.5, 6.5]) + + corr_calc = ser_ord_cat.corr(ser_ord_int, method=method) + corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_int)[0] + tm.assert_almost_equal(corr_calc, corr_expected) + + corr_calc = ser_ord_cat.corr(ser_ord_float, method=method) + corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_float)[0] + tm.assert_almost_equal(corr_calc, corr_expected) + + corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method) + corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_cat_codes)[0] + tm.assert_almost_equal(corr_calc, corr_expected) + diff --git a/test_corr.py b/test_corr.py new file mode 100644 index 0000000000000..e52674647ef1b --- /dev/null +++ b/test_corr.py @@ -0,0 +1,23 @@ +import pandas as pd +df = pd.DataFrame({'a' : [1, 2, 3, 4], 'b' : [4, 3, 2, 1]}) +df['b'] = df['b'].astype('category').cat.set_categories([4, 3, 2, 1], ordered=True) +#import pdb; pdb.set_trace() +crr = df.corr(method='spearman') +print(crr) + + +df = pd.DataFrame({'a' : [1, 2, 3, 4], 'b' : ["vh", "h", "m", "l"]}) +df['b'] = df['b'].astype('category').cat.set_categories(["vh", "h", "m", "l"], ordered=True) +#import pdb; pdb.set_trace() +print(df) +print(df.dtypes) +crr = df.corr(method='spearman') +print(crr) + +ser_ord_cat = pd.Series( pd.Categorical( + ["vh", "h", "m", "low"], + categories=["vh", "h", "m", "low"], ordered=True + )) +print(ser_ord_cat) +crr = ser_ord_cat.corr(ser_ord_cat, method='spearman') +print(crr) \ No newline at end of file From 497dc7e88a418b99a07a02b841a63941d986cc1a Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Mon, 27 Oct 2025 11:31:35 +0000 Subject: [PATCH 02/21] series test update and fixes --- pandas/core/frame.py | 4 +-- pandas/tests/series/methods/test_cov_corr.py | 29 ++++++++++++++++++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0a9ae652bca71..d202d4db0e9c5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11964,8 +11964,8 @@ def corrwith( correl = num / dom elif method in ["kendall", "spearman"] or callable(method): - left = left._convert_ordered_cat_to_code() - right = right._convert_ordered_cat_to_code() + left = left._transform_ord_cat_cols_to_coded_cols() + right = right._transform_ord_cat_cols_to_coded_cols() def c(x): return nanops.nancorr(x[0], x[1], method=method) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 322bbfdd884c7..c4b16224cc19c 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -200,14 +200,37 @@ def test_corr_rank_ordered_categorical(self, method,): ser_ord_float = pd.Series([2.0, 3.0, 4.5, 6.5]) corr_calc = ser_ord_cat.corr(ser_ord_int, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_int)[0] + corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_int, nan_policy="omit")[0] tm.assert_almost_equal(corr_calc, corr_expected) corr_calc = ser_ord_cat.corr(ser_ord_float, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_float)[0] + corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_float, nan_policy="omit")[0] tm.assert_almost_equal(corr_calc, corr_expected) corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_cat_codes)[0] + corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit")[0] tm.assert_almost_equal(corr_calc, corr_expected) + + ser_ord_cat_shuff = pd.Series( pd.Categorical( + ["h", "low", "vh", "m"], + categories=["low", "m", "h", "vh"], ordered=True + )) + ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan) + corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method) + corr_expected = method_scipy_func[method](ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit")[0] + tm.assert_almost_equal(corr_calc, corr_expected) + + corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method) + corr_expected = method_scipy_func[method](ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit")[0] + tm.assert_almost_equal(corr_calc, corr_expected) + + ser_ord_cat_with_nan = pd.Series( pd.Categorical( + ["h", "low", "vh", None, "m"], + categories=["low", "m", "h", "vh"], ordered=True + )) + ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace(-1, np.nan) + ser_ord_int = pd.Series([2, 0, 1, 3, None]) + corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method) + corr_expected = method_scipy_func[method](ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit")[0] + tm.assert_almost_equal(corr_calc, corr_expected) \ No newline at end of file From 583aca6d3a3decbb6054b6343f2ce83d83f879c1 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Mon, 27 Oct 2025 11:46:42 +0000 Subject: [PATCH 03/21] cat desc longer in tests --- pandas/tests/series/methods/test_cov_corr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index c4b16224cc19c..aa706367d38f5 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -192,8 +192,8 @@ def test_corr_rank_ordered_categorical(self, method,): "spearman": stats.spearmanr } ser_ord_cat = pd.Series( pd.Categorical( - ["low", "m", "h", "vh"], - categories=["low", "m", "h", "vh"], ordered=True + ["low", "med", "high", "very_high"], + categories=["low", "med", "high", "very_high"], ordered=True )) ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan) ser_ord_int = pd.Series([0, 1, 2, 3]) @@ -212,8 +212,8 @@ def test_corr_rank_ordered_categorical(self, method,): tm.assert_almost_equal(corr_calc, corr_expected) ser_ord_cat_shuff = pd.Series( pd.Categorical( - ["h", "low", "vh", "m"], - categories=["low", "m", "h", "vh"], ordered=True + ["high", "low", "very_high", "med"], + categories=["low", "med", "high", "very_high"], ordered=True )) ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan) From e06981087966df29ec0ace23aac68c5b08ed35a5 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Mon, 27 Oct 2025 12:28:27 +0000 Subject: [PATCH 04/21] testing frame corr --- pandas/tests/frame/methods/test_cov_corr.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index a5ed2e86283e9..23e8c50489480 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,3 +1,4 @@ +from itertools import combinations import numpy as np import pytest @@ -251,6 +252,24 @@ def test_corr_numeric_only(self, meth, numeric_only): else: with pytest.raises(ValueError, match="could not convert string to float"): df.corr(meth, numeric_only=numeric_only) + + @pytest.mark.parametrize("method", ["kendall", "spearman"]) + def test_corr_rank_ordered_categorical(self, method,): + df = DataFrame( + { + "ord_cat": pd.Series(pd.Categorical(["low", "m", "h", "vh"], categories=["low", "m", "h", "vh"], ordered=True)), + "ord_cat_none": pd.Series(pd.Categorical(["low", "m", "h", None], categories=["low", "m", "h"], ordered=True)), + "ord_int": pd.Series([0, 1, 2, 3]), + "ord_float": pd.Series([2.0, 3.0, 4.5, 6.5]), + "ord_float_nan": pd.Series([2.0, 3.0, 4.5, np.nan]), + "ord_cat_shuff": pd.Series(pd.Categorical(["m", "h", "vh", "low"], categories=["low", "m", "h", "vh"], ordered=True)), + } + ) + corr_calc = df.corr(method=method) + for col1, col2 in combinations(["ord_cat", "ord_int", "ord_float"], r=2): + expected = df[col1].corr(df[col2], method=method) + tm.assert_almost_equal(corr_calc[col1][col2], expected) + class TestDataFrameCorrWith: From b90726fd73f5ce19b6f4d0ce9be5f0ebc5fe1cfc Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Mon, 27 Oct 2025 13:40:09 +0000 Subject: [PATCH 05/21] pre commit fixes v2 --- pandas/core/frame.py | 6 +- pandas/core/series.py | 2 +- pandas/tests/frame/methods/test_cov_corr.py | 92 +++++++++++++++++--- pandas/tests/series/methods/test_cov_corr.py | 88 ++++++++++++------- test_corr.py | 32 ++++--- 5 files changed, 158 insertions(+), 62 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d202d4db0e9c5..9b4e15ac7aac5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11670,7 +11670,6 @@ def corr( if method in ("spearman", "kendall"): data = data._transform_ord_cat_cols_to_coded_cols() - mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) if method == "pearson": @@ -11966,6 +11965,7 @@ def corrwith( elif method in ["kendall", "spearman"] or callable(method): left = left._transform_ord_cat_cols_to_coded_cols() right = right._transform_ord_cat_cols_to_coded_cols() + def c(x): return nanops.nancorr(x[0], x[1], method=method) @@ -11998,8 +11998,8 @@ def c(x): def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame: """ - any ordered categorical columns are transformed to the respectice caregorical codes - other columns remain untouched + any ordered categorical columns are transformed to the respective + categorical codes while other columns remain untouched """ categ = self.select_dtypes("category") if len(categ.columns) == 0: diff --git a/pandas/core/series.py b/pandas/core/series.py index 65c40244e3954..5a9c262fe699e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2686,7 +2686,7 @@ def corr( this, other = self.align(other, join="inner") if len(this) == 0: return np.nan - + if method in ("spearman", "kendall"): if this.dtype == "category" and this.cat.ordered: this = this.cat.codes.replace(-1, np.nan) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 23e8c50489480..e56308f48e3c2 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,4 +1,5 @@ from itertools import combinations + import numpy as np import pytest @@ -252,24 +253,45 @@ def test_corr_numeric_only(self, meth, numeric_only): else: with pytest.raises(ValueError, match="could not convert string to float"): df.corr(meth, numeric_only=numeric_only) - + @pytest.mark.parametrize("method", ["kendall", "spearman"]) - def test_corr_rank_ordered_categorical(self, method,): + def test_corr_rank_ordered_categorical( + self, + method, + ): df = DataFrame( { - "ord_cat": pd.Series(pd.Categorical(["low", "m", "h", "vh"], categories=["low", "m", "h", "vh"], ordered=True)), - "ord_cat_none": pd.Series(pd.Categorical(["low", "m", "h", None], categories=["low", "m", "h"], ordered=True)), - "ord_int": pd.Series([0, 1, 2, 3]), - "ord_float": pd.Series([2.0, 3.0, 4.5, 6.5]), - "ord_float_nan": pd.Series([2.0, 3.0, 4.5, np.nan]), - "ord_cat_shuff": pd.Series(pd.Categorical(["m", "h", "vh", "low"], categories=["low", "m", "h", "vh"], ordered=True)), + "ord_cat": Series( + pd.Categorical( + ["low", "m", "h", "vh"], + categories=["low", "m", "h", "vh"], + ordered=True, + ) + ), + "ord_cat_none": Series( + pd.Categorical( + ["low", "m", "h", None], + categories=["low", "m", "h"], + ordered=True, + ) + ), + "ord_int": Series([0, 1, 2, 3]), + "ord_float": Series([2.0, 3.0, 4.5, 6.5]), + "ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]), + "ord_cat_shuff": Series( + pd.Categorical( + ["m", "h", "vh", "low"], + categories=["low", "m", "h", "vh"], + ordered=True, + ) + ), + "ord_int_shuff": Series([2, 3, 0, 1]), } ) corr_calc = df.corr(method=method) - for col1, col2 in combinations(["ord_cat", "ord_int", "ord_float"], r=2): - expected = df[col1].corr(df[col2], method=method) - tm.assert_almost_equal(corr_calc[col1][col2], expected) - + for col1, col2 in combinations(df.columns, r=2): + corr_expected = df[col1].corr(df[col2], method=method) + tm.assert_almost_equal(corr_calc[col1][col2], corr_expected) class TestDataFrameCorrWith: @@ -512,3 +534,49 @@ def test_cov_with_missing_values(self): result2 = df.dropna().cov() tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) + + @pytest.mark.parametrize("method", ["kendall", "spearman"]) + def test_corr_rank_ordered_categorical( + self, + method, + ): + df1 = DataFrame( + { + "a": Series( + pd.Categorical( + ["low", "m", "h", "vh"], + categories=["low", "m", "h", "vh"], + ordered=True, + ) + ), + "b": Series( + pd.Categorical( + ["low", "m", "h", None], + categories=["low", "m", "h"], + ordered=True, + ) + ), + "c": Series([0, 1, 2, 3]), + "d": Series([2.0, 3.0, 4.5, 6.5]), + } + ) + + df2 = DataFrame( + { + "a": Series([2.0, 3.0, 4.5, np.nan]), + "b": Series( + pd.Categorical( + ["m", "h", "vh", "low"], + categories=["low", "m", "h", "vh"], + ordered=True, + ) + ), + "c": Series([2, 3, 0, 1]), + "d": Series([2.0, 3.0, 4.5, 6.5]), + } + ) + + corr_calc = df1.corrwith(df2, method=method) + for col in df1.columns: + corr_expected = df1[col].corr(df2[col], method=method) + tm.assert_almost_equal(corr_calc.get(col), corr_expected) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index aa706367d38f5..6d1f439f6ccd0 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -11,6 +11,7 @@ ) import pandas._testing as tm + class TestSeriesCov: def test_cov(self, datetime_series): # full overlap @@ -183,54 +184,77 @@ def test_corr_callable_method(self, datetime_series): df = pd.DataFrame([s1, s2]) expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) - + @pytest.mark.parametrize("method", ["kendall", "spearman"]) - def test_corr_rank_ordered_categorical(self, method,): + def test_corr_rank_ordered_categorical( + self, + method, + ): stats = pytest.importorskip("scipy.stats") - method_scipy_func = { - "kendall": stats.kendalltau, - "spearman": stats.spearmanr - } - ser_ord_cat = pd.Series( pd.Categorical( - ["low", "med", "high", "very_high"], - categories=["low", "med", "high", "very_high"], ordered=True - )) + method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr} + ser_ord_cat = Series( + pd.Categorical( + ["low", "med", "high", "very_high"], + categories=["low", "med", "high", "very_high"], + ordered=True, + ) + ) ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan) - ser_ord_int = pd.Series([0, 1, 2, 3]) - ser_ord_float = pd.Series([2.0, 3.0, 4.5, 6.5]) - + ser_ord_int = Series([0, 1, 2, 3]) + ser_ord_float = Series([2.0, 3.0, 4.5, 6.5]) + corr_calc = ser_ord_cat.corr(ser_ord_int, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_int, nan_policy="omit")[0] + corr_expected = method_scipy_func[method]( + ser_ord_cat_codes, ser_ord_int, nan_policy="omit" + )[0] tm.assert_almost_equal(corr_calc, corr_expected) corr_calc = ser_ord_cat.corr(ser_ord_float, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_float, nan_policy="omit")[0] + corr_expected = method_scipy_func[method]( + ser_ord_cat_codes, ser_ord_float, nan_policy="omit" + )[0] tm.assert_almost_equal(corr_calc, corr_expected) corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit")[0] + corr_expected = method_scipy_func[method]( + ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit" + )[0] tm.assert_almost_equal(corr_calc, corr_expected) - ser_ord_cat_shuff = pd.Series( pd.Categorical( - ["high", "low", "very_high", "med"], - categories=["low", "med", "high", "very_high"], ordered=True - )) + ser_ord_cat_shuff = Series( + pd.Categorical( + ["high", "low", "very_high", "med"], + categories=["low", "med", "high", "very_high"], + ordered=True, + ) + ) ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan) - + corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit")[0] + corr_expected = method_scipy_func[method]( + ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit" + )[0] tm.assert_almost_equal(corr_calc, corr_expected) corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit")[0] + corr_expected = method_scipy_func[method]( + ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit" + )[0] tm.assert_almost_equal(corr_calc, corr_expected) - - ser_ord_cat_with_nan = pd.Series( pd.Categorical( - ["h", "low", "vh", None, "m"], - categories=["low", "m", "h", "vh"], ordered=True - )) - ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace(-1, np.nan) - ser_ord_int = pd.Series([2, 0, 1, 3, None]) + + ser_ord_cat_with_nan = Series( + pd.Categorical( + ["h", "low", "vh", None, "m"], + categories=["low", "m", "h", "vh"], + ordered=True, + ) + ) + ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace( + -1, np.nan + ) + ser_ord_int = Series([2, 0, 1, 3, None]) corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method) - corr_expected = method_scipy_func[method](ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit")[0] - tm.assert_almost_equal(corr_calc, corr_expected) \ No newline at end of file + corr_expected = method_scipy_func[method]( + ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit" + )[0] + tm.assert_almost_equal(corr_calc, corr_expected) diff --git a/test_corr.py b/test_corr.py index e52674647ef1b..6a06c8821aa05 100644 --- a/test_corr.py +++ b/test_corr.py @@ -1,23 +1,27 @@ import pandas as pd -df = pd.DataFrame({'a' : [1, 2, 3, 4], 'b' : [4, 3, 2, 1]}) -df['b'] = df['b'].astype('category').cat.set_categories([4, 3, 2, 1], ordered=True) -#import pdb; pdb.set_trace() -crr = df.corr(method='spearman') + +df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]}) +df["b"] = df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) +# import pdb; pdb.set_trace() +crr = df.corr(method="spearman") print(crr) -df = pd.DataFrame({'a' : [1, 2, 3, 4], 'b' : ["vh", "h", "m", "l"]}) -df['b'] = df['b'].astype('category').cat.set_categories(["vh", "h", "m", "l"], ordered=True) -#import pdb; pdb.set_trace() +df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["vh", "h", "m", "l"]}) +df["b"] = ( + df["b"].astype("category").cat.set_categories(["vh", "h", "m", "l"], ordered=True) +) +# import pdb; pdb.set_trace() print(df) print(df.dtypes) -crr = df.corr(method='spearman') +crr = df.corr(method="spearman") print(crr) -ser_ord_cat = pd.Series( pd.Categorical( - ["vh", "h", "m", "low"], - categories=["vh", "h", "m", "low"], ordered=True - )) +ser_ord_cat = pd.Series( + pd.Categorical( + ["vh", "h", "m", "low"], categories=["vh", "h", "m", "low"], ordered=True + ) +) print(ser_ord_cat) -crr = ser_ord_cat.corr(ser_ord_cat, method='spearman') -print(crr) \ No newline at end of file +crr = ser_ord_cat.corr(ser_ord_cat, method="spearman") +print(crr) From 65a506cc76c9c5f520356718c2915a76bc0e1bc9 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Mon, 27 Oct 2025 13:42:24 +0000 Subject: [PATCH 06/21] cleanup --- test_corr.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 test_corr.py diff --git a/test_corr.py b/test_corr.py deleted file mode 100644 index 6a06c8821aa05..0000000000000 --- a/test_corr.py +++ /dev/null @@ -1,27 +0,0 @@ -import pandas as pd - -df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]}) -df["b"] = df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) -# import pdb; pdb.set_trace() -crr = df.corr(method="spearman") -print(crr) - - -df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["vh", "h", "m", "l"]}) -df["b"] = ( - df["b"].astype("category").cat.set_categories(["vh", "h", "m", "l"], ordered=True) -) -# import pdb; pdb.set_trace() -print(df) -print(df.dtypes) -crr = df.corr(method="spearman") -print(crr) - -ser_ord_cat = pd.Series( - pd.Categorical( - ["vh", "h", "m", "low"], categories=["vh", "h", "m", "low"], ordered=True - ) -) -print(ser_ord_cat) -crr = ser_ord_cat.corr(ser_ord_cat, method="spearman") -print(crr) From e93ed835606804990a6041f7c27f862d28de8e4c Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 4 Nov 2025 20:20:22 +0000 Subject: [PATCH 07/21] test import scipy fix --- pandas/tests/frame/methods/test_cov_corr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index e56308f48e3c2..41368f8977d4b 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -259,6 +259,7 @@ def test_corr_rank_ordered_categorical( self, method, ): + pytest.importorskip("scipy") df = DataFrame( { "ord_cat": Series( @@ -540,6 +541,7 @@ def test_corr_rank_ordered_categorical( self, method, ): + pytest.importorskip("scipy") df1 = DataFrame( { "a": Series( From ec4d97e299f3678ad7dacc516424625fdbf38cf3 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 4 Nov 2025 21:43:38 +0000 Subject: [PATCH 08/21] rst sorting autofix --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4722048c8c93e..6f25bcb67df0f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -210,6 +210,7 @@ Other enhancements - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) +- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`) - :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes preserve ``NA`` values and return ``UInt64`` dtype where appropriate instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`) From 588808a6960a5cc2620cffc8902bcd47262713f4 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 12 Nov 2025 17:07:19 +0000 Subject: [PATCH 09/21] refactor --- pandas/core/frame.py | 27 ++++++++++++++++----- pandas/tests/frame/methods/test_cov_corr.py | 2 +- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e155c99b072a1..edf07d841f095 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -115,6 +115,7 @@ from pandas.core.dtypes.dtypes import ( ArrowDtype, BaseMaskedDtype, + CategoricalDtype, ExtensionDtype, ) from pandas.core.dtypes.generic import ( @@ -12015,15 +12016,29 @@ def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame: if len(categ.columns) == 0: return self - cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns + cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns.unique() + single_cols = [col for col in cols_convert if isinstance(categ[col], Series)] + duplicated_cols = [ + col for col in cols_convert if isinstance(categ[col], DataFrame) + ] - if len(cols_convert) > 0: - data = self.copy(deep=False) - data[cols_convert] = data[cols_convert].transform( + if not single_cols and not duplicated_cols: + return self + + data = self.copy(deep=False) + if single_cols: + data[single_cols] = data[single_cols].transform( lambda x: x.cat.codes.replace(-1, np.nan) ) - return data - return self + + if duplicated_cols: + data[duplicated_cols] = data[duplicated_cols].apply( + lambda x: x.cat.codes.replace(-1, np.nan) + if isinstance(x, CategoricalDtype) and bool(x.ordered) + else x + ) + + return data # ---------------------------------------------------------------------- # ndarray-like stats methods diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 41368f8977d4b..2554e8c8220d3 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -255,11 +255,11 @@ def test_corr_numeric_only(self, meth, numeric_only): df.corr(meth, numeric_only=numeric_only) @pytest.mark.parametrize("method", ["kendall", "spearman"]) + @td.skip_if_no("scipy") def test_corr_rank_ordered_categorical( self, method, ): - pytest.importorskip("scipy") df = DataFrame( { "ord_cat": Series( From c4845523d3a29969e085210a444b07e4996f294e Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 12 Nov 2025 17:30:40 +0000 Subject: [PATCH 10/21] fix dtype for duplicates --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index edf07d841f095..4acf15aa92744 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12016,25 +12016,25 @@ def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame: if len(categ.columns) == 0: return self + data = self.copy(deep=False) cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns.unique() - single_cols = [col for col in cols_convert if isinstance(categ[col], Series)] + single_cols = [col for col in cols_convert if isinstance(data[col], Series)] duplicated_cols = [ - col for col in cols_convert if isinstance(categ[col], DataFrame) + col for col in cols_convert if isinstance(data[col], DataFrame) ] if not single_cols and not duplicated_cols: return self - data = self.copy(deep=False) if single_cols: - data[single_cols] = data[single_cols].transform( + data[single_cols] = data[single_cols].apply( lambda x: x.cat.codes.replace(-1, np.nan) ) if duplicated_cols: data[duplicated_cols] = data[duplicated_cols].apply( lambda x: x.cat.codes.replace(-1, np.nan) - if isinstance(x, CategoricalDtype) and bool(x.ordered) + if isinstance(x.dtype, CategoricalDtype) and bool(x.dtype.ordered) else x ) From e997747a689268c04176412bde644511e4222061 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Sun, 16 Nov 2025 18:49:22 +0000 Subject: [PATCH 11/21] clean up --- pandas/core/frame.py | 16 +-- pandas/tests/frame/methods/test_cov_corr.py | 33 ++++++ pandas/tests/series/methods/test_cov_corr.py | 105 ++++++++----------- 3 files changed, 86 insertions(+), 68 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f42900a33924a..6bc696b6ba8d7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12017,21 +12017,23 @@ def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame: data = self.copy(deep=False) cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns.unique() - single_cols = [col for col in cols_convert if isinstance(data[col], Series)] - duplicated_cols = [ + ser_generating_cols = [ + col for col in cols_convert if isinstance(data[col], Series) + ] + df_generating_cols = [ col for col in cols_convert if isinstance(data[col], DataFrame) ] - if not single_cols and not duplicated_cols: + if not ser_generating_cols and not df_generating_cols: return self - if single_cols: - data[single_cols] = data[single_cols].apply( + if ser_generating_cols: + data[ser_generating_cols] = data[ser_generating_cols].apply( lambda x: x.cat.codes.replace(-1, np.nan) ) - if duplicated_cols: - data[duplicated_cols] = data[duplicated_cols].apply( + for df_col in df_generating_cols: + data[df_col] = data[df_col].apply( lambda x: x.cat.codes.replace(-1, np.nan) if isinstance(x.dtype, CategoricalDtype) and bool(x.dtype.ordered) else x diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 2554e8c8220d3..aa9ba93e72680 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -294,6 +294,39 @@ def test_corr_rank_ordered_categorical( corr_expected = df[col1].corr(df[col2], method=method) tm.assert_almost_equal(corr_calc[col1][col2], corr_expected) + @pytest.mark.parametrize("method", ["kendall", "spearman"]) + @td.skip_if_no("scipy") + def test_corr_rank_ordered_categorical_duplicate_columns( + self, + method, + ): + df = DataFrame( + { + "a": [1, 2, 3, 4], + "b": [4, 3, 2, 1], + "c": [4, 3, 2, 1], + "d": [10, 20, 30, 40], + "e": [100, 200, 300, 400], + } + ) + df["a"] = ( + df["a"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) + ) + df["b"] = ( + df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) + ) + df["c"] = ( + df["c"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) + ) + df.columns = ["a", "a", "c", "c", "e"] + + corr_calc = df.corr(method=method) + for col1_idx, col2_idx in combinations(range(len(df.columns)), r=2): + corr_expected = df.iloc[:, col1_idx].corr( + df.iloc[:, col2_idx], method=method + ) + tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected) + class TestDataFrameCorrWith: @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 6d1f439f6ccd0..e423a82710ac4 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -186,75 +186,58 @@ def test_corr_callable_method(self, datetime_series): tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) @pytest.mark.parametrize("method", ["kendall", "spearman"]) + @pytest.mark.parametrize( + "ord_cat_series", + [ + Series( # ordered categorical series + pd.Categorical( + ["low", "med", "high", "very_high"], + categories=["low", "med", "high", "very_high"], + ordered=True, + ) + ), + Series( # ordered categorical series with nan and a different ranking + pd.Categorical( + ["h", "low", "vh", None], + categories=["low", "m", "h", "vh"], + ordered=True, + ) + ), + ], + ) + @pytest.mark.parametrize( + "other_series", + [ + Series( # int series against which tord cat series is correlated + [0, 1, 2, 3] + ), + Series( # float series against which ord cat series is correlated + [2.0, 3.0, 4.5, 6.5] + ), + Series( # other ord cat series against which ord cat series is correlated + pd.Categorical( + ["high", "low", "very_high", "med"], + categories=["low", "med", "high", "very_high"], + ordered=True, + ) + ), + ], + ) def test_corr_rank_ordered_categorical( self, method, + ord_cat_series, + other_series, ): stats = pytest.importorskip("scipy.stats") method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr} - ser_ord_cat = Series( - pd.Categorical( - ["low", "med", "high", "very_high"], - categories=["low", "med", "high", "very_high"], - ordered=True, - ) - ) - ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan) - ser_ord_int = Series([0, 1, 2, 3]) - ser_ord_float = Series([2.0, 3.0, 4.5, 6.5]) - - corr_calc = ser_ord_cat.corr(ser_ord_int, method=method) - corr_expected = method_scipy_func[method]( - ser_ord_cat_codes, ser_ord_int, nan_policy="omit" - )[0] - tm.assert_almost_equal(corr_calc, corr_expected) - - corr_calc = ser_ord_cat.corr(ser_ord_float, method=method) - corr_expected = method_scipy_func[method]( - ser_ord_cat_codes, ser_ord_float, nan_policy="omit" - )[0] - tm.assert_almost_equal(corr_calc, corr_expected) - - corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method) - corr_expected = method_scipy_func[method]( - ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit" - )[0] - tm.assert_almost_equal(corr_calc, corr_expected) - - ser_ord_cat_shuff = Series( - pd.Categorical( - ["high", "low", "very_high", "med"], - categories=["low", "med", "high", "very_high"], - ordered=True, - ) - ) - ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan) - - corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method) - corr_expected = method_scipy_func[method]( - ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit" - )[0] - tm.assert_almost_equal(corr_calc, corr_expected) + ord_ser_cat_codes = ord_cat_series.cat.codes.replace(-1, np.nan) - corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method) - corr_expected = method_scipy_func[method]( - ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit" - )[0] - tm.assert_almost_equal(corr_calc, corr_expected) + if other_series.dtype == "category" and other_series.cat.ordered: + other_series = other_series.cat.codes.replace(-1, np.nan) - ser_ord_cat_with_nan = Series( - pd.Categorical( - ["h", "low", "vh", None, "m"], - categories=["low", "m", "h", "vh"], - ordered=True, - ) - ) - ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace( - -1, np.nan - ) - ser_ord_int = Series([2, 0, 1, 3, None]) - corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method) + corr_calc = ord_cat_series.corr(other_series, method=method) corr_expected = method_scipy_func[method]( - ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit" + ord_ser_cat_codes, other_series, nan_policy="omit" )[0] tm.assert_almost_equal(corr_calc, corr_expected) From 2673281042ed2eb7410bcdb77598df7297b4d115 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 18 Nov 2025 15:10:17 +0000 Subject: [PATCH 12/21] clean up --- pandas/core/frame.py | 43 ++++--------------------------------- pandas/core/methods/corr.py | 22 +++++++++++++++++++ 2 files changed, 26 insertions(+), 39 deletions(-) create mode 100644 pandas/core/methods/corr.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6bc696b6ba8d7..518ed01b720a1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -114,7 +114,6 @@ from pandas.core.dtypes.dtypes import ( ArrowDtype, BaseMaskedDtype, - CategoricalDtype, ExtensionDtype, ) from pandas.core.dtypes.generic import ( @@ -185,6 +184,7 @@ treat_as_nested, ) from pandas.core.methods import selectn +from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs @@ -11682,7 +11682,7 @@ def corr( idx = cols.copy() if method in ("spearman", "kendall"): - data = data._transform_ord_cat_cols_to_coded_cols() + data = transform_ord_cat_cols_to_coded_cols(data) mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) @@ -11973,8 +11973,8 @@ def corrwith( correl = num / dom elif method in ["kendall", "spearman"] or callable(method): - left = left._transform_ord_cat_cols_to_coded_cols() - right = right._transform_ord_cat_cols_to_coded_cols() + left = transform_ord_cat_cols_to_coded_cols(left) + right = transform_ord_cat_cols_to_coded_cols(right) def c(x): return nanops.nancorr(x[0], x[1], method=method) @@ -12006,41 +12006,6 @@ def c(x): return correl - def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame: - """ - any ordered categorical columns are transformed to the respective - categorical codes while other columns remain untouched - """ - categ = self.select_dtypes("category") - if len(categ.columns) == 0: - return self - - data = self.copy(deep=False) - cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns.unique() - ser_generating_cols = [ - col for col in cols_convert if isinstance(data[col], Series) - ] - df_generating_cols = [ - col for col in cols_convert if isinstance(data[col], DataFrame) - ] - - if not ser_generating_cols and not df_generating_cols: - return self - - if ser_generating_cols: - data[ser_generating_cols] = data[ser_generating_cols].apply( - lambda x: x.cat.codes.replace(-1, np.nan) - ) - - for df_col in df_generating_cols: - data[df_col] = data[df_col].apply( - lambda x: x.cat.codes.replace(-1, np.nan) - if isinstance(x.dtype, CategoricalDtype) and bool(x.dtype.ordered) - else x - ) - - return data - # ---------------------------------------------------------------------- # ndarray-like stats methods diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py new file mode 100644 index 0000000000000..5d6b767b55e0c --- /dev/null +++ b/pandas/core/methods/corr.py @@ -0,0 +1,22 @@ +import numpy as np + +from pandas import DataFrame + + +def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame: + """ + any ordered categorical columns are transformed to the respective + categorical codes while other columns remain untouched + """ + + result = df + made_copy = False + for idx, dtype in enumerate(df.dtypes): + if not dtype == "category" or not dtype.ordered: + continue + col = result._ixs(idx, axis=1) + if not made_copy: + made_copy = True + result = result.copy(deep=False) + result._iset_item(idx, col.cat.codes.replace(-1, np.nan)) + return result From ff48847dfe420294d5afc3dc07e13d6ca9b83b76 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 18 Nov 2025 19:14:22 +0000 Subject: [PATCH 13/21] import fix --- pandas/core/methods/corr.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py index 5d6b767b55e0c..03b1ecd6c5a2b 100644 --- a/pandas/core/methods/corr.py +++ b/pandas/core/methods/corr.py @@ -1,6 +1,15 @@ +""" +Module for correlation related implementation +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + import numpy as np -from pandas import DataFrame +if TYPE_CHECKING: + from pandas import DataFrame def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame: From 1c69e29f5b4147fb80a73fbd9adea9fed330115e Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 18 Nov 2025 20:29:48 +0000 Subject: [PATCH 14/21] test tranform ordered cat func --- pandas/tests/methods/corr.py | 138 +++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 pandas/tests/methods/corr.py diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py new file mode 100644 index 0000000000000..db4d8e9b8f5ed --- /dev/null +++ b/pandas/tests/methods/corr.py @@ -0,0 +1,138 @@ +""" +Tests for core/methods/corr.py +""" + +import pytest +import numpy as np +from pandas import DataFrame, Series, Categorical +import pandas._testing as tm +from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols + + +@pytest.mark.parametrize( + ("input_df", "expected_df"), + [ + pytest.param( + # 1) Simple: two ordered categorical columns (with and without None) + DataFrame( + { + "ord_cat": Series( + Categorical( + ["low", "m", "h", "vh"], + categories=["low", "m", "h", "vh"], + ordered=True, + ) + ), + "ord_cat_none": Series( + Categorical( + ["low", "m", "h", None], + categories=["low", "m", "h"], + ordered=True, + ) + ), + } + ), + DataFrame( + { + # codes: low=0, m=1, h=2, vh=3 + "ord_cat": Series([0, 1, 2, 3], dtype="int8"), + # codes: low=0, m=1, h=2, None -> NaN + "ord_cat_none": Series([0, 1.0, 2.0, np.nan]), + } + ), + id="ordered-categoricals-basic", + ), + pytest.param( + # 2) Mixed dtypes: only the ordered categorical should change + DataFrame( + { + "ordered": Series( + Categorical( + ["a", "c", "b"], + categories=["a", "b", "c"], + ordered=True, + ) + ), + "unordered": Series( + Categorical(["x", "y", "x"], ordered=False) + ), + "num": Series([10, 20, 30]), + "text": Series(["u", "v", "w"]), + } + ), + DataFrame( + { + # codes: a=0, c=2, b=1 + "ordered": Series([0, 2, 1], dtype="int8"), + # unordered categorical should be untouched (still categorical) + "unordered": Series( + Categorical(["x", "y", "x"], ordered=False) + ), + "num": Series([10, 20, 30]), + "text": Series(["u", "v", "w"]), + } + ), + id="mixed-types-only-ordered-changes", + ), + pytest.param( + # 3 Duplicate column names: first 'dup' is ordered categorical, second 'dup' is non-categorical + DataFrame( + { + "dup": Series( + Categorical( + ["low", "m", "h"], + categories=["low", "m", "h"], + ordered=True, + ) + ), + "dup": Series([5, 6, 7]), # duplicate name, later column + } + ), + DataFrame( + { + # After transform: position 0 (ordered cat) becomes codes [0,1,2], + # position 1 remains untouched numbers [5,6,7]. + "dup": Series([0, 1, 2], dtype="int8"), + "dup": Series([5, 6, 7]), + } + ), + id="duplicate-names-ordered-first", + ), + pytest.param( + # 4 Duplicate column names: first 'dup' is non-categorical, second 'dup' is ordered categorical, third 'dup' is ordered categorical + DataFrame( + { + "dup": Series(["a", "b", "c"]), # non-categorical (object) + "dup": Series( + Categorical( + ["p", "q", None], + categories=["p", "q"], + ordered=True, + ) + ), + "dup": Series( + Categorical( + ["low", "m", "h"], + categories=["low", "m", "h"], + ordered=True, + ) + ), + } + ), + DataFrame( + { + # First stays object; second turns into codes [0,1,NaN] and third changes into codes [0, 1, 2] as well + "dup": Series(["a", "b", "c"]), + "dup": Series([0.0, 1.0, np.nan]), + "dup": Series([0, 1, 2], dtype="int8"), + } + ), + id="duplicate-names-ordered-and-non-categorical-and-none", + ), + ], +) +def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df): + out_df = transform_ord_cat_cols_to_coded_cols(input_df) + assert list(out_df.columns) == list(expected_df.columns) + for i, col in enumerate(out_df.columns): + tm.assert_series_equal(out_df.iloc[:, i], expected_df.iloc[:, i]) \ No newline at end of file From 8b26a7d554890fc7db2e9ac350e47440766e2567 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 18 Nov 2025 21:05:21 +0000 Subject: [PATCH 15/21] tests and mypy fixes --- pandas/core/methods/corr.py | 2 +- pandas/tests/methods/corr.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py index 03b1ecd6c5a2b..bd220ecf65863 100644 --- a/pandas/core/methods/corr.py +++ b/pandas/core/methods/corr.py @@ -21,7 +21,7 @@ def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame: result = df made_copy = False for idx, dtype in enumerate(df.dtypes): - if not dtype == "category" or not dtype.ordered: + if not dtype == "category" or not dtype.ordered: # type: ignore[attr-defined] continue col = result._ixs(idx, axis=1) if not made_copy: diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py index db4d8e9b8f5ed..8e167c44a0064 100644 --- a/pandas/tests/methods/corr.py +++ b/pandas/tests/methods/corr.py @@ -53,9 +53,7 @@ ordered=True, ) ), - "unordered": Series( - Categorical(["x", "y", "x"], ordered=False) - ), + "unordered": Series(Categorical(["x", "y", "x"], ordered=False)), "num": Series([10, 20, 30]), "text": Series(["u", "v", "w"]), } @@ -65,9 +63,7 @@ # codes: a=0, c=2, b=1 "ordered": Series([0, 2, 1], dtype="int8"), # unordered categorical should be untouched (still categorical) - "unordered": Series( - Categorical(["x", "y", "x"], ordered=False) - ), + "unordered": Series(Categorical(["x", "y", "x"], ordered=False)), "num": Series([10, 20, 30]), "text": Series(["u", "v", "w"]), } @@ -75,7 +71,8 @@ id="mixed-types-only-ordered-changes", ), pytest.param( - # 3 Duplicate column names: first 'dup' is ordered categorical, second 'dup' is non-categorical + # 3 Duplicate column names: first 'dup' is ordered categorical, + # second 'dup' is non-categorical DataFrame( { "dup": Series( @@ -99,7 +96,8 @@ id="duplicate-names-ordered-first", ), pytest.param( - # 4 Duplicate column names: first 'dup' is non-categorical, second 'dup' is ordered categorical, third 'dup' is ordered categorical + # 4 Duplicate column names: first 'dup' is non-categorical, + # second 'dup' is ordered categorical, third 'dup' is ordered categorical DataFrame( { "dup": Series(["a", "b", "c"]), # non-categorical (object) @@ -121,7 +119,8 @@ ), DataFrame( { - # First stays object; second turns into codes [0,1,NaN] and third changes into codes [0, 1, 2] as well + # First stays object; second turns into codes [0, 1, NaN] + # and third changes into codes [0, 1, 2] "dup": Series(["a", "b", "c"]), "dup": Series([0.0, 1.0, np.nan]), "dup": Series([0, 1, 2], dtype="int8"), @@ -135,4 +134,4 @@ def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df): out_df = transform_ord_cat_cols_to_coded_cols(input_df) assert list(out_df.columns) == list(expected_df.columns) for i, col in enumerate(out_df.columns): - tm.assert_series_equal(out_df.iloc[:, i], expected_df.iloc[:, i]) \ No newline at end of file + tm.assert_series_equal(out_df.iloc[:, i], expected_df.iloc[:, i]) From a6255200cc505be73c7cd3765c8d5725d449f60d Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 18 Nov 2025 21:40:11 +0000 Subject: [PATCH 16/21] type check fix --- pandas/core/methods/corr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py index bd220ecf65863..afadbf226221f 100644 --- a/pandas/core/methods/corr.py +++ b/pandas/core/methods/corr.py @@ -8,6 +8,8 @@ import numpy as np +from pandas.core.dtypes.dtypes import CategoricalDtype + if TYPE_CHECKING: from pandas import DataFrame @@ -21,7 +23,7 @@ def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame: result = df made_copy = False for idx, dtype in enumerate(df.dtypes): - if not dtype == "category" or not dtype.ordered: # type: ignore[attr-defined] + if not isinstance(dtype, CategoricalDtype) or not dtype.ordered: continue col = result._ixs(idx, axis=1) if not made_copy: From 259424e6ba56c60e3912c11ca16763b476f352db Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Tue, 18 Nov 2025 23:29:04 +0000 Subject: [PATCH 17/21] addressing review comments --- pandas/core/methods/corr.py | 3 +- pandas/tests/frame/methods/test_cov_corr.py | 48 ++++++------------- pandas/tests/methods/corr.py | 34 +++++++++----- pandas/tests/series/methods/test_cov_corr.py | 49 +++++++------------- 4 files changed, 56 insertions(+), 78 deletions(-) diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py index afadbf226221f..9d070b6dae652 100644 --- a/pandas/core/methods/corr.py +++ b/pandas/core/methods/corr.py @@ -16,8 +16,7 @@ def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame: """ - any ordered categorical columns are transformed to the respective - categorical codes while other columns remain untouched + Replace ordered categoricals with their codes, making a shallow copy if necessary. """ result = df diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index aa9ba93e72680..72240bf57929e 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -262,31 +262,21 @@ def test_corr_rank_ordered_categorical( ): df = DataFrame( { - "ord_cat": Series( - pd.Categorical( - ["low", "m", "h", "vh"], - categories=["low", "m", "h", "vh"], - ordered=True, - ) + "ord_cat": pd.Categorical( + ["low", "m", "h", "vh"], + categories=["low", "m", "h", "vh"], + ordered=True, ), - "ord_cat_none": Series( - pd.Categorical( - ["low", "m", "h", None], - categories=["low", "m", "h"], - ordered=True, - ) + "ord_cat_none": pd.Categorical( + ["low", "m", "h", None], + categories=["low", "m", "h"], + ordered=True, ), - "ord_int": Series([0, 1, 2, 3]), - "ord_float": Series([2.0, 3.0, 4.5, 6.5]), - "ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]), - "ord_cat_shuff": Series( - pd.Categorical( - ["m", "h", "vh", "low"], - categories=["low", "m", "h", "vh"], - ordered=True, - ) + "ord_cat_shuff": pd.Categorical( + ["m", "h", "vh", "low"], + categories=["low", "m", "h", "vh"], + ordered=True, ), - "ord_int_shuff": Series([2, 3, 0, 1]), } ) corr_calc = df.corr(method=method) @@ -300,24 +290,16 @@ def test_corr_rank_ordered_categorical_duplicate_columns( self, method, ): + cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True) df = DataFrame( { - "a": [1, 2, 3, 4], - "b": [4, 3, 2, 1], + "a": pd.array([1, 2, 3, 4], dtype=cat), + "b": pd.array([4, 3, 2, 1], dtype=cat), "c": [4, 3, 2, 1], "d": [10, 20, 30, 40], "e": [100, 200, 300, 400], } ) - df["a"] = ( - df["a"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) - ) - df["b"] = ( - df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) - ) - df["c"] = ( - df["c"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True) - ) df.columns = ["a", "a", "c", "c", "e"] corr_calc = df.corr(method=method) diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py index 8e167c44a0064..d781ea9b0c93a 100644 --- a/pandas/tests/methods/corr.py +++ b/pandas/tests/methods/corr.py @@ -2,9 +2,14 @@ Tests for core/methods/corr.py """ -import pytest import numpy as np -from pandas import DataFrame, Series, Categorical +import pytest + +from pandas import ( + Categorical, + DataFrame, + Series, +) import pandas._testing as tm from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols @@ -75,22 +80,22 @@ # second 'dup' is non-categorical DataFrame( { - "dup": Series( + "dup_1": Series( Categorical( ["low", "m", "h"], categories=["low", "m", "h"], ordered=True, ) ), - "dup": Series([5, 6, 7]), # duplicate name, later column + "dup_2": Series([5, 6, 7]), # duplicate name, later column } ), DataFrame( { # After transform: position 0 (ordered cat) becomes codes [0,1,2], # position 1 remains untouched numbers [5,6,7]. - "dup": Series([0, 1, 2], dtype="int8"), - "dup": Series([5, 6, 7]), + "dup_1": Series([0, 1, 2], dtype="int8"), + "dup_2": Series([5, 6, 7]), } ), id="duplicate-names-ordered-first", @@ -100,15 +105,15 @@ # second 'dup' is ordered categorical, third 'dup' is ordered categorical DataFrame( { - "dup": Series(["a", "b", "c"]), # non-categorical (object) - "dup": Series( + "dup_1": Series(["a", "b", "c"]), # non-categorical (object) + "dup_2": Series( Categorical( ["p", "q", None], categories=["p", "q"], ordered=True, ) ), - "dup": Series( + "dup_3": Series( Categorical( ["low", "m", "h"], categories=["low", "m", "h"], @@ -121,9 +126,9 @@ { # First stays object; second turns into codes [0, 1, NaN] # and third changes into codes [0, 1, 2] - "dup": Series(["a", "b", "c"]), - "dup": Series([0.0, 1.0, np.nan]), - "dup": Series([0, 1, 2], dtype="int8"), + "dup_1": Series(["a", "b", "c"]), + "dup_2": Series([0.0, 1.0, np.nan]), + "dup_3": Series([0, 1, 2], dtype="int8"), } ), id="duplicate-names-ordered-and-non-categorical-and-none", @@ -131,6 +136,11 @@ ], ) def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df): + # duplicate columns creation for dup columns + if "dup_1" in input_df.columns: + input_df.columns = ["dup" for _ in range(len(input_df.columns))] + expected_df.columns = ["dup" for _ in range(len(expected_df.columns))] + out_df = transform_ord_cat_cols_to_coded_cols(input_df) assert list(out_df.columns) == list(expected_df.columns) for i, col in enumerate(out_df.columns): diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index e423a82710ac4..99112362d21bd 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -187,19 +187,19 @@ def test_corr_callable_method(self, datetime_series): @pytest.mark.parametrize("method", ["kendall", "spearman"]) @pytest.mark.parametrize( - "ord_cat_series", + "cat_series", [ - Series( # ordered categorical series - pd.Categorical( - ["low", "med", "high", "very_high"], - categories=["low", "med", "high", "very_high"], + Series( + pd.Categorical( # ordered cat series + ["low", "medium", "high"], + categories=["low", "medium", "high"], ordered=True, ) ), - Series( # ordered categorical series with nan and a different ranking - pd.Categorical( - ["h", "low", "vh", None], - categories=["low", "m", "h", "vh"], + Series( + pd.Categorical( # ordered cat series with NA + ["low", "medium", "high", None], + categories=["low", "medium", "high"], ordered=True, ) ), @@ -208,36 +208,23 @@ def test_corr_callable_method(self, datetime_series): @pytest.mark.parametrize( "other_series", [ - Series( # int series against which tord cat series is correlated - [0, 1, 2, 3] - ), - Series( # float series against which ord cat series is correlated - [2.0, 3.0, 4.5, 6.5] - ), - Series( # other ord cat series against which ord cat series is correlated + Series( # other cat ordered series pd.Categorical( - ["high", "low", "very_high", "med"], - categories=["low", "med", "high", "very_high"], + ["m", "l", "h"], + categories=["l", "m", "h"], ordered=True, ) ), + # other non cat series + Series([2, 1, 3]), ], ) def test_corr_rank_ordered_categorical( self, method, - ord_cat_series, + cat_series, other_series, ): - stats = pytest.importorskip("scipy.stats") - method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr} - ord_ser_cat_codes = ord_cat_series.cat.codes.replace(-1, np.nan) - - if other_series.dtype == "category" and other_series.cat.ordered: - other_series = other_series.cat.codes.replace(-1, np.nan) - - corr_calc = ord_cat_series.corr(other_series, method=method) - corr_expected = method_scipy_func[method]( - ord_ser_cat_codes, other_series, nan_policy="omit" - )[0] - tm.assert_almost_equal(corr_calc, corr_expected) + expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5} + corr_calc = cat_series.corr(other_series, method=method) + tm.assert_almost_equal(corr_calc, expected_corr[method]) From d2d0f719625b222126d1b2b267d3f820bfcb8195 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 19 Nov 2025 08:59:24 -0500 Subject: [PATCH 18/21] type fix corr.py --- pandas/tests/methods/corr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py index d781ea9b0c93a..021e40024a4d1 100644 --- a/pandas/tests/methods/corr.py +++ b/pandas/tests/methods/corr.py @@ -135,7 +135,7 @@ ), ], ) -def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df): +def test_transform_ord_cat_cols_to_coded_cols(input_df: DataFrame, expected_df: DataFrame): # duplicate columns creation for dup columns if "dup_1" in input_df.columns: input_df.columns = ["dup" for _ in range(len(input_df.columns))] From 858d0c2360bfa46ec6b9495c514dbc5751d7d6a0 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 19 Nov 2025 14:23:18 +0000 Subject: [PATCH 19/21] ruff format --- pandas/tests/methods/corr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py index 021e40024a4d1..86d94a69825be 100644 --- a/pandas/tests/methods/corr.py +++ b/pandas/tests/methods/corr.py @@ -135,7 +135,9 @@ ), ], ) -def test_transform_ord_cat_cols_to_coded_cols(input_df: DataFrame, expected_df: DataFrame): +def test_transform_ord_cat_cols_to_coded_cols( + input_df: DataFrame, expected_df: DataFrame +): # duplicate columns creation for dup columns if "dup_1" in input_df.columns: input_df.columns = ["dup" for _ in range(len(input_df.columns))] From a8c88c778966ada8ed2e4e5d40ddc096d8ce0a2c Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 19 Nov 2025 14:50:08 +0000 Subject: [PATCH 20/21] mypy fix --- pandas/tests/methods/corr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py index 86d94a69825be..4b14159801c39 100644 --- a/pandas/tests/methods/corr.py +++ b/pandas/tests/methods/corr.py @@ -137,7 +137,7 @@ ) def test_transform_ord_cat_cols_to_coded_cols( input_df: DataFrame, expected_df: DataFrame -): +) -> None: # duplicate columns creation for dup columns if "dup_1" in input_df.columns: input_df.columns = ["dup" for _ in range(len(input_df.columns))] From 71305aa99ac53901ee8df348aecf8b481c7717c0 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 19 Nov 2025 15:16:58 +0000 Subject: [PATCH 21/21] scipy unavailable fix in test --- pandas/tests/series/methods/test_cov_corr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 99112362d21bd..2d23165c4733e 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Series, @@ -185,6 +187,7 @@ def test_corr_callable_method(self, datetime_series): expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) + @td.skip_if_no("scipy") @pytest.mark.parametrize("method", ["kendall", "spearman"]) @pytest.mark.parametrize( "cat_series",