Commit 2575866

Merge branch 'main' into api-date_range

2 parents: be8e591 + 95624ca
8 files changed: 64 additions & 22 deletions

doc/source/whatsnew/v2.3.4.rst

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ including other versions of pandas.
 Bug fixes
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
+- Bug in :meth:`Series.str.replace` raising an error on valid group references (``\1``, ``\2``, etc.) on series converted to PyArrow backend dtype (:issue:`62653`)

 .. ---------------------------------------------------------------------------
 .. _whatsnew_234.contributors:
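
The ``Series.str.replace`` entry added above is easiest to see with a small example. The snippet below is an illustrative sketch, not part of the commit; it assumes pyarrow is installed and shows the behaviour the fix restores for ``string[pyarrow]`` data (GH 62653).

import pandas as pd

# Group references in the replacement string, e.g. \1 and \2, previously
# raised for PyArrow-backed string data; with the fix they behave as they
# do for the default string dtype.
ser = pd.Series(["ab", "cd"], dtype="string[pyarrow]")
swapped = ser.str.replace(r"(a)(b)", r"\2\1", regex=True)
print(swapped.tolist())  # expected after the fix: ["ba", "cd"]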

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion
@@ -1124,7 +1124,6 @@ Conversion
 Strings
 ^^^^^^^
 - Bug in :meth:`Series.str.match` failing to raise when given a compiled ``re.Pattern`` object and conflicting ``case`` or ``flags`` arguments (:issue:`62240`)
-- Bug in :meth:`Series.str.replace` raising an error on valid group references (``\1``, ``\2``, etc.) on series converted to PyArrow backend dtype (:issue:`62653`)
 - Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for :class:`ArrowDtype` (:issue:`61485`)
 - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
 - Bug in multiplication with a :class:`StringDtype` incorrectly allowing multiplying by bools; explicitly cast to integers instead (:issue:`62595`)
@@ -1268,6 +1267,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
 - Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`)
 - Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` computing incorrect results due to numerical instability. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`)
+- Bug in :meth:`DataFrame.groupby` methods when operating on NumPy-nullable data failing when the NA mask was not C-contiguous (:issue:`61031`)

 Reshaping
 ^^^^^^^^^

pandas/_libs/groupby.pyx

Lines changed: 13 additions & 13 deletions
@@ -819,7 +819,7 @@ def group_prod(
     int64_t[::1] counts,
     ndarray[int64float_t, ndim=2] values,
     const intp_t[::1] labels,
-    const uint8_t[:, ::1] mask,
+    const uint8_t[:, :] mask,
     uint8_t[:, ::1] result_mask=None,
     Py_ssize_t min_count=0,
     bint skipna=True,
@@ -893,7 +893,7 @@ def group_var(
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     int64_t ddof=1,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint is_datetimelike=False,
     str name="var",
@@ -998,7 +998,7 @@ def group_skew(
     int64_t[::1] counts,
     ndarray[float64_t, ndim=2] values,
     const intp_t[::1] labels,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
@@ -1086,7 +1086,7 @@ def group_kurt(
     int64_t[::1] counts,
     ndarray[float64_t, ndim=2] values,
     const intp_t[::1] labels,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
@@ -1180,7 +1180,7 @@ def group_mean(
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
@@ -1324,7 +1324,7 @@ def group_ohlc(
     ndarray[int64float_t, ndim=2] values,
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
 ) -> None:
     """
@@ -1870,7 +1870,7 @@ cdef group_min_max(
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
     bint compute_max=True,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ):
@@ -1983,7 +1983,7 @@ def group_idxmin_idxmax(
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     str name="idxmin",
     bint skipna=True,
     uint8_t[:, ::1] result_mask=None,
@@ -2096,7 +2096,7 @@ def group_max(
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
@@ -2124,7 +2124,7 @@ def group_min(
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
@@ -2148,7 +2148,7 @@ def group_min(
 cdef group_cummin_max(
     numeric_t[:, ::1] out,
     ndarray[numeric_t, ndim=2] values,
-    const uint8_t[:, ::1] mask,
+    const uint8_t[:, :] mask,
     uint8_t[:, ::1] result_mask,
     const intp_t[::1] labels,
     int ngroups,
@@ -2264,7 +2264,7 @@ def group_cummin(
     const intp_t[::1] labels,
     int ngroups,
     bint is_datetimelike,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
@@ -2290,7 +2290,7 @@ def group_cummax(
     const intp_t[::1] labels,
     int ngroups,
     bint is_datetimelike,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
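
All of the signature changes above relax the mask memoryview from ``const uint8_t[:, ::1]`` (C-contiguous) to ``const uint8_t[:, :]`` (arbitrary strides). Below is a minimal sketch of why that matters, mirroring the regression test added later in this commit (GH 61031); it is illustrative only and peeks at the private ``_mask`` attribute purely to show the layout.

import pandas as pd

# Reversing a frame with a slice gives NumPy-nullable columns whose boolean
# NA mask has negative strides, i.e. it is no longer C-contiguous.
df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64")
reversed_df = df[::-1]
print(reversed_df["b"].array._mask.flags["C_CONTIGUOUS"])  # expected: False

# A C-contiguous-only memoryview would reject that buffer inside the Cython
# groupby kernels; the relaxed [:, :] signature accepts it, so the groupby
# below works without copying the mask first.
print(reversed_df.groupby("a").sum())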

pandas/_libs/internals.pyx

Lines changed: 5 additions & 1 deletion
@@ -1006,8 +1006,12 @@ cdef class BlockValuesRefs:

 cdef extern from "Python.h":
     """
+    // python version < 3.14
     #if PY_VERSION_HEX < 0x030E0000
-    int __Pyx_PyUnstable_Object_IsUniqueReferencedTemporary(PyObject *ref);
+    // This function is unused and is declared to avoid a build warning
+    int __Pyx_PyUnstable_Object_IsUniqueReferencedTemporary(PyObject *ref) {
+        return Py_REFCNT(ref) == 1;
+    }
     #else
     #define __Pyx_PyUnstable_Object_IsUniqueReferencedTemporary \
         PyUnstable_Object_IsUniqueReferencedTemporary

pandas/core/arrays/datetimelike.py

Lines changed: 3 additions & 0 deletions
@@ -2342,6 +2342,9 @@ def _with_freq(self, freq) -> Self:

     def _values_for_json(self) -> np.ndarray:
         # Small performance bump vs the base class which calls np.asarray(self)
+        if self.unit != "ns":
+            # GH#55827
+            return self.as_unit("ns")._values_for_json()
         if isinstance(self.dtype, np.dtype):
             return self._ndarray
         return super()._values_for_json()
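
The change above routes non-nanosecond datetime data through a nanosecond view before JSON serialization (GH 55827). The sketch below is illustrative, not part of the commit; the column name and ``orient`` are arbitrary, and it assumes the round trip that the removed xfail in test_pandas.py (further down) previously marked as failing.

from io import StringIO

import pandas as pd

# A second-resolution datetime column is serialized via nanoseconds after
# this change, so it can round-trip through to_json/read_json.
df = pd.DataFrame({"x": pd.to_datetime(["2000-01-01", "2000-01-02"]).as_unit("s")})
payload = df.to_json(orient="split")
roundtripped = pd.read_json(StringIO(payload), orient="split", convert_dates=["x"])

# The restored dtype is datetime64[ns], which is why the expected frame in
# test_pandas.py below is cast to "M8[ns]".
print(roundtripped["x"].dtype)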

pandas/tests/frame/methods/test_update.py

Lines changed: 17 additions & 0 deletions
@@ -197,6 +197,7 @@ def test_update_dt_column_with_NaT_create_column(self):
             np.datetime64("2000-01-02T00:00:00"),
             np.dtype("datetime64[ns]"),
         ),
+        (1, 2, pd.Int64Dtype()),
     ],
 )
 def test_update_preserve_dtype(self, value_df, value_other, dtype):
@@ -228,3 +229,19 @@ def test_update_on_duplicate_frame_unique_argument_index(self):
         expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2], dtype=np.dtype("intc"))
         df.update(other)
         tm.assert_frame_equal(df, expected)
+
+    def test_update_preserve_mixed_dtypes(self):
+        # GH#44104
+        dtype1 = pd.Int64Dtype()
+        dtype2 = pd.StringDtype()
+        df = DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+        df = df.astype({"a": dtype1, "b": dtype2})
+
+        other = DataFrame({"a": [4, 5], "b": ["a", "b"]})
+        other = other.astype({"a": dtype1, "b": dtype2})
+
+        expected = DataFrame({"a": [4, 5, 3], "b": ["a", "b", "z"]})
+        expected = expected.astype({"a": dtype1, "b": dtype2})
+
+        df.update(other)
+        tm.assert_frame_equal(df, expected)

pandas/tests/groupby/test_all_methods.py

Lines changed: 19 additions & 0 deletions
@@ -84,3 +84,22 @@ def test_dup_labels_output_shape(groupby_func, idx):

     assert result.shape == (1, 2)
     tm.assert_index_equal(result.columns, idx)
+
+
+def test_not_c_contiguous_mask(groupby_func):
+    # https://github.com/pandas-dev/pandas/issues/61031
+    if groupby_func == "corrwith":
+        # corrwith is deprecated
+        return
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64")
+    reversed = DataFrame(
+        {"a": [2, 1, 1], "b": [5, 4, 3]}, dtype="Int64", index=[2, 1, 0]
+    )[::-1]
+    assert not reversed["b"].array._mask.flags["C_CONTIGUOUS"]
+    args = get_groupby_method_args(groupby_func, df)
+
+    gb_reversed = reversed.groupby("a")
+    result = getattr(gb_reversed, groupby_func)(*args)
+    gb = df.groupby("a")
+    expected = getattr(gb, groupby_func)(*args)
+    tm.assert_equal(result, expected)

pandas/tests/io/json/test_pandas.py

Lines changed: 5 additions & 7 deletions
@@ -135,12 +135,6 @@ def test_frame_non_unique_index_raises(self, orient):
         ],
     )
     def test_frame_non_unique_columns(self, orient, data, request):
-        if isinstance(data[0][0], Timestamp) and orient == "split":
-            mark = pytest.mark.xfail(
-                reason="GH#55827 non-nanosecond dt64 fails to round-trip"
-            )
-            request.applymarker(mark)
-
         df = DataFrame(data, index=[1, 2], columns=["x", "x"])

         expected_warning = None
@@ -162,10 +156,14 @@ def test_frame_non_unique_columns(self, orient, data, request):
             # in milliseconds; these are internally stored in nanosecond,
             # so divide to get where we need
             # TODO: a to_epoch method would also solve; see GH 14772
-            expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000)
+            dta = expected.iloc[:, 0]._values
+            dta = dta.as_unit("ns")  # GH#55827
+            expected.isetitem(0, dta.astype(np.int64) // 1_000_000)
         elif orient == "split":
             expected = df
             expected.columns = ["x", "x.1"]
+            if expected["x"].dtype.kind == "M":
+                expected["x"] = expected["x"].astype("M8[ns]")  # GH#55827

         tm.assert_frame_equal(result, expected)