diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 10b56011c9640..fbe8164863237 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1117,6 +1117,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.str.match` failing to raise when given a compiled ``re.Pattern`` object and conflicting ``case`` or ``flags`` arguments (:issue:`62240`) - Bug in :meth:`Series.str.replace` raising an error on valid group references (``\1``, ``\2``, etc.) on series converted to PyArrow backend dtype (:issue:`62653`) - Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for :class:`ArrowDtype` (:issue:`61485`) - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 110473be5d27c..210a177d073f7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1351,7 +1351,13 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): + def match( + self, + pat: str | re.Pattern, + case: bool | lib.NoDefault = lib.no_default, + flags: int | lib.NoDefault = lib.no_default, + na=lib.no_default, + ): """ Determine if each string starts with a match of a regular expression. @@ -1397,6 +1403,39 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): 2 False dtype: bool """ + if flags is not lib.no_default: + # pat.flags will have re.U regardless, so we need to add it here + # before checking for a match + flags = flags | re.U + if is_re(pat): + if pat.flags != flags: + raise ValueError( + "Cannot both specify 'flags' and pass a compiled regexp " + "object with conflicting flags" + ) + else: + pat = re.compile(pat, flags=flags) + # set flags=0 to ensure that when we call + # re.compile(pat, flags=flags) the constructor does not raise. + flags = 0 + else: + flags = 0 + + if case is lib.no_default: + if is_re(pat): + case = not bool(pat.flags & re.IGNORECASE) + else: + # Case-sensitive default + case = True + elif is_re(pat): + implicit_case = not bool(pat.flags & re.IGNORECASE) + if implicit_case != case: + # GH#62240 + raise ValueError( + "Cannot both specify 'case' and pass a compiled regexp " + "object with conflicting case-sensitivity" + ) + result = self._data.array._str_match(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c948fec8e7aa2..bd8fb2e2326f6 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -238,7 +238,17 @@ def _str_match( if not case: flags |= re.IGNORECASE - regex = re.compile(pat, flags=flags) + if isinstance(pat, re.Pattern): + # We need to check that flags matches pat.flags. + # pat.flags will have re.U regardless, so we need to add it here + # before checking for a match + flags = flags | re.U + + if flags != pat.flags: + raise ValueError("Cannot pass flags that do not match pat.flags") + regex = pat + else: + regex = re.compile(pat, flags=flags) f = lambda x: regex.match(x) is not None return self._str_map(f, na_value=na, dtype=np.dtype(bool)) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 12a3bd63cea90..2a244f8329345 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -1004,26 +1004,27 @@ def test_match_compiled_regex(any_string_dtype): expected = Series([True, False, True, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - # TODO this currently works for pyarrow-backed dtypes but raises for python - if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow": - result = values.str.match(re.compile("ab"), case=False) - expected = Series([True, True, True, True], dtype=expected_dtype) - tm.assert_series_equal(result, expected) - else: - with pytest.raises( - ValueError, match="cannot process flags argument with a compiled pattern" - ): - values.str.match(re.compile("ab"), case=False) + msg = ( + "Cannot both specify 'case' and pass a compiled " + "regexp object with conflicting case-sensitivity" + ) + with pytest.raises(ValueError, match=msg): + values.str.match(re.compile("ab"), case=False) result = values.str.match(re.compile("ab", flags=re.IGNORECASE)) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with pytest.raises( - ValueError, match="cannot process flags argument with a compiled pattern" - ): + msg = ( + "Cannot both specify 'flags' and pass a compiled " + "regexp object with conflicting flags" + ) + with pytest.raises(ValueError, match=msg): values.str.match(re.compile("ab"), flags=re.IGNORECASE) + # But if the flags match you're OK + values.str.match(re.compile("ab", flags=re.IGNORECASE), flags=re.IGNORECASE) + @pytest.mark.parametrize( "pat, case, exp",