Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,8 @@ In cases with mixed-resolution inputs, the highest resolution is used:

.. warning:: Many users will now get "M8[us]" dtype data in cases when they used to get "M8[ns]". For most use cases they should not notice a difference. One big exception is converting to integers, which will give integers 1000x smaller.

Similarly, the :class:`Timedelta` constructor and :func:`to_timedelta` with a string input now default to a microsecond unit, using a nanosecond unit only in cases that actually have nanosecond precision.

.. _whatsnew_300.api_breaking.concat_datetime_sorting:

:func:`concat` no longer ignores ``sort`` when all objects have a :class:`DatetimeIndex`
Expand Down
25 changes: 22 additions & 3 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import collections
import re
import warnings

from pandas.util._decorators import set_module
Expand Down Expand Up @@ -679,6 +680,17 @@ cdef timedelta_from_spec(object number, object frac, object unit):
return cast_from_unit(float(n), unit)


cdef bint needs_nano_unit(int64_t ival, str item):
"""
Check if a passed string `item` needs to be stored with nano unit or can
use microsecond instead.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
use microsecond instead.
use microsecond instead. Nanoseconds are needed if any of the following holds:
- the parsed value in nanoseconds has sub-microsecond content -> certainly
needs nano
- the seconds part in the string contains more than 6 decimals, i.e. has
trailing zeros beyond the microsecond part (e.g. "0.123456000 s") -> treat
as nano for consistency
- the string explicitly contains an entry for nanoseconds (e.g. "1000 ns")

I had a hard time understanding what the function was doing on first read, so I think a bit more explanation like above would help for future readers

"""
# TODO: more performant way of doing this check?
if ival % 1000 != 0:
return True
return re.search(r"\.\d{7}", item) or "ns" in item or "nano" in item


cpdef inline str parse_timedelta_unit(str unit):
"""
Parameters
Expand Down Expand Up @@ -2078,10 +2090,17 @@ class Timedelta(_Timedelta):
if (len(value) > 0 and value[0] == "P") or (
len(value) > 1 and value[:2] == "-P"
):
value = parse_iso_format_string(value)
ival = parse_iso_format_string(value)
else:
ival = parse_timedelta_string(value)

if not needs_nano_unit(ival, value):
# If we don't specifically need nanosecond resolution, default
# to microsecond like we do for datetimes
value = np.timedelta64(ival // 1000, "us")
return cls(value)
else:
value = parse_timedelta_string(value)
value = np.timedelta64(value)
value = np.timedelta64(ival, "ns")
elif PyDelta_Check(value):
# pytimedelta object -> microsecond resolution
new_value = delta_to_nanoseconds(
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/arithmetic/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,9 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array
# i.e. resolution is lower -> use lowest supported resolution
dtype = np.dtype("m8[s]")
expected = expected.astype(dtype)
elif type(three_days) is timedelta:
elif type(three_days) is timedelta or (
isinstance(three_days, Timedelta) and three_days.unit == "us"
):
expected = expected.astype("m8[us]")
elif isinstance(
three_days,
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def test_len_nan_group():

def test_groupby_timedelta_median():
# issue 57926
expected = Series(data=Timedelta("1D"), index=["foo"])
expected = Series(data=Timedelta("1D"), index=["foo"], dtype="m8[ns]")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wondering why ns, but it is the other PR that will preserve the unit of the Timedelta object when converting to an array?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding the dtype here preserves the current dtype for expected. The other PR will change the dtype for df["timedelta"] below, so an update will be needed after one of them gets merged

df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1D")]})
gb = df.groupby("label")["timedelta"]
actual = gb.median()
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/scalar/timedelta/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def test_td_mul_td64_ndarray_invalid(self):

msg = (
"ufunc '?multiply'? cannot use operands with types "
rf"dtype\('{tm.ENDIAN}m8\[ns\]'\) and dtype\('{tm.ENDIAN}m8\[ns\]'\)"
rf"dtype\('{tm.ENDIAN}m8\[us\]'\) and dtype\('{tm.ENDIAN}m8\[us\]'\)"
)
with pytest.raises(TypeError, match=msg):
td * other
Expand Down Expand Up @@ -1219,6 +1219,7 @@ def test_ops_str_deprecated(box):
"ufunc 'divide' cannot use operands",
"Invalid dtype object for __floordiv__",
r"unsupported operand type\(s\) for /: 'int' and 'str'",
r"unsupported operand type\(s\) for /: 'datetime.timedelta' and 'str'",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious, how is this caused by the changes here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was only on the dev builds and when box=True, so we are dividing by np.array(["1"], dtype=object). I suspect that when td.to_timedelta64() has a "us" unit, it tries casting to a pytimedelta to operate.

]
)
with pytest.raises(TypeError, match=msg):
Expand Down
26 changes: 22 additions & 4 deletions pandas/tests/scalar/timedelta/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,12 +271,12 @@ def test_construction():
expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8")
assert Timedelta(10, unit="D")._value == expected
assert Timedelta(10.0, unit="D")._value == expected
assert Timedelta("10 days")._value == expected
assert Timedelta("10 days")._value == expected // 1000
assert Timedelta(days=10)._value == expected
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to keep track, this is another code path that should also still be updated ideally? (for another PR)
Similar to doing Timedelta(datetime.timedelta()), which already gives "us".

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what you're suggesting. I'd be open to changing .value to no longer always cast to nanos, but _value is correct here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, this was about the Timedelta(days=10) still returning nanos (so about the line that did not change)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh i see. will be starting a branch for that shortly

assert Timedelta(days=10.0)._value == expected

expected += np.timedelta64(10, "s").astype("m8[ns]").view("i8")
assert Timedelta("10 days 00:00:10")._value == expected
assert Timedelta("10 days 00:00:10")._value == expected // 1000
assert Timedelta(days=10, seconds=10)._value == expected
assert Timedelta(days=10, milliseconds=10 * 1000)._value == expected
assert Timedelta(days=10, microseconds=10 * 1000 * 1000)._value == expected
Expand Down Expand Up @@ -434,7 +434,7 @@ def test_td_construction_with_np_dtypes(npdtype, item):
def test_td_from_repr_roundtrip(val):
# round-trip both for string and value
td = Timedelta(val)
assert Timedelta(td._value) == td
assert Timedelta(td.value) == td

assert Timedelta(str(td)) == td
assert Timedelta(td._repr_base(format="all")) == td
Expand All @@ -443,7 +443,7 @@ def test_td_from_repr_roundtrip(val):

def test_overflow_on_construction():
# GH#3374
value = Timedelta("1day")._value * 20169940
value = Timedelta("1day").as_unit("ns")._value * 20169940
msg = "Cannot cast 1742682816000000000000 from ns to 'ns' without overflow"
with pytest.raises(OutOfBoundsTimedelta, match=msg):
Timedelta(value)
Expand Down Expand Up @@ -705,3 +705,21 @@ def test_non_nano_value():
# check that the suggested workaround actually works
result = td.asm8.view("i8")
assert result == 86400000000


def test_parsed_unit():
td = Timedelta("1 Day")
assert td.unit == "us"

td = Timedelta("1 Day 2 hours 3 minutes 4 ns")
assert td.unit == "ns"

td = Timedelta("1 Day 2:03:04.012345")
assert td.unit == "us"

td = Timedelta("1 Day 2:03:04.012345000")
assert td.unit == "ns"

# 7 digits after the decimal
td = Timedelta("1 Day 2:03:04.0123450")
assert td.unit == "ns"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
assert td.unit == "ns"
assert td.unit == "ns"
td = Timedelta("1 Day 2:03:04.012345000")
assert td.unit == "ns"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will update

5 changes: 3 additions & 2 deletions pandas/tests/scalar/timedelta/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ def test_total_seconds_scalar(self):

def test_conversion(self):
for td in [Timedelta(10, unit="D"), Timedelta("1 days, 10:11:12.012345")]:
td = td.as_unit("ns")
pydt = td.to_pytimedelta()
assert td == Timedelta(pydt)
assert td == pydt
Expand Down Expand Up @@ -385,8 +386,8 @@ def check(value):
assert abs(td) == Timedelta("13:48:48")
assert str(td) == "-1 days +10:11:12"
assert -td == Timedelta("0 days 13:48:48")
assert -Timedelta("-1 days, 10:11:12")._value == 49728000000000
assert Timedelta("-1 days, 10:11:12")._value == -49728000000000
assert -Timedelta("-1 days, 10:11:12")._value == 49728000000
assert Timedelta("-1 days, 10:11:12")._value == -49728000000

rng = to_timedelta("-1 days, 10:11:12.100123456")
assert rng.days == -1
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/scalar/timestamp/test_timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,11 +367,11 @@ def test_roundtrip(self):
# further test accessors
base = Timestamp("20140101 00:00:00").as_unit("ns")

result = Timestamp(base._value + Timedelta("5ms")._value)
result = Timestamp(base._value + Timedelta("5ms").value)
assert result == Timestamp(f"{base}.005000")
assert result.microsecond == 5000

result = Timestamp(base._value + Timedelta("5us")._value)
result = Timestamp(base._value + Timedelta("5us").value)
assert result == Timestamp(f"{base}.000005")
assert result.microsecond == 5

Expand All @@ -380,11 +380,11 @@ def test_roundtrip(self):
assert result.nanosecond == 5
assert result.microsecond == 0

result = Timestamp(base._value + Timedelta("6ms 5us")._value)
result = Timestamp(base._value + Timedelta("6ms 5us").value)
assert result == Timestamp(f"{base}.006005")
assert result.microsecond == 5 + 6 * 1000

result = Timestamp(base._value + Timedelta("200ms 5us")._value)
result = Timestamp(base._value + Timedelta("200ms 5us").value)
assert result == Timestamp(f"{base}.200005")
assert result.microsecond == 5 + 200 * 1000

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ def test_fillna_pytimedelta(self):
ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"])

result = ser.fillna(timedelta(1))
expected = Series(Timedelta("1 days"), index=["A", "B"])
expected = Series(Timedelta("1 days"), index=["A", "B"], dtype="m8[ns]")
tm.assert_series_equal(result, expected)

def test_fillna_period(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def test_arithmetic_with_duplicate_index(self):
ser = Series(date_range("20130101 09:00:00", periods=5), index=index)
other = Series(date_range("20130101", periods=5), index=index)
result = ser - other
expected = Series(Timedelta("9 hours"), index=[2, 2, 3, 3, 4])
expected = Series(Timedelta("9 hours"), index=[2, 2, 3, 3, 4], dtype="m8[ns]")
tm.assert_series_equal(result, expected)

def test_masked_and_non_masked_propagate_na(self):
Expand Down
Loading