diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 93b0ccad9f2aa..8529ff885690a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -355,6 +355,7 @@ Datetimelike - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) +- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Timedelta diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b5f448aa32fad..8736b9d5ec8d6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -530,7 +530,9 @@ cpdef array_to_datetime( state.update_creso(item_reso) if infer_reso: creso = state.creso - iresult[i] = cast_from_unit(val, "ns", out_reso=creso) + + # we now need to parse this as if unit=abbrev + iresult[i] = cast_from_unit(val, abbrev, out_reso=creso) state.found_other = True elif isinstance(val, str): @@ -779,6 +781,13 @@ def array_to_datetime_with_tz( _TSObject tsobj bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC DatetimeParseState state = DatetimeParseState(creso) + str abbrev + + if infer_reso: + # We treat ints/floats as nanoseconds + abbrev = "ns" 
+ else: + abbrev = npy_unit_to_abbrev(creso) for i in range(n): # Analogous to `item = values[i]` @@ -790,7 +799,12 @@ def array_to_datetime_with_tz( else: tsobj = convert_to_tsobject( - item, tz=tz, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst, nanos=0 + item, + tz=tz, + unit=abbrev, + dayfirst=dayfirst, + yearfirst=yearfirst, + nanos=0, ) if tsobj.value != NPY_NAT: state.update_creso(tsobj.creso) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b100a6a80f8ed..9079e33e08dc7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2220,6 +2220,7 @@ def _sequence_to_dt64( data = cast(np.ndarray, data) copy = False if lib.infer_dtype(data, skipna=False) == "integer": + # Much more performant than going through array_to_datetime data = data.astype(np.int64) elif tz is not None and ambiguous == "raise": obj_data = np.asarray(data, dtype=object) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 7110a428544d5..35c3604de3d63 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1012,11 +1012,17 @@ def test_dti_constructor_with_non_nano_dtype(self, tz): dtype = "M8[us]" if tz is not None: dtype = f"M8[us, {tz}]" - # NB: the 2500 is interpreted as nanoseconds and rounded *down* - # to 2 microseconds vals = [ts, "2999-01-02 03:04:05.678910", 2500] result = DatetimeIndex(vals, dtype=dtype) - exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals] + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concatenated the results. 
+ pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] exp_arr = np.array(exp_vals, dtype="M8[us]") expected = DatetimeIndex(exp_arr, dtype="M8[us]") if tz is not None: @@ -1054,6 +1060,36 @@ def test_dti_constructor_object_float_matches_float_dtype(self): dti2 = DatetimeIndex(arr2, tz="CET") tm.assert_index_equal(dti1, dti2) + @pytest.mark.parametrize("dtype", ["M8[us]", "M8[us, US/Pacific]"]) + def test_dti_constructor_with_dtype_object_int_matches_int_dtype(self, dtype): + # Going through the object path should match the non-object path + + vals1 = np.arange(5, dtype="i8") * 1000 + vals1[0] = pd.NaT.value + + vals2 = vals1.astype(np.float64) + vals2[0] = np.nan + + vals3 = vals1.astype(object) + # change lib.infer_dtype(vals3) from "integer" so we go through + # array_to_datetime in _sequence_to_dt64 + vals3[0] = pd.NaT + + vals4 = vals2.astype(object) + + res1 = DatetimeIndex(vals1, dtype=dtype) + res2 = DatetimeIndex(vals2, dtype=dtype) + res3 = DatetimeIndex(vals3, dtype=dtype) + res4 = DatetimeIndex(vals4, dtype=dtype) + + expected = DatetimeIndex(vals1.view("M8[us]")) + if res1.tz is not None: + expected = expected.tz_localize("UTC").tz_convert(res1.tz) + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + class TestTimeSeries: def test_dti_constructor_preserve_dti_freq(self): diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 7d4ad99deab38..aca06a2f91c32 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -25,6 +25,7 @@ Timestamp, cut, date_range, + to_datetime, ) import pandas._testing as tm @@ -114,13 +115,19 @@ def test_astype_object_to_dt64_non_nano(self, tz): dtype = "M8[us]" if tz is not None: dtype = 
f"M8[us, {tz}]" - # NB: the 2500 is interpreted as nanoseconds and rounded *down* - # to 2 microseconds vals = [ts, "2999-01-02 03:04:05.678910", 2500] ser = Series(vals, dtype=object) result = ser.astype(dtype) - exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals] + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concatenated the results. + pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] exp_arr = np.array(exp_vals, dtype="M8[us]") expected = Series(exp_arr, dtype="M8[us]") if tz is not None: