diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 15c10c913c4..ecb4e628493 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -17,9 +17,8 @@ // Functions for pandas conversion via NumPy -#include "arrow/python/numpy_interop.h" // IWYU pragma: expand - #include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/numpy_interop.h" // IWYU pragma: expand #include #include @@ -642,15 +641,14 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da std::vector fields_data(num_fields); OwnedRef dict_item; - // XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second + // In ARROW-7723, we found as a result of ARROW-3789 that second // through microsecond resolution tz-aware timestamps were being promoted to // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy // array in this function. PyArray_GETITEM returns datetime.datetime for // units second through microsecond but PyLong for nanosecond (because - // datetime.datetime does not support nanoseconds). We inserted this hack to - // preserve the <= 0.15.1 behavior until a better solution can be devised + // datetime.datetime does not support nanoseconds). + // We force the object conversion to preserve the value of the timezone. PandasOptions modified_options = options; - modified_options.ignore_timezone = true; modified_options.coerce_temporal_nanoseconds = false; for (int c = 0; c < data.num_chunks(); c++) { @@ -658,8 +656,12 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { PyObject* numpy_array; - RETURN_NOT_OK(ConvertArrayToPandas( - modified_options, arr->field(static_cast(i)), nullptr, &numpy_array)); + std::shared_ptr field = arr->field(static_cast(i)); + // Seen notes above about timstamp conversion. Don't blindly convert because + // timestamps in lists are handled differently. + modified_options.timestamp_as_object = + field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object; + RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } @@ -951,8 +953,21 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); - auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { + OwnedRef tzinfo; + if (!type.timezone().empty()) { + RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), tzinfo.ref())); + RETURN_IF_PYERROR(); + } + auto WrapValue = [&](typename Type::c_type value, PyObject** out) { RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + if (tzinfo.obj() != nullptr) { + PyObject* with_tz = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", *out); + RETURN_IF_PYERROR(); + Py_DECREF(*out); + *out = with_tz; + } + RETURN_IF_PYERROR(); return Status::OK(); }; @@ -1721,8 +1736,7 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& // Nanoseconds are never out of bounds for pandas, so in that case // we don't convert to object *output_type = PandasWriter::OBJECT; - } else if (ts_type.timezone() != "" && !options.ignore_timezone) { - // XXX: ignore_timezone: hack here for ARROW-7723 + } else if (!ts_type.timezone().empty()) { *output_type = PandasWriter::DATETIME_NANO_TZ; } else if (options.coerce_temporal_nanoseconds) { *output_type = PandasWriter::DATETIME_NANO; diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 79a72bcb1ef..2c5db8bdaa9 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -56,10 +56,6 @@ struct PandasOptions { /// Coerce all date and timestamp to datetime64[ns] bool coerce_temporal_nanoseconds = false; - /// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion - /// path - bool ignore_timezone = false; - /// \brief If true, do not create duplicate PyObject versions of equal /// objects. This only applies to immutable objects like strings or datetime /// objects diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 8cec87bdd36..19c7a4ea27d 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -14,22 +14,65 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#include "arrow/python/datetime.h" #include #include #include #include "arrow/python/common.h" -#include "arrow/python/datetime.h" +#include "arrow/python/helpers.h" #include "arrow/python/platform.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace arrow { namespace py { namespace internal { +namespace { + +// Same as Regex '([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$'. +// GCC 4.9 doesn't support regex, so handcode until support for it +// is dropped. +bool MatchFixedOffset(const std::string& tz, util::string_view* sign, + util::string_view* hour, util::string_view* minute) { + if (tz.size() < 5) { + return false; + } + const char* iter = tz.data(); + if (*iter == '+' || *iter == '-') { + *sign = util::string_view(iter, 1); + iter++; + if (tz.size() < 6) { + return false; + } + } + if ((((*iter == '0' || *iter == '1') && *(iter + 1) >= '0' && *(iter + 1) <= '9') || + (*iter == '2' && *(iter + 1) >= '0' && *(iter + 1) <= '3'))) { + *hour = util::string_view(iter, 2); + iter += 2; + } else { + return false; + } + if (*iter != ':') { + return false; + } + iter++; + + if (*iter >= '0' && *iter <= '5' && *(iter + 1) >= '0' && *(iter + 1) <= '9') { + *minute = util::string_view(iter, 2); + iter += 2; + } else { + return false; + } + return iter == (tz.data() + tz.size()); +} + +} // namespace + PyDateTime_CAPI* datetime_api = nullptr; void InitDatetime() { @@ -262,6 +305,44 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { PyDateTime_GET_DAY(pydate)); } +// GIL must be held when calling this function. +// Converted from python. See https://github.com/apache/arrow/pull/7604 +// for details. +Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { + util::string_view sign_str, hour_str, minute_str; + OwnedRef pytz; + RETURN_NOT_OK(internal::ImportModule("pytz", &pytz)); + + if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { + int sign = -1; + if (sign_str == "+") { + sign = 1; + } + OwnedRef fixed_offset; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset)); + uint32_t minutes, hours; + if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || + !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), + &minutes)) { + return Status::Invalid("Invalid timezone: ", tz); + } + OwnedRef total_minutes(PyLong_FromLong( + sign * ((static_cast(hours) * 60) + static_cast(minutes)))); + RETURN_IF_PYERROR(); + *tzinfo = PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); + RETURN_IF_PYERROR(); + return Status::OK(); + } + + OwnedRef timezone; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); + OwnedRef py_tz_string( + PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); + *tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); + RETURN_IF_PYERROR(); + return Status::OK(); +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index a8b22da4741..a394fa4b4bb 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -157,6 +157,16 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { return PyDelta_to_us(pytimedelta) * 1000; } +/// \brief Convert a time zone name into a time zone object. +/// +/// Supported input strings are: +/// * As used in the Olson time zone database (the "tz database" or +/// "tzdata"), such as "America/New_York" +/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 +/// GIL must be held when calling this method. +ARROW_PYTHON_EXPORT +Status StringToTzinfo(const std::string& tz, PyObject** tzinfo); + } // namespace internal } // namespace py } // namespace arrow diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f5478f93944..d57c9261ded 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1174,7 +1174,9 @@ cdef _array_like_to_pandas(obj, options): result = pandas_api.series(arr, dtype=dtype, name=name) if (isinstance(original_type, TimestampType) and - original_type.tz is not None): + original_type.tz is not None and + # can be object dtype for non-ns and timestamp_as_object=True + result.dtype.kind == "M"): from pyarrow.pandas_compat import make_tz_aware result = make_tz_aware(result, original_type.tz) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 93c43ca99b4..19b7340f906 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1819,6 +1819,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: int64_t TimePoint_to_ns(CTimePoint val) CTimePoint TimePoint_from_s(double val) CTimePoint TimePoint_from_ns(int64_t val) + CStatus StringToTzinfo(c_string, PyObject** tzinfo) cdef extern from 'arrow/python/init.h': diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 392a3221002..ac381751498 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -22,7 +22,7 @@ import sys from collections import OrderedDict -from datetime import date, datetime, time, timedelta +from datetime import date, datetime, time, timedelta, timezone from distutils.version import LooseVersion import hypothesis as h @@ -3321,13 +3321,20 @@ def test_cast_timestamp_unit(): assert result.equals(expected) -def test_struct_with_timestamp_tz(): +def test_nested_with_timestamp_tz(): # ARROW-7723 ts = pd.Timestamp.now() + ts_dt = ts.to_pydatetime() # XXX: Ensure that this data does not get promoted to nanoseconds (and thus # integers) to preserve behavior in 0.15.1 for unit in ['s', 'ms', 'us']: + if unit in ['s', 'ms']: + # This is used for verifying timezone conversion to micros are not + # important + def truncate(x): return x.replace(microsecond=0) + else: + def truncate(x): return x arr = pa.array([ts], type=pa.timestamp(unit)) arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) @@ -3336,20 +3343,30 @@ def test_struct_with_timestamp_tz(): result = arr3.to_pandas() assert isinstance(result[0]['start'], datetime) + assert result[0]['start'].tzinfo is None assert isinstance(result[0]['stop'], datetime) + assert result[0]['stop'].tzinfo is None result = arr4.to_pandas() assert isinstance(result[0]['start'], datetime) + assert result[0]['start'].tzinfo is not None + utc_dt = result[0]['start'].astimezone(timezone.utc) + assert truncate(utc_dt).replace(tzinfo=None) == truncate(ts_dt) assert isinstance(result[0]['stop'], datetime) + assert result[0]['stop'].tzinfo is not None # same conversion for table result = pa.table({'a': arr3}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) + assert result['a'][0]['start'].tzinfo is None assert isinstance(result['a'][0]['stop'], datetime) + assert result['a'][0]['stop'].tzinfo is None result = pa.table({'a': arr4}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) + assert result['a'][0]['start'].tzinfo is not None assert isinstance(result['a'][0]['stop'], datetime) + assert result['a'][0]['stop'].tzinfo is not None # ---------------------------------------------------------------------- @@ -4010,19 +4027,25 @@ def test_timestamp_as_object_out_of_range(): @pytest.mark.parametrize("resolution", ["s", "ms", "us"]) +@pytest.mark.parametrize("tz", [None, "America/New_York"]) # One datetime outside nanosecond range, one inside nanosecond range: @pytest.mark.parametrize("dt", [datetime(1553, 1, 1), datetime(2020, 1, 1)]) -def test_timestamp_as_object_non_nanosecond(resolution, dt): +def test_timestamp_as_object_non_nanosecond(resolution, tz, dt): # Timestamps can be converted Arrow and reloaded into Pandas with no loss # of information if the timestamp_as_object option is True. - arr = pa.array([dt], type=pa.timestamp(resolution)) - result = arr.to_pandas(timestamp_as_object=True) - assert result.dtype == object - assert isinstance(result[0], datetime) - assert result[0] == dt - + arr = pa.array([dt], type=pa.timestamp(resolution, tz=tz)) table = pa.table({'a': arr}) - result = table.to_pandas(timestamp_as_object=True)['a'] - assert result.dtype == object - assert isinstance(result[0], datetime) - assert result[0] == dt + + for result in [ + arr.to_pandas(timestamp_as_object=True), + table.to_pandas(timestamp_as_object=True)['a'] + ]: + assert result.dtype == object + assert isinstance(result[0], datetime) + if tz: + assert result[0].tzinfo is not None + expected = result[0].tzinfo.fromutc(dt) + else: + assert result[0].tzinfo is None + expected = dt + assert result[0] == expected diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e541510df0b..007f8e88198 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1816,9 +1816,6 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' -_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$') - - def tzinfo_to_string(tz): """ Converts a time zone object into a string indicating the name of a time @@ -1884,14 +1881,9 @@ def string_to_tzinfo(name): tz : datetime.tzinfo Time zone object """ - import pytz - m = _FIXED_OFFSET_RE.match(name) - if m: - sign = 1 if m.group(1) == '+' else -1 - hours, minutes = map(int, m.group(2, 3)) - return pytz.FixedOffset(sign * (hours * 60 + minutes)) - else: - return pytz.timezone(name) + cdef PyObject* tz + check_status(libarrow.StringToTzinfo(name.encode('utf-8'), &tz)) + return PyObject_to_object(tz) def timestamp(unit, tz=None):