From b738ae7a636cca10cf64ca82d506cceaa2f9f559 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 1 Jul 2020 07:55:48 +0000 Subject: [PATCH 01/14] ARROW-9223: [Python] Propagate Timzone information in pandas conversion - Ports string_to_timezone to C++ - Causes nested timestamp columns within structs to use conversion to object path. - Copy timezone on to_object path. Open to other suggestions on how to solve this. --- cpp/src/arrow/python/arrow_to_pandas.cc | 29 +++++++++++------ cpp/src/arrow/python/arrow_to_pandas.h | 4 --- cpp/src/arrow/python/datetime.cc | 41 ++++++++++++++++++++++++- cpp/src/arrow/python/datetime.h | 11 +++++++ python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_pandas.py | 8 +++++ python/pyarrow/types.pxi | 12 ++------ 7 files changed, 83 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 15c10c913c4..390402be64a 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -17,9 +17,8 @@ // Functions for pandas conversion via NumPy -#include "arrow/python/numpy_interop.h" // IWYU pragma: expand - #include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/numpy_interop.h" // IWYU pragma: expand #include #include @@ -642,15 +641,15 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da std::vector fields_data(num_fields); OwnedRef dict_item; - // XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second + // In ARROW-7723, we found as a result of ARROW-3789 that second // through microsecond resolution tz-aware timestamps were being promoted to // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy // array in this function. PyArray_GETITEM returns datetime.datetime for // units second through microsecond but PyLong for nanosecond (because - // datetime.datetime does not support nanoseconds). We inserted this hack to - // preserve the <= 0.15.1 behavior until a better solution can be devised + // datetime.datetime does not support nanoseconds). + // We force the object conversion to preserve the value of the timezone. PandasOptions modified_options = options; - modified_options.ignore_timezone = true; + modified_options.timestamp_as_object = true; modified_options.coerce_temporal_nanoseconds = false; for (int c = 0; c < data.num_chunks(); c++) { @@ -951,8 +950,21 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); - auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { + PyObject* tzinfo = nullptr; + if (!type.timezone().empty()) { + RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), &tzinfo)); + } + auto WrapValue = [unit, tzinfo](typename Type::c_type value, PyObject** out) { + OwnedRef tz(tzinfo); RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + if (tzinfo != nullptr) { + PyObject* with_tz = PyObject_CallMethod(*out, "astimezone", "O", tzinfo); + RETURN_IF_PYERROR(); + Py_DECREF(*out); + *out = with_tz; + } + RETURN_IF_PYERROR(); return Status::OK(); }; @@ -1721,8 +1733,7 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& // Nanoseconds are never out of bounds for pandas, so in that case // we don't convert to object *output_type = PandasWriter::OBJECT; - } else if (ts_type.timezone() != "" && !options.ignore_timezone) { - // XXX: ignore_timezone: hack here for ARROW-7723 + } else if (!ts_type.timezone().empty()) { *output_type = PandasWriter::DATETIME_NANO_TZ; } else if (options.coerce_temporal_nanoseconds) { *output_type = PandasWriter::DATETIME_NANO; diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 79a72bcb1ef..2c5db8bdaa9 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -56,10 +56,6 @@ struct PandasOptions { /// Coerce all date and timestamp to datetime64[ns] bool coerce_temporal_nanoseconds = false; - /// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion - /// path - bool ignore_timezone = false; - /// \brief If true, do not create duplicate PyObject versions of equal /// objects. This only applies to immutable objects like strings or datetime /// objects diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 8cec87bdd36..82665db965a 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -14,17 +14,20 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#include "arrow/python/datetime.h" #include #include #include +#include #include "arrow/python/common.h" -#include "arrow/python/datetime.h" +#include "arrow/python/helpers.h" #include "arrow/python/platform.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace arrow { namespace py { @@ -262,6 +265,42 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { PyDateTime_GET_DAY(pydate)); } +// GIL must be held when calling this function. +Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { + static const std::regex kFixedOffset("([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$"); + OwnedRef pytz; + RETURN_NOT_OK(internal::ImportModule("pytz", &pytz)); + + std::cmatch match; + if (std::regex_match(tz.c_str(), match, kFixedOffset)) { + int sign = -1; + if (match.str(1) == "+") { + sign = 1; + } + OwnedRef fixed_offset; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset)); + uint32_t minutes, hours; + if (!::arrow::internal::ParseUnsigned(match[2].first, match[2].length(), &hours) || + !::arrow::internal::ParseUnsigned(match[3].first, match[3].length(), &minutes)) { + return Status::Invalid("Invalid timezone: ", timezone); + } + OwnedRef total_minutes(PyLong_FromLong( + sign * ((static_cast(hours) * 60) + static_cast(minutes)))); + RETURN_IF_PYERROR(); + *tzinfo = PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); + RETURN_IF_PYERROR(); + return Status::OK(); + } + + OwnedRef timezone; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); + OwnedRef py_tz_string( + PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); + *tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); + RETURN_IF_PYERROR(); + return Status::OK(); +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index a8b22da4741..c21500048eb 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -157,6 +157,17 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { return PyDelta_to_us(pytimedelta) * 1000; } +/// \brief Convert a time zone name into a time zone object. +/// +/// Supported input strings are: +/// * As used in the Olson time zone database (the "tz database" or +/// "tzdata"), such as "America/New_York" +/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + +// GIL must be held when calling this method. +ARROW_PYTHON_EXPORT +Status StringToTzinfo(const std::string& tz, PyObject** tzinfo); + } // namespace internal } // namespace py } // namespace arrow diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 93c43ca99b4..19b7340f906 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1819,6 +1819,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: int64_t TimePoint_to_ns(CTimePoint val) CTimePoint TimePoint_from_s(double val) CTimePoint TimePoint_from_ns(int64_t val) + CStatus StringToTzinfo(c_string, PyObject** tzinfo) cdef extern from 'arrow/python/init.h': diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 392a3221002..6202d2c5b2d 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3336,20 +3336,28 @@ def test_struct_with_timestamp_tz(): result = arr3.to_pandas() assert isinstance(result[0]['start'], datetime) + assert result[0]['start'].tzinfo is None assert isinstance(result[0]['stop'], datetime) + assert result[0]['stop'].tzinfo is None result = arr4.to_pandas() assert isinstance(result[0]['start'], datetime) + assert result[0]['start'].tzinfo is not None assert isinstance(result[0]['stop'], datetime) + assert result[0]['start'].tzinfo is not None # same conversion for table result = pa.table({'a': arr3}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) + assert result['a'][0]['start'].tzinfo is None assert isinstance(result['a'][0]['stop'], datetime) + assert result['a'][0]['start'].tzinfo is None result = pa.table({'a': arr4}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) + assert result['a'][0]['start'].tzinfo is not None assert isinstance(result['a'][0]['stop'], datetime) + assert result['a'][0]['start'].tzinfo is not None # ---------------------------------------------------------------------- diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e541510df0b..3b104cf2443 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1884,15 +1884,9 @@ def string_to_tzinfo(name): tz : datetime.tzinfo Time zone object """ - import pytz - m = _FIXED_OFFSET_RE.match(name) - if m: - sign = 1 if m.group(1) == '+' else -1 - hours, minutes = map(int, m.group(2, 3)) - return pytz.FixedOffset(sign * (hours * 60 + minutes)) - else: - return pytz.timezone(name) - + cdef PyObject* tz + check_status(libarrow.StringToTzinfo(name.encode('utf-8'), &tz)) + return PyObject_to_object(tz) def timestamp(unit, tz=None): """ From c14a516b0523dad5e274ef0d118a6fd806232a29 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 1 Jul 2020 08:04:42 +0000 Subject: [PATCH 02/14] minor fixes --- cpp/src/arrow/python/datetime.h | 3 +-- python/pyarrow/types.pxi | 3 --- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index c21500048eb..a394fa4b4bb 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -163,8 +163,7 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { /// * As used in the Olson time zone database (the "tz database" or /// "tzdata"), such as "America/New_York" /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - -// GIL must be held when calling this method. +/// GIL must be held when calling this method. ARROW_PYTHON_EXPORT Status StringToTzinfo(const std::string& tz, PyObject** tzinfo); diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 3b104cf2443..dcdfd510919 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1816,9 +1816,6 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' -_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$') - - def tzinfo_to_string(tz): """ Converts a time zone object into a string indicating the name of a time From aa91a366600ecd8a915ef23fca5dbeabc1521292 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 1 Jul 2020 08:09:16 +0000 Subject: [PATCH 03/14] fix string --- cpp/src/arrow/python/datetime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 82665db965a..3d2b60ed1b6 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -282,7 +282,7 @@ Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { uint32_t minutes, hours; if (!::arrow::internal::ParseUnsigned(match[2].first, match[2].length(), &hours) || !::arrow::internal::ParseUnsigned(match[3].first, match[3].length(), &minutes)) { - return Status::Invalid("Invalid timezone: ", timezone); + return Status::Invalid("Invalid timezone: ", tz); } OwnedRef total_minutes(PyLong_FromLong( sign * ((static_cast(hours) * 60) + static_cast(minutes)))); From cbb443b0e007ba4679b459db89a0c96ca00f1ed7 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 1 Jul 2020 08:12:49 +0000 Subject: [PATCH 04/14] ask question inline about list handling --- cpp/src/arrow/python/arrow_to_pandas.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 390402be64a..f4f321efdfb 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -744,6 +744,7 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data, // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy // the existing unit tests PandasOptions modified_options = options; + // Force object converation here as well for timestamps? modified_options.coerce_temporal_nanoseconds = false; OwnedRefNoGIL owned_numpy_array; From a09602b4dc16819f783c804db5ba92dc8b73a7b8 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 1 Jul 2020 20:38:44 -0700 Subject: [PATCH 05/14] Apply same convesion of timestamp to datetime for lists (previously datetime64 was used). Do correct timezone adjustment. --- cpp/src/arrow/python/arrow_to_pandas.cc | 30 ++++++++++++++++----- cpp/src/arrow/python/datetime.cc | 2 +- python/pyarrow/tests/test_pandas.py | 35 ++++++++++++++++++++----- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index f4f321efdfb..898ae407b95 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -744,8 +744,8 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data, // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy // the existing unit tests PandasOptions modified_options = options; - // Force object converation here as well for timestamps? modified_options.coerce_temporal_nanoseconds = false; + modified_options.timestamp_as_object = true; OwnedRefNoGIL owned_numpy_array; RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_column, nullptr, @@ -951,16 +951,32 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); - PyObject* tzinfo = nullptr; + OwnedRef tzinfo; + OwnedRef utc_kwargs; + OwnedRef empty_args; if (!type.timezone().empty()) { - RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), &tzinfo)); + RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), tzinfo.ref())); + utc_kwargs.reset(PyDict_New()); + RETURN_IF_PYERROR(); + + OwnedRef utc; + RETURN_NOT_OK(internal::StringToTzinfo("utc", utc.ref())); + PyDict_SetItemString(utc_kwargs.obj(), "tzinfo", utc.obj()); + RETURN_IF_PYERROR(); + + empty_args.reset(PyTuple_New(0)); + RETURN_IF_PYERROR(); } - auto WrapValue = [unit, tzinfo](typename Type::c_type value, PyObject** out) { - OwnedRef tz(tzinfo); + auto WrapValue = [&](typename Type::c_type value, PyObject** out) { RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); RETURN_IF_PYERROR(); - if (tzinfo != nullptr) { - PyObject* with_tz = PyObject_CallMethod(*out, "astimezone", "O", tzinfo); + if (tzinfo.obj() != nullptr) { + OwnedRef replace(PyObject_GetAttrString(*out, "replace")); + OwnedRef with_utc( + PyObject_Call(replace.obj(), empty_args.obj(), utc_kwargs.obj())); + RETURN_IF_PYERROR(); + PyObject* with_tz = + PyObject_CallMethod(with_utc.obj(), "astimezone", "O", tzinfo.obj()); RETURN_IF_PYERROR(); Py_DECREF(*out); *out = with_tz; diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 3d2b60ed1b6..0f5206f46ba 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -272,7 +272,7 @@ Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { RETURN_NOT_OK(internal::ImportModule("pytz", &pytz)); std::cmatch match; - if (std::regex_match(tz.c_str(), match, kFixedOffset)) { + if (std::regex_match(tz.data(), tz.data() + tz.size(), match, kFixedOffset)) { int sign = -1; if (match.str(1) == "+") { sign = 1; diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 6202d2c5b2d..eee038a2af9 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -22,7 +22,7 @@ import sys from collections import OrderedDict -from datetime import date, datetime, time, timedelta +from datetime import date, datetime, time, timedelta, timezone from distutils.version import LooseVersion import hypothesis as h @@ -3321,9 +3321,12 @@ def test_cast_timestamp_unit(): assert result.equals(expected) -def test_struct_with_timestamp_tz(): +def test_nested_with_timestamp_tz(): # ARROW-7723 ts = pd.Timestamp.now() + # This is used for verifying timezone conversion to micros are not + # important + ts_dt = ts.to_pydatetime().replace(microsecond=0) # XXX: Ensure that this data does not get promoted to nanoseconds (and thus # integers) to preserve behavior in 0.15.1 @@ -3332,7 +3335,9 @@ def test_struct_with_timestamp_tz(): arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) arr3 = pa.StructArray.from_arrays([arr, arr], ['start', 'stop']) + list_arr3 = pa.ListArray.from_arrays([0, 1], arr) arr4 = pa.StructArray.from_arrays([arr2, arr2], ['start', 'stop']) + list_arr4 = pa.ListArray.from_arrays([0, 1], arr2) result = arr3.to_pandas() assert isinstance(result[0]['start'], datetime) @@ -3340,24 +3345,40 @@ def test_struct_with_timestamp_tz(): assert isinstance(result[0]['stop'], datetime) assert result[0]['stop'].tzinfo is None + result = list_arr3.to_pandas() + assert isinstance(result[0][0], datetime) + assert result[0][0].tzinfo is None + result = arr4.to_pandas() assert isinstance(result[0]['start'], datetime) assert result[0]['start'].tzinfo is not None + utc_dt = result[0]['start'].astimezone(timezone.utc) + assert utc_dt.replace(tzinfo=None, microsecond=0) == ts_dt assert isinstance(result[0]['stop'], datetime) - assert result[0]['start'].tzinfo is not None + assert result[0]['stop'].tzinfo is not None + + result = list_arr4.to_pandas() + assert isinstance(result[0][0], datetime) + utc_dt = result[0][0].astimezone(timezone.utc) + assert utc_dt.replace(tzinfo=None, microsecond=0) == ts_dt + assert result[0][0].tzinfo is not None # same conversion for table - result = pa.table({'a': arr3}).to_pandas() + result = pa.table({'a': arr3, 'b': list_arr3}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) assert result['a'][0]['start'].tzinfo is None assert isinstance(result['a'][0]['stop'], datetime) - assert result['a'][0]['start'].tzinfo is None + assert result['a'][0]['stop'].tzinfo is None + assert isinstance(result['b'][0][0], datetime) + assert result['b'][0][0].tzinfo is None - result = pa.table({'a': arr4}).to_pandas() + result = pa.table({'a': arr4, 'b': list_arr4}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) assert result['a'][0]['start'].tzinfo is not None assert isinstance(result['a'][0]['stop'], datetime) - assert result['a'][0]['start'].tzinfo is not None + assert result['a'][0]['stop'].tzinfo is not None + assert isinstance(result['b'][0][0], datetime) + assert result['b'][0][0].tzinfo is not None # ---------------------------------------------------------------------- From 4aafda52c01a607ca76df35fb5851dd856a390f0 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 1 Jul 2020 20:54:18 -0700 Subject: [PATCH 06/14] add whitespace for lint --- python/pyarrow/types.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index dcdfd510919..007f8e88198 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1885,6 +1885,7 @@ def string_to_tzinfo(name): check_status(libarrow.StringToTzinfo(name.encode('utf-8'), &tz)) return PyObject_to_object(tz) + def timestamp(unit, tz=None): """ Create instance of timestamp type with resolution and optional time zone. From 2ed3d41f4983620a78fb735a6e596dace22865d3 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 2 Jul 2020 15:07:53 +1000 Subject: [PATCH 07/14] only modify with timezone path --- cpp/src/arrow/python/arrow_to_pandas.cc | 10 ++++++---- python/pyarrow/tests/test_pandas.py | 20 ++------------------ 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 898ae407b95..e599add7ee5 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -649,16 +649,19 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da // datetime.datetime does not support nanoseconds). // We force the object conversion to preserve the value of the timezone. PandasOptions modified_options = options; - modified_options.timestamp_as_object = true; modified_options.coerce_temporal_nanoseconds = false; for (int c = 0; c < data.num_chunks(); c++) { auto arr = checked_cast(data.chunk(c).get()); // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { + // Se notes above about conversion. + std::shared_ptr field = arr->field(static_cast(i)); + modified_options.timestamp_as_object = + field->type()->id() == Type::TIMESTAMP && + !checked_cast(*field->type()).timezone().empty(); PyObject* numpy_array; - RETURN_NOT_OK(ConvertArrayToPandas( - modified_options, arr->field(static_cast(i)), nullptr, &numpy_array)); + RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } @@ -745,7 +748,6 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data, // the existing unit tests PandasOptions modified_options = options; modified_options.coerce_temporal_nanoseconds = false; - modified_options.timestamp_as_object = true; OwnedRefNoGIL owned_numpy_array; RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_column, nullptr, diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index eee038a2af9..bf1ec92266d 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3335,9 +3335,7 @@ def test_nested_with_timestamp_tz(): arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) arr3 = pa.StructArray.from_arrays([arr, arr], ['start', 'stop']) - list_arr3 = pa.ListArray.from_arrays([0, 1], arr) arr4 = pa.StructArray.from_arrays([arr2, arr2], ['start', 'stop']) - list_arr4 = pa.ListArray.from_arrays([0, 1], arr2) result = arr3.to_pandas() assert isinstance(result[0]['start'], datetime) @@ -3345,10 +3343,6 @@ def test_nested_with_timestamp_tz(): assert isinstance(result[0]['stop'], datetime) assert result[0]['stop'].tzinfo is None - result = list_arr3.to_pandas() - assert isinstance(result[0][0], datetime) - assert result[0][0].tzinfo is None - result = arr4.to_pandas() assert isinstance(result[0]['start'], datetime) assert result[0]['start'].tzinfo is not None @@ -3357,28 +3351,18 @@ def test_nested_with_timestamp_tz(): assert isinstance(result[0]['stop'], datetime) assert result[0]['stop'].tzinfo is not None - result = list_arr4.to_pandas() - assert isinstance(result[0][0], datetime) - utc_dt = result[0][0].astimezone(timezone.utc) - assert utc_dt.replace(tzinfo=None, microsecond=0) == ts_dt - assert result[0][0].tzinfo is not None - # same conversion for table - result = pa.table({'a': arr3, 'b': list_arr3}).to_pandas() + result = pa.table({'a': arr3}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) assert result['a'][0]['start'].tzinfo is None assert isinstance(result['a'][0]['stop'], datetime) assert result['a'][0]['stop'].tzinfo is None - assert isinstance(result['b'][0][0], datetime) - assert result['b'][0][0].tzinfo is None - result = pa.table({'a': arr4, 'b': list_arr4}).to_pandas() + result = pa.table({'a': arr4}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) assert result['a'][0]['start'].tzinfo is not None assert isinstance(result['a'][0]['stop'], datetime) assert result['a'][0]['stop'].tzinfo is not None - assert isinstance(result['b'][0][0], datetime) - assert result['b'][0][0].tzinfo is not None # ---------------------------------------------------------------------- From d321a8d91da755f1e9b716fe1151f8ebbf01b4d4 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 2 Jul 2020 05:40:26 +0000 Subject: [PATCH 08/14] use fromutc, it is more succinct --- cpp/src/arrow/python/arrow_to_pandas.cc | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index e599add7ee5..e9465ea9609 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -954,31 +954,15 @@ struct ObjectWriterVisitor { enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); OwnedRef tzinfo; - OwnedRef utc_kwargs; - OwnedRef empty_args; if (!type.timezone().empty()) { RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), tzinfo.ref())); - utc_kwargs.reset(PyDict_New()); - RETURN_IF_PYERROR(); - - OwnedRef utc; - RETURN_NOT_OK(internal::StringToTzinfo("utc", utc.ref())); - PyDict_SetItemString(utc_kwargs.obj(), "tzinfo", utc.obj()); - RETURN_IF_PYERROR(); - - empty_args.reset(PyTuple_New(0)); RETURN_IF_PYERROR(); } auto WrapValue = [&](typename Type::c_type value, PyObject** out) { RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); RETURN_IF_PYERROR(); if (tzinfo.obj() != nullptr) { - OwnedRef replace(PyObject_GetAttrString(*out, "replace")); - OwnedRef with_utc( - PyObject_Call(replace.obj(), empty_args.obj(), utc_kwargs.obj())); - RETURN_IF_PYERROR(); - PyObject* with_tz = - PyObject_CallMethod(with_utc.obj(), "astimezone", "O", tzinfo.obj()); + PyObject* with_tz = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", *out); RETURN_IF_PYERROR(); Py_DECREF(*out); *out = with_tz; From 783f37050fa90999b3dcc4dffb427326f5c6dc6d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 3 Jul 2020 04:35:15 +0000 Subject: [PATCH 09/14] remove regex. use to_object always for struct --- cpp/src/arrow/python/arrow_to_pandas.cc | 7 +--- cpp/src/arrow/python/datetime.cc | 51 +++++++++++++++++++++---- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index e9465ea9609..8b151622209 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -650,17 +650,14 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da // We force the object conversion to preserve the value of the timezone. PandasOptions modified_options = options; modified_options.coerce_temporal_nanoseconds = false; + modified_options.timestamp_as_object = true; for (int c = 0; c < data.num_chunks(); c++) { auto arr = checked_cast(data.chunk(c).get()); // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { - // Se notes above about conversion. - std::shared_ptr field = arr->field(static_cast(i)); - modified_options.timestamp_as_object = - field->type()->id() == Type::TIMESTAMP && - !checked_cast(*field->type()).timezone().empty(); PyObject* numpy_array; + std::shared_ptr field = arr->field(static_cast(i)); RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 0f5206f46ba..4577b1e5750 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -19,7 +19,6 @@ #include #include #include -#include #include "arrow/python/common.h" #include "arrow/python/helpers.h" @@ -33,6 +32,44 @@ namespace arrow { namespace py { namespace internal { +namespace { + +bool MatchFixedOffset(const std::string& tz, util::string_view* sign, + util::string_view* hour, util::string_view* minute) { + if (tz.size() < 5) { + return false; + } + const char* iter = tz.data(); + if (*iter == '+' || *iter == '-') { + *sign = util::string_view(iter, 1); + iter++; + if (tz.size() < 6) { + return false; + } + } + if ((((*iter == '0' || *iter == '1') && *(iter + 1) >= '0' && *(iter + 1) <= '9') || + (*iter == '2' && *(iter + 1) >= '0' && *(iter + 1) <= '3'))) { + *hour = util::string_view(iter, 2); + iter += 2; + } else { + return false; + } + if (*iter != ':') { + return false; + } + iter++; + + if (*iter >= '0' && *iter <= '5' && *(iter + 1) >= '0' && *(iter + 1) <= '9') { + *minute = util::string_view(iter, 2); + iter += 2; + } else { + return false; + } + return iter == (tz.data() + tz.size()); +} + +} // namespace + PyDateTime_CAPI* datetime_api = nullptr; void InitDatetime() { @@ -267,21 +304,21 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { // GIL must be held when calling this function. Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { - static const std::regex kFixedOffset("([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$"); + util::string_view sign_str, hour_str, minute_str; OwnedRef pytz; RETURN_NOT_OK(internal::ImportModule("pytz", &pytz)); - std::cmatch match; - if (std::regex_match(tz.data(), tz.data() + tz.size(), match, kFixedOffset)) { + if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { int sign = -1; - if (match.str(1) == "+") { + if (sign_str == "+") { sign = 1; } OwnedRef fixed_offset; RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset)); uint32_t minutes, hours; - if (!::arrow::internal::ParseUnsigned(match[2].first, match[2].length(), &hours) || - !::arrow::internal::ParseUnsigned(match[3].first, match[3].length(), &minutes)) { + if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || + !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), + &minutes)) { return Status::Invalid("Invalid timezone: ", tz); } OwnedRef total_minutes(PyLong_FromLong( From 3d864778932ace7a24b7a47c50ef1c1807e7a30c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 3 Jul 2020 04:45:17 +0000 Subject: [PATCH 10/14] only truncate seconds and milliseconds --- python/pyarrow/tests/test_pandas.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index bf1ec92266d..b274c17d5a0 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3326,11 +3326,15 @@ def test_nested_with_timestamp_tz(): ts = pd.Timestamp.now() # This is used for verifying timezone conversion to micros are not # important - ts_dt = ts.to_pydatetime().replace(microsecond=0) + ts_dt = ts.to_pydatetime() # XXX: Ensure that this data does not get promoted to nanoseconds (and thus # integers) to preserve behavior in 0.15.1 for unit in ['s', 'ms', 'us']: + if unit in ['s', 'ms']: + truncate = lambda x: x.replace(microsecond=0) + else: + truncate = lambda x: x arr = pa.array([ts], type=pa.timestamp(unit)) arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) @@ -3347,7 +3351,7 @@ def test_nested_with_timestamp_tz(): assert isinstance(result[0]['start'], datetime) assert result[0]['start'].tzinfo is not None utc_dt = result[0]['start'].astimezone(timezone.utc) - assert utc_dt.replace(tzinfo=None, microsecond=0) == ts_dt + assert truncate(utc_dt).replace(tzinfo=None) == truncate(ts_dt) assert isinstance(result[0]['stop'], datetime) assert result[0]['stop'].tzinfo is not None From 1ed5ee3d1fdf1b6d222a4ce9b60dce061543ca5e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 3 Jul 2020 04:47:16 +0000 Subject: [PATCH 11/14] move comment --- python/pyarrow/tests/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index b274c17d5a0..1d46bd362f7 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3324,14 +3324,14 @@ def test_cast_timestamp_unit(): def test_nested_with_timestamp_tz(): # ARROW-7723 ts = pd.Timestamp.now() - # This is used for verifying timezone conversion to micros are not - # important ts_dt = ts.to_pydatetime() # XXX: Ensure that this data does not get promoted to nanoseconds (and thus # integers) to preserve behavior in 0.15.1 for unit in ['s', 'ms', 'us']: if unit in ['s', 'ms']: + # This is used for verifying timezone conversion to micros are not + # important truncate = lambda x: x.replace(microsecond=0) else: truncate = lambda x: x From 4285420abb1d44fdcd24e4a365ef0df545b734b5 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 4 Jul 2020 04:43:43 +0000 Subject: [PATCH 12/14] fix logic for propagating object_as_timestamp and other review comments --- cpp/src/arrow/python/arrow_to_pandas.cc | 5 ++++- cpp/src/arrow/python/datetime.cc | 5 +++++ python/pyarrow/tests/test_pandas.py | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 8b151622209..ecb4e628493 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -650,7 +650,6 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da // We force the object conversion to preserve the value of the timezone. PandasOptions modified_options = options; modified_options.coerce_temporal_nanoseconds = false; - modified_options.timestamp_as_object = true; for (int c = 0; c < data.num_chunks(); c++) { auto arr = checked_cast(data.chunk(c).get()); @@ -658,6 +657,10 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da for (int32_t i = 0; i < num_fields; i++) { PyObject* numpy_array; std::shared_ptr field = arr->field(static_cast(i)); + // Seen notes above about timstamp conversion. Don't blindly convert because + // timestamps in lists are handled differently. + modified_options.timestamp_as_object = + field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object; RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 4577b1e5750..19c7a4ea27d 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -34,6 +34,9 @@ namespace internal { namespace { +// Same as Regex '([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$'. +// GCC 4.9 doesn't support regex, so handcode until support for it +// is dropped. bool MatchFixedOffset(const std::string& tz, util::string_view* sign, util::string_view* hour, util::string_view* minute) { if (tz.size() < 5) { @@ -303,6 +306,8 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { } // GIL must be held when calling this function. +// Converted from python. See https://github.com/apache/arrow/pull/7604 +// for details. Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { util::string_view sign_str, hour_str, minute_str; OwnedRef pytz; diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 1d46bd362f7..890f303b1d7 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3332,9 +3332,9 @@ def test_nested_with_timestamp_tz(): if unit in ['s', 'ms']: # This is used for verifying timezone conversion to micros are not # important - truncate = lambda x: x.replace(microsecond=0) + def truncate(x): return x.replace(microsecond=0) else: - truncate = lambda x: x + def truncate(x): return x arr = pa.array([ts], type=pa.timestamp(unit)) arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) From 4137304108fccdcf1827b8bffe3f94afd44cd9fc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Jul 2020 11:42:51 +0200 Subject: [PATCH 13/14] tz fix for to_pandas with timestamp array and timestamp_as_object=True --- python/pyarrow/array.pxi | 4 +++- python/pyarrow/tests/test_pandas.py | 26 +++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f5478f93944..d57c9261ded 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1174,7 +1174,9 @@ cdef _array_like_to_pandas(obj, options): result = pandas_api.series(arr, dtype=dtype, name=name) if (isinstance(original_type, TimestampType) and - original_type.tz is not None): + original_type.tz is not None and + # can be object dtype for non-ns and timestamp_as_object=True + result.dtype.kind == "M"): from pyarrow.pandas_compat import make_tz_aware result = make_tz_aware(result, original_type.tz) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 890f303b1d7..c864ceeed81 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4027,19 +4027,23 @@ def test_timestamp_as_object_out_of_range(): @pytest.mark.parametrize("resolution", ["s", "ms", "us"]) +@pytest.mark.parametrize("tz", [None, "America/New_York"]) # One datetime outside nanosecond range, one inside nanosecond range: @pytest.mark.parametrize("dt", [datetime(1553, 1, 1), datetime(2020, 1, 1)]) -def test_timestamp_as_object_non_nanosecond(resolution, dt): +def test_timestamp_as_object_non_nanosecond(resolution, tz, dt): # Timestamps can be converted Arrow and reloaded into Pandas with no loss # of information if the timestamp_as_object option is True. - arr = pa.array([dt], type=pa.timestamp(resolution)) - result = arr.to_pandas(timestamp_as_object=True) - assert result.dtype == object - assert isinstance(result[0], datetime) - assert result[0] == dt - + arr = pa.array([dt], type=pa.timestamp(resolution, tz=tz)) table = pa.table({'a': arr}) - result = table.to_pandas(timestamp_as_object=True)['a'] - assert result.dtype == object - assert isinstance(result[0], datetime) - assert result[0] == dt + + for result in [ + arr.to_pandas(timestamp_as_object=True), + table.to_pandas(timestamp_as_object=True)['a'] + ]: + assert result.dtype == object + assert isinstance(result[0], datetime) + if tz: + assert result[0].tzinfo is not None + else: + assert result[0].tzinfo is None + assert result[0] == dt From a550a761781e8a7e1ebe163642a1c6fae98dad52 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Jul 2020 15:50:07 +0200 Subject: [PATCH 14/14] fix test --- python/pyarrow/tests/test_pandas.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index c864ceeed81..ac381751498 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4044,6 +4044,8 @@ def test_timestamp_as_object_non_nanosecond(resolution, tz, dt): assert isinstance(result[0], datetime) if tz: assert result[0].tzinfo is not None + expected = result[0].tzinfo.fromutc(dt) else: assert result[0].tzinfo is None - assert result[0] == dt + expected = dt + assert result[0] == expected