From 8e614324fe50b04c2b7264d32d7456d2c6f87b88 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 11 Jul 2020 13:32:50 -0500 Subject: [PATCH 01/25] ARROW-9223: [Python] Propagate timezone information in pandas conversion - Ports string_to_timezone to C++ - Causes nested timestamp columns within structs that have timezones to go through object conversion - Copy timezone on to_object path. Open to other suggestions on how to solve this. It still seems like this is inconsistent with nested timestamps in lists which isn't great. Closes #7604 from emkornfield/datetime Lead-authored-by: Micah Kornfield Co-authored-by: Joris Van den Bossche Signed-off-by: Wes McKinney --- cpp/src/arrow/python/arrow_to_pandas.cc | 36 +++++++---- cpp/src/arrow/python/arrow_to_pandas.h | 4 -- cpp/src/arrow/python/datetime.cc | 83 ++++++++++++++++++++++++- cpp/src/arrow/python/datetime.h | 10 +++ python/pyarrow/array.pxi | 4 +- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_pandas.py | 49 +++++++++++---- python/pyarrow/types.pxi | 14 +---- 8 files changed, 160 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index bc4e25b08df..9775bf3a0cf 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -17,9 +17,8 @@ // Functions for pandas conversion via NumPy -#include "arrow/python/numpy_interop.h" // IWYU pragma: expand - #include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/numpy_interop.h" // IWYU pragma: expand #include #include @@ -642,15 +641,14 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da std::vector fields_data(num_fields); OwnedRef dict_item; - // XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second + // In ARROW-7723, we found as a result of ARROW-3789 that second // through microsecond resolution tz-aware timestamps were being promoted to // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy // array in this function. PyArray_GETITEM returns datetime.datetime for // units second through microsecond but PyLong for nanosecond (because - // datetime.datetime does not support nanoseconds). We inserted this hack to - // preserve the <= 0.15.1 behavior until a better solution can be devised + // datetime.datetime does not support nanoseconds). + // We force the object conversion to preserve the value of the timezone. PandasOptions modified_options = options; - modified_options.ignore_timezone = true; modified_options.coerce_temporal_nanoseconds = false; for (int c = 0; c < data.num_chunks(); c++) { @@ -658,8 +656,12 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { PyObject* numpy_array; - RETURN_NOT_OK(ConvertArrayToPandas( - modified_options, arr->field(static_cast(i)), nullptr, &numpy_array)); + std::shared_ptr field = arr->field(static_cast(i)); + // Seen notes above about timstamp conversion. Don't blindly convert because + // timestamps in lists are handled differently. + modified_options.timestamp_as_object = + field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object; + RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } @@ -951,8 +953,21 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); - auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { + OwnedRef tzinfo; + if (!type.timezone().empty()) { + RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), tzinfo.ref())); + RETURN_IF_PYERROR(); + } + auto WrapValue = [&](typename Type::c_type value, PyObject** out) { RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + if (tzinfo.obj() != nullptr) { + PyObject* with_tz = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", *out); + RETURN_IF_PYERROR(); + Py_DECREF(*out); + *out = with_tz; + } + RETURN_IF_PYERROR(); return Status::OK(); }; @@ -1727,8 +1742,7 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& // Nanoseconds are never out of bounds for pandas, so in that case // we don't convert to object *output_type = PandasWriter::OBJECT; - } else if (ts_type.timezone() != "" && !options.ignore_timezone) { - // XXX: ignore_timezone: hack here for ARROW-7723 + } else if (!ts_type.timezone().empty()) { *output_type = PandasWriter::DATETIME_NANO_TZ; } else if (options.coerce_temporal_nanoseconds) { *output_type = PandasWriter::DATETIME_NANO; diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 79a72bcb1ef..2c5db8bdaa9 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -56,10 +56,6 @@ struct PandasOptions { /// Coerce all date and timestamp to datetime64[ns] bool coerce_temporal_nanoseconds = false; - /// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion - /// path - bool ignore_timezone = false; - /// \brief If true, do not create duplicate PyObject versions of equal /// objects. This only applies to immutable objects like strings or datetime /// objects diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 8cec87bdd36..19c7a4ea27d 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -14,22 +14,65 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +#include "arrow/python/datetime.h" #include #include #include #include "arrow/python/common.h" -#include "arrow/python/datetime.h" +#include "arrow/python/helpers.h" #include "arrow/python/platform.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace arrow { namespace py { namespace internal { +namespace { + +// Same as Regex '([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$'. +// GCC 4.9 doesn't support regex, so handcode until support for it +// is dropped. +bool MatchFixedOffset(const std::string& tz, util::string_view* sign, + util::string_view* hour, util::string_view* minute) { + if (tz.size() < 5) { + return false; + } + const char* iter = tz.data(); + if (*iter == '+' || *iter == '-') { + *sign = util::string_view(iter, 1); + iter++; + if (tz.size() < 6) { + return false; + } + } + if ((((*iter == '0' || *iter == '1') && *(iter + 1) >= '0' && *(iter + 1) <= '9') || + (*iter == '2' && *(iter + 1) >= '0' && *(iter + 1) <= '3'))) { + *hour = util::string_view(iter, 2); + iter += 2; + } else { + return false; + } + if (*iter != ':') { + return false; + } + iter++; + + if (*iter >= '0' && *iter <= '5' && *(iter + 1) >= '0' && *(iter + 1) <= '9') { + *minute = util::string_view(iter, 2); + iter += 2; + } else { + return false; + } + return iter == (tz.data() + tz.size()); +} + +} // namespace + PyDateTime_CAPI* datetime_api = nullptr; void InitDatetime() { @@ -262,6 +305,44 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { PyDateTime_GET_DAY(pydate)); } +// GIL must be held when calling this function. +// Converted from python. See https://github.com/apache/arrow/pull/7604 +// for details. +Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { + util::string_view sign_str, hour_str, minute_str; + OwnedRef pytz; + RETURN_NOT_OK(internal::ImportModule("pytz", &pytz)); + + if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { + int sign = -1; + if (sign_str == "+") { + sign = 1; + } + OwnedRef fixed_offset; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset)); + uint32_t minutes, hours; + if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || + !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), + &minutes)) { + return Status::Invalid("Invalid timezone: ", tz); + } + OwnedRef total_minutes(PyLong_FromLong( + sign * ((static_cast(hours) * 60) + static_cast(minutes)))); + RETURN_IF_PYERROR(); + *tzinfo = PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); + RETURN_IF_PYERROR(); + return Status::OK(); + } + + OwnedRef timezone; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); + OwnedRef py_tz_string( + PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); + *tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); + RETURN_IF_PYERROR(); + return Status::OK(); +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index a8b22da4741..a394fa4b4bb 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -157,6 +157,16 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { return PyDelta_to_us(pytimedelta) * 1000; } +/// \brief Convert a time zone name into a time zone object. +/// +/// Supported input strings are: +/// * As used in the Olson time zone database (the "tz database" or +/// "tzdata"), such as "America/New_York" +/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 +/// GIL must be held when calling this method. +ARROW_PYTHON_EXPORT +Status StringToTzinfo(const std::string& tz, PyObject** tzinfo); + } // namespace internal } // namespace py } // namespace arrow diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ac26ecca601..158ea953d33 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1287,7 +1287,9 @@ cdef _array_like_to_pandas(obj, options): result = pandas_api.series(arr, dtype=dtype, name=name) if (isinstance(original_type, TimestampType) and - original_type.tz is not None): + original_type.tz is not None and + # can be object dtype for non-ns and timestamp_as_object=True + result.dtype.kind == "M"): from pyarrow.pandas_compat import make_tz_aware result = make_tz_aware(result, original_type.tz) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 555688a8385..d704af40023 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1926,6 +1926,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: int64_t TimePoint_to_ns(CTimePoint val) CTimePoint TimePoint_from_s(double val) CTimePoint TimePoint_from_ns(int64_t val) + CStatus StringToTzinfo(c_string, PyObject** tzinfo) cdef extern from 'arrow/python/init.h': diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index b023e394ef9..bc6167ffba8 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -22,7 +22,7 @@ import sys from collections import OrderedDict -from datetime import date, datetime, time, timedelta +from datetime import date, datetime, time, timedelta, timezone from distutils.version import LooseVersion import hypothesis as h @@ -3327,13 +3327,20 @@ def test_cast_timestamp_unit(): assert result.equals(expected) -def test_struct_with_timestamp_tz(): +def test_nested_with_timestamp_tz(): # ARROW-7723 ts = pd.Timestamp.now() + ts_dt = ts.to_pydatetime() # XXX: Ensure that this data does not get promoted to nanoseconds (and thus # integers) to preserve behavior in 0.15.1 for unit in ['s', 'ms', 'us']: + if unit in ['s', 'ms']: + # This is used for verifying timezone conversion to micros are not + # important + def truncate(x): return x.replace(microsecond=0) + else: + def truncate(x): return x arr = pa.array([ts], type=pa.timestamp(unit)) arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) @@ -3342,20 +3349,30 @@ def test_struct_with_timestamp_tz(): result = arr3.to_pandas() assert isinstance(result[0]['start'], datetime) + assert result[0]['start'].tzinfo is None assert isinstance(result[0]['stop'], datetime) + assert result[0]['stop'].tzinfo is None result = arr4.to_pandas() assert isinstance(result[0]['start'], datetime) + assert result[0]['start'].tzinfo is not None + utc_dt = result[0]['start'].astimezone(timezone.utc) + assert truncate(utc_dt).replace(tzinfo=None) == truncate(ts_dt) assert isinstance(result[0]['stop'], datetime) + assert result[0]['stop'].tzinfo is not None # same conversion for table result = pa.table({'a': arr3}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) + assert result['a'][0]['start'].tzinfo is None assert isinstance(result['a'][0]['stop'], datetime) + assert result['a'][0]['stop'].tzinfo is None result = pa.table({'a': arr4}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) + assert result['a'][0]['start'].tzinfo is not None assert isinstance(result['a'][0]['stop'], datetime) + assert result['a'][0]['stop'].tzinfo is not None # ---------------------------------------------------------------------- @@ -4032,19 +4049,25 @@ def test_timestamp_as_object_out_of_range(): @pytest.mark.parametrize("resolution", ["s", "ms", "us"]) +@pytest.mark.parametrize("tz", [None, "America/New_York"]) # One datetime outside nanosecond range, one inside nanosecond range: @pytest.mark.parametrize("dt", [datetime(1553, 1, 1), datetime(2020, 1, 1)]) -def test_timestamp_as_object_non_nanosecond(resolution, dt): +def test_timestamp_as_object_non_nanosecond(resolution, tz, dt): # Timestamps can be converted Arrow and reloaded into Pandas with no loss # of information if the timestamp_as_object option is True. - arr = pa.array([dt], type=pa.timestamp(resolution)) - result = arr.to_pandas(timestamp_as_object=True) - assert result.dtype == object - assert isinstance(result[0], datetime) - assert result[0] == dt - + arr = pa.array([dt], type=pa.timestamp(resolution, tz=tz)) table = pa.table({'a': arr}) - result = table.to_pandas(timestamp_as_object=True)['a'] - assert result.dtype == object - assert isinstance(result[0], datetime) - assert result[0] == dt + + for result in [ + arr.to_pandas(timestamp_as_object=True), + table.to_pandas(timestamp_as_object=True)['a'] + ]: + assert result.dtype == object + assert isinstance(result[0], datetime) + if tz: + assert result[0].tzinfo is not None + expected = result[0].tzinfo.fromutc(dt) + else: + assert result[0].tzinfo is None + expected = dt + assert result[0] == expected diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index edd96227b6a..a7e02876dac 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1816,9 +1816,6 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' -_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$') - - def tzinfo_to_string(tz): """ Converts a time zone object into a string indicating the name of a time @@ -1884,14 +1881,9 @@ def string_to_tzinfo(name): tz : datetime.tzinfo Time zone object """ - import pytz - m = _FIXED_OFFSET_RE.match(name) - if m: - sign = 1 if m.group(1) == '+' else -1 - hours, minutes = map(int, m.group(2, 3)) - return pytz.FixedOffset(sign * (hours * 60 + minutes)) - else: - return pytz.timezone(name) + cdef PyObject* tz + check_status(libarrow.StringToTzinfo(name.encode('utf-8'), &tz)) + return PyObject_to_object(tz) def timestamp(unit, tz=None): From 69511b5a5c8940c7f0ec20dc7eceec407cdc5c25 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 20 Jul 2020 06:57:42 +0000 Subject: [PATCH 02/25] Honor tzinfo when converting from datetime --- cpp/src/arrow/python/inference.cc | 22 +++++++++------------- cpp/src/arrow/python/python_to_arrow.cc | 21 ++++++++++++++++++--- python/pyarrow/scalar.pxi | 3 ++- python/pyarrow/tests/test_array.py | 5 +++++ python/pyarrow/tests/test_pandas.py | 15 +++++++++++++++ 5 files changed, 49 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index c2fc06e554c..3df9a5dca56 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -295,10 +295,7 @@ class TypeInferrer { int_count_(0), date_count_(0), time_count_(0), - timestamp_second_count_(0), - timestamp_milli_count_(0), timestamp_micro_count_(0), - timestamp_nano_count_(0), duration_count_(0), float_count_(0), binary_count_(0), @@ -332,6 +329,13 @@ class TypeInferrer { ++int_count_; } else if (PyDateTime_Check(obj)) { ++timestamp_micro_count_; + OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo")); + if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None && timezone_.empty()) { + // From public docs on array construction + // "Localized timestamps will currently be returned as UTC " + // representation). " + timezone_ = "UTC"; + } *keep_going = make_unions_; } else if (PyDelta_Check(obj)) { ++duration_count_; @@ -458,14 +462,8 @@ class TypeInferrer { *out = date32(); } else if (time_count_) { *out = time64(TimeUnit::MICRO); - } else if (timestamp_nano_count_) { - *out = timestamp(TimeUnit::NANO); } else if (timestamp_micro_count_) { - *out = timestamp(TimeUnit::MICRO); - } else if (timestamp_milli_count_) { - *out = timestamp(TimeUnit::MILLI); - } else if (timestamp_second_count_) { - *out = timestamp(TimeUnit::SECOND); + *out = timestamp(TimeUnit::MICRO, timezone_); } else if (duration_count_) { *out = duration(TimeUnit::MICRO); } else if (bool_count_) { @@ -597,10 +595,8 @@ class TypeInferrer { int64_t int_count_; int64_t date_count_; int64_t time_count_; - int64_t timestamp_second_count_; - int64_t timestamp_milli_count_; + std::string timezone_; int64_t timestamp_micro_count_; - int64_t timestamp_nano_count_; int64_t duration_count_; int64_t float_count_; int64_t binary_count_; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 66a1e410265..d15cf2dfdcf 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -558,6 +558,8 @@ template class TemporalConverter : public TimeConverter { public: using TimeConverter::TimeConverter; + TemporalConverter(TimeUnit::type unit, PyObject* utc) + : TimeConverter(unit), utc_(utc) {} Status AppendValue(PyObject* obj) override { int64_t value; @@ -569,11 +571,22 @@ class TemporalConverter : public TimeConverter { return this->typed_builder_->AppendNull(); } } else { - // convert builtin python objects - ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromPython(obj, this->unit_)); + PyObject* target = obj; + OwnedRef target_holder; + if (PyDateTime_Check(obj)) { + OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo")); + if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) { + target_holder.reset(PyObject_CallMethod(obj, "astimezone", "O", utc_.obj())); + target = target_holder.obj(); + } + } + ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromPython(target, this->unit_)); } return this->typed_builder_->Append(value); } + + private: + OwnedRef utc_; }; // ---------------------------------------------------------------------- @@ -1169,9 +1182,11 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve break; } case Type::TIMESTAMP: { + PyObject* utc; + RETURN_NOT_OK(internal::StringToTzinfo("UTC", &utc)); *out = std::unique_ptr(new TemporalConverter( - checked_cast(*type).unit())); + checked_cast(*type).unit(), utc)); break; } case Type::DURATION: { diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 7f35419345c..d2f1308acf8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -334,7 +334,8 @@ cdef class Date64Scalar(Scalar): def as_py(self): """ - Return this value as a Python datetime.datetime instance. + Return this value as a Pandas Timestamp instance (if available), + otherwise as a Python datetime.datetime instance. """ cdef CDate64Scalar* sp = self.wrapped.get() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a1bfc961dc3..a06ed7a40bf 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -32,6 +32,7 @@ import pickle5 except ImportError: pickle5 = None +import pytz import pyarrow as pa import pyarrow.tests.strategies as past @@ -309,6 +310,8 @@ def test_nulls(ty): def test_array_from_scalar(): today = datetime.date.today() now = datetime.datetime.now() + now_utc = now.replace(tzinfo=pytz.utc) + now_with_tz = now_utc.astimezone(pytz.timezone('US/Eastern')) oneday = datetime.timedelta(days=1) cases = [ @@ -326,6 +329,7 @@ def test_array_from_scalar(): (pa.scalar(True), 11, pa.array([True] * 11)), (today, 2, pa.array([today] * 2)), (now, 10, pa.array([now] * 10)), + (now_with_tz, 10, pa.array([now_utc] * 10)), (now.time(), 9, pa.array([now.time()] * 9)), (oneday, 4, pa.array([oneday] * 4)), (False, 9, pa.array([False] * 9)), @@ -341,6 +345,7 @@ def test_array_from_scalar(): for value, size, expected in cases: arr = pa.repeat(value, size) assert len(arr) == size + assert arr.type.equals(expected.type) assert arr.equals(expected) if expected.type == pa.null(): diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index bc6167ffba8..bbe38846d26 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3327,6 +3327,21 @@ def test_cast_timestamp_unit(): assert result.equals(expected) +def test_nested_with_timestamp_tz_round_trip(): + ts = pd.Timestamp.now() + ts_dt = ts.to_pydatetime() + arr = pa.array([ts_dt], type=pa.timestamp('us', tz='America/New_York')) + struct = pa.StructArray.from_arrays([arr, arr], ['start', 'stop']) + + result = struct.to_pandas() + # N.B. we test with Panaas because the Arrow types are not + # actually equal. All Timezone aware times are currently normalized + # to "UTC" as the timesetamp type.but since this conversion results + # in object dtypes, and tzinfo is preserrved the comparison should + # result in equality. + pd.testing.assert_series_equal(pa.array(result).to_pandas(), result) + + def test_nested_with_timestamp_tz(): # ARROW-7723 ts = pd.Timestamp.now() From 5700909aec9392adef831eb9bbc754eaaed185ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 21 Jul 2020 17:30:05 +0200 Subject: [PATCH 03/25] better timezone support --- cpp/src/arrow/python/arrow_to_pandas.cc | 2 +- cpp/src/arrow/python/common.h | 5 ++ cpp/src/arrow/python/datetime.cc | 11 +-- cpp/src/arrow/python/datetime.h | 2 +- cpp/src/arrow/python/inference.cc | 10 +-- cpp/src/arrow/python/python_to_arrow.cc | 61 ++++++++--------- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/tests/test_array.py | 15 +++- python/pyarrow/tests/test_convert_builtin.py | 72 ++++++++++++++++++++ python/pyarrow/tests/test_pandas.py | 8 +-- python/pyarrow/types.pxi | 3 +- 11 files changed, 135 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 9775bf3a0cf..bab18a7ceb0 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -955,7 +955,7 @@ struct ObjectWriterVisitor { const TimeUnit::type unit = type.unit(); OwnedRef tzinfo; if (!type.timezone().empty()) { - RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), tzinfo.ref())); + ARROW_ASSIGN_OR_RAISE(tzinfo, internal::StringToTzinfo(type.timezone())); RETURN_IF_PYERROR(); } auto WrapValue = [&](typename Type::c_type value, PyObject** out) { diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 52a3f334d4e..b32b5f91906 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -137,6 +137,11 @@ class ARROW_PYTHON_EXPORT OwnedRef { OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {} explicit OwnedRef(PyObject* obj) : obj_(obj) {} + OwnedRef& operator=(PyObject* obj) { + obj_ = obj; + return *this; + } + OwnedRef& operator=(OwnedRef&& other) { obj_ = other.detach(); return *this; diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 19c7a4ea27d..7e763d6b8e9 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -308,7 +308,7 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { // GIL must be held when calling this function. // Converted from python. See https://github.com/apache/arrow/pull/7604 // for details. -Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { +Result StringToTzinfo(const std::string& tz) { util::string_view sign_str, hour_str, minute_str; OwnedRef pytz; RETURN_NOT_OK(internal::ImportModule("pytz", &pytz)); @@ -329,18 +329,19 @@ Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { OwnedRef total_minutes(PyLong_FromLong( sign * ((static_cast(hours) * 60) + static_cast(minutes)))); RETURN_IF_PYERROR(); - *tzinfo = PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); + auto tzinfo = + PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); RETURN_IF_PYERROR(); - return Status::OK(); + return tzinfo; } OwnedRef timezone; RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); OwnedRef py_tz_string( PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); - *tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); + auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); RETURN_IF_PYERROR(); - return Status::OK(); + return tzinfo; } } // namespace internal diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index a394fa4b4bb..49785f3e12d 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -165,7 +165,7 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 /// GIL must be held when calling this method. ARROW_PYTHON_EXPORT -Status StringToTzinfo(const std::string& tz, PyObject** tzinfo); +Result StringToTzinfo(const std::string& tz); } // namespace internal } // namespace py diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 3df9a5dca56..5b725ef1ec2 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -329,13 +329,6 @@ class TypeInferrer { ++int_count_; } else if (PyDateTime_Check(obj)) { ++timestamp_micro_count_; - OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo")); - if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None && timezone_.empty()) { - // From public docs on array construction - // "Localized timestamps will currently be returned as UTC " - // representation). " - timezone_ = "UTC"; - } *keep_going = make_unions_; } else if (PyDelta_Check(obj)) { ++duration_count_; @@ -463,7 +456,7 @@ class TypeInferrer { } else if (time_count_) { *out = time64(TimeUnit::MICRO); } else if (timestamp_micro_count_) { - *out = timestamp(TimeUnit::MICRO, timezone_); + *out = timestamp(TimeUnit::MICRO); } else if (duration_count_) { *out = duration(TimeUnit::MICRO); } else if (bool_count_) { @@ -595,7 +588,6 @@ class TypeInferrer { int64_t int_count_; int64_t date_count_; int64_t time_count_; - std::string timezone_; int64_t timestamp_micro_count_; int64_t duration_count_; int64_t float_count_; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index d15cf2dfdcf..64a03f57354 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -193,6 +193,7 @@ template <> struct ValueConverter { static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { int32_t value; + // TODO(kszucs): handle tzinfo if (PyTime_Check(obj)) { // datetime.time stores microsecond resolution switch (unit) { @@ -216,6 +217,7 @@ template <> struct ValueConverter { static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { int64_t value; + // TODO(kszucs): handle tzinfo if (PyTime_Check(obj)) { // datetime.time stores microsecond resolution switch (unit) { @@ -239,20 +241,31 @@ template <> struct ValueConverter { static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { int64_t value; + int64_t offset = 0; // UTC offset in seconds + if (PyDateTime_Check(obj)) { auto dt = reinterpret_cast(obj); + if (dt->hastzinfo) { + // calculate offset from UTC timezone in seconds + OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL)); + RETURN_IF_PYERROR(); + if (pyoffset.obj() != nullptr && pyoffset.obj() != Py_None) { + auto delta = reinterpret_cast(pyoffset.obj()); + offset = internal::PyDelta_to_s(delta); + } + } switch (unit) { case TimeUnit::SECOND: - value = internal::PyDateTime_to_s(dt); + value = internal::PyDateTime_to_s(dt) - offset; break; case TimeUnit::MILLI: - value = internal::PyDateTime_to_ms(dt); + value = internal::PyDateTime_to_ms(dt) - offset * 1000; break; case TimeUnit::MICRO: - value = internal::PyDateTime_to_us(dt); + value = internal::PyDateTime_to_us(dt) - offset * 1000 * 1000; break; case TimeUnit::NANO: - value = internal::PyDateTime_to_ns(dt); + value = internal::PyDateTime_to_ns(dt) - offset * 1000 * 1000 * 1000; break; default: return Status::UnknownError("Invalid time unit"); @@ -558,8 +571,6 @@ template class TemporalConverter : public TimeConverter { public: using TimeConverter::TimeConverter; - TemporalConverter(TimeUnit::type unit, PyObject* utc) - : TimeConverter(unit), utc_(utc) {} Status AppendValue(PyObject* obj) override { int64_t value; @@ -571,22 +582,10 @@ class TemporalConverter : public TimeConverter { return this->typed_builder_->AppendNull(); } } else { - PyObject* target = obj; - OwnedRef target_holder; - if (PyDateTime_Check(obj)) { - OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo")); - if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) { - target_holder.reset(PyObject_CallMethod(obj, "astimezone", "O", utc_.obj())); - target = target_holder.obj(); - } - } - ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromPython(target, this->unit_)); + ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromPython(obj, this->unit_)); } return this->typed_builder_->Append(value); } - - private: - OwnedRef utc_; }; // ---------------------------------------------------------------------- @@ -1172,27 +1171,27 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve } break; case Type::TIME32: { - *out = std::unique_ptr(new TimeConverter( - checked_cast(*type).unit())); + auto unit = checked_cast(*type).unit(); + *out = + std::unique_ptr(new TimeConverter(unit)); break; } case Type::TIME64: { - *out = std::unique_ptr(new TimeConverter( - checked_cast(*type).unit())); + auto unit = checked_cast(*type).unit(); + *out = + std::unique_ptr(new TimeConverter(unit)); break; } case Type::TIMESTAMP: { - PyObject* utc; - RETURN_NOT_OK(internal::StringToTzinfo("UTC", &utc)); - *out = - std::unique_ptr(new TemporalConverter( - checked_cast(*type).unit(), utc)); + auto unit = checked_cast(*type).unit(); + *out = std::unique_ptr( + new TemporalConverter(unit)); break; } case Type::DURATION: { - *out = - std::unique_ptr(new TemporalConverter( - checked_cast(*type).unit())); + auto unit = checked_cast(*type).unit(); + *out = std::unique_ptr( + new TemporalConverter(unit)); break; } default: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d704af40023..c7c2a95965c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1926,7 +1926,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: int64_t TimePoint_to_ns(CTimePoint val) CTimePoint TimePoint_from_s(double val) CTimePoint TimePoint_from_ns(int64_t val) - CStatus StringToTzinfo(c_string, PyObject** tzinfo) + CResult[PyObject*] StringToTzinfo(c_string) cdef extern from 'arrow/python/init.h': diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a06ed7a40bf..900fb861ead 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -329,7 +329,7 @@ def test_array_from_scalar(): (pa.scalar(True), 11, pa.array([True] * 11)), (today, 2, pa.array([today] * 2)), (now, 10, pa.array([now] * 10)), - (now_with_tz, 10, pa.array([now_utc] * 10)), + (now_with_tz, 2, pa.array([now_utc] * 2)), (now.time(), 9, pa.array([now.time()] * 9)), (oneday, 4, pa.array([oneday] * 4)), (False, 9, pa.array([False] * 9)), @@ -347,7 +347,6 @@ def test_array_from_scalar(): assert len(arr) == size assert arr.type.equals(expected.type) assert arr.equals(expected) - if expected.type == pa.null(): assert arr.null_count == size else: @@ -1809,6 +1808,18 @@ def test_array_from_numpy_datetimeD(): assert result.equals(expected) +# test_array_from_naive_datetimes +# test_array_from_datetimes_with_timezone + +def test_array_from_naive_datetimes(): + arr = pa.array([ + None, + datetime.datetime(2017, 4, 4, 12, 11, 10), + datetime.datetime(2018, 1, 1, 0, 2, 0) + ]) + assert arr.type == pa.timestamp('us', tz=None) + + @pytest.mark.parametrize(('dtype', 'type'), [ ('datetime64[s]', pa.timestamp('s')), ('datetime64[ms]', pa.timestamp('ms')), diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 4f709f62777..f565891fd2a 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -811,6 +811,78 @@ def test_sequence_timestamp(): 46, 57, 437699) +# parametrize over unit +@pytest.mark.parametrize('timezone', [ + None, + 'UTC', + 'Europe/Budapest', +]) +@pytest.mark.parametrize('unit', [ + 's', + 'ms', + 'us' +]) +def test_sequence_timestamp_with_timezone(timezone, unit): + def expected_integer_value(dt): + units = ['s', 'ms', 'us', 'ns'] + multiplier = 10**(units.index(unit) * 3) + return None if dt is None else int(dt.timestamp() * multiplier) + + def expected_datetime_value(dt): + # truncate microseconds + if unit == 's': + dt = dt.replace(microsecond=0) + elif unit == 'ms': + dt = dt.replace(microsecond=(dt.microsecond // 1000) * 1000) + + # adjust the timezone + if timezone is None: + # make datetime timezone unaware + return dt.replace(tzinfo=None) + else: + # convert to the expected timezone + return dt.astimezone(pytz.timezone(timezone)) + + data = [ + datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive + pytz.utc.localize( + datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) + ), + None, + pytz.timezone('US/Eastern').localize( + datetime.datetime(2006, 1, 13, 12, 34, 56, 432539) + ), + pytz.timezone('Europe/Moscow').localize( + datetime.datetime(2010, 8, 13, 5, 0, 0, 437699) + ), + ] + utcdata = [ + pytz.utc.localize(data[0]), + data[1], + None, + data[3].astimezone(pytz.utc), + data[4].astimezone(pytz.utc), + ] + + ty = pa.timestamp(unit, tz=timezone) + arr = pa.array(data, type=ty) + assert len(arr) == 5 + assert arr.type == ty + assert arr.null_count == 1 + + # test that the underlying integers are UTC values + values = arr.cast('int64') + expected = list(map(expected_integer_value, utcdata)) + assert values.to_pylist() == expected + + # test that the scalars are datetimes with the correct timezone + assert arr[0].as_py() == expected_datetime_value(utcdata[0]) + assert arr[1].as_py() == expected_datetime_value(utcdata[1]) + assert arr[2].as_py() is None + assert arr[3].as_py() == expected_datetime_value(utcdata[3]) + assert arr[4].as_py() == expected_datetime_value(utcdata[4]) + + def test_sequence_numpy_timestamp(): data = [ np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)), diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index bbe38846d26..14feeda0fc9 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3334,10 +3334,10 @@ def test_nested_with_timestamp_tz_round_trip(): struct = pa.StructArray.from_arrays([arr, arr], ['start', 'stop']) result = struct.to_pandas() - # N.B. we test with Panaas because the Arrow types are not - # actually equal. All Timezone aware times are currently normalized - # to "UTC" as the timesetamp type.but since this conversion results - # in object dtypes, and tzinfo is preserrved the comparison should + # N.B. we test with Pandas because the Arrow types are not + # actually equal. All Timezone aware times are currently normalized + # to "UTC" as the timestamp type, but since this conversion results + # in object dtypes, and tzinfo is preserved the comparison should # result in equality. pd.testing.assert_series_equal(pa.array(result).to_pandas(), result) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index a7e02876dac..e81a265a4e4 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1881,8 +1881,7 @@ def string_to_tzinfo(name): tz : datetime.tzinfo Time zone object """ - cdef PyObject* tz - check_status(libarrow.StringToTzinfo(name.encode('utf-8'), &tz)) + cdef PyObject* tz = GetResultValue(StringToTzinfo(name.encode('utf-8'))) return PyObject_to_object(tz) From 3c4f071289f54748b593eea56db721e3054b4f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 21 Jul 2020 21:59:44 +0200 Subject: [PATCH 04/25] test time conversions --- cpp/src/arrow/python/datetime.cc | 13 +++ cpp/src/arrow/python/datetime.h | 3 + cpp/src/arrow/python/python_to_arrow.cc | 20 +--- python/pyarrow/scalar.pxi | 3 +- python/pyarrow/tests/test_convert_builtin.py | 98 ++++++++++++++------ 5 files changed, 90 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 7e763d6b8e9..577c513eb83 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -305,6 +305,19 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { PyDateTime_GET_DAY(pydate)); } +Result PyDateTime_utcoffset_s(PyObject* obj) { + // calculate offset from UTC timezone in seconds + // supports only PyDateTime_DateTime and PyDateTime_Time objects + OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL)); + RETURN_IF_PYERROR(); + if (pyoffset.obj() != nullptr && pyoffset.obj() != Py_None) { + auto delta = reinterpret_cast(pyoffset.obj()); + return internal::PyDelta_to_s(delta); + } else { + return 0; + } +} + // GIL must be held when calling this function. // Converted from python. See https://github.com/apache/arrow/pull/7604 // for details. diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 49785f3e12d..9da4b467123 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -157,6 +157,9 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { return PyDelta_to_us(pytimedelta) * 1000; } +ARROW_PYTHON_EXPORT +Result PyDateTime_utcoffset_s(PyObject* pydatetime); + /// \brief Convert a time zone name into a time zone object. /// /// Supported input strings are: diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 64a03f57354..e90630fbc64 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -193,9 +193,8 @@ template <> struct ValueConverter { static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { int32_t value; - // TODO(kszucs): handle tzinfo if (PyTime_Check(obj)) { - // datetime.time stores microsecond resolution + // TODO(kszucs): consider to raise if a timezone aware time object is encountered switch (unit) { case TimeUnit::SECOND: value = static_cast(internal::PyTime_to_s(obj)); @@ -207,6 +206,7 @@ struct ValueConverter { return Status::UnknownError("Invalid time unit"); } } else { + // TODO(kszucs): validate maximum value? RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32")); } return value; @@ -217,9 +217,8 @@ template <> struct ValueConverter { static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { int64_t value; - // TODO(kszucs): handle tzinfo if (PyTime_Check(obj)) { - // datetime.time stores microsecond resolution + // TODO(kszucs): consider to raise if a timezone aware time object is encountered switch (unit) { case TimeUnit::MICRO: value = internal::PyTime_to_us(obj); @@ -231,6 +230,7 @@ struct ValueConverter { return Status::UnknownError("Invalid time unit"); } } else { + // TODO(kszucs): validate maximum value? RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64")); } return value; @@ -241,19 +241,9 @@ template <> struct ValueConverter { static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { int64_t value; - int64_t offset = 0; // UTC offset in seconds - if (PyDateTime_Check(obj)) { + ARROW_ASSIGN_OR_RAISE(auto offset, internal::PyDateTime_utcoffset_s(obj)); auto dt = reinterpret_cast(obj); - if (dt->hastzinfo) { - // calculate offset from UTC timezone in seconds - OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL)); - RETURN_IF_PYERROR(); - if (pyoffset.obj() != nullptr && pyoffset.obj() != Py_None) { - auto delta = reinterpret_cast(pyoffset.obj()); - offset = internal::PyDelta_to_s(delta); - } - } switch (unit) { case TimeUnit::SECOND: value = internal::PyDateTime_to_s(dt) - offset; diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index d2f1308acf8..7f35419345c 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -334,8 +334,7 @@ cdef class Date64Scalar(Scalar): def as_py(self): """ - Return this value as a Pandas Timestamp instance (if available), - otherwise as a Python datetime.datetime instance. + Return this value as a Python datetime.datetime instance. """ cdef CDate64Scalar* sp = self.wrapped.get() diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index f565891fd2a..acb5bf6023e 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -791,6 +791,72 @@ def test_date32_overflow(): pa.array(data3, type=pa.date32()) +@pytest.mark.parametrize(('time_type', 'unit', 'int_type'), [ + (pa.time32, 's', 'int32'), + (pa.time32, 'ms', 'int32'), + (pa.time64, 'us', 'int64'), + # (pa.time64, 'ns'), +]) +def test_sequence_time_with_timezone(time_type, unit, int_type): + def expected_integer_value(t): + # only use with utc time object because it doesn't adjust with the + # offset + units = ['s', 'ms', 'us', 'ns'] + multiplier = 10**(units.index(unit) * 3) + if t is None: + return None + seconds = ( + t.hour * 3600 + + t.minute * 60 + + t.second + + t.microsecond * 10**-6 + ) + return int(seconds * multiplier) + + def expected_time_value(t): + # only use with utc time object because it doesn't adjust with the + # time objects tzdata + if unit == 's': + return t.replace(microsecond=0) + elif unit == 'ms': + return t.replace(microsecond=(t.microsecond // 1000) * 1000) + elif unit == 'us': + return t + else: + raise ValueError("use pandas") + + # only timezone naive times are supported in arrow + data = [ + datetime.time(8, 23, 34, 123456), + datetime.time(5, 0, 0, 1000), + None, + datetime.time(1, 11, 56, 432539), + datetime.time(23, 10, 0, 437699) + ] + + ty = time_type(unit) + arr = pa.array(data, type=ty) + assert len(arr) == 5 + assert arr.type == ty + assert arr.null_count == 1 + + # test that the underlying integers are UTC values + values = arr.cast(int_type) + expected = list(map(expected_integer_value, data)) + assert values.to_pylist() == expected + + # test that the scalars are datetime.time objects with UTC timezone + assert arr[0].as_py() == expected_time_value(data[0]) + assert arr[1].as_py() == expected_time_value(data[1]) + assert arr[2].as_py() is None + assert arr[3].as_py() == expected_time_value(data[3]) + assert arr[4].as_py() == expected_time_value(data[4]) + + def tz(hours, minutes=0): + offset = datetime.timedelta(hours=hours, minutes=minutes) + return datetime.timezone(offset) + + def test_sequence_timestamp(): data = [ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456), @@ -811,7 +877,6 @@ def test_sequence_timestamp(): 46, 57, 437699) -# parametrize over unit @pytest.mark.parametrize('timezone', [ None, 'UTC', @@ -820,7 +885,8 @@ def test_sequence_timestamp(): @pytest.mark.parametrize('unit', [ 's', 'ms', - 'us' + 'us', + # TODO(kszucs): 'ns' ]) def test_sequence_timestamp_with_timezone(timezone, unit): def expected_integer_value(dt): @@ -903,34 +969,6 @@ def test_sequence_numpy_timestamp(): 46, 57, 437699) -def test_sequence_timestamp_with_unit(): - data = [ - datetime.datetime(2007, 7, 13, 1, 23, 34, 123456), - ] - - s = pa.timestamp('s') - ms = pa.timestamp('ms') - us = pa.timestamp('us') - - arr_s = pa.array(data, type=s) - assert len(arr_s) == 1 - assert arr_s.type == s - assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1, - 23, 34, 0) - - arr_ms = pa.array(data, type=ms) - assert len(arr_ms) == 1 - assert arr_ms.type == ms - assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1, - 23, 34, 123000) - - arr_us = pa.array(data, type=us) - assert len(arr_us) == 1 - assert arr_us.type == us - assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1, - 23, 34, 123456) - - class MyDate(datetime.date): pass From 793e68e73fc7a3364b1cb33e3f44ca128faf3f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 14:16:56 +0200 Subject: [PATCH 05/25] Expost TzinfoToString on the C++ side --- cpp/src/arrow/python/datetime.cc | 70 ++++++++++++++++++++++++++++ cpp/src/arrow/python/datetime.h | 3 ++ python/pyarrow/includes/libarrow.pxd | 2 + python/pyarrow/tests/test_types.py | 29 ++++++++++++ python/pyarrow/types.pxi | 4 ++ 5 files changed, 108 insertions(+) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 577c513eb83..010b550af5a 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include "arrow/python/common.h" @@ -357,6 +358,75 @@ Result StringToTzinfo(const std::string& tz) { return tzinfo; } +Result TzinfoToString(PyObject* tzinfo) { + OwnedRef module_pytz; // import pytz + OwnedRef module_datetime; // import datetime + OwnedRef class_tzinfo; // from datetime import tzinfo + OwnedRef class_timezone; // from datetime import tzinfo + OwnedRef class_fixedoffset; // from pytz import _FixedOffset + + // import necessary modules + RETURN_NOT_OK(internal::ImportModule("pytz", &module_pytz)); + RETURN_NOT_OK(internal::ImportModule("datetime", &module_datetime)); + // import necessary classes + RETURN_NOT_OK( + internal::ImportFromModule(module_pytz.obj(), "_FixedOffset", &class_fixedoffset)); + RETURN_NOT_OK( + internal::ImportFromModule(module_datetime.obj(), "tzinfo", &class_tzinfo)); + RETURN_NOT_OK( + internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone)); + + // check that it's a valid tzinfo object + if (!PyObject_IsInstance(tzinfo, class_tzinfo.obj())) { + return Status::Invalid("Not an instance of datetime.tzinfo"); + } + + // convert offset objects to "+/-{hh}:{mm}" format + if (PyObject_IsInstance(tzinfo, class_timezone.obj()) || + PyObject_IsInstance(tzinfo, class_fixedoffset.obj())) { + OwnedRef timedelta_object(PyObject_CallMethod(tzinfo, "utcoffset", "O", Py_None)); + RETURN_IF_PYERROR(); + if (timedelta_object.obj() != nullptr && timedelta_object.obj() != Py_None) { + // retrieve the offset as seconds + auto timedelta = reinterpret_cast(timedelta_object.obj()); + auto total_seconds = internal::PyDelta_to_s(timedelta); + + // determine whether the offset is positive or negative + auto sign = (total_seconds < 0) ? "-" : "+"; + total_seconds = abs(total_seconds); + + // calculate offset components + int64_t hours, minutes, seconds; + seconds = split_time(total_seconds, 60, &minutes); + minutes = split_time(minutes, 60, &hours); + if (seconds > 0) { + // check there are no remaining seconds + return Status::Invalid("Offset must represent whole number of minutes"); + } + + // construct the timezone string + std::stringstream stream; + stream << sign << std::setfill('0') << std::setw(2) << hours << ":" + << std::setfill('0') << std::setw(2) << minutes; + return stream.str(); + } else { + return Status::Invalid("Unable to convert timezone offset object to string"); + } + } + + // pytz.tzinfo.BaseTzInfo objects store the string representation in zone attribute + if (PyObject_HasAttrString(tzinfo, "zone")) { + OwnedRef zone_object(PyObject_GetAttrString(tzinfo, "zone")); + if (zone_object) { + std::string zone_string; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone_object.obj(), &zone_string)); + return zone_string; + } + } + + return Status::Invalid("Unable to convert timezone object to string"); +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 9da4b467123..082b8e2349f 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -170,6 +170,9 @@ Result PyDateTime_utcoffset_s(PyObject* pydatetime); ARROW_PYTHON_EXPORT Result StringToTzinfo(const std::string& tz); +ARROW_PYTHON_EXPORT +Result TzinfoToString(PyObject* pytzinfo); + } // namespace internal } // namespace py } // namespace arrow diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c7c2a95965c..07cfccc50c8 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1926,6 +1926,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: int64_t TimePoint_to_ns(CTimePoint val) CTimePoint TimePoint_from_s(double val) CTimePoint TimePoint_from_ns(int64_t val) + + CResult[c_string] TzinfoToString(PyObject* pytzinfo) CResult[PyObject*] StringToTzinfo(c_string) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 1f905f3be43..abb6fa0792c 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -17,9 +17,11 @@ from collections import OrderedDict from collections.abc import Iterator +import datetime import pickle import pytest +import pytz import hypothesis as h import hypothesis.strategies as st import weakref @@ -252,6 +254,33 @@ def test_is_primitive(): assert not types.is_primitive(pa.list_(pa.int32())) +@pytest.mark.parametrize(('tz', 'expected'), [ + (pytz.utc, 'UTC'), + (pytz.timezone('Europe/Paris'), 'Europe/Paris'), + (pytz.FixedOffset(180), '+03:00'), + # (datetime.timezone.utc, 'UTC'), + (datetime.timezone.utc, '+00:00'), + (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30') +]) +def test_tzinfo_to_string(tz, expected): + result = pa.lib.tzinfo_to_string(tz) + assert result == expected + + assert pa.lib.tzinfo_to_str(tz) == expected + + +@pytest.mark.parametrize(('string', 'expected'), [ + ('UTC', pytz.utc), + ('Europe/Paris', pytz.timezone('Europe/Paris')), + ('+03:00', pytz.FixedOffset(180)), + ('+01:30', pytz.FixedOffset(90)), + ('-02:00', pytz.FixedOffset(-120)) +]) +def test_string_to_tzinfo(string, expected): + result = pa.lib.string_to_tzinfo(string) + assert result == expected + + def test_timestamp(): for unit in ('s', 'ms', 'us', 'ns'): for tz in (None, 'UTC', 'Europe/Paris'): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e81a265a4e4..2d5020b16cf 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1862,6 +1862,10 @@ def tzinfo_to_string(tz): raise TypeError('Must be an instance of `datetime.tzinfo`') +def tzinfo_to_str(tz): + return frombytes(GetResultValue(TzinfoToString(tz))) + + def string_to_tzinfo(name): """ Convert a time zone name into a time zone object. From 2f131d397da72b9fa47054b6ccf1ff6cfb8ba0d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 16:01:29 +0200 Subject: [PATCH 06/25] Add more tests and reorganize implementation --- python/pyarrow/tests/test_convert_builtin.py | 11 --- python/pyarrow/tests/test_types.py | 91 +++++++++++++++++++- python/pyarrow/types.pxi | 29 ------- 3 files changed, 88 insertions(+), 43 deletions(-) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index acb5bf6023e..8734f51c9f1 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1525,17 +1525,6 @@ def test_decimal_array_with_none_and_nan(): assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None] -@pytest.mark.parametrize('tz,name', [ - (pytz.FixedOffset(90), '+01:30'), - (pytz.FixedOffset(-90), '-01:30'), - (pytz.utc, 'UTC'), - (pytz.timezone('America/New_York'), 'America/New_York') -]) -def test_timezone_string(tz, name): - assert pa.lib.tzinfo_to_string(tz) == name - assert pa.lib.string_to_tzinfo(name) == tz - - def test_map_from_dicts(): data = [[{'key': b'a', 'value': 1}, {'key': b'b', 'value': 2}], [{'key': b'c', 'value': 3}], diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index abb6fa0792c..6ebb937b8f4 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -263,10 +263,84 @@ def test_is_primitive(): (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30') ]) def test_tzinfo_to_string(tz, expected): - result = pa.lib.tzinfo_to_string(tz) - assert result == expected + assert pa.lib.tzinfo_to_string(tz) == expected + + +def test_tzinfo_to_string_errors(): + msg = "Offset must represent whole number of minutes" + with pytest.raises(ValueError, match=msg): + tz = datetime.timezone(datetime.timedelta(hours=1, seconds=30)) + pa.lib.tzinfo_to_string(tz) + + msg = "Not an instance of datetime.tzinfo" + with pytest.raises(ValueError): + pa.lib.tzinfo_to_string("Europe/Budapest") + + +def test_convert_custom_tzinfo_objects_to_string(): + class CorrectTimezone1(datetime.tzinfo): + """ + Conversion is using utcoffset() + """ + + def tzname(self, dt): + return None + + def utcoffset(self, dt): + return datetime.timedelta(hours=-3, minutes=30) + + class CorrectTimezone2(datetime.tzinfo): + """ + Conversion is using tzname() + """ + + def tzname(self, dt): + return "+03:00" + + def utcoffset(self, dt): + return datetime.timedelta(hours=3) + + class BuggyTimezone1(datetime.tzinfo): + """ + Unable to infer name or offset + """ - assert pa.lib.tzinfo_to_str(tz) == expected + def tzname(self, dt): + return None + + def utcoffset(self, dt): + return None + + class BuggyTimezone2(datetime.tzinfo): + """ + Wrong offset type + """ + + def tzname(self, dt): + return None + + def utcoffset(self, dt): + return "one hour" + + class BuggyTimezone3(datetime.tzinfo): + """ + Wrong timezone name type + """ + + def tzname(self, dt): + return 240 + + def utcoffset(self, dt): + return None + + assert pa.lib.tzinfo_to_string(CorrectTimezone1()) == "-02:30" + assert pa.lib.tzinfo_to_string(CorrectTimezone2()) == "+03:00" + + msg = (r"Object returned by tzinfo.utcoffset\(None\) is not an instance " + r"of datetime.timedelta") + for wrong in [BuggyTimezone1(), BuggyTimezone2(), BuggyTimezone3()]: + with pytest.raises(ValueError, match=msg): + pa.lib.tzinfo_to_string(wrong) @pytest.mark.parametrize(('string', 'expected'), [ @@ -281,6 +355,17 @@ def test_string_to_tzinfo(string, expected): assert result == expected +@pytest.mark.parametrize('tz,name', [ + (pytz.FixedOffset(90), '+01:30'), + (pytz.FixedOffset(-90), '-01:30'), + (pytz.utc, 'UTC'), + (pytz.timezone('America/New_York'), 'America/New_York') +]) +def test_timezone_string_roundtrip(tz, name): + assert pa.lib.tzinfo_to_string(tz) == name + assert pa.lib.string_to_tzinfo(name) == tz + + def test_timestamp(): for unit in ('s', 'ms', 'us', 'ns'): for tz in (None, 'UTC', 'Europe/Paris'): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 2d5020b16cf..15483d321b6 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1834,35 +1834,6 @@ def tzinfo_to_string(tz): name : str Time zone name """ - import pytz - import datetime - - def fixed_offset_to_string(offset): - seconds = int(offset.utcoffset(None).total_seconds()) - sign = '+' if seconds >= 0 else '-' - minutes, seconds = divmod(abs(seconds), 60) - hours, minutes = divmod(minutes, 60) - if seconds > 0: - raise ValueError('Offset must represent whole number of minutes') - return '{}{:02d}:{:02d}'.format(sign, hours, minutes) - - if tz is pytz.utc: - return tz.zone # ARROW-4055 - elif isinstance(tz, pytz.tzinfo.BaseTzInfo): - return tz.zone - elif isinstance(tz, pytz._FixedOffset): - return fixed_offset_to_string(tz) - elif isinstance(tz, datetime.tzinfo): - if isinstance(tz, datetime.timezone): - return fixed_offset_to_string(tz) - else: - raise ValueError('Unable to convert timezone `{}` to string' - .format(tz)) - else: - raise TypeError('Must be an instance of `datetime.tzinfo`') - - -def tzinfo_to_str(tz): return frombytes(GetResultValue(TzinfoToString(tz))) From e007cb18fadc147a5d596110a03e17d1d6dd4718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 16:16:30 +0200 Subject: [PATCH 07/25] Add timezone inference --- cpp/src/arrow/python/datetime.cc | 90 +++++++++++--------- cpp/src/arrow/python/inference.cc | 10 ++- python/pyarrow/tests/test_convert_builtin.py | 27 ++++++ 3 files changed, 87 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 010b550af5a..f6d53d0997d 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -319,6 +319,41 @@ Result PyDateTime_utcoffset_s(PyObject* obj) { } } +Result PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo, PyObject* pydelta_class) { + // attempt to convert timezone offset objects to "+/-{hh}:{mm}" format + OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None)); + RETURN_IF_PYERROR(); + + if (!PyObject_IsInstance(pydelta_object.obj(), pydelta_class)) { + return Status::Invalid( + "Object returned by tzinfo.utcoffset(None) is not an instance of " + "datetime.timedelta"); + } + auto pydelta = reinterpret_cast(pydelta_object.obj()); + + // retrieve the offset as seconds + auto total_seconds = internal::PyDelta_to_s(pydelta); + + // determine whether the offset is positive or negative + auto sign = (total_seconds < 0) ? "-" : "+"; + total_seconds = abs(total_seconds); + + // calculate offset components + int64_t hours, minutes, seconds; + seconds = split_time(total_seconds, 60, &minutes); + minutes = split_time(minutes, 60, &hours); + if (seconds > 0) { + // check there are no remaining seconds + return Status::Invalid("Offset must represent whole number of minutes"); + } + + // construct the timezone string + std::stringstream stream; + stream << sign << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0') + << std::setw(2) << minutes; + return stream.str(); +} + // GIL must be held when calling this function. // Converted from python. See https://github.com/apache/arrow/pull/7604 // for details. @@ -362,7 +397,8 @@ Result TzinfoToString(PyObject* tzinfo) { OwnedRef module_pytz; // import pytz OwnedRef module_datetime; // import datetime OwnedRef class_tzinfo; // from datetime import tzinfo - OwnedRef class_timezone; // from datetime import tzinfo + OwnedRef class_timezone; // from datetime import timezone + OwnedRef class_timedelta; // from datetime import timedelta OwnedRef class_fixedoffset; // from pytz import _FixedOffset // import necessary modules @@ -375,56 +411,32 @@ Result TzinfoToString(PyObject* tzinfo) { internal::ImportFromModule(module_datetime.obj(), "tzinfo", &class_tzinfo)); RETURN_NOT_OK( internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone)); + RETURN_NOT_OK( + internal::ImportFromModule(module_datetime.obj(), "timedelta", &class_timedelta)); // check that it's a valid tzinfo object if (!PyObject_IsInstance(tzinfo, class_tzinfo.obj())) { return Status::Invalid("Not an instance of datetime.tzinfo"); } - // convert offset objects to "+/-{hh}:{mm}" format + // if tzinfo is an instance of pytz._FixedOffset or datetime.timedelta return the + // HH:MM offset string representation if (PyObject_IsInstance(tzinfo, class_timezone.obj()) || PyObject_IsInstance(tzinfo, class_fixedoffset.obj())) { - OwnedRef timedelta_object(PyObject_CallMethod(tzinfo, "utcoffset", "O", Py_None)); - RETURN_IF_PYERROR(); - if (timedelta_object.obj() != nullptr && timedelta_object.obj() != Py_None) { - // retrieve the offset as seconds - auto timedelta = reinterpret_cast(timedelta_object.obj()); - auto total_seconds = internal::PyDelta_to_s(timedelta); - - // determine whether the offset is positive or negative - auto sign = (total_seconds < 0) ? "-" : "+"; - total_seconds = abs(total_seconds); - - // calculate offset components - int64_t hours, minutes, seconds; - seconds = split_time(total_seconds, 60, &minutes); - minutes = split_time(minutes, 60, &hours); - if (seconds > 0) { - // check there are no remaining seconds - return Status::Invalid("Offset must represent whole number of minutes"); - } - - // construct the timezone string - std::stringstream stream; - stream << sign << std::setfill('0') << std::setw(2) << hours << ":" - << std::setfill('0') << std::setw(2) << minutes; - return stream.str(); - } else { - return Status::Invalid("Unable to convert timezone offset object to string"); - } + return PyTZInfo_utcoffset_hhmm(tzinfo, class_timedelta.obj()); } - // pytz.tzinfo.BaseTzInfo objects store the string representation in zone attribute - if (PyObject_HasAttrString(tzinfo, "zone")) { - OwnedRef zone_object(PyObject_GetAttrString(tzinfo, "zone")); - if (zone_object) { - std::string zone_string; - RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone_object.obj(), &zone_string)); - return zone_string; - } + // attempt to call tzinfo.tzname(None) + OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); + RETURN_IF_PYERROR(); + if (PyUnicode_Check(tzname_object.obj())) { + std::string result; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result)); + return result; } - return Status::Invalid("Unable to convert timezone object to string"); + // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None) + return PyTZInfo_utcoffset_hhmm(tzinfo, class_timedelta.obj()); } } // namespace internal diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 5b725ef1ec2..d1ce2c2f797 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -328,6 +328,13 @@ class TypeInferrer { } else if (internal::IsPyInteger(obj)) { ++int_count_; } else if (PyDateTime_Check(obj)) { + // infer timezone from the first encountered datetime object + if (!timestamp_micro_count_) { + OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo")); + if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) { + ARROW_ASSIGN_OR_RAISE(timezone_, internal::TzinfoToString(tzinfo.obj())); + } + } ++timestamp_micro_count_; *keep_going = make_unions_; } else if (PyDelta_Check(obj)) { @@ -456,7 +463,7 @@ class TypeInferrer { } else if (time_count_) { *out = time64(TimeUnit::MICRO); } else if (timestamp_micro_count_) { - *out = timestamp(TimeUnit::MICRO); + *out = timestamp(TimeUnit::MICRO, timezone_); } else if (duration_count_) { *out = duration(TimeUnit::MICRO); } else if (bool_count_) { @@ -589,6 +596,7 @@ class TypeInferrer { int64_t date_count_; int64_t time_count_; int64_t timestamp_micro_count_; + std::string timezone_; int64_t duration_count_; int64_t float_count_; int64_t binary_count_; diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 8734f51c9f1..1f27e0fa444 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -949,6 +949,33 @@ def expected_datetime_value(dt): assert arr[4].as_py() == expected_datetime_value(utcdata[4]) +def test_sequence_timestamp_with_timezone_inference(): + data = [ + datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive + pytz.utc.localize( + datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) + ), + None, + pytz.timezone('US/Eastern').localize( + datetime.datetime(2006, 1, 13, 12, 34, 56, 432539) + ), + pytz.timezone('Europe/Moscow').localize( + datetime.datetime(2010, 8, 13, 5, 0, 0, 437699) + ), + ] + expected = [ + pa.timestamp('us', tz=None), + pa.timestamp('us', tz='UTC'), + pa.timestamp('us', tz=None), + pa.timestamp('us', tz='US/Eastern'), + pa.timestamp('us', tz='Europe/Moscow') + ] + for dt, expected_type in zip(data, expected): + prepended = [dt] + data + arr = pa.array(prepended) + assert arr.type == expected_type + + def test_sequence_numpy_timestamp(): data = [ np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)), From 25f9b25974c7a7482f476b183fd90218041d5e0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 16:58:53 +0200 Subject: [PATCH 08/25] Enable tests for nanosecond resolution --- python/pyarrow/tests/test_array.py | 3 -- python/pyarrow/tests/test_convert_builtin.py | 42 ++++++++++++++------ python/pyarrow/tests/test_types.py | 1 - 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 900fb861ead..c5a7a0716e5 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1808,9 +1808,6 @@ def test_array_from_numpy_datetimeD(): assert result.equals(expected) -# test_array_from_naive_datetimes -# test_array_from_datetimes_with_timezone - def test_array_from_naive_datetimes(): arr = pa.array([ None, diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 1f27e0fa444..444238d8f62 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -795,7 +795,7 @@ def test_date32_overflow(): (pa.time32, 's', 'int32'), (pa.time32, 'ms', 'int32'), (pa.time64, 'us', 'int64'), - # (pa.time64, 'ns'), + (pa.time64, 'ns', 'int64'), ]) def test_sequence_time_with_timezone(time_type, unit, int_type): def expected_integer_value(t): @@ -820,10 +820,8 @@ def expected_time_value(t): return t.replace(microsecond=0) elif unit == 'ms': return t.replace(microsecond=(t.microsecond // 1000) * 1000) - elif unit == 'us': - return t else: - raise ValueError("use pandas") + return t # only timezone naive times are supported in arrow data = [ @@ -886,16 +884,23 @@ def test_sequence_timestamp(): 's', 'ms', 'us', - # TODO(kszucs): 'ns' + 'ns' ]) def test_sequence_timestamp_with_timezone(timezone, unit): def expected_integer_value(dt): units = ['s', 'ms', 'us', 'ns'] multiplier = 10**(units.index(unit) * 3) - return None if dt is None else int(dt.timestamp() * multiplier) + if dt is None: + return None + else: + # avoid float precision issues + ts = decimal.Decimal(str(dt.timestamp())) + return int(ts * multiplier) def expected_datetime_value(dt): - # truncate microseconds + if dt is None: + return None + if unit == 's': dt = dt.replace(microsecond=0) elif unit == 'ms': @@ -942,11 +947,8 @@ def expected_datetime_value(dt): assert values.to_pylist() == expected # test that the scalars are datetimes with the correct timezone - assert arr[0].as_py() == expected_datetime_value(utcdata[0]) - assert arr[1].as_py() == expected_datetime_value(utcdata[1]) - assert arr[2].as_py() is None - assert arr[3].as_py() == expected_datetime_value(utcdata[3]) - assert arr[4].as_py() == expected_datetime_value(utcdata[4]) + for i in range(len(arr)): + assert arr[i].as_py() == expected_datetime_value(utcdata[i]) def test_sequence_timestamp_with_timezone_inference(): @@ -976,6 +978,22 @@ def test_sequence_timestamp_with_timezone_inference(): assert arr.type == expected_type +# @pytest.mark.pandas +# def test_nanosecond_resolution_timestamp(): +# import pandas as pd + +# data = [ +# pd.Timestamp(1184307814123456123, tz=pytz.timezone('US/Eastern'), +# unit='ns'), +# datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive +# pytz.utc.localize( +# datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) +# ), +# None, +# ] +# assert pa.array(data).type == pa.timestamp('ns', tz='US/Eastern') + + def test_sequence_numpy_timestamp(): data = [ np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)), diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 6ebb937b8f4..5980de927d0 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -258,7 +258,6 @@ def test_is_primitive(): (pytz.utc, 'UTC'), (pytz.timezone('Europe/Paris'), 'Europe/Paris'), (pytz.FixedOffset(180), '+03:00'), - # (datetime.timezone.utc, 'UTC'), (datetime.timezone.utc, '+00:00'), (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30') ]) From 33b978f8e6bdb479885ebeb30a9b553423db3d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 17:20:33 +0200 Subject: [PATCH 09/25] Fix array from scalar test case --- python/pyarrow/tests/test_array.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index c5a7a0716e5..7c1f0248d26 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -329,7 +329,14 @@ def test_array_from_scalar(): (pa.scalar(True), 11, pa.array([True] * 11)), (today, 2, pa.array([today] * 2)), (now, 10, pa.array([now] * 10)), - (now_with_tz, 2, pa.array([now_utc] * 2)), + ( + now_with_tz, + 2, + pa.array( + [now_utc] * 2, + type=pa.timestamp('us', tz=pytz.timezone('US/Eastern')) + ) + ), (now.time(), 9, pa.array([now.time()] * 9)), (oneday, 4, pa.array([oneday] * 4)), (False, 9, pa.array([False] * 9)), From 973259d652f0a4fb66b56f77c0f3ce0d6ff5ebf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 17:25:51 +0200 Subject: [PATCH 10/25] Convert from mixed builtin and pandas datetimes --- python/pyarrow/tests/test_convert_builtin.py | 40 +++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 444238d8f62..a1012602891 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -978,20 +978,32 @@ def test_sequence_timestamp_with_timezone_inference(): assert arr.type == expected_type -# @pytest.mark.pandas -# def test_nanosecond_resolution_timestamp(): -# import pandas as pd - -# data = [ -# pd.Timestamp(1184307814123456123, tz=pytz.timezone('US/Eastern'), -# unit='ns'), -# datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive -# pytz.utc.localize( -# datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) -# ), -# None, -# ] -# assert pa.array(data).type == pa.timestamp('ns', tz='US/Eastern') +@pytest.mark.pandas +def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes(): + import pandas as pd + + data = [ + pd.Timestamp(1184307814123456123, tz=pytz.timezone('US/Eastern'), + unit='ns'), + datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive + pytz.utc.localize( + datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) + ), + None, + ] + utcdata = [ + data[0].astimezone(pytz.utc), + pytz.utc.localize(data[1]), + data[2].astimezone(pytz.utc), + None, + ] + + arr = pa.array(data) + assert arr.type == pa.timestamp('us', tz='US/Eastern') + + values = arr.cast('int64') + expected = [int(dt.timestamp() * 10**6) if dt else None for dt in utcdata] + assert values.to_pylist() == expected def test_sequence_numpy_timestamp(): From 29527d0f607f62fe1fe8e4b08372fe6e5266af10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 17:30:34 +0200 Subject: [PATCH 11/25] Skip test_tzinfo_to_string_errors for python versions <=3.6 --- python/pyarrow/tests/test_types.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 5980de927d0..89301266938 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -18,6 +18,7 @@ from collections import OrderedDict from collections.abc import Iterator import datetime +import sys import pickle import pytest @@ -265,6 +266,10 @@ def test_tzinfo_to_string(tz, expected): assert pa.lib.tzinfo_to_string(tz) == expected +@pytest.mark.skipif(sys.version_info <= (3, 6), reason=( + "Since python 3.7 the UTC offset for datetime.timezone is not restricted " + "to a whole number of minutes" +)) def test_tzinfo_to_string_errors(): msg = "Offset must represent whole number of minutes" with pytest.raises(ValueError, match=msg): From 2be7a9e617a5c961f8b9c44f74cf69928329517f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 22 Jul 2020 17:42:29 +0200 Subject: [PATCH 12/25] Fix skip condition --- python/pyarrow/tests/test_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 89301266938..f2764ba109a 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -266,7 +266,7 @@ def test_tzinfo_to_string(tz, expected): assert pa.lib.tzinfo_to_string(tz) == expected -@pytest.mark.skipif(sys.version_info <= (3, 6), reason=( +@pytest.mark.skipif(sys.version_info <= (3, 7), reason=( "Since python 3.7 the UTC offset for datetime.timezone is not restricted " "to a whole number of minutes" )) From d5c890f606bc7f6d37f89fde962a9814e503123f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 1 Aug 2020 03:49:53 +0000 Subject: [PATCH 13/25] fix typos --- cpp/src/arrow/python/arrow_to_pandas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index bab18a7ceb0..f785c205b30 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -657,7 +657,7 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da for (int32_t i = 0; i < num_fields; i++) { PyObject* numpy_array; std::shared_ptr field = arr->field(static_cast(i)); - // Seen notes above about timstamp conversion. Don't blindly convert because + // See notes above about timestamp conversion. Don't blindly convert because // timestamps in lists are handled differently. modified_options.timestamp_as_object = field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object; From 6a5d9f819d1d9aca851eab8b4beddf76cf3c8515 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 1 Aug 2020 05:13:07 +0000 Subject: [PATCH 14/25] Add fallback parameter to ignore timezone in C++ python code --- cpp/src/arrow/python/arrow_to_pandas.cc | 2 +- cpp/src/arrow/python/arrow_to_pandas.h | 5 + cpp/src/arrow/python/python_to_arrow.cc | 131 +++++++++++++++--------- cpp/src/arrow/python/python_to_arrow.h | 8 +- 4 files changed, 92 insertions(+), 54 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index f785c205b30..345232e4230 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -954,7 +954,7 @@ struct ObjectWriterVisitor { enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); OwnedRef tzinfo; - if (!type.timezone().empty()) { + if (!type.timezone().empty() && !options.ignore_timezone) { ARROW_ASSIGN_OR_RAISE(tzinfo, internal::StringToTzinfo(type.timezone())); RETURN_IF_PYERROR(); } diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 2c5db8bdaa9..abf4bbdef0d 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -56,6 +56,11 @@ struct PandasOptions { /// Coerce all date and timestamp to datetime64[ns] bool coerce_temporal_nanoseconds = false; + /// Used to maintain backwards compatibility for + /// timezone bugs (see ARROW-9528). Should be removed + /// after Arrow 2.0 release. + bool ignore_timezone = false; + /// \brief If true, do not create duplicate PyObject versions of equal /// objects. This only applies to immutable objects like strings or datetime /// objects diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index e90630fbc64..5340f8e7b51 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -191,7 +191,8 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { + static inline Result FromPython(PyObject* obj, TimeUnit::type unit, + bool /*ignore_timezone*/) { int32_t value; if (PyTime_Check(obj)) { // TODO(kszucs): consider to raise if a timezone aware time object is encountered @@ -215,7 +216,8 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { + static inline Result FromPython(PyObject* obj, TimeUnit::type unit, + bool /*ignore_timezone=*/) { int64_t value; if (PyTime_Check(obj)) { // TODO(kszucs): consider to raise if a timezone aware time object is encountered @@ -239,10 +241,14 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { + static inline Result FromPython(PyObject* obj, TimeUnit::type unit, + bool ignore_timezone) { int64_t value; if (PyDateTime_Check(obj)) { ARROW_ASSIGN_OR_RAISE(auto offset, internal::PyDateTime_utcoffset_s(obj)); + if (ignore_timezone) { + offset = 0; + } auto dt = reinterpret_cast(obj); switch (unit) { case TimeUnit::SECOND: @@ -286,7 +292,8 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { + static inline Result FromPython(PyObject* obj, TimeUnit::type unit, + bool /*ignore_timezone*/) { int64_t value; if (PyDelta_Check(obj)) { auto dt = reinterpret_cast(obj); @@ -392,7 +399,8 @@ class SeqConverter; // Forward-declare converter factory Status GetConverter(const std::shared_ptr& type, bool from_pandas, - bool strict_conversions, std::unique_ptr* out); + bool strict_conversions, bool ignore_timezone, + std::unique_ptr* out); // Marshal Python sequence (list, tuple, etc.) to Arrow array class SeqConverter { @@ -527,16 +535,19 @@ class PrimitiveConverter : public TypedConverter { template class TimeConverter : public TypedConverter { public: - explicit TimeConverter(TimeUnit::type unit) : unit_(unit) {} + explicit TimeConverter(TimeUnit::type unit, bool ignore_timezone) + : unit_(unit), ignore_timezone_(ignore_timezone) {} // TODO(kszucs): support numpy values for date and time converters Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); + ARROW_ASSIGN_OR_RAISE(auto value, + ValueConverter::FromPython(obj, unit_, ignore_timezone_)); return this->typed_builder_->Append(value); } protected: TimeUnit::type unit_; + bool ignore_timezone_; }; // TODO(kszucs): move it to the type_traits @@ -572,7 +583,10 @@ class TemporalConverter : public TimeConverter { return this->typed_builder_->AppendNull(); } } else { - ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromPython(obj, this->unit_)); + ARROW_ASSIGN_OR_RAISE( + value, + ValueConverter::FromPython( + obj, this->unit_, TimeConverter::ignore_timezone_)); } return this->typed_builder_->Append(value); } @@ -713,16 +727,19 @@ class BaseListConverter : public TypedConverter { public: using BuilderType = typename TypeTraits::BuilderType; - explicit BaseListConverter(bool from_pandas, bool strict_conversions) - : from_pandas_(from_pandas), strict_conversions_(strict_conversions) {} + explicit BaseListConverter(bool from_pandas, bool strict_conversions, + bool ignore_timezone) + : from_pandas_(from_pandas), + strict_conversions_(strict_conversions), + ignore_timezone_(ignore_timezone) {} Status Init(ArrayBuilder* builder) override { this->builder_ = builder; this->typed_builder_ = checked_cast(builder); this->value_type_ = checked_cast(*builder->type()).value_type(); - RETURN_NOT_OK( - GetConverter(value_type_, from_pandas_, strict_conversions_, &value_converter_)); + RETURN_NOT_OK(GetConverter(value_type_, from_pandas_, strict_conversions_, + ignore_timezone_, &value_converter_)); return this->value_converter_->Init(this->typed_builder_->value_builder()); } @@ -832,8 +849,9 @@ class BaseListConverter : public TypedConverter { protected: std::shared_ptr value_type_; std::unique_ptr value_converter_; - bool from_pandas_; - bool strict_conversions_; + const bool from_pandas_; + const bool strict_conversions_; + const bool ignore_timezone_; }; template @@ -893,8 +911,8 @@ class MapConverter : public BaseListConverter { public: using BASE = BaseListConverter; - explicit MapConverter(bool from_pandas, bool strict_conversions) - : BASE(from_pandas, strict_conversions), key_builder_(nullptr) {} + explicit MapConverter(bool from_pandas, bool strict_conversions, bool ignore_timezone) + : BASE(from_pandas, strict_conversions, ignore_timezone), key_builder_(nullptr) {} Status Append(PyObject* obj) override { RETURN_NOT_OK(BASE::Append(obj)); @@ -936,8 +954,11 @@ class MapConverter : public BaseListConverter { template class StructConverter : public TypedConverter { public: - explicit StructConverter(bool from_pandas, bool strict_conversions) - : from_pandas_(from_pandas), strict_conversions_(strict_conversions) {} + explicit StructConverter(bool from_pandas, bool strict_conversions, + bool ignore_timezone) + : from_pandas_(from_pandas), + strict_conversions_(strict_conversions), + ignore_timezone_(ignore_timezone) {} Status Init(ArrayBuilder* builder) override { this->builder_ = builder; @@ -957,8 +978,8 @@ class StructConverter : public TypedConverter { std::shared_ptr field_type(struct_type->field(i)->type()); std::unique_ptr value_converter; - RETURN_NOT_OK( - GetConverter(field_type, from_pandas_, strict_conversions_, &value_converter)); + RETURN_NOT_OK(GetConverter(field_type, from_pandas_, strict_conversions_, + ignore_timezone_, &value_converter)); RETURN_NOT_OK(value_converter->Init(this->typed_builder_->field_builder(i))); value_converters_.push_back(std::move(value_converter)); @@ -1076,6 +1097,7 @@ class StructConverter : public TypedConverter { } dict_key_kind_ = DictKeyKind::UNKNOWN; bool from_pandas_; bool strict_conversions_; + bool ignore_timezone_; }; template @@ -1112,7 +1134,7 @@ class DecimalConverter : public TypedConverter Status GetConverterFlat(const std::shared_ptr& type, bool strict_conversions, - std::unique_ptr* out) { + bool ignore_timezone, std::unique_ptr* out) { switch (type->id()) { SIMPLE_CONVERTER_CASE(NA, NullConverter); PRIMITIVE(BOOL, BooleanType); @@ -1162,26 +1184,27 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve break; case Type::TIME32: { auto unit = checked_cast(*type).unit(); - *out = - std::unique_ptr(new TimeConverter(unit)); + *out = std::unique_ptr( + new TimeConverter(unit, ignore_timezone)); break; } case Type::TIME64: { auto unit = checked_cast(*type).unit(); - *out = - std::unique_ptr(new TimeConverter(unit)); + *out = std::unique_ptr( + new TimeConverter(unit, ignore_timezone)); break; } case Type::TIMESTAMP: { auto unit = checked_cast(*type).unit(); *out = std::unique_ptr( - new TemporalConverter(unit)); + new TemporalConverter(unit, ignore_timezone)); break; } case Type::DURATION: { auto unit = checked_cast(*type).unit(); - *out = std::unique_ptr( - new TemporalConverter(unit)); + *out = + std::unique_ptr(new TemporalConverter( + unit, /*ignore_timezone=*/false)); break; } default: @@ -1192,7 +1215,8 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve } Status GetConverter(const std::shared_ptr& type, bool from_pandas, - bool strict_conversions, std::unique_ptr* out) { + bool strict_conversions, bool ignore_timezone, + std::unique_ptr* out) { if (from_pandas) { // ARROW-842: If pandas is not installed then null checks will be less // comprehensive, but that is okay. @@ -1204,53 +1228,53 @@ Status GetConverter(const std::shared_ptr& type, bool from_pandas, if (from_pandas) { *out = std::unique_ptr( new ListConverter( - from_pandas, strict_conversions)); + from_pandas, strict_conversions, ignore_timezone)); } else { *out = std::unique_ptr( - new ListConverter(from_pandas, - strict_conversions)); + new ListConverter( + from_pandas, strict_conversions, ignore_timezone)); } return Status::OK(); case Type::LARGE_LIST: if (from_pandas) { *out = std::unique_ptr( new ListConverter( - from_pandas, strict_conversions)); + from_pandas, strict_conversions, ignore_timezone)); } else { *out = std::unique_ptr( - new ListConverter(from_pandas, - strict_conversions)); + new ListConverter( + from_pandas, strict_conversions, ignore_timezone)); } return Status::OK(); case Type::MAP: if (from_pandas) { *out = std::unique_ptr(new MapConverter( - from_pandas, strict_conversions)); + from_pandas, strict_conversions, ignore_timezone)); } else { - *out = std::unique_ptr( - new MapConverter(from_pandas, strict_conversions)); + *out = std::unique_ptr(new MapConverter( + from_pandas, strict_conversions, ignore_timezone)); } return Status::OK(); case Type::FIXED_SIZE_LIST: if (from_pandas) { *out = std::unique_ptr( - new FixedSizeListConverter(from_pandas, - strict_conversions)); + new FixedSizeListConverter( + from_pandas, strict_conversions, ignore_timezone)); } else { *out = std::unique_ptr( - new FixedSizeListConverter(from_pandas, - strict_conversions)); + new FixedSizeListConverter( + from_pandas, strict_conversions, ignore_timezone)); } return Status::OK(); case Type::STRUCT: if (from_pandas) { *out = std::unique_ptr( - new StructConverter(from_pandas, - strict_conversions)); + new StructConverter( + from_pandas, strict_conversions, ignore_timezone)); } else { - *out = std::unique_ptr( - new StructConverter(from_pandas, strict_conversions)); + *out = std::unique_ptr(new StructConverter( + from_pandas, strict_conversions, ignore_timezone)); } return Status::OK(); default: @@ -1258,10 +1282,11 @@ Status GetConverter(const std::shared_ptr& type, bool from_pandas, } if (from_pandas) { - RETURN_NOT_OK( - GetConverterFlat(type, strict_conversions, out)); + RETURN_NOT_OK(GetConverterFlat(type, strict_conversions, + ignore_timezone, out)); } else { - RETURN_NOT_OK(GetConverterFlat(type, strict_conversions, out)); + RETURN_NOT_OK(GetConverterFlat(type, strict_conversions, + ignore_timezone, out)); } return Status::OK(); } @@ -1332,6 +1357,10 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, if (options.type == nullptr) { RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); + if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { + const auto& ts_type = checked_cast(*real_type); + real_type = timestamp(ts_type.unit()); + } } else { real_type = options.type; strict_conversions = true; @@ -1340,8 +1369,8 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, // Create the sequence converter, initialize with the builder std::unique_ptr converter; - RETURN_NOT_OK( - GetConverter(real_type, options.from_pandas, strict_conversions, &converter)); + RETURN_NOT_OK(GetConverter(real_type, options.from_pandas, strict_conversions, + options.ignore_timezone, &converter)); // Create ArrayBuilder for type, then pass into the SeqConverter // instance. The reason this is created here rather than in GetConverter is diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 5c8052ac6ce..5108e752e8f 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -54,8 +54,12 @@ struct PyConversionOptions { // Memory pool to use for allocations MemoryPool* pool; - // Default false - bool from_pandas; + bool from_pandas = false; + + /// Used to maintain backwards compatibility for + /// timezone bugs (see ARROW-9528). Should be removed + /// after Arrow 2.0 release. + bool ignore_timezone = false; }; /// \brief Convert sequence (list, generator, NumPy array with dtype object) of From 263517d779b7a432b126213d5b4f3e43db3e324d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 2 Aug 2020 04:13:33 +0000 Subject: [PATCH 15/25] add ARROW_NO_TZ env variable to revert back to old behavior --- python/pyarrow/array.pxi | 3 +++ python/pyarrow/includes/libarrow.pxd | 2 ++ 2 files changed, 5 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 158ea953d33..8089a05da66 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +import os import warnings @@ -31,6 +32,7 @@ cdef _sequence_to_array(object sequence, object mask, object size, options.pool = pool options.from_pandas = from_pandas + options.ignore_timezone = os.environ.get('ARROW_NO_TZ', False) cdef shared_ptr[CChunkedArray] out @@ -730,6 +732,7 @@ cdef PandasOptions _convert_pandas_options(dict options): result.safe_cast = options['safe'] result.split_blocks = options['split_blocks'] result.self_destruct = options['self_destruct'] + result.ignore_timezone = os.environ.get('ARROW_NO_TZ', False) return result diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 07cfccc50c8..f25e376946e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1753,6 +1753,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: int64_t size CMemoryPool* pool c_bool from_pandas + c_bool ignore_timezone # TODO Some functions below are not actually "nogil" @@ -1875,6 +1876,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool timestamp_as_object c_bool use_threads c_bool coerce_temporal_nanoseconds + c_bool ignore_timezone c_bool deduplicate_objects c_bool safe_cast c_bool split_blocks From 9680e1c8e8d6b29d6322724c081ba62b96984215 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 3 Aug 2020 05:43:41 +0000 Subject: [PATCH 16/25] use env variable in psark script --- ci/scripts/integration_spark.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh index 9828a28a1ec..06166a7d8b9 100755 --- a/ci/scripts/integration_spark.sh +++ b/ci/scripts/integration_spark.sh @@ -22,6 +22,9 @@ source_dir=${1} spark_dir=${2} spark_version=${SPARK_VERSION:-master} +# Use old behavior that always dropped tiemzones. +export ARROW_NO_TZ=1 + if [ "${SPARK_VERSION:0:2}" == "2." ]; then # https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x export ARROW_PRE_0_15_IPC_FORMAT=1 From c8c22f9d3869885d3c2613bb4587243665bc828e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 9 Aug 2020 03:50:42 +0000 Subject: [PATCH 17/25] ARROW_NO_TZ -> PYARROW_IGNORE_TZ --- ci/scripts/integration_spark.sh | 2 +- python/pyarrow/array.pxi | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh index 06166a7d8b9..2d02eb81d47 100755 --- a/ci/scripts/integration_spark.sh +++ b/ci/scripts/integration_spark.sh @@ -23,7 +23,7 @@ spark_dir=${2} spark_version=${SPARK_VERSION:-master} # Use old behavior that always dropped tiemzones. -export ARROW_NO_TZ=1 +export PYARROW_IGNORE_TZ=1 if [ "${SPARK_VERSION:0:2}" == "2." ]; then # https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 8089a05da66..287f72185e4 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -32,7 +32,7 @@ cdef _sequence_to_array(object sequence, object mask, object size, options.pool = pool options.from_pandas = from_pandas - options.ignore_timezone = os.environ.get('ARROW_NO_TZ', False) + options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TZ', False) cdef shared_ptr[CChunkedArray] out @@ -732,7 +732,7 @@ cdef PandasOptions _convert_pandas_options(dict options): result.safe_cast = options['safe'] result.split_blocks = options['split_blocks'] result.self_destruct = options['self_destruct'] - result.ignore_timezone = os.environ.get('ARROW_NO_TZ', False) + result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TZ', False) return result From 463a0c487bf2e299e19830a641ab5d37bebd769c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Aug 2020 16:24:08 +0200 Subject: [PATCH 18/25] address review comments --- .../arrow/compute/kernels/scalar_string.cc | 4 +- cpp/src/arrow/python/arrow_to_pandas.cc | 40 ++++++++++++------- cpp/src/arrow/python/common.h | 5 --- cpp/src/arrow/python/datetime.h | 12 +++++- cpp/src/arrow/python/python_to_arrow.cc | 4 +- 5 files changed, 41 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 7e616176444..0332be9dc5c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -861,10 +861,10 @@ void AddBinaryLength(FunctionRegistry* registry) { applicator::ScalarUnaryNotNull::Exec; ArrayKernelExec exec_offset_64 = applicator::ScalarUnaryNotNull::Exec; - for (const auto& input_type : {binary(), utf8()}) { + for (const auto input_type : {binary(), utf8()}) { DCHECK_OK(func->AddKernel({input_type}, int32(), exec_offset_32)); } - for (const auto& input_type : {large_binary(), large_utf8()}) { + for (const auto input_type : {large_binary(), large_utf8()}) { DCHECK_OK(func->AddKernel({input_type}, int64(), exec_offset_64)); } DCHECK_OK(registry->AddFunction(std::move(func))); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 345232e4230..7c317340470 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -952,26 +952,38 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { + PyObject* tzinfo = nullptr; const TimeUnit::type unit = type.unit(); - OwnedRef tzinfo; - if (!type.timezone().empty() && !options.ignore_timezone) { - ARROW_ASSIGN_OR_RAISE(tzinfo, internal::StringToTzinfo(type.timezone())); - RETURN_IF_PYERROR(); - } - auto WrapValue = [&](typename Type::c_type value, PyObject** out) { + + auto ConvertTimezoneNaive = [&](typename Type::c_type value, PyObject** out) { RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); RETURN_IF_PYERROR(); - if (tzinfo.obj() != nullptr) { - PyObject* with_tz = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", *out); - RETURN_IF_PYERROR(); - Py_DECREF(*out); - *out = with_tz; - } - + return Status::OK(); + }; + auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) { + PyObject* tz_naive; + RETURN_NOT_OK(ConvertTimezoneNaive(value, &tz_naive)); + // convert the timezone naive datetime object to timezone aware + *out = PyObject_CallMethod(tzinfo, "fromutc", "O", tz_naive); RETURN_IF_PYERROR(); + // the timezone naive object is no longer required + Py_DECREF(tz_naive); return Status::OK(); }; - return ConvertAsPyObjects(options, data, WrapValue, out_values); + + if (!type.timezone().empty() && !options.ignore_timezone) { + // convert timezone aware + ARROW_ASSIGN_OR_RAISE(tzinfo, internal::StringToTzinfo(type.timezone())); + RETURN_IF_PYERROR(); + RETURN_NOT_OK( + ConvertAsPyObjects(options, data, ConvertTimezoneAware, out_values)); + } else { + // convert timezone naive + RETURN_NOT_OK( + ConvertAsPyObjects(options, data, ConvertTimezoneNaive, out_values)); + } + + return Status::OK(); } Status Visit(const Decimal128Type& type) { diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index b32b5f91906..52a3f334d4e 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -137,11 +137,6 @@ class ARROW_PYTHON_EXPORT OwnedRef { OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {} explicit OwnedRef(PyObject* obj) : obj_(obj) {} - OwnedRef& operator=(PyObject* obj) { - obj_ = obj; - return *this; - } - OwnedRef& operator=(OwnedRef&& other) { obj_ = other.detach(); return *this; diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 082b8e2349f..4f3adb4cd53 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -163,13 +163,23 @@ Result PyDateTime_utcoffset_s(PyObject* pydatetime); /// \brief Convert a time zone name into a time zone object. /// /// Supported input strings are: -/// * As used in the Olson time zone database (the "tz database" or +/// * As used in the Olson time zone database (the "tz database" or /// "tzdata"), such as "America/New_York" /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 /// GIL must be held when calling this method. ARROW_PYTHON_EXPORT Result StringToTzinfo(const std::string& tz); +/// \brief Convert a time zone object to a string representation. +/// +/// The output strings are: +/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 +/// if the input object is either an instance of pytz._FixedOffset or +/// datetime.timedelta +/// * The timezone's name if the input object's tzname() method returns with a +/// non-empty timezone name such as "UTC" or "America/New_York" +/// +/// GIL must be held when calling this method. ARROW_PYTHON_EXPORT Result TzinfoToString(PyObject* pytzinfo); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 5340f8e7b51..0ab3415671f 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -245,7 +245,7 @@ struct ValueConverter { bool ignore_timezone) { int64_t value; if (PyDateTime_Check(obj)) { - ARROW_ASSIGN_OR_RAISE(auto offset, internal::PyDateTime_utcoffset_s(obj)); + ARROW_ASSIGN_OR_RAISE(int64_t offset, internal::PyDateTime_utcoffset_s(obj)); if (ignore_timezone) { offset = 0; } @@ -1358,7 +1358,7 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, if (options.type == nullptr) { RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { - const auto& ts_type = checked_cast(*real_type); + const auto& ts_type = checked_cast(*real_type); real_type = timestamp(ts_type.unit()); } } else { From d4cc5194c6336fec78b0b8df040ad8e4b812d09b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Aug 2020 17:33:07 +0200 Subject: [PATCH 19/25] properly test nested struct roundtrip --- python/pyarrow/tests/test_pandas.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 14feeda0fc9..2d66a320481 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3334,12 +3334,8 @@ def test_nested_with_timestamp_tz_round_trip(): struct = pa.StructArray.from_arrays([arr, arr], ['start', 'stop']) result = struct.to_pandas() - # N.B. we test with Pandas because the Arrow types are not - # actually equal. All Timezone aware times are currently normalized - # to "UTC" as the timestamp type, but since this conversion results - # in object dtypes, and tzinfo is preserved the comparison should - # result in equality. - pd.testing.assert_series_equal(pa.array(result).to_pandas(), result) + restored = pa.array(result) + assert restored.equals(struct) def test_nested_with_timestamp_tz(): From 97e42376ebc5ccf835e213a483a64cb06c05b6ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Aug 2020 18:14:28 +0200 Subject: [PATCH 20/25] address review comments --- cpp/src/arrow/python/arrow_to_pandas.cc | 11 ++++++----- cpp/src/arrow/python/datetime.cc | 20 +++++++------------- python/pyarrow/tests/test_types.py | 15 +++++---------- 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 7c317340470..d65ff8682da 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -961,13 +961,13 @@ struct ObjectWriterVisitor { return Status::OK(); }; auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) { - PyObject* tz_naive; - RETURN_NOT_OK(ConvertTimezoneNaive(value, &tz_naive)); + PyObject* naive_datetime; + RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime)); // convert the timezone naive datetime object to timezone aware - *out = PyObject_CallMethod(tzinfo, "fromutc", "O", tz_naive); - RETURN_IF_PYERROR(); + *out = PyObject_CallMethod(tzinfo, "fromutc", "O", naive_datetime); // the timezone naive object is no longer required - Py_DECREF(tz_naive); + Py_DECREF(naive_datetime); + RETURN_IF_PYERROR(); return Status::OK(); }; @@ -977,6 +977,7 @@ struct ObjectWriterVisitor { RETURN_IF_PYERROR(); RETURN_NOT_OK( ConvertAsPyObjects(options, data, ConvertTimezoneAware, out_values)); + Py_DECREF(tzinfo); } else { // convert timezone naive RETURN_NOT_OK( diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index f6d53d0997d..72724210f8e 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -319,12 +319,12 @@ Result PyDateTime_utcoffset_s(PyObject* obj) { } } -Result PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo, PyObject* pydelta_class) { +Result PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) { // attempt to convert timezone offset objects to "+/-{hh}:{mm}" format OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None)); RETURN_IF_PYERROR(); - if (!PyObject_IsInstance(pydelta_object.obj(), pydelta_class)) { + if (!PyDelta_Check(pydelta_object.obj())) { return Status::Invalid( "Object returned by tzinfo.utcoffset(None) is not an instance of " "datetime.timedelta"); @@ -396,9 +396,7 @@ Result StringToTzinfo(const std::string& tz) { Result TzinfoToString(PyObject* tzinfo) { OwnedRef module_pytz; // import pytz OwnedRef module_datetime; // import datetime - OwnedRef class_tzinfo; // from datetime import tzinfo OwnedRef class_timezone; // from datetime import timezone - OwnedRef class_timedelta; // from datetime import timedelta OwnedRef class_fixedoffset; // from pytz import _FixedOffset // import necessary modules @@ -407,23 +405,19 @@ Result TzinfoToString(PyObject* tzinfo) { // import necessary classes RETURN_NOT_OK( internal::ImportFromModule(module_pytz.obj(), "_FixedOffset", &class_fixedoffset)); - RETURN_NOT_OK( - internal::ImportFromModule(module_datetime.obj(), "tzinfo", &class_tzinfo)); RETURN_NOT_OK( internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone)); - RETURN_NOT_OK( - internal::ImportFromModule(module_datetime.obj(), "timedelta", &class_timedelta)); // check that it's a valid tzinfo object - if (!PyObject_IsInstance(tzinfo, class_tzinfo.obj())) { - return Status::Invalid("Not an instance of datetime.tzinfo"); + if (!PyTZInfo_Check(tzinfo)) { + return Status::TypeError("Not an instance of datetime.tzinfo"); } - // if tzinfo is an instance of pytz._FixedOffset or datetime.timedelta return the + // if tzinfo is an instance of pytz._FixedOffset or datetime.timezone return the // HH:MM offset string representation if (PyObject_IsInstance(tzinfo, class_timezone.obj()) || PyObject_IsInstance(tzinfo, class_fixedoffset.obj())) { - return PyTZInfo_utcoffset_hhmm(tzinfo, class_timedelta.obj()); + return PyTZInfo_utcoffset_hhmm(tzinfo); } // attempt to call tzinfo.tzname(None) @@ -436,7 +430,7 @@ Result TzinfoToString(PyObject* tzinfo) { } // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None) - return PyTZInfo_utcoffset_hhmm(tzinfo, class_timedelta.obj()); + return PyTZInfo_utcoffset_hhmm(tzinfo); } } // namespace internal diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index f2764ba109a..4aa013a6f34 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -18,7 +18,6 @@ from collections import OrderedDict from collections.abc import Iterator import datetime -import sys import pickle import pytest @@ -266,20 +265,16 @@ def test_tzinfo_to_string(tz, expected): assert pa.lib.tzinfo_to_string(tz) == expected -@pytest.mark.skipif(sys.version_info <= (3, 7), reason=( - "Since python 3.7 the UTC offset for datetime.timezone is not restricted " - "to a whole number of minutes" -)) def test_tzinfo_to_string_errors(): + msg = "Not an instance of datetime.tzinfo" + with pytest.raises(TypeError): + pa.lib.tzinfo_to_string("Europe/Budapest") + + tz = datetime.timezone(datetime.timedelta(hours=1, seconds=30)) msg = "Offset must represent whole number of minutes" with pytest.raises(ValueError, match=msg): - tz = datetime.timezone(datetime.timedelta(hours=1, seconds=30)) pa.lib.tzinfo_to_string(tz) - msg = "Not an instance of datetime.tzinfo" - with pytest.raises(ValueError): - pa.lib.tzinfo_to_string("Europe/Budapest") - def test_convert_custom_tzinfo_objects_to_string(): class CorrectTimezone1(datetime.tzinfo): From 498d743f2ebbed055ffa369a2b1473ee957f649c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Aug 2020 18:30:18 +0200 Subject: [PATCH 21/25] use OwnedRef to cleanup tzobject --- cpp/src/arrow/python/arrow_to_pandas.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index d65ff8682da..a5f1aa4179f 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -952,8 +952,8 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { - PyObject* tzinfo = nullptr; const TimeUnit::type unit = type.unit(); + OwnedRef tzinfo; auto ConvertTimezoneNaive = [&](typename Type::c_type value, PyObject** out) { RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); @@ -964,7 +964,7 @@ struct ObjectWriterVisitor { PyObject* naive_datetime; RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime)); // convert the timezone naive datetime object to timezone aware - *out = PyObject_CallMethod(tzinfo, "fromutc", "O", naive_datetime); + *out = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", naive_datetime); // the timezone naive object is no longer required Py_DECREF(naive_datetime); RETURN_IF_PYERROR(); @@ -973,11 +973,12 @@ struct ObjectWriterVisitor { if (!type.timezone().empty() && !options.ignore_timezone) { // convert timezone aware - ARROW_ASSIGN_OR_RAISE(tzinfo, internal::StringToTzinfo(type.timezone())); + PyObject *tzobj; + ARROW_ASSIGN_OR_RAISE(tzobj, internal::StringToTzinfo(type.timezone())); + tzinfo.reset(tzobj); RETURN_IF_PYERROR(); RETURN_NOT_OK( ConvertAsPyObjects(options, data, ConvertTimezoneAware, out_values)); - Py_DECREF(tzinfo); } else { // convert timezone naive RETURN_NOT_OK( From 2f5dbf63953435aedc1a2e175670c44376c94042 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Aug 2020 22:13:08 +0200 Subject: [PATCH 22/25] fix test_tzinfo_to_string_errors test case --- python/pyarrow/tests/test_types.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 4aa013a6f34..c52751e91ac 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -18,6 +18,7 @@ from collections import OrderedDict from collections.abc import Iterator import datetime +import sys import pickle import pytest @@ -270,10 +271,13 @@ def test_tzinfo_to_string_errors(): with pytest.raises(TypeError): pa.lib.tzinfo_to_string("Europe/Budapest") - tz = datetime.timezone(datetime.timedelta(hours=1, seconds=30)) - msg = "Offset must represent whole number of minutes" - with pytest.raises(ValueError, match=msg): - pa.lib.tzinfo_to_string(tz) + if sys.version_info >= (3, 8): + # before 3.8 it was only possible to create timezone objects with whole + # number of minutes + tz = datetime.timezone(datetime.timedelta(hours=1, seconds=30)) + msg = "Offset must represent whole number of minutes" + with pytest.raises(ValueError, match=msg): + pa.lib.tzinfo_to_string(tz) def test_convert_custom_tzinfo_objects_to_string(): From f5d44e73ae14c13bb5d027e3bb32427d4e6aa32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Aug 2020 22:13:53 +0200 Subject: [PATCH 23/25] clang format --- cpp/src/arrow/python/arrow_to_pandas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index a5f1aa4179f..47d86f55157 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -973,7 +973,7 @@ struct ObjectWriterVisitor { if (!type.timezone().empty() && !options.ignore_timezone) { // convert timezone aware - PyObject *tzobj; + PyObject* tzobj; ARROW_ASSIGN_OR_RAISE(tzobj, internal::StringToTzinfo(type.timezone())); tzinfo.reset(tzobj); RETURN_IF_PYERROR(); From 880f344471d6a37894556b5f48c77d674951cb79 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 15 Aug 2020 05:23:20 +0000 Subject: [PATCH 24/25] TZ->TIMEZONE --- ci/scripts/integration_spark.sh | 2 +- python/pyarrow/array.pxi | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh index 2d02eb81d47..a45ed7a7125 100755 --- a/ci/scripts/integration_spark.sh +++ b/ci/scripts/integration_spark.sh @@ -23,7 +23,7 @@ spark_dir=${2} spark_version=${SPARK_VERSION:-master} # Use old behavior that always dropped tiemzones. -export PYARROW_IGNORE_TZ=1 +export PYARROW_IGNORE_TIMEZONE=1 if [ "${SPARK_VERSION:0:2}" == "2." ]; then # https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 287f72185e4..34417da63ff 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -32,7 +32,7 @@ cdef _sequence_to_array(object sequence, object mask, object size, options.pool = pool options.from_pandas = from_pandas - options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TZ', False) + options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) cdef shared_ptr[CChunkedArray] out @@ -732,7 +732,7 @@ cdef PandasOptions _convert_pandas_options(dict options): result.safe_cast = options['safe'] result.split_blocks = options['split_blocks'] result.self_destruct = options['self_destruct'] - result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TZ', False) + result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) return result From 7a658cd99d833cef20323bb7cc03d804da0dc830 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 15 Aug 2020 05:39:44 +0000 Subject: [PATCH 25/25] fix doc comments --- cpp/src/arrow/python/arrow_to_pandas.cc | 1 + cpp/src/arrow/python/datetime.cc | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 47d86f55157..47b62a35521 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -648,6 +648,7 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da // units second through microsecond but PyLong for nanosecond (because // datetime.datetime does not support nanoseconds). // We force the object conversion to preserve the value of the timezone. + // Nanoseconds are returned integers inside of structs. PandasOptions modified_options = options; modified_options.coerce_temporal_nanoseconds = false; diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 72724210f8e..4eeab7f5a69 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -354,7 +354,6 @@ Result PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) { return stream.str(); } -// GIL must be held when calling this function. // Converted from python. See https://github.com/apache/arrow/pull/7604 // for details. Result StringToTzinfo(const std::string& tz) {