From 12840af97c9903a36877f7e494ed54fccbde1217 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 19 Jul 2020 18:57:57 -0500 Subject: [PATCH] Revert "ARROW-9223: [Python] Propagate timezone information in pandas conversion" This reverts commit f86c4dbbc18d9b314338a868212ca80b05d2ed55. --- cpp/src/arrow/python/arrow_to_pandas.cc | 36 ++++------- cpp/src/arrow/python/arrow_to_pandas.h | 4 ++ cpp/src/arrow/python/datetime.cc | 83 +------------------------ cpp/src/arrow/python/datetime.h | 10 --- python/pyarrow/array.pxi | 4 +- python/pyarrow/includes/libarrow.pxd | 1 - python/pyarrow/tests/test_pandas.py | 49 ++++----------- python/pyarrow/types.pxi | 14 ++++- 8 files changed, 41 insertions(+), 160 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 9775bf3a0cf..bc4e25b08df 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -17,9 +17,10 @@ // Functions for pandas conversion via NumPy -#include "arrow/python/arrow_to_pandas.h" #include "arrow/python/numpy_interop.h" // IWYU pragma: expand +#include "arrow/python/arrow_to_pandas.h" + #include #include #include @@ -641,14 +642,15 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da std::vector fields_data(num_fields); OwnedRef dict_item; - // In ARROW-7723, we found as a result of ARROW-3789 that second + // XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second // through microsecond resolution tz-aware timestamps were being promoted to // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy // array in this function. PyArray_GETITEM returns datetime.datetime for // units second through microsecond but PyLong for nanosecond (because - // datetime.datetime does not support nanoseconds). - // We force the object conversion to preserve the value of the timezone. + // datetime.datetime does not support nanoseconds). We inserted this hack to + // preserve the <= 0.15.1 behavior until a better solution can be devised PandasOptions modified_options = options; + modified_options.ignore_timezone = true; modified_options.coerce_temporal_nanoseconds = false; for (int c = 0; c < data.num_chunks(); c++) { @@ -656,12 +658,8 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { PyObject* numpy_array; - std::shared_ptr field = arr->field(static_cast(i)); - // Seen notes above about timstamp conversion. Don't blindly convert because - // timestamps in lists are handled differently. - modified_options.timestamp_as_object = - field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object; - RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array)); + RETURN_NOT_OK(ConvertArrayToPandas( + modified_options, arr->field(static_cast(i)), nullptr, &numpy_array)); fields_data[i].reset(numpy_array); } @@ -953,21 +951,8 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); - OwnedRef tzinfo; - if (!type.timezone().empty()) { - RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), tzinfo.ref())); - RETURN_IF_PYERROR(); - } - auto WrapValue = [&](typename Type::c_type value, PyObject** out) { + auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); - RETURN_IF_PYERROR(); - if (tzinfo.obj() != nullptr) { - PyObject* with_tz = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", *out); - RETURN_IF_PYERROR(); - Py_DECREF(*out); - *out = with_tz; - } - RETURN_IF_PYERROR(); return Status::OK(); }; @@ -1742,7 +1727,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& // Nanoseconds are never out of bounds for pandas, so in that case // we don't convert to object *output_type = PandasWriter::OBJECT; - } else if (!ts_type.timezone().empty()) { + } else if (ts_type.timezone() != "" && !options.ignore_timezone) { + // XXX: ignore_timezone: hack here for ARROW-7723 *output_type = PandasWriter::DATETIME_NANO_TZ; } else if (options.coerce_temporal_nanoseconds) { *output_type = PandasWriter::DATETIME_NANO; diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 2c5db8bdaa9..79a72bcb1ef 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -56,6 +56,10 @@ struct PandasOptions { /// Coerce all date and timestamp to datetime64[ns] bool coerce_temporal_nanoseconds = false; + /// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion + /// path + bool ignore_timezone = false; + /// \brief If true, do not create duplicate PyObject versions of equal /// objects. This only applies to immutable objects like strings or datetime /// objects diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 19c7a4ea27d..8cec87bdd36 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -14,65 +14,22 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#include "arrow/python/datetime.h" #include #include #include #include "arrow/python/common.h" -#include "arrow/python/helpers.h" +#include "arrow/python/datetime.h" #include "arrow/python/platform.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" -#include "arrow/util/value_parsing.h" namespace arrow { namespace py { namespace internal { -namespace { - -// Same as Regex '([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$'. -// GCC 4.9 doesn't support regex, so handcode until support for it -// is dropped. -bool MatchFixedOffset(const std::string& tz, util::string_view* sign, - util::string_view* hour, util::string_view* minute) { - if (tz.size() < 5) { - return false; - } - const char* iter = tz.data(); - if (*iter == '+' || *iter == '-') { - *sign = util::string_view(iter, 1); - iter++; - if (tz.size() < 6) { - return false; - } - } - if ((((*iter == '0' || *iter == '1') && *(iter + 1) >= '0' && *(iter + 1) <= '9') || - (*iter == '2' && *(iter + 1) >= '0' && *(iter + 1) <= '3'))) { - *hour = util::string_view(iter, 2); - iter += 2; - } else { - return false; - } - if (*iter != ':') { - return false; - } - iter++; - - if (*iter >= '0' && *iter <= '5' && *(iter + 1) >= '0' && *(iter + 1) <= '9') { - *minute = util::string_view(iter, 2); - iter += 2; - } else { - return false; - } - return iter == (tz.data() + tz.size()); -} - -} // namespace - PyDateTime_CAPI* datetime_api = nullptr; void InitDatetime() { @@ -305,44 +262,6 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { PyDateTime_GET_DAY(pydate)); } -// GIL must be held when calling this function. -// Converted from python. See https://github.com/apache/arrow/pull/7604 -// for details. -Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) { - util::string_view sign_str, hour_str, minute_str; - OwnedRef pytz; - RETURN_NOT_OK(internal::ImportModule("pytz", &pytz)); - - if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { - int sign = -1; - if (sign_str == "+") { - sign = 1; - } - OwnedRef fixed_offset; - RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset)); - uint32_t minutes, hours; - if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || - !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), - &minutes)) { - return Status::Invalid("Invalid timezone: ", tz); - } - OwnedRef total_minutes(PyLong_FromLong( - sign * ((static_cast(hours) * 60) + static_cast(minutes)))); - RETURN_IF_PYERROR(); - *tzinfo = PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); - RETURN_IF_PYERROR(); - return Status::OK(); - } - - OwnedRef timezone; - RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); - OwnedRef py_tz_string( - PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); - *tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); - RETURN_IF_PYERROR(); - return Status::OK(); -} - } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index a394fa4b4bb..a8b22da4741 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -157,16 +157,6 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { return PyDelta_to_us(pytimedelta) * 1000; } -/// \brief Convert a time zone name into a time zone object. -/// -/// Supported input strings are: -/// * As used in the Olson time zone database (the "tz database" or -/// "tzdata"), such as "America/New_York" -/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 -/// GIL must be held when calling this method. -ARROW_PYTHON_EXPORT -Status StringToTzinfo(const std::string& tz, PyObject** tzinfo); - } // namespace internal } // namespace py } // namespace arrow diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 3d5dd7d482c..4e4339e4d14 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1287,9 +1287,7 @@ cdef _array_like_to_pandas(obj, options): result = pandas_api.series(arr, dtype=dtype, name=name) if (isinstance(original_type, TimestampType) and - original_type.tz is not None and - # can be object dtype for non-ns and timestamp_as_object=True - result.dtype.kind == "M"): + original_type.tz is not None): from pyarrow.pandas_compat import make_tz_aware result = make_tz_aware(result, original_type.tz) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d704af40023..555688a8385 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1926,7 +1926,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: int64_t TimePoint_to_ns(CTimePoint val) CTimePoint TimePoint_from_s(double val) CTimePoint TimePoint_from_ns(int64_t val) - CStatus StringToTzinfo(c_string, PyObject** tzinfo) cdef extern from 'arrow/python/init.h': diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 979b2a6f72c..02ffbf73726 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -22,7 +22,7 @@ import sys from collections import OrderedDict -from datetime import date, datetime, time, timedelta, timezone +from datetime import date, datetime, time, timedelta from distutils.version import LooseVersion import hypothesis as h @@ -3325,20 +3325,13 @@ def test_cast_timestamp_unit(): assert result.equals(expected) -def test_nested_with_timestamp_tz(): +def test_struct_with_timestamp_tz(): # ARROW-7723 ts = pd.Timestamp.now() - ts_dt = ts.to_pydatetime() # XXX: Ensure that this data does not get promoted to nanoseconds (and thus # integers) to preserve behavior in 0.15.1 for unit in ['s', 'ms', 'us']: - if unit in ['s', 'ms']: - # This is used for verifying timezone conversion to micros are not - # important - def truncate(x): return x.replace(microsecond=0) - else: - def truncate(x): return x arr = pa.array([ts], type=pa.timestamp(unit)) arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) @@ -3347,30 +3340,20 @@ def truncate(x): return x result = arr3.to_pandas() assert isinstance(result[0]['start'], datetime) - assert result[0]['start'].tzinfo is None assert isinstance(result[0]['stop'], datetime) - assert result[0]['stop'].tzinfo is None result = arr4.to_pandas() assert isinstance(result[0]['start'], datetime) - assert result[0]['start'].tzinfo is not None - utc_dt = result[0]['start'].astimezone(timezone.utc) - assert truncate(utc_dt).replace(tzinfo=None) == truncate(ts_dt) assert isinstance(result[0]['stop'], datetime) - assert result[0]['stop'].tzinfo is not None # same conversion for table result = pa.table({'a': arr3}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) - assert result['a'][0]['start'].tzinfo is None assert isinstance(result['a'][0]['stop'], datetime) - assert result['a'][0]['stop'].tzinfo is None result = pa.table({'a': arr4}).to_pandas() assert isinstance(result['a'][0]['start'], datetime) - assert result['a'][0]['start'].tzinfo is not None assert isinstance(result['a'][0]['stop'], datetime) - assert result['a'][0]['stop'].tzinfo is not None # ---------------------------------------------------------------------- @@ -4047,25 +4030,19 @@ def test_timestamp_as_object_out_of_range(): @pytest.mark.parametrize("resolution", ["s", "ms", "us"]) -@pytest.mark.parametrize("tz", [None, "America/New_York"]) # One datetime outside nanosecond range, one inside nanosecond range: @pytest.mark.parametrize("dt", [datetime(1553, 1, 1), datetime(2020, 1, 1)]) -def test_timestamp_as_object_non_nanosecond(resolution, tz, dt): +def test_timestamp_as_object_non_nanosecond(resolution, dt): # Timestamps can be converted Arrow and reloaded into Pandas with no loss # of information if the timestamp_as_object option is True. - arr = pa.array([dt], type=pa.timestamp(resolution, tz=tz)) - table = pa.table({'a': arr}) + arr = pa.array([dt], type=pa.timestamp(resolution)) + result = arr.to_pandas(timestamp_as_object=True) + assert result.dtype == object + assert isinstance(result[0], datetime) + assert result[0] == dt - for result in [ - arr.to_pandas(timestamp_as_object=True), - table.to_pandas(timestamp_as_object=True)['a'] - ]: - assert result.dtype == object - assert isinstance(result[0], datetime) - if tz: - assert result[0].tzinfo is not None - expected = result[0].tzinfo.fromutc(dt) - else: - assert result[0].tzinfo is None - expected = dt - assert result[0] == expected + table = pa.table({'a': arr}) + result = table.to_pandas(timestamp_as_object=True)['a'] + assert result.dtype == object + assert isinstance(result[0], datetime) + assert result[0] == dt diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 007f8e88198..e541510df0b 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1816,6 +1816,9 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$') + + def tzinfo_to_string(tz): """ Converts a time zone object into a string indicating the name of a time @@ -1881,9 +1884,14 @@ def string_to_tzinfo(name): tz : datetime.tzinfo Time zone object """ - cdef PyObject* tz - check_status(libarrow.StringToTzinfo(name.encode('utf-8'), &tz)) - return PyObject_to_object(tz) + import pytz + m = _FIXED_OFFSET_RE.match(name) + if m: + sign = 1 if m.group(1) == '+' else -1 + hours, minutes = map(int, m.group(2, 3)) + return pytz.FixedOffset(sign * (hours * 60 + minutes)) + else: + return pytz.timezone(name) def timestamp(unit, tz=None):