Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions cpp/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@

// Functions for pandas conversion via NumPy

#include "arrow/python/numpy_interop.h" // IWYU pragma: expand

#include "arrow/python/arrow_to_pandas.h"
#include "arrow/python/numpy_interop.h" // IWYU pragma: expand

#include <cmath>
#include <cstdint>
Expand Down Expand Up @@ -642,24 +641,27 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da
std::vector<OwnedRef> fields_data(num_fields);
OwnedRef dict_item;

// XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second
// In ARROW-7723, we found as a result of ARROW-3789 that second
// through microsecond resolution tz-aware timestamps were being promoted to
// use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
// array in this function. PyArray_GETITEM returns datetime.datetime for
// units second through microsecond but PyLong for nanosecond (because
// datetime.datetime does not support nanoseconds). We inserted this hack to
// preserve the <= 0.15.1 behavior until a better solution can be devised
// datetime.datetime does not support nanoseconds).
// We force the object conversion to preserve the value of the timezone.
PandasOptions modified_options = options;
modified_options.ignore_timezone = true;
modified_options.coerce_temporal_nanoseconds = false;

for (int c = 0; c < data.num_chunks(); c++) {
auto arr = checked_cast<const StructArray*>(data.chunk(c).get());
// Convert the struct arrays first
for (int32_t i = 0; i < num_fields; i++) {
PyObject* numpy_array;
RETURN_NOT_OK(ConvertArrayToPandas(
modified_options, arr->field(static_cast<int>(i)), nullptr, &numpy_array));
std::shared_ptr<Array> field = arr->field(static_cast<int>(i));
// See notes above about timestamp conversion. Don't blindly convert because
// timestamps in lists are handled differently.
modified_options.timestamp_as_object =
field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object;
RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array));
fields_data[i].reset(numpy_array);
}

Expand Down Expand Up @@ -951,8 +953,21 @@ struct ObjectWriterVisitor {
template <typename Type>
enable_if_timestamp<Type, Status> Visit(const Type& type) {
const TimeUnit::type unit = type.unit();
auto WrapValue = [unit](typename Type::c_type value, PyObject** out) {
OwnedRef tzinfo;
if (!type.timezone().empty()) {
RETURN_NOT_OK(internal::StringToTzinfo(type.timezone(), tzinfo.ref()));
RETURN_IF_PYERROR();
}
auto WrapValue = [&](typename Type::c_type value, PyObject** out) {
RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out));
RETURN_IF_PYERROR();
if (tzinfo.obj() != nullptr) {
PyObject* with_tz = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", *out);
RETURN_IF_PYERROR();
Py_DECREF(*out);
*out = with_tz;
}

RETURN_IF_PYERROR();
return Status::OK();
};
Expand Down Expand Up @@ -1721,8 +1736,7 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
// Nanoseconds are never out of bounds for pandas, so in that case
// we don't convert to object
*output_type = PandasWriter::OBJECT;
} else if (ts_type.timezone() != "" && !options.ignore_timezone) {
// XXX: ignore_timezone: hack here for ARROW-7723
} else if (!ts_type.timezone().empty()) {
*output_type = PandasWriter::DATETIME_NANO_TZ;
} else if (options.coerce_temporal_nanoseconds) {
*output_type = PandasWriter::DATETIME_NANO;
Expand Down
4 changes: 0 additions & 4 deletions cpp/src/arrow/python/arrow_to_pandas.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,6 @@ struct PandasOptions {
/// Coerce all date and timestamp to datetime64[ns]
bool coerce_temporal_nanoseconds = false;

/// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion
/// path
bool ignore_timezone = false;

/// \brief If true, do not create duplicate PyObject versions of equal
/// objects. This only applies to immutable objects like strings or datetime
/// objects
Expand Down
83 changes: 82 additions & 1 deletion cpp/src/arrow/python/datetime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,65 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/python/datetime.h"

#include <algorithm>
#include <chrono>
#include <iostream>

#include "arrow/python/common.h"
#include "arrow/python/datetime.h"
#include "arrow/python/helpers.h"
#include "arrow/python/platform.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/logging.h"
#include "arrow/util/value_parsing.h"

namespace arrow {
namespace py {
namespace internal {

namespace {

// Same as Regex '([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$'.
// GCC 4.9 doesn't support regex, so handcode until support for it
// is dropped.
bool MatchFixedOffset(const std::string& tz, util::string_view* sign,
util::string_view* hour, util::string_view* minute) {
if (tz.size() < 5) {
return false;
}
const char* iter = tz.data();
if (*iter == '+' || *iter == '-') {
*sign = util::string_view(iter, 1);
iter++;
if (tz.size() < 6) {
return false;
}
}
if ((((*iter == '0' || *iter == '1') && *(iter + 1) >= '0' && *(iter + 1) <= '9') ||
(*iter == '2' && *(iter + 1) >= '0' && *(iter + 1) <= '3'))) {
*hour = util::string_view(iter, 2);
iter += 2;
} else {
return false;
}
if (*iter != ':') {
return false;
}
iter++;

if (*iter >= '0' && *iter <= '5' && *(iter + 1) >= '0' && *(iter + 1) <= '9') {
*minute = util::string_view(iter, 2);
iter += 2;
} else {
return false;
}
return iter == (tz.data() + tz.size());
}

} // namespace

PyDateTime_CAPI* datetime_api = nullptr;

void InitDatetime() {
Expand Down Expand Up @@ -262,6 +305,44 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) {
PyDateTime_GET_DAY(pydate));
}

// GIL must be held when calling this function.
// Converted from python.  See https://github.com/apache/arrow/pull/7604
// for details.
//
// Builds a Python tzinfo object for the time zone named by `tz`:
//  * a string accepted by MatchFixedOffset is turned into
//    pytz.FixedOffset(<signed total minutes>);
//  * anything else is passed verbatim to pytz.timezone(tz).
//
// On success `*tzinfo` holds the object returned by the Python call (a new
// reference per the CPython calling convention). Any Python exception raised
// while importing pytz or building the object is converted into a non-OK
// Status.
Status StringToTzinfo(const std::string& tz, PyObject** tzinfo) {
  util::string_view sign_str, hour_str, minute_str;
  OwnedRef pytz;
  RETURN_NOT_OK(internal::ImportModule("pytz", &pytz));

  if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
    int sign = -1;
    if (sign_str == "+") {
      sign = 1;
    }
    OwnedRef fixed_offset;
    RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset));
    uint32_t minutes, hours;
    if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) ||
        !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(),
                                          &minutes)) {
      return Status::Invalid("Invalid timezone: ", tz);
    }
    // pytz.FixedOffset takes the offset from UTC in minutes.
    OwnedRef total_minutes(PyLong_FromLong(
        sign * ((static_cast<int>(hours) * 60) + static_cast<int>(minutes))));
    RETURN_IF_PYERROR();
    *tzinfo = PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL);
    RETURN_IF_PYERROR();
    return Status::OK();
  }

  OwnedRef timezone;
  RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone));
  OwnedRef py_tz_string(
      PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
  // Stop here if the string could not be created (e.g. `tz` is not valid
  // UTF-8). A NULL argument would otherwise terminate the variadic argument
  // list below, calling pytz.timezone() with no arguments while an exception
  // is already pending.
  RETURN_IF_PYERROR();
  *tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL);
  RETURN_IF_PYERROR();
  return Status::OK();
}

} // namespace internal
} // namespace py
} // namespace arrow
10 changes: 10 additions & 0 deletions cpp/src/arrow/python/datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,16 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
return PyDelta_to_us(pytimedelta) * 1000;
}

/// \brief Convert a time zone name into a time zone object.
///
/// Supported input strings are:
/// * As used in the Olson time zone database (the "tz database" or
/// "tzdata"), such as "America/New_York"
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
/// GIL must be held when calling this method.
ARROW_PYTHON_EXPORT
Status StringToTzinfo(const std::string& tz, PyObject** tzinfo);

} // namespace internal
} // namespace py
} // namespace arrow
4 changes: 3 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1174,7 +1174,9 @@ cdef _array_like_to_pandas(obj, options):
result = pandas_api.series(arr, dtype=dtype, name=name)

if (isinstance(original_type, TimestampType) and
original_type.tz is not None):
original_type.tz is not None and
# can be object dtype for non-ns and timestamp_as_object=True
result.dtype.kind == "M"):
from pyarrow.pandas_compat import make_tz_aware
result = make_tz_aware(result, original_type.tz)

Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1819,6 +1819,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
int64_t TimePoint_to_ns(CTimePoint val)
CTimePoint TimePoint_from_s(double val)
CTimePoint TimePoint_from_ns(int64_t val)
CStatus StringToTzinfo(c_string, PyObject** tzinfo)


cdef extern from 'arrow/python/init.h':
Expand Down
49 changes: 36 additions & 13 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import sys

from collections import OrderedDict
from datetime import date, datetime, time, timedelta
from datetime import date, datetime, time, timedelta, timezone
from distutils.version import LooseVersion

import hypothesis as h
Expand Down Expand Up @@ -3321,13 +3321,20 @@ def test_cast_timestamp_unit():
assert result.equals(expected)


def test_struct_with_timestamp_tz():
def test_nested_with_timestamp_tz():
# ARROW-7723
ts = pd.Timestamp.now()
ts_dt = ts.to_pydatetime()

# XXX: Ensure that this data does not get promoted to nanoseconds (and thus
# integers) to preserve behavior in 0.15.1
for unit in ['s', 'ms', 'us']:
if unit in ['s', 'ms']:
# Drop the sub-second part so the timezone-conversion check below is
# not affected by precision lost at second/millisecond resolution
def truncate(x): return x.replace(microsecond=0)
else:
def truncate(x): return x
arr = pa.array([ts], type=pa.timestamp(unit))
arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York'))

Expand All @@ -3336,20 +3343,30 @@ def test_struct_with_timestamp_tz():

result = arr3.to_pandas()
assert isinstance(result[0]['start'], datetime)
assert result[0]['start'].tzinfo is None
assert isinstance(result[0]['stop'], datetime)
assert result[0]['stop'].tzinfo is None

result = arr4.to_pandas()
assert isinstance(result[0]['start'], datetime)
assert result[0]['start'].tzinfo is not None
utc_dt = result[0]['start'].astimezone(timezone.utc)
assert truncate(utc_dt).replace(tzinfo=None) == truncate(ts_dt)
assert isinstance(result[0]['stop'], datetime)
assert result[0]['stop'].tzinfo is not None

# same conversion for table
result = pa.table({'a': arr3}).to_pandas()
assert isinstance(result['a'][0]['start'], datetime)
assert result['a'][0]['start'].tzinfo is None
assert isinstance(result['a'][0]['stop'], datetime)
assert result['a'][0]['stop'].tzinfo is None

result = pa.table({'a': arr4}).to_pandas()
assert isinstance(result['a'][0]['start'], datetime)
assert result['a'][0]['start'].tzinfo is not None
assert isinstance(result['a'][0]['stop'], datetime)
assert result['a'][0]['stop'].tzinfo is not None


# ----------------------------------------------------------------------
Expand Down Expand Up @@ -4010,19 +4027,25 @@ def test_timestamp_as_object_out_of_range():


@pytest.mark.parametrize("resolution", ["s", "ms", "us"])
@pytest.mark.parametrize("tz", [None, "America/New_York"])
# One datetime outside nanosecond range, one inside nanosecond range:
@pytest.mark.parametrize("dt", [datetime(1553, 1, 1), datetime(2020, 1, 1)])
def test_timestamp_as_object_non_nanosecond(resolution, dt):
def test_timestamp_as_object_non_nanosecond(resolution, tz, dt):
# Timestamps can be converted to Arrow and reloaded into Pandas with no loss
# of information if the timestamp_as_object option is True.
arr = pa.array([dt], type=pa.timestamp(resolution))
result = arr.to_pandas(timestamp_as_object=True)
assert result.dtype == object
assert isinstance(result[0], datetime)
assert result[0] == dt

arr = pa.array([dt], type=pa.timestamp(resolution, tz=tz))
table = pa.table({'a': arr})
result = table.to_pandas(timestamp_as_object=True)['a']
assert result.dtype == object
assert isinstance(result[0], datetime)
assert result[0] == dt

for result in [
arr.to_pandas(timestamp_as_object=True),
table.to_pandas(timestamp_as_object=True)['a']
]:
assert result.dtype == object
assert isinstance(result[0], datetime)
if tz:
assert result[0].tzinfo is not None
expected = result[0].tzinfo.fromutc(dt)
else:
assert result[0].tzinfo is None
expected = dt
assert result[0] == expected
14 changes: 3 additions & 11 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1816,9 +1816,6 @@ cdef timeunit_to_string(TimeUnit unit):
return 'ns'


_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$')


def tzinfo_to_string(tz):
"""
Converts a time zone object into a string indicating the name of a time
Expand Down Expand Up @@ -1884,14 +1881,9 @@ def string_to_tzinfo(name):
tz : datetime.tzinfo
Time zone object
"""
import pytz
m = _FIXED_OFFSET_RE.match(name)
if m:
sign = 1 if m.group(1) == '+' else -1
hours, minutes = map(int, m.group(2, 3))
return pytz.FixedOffset(sign * (hours * 60 + minutes))
else:
return pytz.timezone(name)
cdef PyObject* tz
check_status(libarrow.StringToTzinfo(name.encode('utf-8'), &tz))
return PyObject_to_object(tz)


def timestamp(unit, tz=None):
Expand Down