diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4679a2f5b76..0cafdce89e5 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -108,6 +108,31 @@ std::string Date32Type::ToString() const { return std::string("date32"); } +static inline void print_time_unit(TimeUnit unit, std::ostream* stream) { + switch (unit) { + case TimeUnit::SECOND: + (*stream) << "s"; + break; + case TimeUnit::MILLI: + (*stream) << "ms"; + break; + case TimeUnit::MICRO: + (*stream) << "us"; + break; + case TimeUnit::NANO: + (*stream) << "ns"; + break; + } +} + +std::string TimestampType::ToString() const { + std::stringstream ss; + ss << "timestamp["; + print_time_unit(this->unit, &ss); + ss << "]"; + return ss.str(); +} + // ---------------------------------------------------------------------- // Union type diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a838082d7e7..15b99c5ce4f 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -495,7 +495,7 @@ struct ARROW_EXPORT TimestampType : public FixedWidthType { TimestampType(const TimestampType& other) : TimestampType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override { return name(); } + std::string ToString() const override; static std::string name() { return "timestamp"; } TimeUnit unit; diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 6724b52e600..a4aac443fae 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -56,6 +56,8 @@ FloatValue, DoubleValue, ListValue, BinaryValue, StringValue) +import pyarrow.schema as _schema + from pyarrow.schema import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, @@ -64,6 +66,7 @@ list_, struct, dictionary, field, DataType, Field, Schema, schema) + from pyarrow.table import Column, RecordBatch, Table, concat_tables diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 6a6b4ba9ad0..11244e78360 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -34,7 +34,8 @@ from pyarrow.memory cimport MemoryPool, maybe_unbox_memory_pool cimport pyarrow.scalar as scalar from pyarrow.scalar import NA -from pyarrow.schema cimport Field, Schema, DictionaryType +from pyarrow.schema cimport (DataType, Field, Schema, DictionaryType, + box_data_type) import pyarrow.schema as schema cimport cpython @@ -45,16 +46,40 @@ cdef _pandas(): return pd +cdef maybe_coerce_datetime64(values, dtype, DataType type, + timestamps_to_ms=False): + + from pyarrow.compat import DatetimeTZDtype + + if values.dtype.type != np.datetime64: + return values, type + + coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]' + + if coerce_ms: + values = values.astype('datetime64[ms]') + + if isinstance(dtype, DatetimeTZDtype): + tz = dtype.tz + unit = 'ms' if coerce_ms else dtype.unit + type = schema.timestamp(unit, tz) + else: + # Trust the NumPy dtype + type = schema.type_from_numpy_dtype(values.dtype) + + return values, type + + cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() - self.type = DataType() - self.type.init(self.sp_array.get().type()) + self.type = box_data_type(self.sp_array.get().type()) @staticmethod - def from_pandas(obj, mask=None, timestamps_to_ms=False, Field field=None, + def from_pandas(obj, mask=None, DataType type=None, + timestamps_to_ms=False, MemoryPool memory_pool=None): """ Convert pandas.Series to an Arrow Array. @@ -66,6 +91,9 @@ cdef class Array: mask : pandas.Series or numpy.ndarray, optional boolean mask if the object is valid or null + type : pyarrow.DataType + Explicit type to attempt to coerce to + timestamps_to_ms : bool, optional Convert datetime columns to ms resolution. This is needed for compatibility with other functionality like Parquet I/O which @@ -107,33 +135,43 @@ cdef class Array: """ cdef: shared_ptr[CArray] out - shared_ptr[CField] c_field + shared_ptr[CDataType] c_type CMemoryPool* pool pd = _pandas() - if field is not None: - c_field = field.sp_field - if mask is not None: mask = get_series_values(mask) - series_values = get_series_values(obj) + values = get_series_values(obj) + pool = maybe_unbox_memory_pool(memory_pool) - if isinstance(series_values, pd.Categorical): + if isinstance(values, pd.Categorical): return DictionaryArray.from_arrays( - series_values.codes, series_values.categories.values, + values.codes, values.categories.values, mask=mask, memory_pool=memory_pool) + elif values.dtype == object: + # Object dtype undergoes a different conversion path as more type + # inference may be needed + if type is not None: + c_type = type.sp_type + with nogil: + check_status(pyarrow.PandasObjectsToArrow( + pool, values, mask, c_type, &out)) else: - if series_values.dtype.type == np.datetime64 and timestamps_to_ms: - series_values = series_values.astype('datetime64[ms]') + values, type = maybe_coerce_datetime64( + values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) + + if type is None: + check_status(pyarrow.PandasDtypeToArrow(values.dtype, &c_type)) + else: + c_type = type.sp_type - pool = maybe_unbox_memory_pool(memory_pool) with nogil: check_status(pyarrow.PandasToArrow( - pool, series_values, mask, c_field, &out)) + pool, values, mask, c_type, &out)) - return box_array(out) + return box_array(out) @staticmethod def from_list(object list_obj, DataType type=None, @@ -338,6 +376,10 @@ cdef class DateArray(NumericArray): pass +cdef class TimestampArray(NumericArray): + pass + + cdef class FloatArray(FloatingPointArray): pass @@ -423,7 +465,7 @@ cdef dict _array_classes = { Type_LIST: ListArray, Type_BINARY: BinaryArray, Type_STRING: StringArray, - Type_TIMESTAMP: Int64Array, + Type_TIMESTAMP: TimestampArray, Type_DICTIONARY: DictionaryArray } diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 9148be7d9f8..74d7ca2827b 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -17,9 +17,11 @@ # flake8: noqa +from distutils.version import LooseVersion import itertools import numpy as np +import pandas as pd import sys import six @@ -115,6 +117,13 @@ def encode_file_path(path): return encoded_path +if LooseVersion(pd.__version__) < '0.19.0': + pdapi = pd.core.common + from pandas.core.dtypes import DatetimeTZDtype +else: + from pandas.types.dtypes import DatetimeTZDtype + pdapi = pd.api.types + integer_types = six.integer_types + (np.integer,) __all__ = [] diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx index aa30f097248..5ad7cf53261 100644 --- a/python/pyarrow/config.pyx +++ b/python/pyarrow/config.pyx @@ -17,10 +17,10 @@ cdef extern from 'pyarrow/do_import_numpy.h': pass -cdef extern from 'pyarrow/numpy_interop.h' namespace 'pyarrow': +cdef extern from 'pyarrow/numpy_interop.h' namespace 'arrow::py': int import_numpy() -cdef extern from 'pyarrow/config.h' namespace 'pyarrow': +cdef extern from 'pyarrow/config.h' namespace 'arrow::py': void pyarrow_init() void pyarrow_set_numpy_nan(object o) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index b7dbf96563a..28424afb093 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -19,6 +19,7 @@ from distutils.version import LooseVersion import pandas as pd +from pyarrow.compat import pdapi from pyarrow._feather import FeatherError # noqa from pyarrow.table import Table import pyarrow._feather as ext @@ -27,11 +28,6 @@ if LooseVersion(pd.__version__) < '0.17.0': raise ImportError("feather requires pandas >= 0.17.0") -if LooseVersion(pd.__version__) < '0.19.0': - pdapi = pd.core.common -else: - pdapi = pd.api.types - class FeatherReader(ext.FeatherReader): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 253cabbe0a5..dee7fd4f8e4 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -84,6 +84,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] indices() shared_ptr[CArray] dictionary() + cdef cppclass CTimestampType" arrow::TimestampType"(CFixedWidthType): + TimeUnit unit + c_string timezone + + cdef cppclass CTimeType" arrow::TimeType"(CFixedWidthType): + TimeUnit unit + cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType): CDictionaryType(const shared_ptr[CDataType]& index_type, const shared_ptr[CArray]& dictionary) @@ -92,6 +99,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] dictionary() shared_ptr[CDataType] timestamp(TimeUnit unit) + shared_ptr[CDataType] timestamp(const c_string& timezone, TimeUnit unit) cdef cppclass CMemoryPool" arrow::MemoryPool": int64_t bytes_allocated() @@ -117,9 +125,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStringType" arrow::StringType"(CDataType): pass - cdef cppclass CTimestampType" arrow::TimestampType"(CDataType): - TimeUnit unit - cdef cppclass CField" arrow::Field": c_string name shared_ptr[CDataType] type diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index f1d45e0d50f..9fbddba3d10 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,22 +18,29 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CField, +from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable, CDataType, CStatus, Type, CMemoryPool, TimeUnit) cimport pyarrow.includes.libarrow_io as arrow_io -cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: +cdef extern from "pyarrow/api.h" namespace "arrow::py" nogil: shared_ptr[CDataType] GetPrimitiveType(Type type) shared_ptr[CDataType] GetTimestampType(TimeUnit unit) - CStatus ConvertPySequence(object obj, CMemoryPool* pool, shared_ptr[CArray]* out) + CStatus ConvertPySequence(object obj, CMemoryPool* pool, + shared_ptr[CArray]* out) + + CStatus PandasDtypeToArrow(object dtype, shared_ptr[CDataType]* type) CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo, - shared_ptr[CField] field, + const shared_ptr[CDataType]& type, shared_ptr[CArray]* out) + CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo, + const shared_ptr[CDataType]& type, + shared_ptr[CArray]* out) + CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, PyObject* py_ref, PyObject** out) @@ -47,12 +54,12 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: CMemoryPool* get_memory_pool() -cdef extern from "pyarrow/common.h" namespace "pyarrow" nogil: +cdef extern from "pyarrow/common.h" namespace "arrow::py" nogil: cdef cppclass PyBytesBuffer(CBuffer): PyBytesBuffer(object o) -cdef extern from "pyarrow/io.h" namespace "pyarrow" nogil: +cdef extern from "pyarrow/io.h" namespace "arrow::py" nogil: cdef cppclass PyReadableFile(arrow_io.ReadableFileInterface): PyReadableFile(object fo) diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd index 390954cfc6b..15ee5f19ee5 100644 --- a/python/pyarrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -16,7 +16,9 @@ # under the License. from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CDataType, CDictionaryType, +from pyarrow.includes.libarrow cimport (CDataType, + CDictionaryType, + CTimestampType, CField, CSchema) cdef class DataType: @@ -31,6 +33,12 @@ cdef class DictionaryType(DataType): cdef: const CDictionaryType* dict_type + +cdef class TimestampType(DataType): + cdef: + const CTimestampType* ts_type + + cdef class Field: cdef: shared_ptr[CField] sp_field diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index d636b5a10bb..4bc938df668 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -26,23 +26,19 @@ from cython.operator cimport dereference as deref from pyarrow.compat import frombytes, tobytes from pyarrow.array cimport Array +from pyarrow.error cimport check_status from pyarrow.includes.libarrow cimport (CDataType, CStructType, CListType, - Type_NA, Type_BOOL, - Type_UINT8, Type_INT8, - Type_UINT16, Type_INT16, - Type_UINT32, Type_INT32, - Type_UINT64, Type_INT64, - Type_TIMESTAMP, Type_DATE, - Type_FLOAT, Type_DOUBLE, - Type_STRING, Type_BINARY, TimeUnit_SECOND, TimeUnit_MILLI, TimeUnit_MICRO, TimeUnit_NANO, Type, TimeUnit) cimport pyarrow.includes.pyarrow as pyarrow -cimport pyarrow.includes.libarrow as libarrow +cimport pyarrow.includes.libarrow as la cimport cpython +import six + + cdef class DataType: def __cinit__(self): @@ -73,13 +69,33 @@ cdef class DictionaryType(DataType): DataType.init(self, type) self.dict_type = type.get() - def __str__(self): - return frombytes(self.type.ToString()) - def __repr__(self): return 'DictionaryType({0})'.format(str(self)) +cdef class TimestampType(DataType): + + cdef init(self, const shared_ptr[CDataType]& type): + DataType.init(self, type) + self.ts_type = type.get() + + property unit: + + def __get__(self): + return timeunit_to_string(self.ts_type.unit) + + property tz: + + def __get__(self): + if self.ts_type.timezone.size() > 0: + return frombytes(self.ts_type.timezone) + else: + return None + + def __repr__(self): + return 'TimestampType({0})'.format(str(self)) + + cdef class Field: def __cinit__(self): @@ -205,49 +221,76 @@ cdef DataType primitive_type(Type type): def field(name, type, bint nullable=True): return Field.from_py(name, type, nullable) + cdef set PRIMITIVE_TYPES = set([ - Type_NA, Type_BOOL, - Type_UINT8, Type_INT8, - Type_UINT16, Type_INT16, - Type_UINT32, Type_INT32, - Type_UINT64, Type_INT64, - Type_TIMESTAMP, Type_DATE, - Type_FLOAT, Type_DOUBLE]) + la.Type_NA, la.Type_BOOL, + la.Type_UINT8, la.Type_INT8, + la.Type_UINT16, la.Type_INT16, + la.Type_UINT32, la.Type_INT32, + la.Type_UINT64, la.Type_INT64, + la.Type_TIMESTAMP, la.Type_DATE, + la.Type_FLOAT, la.Type_DOUBLE]) + def null(): - return primitive_type(Type_NA) + return primitive_type(la.Type_NA) + def bool_(): - return primitive_type(Type_BOOL) + return primitive_type(la.Type_BOOL) + def uint8(): - return primitive_type(Type_UINT8) + return primitive_type(la.Type_UINT8) + def int8(): - return primitive_type(Type_INT8) + return primitive_type(la.Type_INT8) + def uint16(): - return primitive_type(Type_UINT16) + return primitive_type(la.Type_UINT16) + def int16(): - return primitive_type(Type_INT16) + return primitive_type(la.Type_INT16) + def uint32(): - return primitive_type(Type_UINT32) + return primitive_type(la.Type_UINT32) + def int32(): - return primitive_type(Type_INT32) + return primitive_type(la.Type_INT32) + def uint64(): - return primitive_type(Type_UINT64) + return primitive_type(la.Type_UINT64) + def int64(): - return primitive_type(Type_INT64) + return primitive_type(la.Type_INT64) + cdef dict _timestamp_type_cache = {} -def timestamp(unit_str): - cdef TimeUnit unit + +cdef timeunit_to_string(TimeUnit unit): + if unit == TimeUnit_SECOND: + return 's' + elif unit == TimeUnit_MILLI: + return 'ms' + elif unit == TimeUnit_MICRO: + return 'us' + elif unit == TimeUnit_NANO: + return 'ns' + + +def timestamp(unit_str, tz=None): + cdef: + TimeUnit unit + c_string c_timezone + if unit_str == "s": unit = TimeUnit_SECOND elif unit_str == 'ms': @@ -259,34 +302,47 @@ def timestamp(unit_str): else: raise TypeError('Invalid TimeUnit string') - if unit in _timestamp_type_cache: - return _timestamp_type_cache[unit] + cdef TimestampType out = TimestampType() + + if tz is None: + out.init(la.timestamp(unit)) + if unit in _timestamp_type_cache: + return _timestamp_type_cache[unit] + _timestamp_type_cache[unit] = out + else: + if not isinstance(tz, six.string_types): + tz = tz.zone + + c_timezone = tobytes(tz) + out.init(la.timestamp(c_timezone, unit)) - cdef DataType out = DataType() - out.init(libarrow.timestamp(unit)) - _timestamp_type_cache[unit] = out return out + def date(): - return primitive_type(Type_DATE) + return primitive_type(la.Type_DATE) + def float_(): - return primitive_type(Type_FLOAT) + return primitive_type(la.Type_FLOAT) + def double(): - return primitive_type(Type_DOUBLE) + return primitive_type(la.Type_DOUBLE) + def string(): """ UTF8 string """ - return primitive_type(Type_STRING) + return primitive_type(la.Type_STRING) + def binary(): """ Binary (PyBytes-like) type """ - return primitive_type(Type_BINARY) + return primitive_type(la.Type_BINARY) def list_(DataType value_type): @@ -326,13 +382,25 @@ def struct(fields): out.init(struct_type) return out + def schema(fields): return Schema.from_fields(fields) + cdef DataType box_data_type(const shared_ptr[CDataType]& type): + cdef: + DataType out + if type.get() == NULL: return None - cdef DataType out = DataType() + + if type.get().type == la.Type_DICTIONARY: + out = DictionaryType() + elif type.get().type == la.Type_TIMESTAMP: + out = TimestampType() + else: + out = DataType() + out.init(type) return out @@ -347,3 +415,11 @@ cdef Schema box_schema(const shared_ptr[CSchema]& type): cdef Schema out = Schema() out.init_schema(type) return out + + +def type_from_numpy_dtype(object dtype): + cdef shared_ptr[CDataType] c_type + with nogil: + check_status(pyarrow.PandasDtypeToArrow(dtype, &c_type)) + + return box_data_type(c_type) diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 5657b973d13..58f5d680393 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -30,7 +30,7 @@ import pyarrow.config from pyarrow.array cimport Array, box_array, wrap_array_output from pyarrow.error import ArrowException from pyarrow.error cimport check_status -from pyarrow.schema cimport box_data_type, box_schema, Field +from pyarrow.schema cimport box_data_type, box_schema, DataType from pyarrow.compat import frombytes, tobytes @@ -302,14 +302,15 @@ cdef _dataframe_to_arrays(df, name, timestamps_to_ms, Schema schema): cdef: list names = [] list arrays = [] - Field field = None + DataType type = None for name in df.columns: col = df[name] if schema is not None: - field = schema.field_by_name(name) - arr = Array.from_pandas(col, timestamps_to_ms=timestamps_to_ms, - field=field) + type = schema.field_by_name(name).type + + arr = Array.from_pandas(col, type=type, + timestamps_to_ms=timestamps_to_ms) names.append(name) arrays.append(arr) @@ -522,6 +523,7 @@ cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads): import pandas.core.internals as _int from pandas import RangeIndex, Categorical + from pyarrow.compat import DatetimeTZDtype with nogil: check_status(pyarrow.ConvertTableToPandas(table, nthreads, @@ -541,9 +543,9 @@ cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads): klass=_int.CategoricalBlock, fastpath=True) elif 'timezone' in item: - from pandas.types.api import DatetimeTZDtype dtype = DatetimeTZDtype('ns', tz=item['timezone']) block = _int.make_block(block_arr, placement=placement, + klass=_int.DatetimeTZBlock, dtype=dtype, fastpath=True) else: block = _int.make_block(block_arr, placement=placement) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index a79bb2392ea..6b89444b3e8 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -77,9 +77,9 @@ def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, tm.assert_frame_equal(result, expected, check_dtype=check_dtype) def _check_array_roundtrip(self, values, expected=None, - timestamps_to_ms=False, field=None): + timestamps_to_ms=False, type=None): arr = A.Array.from_pandas(values, timestamps_to_ms=timestamps_to_ms, - field=field) + type=type) result = arr.to_pandas() assert arr.null_count == pd.isnull(values).sum() @@ -134,11 +134,13 @@ def test_integer_no_nulls(self): data = OrderedDict() fields = [] - numpy_dtypes = [('i1', A.int8()), ('i2', A.int16()), - ('i4', A.int32()), ('i8', A.int64()), - ('u1', A.uint8()), ('u2', A.uint16()), - ('u4', A.uint32()), ('u8', A.uint64()), - ('longlong', A.int64()), ('ulonglong', A.uint64())] + numpy_dtypes = [ + ('i1', A.int8()), ('i2', A.int16()), + ('i4', A.int32()), ('i8', A.int64()), + ('u1', A.uint8()), ('u2', A.uint16()), + ('u4', A.uint32()), ('u8', A.uint64()), + ('longlong', A.int64()), ('ulonglong', A.uint64()) + ] num_values = 100 for dtype, arrow_dtype in numpy_dtypes: @@ -153,7 +155,6 @@ def test_integer_no_nulls(self): schema = A.Schema.from_fields(fields) self._check_pandas_roundtrip(df, expected_schema=schema) - def test_integer_with_nulls(self): # pandas requires upcast to float dtype @@ -301,9 +302,9 @@ def test_timestamps_with_timezone(self): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - df_est = df['datetime64'].dt.tz_localize('US/Eastern').to_frame() - df_utc = df_est['datetime64'].dt.tz_convert('UTC').to_frame() - self._check_pandas_roundtrip(df_est, expected=df_utc, timestamps_to_ms=True, check_dtype=False) + df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') + .to_frame()) + self._check_pandas_roundtrip(df, timestamps_to_ms=True) # drop-in a null and ns instead of ms df = pd.DataFrame({ @@ -314,9 +315,9 @@ def test_timestamps_with_timezone(self): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - df_est = df['datetime64'].dt.tz_localize('US/Eastern').to_frame() - df_utc = df_est['datetime64'].dt.tz_convert('UTC').to_frame() - self._check_pandas_roundtrip(df_est, expected=df_utc, timestamps_to_ms=False, check_dtype=False) + df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') + .to_frame()) + self._check_pandas_roundtrip(df, timestamps_to_ms=False) def test_date(self): df = pd.DataFrame({ @@ -341,7 +342,7 @@ def test_column_of_arrays(self): for column in df.columns: field = schema.field_by_name(column) - self._check_array_roundtrip(df[column], field=field) + self._check_array_roundtrip(df[column], type=field.type) def test_column_of_lists(self): df, schema = dataframe_with_lists() @@ -351,7 +352,7 @@ def test_column_of_lists(self): for column in df.columns: field = schema.field_by_name(column) - self._check_array_roundtrip(df[column], field=field) + self._check_array_roundtrip(df[column], type=field.type) def test_threaded_conversion(self): df = _alltypes_example() diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 451475b4c6d..e4b6273ffcc 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -23,8 +23,8 @@ from pandas.util.testing import assert_frame_equal import pandas as pd +import pyarrow as pa from pyarrow.compat import guid -from pyarrow.error import ArrowException from pyarrow.feather import (read_feather, write_feather, FeatherReader) from pyarrow._feather import FeatherWriter @@ -47,7 +47,7 @@ def tearDown(self): pass def test_file_not_exist(self): - with self.assertRaises(ArrowException): + with self.assertRaises(pa.ArrowException): FeatherReader('test_invalid_file') def _get_null_counts(self, path, columns=None): @@ -291,7 +291,6 @@ def test_category(self): self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats]) - @pytest.mark.xfail def test_timestamp(self): df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)}) df['with_tz'] = (df.naive.dt.tz_localize('utc') @@ -299,7 +298,6 @@ def test_timestamp(self): self._check_pandas_roundtrip(df) - @pytest.mark.xfail def test_timestamp_with_nulls(self): df = pd.DataFrame({'test': [pd.datetime(2016, 1, 1), None, @@ -308,7 +306,6 @@ def test_timestamp_with_nulls(self): self._check_pandas_roundtrip(df, null_counts=[1, 1]) - @pytest.mark.xfail def test_out_of_float64_timestamp_with_nulls(self): df = pd.DataFrame( {'test': pd.DatetimeIndex([1451606400000000001, diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index dd68f396a68..5588840cceb 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -15,82 +15,108 @@ # specific language governing permissions and limitations # under the License. -from pyarrow.compat import unittest -import pyarrow as arrow +import pytest -A = arrow +import pyarrow as pa +import numpy as np -class TestTypes(unittest.TestCase): +# XXX: pyarrow.schema.schema masks the module on imports +sch = pa._schema - def test_integers(self): - dtypes = ['int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64'] - for name in dtypes: - factory = getattr(arrow, name) - t = factory() - assert str(t) == name +def test_type_integers(): + dtypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64'] - def test_list(self): - value_type = arrow.int32() - list_type = arrow.list_(value_type) - assert str(list_type) == 'list' + for name in dtypes: + factory = getattr(pa, name) + t = factory() + assert str(t) == name - def test_string(self): - t = arrow.string() - assert str(t) == 'string' - def test_field(self): - t = arrow.string() - f = arrow.field('foo', t) +def test_type_list(): + value_type = pa.int32() + list_type = pa.list_(value_type) + assert str(list_type) == 'list' - assert f.name == 'foo' - assert f.nullable - assert f.type is t - assert repr(f) == "Field('foo', type=string)" - f = arrow.field('foo', t, False) - assert not f.nullable +def test_type_string(): + t = pa.string() + assert str(t) == 'string' - def test_schema(self): - fields = [ - A.field('foo', A.int32()), - A.field('bar', A.string()), - A.field('baz', A.list_(A.int8())) - ] - sch = A.schema(fields) - assert len(sch) == 3 - assert sch[0].name == 'foo' - assert sch[0].type == fields[0].type - assert sch.field_by_name('foo').name == 'foo' - assert sch.field_by_name('foo').type == fields[0].type +def test_type_timestamp_with_tz(): + tz = 'America/Los_Angeles' + t = pa.timestamp('ns', tz=tz) + assert t.unit == 'ns' + assert t.tz == tz - assert repr(sch) == """\ + +def test_type_from_numpy_dtype_timestamps(): + cases = [ + (np.dtype('datetime64[s]'), pa.timestamp('s')), + (np.dtype('datetime64[ms]'), pa.timestamp('ms')), + (np.dtype('datetime64[us]'), pa.timestamp('us')), + (np.dtype('datetime64[ns]'), pa.timestamp('ns')) + ] + + for dt, pt in cases: + result = sch.type_from_numpy_dtype(dt) + assert result == pt + + +def test_field(): + t = pa.string() + f = pa.field('foo', t) + + assert f.name == 'foo' + assert f.nullable + assert f.type is t + assert repr(f) == "Field('foo', type=string)" + + f = pa.field('foo', t, False) + assert not f.nullable + + +def test_schema(): + fields = [ + pa.field('foo', pa.int32()), + pa.field('bar', pa.string()), + pa.field('baz', pa.list_(pa.int8())) + ] + sch = pa.schema(fields) + + assert len(sch) == 3 + assert sch[0].name == 'foo' + assert sch[0].type == fields[0].type + assert sch.field_by_name('foo').name == 'foo' + assert sch.field_by_name('foo').type == fields[0].type + + assert repr(sch) == """\ foo: int32 bar: string baz: list""" - def test_schema_equals(self): - fields = [ - A.field('foo', A.int32()), - A.field('bar', A.string()), - A.field('baz', A.list_(A.int8())) - ] - sch1 = A.schema(fields) - print(dir(sch1)) - sch2 = A.schema(fields) - assert sch1.equals(sch2) +def test_field_empty(): + f = pa.Field() + with pytest.raises(ReferenceError): + repr(f) + - del fields[-1] - sch3 = A.schema(fields) - assert not sch1.equals(sch3) +def test_schema_equals(): + fields = [ + pa.field('foo', pa.int32()), + pa.field('bar', pa.string()), + pa.field('baz', pa.list_(pa.int8())) + ] + sch1 = pa.schema(fields) + print(dir(sch1)) + sch2 = pa.schema(fields) + assert sch1.equals(sch2) -class TestField(unittest.TestCase): - def test_empty_field(self): - f = arrow.Field() - with self.assertRaises(ReferenceError): - repr(f) + del fields[-1] + sch3 = pa.schema(fields) + assert not sch1.equals(sch3) diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 4f7b2cb09e1..b197f5845c0 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -27,13 +27,8 @@ #include "pyarrow/helpers.h" #include "pyarrow/util/datetime.h" -using arrow::ArrayBuilder; -using arrow::DataType; -using arrow::MemoryPool; -using arrow::Status; -using arrow::Type; - -namespace pyarrow { +namespace arrow { +namespace py { static inline bool IsPyInteger(PyObject* obj) { #if PYARROW_IS_PY2 @@ -82,22 +77,22 @@ class ScalarVisitor { std::shared_ptr GetType() { // TODO(wesm): handling mixed-type cases if (float_count_) { - return arrow::float64(); + return float64(); } else if (int_count_) { // TODO(wesm): tighter type later - return arrow::int64(); + return int64(); } else if (date_count_) { - return arrow::date(); + return date(); } else if (timestamp_count_) { - return arrow::timestamp(arrow::TimeUnit::MICRO); + return timestamp(TimeUnit::MICRO); } else if (bool_count_) { - return arrow::boolean(); + return boolean(); } else if (binary_count_) { - return arrow::binary(); + return binary(); } else if (unicode_count_) { - return arrow::utf8(); + return utf8(); } else { - return arrow::null(); + return null(); } } @@ -157,14 +152,14 @@ class SeqVisitor { std::shared_ptr GetType() { if (scalars_.total_count() == 0) { if (max_nesting_level_ == 0) { - return arrow::null(); + return null(); } else { return nullptr; } } else { std::shared_ptr result = scalars_.GetType(); for (int i = 0; i < max_nesting_level_; ++i) { - result = std::make_shared(result); + result = std::make_shared(result); } return result; } @@ -215,7 +210,7 @@ Status InferArrowType(PyObject* obj, int64_t* size, std::shared_ptr* o } // For 0-length sequences, refuse to guess - if (*size == 0) { *out_type = arrow::null(); } + if (*size == 0) { *out_type = null(); } SeqVisitor seq_visitor; RETURN_NOT_OK(seq_visitor.Visit(obj)); @@ -255,7 +250,7 @@ class TypedConverter : public SeqConverter { BuilderType* typed_builder_; }; -class BoolConverter : public TypedConverter { +class BoolConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { Py_ssize_t size = PySequence_Size(seq); @@ -276,7 +271,7 @@ class BoolConverter : public TypedConverter { } }; -class Int64Converter : public TypedConverter { +class Int64Converter : public TypedConverter { public: Status AppendData(PyObject* seq) override { int64_t val; @@ -296,7 +291,7 @@ class Int64Converter : public TypedConverter { } }; -class DateConverter : public TypedConverter { +class DateConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { Py_ssize_t size = PySequence_Size(seq); @@ -314,7 +309,7 @@ class DateConverter : public TypedConverter { } }; -class TimestampConverter : public TypedConverter { +class TimestampConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { Py_ssize_t size = PySequence_Size(seq); @@ -347,7 +342,7 @@ class TimestampConverter : public TypedConverter { } }; -class DoubleConverter : public TypedConverter { +class DoubleConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { double val; @@ -367,7 +362,7 @@ class DoubleConverter : public TypedConverter { } }; -class BytesConverter : public TypedConverter { +class BytesConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { PyObject* item; @@ -401,7 +396,7 @@ class BytesConverter : public TypedConverter { } }; -class UTF8Converter : public TypedConverter { +class UTF8Converter : public TypedConverter { public: Status AppendData(PyObject* seq) override { PyObject* item; @@ -433,7 +428,7 @@ class UTF8Converter : public TypedConverter { } }; -class ListConverter : public TypedConverter { +class ListConverter : public TypedConverter { public: Status Init(const std::shared_ptr& builder) override; @@ -483,10 +478,10 @@ std::shared_ptr GetConverter(const std::shared_ptr& type Status ListConverter::Init(const std::shared_ptr& builder) { builder_ = builder; - typed_builder_ = static_cast(builder.get()); + typed_builder_ = static_cast(builder.get()); value_converter_ = - GetConverter(static_cast(builder->type().get())->value_type()); + GetConverter(static_cast(builder->type().get())->value_type()); if (value_converter_ == nullptr) { return Status::NotImplemented("value type not implemented"); } @@ -508,8 +503,7 @@ Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, return converter->AppendData(obj); } -Status ConvertPySequence( - PyObject* obj, MemoryPool* pool, std::shared_ptr* out) { +Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out) { std::shared_ptr type; int64_t size; PyDateTime_IMPORT; @@ -517,16 +511,17 @@ Status ConvertPySequence( // Handle NA / NullType case if (type->type == Type::NA) { - out->reset(new arrow::NullArray(size)); + out->reset(new NullArray(size)); return Status::OK(); } // Give the sequence converter an array builder std::shared_ptr builder; - RETURN_NOT_OK(arrow::MakeBuilder(pool, type, &builder)); + RETURN_NOT_OK(MakeBuilder(pool, type, &builder)); RETURN_NOT_OK(AppendPySequence(obj, type, builder)); return builder->Finish(out); } -} // namespace pyarrow +} // namespace py +} // namespace arrow diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h index 0c863a5631a..2d45e670628 100644 --- a/python/src/pyarrow/adapters/builtin.h +++ b/python/src/pyarrow/adapters/builtin.h @@ -27,27 +27,28 @@ #include +#include "arrow/util/visibility.h" + #include "pyarrow/common.h" -#include "pyarrow/visibility.h" namespace arrow { + class Array; class Status; -} -namespace pyarrow { +namespace py { -PYARROW_EXPORT arrow::Status InferArrowType( +ARROW_EXPORT arrow::Status InferArrowType( PyObject* obj, int64_t* size, std::shared_ptr* out_type); -PYARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj, +ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, const std::shared_ptr& builder); -PYARROW_EXPORT -arrow::Status ConvertPySequence( - PyObject* obj, arrow::MemoryPool* pool, std::shared_ptr* out); +ARROW_EXPORT +Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out); -} // namespace pyarrow +} // namespace py +} // namespace arrow #endif // PYARROW_ADAPTERS_BUILTIN_H diff --git a/python/src/pyarrow/adapters/pandas-test.cc b/python/src/pyarrow/adapters/pandas-test.cc index e286ccc2c8d..e694e790a38 100644 --- a/python/src/pyarrow/adapters/pandas-test.cc +++ b/python/src/pyarrow/adapters/pandas-test.cc @@ -30,9 +30,8 @@ #include "arrow/type.h" #include "pyarrow/adapters/pandas.h" -using namespace arrow; - -namespace pyarrow { +namespace arrow { +namespace py { TEST(PandasConversionTest, TestObjectBlockWriteFails) { StringBuilder builder; @@ -61,4 +60,5 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) { Py_END_ALLOW_THREADS; } +} // namespace py } // namespace arrow diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 40079b49b96..863cf54c9aa 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -19,7 +19,6 @@ #include -#include "pyarrow/adapters/builtin.h" #include "pyarrow/adapters/pandas.h" #include "pyarrow/numpy_interop.h" @@ -34,120 +33,39 @@ #include #include -#include "arrow/api.h" +#include "arrow/array.h" +#include "arrow/column.h" #include "arrow/loader.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/macros.h" +#include "pyarrow/adapters/builtin.h" #include "pyarrow/common.h" #include "pyarrow/config.h" +#include "pyarrow/type_traits.h" #include "pyarrow/util/datetime.h" -namespace pyarrow { - -using arrow::Array; -using arrow::ChunkedArray; -using arrow::Column; -using arrow::DictionaryType; -using arrow::Field; -using arrow::DataType; -using arrow::ListType; -using arrow::ListBuilder; -using arrow::Status; -using arrow::Table; -using arrow::Type; - -namespace BitUtil = arrow::BitUtil; +namespace arrow { +namespace py { // ---------------------------------------------------------------------- // Utility code -template -struct npy_traits {}; - -template <> -struct npy_traits { - typedef uint8_t value_type; - using TypeClass = arrow::BooleanType; - using BuilderClass = arrow::BooleanBuilder; - - static constexpr bool supports_nulls = false; - static inline bool isnull(uint8_t v) { return false; } -}; - -#define NPY_INT_DECL(TYPE, CapType, T) \ - template <> \ - struct npy_traits { \ - typedef T value_type; \ - using TypeClass = arrow::CapType##Type; \ - using BuilderClass = arrow::CapType##Builder; \ - \ - static constexpr bool supports_nulls = false; \ - static inline bool isnull(T v) { return false; } \ - }; - -NPY_INT_DECL(INT8, Int8, int8_t); -NPY_INT_DECL(INT16, Int16, int16_t); -NPY_INT_DECL(INT32, Int32, int32_t); -NPY_INT_DECL(INT64, Int64, int64_t); +int cast_npy_type_compat(int type_num) { +// Both LONGLONG and INT64 can be observed in the wild, which is buggy. We set +// U/LONGLONG to U/INT64 so things work properly. -NPY_INT_DECL(UINT8, UInt8, uint8_t); -NPY_INT_DECL(UINT16, UInt16, uint16_t); -NPY_INT_DECL(UINT32, UInt32, uint32_t); -NPY_INT_DECL(UINT64, UInt64, uint64_t); - -#if NPY_INT64 != NPY_LONGLONG -NPY_INT_DECL(LONGLONG, Int64, int64_t); -NPY_INT_DECL(ULONGLONG, UInt64, uint64_t); +#if (NPY_INT64 == NPY_LONGLONG) && (NPY_SIZEOF_LONGLONG == 8) + if (type_num == NPY_LONGLONG) { type_num = NPY_INT64; } + if (type_num == NPY_ULONGLONG) { type_num = NPY_UINT64; } #endif -template <> -struct npy_traits { - typedef float value_type; - using TypeClass = arrow::FloatType; - using BuilderClass = arrow::FloatBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(float v) { return v != v; } -}; - -template <> -struct npy_traits { - typedef double value_type; - using TypeClass = arrow::DoubleType; - using BuilderClass = arrow::DoubleBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(double v) { return v != v; } -}; - -template <> -struct npy_traits { - typedef int64_t value_type; - using TypeClass = arrow::TimestampType; - using BuilderClass = arrow::TimestampBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(int64_t v) { - // NaT = -2**63 - // = -0x8000000000000000 - // = -9223372036854775808; - // = std::numeric_limits::min() - return v == std::numeric_limits::min(); - } -}; - -template <> -struct npy_traits { - typedef PyObject* value_type; - static constexpr bool supports_nulls = true; -}; + return type_num; +} static inline bool PyObject_is_null(const PyObject* obj) { return obj == Py_None || obj == numpy_nan; @@ -181,8 +99,24 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) return null_count; } +// Returns null count +static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { + int64_t null_count = 0; + const uint8_t* mask_values = static_cast(PyArray_DATA(mask)); + // TODO(wesm): strided null mask + for (int i = 0; i < length; ++i) { + if (mask_values[i]) { + ++null_count; + } else { + BitUtil::SetBit(bitmap, i); + } + } + return null_count; +} + template -static int64_t ValuesToBytemap(const void* data, int64_t length, uint8_t* valid_bytes) { +static int64_t ValuesToValidBytes( + const void* data, int64_t length, uint8_t* valid_bytes) { typedef npy_traits traits; typedef typename traits::value_type T; @@ -214,7 +148,7 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { return Status::OK(); } -Status AppendObjectStrings(arrow::StringBuilder& string_builder, PyObject** objects, +Status AppendObjectStrings(StringBuilder& string_builder, PyObject** objects, int64_t objects_length, bool* have_bytes) { PyObject* obj; @@ -242,360 +176,561 @@ Status AppendObjectStrings(arrow::StringBuilder& string_builder, PyObject** obje return Status::OK(); } -template -struct arrow_traits {}; +template +struct WrapBytes {}; template <> -struct arrow_traits { - static constexpr int npy_type = NPY_BOOL; - static constexpr bool supports_nulls = false; - static constexpr bool is_boolean = true; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = false; +struct WrapBytes { + static inline PyObject* Wrap(const uint8_t* data, int64_t length) { + return PyUnicode_FromStringAndSize(reinterpret_cast(data), length); + } }; -#define INT_DECL(TYPE) \ - template <> \ - struct arrow_traits { \ - static constexpr int npy_type = NPY_##TYPE; \ - static constexpr bool supports_nulls = false; \ - static constexpr double na_value = NAN; \ - static constexpr bool is_boolean = false; \ - static constexpr bool is_numeric_not_nullable = true; \ - static constexpr bool is_numeric_nullable = false; \ - typedef typename npy_traits::value_type T; \ - }; - -INT_DECL(INT8); -INT_DECL(INT16); -INT_DECL(INT32); -INT_DECL(INT64); -INT_DECL(UINT8); -INT_DECL(UINT16); -INT_DECL(UINT32); -INT_DECL(UINT64); - template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT32; - static constexpr bool supports_nulls = true; - static constexpr float na_value = NAN; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; - typedef typename npy_traits::value_type T; +struct WrapBytes { + static inline PyObject* Wrap(const uint8_t* data, int64_t length) { + return PyBytes_FromStringAndSize(reinterpret_cast(data), length); + } }; -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT64; - static constexpr bool supports_nulls = true; - static constexpr double na_value = NAN; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; - typedef typename npy_traits::value_type T; -}; +static inline bool ListTypeSupported(const Type::type type_id) { + switch (type_id) { + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::UINT32: + case Type::INT32: + case Type::INT64: + case Type::UINT64: + case Type::FLOAT: + case Type::DOUBLE: + case Type::STRING: + case Type::TIMESTAMP: + // The above types are all supported. + return true; + default: + break; + } + return false; +} -static constexpr int64_t kPandasTimestampNull = std::numeric_limits::min(); +// ---------------------------------------------------------------------- +// Conversion from NumPy-in-Pandas to Arrow -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_DATETIME; - static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = kPandasTimestampNull; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; - typedef typename npy_traits::value_type T; -}; +class PandasConverter : public TypeVisitor { + public: + PandasConverter( + MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr& type) + : pool_(pool), + type_(type), + arr_(reinterpret_cast(ao)), + mask_(nullptr) { + if (mo != nullptr and mo != Py_None) { mask_ = reinterpret_cast(mo); } + length_ = PyArray_SIZE(arr_); + } -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_DATETIME; - static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = kPandasTimestampNull; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = true; - typedef typename npy_traits::value_type T; -}; + bool is_strided() const { + npy_intp* astrides = PyArray_STRIDES(arr_); + return astrides[0] != PyArray_DESCR(arr_)->elsize; + } -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = false; -}; + Status InitNullBitmap() { + int null_bytes = BitUtil::BytesForBits(length_); -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; - static constexpr bool is_boolean = false; - static constexpr bool is_numeric_not_nullable = false; - static constexpr bool is_numeric_nullable = false; -}; + null_bitmap_ = std::make_shared(pool_); + RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); -template -struct WrapBytes {}; + null_bitmap_data_ = null_bitmap_->mutable_data(); + memset(null_bitmap_data_, 0, null_bytes); -template <> -struct WrapBytes { - static inline PyObject* Wrap(const uint8_t* data, int64_t length) { - return PyUnicode_FromStringAndSize(reinterpret_cast(data), length); + return Status::OK(); } -}; -template <> -struct WrapBytes { - static inline PyObject* Wrap(const uint8_t* data, int64_t length) { - return PyBytes_FromStringAndSize(reinterpret_cast(data), length); - } -}; + // ---------------------------------------------------------------------- + // Traditional visitor conversion for non-object arrays -inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) { - if (type == NPY_DATETIME) { - PyArray_Descr* descr = PyArray_DESCR(out); - auto date_dtype = reinterpret_cast(descr->c_metadata); - if (datatype->type == Type::TIMESTAMP) { - auto timestamp_type = static_cast(datatype); + template + Status ConvertData(std::shared_ptr* data); - switch (timestamp_type->unit) { - case arrow::TimestampType::Unit::SECOND: - date_dtype->meta.base = NPY_FR_s; - break; - case arrow::TimestampType::Unit::MILLI: - date_dtype->meta.base = NPY_FR_ms; - break; - case arrow::TimestampType::Unit::MICRO: - date_dtype->meta.base = NPY_FR_us; - break; - case arrow::TimestampType::Unit::NANO: - date_dtype->meta.base = NPY_FR_ns; - break; - } - } else { - // datatype->type == Type::DATE - date_dtype->meta.base = NPY_FR_D; + template + Status VisitNative() { + using traits = arrow_traits; + + if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); } + + std::shared_ptr data; + RETURN_NOT_OK(ConvertData(&data)); + + int64_t null_count = 0; + if (mask_ != nullptr) { + null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); + } else if (traits::supports_nulls) { + // TODO(wesm): this presumes the NumPy C type and arrow C type are the + // same + null_count = ValuesToBitmap( + PyArray_DATA(arr_), length_, null_bitmap_data_); } + + std::vector fields(1); + fields[0].length = length_; + fields[0].null_count = null_count; + fields[0].offset = 0; + + return LoadArray(type_, fields, {null_bitmap_, data}, &out_); } -} -template -inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - // Upcast to double, set NaN as appropriate +#define VISIT_NATIVE(TYPE) \ + Status Visit(const TYPE& type) override { return VisitNative(); } - for (int i = 0; i < arr->length(); ++i) { - *out_values++ = prim_arr->IsNull(i) ? NAN : in_values[i]; + VISIT_NATIVE(BooleanType); + VISIT_NATIVE(Int8Type); + VISIT_NATIVE(Int16Type); + VISIT_NATIVE(Int32Type); + VISIT_NATIVE(Int64Type); + VISIT_NATIVE(UInt8Type); + VISIT_NATIVE(UInt16Type); + VISIT_NATIVE(UInt32Type); + VISIT_NATIVE(UInt64Type); + VISIT_NATIVE(FloatType); + VISIT_NATIVE(DoubleType); + VISIT_NATIVE(TimestampType); + +#undef VISIT_NATIVE + + Status Convert(std::shared_ptr* out) { + if (PyArray_NDIM(arr_) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); } + // TODO(wesm): strided arrays + if (is_strided()) { return Status::Invalid("no support for strided data yet"); } + + if (type_ == nullptr) { return Status::Invalid("Must pass data type"); } + + // Visit the type to perform conversion + RETURN_NOT_OK(type_->Accept(this)); + + *out = out_; + return Status::OK(); } -} -template -inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - memcpy(out_values, in_values, sizeof(T) * arr->length()); - out_values += arr->length(); + // ---------------------------------------------------------------------- + // Conversion logic for various object dtype arrays + + template + Status ConvertTypedLists( + const std::shared_ptr& type, std::shared_ptr* out); + + Status ConvertObjectStrings(std::shared_ptr* out); + Status ConvertBooleans(std::shared_ptr* out); + Status ConvertDates(std::shared_ptr* out); + Status ConvertLists(const std::shared_ptr& type, std::shared_ptr* out); + Status ConvertObjects(std::shared_ptr* out); + + protected: + MemoryPool* pool_; + std::shared_ptr type_; + PyArrayObject* arr_; + PyArrayObject* mask_; + int64_t length_; + + // Used in visitor pattern + std::shared_ptr out_; + + std::shared_ptr null_bitmap_; + uint8_t* null_bitmap_data_; +}; + +template +inline Status PandasConverter::ConvertData(std::shared_ptr* data) { + using traits = arrow_traits; + + // Handle LONGLONG->INT64 and other fun things + int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num); + + if (traits::npy_type != type_num_compat) { + return Status::NotImplemented("NumPy type casts not yet implemented"); } + + *data = std::make_shared(arr_); + return Status::OK(); } -template -inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values = in_values[i]; - } +template <> +inline Status PandasConverter::ConvertData(std::shared_ptr* data) { + int nbytes = BitUtil::BytesForBits(length_); + auto buffer = std::make_shared(pool_); + RETURN_NOT_OK(buffer->Resize(nbytes)); + + const uint8_t* values = reinterpret_cast(PyArray_DATA(arr_)); + + uint8_t* bitmap = buffer->mutable_data(); + + memset(bitmap, 0, nbytes); + for (int i = 0; i < length_; ++i) { + if (values[i] > 0) { BitUtil::SetBit(bitmap, i); } } + + *data = buffer; + return Status::OK(); } -static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) { +Status PandasConverter::ConvertDates(std::shared_ptr* out) { PyAcquireGIL lock; - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto bool_arr = static_cast(arr.get()); - for (int64_t i = 0; i < arr->length(); ++i) { - if (bool_arr->IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else if (bool_arr->Value(i)) { - // True - Py_INCREF(Py_True); - *out_values++ = Py_True; - } else { - // False - Py_INCREF(Py_False); - *out_values++ = Py_False; - } + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + DateBuilder date_builder(pool_); + RETURN_NOT_OK(date_builder.Resize(length_)); + + Status s; + PyObject* obj; + for (int64_t i = 0; i < length_; ++i) { + obj = objects[i]; + if (PyDate_CheckExact(obj)) { + PyDateTime_Date* pydate = reinterpret_cast(obj); + date_builder.Append(PyDate_to_ms(pydate)); + } else { + date_builder.AppendNull(); } } - return Status::OK(); + return date_builder.Finish(out); } -static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto bool_arr = static_cast(arr.get()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = static_cast(bool_arr->Value(i)); - } +Status PandasConverter::ConvertObjectStrings(std::shared_ptr* out) { + PyAcquireGIL lock; + + // The output type at this point is inconclusive because there may be bytes + // and unicode mixed in the object array + + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + StringBuilder string_builder(pool_); + RETURN_NOT_OK(string_builder.Resize(length_)); + + Status s; + bool have_bytes = false; + RETURN_NOT_OK(AppendObjectStrings(string_builder, objects, length_, &have_bytes)); + RETURN_NOT_OK(string_builder.Finish(out)); + + if (have_bytes) { + const auto& arr = static_cast(*out->get()); + *out = std::make_shared(arr.length(), arr.value_offsets(), arr.data(), + arr.null_bitmap(), arr.null_count()); } + return Status::OK(); } -template -inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) { +Status PandasConverter::ConvertBooleans(std::shared_ptr* out) { PyAcquireGIL lock; - for (int c = 0; c < data.num_chunks(); c++) { - auto arr = static_cast(data.chunk(c).get()); - const uint8_t* data_ptr; - int32_t length; - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr->length(); ++i) { - if (has_nulls && arr->IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - data_ptr = arr->GetValue(i, &length); - *out_values = WrapBytes::Wrap(data_ptr, length); - if (*out_values == nullptr) { - PyErr_Clear(); - std::stringstream ss; - ss << "Wrapping " - << std::string(reinterpret_cast(data_ptr), length) << " failed"; - return Status::UnknownError(ss.str()); - } - } - ++out_values; + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + + int nbytes = BitUtil::BytesForBits(length_); + auto data = std::make_shared(pool_); + RETURN_NOT_OK(data->Resize(nbytes)); + uint8_t* bitmap = data->mutable_data(); + memset(bitmap, 0, nbytes); + + int64_t null_count = 0; + for (int64_t i = 0; i < length_; ++i) { + if (objects[i] == Py_True) { + BitUtil::SetBit(bitmap, i); + BitUtil::SetBit(null_bitmap_data_, i); + } else if (objects[i] != Py_False) { + ++null_count; + } else { + BitUtil::SetBit(null_bitmap_data_, i); } } + + *out = std::make_shared(length_, data, null_bitmap_, null_count); + return Status::OK(); } -template -inline Status ConvertListsLike( - const std::shared_ptr& col, PyObject** out_values) { - const ChunkedArray& data = *col->data().get(); - auto list_type = std::static_pointer_cast(col->type()); +Status PandasConverter::ConvertObjects(std::shared_ptr* out) { + // Python object arrays are annoying, since we could have one of: + // + // * Strings + // * Booleans with nulls + // * Mixed type (not supported at the moment by arrow format) + // + // Additionally, nulls may be encoded either as np.nan or None. So we have to + // do some type inference and conversion - // Get column of underlying value arrays - std::vector> value_arrays; - for (int c = 0; c < data.num_chunks(); c++) { - auto arr = std::static_pointer_cast(data.chunk(c)); - value_arrays.emplace_back(arr->values()); - } - auto flat_column = std::make_shared(list_type->value_field(), value_arrays); - // TODO(ARROW-489): Currently we don't have a Python reference for single columns. - // Storing a reference to the whole Array would be to expensive. - PyObject* numpy_array; - RETURN_NOT_OK(ConvertColumnToPandas(flat_column, nullptr, &numpy_array)); + RETURN_NOT_OK(InitNullBitmap()); - PyAcquireGIL lock; + // TODO: mask not supported here + if (mask_ != nullptr) { + return Status::NotImplemented("mask not supported in object conversions yet"); + } - for (int c = 0; c < data.num_chunks(); c++) { - auto arr = std::static_pointer_cast(data.chunk(c)); + const PyObject** objects; + { + PyAcquireGIL lock; + objects = reinterpret_cast(PyArray_DATA(arr_)); + PyDateTime_IMPORT; + } - const uint8_t* data_ptr; - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr->length(); ++i) { - if (has_nulls && arr->IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; + if (type_) { + switch (type_->type) { + case Type::STRING: + return ConvertObjectStrings(out); + case Type::BOOL: + return ConvertBooleans(out); + case Type::DATE: + return ConvertDates(out); + case Type::LIST: { + const auto& list_field = static_cast(*type_); + return ConvertLists(list_field.value_field()->type, out); + } + default: + return Status::TypeError("No known conversion to Arrow type"); + } + } else { + for (int64_t i = 0; i < length_; ++i) { + if (PyObject_is_null(objects[i])) { + continue; + } else if (PyObject_is_string(objects[i])) { + return ConvertObjectStrings(out); + } else if (PyBool_Check(objects[i])) { + return ConvertBooleans(out); + } else if (PyDate_CheckExact(objects[i])) { + return ConvertDates(out); } else { - PyObject* start = PyLong_FromLong(arr->value_offset(i)); - PyObject* end = PyLong_FromLong(arr->value_offset(i + 1)); - PyObject* slice = PySlice_New(start, end, NULL); - *out_values = PyObject_GetItem(numpy_array, slice); - Py_DECREF(start); - Py_DECREF(end); - Py_DECREF(slice); + return Status::TypeError("unhandled python type"); } - ++out_values; } } - Py_XDECREF(numpy_array); - return Status::OK(); + return Status::TypeError("Unable to infer type of object array, were all null"); } -template -inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); +template +inline Status PandasConverter::ConvertTypedLists( + const std::shared_ptr& type, std::shared_ptr* out) { + typedef npy_traits traits; + typedef typename traits::value_type T; + typedef typename traits::BuilderClass BuilderT; - const uint8_t* valid_bits = arr->null_bitmap_data(); + PyAcquireGIL lock; - if (arr->null_count() > 0) { - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? na_value : in_values[i]; + auto value_builder = std::make_shared(pool_, type); + ListBuilder list_builder(pool_, value_builder); + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + for (int64_t i = 0; i < length_; ++i) { + if (PyObject_is_null(objects[i])) { + RETURN_NOT_OK(list_builder.AppendNull()); + } else if (PyArray_Check(objects[i])) { + auto numpy_array = reinterpret_cast(objects[i]); + RETURN_NOT_OK(list_builder.Append(true)); + + // TODO(uwe): Support more complex numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE)); + + int64_t size = PyArray_DIM(numpy_array, 0); + auto data = reinterpret_cast(PyArray_DATA(numpy_array)); + if (traits::supports_nulls) { + null_bitmap_->Resize(size, false); + // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't + // currently support this. + // ValuesToBitmap(data, size, null_bitmap_->mutable_data()); + ValuesToValidBytes(data, size, null_bitmap_->mutable_data()); + RETURN_NOT_OK(value_builder->Append(data, size, null_bitmap_->data())); + } else { + RETURN_NOT_OK(value_builder->Append(data, size)); + } + + } else if (PyList_Check(objects[i])) { + int64_t size; + std::shared_ptr inferred_type; + RETURN_NOT_OK(list_builder.Append(true)); + RETURN_NOT_OK(InferArrowType(objects[i], &size, &inferred_type)); + if (inferred_type->type != type->type) { + std::stringstream ss; + ss << inferred_type->ToString() << " cannot be converted to " << type->ToString(); + return Status::TypeError(ss.str()); } + RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder)); } else { - memcpy(out_values, in_values, sizeof(T) * arr->length()); - out_values += arr->length(); + return Status::TypeError("Unsupported Python type for list items"); } } + return list_builder.Finish(out); } -template -inline void ConvertNumericNullableCast( - const ChunkedArray& data, OutType na_value, OutType* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); +template <> +inline Status PandasConverter::ConvertTypedLists( + const std::shared_ptr& type, std::shared_ptr* out) { + PyAcquireGIL lock; + // TODO: If there are bytes involed, convert to Binary representation + bool have_bytes = false; - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? na_value : static_cast(in_values[i]); + auto value_builder = std::make_shared(pool_); + ListBuilder list_builder(pool_, value_builder); + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + for (int64_t i = 0; i < length_; ++i) { + if (PyObject_is_null(objects[i])) { + RETURN_NOT_OK(list_builder.AppendNull()); + } else if (PyArray_Check(objects[i])) { + auto numpy_array = reinterpret_cast(objects[i]); + RETURN_NOT_OK(list_builder.Append(true)); + + // TODO(uwe): Support more complex numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); + + int64_t size = PyArray_DIM(numpy_array, 0); + auto data = reinterpret_cast(PyArray_DATA(numpy_array)); + RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes)); + } else if (PyList_Check(objects[i])) { + int64_t size; + std::shared_ptr inferred_type; + RETURN_NOT_OK(list_builder.Append(true)); + RETURN_NOT_OK(InferArrowType(objects[i], &size, &inferred_type)); + if (inferred_type->type != Type::STRING) { + std::stringstream ss; + ss << inferred_type->ToString() << " cannot be converted to STRING."; + return Status::TypeError(ss.str()); + } + RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder)); + } else { + return Status::TypeError("Unsupported Python type for list items"); } } + return list_builder.Finish(out); } -template -inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); - - for (int64_t i = 0; i < arr->length(); ++i) { - // There are 1000 * 60 * 60 * 24 = 86400000ms in a day - *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / 86400000; - } +#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ + case Type::TYPE: { \ + return ConvertTypedLists(type, out); \ + } + +Status PandasConverter::ConvertLists( + const std::shared_ptr& type, std::shared_ptr* out) { + switch (type->type) { + LIST_CASE(UINT8, NPY_UINT8, UInt8Type) + LIST_CASE(INT8, NPY_INT8, Int8Type) + LIST_CASE(UINT16, NPY_UINT16, UInt16Type) + LIST_CASE(INT16, NPY_INT16, Int16Type) + LIST_CASE(UINT32, NPY_UINT32, UInt32Type) + LIST_CASE(INT32, NPY_INT32, Int32Type) + LIST_CASE(UINT64, NPY_UINT64, UInt64Type) + LIST_CASE(INT64, NPY_INT64, Int64Type) + LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType) + LIST_CASE(FLOAT, NPY_FLOAT, FloatType) + LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) + LIST_CASE(STRING, NPY_OBJECT, StringType) + default: + return Status::TypeError("Unknown list item type"); } + + return Status::TypeError("Unknown list type"); } -template -inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) { - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto prim_arr = static_cast(arr.get()); - auto in_values = reinterpret_cast(prim_arr->data()->data()); +Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr& type, std::shared_ptr* out) { + PandasConverter converter(pool, ao, mo, type); + return converter.Convert(out); +} - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = arr->IsNull(i) ? kPandasTimestampNull - : (static_cast(in_values[i]) * SHIFT); +Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr& type, std::shared_ptr* out) { + PandasConverter converter(pool, ao, mo, type); + return converter.ConvertObjects(out); +} + +Status PandasDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { + PyArray_Descr* descr = reinterpret_cast(dtype); + + int type_num = cast_npy_type_compat(descr->type_num); + +#define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \ + case NPY_##NPY_NAME: \ + *out = FACTORY(); \ + break; + + switch (type_num) { + TO_ARROW_TYPE_CASE(BOOL, boolean); + TO_ARROW_TYPE_CASE(INT8, int8); + TO_ARROW_TYPE_CASE(INT16, int16); + TO_ARROW_TYPE_CASE(INT32, int32); + TO_ARROW_TYPE_CASE(INT64, int64); +#if (NPY_INT64 != NPY_LONGLONG) + TO_ARROW_TYPE_CASE(LONGLONG, int64); +#endif + TO_ARROW_TYPE_CASE(UINT8, uint8); + TO_ARROW_TYPE_CASE(UINT16, uint16); + TO_ARROW_TYPE_CASE(UINT32, uint32); + TO_ARROW_TYPE_CASE(UINT64, uint64); +#if (NPY_UINT64 != NPY_ULONGLONG) + TO_ARROW_CASE(ULONGLONG); +#endif + TO_ARROW_TYPE_CASE(FLOAT32, float32); + TO_ARROW_TYPE_CASE(FLOAT64, float64); + case NPY_DATETIME: { + auto date_dtype = + reinterpret_cast(descr->c_metadata); + TimeUnit unit; + switch (date_dtype->meta.base) { + case NPY_FR_s: + unit = TimeUnit::SECOND; + break; + case NPY_FR_ms: + unit = TimeUnit::MILLI; + break; + case NPY_FR_us: + unit = TimeUnit::MICRO; + break; + case NPY_FR_ns: + unit = TimeUnit::NANO; + break; + default: + return Status::NotImplemented("Unsupported datetime64 time unit"); + } + *out = timestamp(unit); + } break; + default: { + std::stringstream ss; + ss << "Unsupported numpy type " << descr->type_num << std::endl; + return Status::NotImplemented(ss.str()); } } + +#undef TO_ARROW_TYPE_CASE + + return Status::OK(); } // ---------------------------------------------------------------------- // pandas 0.x DataFrame conversion internals +inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) { + if (type == NPY_DATETIME) { + PyArray_Descr* descr = PyArray_DESCR(out); + auto date_dtype = reinterpret_cast(descr->c_metadata); + if (datatype->type == Type::TIMESTAMP) { + auto timestamp_type = static_cast(datatype); + + switch (timestamp_type->unit) { + case TimestampType::Unit::SECOND: + date_dtype->meta.base = NPY_FR_s; + break; + case TimestampType::Unit::MILLI: + date_dtype->meta.base = NPY_FR_ms; + break; + case TimestampType::Unit::MICRO: + date_dtype->meta.base = NPY_FR_us; + break; + case TimestampType::Unit::NANO: + date_dtype->meta.base = NPY_FR_ns; + break; + } + } else { + // datatype->type == Type::DATE + date_dtype->meta.base = NPY_FR_D; + } + } +} + class PandasBlock { public: enum type { @@ -688,10 +823,219 @@ class PandasBlock { DISALLOW_COPY_AND_ASSIGN(PandasBlock); }; -#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ - case Type::ArrowEnum: \ - RETURN_NOT_OK((ConvertListsLike<::arrow::ArrowType>(col, out_buffer))); \ - break; +template +inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + // Upcast to double, set NaN as appropriate + + for (int i = 0; i < arr->length(); ++i) { + *out_values++ = prim_arr->IsNull(i) ? NAN : in_values[i]; + } + } +} + +template +inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + memcpy(out_values, in_values, sizeof(T) * arr->length()); + out_values += arr->length(); + } +} + +template +inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values = in_values[i]; + } + } +} + +static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto bool_arr = static_cast(arr.get()); + + for (int64_t i = 0; i < arr->length(); ++i) { + if (bool_arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values++ = Py_None; + } else if (bool_arr->Value(i)) { + // True + Py_INCREF(Py_True); + *out_values++ = Py_True; + } else { + // False + Py_INCREF(Py_False); + *out_values++ = Py_False; + } + } + } + return Status::OK(); +} + +static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto bool_arr = static_cast(arr.get()); + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = static_cast(bool_arr->Value(i)); + } + } +} + +template +inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + for (int c = 0; c < data.num_chunks(); c++) { + auto arr = static_cast(data.chunk(c).get()); + + const uint8_t* data_ptr; + int32_t length; + const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr->length(); ++i) { + if (has_nulls && arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + data_ptr = arr->GetValue(i, &length); + *out_values = WrapBytes::Wrap(data_ptr, length); + if (*out_values == nullptr) { + PyErr_Clear(); + std::stringstream ss; + ss << "Wrapping " + << std::string(reinterpret_cast(data_ptr), length) << " failed"; + return Status::UnknownError(ss.str()); + } + } + ++out_values; + } + } + return Status::OK(); +} + +template +inline Status ConvertListsLike( + const std::shared_ptr& col, PyObject** out_values) { + const ChunkedArray& data = *col->data().get(); + auto list_type = std::static_pointer_cast(col->type()); + + // Get column of underlying value arrays + std::vector> value_arrays; + for (int c = 0; c < data.num_chunks(); c++) { + auto arr = std::static_pointer_cast(data.chunk(c)); + value_arrays.emplace_back(arr->values()); + } + auto flat_column = std::make_shared(list_type->value_field(), value_arrays); + // TODO(ARROW-489): Currently we don't have a Python reference for single columns. + // Storing a reference to the whole Array would be to expensive. + PyObject* numpy_array; + RETURN_NOT_OK(ConvertColumnToPandas(flat_column, nullptr, &numpy_array)); + + PyAcquireGIL lock; + + for (int c = 0; c < data.num_chunks(); c++) { + auto arr = std::static_pointer_cast(data.chunk(c)); + + const uint8_t* data_ptr; + const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr->length(); ++i) { + if (has_nulls && arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + PyObject* start = PyLong_FromLong(arr->value_offset(i)); + PyObject* end = PyLong_FromLong(arr->value_offset(i + 1)); + PyObject* slice = PySlice_New(start, end, NULL); + *out_values = PyObject_GetItem(numpy_array, slice); + Py_DECREF(start); + Py_DECREF(end); + Py_DECREF(slice); + } + ++out_values; + } + } + + Py_XDECREF(numpy_array); + return Status::OK(); +} + +template +inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + const uint8_t* valid_bits = arr->null_bitmap_data(); + + if (arr->null_count() > 0) { + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? na_value : in_values[i]; + } + } else { + memcpy(out_values, in_values, sizeof(T) * arr->length()); + out_values += arr->length(); + } + } +} + +template +inline void ConvertNumericNullableCast( + const ChunkedArray& data, OutType na_value, OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = arr->IsNull(i) ? na_value : static_cast(in_values[i]); + } + } +} + +template +inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + for (int64_t i = 0; i < arr->length(); ++i) { + // There are 1000 * 60 * 60 * 24 = 86400000ms in a day + *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / 86400000; + } + } +} + +template +inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const std::shared_ptr arr = data.chunk(c); + auto prim_arr = static_cast(arr.get()); + auto in_values = reinterpret_cast(prim_arr->data()->data()); + + for (int64_t i = 0; i < arr->length(); ++i) { + *out_values++ = arr->IsNull(i) ? kPandasTimestampNull + : (static_cast(in_values[i]) * SHIFT); + } + } +} + +#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ + case Type::ArrowEnum: \ + RETURN_NOT_OK((ConvertListsLike(col, out_buffer))); \ + break; class ObjectBlock : public PandasBlock { public: @@ -712,9 +1056,9 @@ class ObjectBlock : public PandasBlock { if (type == Type::BOOL) { RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer)); } else if (type == Type::BINARY) { - RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); } else if (type == Type::STRING) { - RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast(col->type()); switch (list_type->value_type()->type) { @@ -880,8 +1224,8 @@ class DatetimeBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { - RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME)); + Status AllocateDatetime(int ndim) { + RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME, ndim)); PyAcquireGIL lock; auto date_dtype = reinterpret_cast( @@ -890,6 +1234,8 @@ class DatetimeBlock : public PandasBlock { return Status::OK(); } + Status Allocate() override { return AllocateDatetime(2); } + Status Write(const std::shared_ptr& col, int64_t abs_placement, int64_t rel_placement) override { Type::type type = col->type()->type; @@ -904,15 +1250,15 @@ class DatetimeBlock : public PandasBlock { // TODO(wesm): Do we want to make sure to zero out the milliseconds? ConvertDatetimeNanos(data, out_buffer); } else if (type == Type::TIMESTAMP) { - auto ts_type = static_cast(col->type().get()); + auto ts_type = static_cast(col->type().get()); - if (ts_type->unit == arrow::TimeUnit::NANO) { + if (ts_type->unit == TimeUnit::NANO) { ConvertNumericNullable(data, kPandasTimestampNull, out_buffer); - } else if (ts_type->unit == arrow::TimeUnit::MICRO) { + } else if (ts_type->unit == TimeUnit::MICRO) { ConvertDatetimeNanos(data, out_buffer); - } else if (ts_type->unit == arrow::TimeUnit::MILLI) { + } else if (ts_type->unit == TimeUnit::MILLI) { ConvertDatetimeNanos(data, out_buffer); - } else if (ts_type->unit == arrow::TimeUnit::SECOND) { + } else if (ts_type->unit == TimeUnit::SECOND) { ConvertDatetimeNanos(data, out_buffer); } else { return Status::NotImplemented("Unsupported time unit"); @@ -931,6 +1277,9 @@ class DatetimeTZBlock : public DatetimeBlock { DatetimeTZBlock(const std::string& timezone, int64_t num_rows) : DatetimeBlock(num_rows, 1), timezone_(timezone) {} + // Like Categorical, the internal ndarray is 1-dimensional + Status Allocate() override { return AllocateDatetime(1); } + Status GetPyResult(PyObject** output) override { PyObject* result = PyDict_New(); RETURN_IF_PYERROR(); @@ -977,9 +1326,8 @@ class CategoricalBlock : public PandasBlock { for (int c = 0; c < data.num_chunks(); c++) { const std::shared_ptr arr = data.chunk(c); - const auto& dict_arr = static_cast(*arr); - const auto& indices = - static_cast(*dict_arr.indices()); + const auto& dict_arr = static_cast(*arr); + const auto& indices = static_cast(*dict_arr.indices()); auto in_values = reinterpret_cast(indices.data()->data()); // Null is -1 in CategoricalBlock @@ -1046,28 +1394,6 @@ Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, return (*block)->Allocate(); } -static inline bool ListTypeSupported(const Type::type type_id) { - switch (type_id) { - case Type::UINT8: - case Type::INT8: - case Type::UINT16: - case Type::INT16: - case Type::UINT32: - case Type::INT32: - case Type::INT64: - case Type::UINT64: - case Type::FLOAT: - case Type::DOUBLE: - case Type::STRING: - case Type::TIMESTAMP: - // The above types are all supported. - return true; - default: - break; - } - return false; -} - static inline Status MakeCategoricalBlock(const std::shared_ptr& type, int64_t num_rows, std::shared_ptr* block) { // All categoricals become a block with a single column @@ -1168,7 +1494,7 @@ class DataFrameBlockCreator { output_type = PandasBlock::DATETIME; break; case Type::TIMESTAMP: { - const auto& ts_type = static_cast(*col->type()); + const auto& ts_type = static_cast(*col->type()); if (ts_type.timezone != "") { output_type = PandasBlock::DATETIME_WITH_TZ; } else { @@ -1182,636 +1508,165 @@ class DataFrameBlockCreator { ss << "Not implemented type for lists: " << list_type->value_type()->ToString(); return Status::NotImplemented(ss.str()); - } - output_type = PandasBlock::OBJECT; - } break; - case Type::DICTIONARY: - output_type = PandasBlock::CATEGORICAL; - break; - default: - return Status::NotImplemented(col->type()->ToString()); - } - - int block_placement = 0; - std::shared_ptr block; - if (output_type == PandasBlock::CATEGORICAL) { - RETURN_NOT_OK(MakeCategoricalBlock(col->type(), table_->num_rows(), &block)); - categorical_blocks_[i] = block; - } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { - const auto& ts_type = static_cast(*col->type()); - block = std::make_shared(ts_type.timezone, table_->num_rows()); - RETURN_NOT_OK(block->Allocate()); - datetimetz_blocks_[i] = block; - } else { - auto it = type_counts_.find(output_type); - if (it != type_counts_.end()) { - block_placement = it->second; - // Increment count - it->second += 1; - } else { - // Add key to map - type_counts_[output_type] = 1; - } - } - - column_types_[i] = output_type; - column_block_placement_[i] = block_placement; - } - - // Create normal non-categorical blocks - for (const auto& it : type_counts_) { - PandasBlock::type type = static_cast(it.first); - std::shared_ptr block; - RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block)); - blocks_[type] = block; - } - return Status::OK(); - } - - Status WriteTableToBlocks(int nthreads) { - auto WriteColumn = [this](int i) { - std::shared_ptr col = this->table_->column(i); - PandasBlock::type output_type = this->column_types_[i]; - - int rel_placement = this->column_block_placement_[i]; - - std::shared_ptr block; - if (output_type == PandasBlock::CATEGORICAL) { - auto it = this->categorical_blocks_.find(i); - if (it == this->blocks_.end()) { - return Status::KeyError("No categorical block allocated"); - } - block = it->second; - } else { - auto it = this->blocks_.find(output_type); - if (it == this->blocks_.end()) { return Status::KeyError("No block allocated"); } - block = it->second; - } - return block->Write(col, i, rel_placement); - }; - - nthreads = std::min(nthreads, table_->num_columns()); - - if (nthreads == 1) { - for (int i = 0; i < table_->num_columns(); ++i) { - RETURN_NOT_OK(WriteColumn(i)); - } - } else { - std::vector thread_pool; - thread_pool.reserve(nthreads); - std::atomic task_counter(0); - - std::mutex error_mtx; - bool error_occurred = false; - Status error; - - for (int thread_id = 0; thread_id < nthreads; ++thread_id) { - thread_pool.emplace_back( - [this, &error, &error_occurred, &error_mtx, &task_counter, &WriteColumn]() { - int column_num; - while (!error_occurred) { - column_num = task_counter.fetch_add(1); - if (column_num >= this->table_->num_columns()) { break; } - Status s = WriteColumn(column_num); - if (!s.ok()) { - std::lock_guard lock(error_mtx); - error_occurred = true; - error = s; - break; - } - } - }); - } - for (auto&& thread : thread_pool) { - thread.join(); - } - - if (error_occurred) { return error; } - } - return Status::OK(); - } - - Status AppendBlocks(const BlockMap& blocks, PyObject* list) { - for (const auto& it : blocks) { - PyObject* item; - RETURN_NOT_OK(it.second->GetPyResult(&item)); - if (PyList_Append(list, item) < 0) { RETURN_IF_PYERROR(); } - } - return Status::OK(); - } - - Status GetResultList(PyObject** out) { - PyAcquireGIL lock; - - PyObject* result = PyList_New(0); - RETURN_IF_PYERROR(); - - RETURN_NOT_OK(AppendBlocks(blocks_, result)); - RETURN_NOT_OK(AppendBlocks(categorical_blocks_, result)); - RETURN_NOT_OK(AppendBlocks(datetimetz_blocks_, result)); - - *out = result; - return Status::OK(); - } - - private: - std::shared_ptr table_; - - // column num -> block type id - std::vector column_types_; - - // column num -> relative placement within internal block - std::vector column_block_placement_; - - // block type -> type count - std::unordered_map type_counts_; - - // block type -> block - BlockMap blocks_; - - // column number -> categorical block - BlockMap categorical_blocks_; - - // column number -> datetimetz block - BlockMap datetimetz_blocks_; -}; - -Status ConvertTableToPandas( - const std::shared_ptr
& table, int nthreads, PyObject** out) { - DataFrameBlockCreator helper(table); - return helper.Convert(nthreads, out); -} - -// ---------------------------------------------------------------------- -// Serialization - -template -class ArrowSerializer { - public: - ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) - : pool_(pool), arr_(arr), mask_(mask) { - length_ = PyArray_SIZE(arr_); - } - - void IndicateType(const std::shared_ptr field) { field_indicator_ = field; } - - Status Convert(std::shared_ptr* out); - - int stride() const { return PyArray_STRIDES(arr_)[0]; } - - Status InitNullBitmap() { - int null_bytes = BitUtil::BytesForBits(length_); - - null_bitmap_ = std::make_shared(pool_); - RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); - - null_bitmap_data_ = null_bitmap_->mutable_data(); - memset(null_bitmap_data_, 0, null_bytes); - - return Status::OK(); - } - - bool is_strided() const { - npy_intp* astrides = PyArray_STRIDES(arr_); - return astrides[0] != PyArray_DESCR(arr_)->elsize; - } - - private: - Status ConvertData(); - - Status ConvertDates(std::shared_ptr* out) { - PyAcquireGIL lock; - - PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - arrow::DateBuilder date_builder(pool_); - RETURN_NOT_OK(date_builder.Resize(length_)); - - Status s; - PyObject* obj; - for (int64_t i = 0; i < length_; ++i) { - obj = objects[i]; - if (PyDate_CheckExact(obj)) { - PyDateTime_Date* pydate = reinterpret_cast(obj); - date_builder.Append(PyDate_to_ms(pydate)); - } else { - date_builder.AppendNull(); - } - } - return date_builder.Finish(out); - } - - Status ConvertObjectStrings(std::shared_ptr* out) { - PyAcquireGIL lock; - - // The output type at this point is inconclusive because there may be bytes - // and unicode mixed in the object array - - PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - arrow::StringBuilder string_builder(pool_); - RETURN_NOT_OK(string_builder.Resize(length_)); - - Status s; - bool have_bytes = false; - RETURN_NOT_OK(AppendObjectStrings(string_builder, objects, length_, &have_bytes)); - RETURN_NOT_OK(string_builder.Finish(out)); - - if (have_bytes) { - const auto& arr = static_cast(*out->get()); - *out = std::make_shared(arr.length(), arr.value_offsets(), - arr.data(), arr.null_bitmap(), arr.null_count()); - } - return Status::OK(); - } - - Status ConvertBooleans(std::shared_ptr* out) { - PyAcquireGIL lock; - - PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - - int nbytes = BitUtil::BytesForBits(length_); - auto data = std::make_shared(pool_); - RETURN_NOT_OK(data->Resize(nbytes)); - uint8_t* bitmap = data->mutable_data(); - memset(bitmap, 0, nbytes); - - int64_t null_count = 0; - for (int64_t i = 0; i < length_; ++i) { - if (objects[i] == Py_True) { - BitUtil::SetBit(bitmap, i); - BitUtil::SetBit(null_bitmap_data_, i); - } else if (objects[i] != Py_False) { - ++null_count; - } else { - BitUtil::SetBit(null_bitmap_data_, i); - } - } - - *out = std::make_shared(length_, data, null_bitmap_, null_count); - - return Status::OK(); - } - - template - Status ConvertTypedLists( - const std::shared_ptr& field, std::shared_ptr* out); - -#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ - case Type::TYPE: { \ - return ConvertTypedLists(field, out); \ - } - - Status ConvertLists(const std::shared_ptr& field, std::shared_ptr* out) { - switch (field->type->type) { - LIST_CASE(UINT8, NPY_UINT8, UInt8Type) - LIST_CASE(INT8, NPY_INT8, Int8Type) - LIST_CASE(UINT16, NPY_UINT16, UInt16Type) - LIST_CASE(INT16, NPY_INT16, Int16Type) - LIST_CASE(UINT32, NPY_UINT32, UInt32Type) - LIST_CASE(INT32, NPY_INT32, Int32Type) - LIST_CASE(UINT64, NPY_UINT64, UInt64Type) - LIST_CASE(INT64, NPY_INT64, Int64Type) - LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType) - LIST_CASE(FLOAT, NPY_FLOAT, FloatType) - LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) - LIST_CASE(STRING, NPY_OBJECT, StringType) - default: - return Status::TypeError("Unknown list item type"); - } - - return Status::TypeError("Unknown list type"); - } - - Status MakeDataType(std::shared_ptr* out); - - arrow::MemoryPool* pool_; - - PyArrayObject* arr_; - PyArrayObject* mask_; - - int64_t length_; - - std::shared_ptr field_indicator_; - std::shared_ptr data_; - std::shared_ptr null_bitmap_; - uint8_t* null_bitmap_data_; -}; - -// Returns null count -static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { - int64_t null_count = 0; - const uint8_t* mask_values = static_cast(PyArray_DATA(mask)); - // TODO(wesm): strided null mask - for (int i = 0; i < length; ++i) { - if (mask_values[i]) { - ++null_count; - } else { - BitUtil::SetBit(bitmap, i); - } - } - return null_count; -} - -template -inline Status ArrowSerializer::MakeDataType(std::shared_ptr* out) { - out->reset(new typename npy_traits::TypeClass()); - return Status::OK(); -} - -template <> -inline Status ArrowSerializer::MakeDataType( - std::shared_ptr* out) { - PyArray_Descr* descr = PyArray_DESCR(arr_); - auto date_dtype = reinterpret_cast(descr->c_metadata); - arrow::TimestampType::Unit unit; - - switch (date_dtype->meta.base) { - case NPY_FR_s: - unit = arrow::TimestampType::Unit::SECOND; - break; - case NPY_FR_ms: - unit = arrow::TimestampType::Unit::MILLI; - break; - case NPY_FR_us: - unit = arrow::TimestampType::Unit::MICRO; - break; - case NPY_FR_ns: - unit = arrow::TimestampType::Unit::NANO; - break; - default: - return Status::Invalid("Unknown NumPy datetime unit"); - } - - out->reset(new arrow::TimestampType(unit)); - return Status::OK(); -} - -template -inline Status ArrowSerializer::Convert(std::shared_ptr* out) { - typedef npy_traits traits; - - if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); } - - int64_t null_count = 0; - if (mask_ != nullptr) { - null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); - } else if (traits::supports_nulls) { - null_count = ValuesToBitmap(PyArray_DATA(arr_), length_, null_bitmap_data_); - } - - RETURN_NOT_OK(ConvertData()); - std::shared_ptr type; - RETURN_NOT_OK(MakeDataType(&type)); - - std::vector fields(1); - fields[0].length = length_; - fields[0].null_count = null_count; - fields[0].offset = 0; - - return arrow::LoadArray(type, fields, {null_bitmap_, data_}, out); -} - -template <> -inline Status ArrowSerializer::Convert(std::shared_ptr* out) { - // Python object arrays are annoying, since we could have one of: - // - // * Strings - // * Booleans with nulls - // * Mixed type (not supported at the moment by arrow format) - // - // Additionally, nulls may be encoded either as np.nan or None. So we have to - // do some type inference and conversion - - RETURN_NOT_OK(InitNullBitmap()); - - // TODO: mask not supported here - const PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - { - PyAcquireGIL lock; - PyDateTime_IMPORT; - } - - if (field_indicator_) { - switch (field_indicator_->type->type) { - case Type::STRING: - return ConvertObjectStrings(out); - case Type::BOOL: - return ConvertBooleans(out); - case Type::DATE: - return ConvertDates(out); - case Type::LIST: { - auto list_field = static_cast(field_indicator_->type.get()); - return ConvertLists(list_field->value_field(), out); + } + output_type = PandasBlock::OBJECT; + } break; + case Type::DICTIONARY: + output_type = PandasBlock::CATEGORICAL; + break; + default: + return Status::NotImplemented(col->type()->ToString()); } - default: - return Status::TypeError("No known conversion to Arrow type"); - } - } else { - for (int64_t i = 0; i < length_; ++i) { - if (PyObject_is_null(objects[i])) { - continue; - } else if (PyObject_is_string(objects[i])) { - return ConvertObjectStrings(out); - } else if (PyBool_Check(objects[i])) { - return ConvertBooleans(out); - } else if (PyDate_CheckExact(objects[i])) { - return ConvertDates(out); + + int block_placement = 0; + std::shared_ptr block; + if (output_type == PandasBlock::CATEGORICAL) { + RETURN_NOT_OK(MakeCategoricalBlock(col->type(), table_->num_rows(), &block)); + categorical_blocks_[i] = block; + } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { + const auto& ts_type = static_cast(*col->type()); + block = std::make_shared(ts_type.timezone, table_->num_rows()); + RETURN_NOT_OK(block->Allocate()); + datetimetz_blocks_[i] = block; } else { - return Status::TypeError("unhandled python type"); + auto it = type_counts_.find(output_type); + if (it != type_counts_.end()) { + block_placement = it->second; + // Increment count + it->second += 1; + } else { + // Add key to map + type_counts_[output_type] = 1; + } } + + column_types_[i] = output_type; + column_block_placement_[i] = block_placement; + } + + // Create normal non-categorical blocks + for (const auto& it : type_counts_) { + PandasBlock::type type = static_cast(it.first); + std::shared_ptr block; + RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block)); + blocks_[type] = block; } + return Status::OK(); } - return Status::TypeError("Unable to infer type of object array, were all null"); -} + Status WriteTableToBlocks(int nthreads) { + auto WriteColumn = [this](int i) { + std::shared_ptr col = this->table_->column(i); + PandasBlock::type output_type = this->column_types_[i]; -template -inline Status ArrowSerializer::ConvertData() { - // TODO(wesm): strided arrays - if (is_strided()) { return Status::Invalid("no support for strided data yet"); } + int rel_placement = this->column_block_placement_[i]; - data_ = std::make_shared(arr_); - return Status::OK(); -} + std::shared_ptr block; + if (output_type == PandasBlock::CATEGORICAL) { + auto it = this->categorical_blocks_.find(i); + if (it == this->blocks_.end()) { + return Status::KeyError("No categorical block allocated"); + } + block = it->second; + } else if (output_type == PandasBlock::DATETIME_WITH_TZ) { + auto it = this->datetimetz_blocks_.find(i); + if (it == this->datetimetz_blocks_.end()) { + return Status::KeyError("No datetimetz block allocated"); + } + block = it->second; + } else { + auto it = this->blocks_.find(output_type); + if (it == this->blocks_.end()) { return Status::KeyError("No block allocated"); } + block = it->second; + } + return block->Write(col, i, rel_placement); + }; -template <> -inline Status ArrowSerializer::ConvertData() { - if (is_strided()) { return Status::Invalid("no support for strided data yet"); } + nthreads = std::min(nthreads, table_->num_columns()); - int nbytes = BitUtil::BytesForBits(length_); - auto buffer = std::make_shared(pool_); - RETURN_NOT_OK(buffer->Resize(nbytes)); + if (nthreads == 1) { + for (int i = 0; i < table_->num_columns(); ++i) { + RETURN_NOT_OK(WriteColumn(i)); + } + } else { + std::vector thread_pool; + thread_pool.reserve(nthreads); + std::atomic task_counter(0); - const uint8_t* values = reinterpret_cast(PyArray_DATA(arr_)); + std::mutex error_mtx; + bool error_occurred = false; + Status error; - uint8_t* bitmap = buffer->mutable_data(); + for (int thread_id = 0; thread_id < nthreads; ++thread_id) { + thread_pool.emplace_back( + [this, &error, &error_occurred, &error_mtx, &task_counter, &WriteColumn]() { + int column_num; + while (!error_occurred) { + column_num = task_counter.fetch_add(1); + if (column_num >= this->table_->num_columns()) { break; } + Status s = WriteColumn(column_num); + if (!s.ok()) { + std::lock_guard lock(error_mtx); + error_occurred = true; + error = s; + break; + } + } + }); + } + for (auto&& thread : thread_pool) { + thread.join(); + } - memset(bitmap, 0, nbytes); - for (int i = 0; i < length_; ++i) { - if (values[i] > 0) { BitUtil::SetBit(bitmap, i); } + if (error_occurred) { return error; } + } + return Status::OK(); } - data_ = buffer; - - return Status::OK(); -} - -template -template -inline Status ArrowSerializer::ConvertTypedLists( - const std::shared_ptr& field, std::shared_ptr* out) { - typedef npy_traits traits; - typedef typename traits::value_type T; - typedef typename traits::BuilderClass BuilderT; - PyAcquireGIL lock; - - auto value_builder = std::make_shared(pool_, field->type); - ListBuilder list_builder(pool_, value_builder); - PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - for (int64_t i = 0; i < length_; ++i) { - if (PyObject_is_null(objects[i])) { - RETURN_NOT_OK(list_builder.AppendNull()); - } else if (PyArray_Check(objects[i])) { - auto numpy_array = reinterpret_cast(objects[i]); - RETURN_NOT_OK(list_builder.Append(true)); - - // TODO(uwe): Support more complex numpy array structures - RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE)); - - int64_t size = PyArray_DIM(numpy_array, 0); - auto data = reinterpret_cast(PyArray_DATA(numpy_array)); - if (traits::supports_nulls) { - null_bitmap_->Resize(size, false); - // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't - // currently support this. - // ValuesToBitmap(data, size, null_bitmap_->mutable_data()); - ValuesToBytemap(data, size, null_bitmap_->mutable_data()); - RETURN_NOT_OK(value_builder->Append(data, size, null_bitmap_->data())); - } else { - RETURN_NOT_OK(value_builder->Append(data, size)); - } - } else if (PyList_Check(objects[i])) { - int64_t size; - std::shared_ptr type; - RETURN_NOT_OK(list_builder.Append(true)); - RETURN_NOT_OK(InferArrowType(objects[i], &size, &type)); - if (type->type != field->type->type) { - std::stringstream ss; - ss << type->ToString() << " cannot be converted to " << field->type->ToString(); - return Status::TypeError(ss.str()); - } - RETURN_NOT_OK(AppendPySequence(objects[i], field->type, value_builder)); - } else { - return Status::TypeError("Unsupported Python type for list items"); + Status AppendBlocks(const BlockMap& blocks, PyObject* list) { + for (const auto& it : blocks) { + PyObject* item; + RETURN_NOT_OK(it.second->GetPyResult(&item)); + if (PyList_Append(list, item) < 0) { RETURN_IF_PYERROR(); } } + return Status::OK(); } - return list_builder.Finish(out); -} -template <> -template <> -inline Status -ArrowSerializer::ConvertTypedLists( - const std::shared_ptr& field, std::shared_ptr* out) { - // TODO: If there are bytes involed, convert to Binary representation - PyAcquireGIL lock; - bool have_bytes = false; + Status GetResultList(PyObject** out) { + PyAcquireGIL lock; - auto value_builder = std::make_shared(pool_); - ListBuilder list_builder(pool_, value_builder); - PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - for (int64_t i = 0; i < length_; ++i) { - if (PyObject_is_null(objects[i])) { - RETURN_NOT_OK(list_builder.AppendNull()); - } else if (PyArray_Check(objects[i])) { - auto numpy_array = reinterpret_cast(objects[i]); - RETURN_NOT_OK(list_builder.Append(true)); + PyObject* result = PyList_New(0); + RETURN_IF_PYERROR(); - // TODO(uwe): Support more complex numpy array structures - RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); + RETURN_NOT_OK(AppendBlocks(blocks_, result)); + RETURN_NOT_OK(AppendBlocks(categorical_blocks_, result)); + RETURN_NOT_OK(AppendBlocks(datetimetz_blocks_, result)); - int64_t size = PyArray_DIM(numpy_array, 0); - auto data = reinterpret_cast(PyArray_DATA(numpy_array)); - RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes)); - } else if (PyList_Check(objects[i])) { - int64_t size; - std::shared_ptr type; - RETURN_NOT_OK(list_builder.Append(true)); - RETURN_NOT_OK(InferArrowType(objects[i], &size, &type)); - if (type->type != Type::STRING) { - std::stringstream ss; - ss << type->ToString() << " cannot be converted to STRING."; - return Status::TypeError(ss.str()); - } - RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder)); - } else { - return Status::TypeError("Unsupported Python type for list items"); - } + *out = result; + return Status::OK(); } - return list_builder.Finish(out); -} -template <> -inline Status ArrowSerializer::ConvertData() { - return Status::TypeError("NYI"); -} - -#define TO_ARROW_CASE(TYPE) \ - case NPY_##TYPE: { \ - ArrowSerializer converter(pool, arr, mask); \ - RETURN_NOT_OK(converter.Convert(out)); \ - } break; + private: + std::shared_ptr
table_; -Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& field, std::shared_ptr* out) { - PyArrayObject* arr = reinterpret_cast(ao); - PyArrayObject* mask = nullptr; + // column num -> block type id + std::vector column_types_; - if (mo != nullptr and mo != Py_None) { mask = reinterpret_cast(mo); } + // column num -> relative placement within internal block + std::vector column_block_placement_; - if (PyArray_NDIM(arr) != 1) { - return Status::Invalid("only handle 1-dimensional arrays"); - } + // block type -> type count + std::unordered_map type_counts_; - int type_num = PyArray_DESCR(arr)->type_num; + // block type -> block + BlockMap blocks_; -#if (NPY_INT64 == NPY_LONGLONG) && (NPY_SIZEOF_LONGLONG == 8) - // Both LONGLONG and INT64 can be observed in the wild, which is buggy. We set - // U/LONGLONG to U/INT64 so things work properly. - if (type_num == NPY_LONGLONG) { type_num = NPY_INT64; } - if (type_num == NPY_ULONGLONG) { type_num = NPY_UINT64; } -#endif + // column number -> categorical block + BlockMap categorical_blocks_; - switch (type_num) { - TO_ARROW_CASE(BOOL); - TO_ARROW_CASE(INT8); - TO_ARROW_CASE(INT16); - TO_ARROW_CASE(INT32); - TO_ARROW_CASE(INT64); -#if (NPY_INT64 != NPY_LONGLONG) - TO_ARROW_CASE(LONGLONG); -#endif - TO_ARROW_CASE(UINT8); - TO_ARROW_CASE(UINT16); - TO_ARROW_CASE(UINT32); - TO_ARROW_CASE(UINT64); -#if (NPY_UINT64 != NPY_ULONGLONG) - TO_ARROW_CASE(ULONGLONG); -#endif - TO_ARROW_CASE(FLOAT32); - TO_ARROW_CASE(FLOAT64); - TO_ARROW_CASE(DATETIME); - case NPY_OBJECT: { - ArrowSerializer converter(pool, arr, mask); - converter.IndicateType(field); - RETURN_NOT_OK(converter.Convert(out)); - } break; - default: - std::stringstream ss; - ss << "Unsupported numpy type " << PyArray_DESCR(arr)->type_num << std::endl; - return Status::NotImplemented(ss.str()); - } - return Status::OK(); -} + // column number -> datetimetz block + BlockMap datetimetz_blocks_; +}; class ArrowDeserializer { public: @@ -1839,7 +1694,7 @@ class ArrowDeserializer { Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr arr) { typedef typename arrow_traits::T T; - auto prim_arr = static_cast(arr.get()); + auto prim_arr = static_cast(arr.get()); auto in_values = reinterpret_cast(prim_arr->data()->data()); // Zero-Copy. We can pass the data pointer directly to NumPy. @@ -1988,19 +1843,19 @@ class ArrowDeserializer { inline typename std::enable_if::type ConvertValues() { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - return ConvertBinaryLike(data_, out_values); + return ConvertBinaryLike(data_, out_values); } template inline typename std::enable_if::type ConvertValues() { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - return ConvertBinaryLike(data_, out_values); + return ConvertBinaryLike(data_, out_values); } #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ - return ConvertListsLike<::arrow::ArrowType>(col_, out_values); + return ConvertListsLike(col_, out_values); template inline typename std::enable_if::type ConvertValues() { @@ -2051,7 +1906,7 @@ class ArrowDeserializer { private: std::shared_ptr col_; - const arrow::ChunkedArray& data_; + const ChunkedArray& data_; PyObject* py_ref_; PyArrayObject* arr_; PyObject* result_; @@ -2071,4 +1926,11 @@ Status ConvertColumnToPandas( return converter.Convert(out); } -} // namespace pyarrow +Status ConvertTableToPandas( + const std::shared_ptr
& table, int nthreads, PyObject** out) { + DataFrameBlockCreator helper(table); + return helper.Convert(nthreads, out); +} + +} // namespace py +} // namespace arrow diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index b548f9321d7..6862339d89b 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -25,28 +25,26 @@ #include -#include "pyarrow/visibility.h" +#include "arrow/util/visibility.h" namespace arrow { class Array; class Column; -class Field; +class DataType; class MemoryPool; class Status; class Table; -} // namespace arrow - -namespace pyarrow { +namespace py { -PYARROW_EXPORT -arrow::Status ConvertArrayToPandas( - const std::shared_ptr& arr, PyObject* py_ref, PyObject** out); +ARROW_EXPORT +Status ConvertArrayToPandas( + const std::shared_ptr& arr, PyObject* py_ref, PyObject** out); -PYARROW_EXPORT -arrow::Status ConvertColumnToPandas( - const std::shared_ptr& col, PyObject* py_ref, PyObject** out); +ARROW_EXPORT +Status ConvertColumnToPandas( + const std::shared_ptr& col, PyObject* py_ref, PyObject** out); struct PandasOptions { bool strings_to_categorical; @@ -58,14 +56,24 @@ struct PandasOptions { // BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. // // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) -PYARROW_EXPORT -arrow::Status ConvertTableToPandas( - const std::shared_ptr& table, int nthreads, PyObject** out); +ARROW_EXPORT +Status ConvertTableToPandas( + const std::shared_ptr
& table, int nthreads, PyObject** out); + +ARROW_EXPORT +Status PandasDtypeToArrow(PyObject* dtype, std::shared_ptr* out); -PYARROW_EXPORT -arrow::Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& field, std::shared_ptr* out); +ARROW_EXPORT +Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr& type, std::shared_ptr* out); -} // namespace pyarrow +/// Convert dtype=object arrays. If target data type is not known, pass a type +/// with nullptr +ARROW_EXPORT +Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr& type, std::shared_ptr* out); + +} // namespace py +} // namespace arrow #endif // PYARROW_ADAPTERS_PANDAS_H diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc index d2f5291ea83..c898f634aed 100644 --- a/python/src/pyarrow/common.cc +++ b/python/src/pyarrow/common.cc @@ -24,24 +24,23 @@ #include "arrow/memory_pool.h" #include "arrow/status.h" -using arrow::Status; - -namespace pyarrow { +namespace arrow { +namespace py { static std::mutex memory_pool_mutex; -static arrow::MemoryPool* default_pyarrow_pool = nullptr; +static MemoryPool* default_pyarrow_pool = nullptr; -void set_default_memory_pool(arrow::MemoryPool* pool) { +void set_default_memory_pool(MemoryPool* pool) { std::lock_guard guard(memory_pool_mutex); default_pyarrow_pool = pool; } -arrow::MemoryPool* get_memory_pool() { +MemoryPool* get_memory_pool() { std::lock_guard guard(memory_pool_mutex); if (default_pyarrow_pool) { return default_pyarrow_pool; } else { - return arrow::default_memory_pool(); + return default_memory_pool(); } } @@ -60,4 +59,5 @@ PyBytesBuffer::~PyBytesBuffer() { Py_DECREF(obj_); } -} // namespace pyarrow +} // namespace py +} // namespace arrow diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index ad65ec75eec..0b4c6bebcfe 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -19,16 +19,16 @@ #define PYARROW_COMMON_H #include "pyarrow/config.h" -#include "pyarrow/visibility.h" #include "arrow/buffer.h" #include "arrow/util/macros.h" +#include "arrow/util/visibility.h" namespace arrow { + class MemoryPool; -} -namespace pyarrow { +namespace py { class PyAcquireGIL { public: @@ -98,10 +98,10 @@ struct PyObjectStringify { } // Return the common PyArrow memory pool -PYARROW_EXPORT void set_default_memory_pool(arrow::MemoryPool* pool); -PYARROW_EXPORT arrow::MemoryPool* get_memory_pool(); +ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool); +ARROW_EXPORT MemoryPool* get_memory_pool(); -class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { +class ARROW_EXPORT NumPyBuffer : public Buffer { public: NumPyBuffer(PyArrayObject* arr) : Buffer(nullptr, 0) { arr_ = arr; @@ -118,7 +118,7 @@ class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { PyArrayObject* arr_; }; -class PYARROW_EXPORT PyBytesBuffer : public arrow::Buffer { +class ARROW_EXPORT PyBytesBuffer : public Buffer { public: PyBytesBuffer(PyObject* obj); ~PyBytesBuffer(); @@ -127,6 +127,7 @@ class PYARROW_EXPORT PyBytesBuffer : public arrow::Buffer { PyObject* obj_; }; -} // namespace pyarrow +} // namespace py +} // namespace arrow #endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/config.cc b/python/src/pyarrow/config.cc index e1002bf4fd1..0be6d962b55 100644 --- a/python/src/pyarrow/config.cc +++ b/python/src/pyarrow/config.cc @@ -19,7 +19,8 @@ #include "pyarrow/config.h" -namespace pyarrow { +namespace arrow { +namespace py { void pyarrow_init() {} @@ -30,4 +31,5 @@ void pyarrow_set_numpy_nan(PyObject* obj) { numpy_nan = obj; } -} // namespace pyarrow +} // namespace py +} // namespace arrow diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h index 386ee4b1e25..87fc5c2b290 100644 --- a/python/src/pyarrow/config.h +++ b/python/src/pyarrow/config.h @@ -20,24 +20,27 @@ #include +#include "arrow/util/visibility.h" + #include "pyarrow/numpy_interop.h" -#include "pyarrow/visibility.h" #if PY_MAJOR_VERSION >= 3 #define PyString_Check PyUnicode_Check #endif -namespace pyarrow { +namespace arrow { +namespace py { -PYARROW_EXPORT +ARROW_EXPORT extern PyObject* numpy_nan; -PYARROW_EXPORT +ARROW_EXPORT void pyarrow_init(); -PYARROW_EXPORT +ARROW_EXPORT void pyarrow_set_numpy_nan(PyObject* obj); -} // namespace pyarrow +} // namespace py +} // namespace arrow #endif // PYARROW_CONFIG_H diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index 78fad165ac8..edebea6d97c 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -19,9 +19,8 @@ #include -using namespace arrow; - -namespace pyarrow { +namespace arrow { +namespace py { #define GET_PRIMITIVE_TYPE(NAME, FACTORY) \ case Type::NAME: \ @@ -51,4 +50,5 @@ std::shared_ptr GetPrimitiveType(Type::type type) { } } -} // namespace pyarrow +} // namespace py +} // namespace arrow diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index 788c3eedddf..611e814b7d8 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -18,19 +18,18 @@ #ifndef PYARROW_HELPERS_H #define PYARROW_HELPERS_H -#include #include -#include "pyarrow/visibility.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" -namespace pyarrow { +namespace arrow { +namespace py { -using arrow::DataType; -using arrow::Type; - -PYARROW_EXPORT +ARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); -} // namespace pyarrow +} // namespace py +} // namespace arrow #endif // PYARROW_HELPERS_H diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index aa4cb7b052c..0aa61dc811f 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -26,9 +26,8 @@ #include "pyarrow/common.h" -using arrow::Status; - -namespace pyarrow { +namespace arrow { +namespace py { // ---------------------------------------------------------------------- // Python file @@ -151,7 +150,7 @@ Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { return Status::OK(); } -Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { +Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { PyAcquireGIL lock; PyObject* bytes_obj; @@ -214,8 +213,9 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { // A readable file that is backed by a PyBytes PyBytesReader::PyBytesReader(PyObject* obj) - : arrow::io::BufferReader(std::make_shared(obj)) {} + : io::BufferReader(std::make_shared(obj)) {} PyBytesReader::~PyBytesReader() {} -} // namespace pyarrow +} // namespace py +} // namespace arrow diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h index 4cb010f2d4e..a603e816225 100644 --- a/python/src/pyarrow/io.h +++ b/python/src/pyarrow/io.h @@ -20,17 +20,17 @@ #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" +#include "arrow/util/visibility.h" #include "pyarrow/config.h" #include "pyarrow/common.h" -#include "pyarrow/visibility.h" namespace arrow { + class MemoryPool; -} -namespace pyarrow { +namespace py { // A common interface to a Python file-like object. Must acquire GIL before // calling any methods @@ -39,31 +39,31 @@ class PythonFile { PythonFile(PyObject* file); ~PythonFile(); - arrow::Status Close(); - arrow::Status Seek(int64_t position, int whence); - arrow::Status Read(int64_t nbytes, PyObject** out); - arrow::Status Tell(int64_t* position); - arrow::Status Write(const uint8_t* data, int64_t nbytes); + Status Close(); + Status Seek(int64_t position, int whence); + Status Read(int64_t nbytes, PyObject** out); + Status Tell(int64_t* position); + Status Write(const uint8_t* data, int64_t nbytes); private: PyObject* file_; }; -class PYARROW_EXPORT PyReadableFile : public arrow::io::ReadableFileInterface { +class ARROW_EXPORT PyReadableFile : public io::ReadableFileInterface { public: explicit PyReadableFile(PyObject* file); virtual ~PyReadableFile(); - arrow::Status Close() override; + Status Close() override; - arrow::Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; - arrow::Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; + Status Read(int64_t nbytes, std::shared_ptr* out) override; - arrow::Status GetSize(int64_t* size) override; + Status GetSize(int64_t* size) override; - arrow::Status Seek(int64_t position) override; + Status Seek(int64_t position) override; - arrow::Status Tell(int64_t* position) override; + Status Tell(int64_t* position) override; bool supports_zero_copy() const override; @@ -71,21 +71,21 @@ class PYARROW_EXPORT PyReadableFile : public arrow::io::ReadableFileInterface { std::unique_ptr file_; }; -class PYARROW_EXPORT PyOutputStream : public arrow::io::OutputStream { +class ARROW_EXPORT PyOutputStream : public io::OutputStream { public: explicit PyOutputStream(PyObject* file); virtual ~PyOutputStream(); - arrow::Status Close() override; - arrow::Status Tell(int64_t* position) override; - arrow::Status Write(const uint8_t* data, int64_t nbytes) override; + Status Close() override; + Status Tell(int64_t* position) override; + Status Write(const uint8_t* data, int64_t nbytes) override; private: std::unique_ptr file_; }; // A zero-copy reader backed by a PyBytes object -class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader { +class ARROW_EXPORT PyBytesReader : public io::BufferReader { public: explicit PyBytesReader(PyObject* obj); virtual ~PyBytesReader(); @@ -93,6 +93,7 @@ class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader { // TODO(wesm): seekable output files -} // namespace pyarrow +} // namespace py +} // namespace arrow #endif // PYARROW_IO_H diff --git a/python/src/pyarrow/numpy_interop.h b/python/src/pyarrow/numpy_interop.h index 6326527a674..57f3328e870 100644 --- a/python/src/pyarrow/numpy_interop.h +++ b/python/src/pyarrow/numpy_interop.h @@ -42,7 +42,8 @@ #include #include -namespace pyarrow { +namespace arrow { +namespace py { inline int import_numpy() { #ifdef NUMPY_IMPORT_ARRAY @@ -53,6 +54,7 @@ inline int import_numpy() { return 0; } -} // namespace pyarrow +} // namespace py +} // namespace arrow #endif // PYARROW_NUMPY_INTEROP_H diff --git a/python/src/pyarrow/type_traits.h b/python/src/pyarrow/type_traits.h new file mode 100644 index 00000000000..f4604d7a989 --- /dev/null +++ b/python/src/pyarrow/type_traits.h @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "pyarrow/numpy_interop.h" + +#include "arrow/builder.h" +#include "arrow/type.h" + +namespace arrow { +namespace py { + +template +struct npy_traits {}; + +template <> +struct npy_traits { + typedef uint8_t value_type; + using TypeClass = BooleanType; + using BuilderClass = BooleanBuilder; + + static constexpr bool supports_nulls = false; + static inline bool isnull(uint8_t v) { return false; } +}; + +#define NPY_INT_DECL(TYPE, CapType, T) \ + template <> \ + struct npy_traits { \ + typedef T value_type; \ + using TypeClass = CapType##Type; \ + using BuilderClass = CapType##Builder; \ + \ + static constexpr bool supports_nulls = false; \ + static inline bool isnull(T v) { return false; } \ + }; + +NPY_INT_DECL(INT8, Int8, int8_t); +NPY_INT_DECL(INT16, Int16, int16_t); +NPY_INT_DECL(INT32, Int32, int32_t); +NPY_INT_DECL(INT64, Int64, int64_t); + +NPY_INT_DECL(UINT8, UInt8, uint8_t); +NPY_INT_DECL(UINT16, UInt16, uint16_t); +NPY_INT_DECL(UINT32, UInt32, uint32_t); +NPY_INT_DECL(UINT64, UInt64, uint64_t); + +#if NPY_INT64 != NPY_LONGLONG +NPY_INT_DECL(LONGLONG, Int64, int64_t); +NPY_INT_DECL(ULONGLONG, UInt64, uint64_t); +#endif + +template <> +struct npy_traits { + typedef float value_type; + using TypeClass = FloatType; + using BuilderClass = FloatBuilder; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(float v) { return v != v; } +}; + +template <> +struct npy_traits { + typedef double value_type; + using TypeClass = DoubleType; + using BuilderClass = DoubleBuilder; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(double v) { return v != v; } +}; + +template <> +struct npy_traits { + typedef int64_t value_type; + using TypeClass = TimestampType; + using BuilderClass = TimestampBuilder; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(int64_t v) { + // NaT = -2**63 + // = -0x8000000000000000 + // = -9223372036854775808; + // = std::numeric_limits::min() + return v == std::numeric_limits::min(); + } +}; + +template <> +struct npy_traits { + typedef PyObject* value_type; + static constexpr bool supports_nulls = true; +}; + +template +struct arrow_traits {}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_BOOL; + static constexpr bool supports_nulls = false; + static constexpr bool is_boolean = true; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = false; +}; + +#define INT_DECL(TYPE) \ + template <> \ + struct arrow_traits { \ + static constexpr int npy_type = NPY_##TYPE; \ + static constexpr bool supports_nulls = false; \ + static constexpr double na_value = NAN; \ + static constexpr bool is_boolean = false; \ + static constexpr bool is_numeric_not_nullable = true; \ + static constexpr bool is_numeric_nullable = false; \ + typedef typename npy_traits::value_type T; \ + }; + +INT_DECL(INT8); +INT_DECL(INT16); +INT_DECL(INT32); +INT_DECL(INT64); +INT_DECL(UINT8); +INT_DECL(UINT16); +INT_DECL(UINT32); +INT_DECL(UINT64); + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT32; + static constexpr bool supports_nulls = true; + static constexpr float na_value = NAN; + static constexpr bool is_boolean = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT64; + static constexpr bool supports_nulls = true; + static constexpr double na_value = NAN; + static constexpr bool is_boolean = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; + typedef typename npy_traits::value_type T; +}; + +static constexpr int64_t kPandasTimestampNull = std::numeric_limits::min(); + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_DATETIME; + static constexpr bool supports_nulls = true; + static constexpr int64_t na_value = kPandasTimestampNull; + static constexpr bool is_boolean = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_DATETIME; + static constexpr bool supports_nulls = true; + static constexpr int64_t na_value = kPandasTimestampNull; + static constexpr bool is_boolean = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; + static constexpr bool is_boolean = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = false; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; + static constexpr bool is_boolean = false; + static constexpr bool is_numeric_not_nullable = false; + static constexpr bool is_numeric_nullable = false; +}; + +} // namespace py +} // namespace arrow diff --git a/python/src/pyarrow/util/datetime.h b/python/src/pyarrow/util/datetime.h index 9ffa6910524..f704a96d91b 100644 --- a/python/src/pyarrow/util/datetime.h +++ b/python/src/pyarrow/util/datetime.h @@ -21,7 +21,8 @@ #include #include -namespace pyarrow { +namespace arrow { +namespace py { inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { struct tm date = {0}; @@ -35,6 +36,7 @@ inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { return lrint(difftime(mktime(&date), mktime(&epoch)) * 1000); } -} // namespace pyarrow +} // namespace py +} // namespace arrow #endif // PYARROW_UTIL_DATETIME_H diff --git a/python/src/pyarrow/util/test_main.cc b/python/src/pyarrow/util/test_main.cc index 02e9a54f659..d8d1d030f8f 100644 --- a/python/src/pyarrow/util/test_main.cc +++ b/python/src/pyarrow/util/test_main.cc @@ -26,7 +26,7 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); Py_Initialize(); - pyarrow::import_numpy(); + arrow::py::import_numpy(); int ret = RUN_ALL_TESTS(); diff --git a/python/src/pyarrow/visibility.h b/python/src/pyarrow/visibility.h deleted file mode 100644 index 9f0c13b4b20..00000000000 --- a/python/src/pyarrow/visibility.h +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PYARROW_VISIBILITY_H -#define PYARROW_VISIBILITY_H - -#if defined(_WIN32) || defined(__CYGWIN__) -#define PYARROW_EXPORT __declspec(dllexport) -#else // Not Windows -#ifndef PYARROW_EXPORT -#define PYARROW_EXPORT __attribute__((visibility("default"))) -#endif -#ifndef PYARROW_NO_EXPORT -#define PYARROW_NO_EXPORT __attribute__((visibility("hidden"))) -#endif -#endif // Non-Windows - -#endif // PYARROW_VISIBILITY_H