From 7b530e4b20437a82daf3c66a7d168e0be7c3bc48 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Wed, 27 Sep 2017 23:34:20 -0400
Subject: [PATCH 1/7] Consolidate both sequence and ndarray/Series/Index
 conversion in pyarrow.Array

Change-Id: I97e785c7fd34540f2c6ba05cfaaef5b1fbf830f4
---
 python/pyarrow/array.pxi                    | 269 ++++++++++----------
 python/pyarrow/pandas_compat.py             |  22 +-
 python/pyarrow/table.pxi                    |  13 +-
 python/pyarrow/tests/test_array.py          |  16 +-
 python/pyarrow/tests/test_convert_pandas.py |  83 +++---
 python/pyarrow/tests/test_parquet.py        |  22 +-
 6 files changed, 196 insertions(+), 229 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index eec6180165c..1c0e26e5179 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -16,17 +16,107 @@
 # under the License.

-def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
-          size=None):
+cdef _sequence_to_array(object sequence, object size, DataType type,
+                        MemoryPool memory_pool):
+    cdef shared_ptr[CArray] out
+    cdef int64_t c_size
+    cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+    if type is None:
+        with nogil:
+            check_status(ConvertPySequence(sequence, pool, &out))
+    else:
+        if size is None:
+            with nogil:
+                check_status(
+                    ConvertPySequence(
+                        sequence, pool, &out, type.sp_type
+                    )
+                )
+        else:
+            c_size = size
+            with nogil:
+                check_status(
+                    ConvertPySequence(
+                        sequence, pool, &out, type.sp_type, c_size
+                    )
+                )
+
+    return pyarrow_wrap_array(out)
+
+
+cdef _is_array_like(obj):
+    try:
+        import pandas as pd
+        return isinstance(obj, (np.ndarray, pd.Series, pd.Index, Categorical))
+    except ImportError:
+        return isinstance(obj, np.ndarray)
+
+
+cdef _arraylike_to_array(object obj, object mask, DataType type,
+                         MemoryPool memory_pool):
+    cdef:
+        shared_ptr[CArray] out
+        shared_ptr[CChunkedArray] chunked_out
+        shared_ptr[CDataType] c_type
+
+    if mask is not None:
+        mask = get_series_values(mask)
+
+    values = get_series_values(obj)
+
+    cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+
+    if isinstance(values, Categorical):
+        return DictionaryArray.from_arrays(
+            values.codes, values.categories.values,
+            mask=mask, ordered=values.ordered,
+            memory_pool=memory_pool)
+    elif values.dtype == object:
+        # Object dtype undergoes a different conversion path as more type
+        # inference may be needed
+        if type is not None:
+            c_type = type.sp_type
+        with nogil:
+            check_status(PandasObjectsToArrow(
+                pool, values, mask, c_type, &chunked_out))
+
+        if chunked_out.get().num_chunks() > 1:
+            return pyarrow_wrap_chunked_array(chunked_out)
+        else:
+            out = chunked_out.get().chunk(0)
+    else:
+        values, type = pdcompat.maybe_coerce_datetime64(
+            values, obj.dtype, type)
+
+        if type is None:
+            dtype = values.dtype
+            with nogil:
+                check_status(NumPyDtypeToArrow(dtype, &c_type))
+        else:
+            c_type = type.sp_type
+
+        with nogil:
+            check_status(PandasToArrow(
+                pool, values, mask, c_type, &out))
+
+    return pyarrow_wrap_array(out)
+
+
+def array(object sequence, DataType type=None, mask=None,
+          MemoryPool memory_pool=None, size=None):
     """
     Create pyarrow.Array instance from a Python sequence

     Parameters
     ----------
-    sequence : sequence-like or iterable object of Python objects.
-        If both type and size are specified may be a single use iterable.
-    type : pyarrow.DataType, optional
-        If not passed, will be inferred from the data
+    sequence : sequence, iterable, ndarray or Series
+        If both type and size are specified, may be a single-use iterable.
If + not strongly-typed, Arrow type will be inferred for resulting array + mask : array (boolean), optional + Indicate which values are null (True) or not null (False) + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool @@ -37,37 +127,43 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None, of size followed by a resize to the actual size (so if you know the exact size specifying it correctly will give you better performance). + Notes + ----- + Localized timestamps will currently be returned as UTC (pandas's native + representation). Timezone-naive data will be implicitly interpreted as + UTC. + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.array(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), np.array([0, 1], + ... dtype=bool)) + + [ + 1, + NA + ] + Returns ------- - array : pyarrow.Array + array : pyarrow.Array or pyarrow.ChunkedArray (if object data + overflowed binary storage) """ - cdef: - shared_ptr[CArray] sp_array - CMemoryPool* pool - int64_t c_size - - pool = maybe_unbox_memory_pool(memory_pool) - if type is None: - with nogil: - check_status(ConvertPySequence(sequence, pool, &sp_array)) + if _is_array_like(sequence): + return _arraylike_to_array(sequence, mask, type, memory_pool) else: - if size is None: - with nogil: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type - ) - ) - else: - c_size = size - with nogil: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type, c_size - ) - ) - - return pyarrow_wrap_array(sp_array) + if mask is not None: + raise ValueError("Masks only supported with ndarray-like inputs") + return _sequence_to_array(sequence, size, type, memory_pool) def _normalize_slice(object arrow_obj, slice key): @@ -141,110 +237,11 @@ cdef class Array: return pyarrow_wrap_array(result) @staticmethod - def from_pandas(obj, mask=None, DataType type=None, - timestamps_to_ms=False, - MemoryPool memory_pool=None): - """ - Convert pandas.Series to an Arrow Array. - - Parameters - ---------- - series : pandas.Series or numpy.ndarray - - mask : pandas.Series or numpy.ndarray, optional - boolean mask if the object is null (True) or valid (False) - - type : pyarrow.DataType - Explicit type to attempt to coerce to - - timestamps_to_ms : bool, optional - Convert datetime columns to ms resolution. This is needed for - compatibility with other functionality like Parquet I/O which - only supports milliseconds. - - .. deprecated:: 0.7.0 - - memory_pool: MemoryPool, optional - Specific memory pool to use to allocate the resulting Arrow array. - - Notes - ----- - Localized timestamps will currently be returned as UTC (pandas's native - representation). Timezone-naive data will be implicitly interpreted as - UTC. - - Examples - -------- - - >>> import pandas as pd - >>> import pyarrow as pa - >>> pa.Array.from_pandas(pd.Series([1, 2])) - - [ - 1, - 2 - ] - - >>> import numpy as np - >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1], - ... dtype=bool)) - - [ - 1, - NA - ] - - Returns - ------- - array : pyarrow.Array or pyarrow.ChunkedArray (if object data - overflowed binary storage) + def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None): + """Convert pandas.Series to an Arrow Array. 
See pyarrow.array for more + information on usage. """ - cdef: - shared_ptr[CArray] out - shared_ptr[CChunkedArray] chunked_out - shared_ptr[CDataType] c_type - CMemoryPool* pool - - if mask is not None: - mask = get_series_values(mask) - - values = get_series_values(obj) - pool = maybe_unbox_memory_pool(memory_pool) - - if isinstance(values, Categorical): - return DictionaryArray.from_arrays( - values.codes, values.categories.values, - mask=mask, ordered=values.ordered, - memory_pool=memory_pool) - elif values.dtype == object: - # Object dtype undergoes a different conversion path as more type - # inference may be needed - if type is not None: - c_type = type.sp_type - with nogil: - check_status(PandasObjectsToArrow( - pool, values, mask, c_type, &chunked_out)) - - if chunked_out.get().num_chunks() > 1: - return pyarrow_wrap_chunked_array(chunked_out) - else: - out = chunked_out.get().chunk(0) - else: - values, type = pdcompat.maybe_coerce_datetime64( - values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) - - if type is None: - dtype = values.dtype - with nogil: - check_status(NumPyDtypeToArrow(dtype, &c_type)) - else: - c_type = type.sp_type - - with nogil: - check_status(PandasToArrow( - pool, values, mask, c_type, &out)) - - return pyarrow_wrap_array(out) + return array(obj, mask=mask, type=type, memory_pool=memory_pool) property null_count: diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index d1e6f5a8096..2ae55b09231 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -203,7 +203,7 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types): } -def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): +def dataframe_to_arrays(df, schema, preserve_index): names = [] arrays = [] index_columns = [] @@ -223,15 +223,13 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): field = schema.field_by_name(name) type = getattr(field, "type", None) - array = pa.Array.from_pandas( - col, type=type, timestamps_to_ms=timestamps_to_ms - ) + array = pa.array(col, type=type) arrays.append(array) names.append(name) types.append(array.type) for i, column in enumerate(index_columns): - array = pa.Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms) + array = pa.array(column) arrays.append(array) names.append(index_level_name(column, i)) types.append(array.type) @@ -242,25 +240,15 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): return names, arrays, metadata -def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False): - if timestamps_to_ms: - import warnings - warnings.warn('timestamps_to_ms=True is deprecated', FutureWarning) - +def maybe_coerce_datetime64(values, dtype, type_): from pyarrow.compat import DatetimeTZDtype if values.dtype.type != np.datetime64: return values, type_ - coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]' - - if coerce_ms: - values = values.astype('datetime64[ms]') - type_ = pa.timestamp('ms') - if isinstance(dtype, DatetimeTZDtype): tz = dtype.tz - unit = 'ms' if coerce_ms else dtype.unit + unit = dtype.unit type_ = pa.timestamp(unit, tz) elif type_ is None: # Trust the NumPy dtype diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 028797e45b8..e5422a5beca 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -575,7 +575,7 @@ cdef class RecordBatch: pyarrow.RecordBatch """ names, arrays, metadata = pdcompat.dataframe_to_arrays( - df, False, schema, 
preserve_index + df, schema, preserve_index ) return cls.from_arrays(arrays, names, metadata) @@ -714,21 +714,13 @@ cdef class Table: return result @classmethod - def from_pandas(cls, df, bint timestamps_to_ms=False, - Schema schema=None, bint preserve_index=True): + def from_pandas(cls, df, Schema schema=None, bint preserve_index=True): """ Convert pandas.DataFrame to an Arrow Table Parameters ---------- df : pandas.DataFrame - timestamps_to_ms : bool - Convert datetime columns to ms resolution. This is needed for - compability with other functionality like Parquet I/O which - only supports milliseconds. - - .. deprecated:: 0.7.0 - schema : pyarrow.Schema, optional The expected schema of the Arrow Table. This can be used to indicate the type of columns if we cannot infer it automatically. @@ -754,7 +746,6 @@ cdef class Table: """ names, arrays, metadata = pdcompat.dataframe_to_arrays( df, - timestamps_to_ms=timestamps_to_ms, schema=schema, preserve_index=preserve_index ) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f316417caaf..c52ddb2cd98 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -149,6 +149,10 @@ def test_array_factory_invalid_type(): pa.array(arr) +def test_array_ref_to_ndarray_base(): + pass + + def test_dictionary_from_numpy(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) @@ -170,8 +174,8 @@ def test_dictionary_from_boxed_arrays(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) - iarr = pa.Array.from_pandas(indices) - darr = pa.Array.from_pandas(dictionary) + iarr = pa.array(indices) + darr = pa.array(dictionary) d1 = pa.DictionaryArray.from_arrays(iarr, darr) @@ -201,7 +205,7 @@ def test_dictionary_with_pandas(): def test_list_from_arrays(): offsets_arr = np.array([0, 2, 5, 8], dtype='i4') - offsets = pa.Array.from_pandas(offsets_arr, type=pa.int32()) + offsets = pa.array(offsets_arr, type=pa.int32()) pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] values = pa.array(pyvalues, type=pa.binary()) @@ -214,10 +218,10 @@ def test_list_from_arrays(): def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case - in_arr = pa.Array.from_pandas(in_data, type=in_type) + in_arr = pa.array(in_data, type=in_type) casted = in_arr.cast(out_type, safe=safe) - expected = pa.Array.from_pandas(out_data, type=out_type) + expected = pa.array(out_data, type=out_type) assert casted.equals(expected) @@ -243,7 +247,7 @@ def test_cast_integers_safe(): (np.array([50000], dtype='u2'), pa.uint16(), pa.int16()) ] for in_data, in_type, out_type in unsafe_cases: - in_arr = pa.Array.from_pandas(in_data, type=in_type) + in_arr = pa.array(in_data, type=in_type) with pytest.raises(pa.ArrowInvalid): in_arr.cast(out_type) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 5d56cde7d48..b4b3ba42631 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -18,7 +18,7 @@ from collections import OrderedDict -from datetime import datetime, date, time +from datetime import date, time import unittest import decimal import json @@ -82,7 +82,7 @@ def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, tm.assert_frame_equal(result, expected, check_dtype=check_dtype) def _check_series_roundtrip(self, s, type_=None): - arr = pa.Array.from_pandas(s, type=type_) + arr = pa.array(s, type=type_) 
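Since dataframe_to_arrays and pa.array no longer take timestamps_to_ms, callers that still need millisecond resolution are expected to cast explicitly after converting. A minimal sketch of that replacement workflow, assuming nanosecond-resolution input that must be truncated (Array.cast with safe= is the same API exercised by _check_cast_case above; the sample timestamp is illustrative):

    import pandas as pd
    import pyarrow as pa

    s = pd.Series([pd.Timestamp('2017-01-01 12:34:56.123456')])
    arr = pa.array(s)  # timestamp type inferred from the datetime64 dtype
    # safe=False permits the lossy truncation that timestamps_to_ms=True
    # used to perform implicitly
    ms = arr.cast(pa.timestamp('ms'), safe=False)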
result = pd.Series(arr.to_pandas(), name=s.name) if isinstance(arr.type, pa.TimestampType) and arr.type.tz is not None: @@ -93,7 +93,7 @@ def _check_series_roundtrip(self, s, type_=None): def _check_array_roundtrip(self, values, expected=None, mask=None, type=None): - arr = pa.Array.from_pandas(values, mask=mask, type=type) + arr = pa.array(values, mask=mask, type=type) result = arr.to_pandas() values_nulls = pd.isnull(values) @@ -152,7 +152,7 @@ def test_float_nulls(self): for name, arrow_dtype in dtypes: values = np.random.randn(num_values).astype(name) - arr = pa.Array.from_pandas(values, null_mask) + arr = pa.array(values, mask=null_mask) arrays.append(arr) fields.append(pa.field(name, arrow_dtype)) values[null_mask] = np.nan @@ -223,7 +223,7 @@ def test_integer_with_nulls(self): for name in int_dtypes: values = np.random.randint(0, 100, size=num_values) - arr = pa.Array.from_pandas(values, null_mask) + arr = pa.array(values, mask=null_mask) arrays.append(arr) expected = values.astype('f8') @@ -244,8 +244,8 @@ def test_array_from_pandas_type_cast(self): target_type = pa.int8() - result = pa.Array.from_pandas(arr, type=target_type) - expected = pa.Array.from_pandas(arr.astype('int8')) + result = pa.array(arr, type=target_type) + expected = pa.array(arr.astype('int8')) assert result.equals(expected) def test_boolean_no_nulls(self): @@ -266,7 +266,7 @@ def test_boolean_nulls(self): mask = np.random.randint(0, 10, size=num_values) < 3 values = np.random.randint(0, 10, size=num_values) < 5 - arr = pa.Array.from_pandas(values, mask) + arr = pa.array(values, mask=mask) expected = values.astype(object) expected[mask] = None @@ -292,7 +292,7 @@ def test_all_nulls_cast_numeric(self): arr = np.array([None], dtype=object) def _check_type(t): - a2 = pa.Array.from_pandas(arr, type=t) + a2 = pa.array(arr, type=t) assert a2.type == t assert a2[0].as_py() is None @@ -325,7 +325,7 @@ def test_bytes_exceed_2gb(self): df = pd.DataFrame({ 'strings': np.array([val] * 4000, dtype=object) }) - arr = pa.Array.from_pandas(df['strings']) + arr = pa.array(df['strings']) assert isinstance(arr, pa.ChunkedArray) assert arr.num_chunks == 2 arr = None @@ -365,19 +365,6 @@ def test_timestamps_notimezone_no_nulls(self): expected_schema=schema, ) - def test_timestamps_to_ms_explicit_schema(self): - # ARROW-1328 - df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]}) - pa_type = pa.from_numpy_dtype(df['datetime'].dtype) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - arr = pa.Array.from_pandas(df['datetime'], type=pa_type, - timestamps_to_ms=True) - - tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'), - arr.to_pandas()) - def test_timestamps_notimezone_nulls(self): df = pd.DataFrame({ 'datetime64': np.array([ @@ -450,11 +437,11 @@ def test_date_objects_typed(self): t32 = pa.date32() t64 = pa.date64() - a32 = pa.Array.from_pandas(arr, type=t32) - a64 = pa.Array.from_pandas(arr, type=t64) + a32 = pa.array(arr, type=t32) + a64 = pa.array(arr, type=t64) - a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32) - a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64) + a32_expected = pa.array(arr_i4, mask=mask, type=t32) + a64_expected = pa.array(arr_i8, mask=mask, type=t64) assert a32.equals(a32_expected) assert a64.equals(a64_expected) @@ -481,8 +468,8 @@ def test_dates_from_integers(self): arr = np.array([17259, 17260, 17261], dtype='int32') arr2 = arr.astype('int64') * 86400000 - a1 = pa.Array.from_pandas(arr, type=t1) - a2 = pa.Array.from_pandas(arr2, 
type=t2) + a1 = pa.array(arr, type=t1) + a2 = pa.array(arr2, type=t2) expected = date(2017, 4, 3) assert a1[0].as_py() == expected @@ -520,7 +507,7 @@ def test_column_of_arrays_to_py(self): np.arange(1, dtype=dtype) ]) type_ = pa.list_(pa.int8()) - parr = pa.Array.from_pandas(arr, type=type_) + parr = pa.array(arr, type=type_) assert parr[0].as_py() == list(range(10)) assert parr[1].as_py() == list(range(5)) @@ -592,7 +579,7 @@ def test_column_of_lists_strided(self): def test_nested_lists_all_none(self): data = np.array([[None, None], None], dtype=object) - arr = pa.Array.from_pandas(data) + arr = pa.array(data) expected = pa.array(list(data)) assert arr.equals(expected) assert arr.type == pa.list_(pa.null()) @@ -600,7 +587,7 @@ def test_nested_lists_all_none(self): data2 = np.array([None, None, [None, None], np.array([None, None], dtype=object)], dtype=object) - arr = pa.Array.from_pandas(data2) + arr = pa.array(data2) expected = pa.array([None, None, [None, None], [None, None]]) assert arr.equals(expected) @@ -760,7 +747,7 @@ def test_pytime_from_pandas(self): t1 = pa.time64('us') aobjs = np.array(pytimes + [None], dtype=object) - parr = pa.Array.from_pandas(aobjs) + parr = pa.array(aobjs) assert parr.type == t1 assert parr[0].as_py() == pytimes[0] assert parr[1].as_py() == pytimes[1] @@ -775,18 +762,18 @@ def test_pytime_from_pandas(self): arr = np.array([_pytime_to_micros(v) for v in pytimes], dtype='int64') - a1 = pa.Array.from_pandas(arr, type=pa.time64('us')) + a1 = pa.array(arr, type=pa.time64('us')) assert a1[0].as_py() == pytimes[0] - a2 = pa.Array.from_pandas(arr * 1000, type=pa.time64('ns')) + a2 = pa.array(arr * 1000, type=pa.time64('ns')) assert a2[0].as_py() == pytimes[0] - a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), - type=pa.time32('ms')) + a3 = pa.array((arr / 1000).astype('i4'), + type=pa.time32('ms')) assert a3[0].as_py() == pytimes[0].replace(microsecond=1000) - a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), - type=pa.time32('s')) + a4 = pa.array((arr / 1000000).astype('i4'), + type=pa.time32('s')) assert a4[0].as_py() == pytimes[0].replace(microsecond=0) def test_arrow_time_to_pandas(self): @@ -809,14 +796,14 @@ def test_arrow_time_to_pandas(self): null_mask = np.array([False, False, True], dtype=bool) - a1 = pa.Array.from_pandas(arr, mask=null_mask, type=pa.time64('us')) - a2 = pa.Array.from_pandas(arr * 1000, mask=null_mask, - type=pa.time64('ns')) + a1 = pa.array(arr, mask=null_mask, type=pa.time64('us')) + a2 = pa.array(arr * 1000, mask=null_mask, + type=pa.time64('ns')) - a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), mask=null_mask, - type=pa.time32('ms')) - a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), mask=null_mask, - type=pa.time32('s')) + a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask, + type=pa.time32('ms')) + a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask, + type=pa.time32('s')) names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]'] batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names) @@ -842,7 +829,7 @@ def test_arrow_time_to_pandas(self): tm.assert_frame_equal(df, expected_df) def _check_numpy_array_roundtrip(self, np_array): - arr = pa.Array.from_pandas(np_array) + arr = pa.array(np_array) result = arr.to_pandas() npt.assert_array_equal(result, np_array) @@ -889,7 +876,7 @@ def test_numpy_datetime64_columns(self): def test_all_nones(self): def _check_series(s): - converted = pa.Array.from_pandas(s) + converted = pa.array(s) assert isinstance(converted, pa.NullArray) assert 
len(converted) == 3 assert converted.null_count == 3 diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index eb23894f480..0308dd3f948 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -497,41 +497,41 @@ def test_column_of_lists(tmpdir): def test_date_time_types(): t1 = pa.date32() data1 = np.array([17259, 17260, 17261], dtype='int32') - a1 = pa.Array.from_pandas(data1, type=t1) + a1 = pa.array(data1, type=t1) t2 = pa.date64() data2 = data1.astype('int64') * 86400000 - a2 = pa.Array.from_pandas(data2, type=t2) + a2 = pa.array(data2, type=t2) t3 = pa.timestamp('us') start = pd.Timestamp('2000-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') - a3 = pa.Array.from_pandas(data3, type=t3) + a3 = pa.array(data3, type=t3) t4 = pa.time32('ms') data4 = np.arange(3, dtype='i4') - a4 = pa.Array.from_pandas(data4, type=t4) + a4 = pa.array(data4, type=t4) t5 = pa.time64('us') - a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5) + a5 = pa.array(data4.astype('int64'), type=t5) t6 = pa.time32('s') - a6 = pa.Array.from_pandas(data4, type=t6) + a6 = pa.array(data4, type=t6) ex_t6 = pa.time32('ms') - ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6) + ex_a6 = pa.array(data4 * 1000, type=ex_t6) t7 = pa.timestamp('ns') start = pd.Timestamp('2001-01-01').value data7 = np.array([start, start + 1000, start + 2000], dtype='int64') - a7 = pa.Array.from_pandas(data7, type=t7) + a7 = pa.array(data7, type=t7) t7_us = pa.timestamp('us') start = pd.Timestamp('2001-01-01').value data7_us = np.array([start, start + 1000, start + 2000], dtype='int64') // 1000 - a7_us = pa.Array.from_pandas(data7_us, type=t7_us) + a7_us = pa.array(data7_us, type=t7_us) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], ['date32', 'date64', 'timestamp[us]', @@ -575,7 +575,7 @@ def _assert_unsupported(array): _write_table(table, buf, version="2.0") t7 = pa.time64('ns') - a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7) + a7 = pa.array(data4.astype('int64'), type=t7) _assert_unsupported(a7) @@ -1295,7 +1295,7 @@ def test_large_table_int32_overflow(): arr = np.ones(size, dtype='uint8') - parr = pa.Array.from_pandas(arr, type=pa.uint8()) + parr = pa.array(arr, type=pa.uint8()) table = pa.Table.from_arrays([parr], names=['one']) f = io.BytesIO() From cf40b7678d39ddbccbc9b0e477b74ac4766eb577 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 27 Sep 2017 23:55:40 -0400 Subject: [PATCH 2/7] Add type aliases, some unit tests Change-Id: I17dee43549b04a06a190baf5d0996fab4d60301f --- python/pyarrow/__init__.py | 2 +- python/pyarrow/array.pxi | 4 +- python/pyarrow/tests/test_array.py | 10 +++-- python/pyarrow/tests/test_schema.py | 41 ++++++++++++++++++++ python/pyarrow/types.pxi | 60 +++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 0d76a35f4ed..ac069482274 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -36,7 +36,7 @@ time32, time64, timestamp, date32, date64, float16, float32, float64, binary, string, decimal, - list_, struct, dictionary, field, + list_, struct, dictionary, field, type_for_alias, DataType, NAType, Field, Schema, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1c0e26e5179..622a6de85d6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -102,7 +102,7 @@ cdef _arraylike_to_array(object obj, object mask, DataType 
type, return pyarrow_wrap_array(out) -def array(object sequence, DataType type=None, mask=None, +def array(object sequence, type=None, mask=None, MemoryPool memory_pool=None, size=None): """ Create pyarrow.Array instance from a Python sequence @@ -158,6 +158,8 @@ def array(object sequence, DataType type=None, mask=None, array : pyarrow.Array or pyarrow.ChunkedArray (if object data overflowed binary storage) """ + if type is not None and not isinstance(type, DataType): + type = type_for_alias(type) if _is_array_like(sequence): return _arraylike_to_array(sequence, mask, type, memory_pool) else: diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index c52ddb2cd98..56aa70e3904 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -150,7 +150,11 @@ def test_array_factory_invalid_type(): def test_array_ref_to_ndarray_base(): - pass + arr = np.array([1, 2, 3]) + + refcount = sys.getrefcount(arr) + arr2 = pa.array(arr) # noqa + assert sys.getrefcount(arr) == (refcount + 1) def test_dictionary_from_numpy(): @@ -205,9 +209,9 @@ def test_dictionary_with_pandas(): def test_list_from_arrays(): offsets_arr = np.array([0, 2, 5, 8], dtype='i4') - offsets = pa.array(offsets_arr, type=pa.int32()) + offsets = pa.array(offsets_arr, type='int32') pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] - values = pa.array(pyvalues, type=pa.binary()) + values = pa.array(pyvalues, type='binary') result = pa.ListArray.from_arrays(offsets, values) expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]]) diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 4bb6a5af7dc..ece7b82a897 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -69,6 +69,47 @@ def test_type_list(): assert str(l2) == 'list' +def test_type_for_alias(): + cases = [ + ('i1', pa.int8()), + ('int8', pa.int8()), + ('i2', pa.int16()), + ('int16', pa.int16()), + ('i4', pa.int32()), + ('int32', pa.int32()), + ('i8', pa.int64()), + ('int64', pa.int64()), + ('u1', pa.uint8()), + ('uint8', pa.uint8()), + ('u2', pa.uint16()), + ('uint16', pa.uint16()), + ('u4', pa.uint32()), + ('uint32', pa.uint32()), + ('u8', pa.uint64()), + ('uint64', pa.uint64()), + ('f4', pa.float32()), + ('float32', pa.float32()), + ('f8', pa.float64()), + ('float64', pa.float64()), + ('date32', pa.date32()), + ('date64', pa.date64()), + ('string', pa.string()), + ('str', pa.string()), + ('binary', pa.binary()), + ('time32[s]', pa.time32('s')), + ('time32[ms]', pa.time32('ms')), + ('time64[us]', pa.time64('us')), + ('time64[ns]', pa.time64('ns')), + ('timestamp[s]', pa.timestamp('s')), + ('timestamp[ms]', pa.timestamp('ms')), + ('timestamp[us]', pa.timestamp('us')), + ('timestamp[ns]', pa.timestamp('ns')), + ] + + for val, expected in cases: + assert pa.type_for_alias(val) == expected + + def test_type_string(): t = pa.string() assert str(t) == 'string' diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index b298e740250..0c96a553d85 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
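The alias table these tests exercise is added to types.pxi below, and pa.array consults it as well, so a plain string can stand in for a DataType. A short usage sketch, reusing only aliases and calls that appear in the tests above:

    import numpy as np
    import pyarrow as pa

    assert pa.type_for_alias('i4') == pa.int32()
    assert pa.type_for_alias('timestamp[ms]') == pa.timestamp('ms')

    # pa.array resolves string aliases through the same table
    offsets = pa.array(np.array([0, 2, 5, 8], dtype='i4'), type='int32')
    values = pa.array([b'a', b'b', b'c'], type='binary')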
+import re + # These are imprecise because the type (in pandas 0.x) depends on the presence # of nulls cdef dict _pandas_type_map = { @@ -922,6 +924,64 @@ def struct(fields): return pyarrow_wrap_data_type(struct_type) +cdef dict _type_aliases = { + 'null': null, + 'i1': int8, + 'int8': int8, + 'i2': int16, + 'int16': int16, + 'i4': int32, + 'int32': int32, + 'i8': int64, + 'int64': int64, + 'u1': uint8, + 'uint8': uint8, + 'u2': uint16, + 'uint16': uint16, + 'u4': uint32, + 'uint32': uint32, + 'u8': uint64, + 'uint64': uint64, + 'f4': float32, + 'float32': float32, + 'f8': float64, + 'float64': float64, + 'string': string, + 'str': string, + 'utf8': string, + 'binary': binary, + 'date32': date32, + 'date64': date64, + 'time32[s]': time32('s'), + 'time32[ms]': time32('ms'), + 'time64[us]': time64('us'), + 'time64[ns]': time64('ns'), + 'timestamp[s]': timestamp('s'), + 'timestamp[ms]': timestamp('ms'), + 'timestamp[us]': timestamp('us'), + 'timestamp[ns]': timestamp('ns'), +} + + +def type_for_alias(name): + """ + Return DataType given a string alias if one exists + + Returns + ------- + type : DataType + """ + name = name.lower() + try: + alias = _type_aliases[name] + except KeyError: + raise ValueError('No type alias for {0}'.format(name)) + + if isinstance(alias, DataType): + return alias + return alias() + + def schema(fields): """ Construct pyarrow.Schema from collection of fields From 587c575aadcada9baba06431bc8db3413d29ae92 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 28 Sep 2017 00:20:33 -0400 Subject: [PATCH 3/7] Add direct types sequence converters for more data types Change-Id: I484937c8eb23b96402ec6b1ec3d4342fa8dedbd4 --- cpp/src/arrow/python/builtin_convert.cc | 223 ++++++++++++++++-------- python/pyarrow/scalar.pxi | 8 +- 2 files changed, 156 insertions(+), 75 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 747b872af0a..b17a12f5a86 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -359,7 +360,11 @@ class TypedConverterVisitor : public TypedConverter { if (PySequence_Check(obj)) { for (int64_t i = 0; i < size; ++i) { OwnedRef ref(PySequence_GetItem(obj, i)); - RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + if (ref.obj() == Py_None) { + RETURN_NOT_OK(this->typed_builder_->AppendNull()); + } else { + RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + } } } else if (PyObject_HasAttrString(obj, "__iter__")) { PyObject* iter = PyObject_GetIter(obj); @@ -370,7 +375,11 @@ class TypedConverterVisitor : public TypedConverter { // consuming at size. 
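With the None check hoisted into AppendData here, each typed AppendItem below only ever sees non-None values, and the new fixed-width integer converters add range checks on top of that. The intended user-visible behavior, sketched in Python (the ArrowInvalid error class is the one already used by test_cast_integers_safe; exact messages may differ):

    import pyarrow as pa

    arr = pa.array([1, None, 3], type='i1')  # Int8Array with a null at index 1
    assert arr.null_count == 1

    try:
        pa.array([1000], type='i1')          # out of range for int8
    except pa.ArrowInvalid:
        pass  # "Cannot coerce values to array type that would lose data"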
while ((item = PyIter_Next(iter)) && i < size) { OwnedRef ref(item); - RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + if (ref.obj() == Py_None) { + RETURN_NOT_OK(this->typed_builder_->AppendNull()); + } else { + RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + } ++i; } if (size != i) { @@ -388,52 +397,136 @@ class TypedConverterVisitor : public TypedConverter { class NullConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - return Status::Invalid("NullConverter: passed non-None value"); - } + return Status::Invalid("NullConverter: passed non-None value"); } }; class BoolConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - if (item.obj() == Py_True) { - return typed_builder_->Append(true); - } else { - return typed_builder_->Append(false); - } + return typed_builder_->Append(item.obj() == Py_True); + } +}; + +class Int8Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class Int16Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); + } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class Int32Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); + } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); } }; class Int64Converter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - int64_t val; - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - val = static_cast(PyLong_AsLongLong(item.obj())); - RETURN_IF_PYERROR(); - return typed_builder_->Append(val); + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); + return typed_builder_->Append(val); + } +}; + +class UInt8Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); } }; -class DateConverter : public TypedConverterVisitor { +class UInt16Converter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == 
Py_None) { - return typed_builder_->AppendNull(); - } else { - PyDateTime_Date* pydate = reinterpret_cast(item.obj()); - return typed_builder_->Append(PyDate_to_ms(pydate)); + uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class UInt32Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); + } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class UInt64Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); + return typed_builder_->Append(val); + } +}; + +class DateConverter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + PyDateTime_Date* pydate = reinterpret_cast(item.obj()); + return typed_builder_->Append(PyDate_to_ms(pydate)); } }; @@ -441,27 +534,17 @@ class TimestampConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - PyDateTime_DateTime* pydatetime = - reinterpret_cast(item.obj()); - return typed_builder_->Append(PyDateTime_to_us(pydatetime)); - } + PyDateTime_DateTime* pydatetime = reinterpret_cast(item.obj()); + return typed_builder_->Append(PyDateTime_to_us(pydatetime)); } }; class DoubleConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - double val; - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - val = PyFloat_AsDouble(item.obj()); - RETURN_IF_PYERROR(); - return typed_builder_->Append(val); - } + double val = PyFloat_AsDouble(item.obj()); + RETURN_IF_PYERROR(); + return typed_builder_->Append(val); } }; @@ -473,10 +556,7 @@ class BytesConverter : public TypedConverterVisitorAppendNull()); - return Status::OK(); - } else if (PyUnicode_Check(item.obj())) { + if (PyUnicode_Check(item.obj())) { tmp.reset(PyUnicode_AsUTF8String(item.obj())); RETURN_IF_PYERROR(); bytes_obj = tmp.obj(); @@ -504,10 +584,7 @@ class FixedWidthBytesConverter Py_ssize_t expected_length = std::dynamic_pointer_cast(typed_builder_->type()) ->byte_width(); - if (item.obj() == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - return Status::OK(); - } else if (PyUnicode_Check(item.obj())) { + if (PyUnicode_Check(item.obj())) { tmp.reset(PyUnicode_AsUTF8String(item.obj())); RETURN_IF_PYERROR(); bytes_obj = tmp.obj(); @@ -535,9 +612,7 @@ class UTF8Converter : public TypedConverterVisitor Py_ssize_t length; PyObject* obj = item.obj(); - if (obj == Py_None) { - return typed_builder_->AppendNull(); - } else if (PyBytes_Check(obj)) { + if (PyBytes_Check(obj)) { tmp.reset( PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj))); RETURN_IF_PYERROR(); @@ -565,14 +640,10 @@ class ListConverter : public TypedConverterVisitor { Status Init(ArrayBuilder* builder) override; inline 
Status AppendItem(const OwnedRef& item) override { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - RETURN_NOT_OK(typed_builder_->Append()); - PyObject* item_obj = item.obj(); - int64_t list_size = static_cast(PySequence_Size(item_obj)); - return value_converter_->AppendData(item_obj, list_size); - } + RETURN_NOT_OK(typed_builder_->Append()); + PyObject* item_obj = item.obj(); + int64_t list_size = static_cast(PySequence_Size(item_obj)); + return value_converter_->AppendData(item_obj, list_size); } protected: @@ -584,16 +655,12 @@ class DecimalConverter public: inline Status AppendItem(const OwnedRef& item) { /// TODO(phillipc): Check for nan? - if (item.obj() != Py_None) { - std::string string; - RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string)); - - Decimal128 value; - RETURN_NOT_OK(Decimal128::FromString(string, &value)); - return typed_builder_->Append(value); - } + std::string string; + RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string)); - return typed_builder_->AppendNull(); + Decimal128 value; + RETURN_NOT_OK(Decimal128::FromString(string, &value)); + return typed_builder_->Append(value); } }; @@ -604,8 +671,22 @@ std::shared_ptr GetConverter(const std::shared_ptr& type return std::make_shared(); case Type::BOOL: return std::make_shared(); + case Type::INT8: + return std::make_shared(); + case Type::INT16: + return std::make_shared(); + case Type::INT32: + return std::make_shared(); case Type::INT64: return std::make_shared(); + case Type::UINT8: + return std::make_shared(); + case Type::UINT16: + return std::make_shared(); + case Type::UINT32: + return std::make_shared(); + case Type::UINT64: + return std::make_shared(); case Type::DATE64: return std::make_shared(); case Type::TIMESTAMP: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 3a847f77c4f..c37ed3b200e 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -348,10 +348,10 @@ cdef class StructValue(ArrayValue): cdef dict _scalar_classes = { _Type_BOOL: BooleanValue, - _Type_UINT8: Int8Value, - _Type_UINT16: Int16Value, - _Type_UINT32: Int32Value, - _Type_UINT64: Int64Value, + _Type_UINT8: UInt8Value, + _Type_UINT16: UInt16Value, + _Type_UINT32: UInt32Value, + _Type_UINT64: UInt64Value, _Type_INT8: Int8Value, _Type_INT16: Int16Value, _Type_INT32: Int32Value, From f2802fc724225ed9986f10a39eff4bb4cae6ff7a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 28 Sep 2017 17:12:31 -0500 Subject: [PATCH 4/7] Cleaner codepath for numpy->arrow conversions Change-Id: I2ec4737119bf25c5f5a5ee0e760855d01daaa79b --- cpp/src/arrow/python/CMakeLists.txt | 4 +- cpp/src/arrow/python/api.h | 2 +- .../{pandas_to_arrow.cc => numpy_to_arrow.cc} | 192 +++++++++--------- .../{pandas_to_arrow.h => numpy_to_arrow.h} | 21 +- python/pyarrow/array.pxi | 129 +++++++----- python/pyarrow/includes/libarrow.pxd | 11 +- python/pyarrow/pandas_compat.py | 2 +- 7 files changed, 190 insertions(+), 171 deletions(-) rename cpp/src/arrow/python/{pandas_to_arrow.cc => numpy_to_arrow.cc} (92%) rename cpp/src/arrow/python/{pandas_to_arrow.h => numpy_to_arrow.h} (70%) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 84aad82e2a9..7938d8473b6 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -57,7 +57,7 @@ set(ARROW_PYTHON_SRCS init.cc io.cc numpy_convert.cc - pandas_to_arrow.cc + numpy_to_arrow.cc python_to_arrow.cc pyarrow.cc ) @@ -100,7 +100,7 @@ install(FILES io.h numpy_convert.h 
numpy_interop.h - pandas_to_arrow.h + numpy_to_arrow.h python_to_arrow.h platform.h pyarrow.h diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index 4ceb3f1a45d..a000ac5fa5a 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -25,7 +25,7 @@ #include "arrow/python/helpers.h" #include "arrow/python/io.h" #include "arrow/python/numpy_convert.h" -#include "arrow/python/pandas_to_arrow.h" +#include "arrow/python/numpy_to_arrow.h" #include "arrow/python/python_to_arrow.h" #endif // ARROW_PYTHON_API_H diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc similarity index 92% rename from cpp/src/arrow/python/pandas_to_arrow.cc rename to cpp/src/arrow/python/numpy_to_arrow.cc index dc5b67f53e4..1e0e920b85c 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -19,10 +19,9 @@ #define ARROW_NO_DEFAULT_MEMORY_POOL +#include "arrow/python/numpy_to_arrow.h" #include "arrow/python/numpy_interop.h" -#include "arrow/python/pandas_to_arrow.h" - #include #include #include @@ -60,10 +59,14 @@ namespace py { using internal::NumPyTypeSize; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max(); + // ---------------------------------------------------------------------- // Conversion utilities -static inline bool PyFloat_isnan(const PyObject* obj) { +namespace { + +inline bool PyFloat_isnan(const PyObject* obj) { if (PyFloat_Check(obj)) { double val = PyFloat_AS_DOUBLE(obj); return val != val; @@ -71,11 +74,12 @@ static inline bool PyFloat_isnan(const PyObject* obj) { return false; } } -static inline bool PandasObjectIsNull(const PyObject* obj) { + +inline bool PandasObjectIsNull(const PyObject* obj) { return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj); } -static inline bool PyObject_is_string(const PyObject* obj) { +inline bool PyObject_is_string(const PyObject* obj) { #if PY_MAJOR_VERSION >= 3 return PyUnicode_Check(obj) || PyBytes_Check(obj); #else @@ -83,14 +87,14 @@ static inline bool PyObject_is_string(const PyObject* obj) { #endif } -static inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); } +inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); } -static inline bool PyObject_is_integer(const PyObject* obj) { +inline bool PyObject_is_integer(const PyObject* obj) { return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); } template -static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { +inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { typedef internal::npy_traits traits; typedef typename traits::value_type T; @@ -109,7 +113,7 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { } // Returns null count -static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { +int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { int64_t null_count = 0; Ndarray1DIndexer mask_values(mask); @@ -124,7 +128,7 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap } template -static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { +Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { typedef internal::npy_traits traits; typedef typename traits::value_type T; @@ -162,7 +166,7 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { return Status::OK(); } -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max(); 
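kBinaryMemoryLimit is what caps a single binary/string array and forces oversized object-dtype input to spill into multiple chunks, which is why the Python entry points document returning a ChunkedArray when object data overflows binary storage; test_bytes_exceed_2gb earlier in this series pins down the behavior. A sketch of the user-visible effect, assuming ~1 MB elements as in that test (actually running it needs several GB of RAM):

    import numpy as np
    import pyarrow as pa

    val = b'x' * (1 << 20)                       # ~1 MB per element
    data = np.array([val] * 4000, dtype=object)  # ~4 GB total, over the limit
    arr = pa.array(data)
    assert isinstance(arr, pa.ChunkedArray)      # spilled into two chunks
    assert arr.num_chunks == 2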
+} // namespace /// Append as many string objects from NumPy arrays to a `StringBuilder` as we /// can fit @@ -272,10 +276,10 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas // ---------------------------------------------------------------------- // Conversion from NumPy-in-Pandas to Arrow -class PandasConverter { +class NumPyConverter { public: - PandasConverter(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type) + NumPyConverter(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr& type) : pool_(pool), type_(type), arr_(reinterpret_cast(ao)), @@ -291,6 +295,39 @@ class PandasConverter { return astrides[0] != PyArray_DESCR(arr_)->elsize; } + Status Convert(); + + const std::vector>& result() const { return out_arrays_; } + + template + typename std::enable_if::value || + std::is_same::value, + Status>::type + Visit(const T& type) { + return VisitNative(); + } + + Status Visit(const Date32Type& type) { return VisitNative(); } + Status Visit(const Date64Type& type) { return VisitNative(); } + Status Visit(const TimestampType& type) { return VisitNative(); } + Status Visit(const Time32Type& type) { return VisitNative(); } + Status Visit(const Time64Type& type) { return VisitNative(); } + + Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const FixedSizeBinaryType& type) { + return TypeNotImplemented(type.ToString()); + } + + Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); } + + protected: Status InitNullBitmap() { int64_t null_bytes = BitUtil::BytesForBits(length_); @@ -350,58 +387,17 @@ class PandasConverter { return PushArray(arr_data); } - template - typename std::enable_if::value || - std::is_same::value, - Status>::type - Visit(const T& type) { - return VisitNative(); - } - - Status Visit(const Date32Type& type) { return VisitNative(); } - Status Visit(const Date64Type& type) { return VisitNative(); } - Status Visit(const TimestampType& type) { return VisitNative(); } - Status Visit(const Time32Type& type) { return VisitNative(); } - Status Visit(const Time64Type& type) { return VisitNative(); } - Status TypeNotImplemented(std::string type_name) { std::stringstream ss; - ss << "PandasConverter doesn't implement <" << type_name << "> conversion. "; + ss << "NumPyConverter doesn't implement <" << type_name << "> conversion. 
"; return Status::NotImplemented(ss.str()); } - Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const FixedSizeBinaryType& type) { - return TypeNotImplemented(type.ToString()); - } - - Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); } - - Status Convert() { - if (PyArray_NDIM(arr_) != 1) { - return Status::Invalid("only handle 1-dimensional arrays"); - } - - if (type_ == nullptr) { - return Status::Invalid("Must pass data type"); - } - - // Visit the type to perform conversion - return VisitTypeInline(*type_, this); - } - - const std::vector>& result() const { return out_arrays_; } - // ---------------------------------------------------------------------- // Conversion logic for various object dtype arrays + Status ConvertObjects(); + template Status ConvertTypedLists(const std::shared_ptr& type, ListBuilder* builder, PyObject* list); @@ -419,11 +415,9 @@ class PandasConverter { PyObject* list); Status ConvertDecimals(); Status ConvertTimes(); - Status ConvertObjects(); Status ConvertObjectsInfer(); Status ConvertObjectsInferAndCast(); - protected: MemoryPool* pool_; std::shared_ptr type_; PyArrayObject* arr_; @@ -437,6 +431,23 @@ class PandasConverter { uint8_t* null_bitmap_data_; }; +Status NumPyConverter::Convert() { + if (PyArray_NDIM(arr_) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + if (PyArray_DESCR(arr_)->type_num == NPY_OBJECT) { + return ConvertObjects(); + } + + if (type_ == nullptr) { + return Status::Invalid("Must pass data type for non-object arrays"); + } + + // Visit the type to perform conversion + return VisitTypeInline(*type_, this); +} + template void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) { // Passing input_data as non-const is a concession to PyObject* @@ -482,7 +493,7 @@ static Status CastBuffer(const std::shared_ptr& input, const int64_t len } template -inline Status PandasConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { using traits = internal::arrow_traits; using T = typename traits::T; @@ -513,7 +524,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* data) { } template <> -inline Status PandasConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { // Handle LONGLONG->INT64 and other fun things int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num); int type_size = NumPyTypeSize(type_num_compat); @@ -552,7 +563,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* } template <> -inline Status PandasConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { int64_t nbytes = BitUtil::BytesForBits(length_); auto buffer = std::make_shared(pool_); RETURN_NOT_OK(buffer->Resize(nbytes)); @@ -590,7 +601,7 @@ struct UnboxDate { }; template -Status PandasConverter::ConvertDates() { +Status NumPyConverter::ConvertDates() { PyAcquireGIL lock; using BuilderType = typename TypeTraits::BuilderType; @@ -626,7 +637,7 @@ Status PandasConverter::ConvertDates() { return PushBuilderResult(&builder); } -Status 
PandasConverter::ConvertDecimals() { +Status NumPyConverter::ConvertDecimals() { PyAcquireGIL lock; // Import the decimal module and Decimal class @@ -669,7 +680,7 @@ Status PandasConverter::ConvertDecimals() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertTimes() { +Status NumPyConverter::ConvertTimes() { // Convert array of datetime.time objects to Arrow PyAcquireGIL lock; PyDateTime_IMPORT; @@ -697,7 +708,7 @@ Status PandasConverter::ConvertTimes() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertObjectStrings() { +Status NumPyConverter::ConvertObjectStrings() { PyAcquireGIL lock; // The output type at this point is inconclusive because there may be bytes @@ -729,7 +740,7 @@ Status PandasConverter::ConvertObjectStrings() { return Status::OK(); } -Status PandasConverter::ConvertObjectFloats() { +Status NumPyConverter::ConvertObjectFloats() { PyAcquireGIL lock; Ndarray1DIndexer objects(arr_); @@ -764,7 +775,7 @@ Status PandasConverter::ConvertObjectFloats() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertObjectIntegers() { +Status NumPyConverter::ConvertObjectIntegers() { PyAcquireGIL lock; Int64Builder builder(pool_); @@ -799,7 +810,7 @@ Status PandasConverter::ConvertObjectIntegers() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertObjectFixedWidthBytes( +Status NumPyConverter::ConvertObjectFixedWidthBytes( const std::shared_ptr& type) { PyAcquireGIL lock; @@ -822,7 +833,7 @@ Status PandasConverter::ConvertObjectFixedWidthBytes( return Status::OK(); } -Status PandasConverter::ConvertBooleans() { +Status NumPyConverter::ConvertBooleans() { PyAcquireGIL lock; Ndarray1DIndexer objects(arr_); @@ -864,7 +875,7 @@ Status PandasConverter::ConvertBooleans() { return Status::OK(); } -Status PandasConverter::ConvertObjectsInfer() { +Status NumPyConverter::ConvertObjectsInfer() { Ndarray1DIndexer objects; PyAcquireGIL lock; @@ -912,7 +923,7 @@ Status PandasConverter::ConvertObjectsInfer() { return Status::OK(); } -Status PandasConverter::ConvertObjectsInferAndCast() { +Status NumPyConverter::ConvertObjectsInferAndCast() { size_t position = out_arrays_.size(); RETURN_NOT_OK(ConvertObjectsInfer()); @@ -932,7 +943,7 @@ Status PandasConverter::ConvertObjectsInferAndCast() { return Status::OK(); } -Status PandasConverter::ConvertObjects() { +Status NumPyConverter::ConvertObjects() { // Python object arrays are annoying, since we could have one of: // // * Strings @@ -1005,8 +1016,8 @@ Status LoopPySequence(PyObject* sequence, T func) { } template -inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr& type, - ListBuilder* builder, PyObject* list) { +inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr& type, + ListBuilder* builder, PyObject* list) { typedef internal::npy_traits traits; typedef typename traits::BuilderClass BuilderT; @@ -1050,7 +1061,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr } template <> -inline Status PandasConverter::ConvertTypedLists( +inline Status NumPyConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { PyAcquireGIL lock; @@ -1091,7 +1102,7 @@ inline Status PandasConverter::ConvertTypedLists( } template <> -inline Status PandasConverter::ConvertTypedLists( +inline Status NumPyConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { PyAcquireGIL lock; // TODO: If there are bytes involed, convert to Binary representation @@ -1145,8 
+1156,8 @@ inline Status PandasConverter::ConvertTypedLists( return ConvertTypedLists(type, builder, list); \ } -Status PandasConverter::ConvertLists(const std::shared_ptr& type, - ListBuilder* builder, PyObject* list) { +Status NumPyConverter::ConvertLists(const std::shared_ptr& type, + ListBuilder* builder, PyObject* list) { switch (type->id()) { LIST_CASE(NA, NPY_OBJECT, NullType) LIST_CASE(UINT8, NPY_UINT8, UInt8Type) @@ -1185,7 +1196,7 @@ Status PandasConverter::ConvertLists(const std::shared_ptr& type, } } -Status PandasConverter::ConvertLists(const std::shared_ptr& type) { +Status NumPyConverter::ConvertLists(const std::shared_ptr& type) { std::unique_ptr array_builder; RETURN_NOT_OK(MakeBuilder(pool_, arrow::list(type), &array_builder)); ListBuilder* list_builder = static_cast(array_builder.get()); @@ -1193,20 +1204,13 @@ Status PandasConverter::ConvertLists(const std::shared_ptr& type) { return PushBuilderResult(list_builder); } -Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out) { - PandasConverter converter(pool, ao, mo, type); +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + bool use_pandas_null_sentinels, + const std::shared_ptr& type, + std::shared_ptr* out) { + NumPyConverter converter(pool, ao, mo, type); RETURN_NOT_OK(converter.Convert()); - *out = converter.result()[0]; - DCHECK(*out); - return Status::OK(); -} - -Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, - std::shared_ptr* out) { - PandasConverter converter(pool, ao, mo, type); - RETURN_NOT_OK(converter.ConvertObjects()); + DCHECK(converter.result()[0]); *out = std::make_shared(converter.result()); return Status::OK(); } diff --git a/cpp/src/arrow/python/pandas_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h similarity index 70% rename from cpp/src/arrow/python/pandas_to_arrow.h rename to cpp/src/arrow/python/numpy_to_arrow.h index 3e655ba3fee..4a70b4bc533 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -17,8 +17,8 @@ // Converting from pandas memory representation to Arrow data structures -#ifndef ARROW_PYTHON_PANDAS_TO_ARROW_H -#define ARROW_PYTHON_PANDAS_TO_ARROW_H +#ifndef ARROW_PYTHON_NUMPY_TO_ARROW_H +#define ARROW_PYTHON_NUMPY_TO_ARROW_H #include "arrow/python/platform.h" @@ -36,12 +36,8 @@ class Status; namespace py { -ARROW_EXPORT -Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out); - -/// Convert dtype=object arrays. If target data type is not known, pass a type -/// with nullptr +/// Convert NumPy arrays to Arrow. 
If target data type is not known, pass a +/// type with nullptr /// /// \param[in] pool Memory pool for any memory allocations /// \param[in] ao an ndarray with the array data @@ -49,11 +45,12 @@ Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, /// \param[in] type /// \param[out] out a ChunkedArray, to accommodate chunked output ARROW_EXPORT -Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, - std::shared_ptr* out); +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + bool use_pandas_null_sentinels, + const std::shared_ptr& type, + std::shared_ptr* out); } // namespace py } // namespace arrow -#endif // ARROW_PYTHON_PANDAS_TO_ARROW_H +#endif // ARROW_PYTHON_NUMPY_TO_ARROW_H diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 622a6de85d6..ba300565629 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -17,10 +17,9 @@ cdef _sequence_to_array(object sequence, object size, DataType type, - MemoryPool memory_pool): + CMemoryPool* pool): cdef shared_ptr[CArray] out cdef int64_t c_size - cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) if type is None: with nogil: check_status(ConvertPySequence(sequence, pool, &out)) @@ -52,64 +51,41 @@ cdef _is_array_like(obj): return isinstance(obj, np.ndarray) -cdef _arraylike_to_array(object obj, object mask, DataType type, - MemoryPool memory_pool): - cdef: - shared_ptr[CArray] out - shared_ptr[CChunkedArray] chunked_out - shared_ptr[CDataType] c_type - - if mask is not None: - mask = get_series_values(mask) +cdef _ndarray_to_array(object values, object mask, DataType type, + c_bool use_pandas_null_sentinels, + CMemoryPool* pool): + cdef shared_ptr[CChunkedArray] chunked_out + cdef shared_ptr[CDataType] c_type - values = get_series_values(obj) + dtype = values.dtype - cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) - - if isinstance(values, Categorical): - return DictionaryArray.from_arrays( - values.codes, values.categories.values, - mask=mask, ordered=values.ordered, - memory_pool=memory_pool) - elif values.dtype == object: - # Object dtype undergoes a different conversion path as more type - # inference may be needed - if type is not None: - c_type = type.sp_type + if type is None and dtype != object: with nogil: - check_status(PandasObjectsToArrow( - pool, values, mask, c_type, &chunked_out)) - - if chunked_out.get().num_chunks() > 1: - return pyarrow_wrap_chunked_array(chunked_out) - else: - out = chunked_out.get().chunk(0) - else: - values, type = pdcompat.maybe_coerce_datetime64( - values, obj.dtype, type) + check_status(NumPyDtypeToArrow(dtype, &c_type)) - if type is None: - dtype = values.dtype - with nogil: - check_status(NumPyDtypeToArrow(dtype, &c_type)) - else: - c_type = type.sp_type + if type is not None: + c_type = type.sp_type - with nogil: - check_status(PandasToArrow( - pool, values, mask, c_type, &out)) + with nogil: + check_status(NdarrayToArrow(pool, values, mask, + use_pandas_null_sentinels, + c_type, &chunked_out)) - return pyarrow_wrap_array(out) + if chunked_out.get().num_chunks() > 1: + return pyarrow_wrap_chunked_array(chunked_out) + else: + return pyarrow_wrap_array(chunked_out.get().chunk(0)) -def array(object sequence, type=None, mask=None, - MemoryPool memory_pool=None, size=None): +def array(object obj, type=None, mask=None, + MemoryPool memory_pool=None, size=None, + from_pandas=False): """ - Create pyarrow.Array instance from a Python sequence + Create pyarrow.Array 
instance from a Python object Parameters ---------- - sequence : sequence, iterable, ndarray or Series + obj : sequence, iterable, ndarray or Series If both type and size are specified may be a single use iterable. If not strongly-typed, Arrow type will be inferred for resulting array mask : array (boolean), optional @@ -120,12 +96,16 @@ def array(object sequence, type=None, mask=None, memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool + size : int64, optional Size of the elements. If the input is larger than size bail at this length. For iterators, if size is larger than the input iterator this will be treated as a "max size", but will involve an initial allocation of size followed by a resize to the actual size (so if you know the exact size specifying it correctly will give you better performance). + from_pandas : boolean, default False + Use pandas's semantics for inferring nulls from values in ndarray-like + data Notes ----- @@ -160,12 +140,27 @@ def array(object sequence, type=None, mask=None, """ if type is not None and not isinstance(type, DataType): type = type_for_alias(type) - if _is_array_like(sequence): - return _arraylike_to_array(sequence, mask, type, memory_pool) + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + + if _is_array_like(obj): + if mask is not None: + mask = get_series_values(mask) + + values = get_series_values(obj) + + if isinstance(values, Categorical): + return DictionaryArray.from_arrays( + values.codes, values.categories.values, + mask=mask, ordered=values.ordered, + memory_pool=memory_pool) + else: + values, type = pdcompat.get_datetimetz_type(values, obj.dtype, + type) + return _ndarray_to_array(values, mask, type, from_pandas, pool) else: if mask is not None: raise ValueError("Masks only supported with ndarray-like inputs") - return _sequence_to_array(obj, size, type, pool) def _normalize_slice(object arrow_obj, slice key): @@ -240,10 +235,36 @@ cdef class Array: @staticmethod def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None): - """Convert pandas.Series to an Arrow Array. See pyarrow.array for more - information on usage. """ - return array(obj, mask=mask, type=type, memory_pool=memory_pool) + Convert pandas.Series to an Arrow Array, using pandas's semantics about + what values indicate nulls. See pyarrow.array for more general + conversion from arrays or sequences to Arrow arrays + + Parameters + ---------- + obj : ndarray, pandas.Series or Index + mask : array (boolean), optional + Indicate which values are null (True) or not null (False) + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred + from the data + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool + + Notes + ----- + Localized timestamps will currently be returned as UTC (pandas's native + representation). Timezone-naive data will be implicitly interpreted as + UTC.
+ + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray (if object data + overflows binary buffer) + """ + return array(obj, mask=mask, type=type, memory_pool=memory_pool, + from_pandas=True) property null_count: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 5e6708871e6..fc17d1c06ae 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -766,13 +766,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) - CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CArray]* out) - - CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CChunkedArray]* out) + CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo, + c_bool use_pandas_null_sentinels, + const shared_ptr[CDataType]& type, + shared_ptr[CChunkedArray]* out) CStatus NdarrayToTensor(CMemoryPool* pool, object ao, shared_ptr[CTensor]* out) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 2ae55b09231..5c6b5328fc7 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -240,7 +240,7 @@ def dataframe_to_arrays(df, schema, preserve_index): return names, arrays, metadata -def maybe_coerce_datetime64(values, dtype, type_): +def get_datetimetz_type(values, dtype, type_): from pyarrow.compat import DatetimeTZDtype if values.dtype.type != np.datetime64: From 797f0151e83c14d894c436207dd1cee6a2793c6b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 28 Sep 2017 17:50:48 -0500 Subject: [PATCH 5/7] Allow null checking to be skipped with from_pandas=False in pyarrow.array Change-Id: Ie8e87c3c529f4071e221f390b333ad702d247c8d --- cpp/src/arrow/python/numpy_to_arrow.cc | 65 ++++++++++++--------- python/pyarrow/pandas_compat.py | 2 +- python/pyarrow/tests/test_array.py | 14 +++++ python/pyarrow/tests/test_convert_pandas.py | 20 +++---- python/pyarrow/tests/test_parquet.py | 20 ++++++- python/pyarrow/types.pxi | 12 +++- 6 files changed, 90 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 1e0e920b85c..7151c94c0db 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -127,29 +127,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { return null_count; } -template -Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { - typedef internal::npy_traits traits; - typedef typename traits::value_type T; - - // TODO(wesm): Vector append when not strided - Ndarray1DIndexer values(array); - if (traits::supports_nulls) { - for (int64_t i = 0; i < values.size(); ++i) { - if (traits::isnull(values[i])) { - RETURN_NOT_OK(builder->AppendNull()); - } else { - RETURN_NOT_OK(builder->Append(values[i])); - } - } - } else { - for (int64_t i = 0; i < values.size(); ++i) { - RETURN_NOT_OK(builder->Append(values[i])); - } - } - return Status::OK(); -} - Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { if (PyArray_NDIM(numpy_array) != 1) { return Status::Invalid("only handle 1-dimensional arrays"); @@ -279,11 +256,12 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas class NumPyConverter { public: NumPyConverter(MemoryPool* pool, PyObject* ao, PyObject* mo, - const 
std::shared_ptr& type) + const std::shared_ptr& type, bool use_pandas_null_sentinels) : pool_(pool), type_(type), arr_(reinterpret_cast(ao)), - mask_(nullptr) { + mask_(nullptr), + use_pandas_null_sentinels_(use_pandas_null_sentinels) { if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast(mo); } @@ -354,6 +332,32 @@ class NumPyConverter { return Status::OK(); } + template + Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { + typedef internal::npy_traits traits; + typedef typename traits::value_type T; + + const bool null_sentinels_possible = + (use_pandas_null_sentinels_ && traits::supports_nulls); + + // TODO(wesm): Vector append when not strided + Ndarray1DIndexer values(array); + if (null_sentinels_possible) { + for (int64_t i = 0; i < values.size(); ++i) { + if (traits::isnull(values[i])) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->Append(values[i])); + } + } + } else { + for (int64_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(builder->Append(values[i])); + } + } + return Status::OK(); + } + Status PushArray(const std::shared_ptr& data) { std::shared_ptr result; RETURN_NOT_OK(MakeArray(data, &result)); @@ -365,7 +369,10 @@ class NumPyConverter { Status VisitNative() { using traits = internal::arrow_traits; - if (mask_ != nullptr || traits::supports_nulls) { + const bool null_sentinels_possible = + (use_pandas_null_sentinels_ && traits::supports_nulls); + + if (mask_ != nullptr || null_sentinels_possible) { RETURN_NOT_OK(InitNullBitmap()); } @@ -375,7 +382,7 @@ class NumPyConverter { int64_t null_count = 0; if (mask_ != nullptr) { null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); - } else if (traits::supports_nulls) { + } else if (null_sentinels_possible) { // TODO(wesm): this presumes the NumPy C type and arrow C type are the // same null_count = ValuesToBitmap(arr_, null_bitmap_data_); @@ -424,6 +431,8 @@ class NumPyConverter { PyArrayObject* mask_; int64_t length_; + bool use_pandas_null_sentinels_; + // Used in visitor pattern std::vector> out_arrays_; @@ -1208,7 +1217,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool use_pandas_null_sentinels, const std::shared_ptr& type, std::shared_ptr* out) { - NumPyConverter converter(pool, ao, mo, type); + NumPyConverter converter(pool, ao, mo, type, use_pandas_null_sentinels); RETURN_NOT_OK(converter.Convert()); DCHECK(converter.result()[0]); *out = std::make_shared(converter.result()); diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5c6b5328fc7..be48aeb442d 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -223,7 +223,7 @@ def dataframe_to_arrays(df, schema, preserve_index): field = schema.field_by_name(name) type = getattr(field, "type", None) - array = pa.array(col, type=type) + array = pa.array(col, from_pandas=True, type=type) arrays.append(array) names.append(name) types.append(array.type) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 56aa70e3904..9b93ee034db 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -323,3 +323,17 @@ def test_simple_type_construction(): ) def test_logical_type(type, expected): assert get_logical_type(type) == expected + + +def test_array_conversions_no_sentinel_values(): + arr = np.array([1, 2, 3, 4], dtype='int8') + refcount = sys.getrefcount(arr) + arr2 = pa.array(arr) # noqa + assert sys.getrefcount(arr) == (refcount + 1) + + assert 
arr2.type == 'int8' + + arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'), + type='float32') + assert arr3.type == 'float32' + assert arr3.null_count == 0 diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index b4b3ba42631..182f3afc7e6 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -82,7 +82,7 @@ def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, tm.assert_frame_equal(result, expected, check_dtype=check_dtype) def _check_series_roundtrip(self, s, type_=None): - arr = pa.array(s, type=type_) + arr = pa.array(s, from_pandas=True, type=type_) result = pd.Series(arr.to_pandas(), name=s.name) if isinstance(arr.type, pa.TimestampType) and arr.type.tz is not None: @@ -93,7 +93,7 @@ def _check_series_roundtrip(self, s, type_=None): def _check_array_roundtrip(self, values, expected=None, mask=None, type=None): - arr = pa.array(values, mask=mask, type=type) + arr = pa.array(values, from_pandas=True, mask=mask, type=type) result = arr.to_pandas() values_nulls = pd.isnull(values) @@ -152,7 +152,7 @@ def test_float_nulls(self): for name, arrow_dtype in dtypes: values = np.random.randn(num_values).astype(name) - arr = pa.array(values, mask=null_mask) + arr = pa.array(values, from_pandas=True, mask=null_mask) arrays.append(arr) fields.append(pa.field(name, arrow_dtype)) values[null_mask] = np.nan @@ -828,8 +828,8 @@ def test_arrow_time_to_pandas(self): tm.assert_frame_equal(df, expected_df) - def _check_numpy_array_roundtrip(self, np_array): - arr = pa.array(np_array) + def _check_array_from_pandas_roundtrip(self, np_array): + arr = pa.array(np_array, from_pandas=True) result = arr.to_pandas() npt.assert_array_equal(result, np_array) @@ -840,7 +840,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - self._check_numpy_array_roundtrip(datetime64_ns) + self._check_array_from_pandas_roundtrip(datetime64_ns) datetime64_us = np.array([ '2007-07-13T01:23:34.123456', @@ -848,7 +848,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432539', '2010-08-13T05:46:57.437699'], dtype='datetime64[us]') - self._check_numpy_array_roundtrip(datetime64_us) + self._check_array_from_pandas_roundtrip(datetime64_us) datetime64_ms = np.array([ '2007-07-13T01:23:34.123', @@ -856,7 +856,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - self._check_numpy_array_roundtrip(datetime64_ms) + self._check_array_from_pandas_roundtrip(datetime64_ms) datetime64_s = np.array([ '2007-07-13T01:23:34', @@ -864,7 +864,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56', '2010-08-13T05:46:57'], dtype='datetime64[s]') - self._check_numpy_array_roundtrip(datetime64_s) + self._check_array_from_pandas_roundtrip(datetime64_s) datetime64_d = np.array([ '2007-07-13', @@ -872,7 +872,7 @@ def test_numpy_datetime64_columns(self): '2006-01-15', '2010-08-19'], dtype='datetime64[D]') - self._check_numpy_array_roundtrip(datetime64_d) + self._check_array_from_pandas_roundtrip(datetime64_d) def test_all_nones(self): def _check_series(s): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 0308dd3f948..b0593fe885d 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -457,8 +457,26 @@ def test_column_of_arrays(tmpdir): @parquet def 
test_coerce_timestamps(tmpdir): + from collections import OrderedDict # ARROW-622 - df, schema = dataframe_with_arrays() + arrays = OrderedDict() + fields = [pa.field('datetime64', + pa.list_(pa.timestamp('ms')))] + arrays['datetime64'] = [ + np.array(['2007-07-13T01:23:34.123456789', + None, + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ms]'), + None, + None, + np.array(['2007-07-13T02', + None, + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ms]'), + ] + + df = pd.DataFrame(arrays) + schema = pa.schema(fields) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df, schema=schema) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 0c96a553d85..a3ff6bc76f3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -74,11 +74,17 @@ cdef class DataType: def __repr__(self): return '{0.__class__.__name__}({0})'.format(self) - def __richcmp__(DataType self, DataType other, int op): + def __richcmp__(DataType self, object other, int op): + cdef DataType other_type + if not isinstance(other, DataType): + other_type = type_for_alias(other) + else: + other_type = other + if op == cp.Py_EQ: - return self.type.Equals(deref(other.type)) + return self.type.Equals(deref(other_type.type)) elif op == cp.Py_NE: - return not self.type.Equals(deref(other.type)) + return not self.type.Equals(deref(other_type.type)) else: raise TypeError('Invalid comparison') From d3bbb3c3cbb547abb2baf15317a951c96050fd40 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 28 Sep 2017 18:04:55 -0500 Subject: [PATCH 6/7] Handle type aliases in cast, too Change-Id: Ic2999fba7575bc80ce71121f050ca528636a106d --- python/pyarrow/array.pxi | 19 +++++++++++++++---- python/pyarrow/tests/test_array.py | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ba300565629..ab19287b7f9 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -77,6 +77,15 @@ cdef _ndarray_to_array(object values, object mask, DataType type, return pyarrow_wrap_array(chunked_out.get().chunk(0)) +cdef DataType _ensure_type(object type): + if type is None: + return None + elif not isinstance(type, DataType): + return type_for_alias(type) + else: + return type + + def array(object obj, type=None, mask=None, MemoryPool memory_pool=None, size=None, from_pandas=False): @@ -138,8 +147,7 @@ def array(object obj, type=None, mask=None, array : pyarrow.Array or pyarrow.ChunkedArray (if object data overflowed binary storage) """ - if type is not None and not isinstance(type, DataType): - type = type_for_alias(type) + type = _ensure_type(type) cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) if _is_array_like(obj): @@ -205,7 +213,7 @@ cdef class Array: with nogil: check_status(DebugPrint(deref(self.ap), 0)) - def cast(self, DataType target_type, safe=True): + def cast(self, object target_type, safe=True): """ Cast array values to another data type @@ -223,12 +231,15 @@ cdef class Array: cdef: CCastOptions options shared_ptr[CArray] result + DataType type + + type = _ensure_type(target_type) if not safe: options.allow_int_overflow = 1 with nogil: - check_status(Cast(_context(), self.ap[0], target_type.sp_type, + check_status(Cast(_context(), self.ap[0], type.sp_type, options, &result)) return pyarrow_wrap_array(result) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 9b93ee034db..3bf392686f0 100644 --- 
a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -231,13 +231,13 @@ def _check_cast_case(case, safe=True): def test_cast_integers_safe(): safe_cases = [ - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='i4'), pa.int32()), - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()), - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()), - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='f8'), pa.float64()) ] @@ -245,10 +245,10 @@ def test_cast_integers_safe(): _check_cast_case(case) unsafe_cases = [ - (np.array([50000], dtype='i4'), pa.int32(), pa.int16()), - (np.array([70000], dtype='i4'), pa.int32(), pa.uint16()), - (np.array([-1], dtype='i4'), pa.int32(), pa.uint16()), - (np.array([50000], dtype='u2'), pa.uint16(), pa.int16()) + (np.array([50000], dtype='i4'), 'int32', 'int16'), + (np.array([70000], dtype='i4'), 'int32', 'uint16'), + (np.array([-1], dtype='i4'), 'int32', 'uint16'), + (np.array([50000], dtype='u2'), 'uint16', 'int16') ] for in_data, in_type, out_type in unsafe_cases: in_arr = pa.array(in_data, type=in_type) @@ -260,11 +260,11 @@ def test_cast_integers_safe(): def test_cast_integers_unsafe(): # We let NumPy do the unsafe casting unsafe_cases = [ - (np.array([50000], dtype='i4'), pa.int32(), + (np.array([50000], dtype='i4'), 'int32', np.array([50000], dtype='i2'), pa.int16()), - (np.array([70000], dtype='i4'), pa.int32(), + (np.array([70000], dtype='i4'), 'int32', np.array([70000], dtype='u2'), pa.uint16()), - (np.array([-1], dtype='i4'), pa.int32(), + (np.array([-1], dtype='i4'), 'int32', np.array([-1], dtype='u2'), pa.uint16()), (np.array([50000], dtype='u2'), pa.uint16(), np.array([50000], dtype='i2'), pa.int16()) From 1570e5256bf9c4c29b1762972ca435371e8f2bc6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 29 Sep 2017 20:34:44 -0500 Subject: [PATCH 7/7] Code review comments Change-Id: I371b9a8c30a7deaad41ccb729a26983de9a39ee6 --- cpp/src/arrow/python/builtin_convert.cc | 4 ++-- python/pyarrow/array.pxi | 6 ++++-- python/pyarrow/tests/test_schema.py | 9 +++++++++ python/pyarrow/types.pxi | 4 ++-- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index b17a12f5a86..f9d7361e004 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -525,7 +525,7 @@ class UInt64Converter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - PyDateTime_Date* pydate = reinterpret_cast(item.obj()); + auto pydate = reinterpret_cast(item.obj()); return typed_builder_->Append(PyDate_to_ms(pydate)); } }; @@ -534,7 +534,7 @@ class TimestampConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - PyDateTime_DateTime* pydatetime = reinterpret_cast(item.obj()); + auto pydatetime = reinterpret_cast(item.obj()); return typed_builder_->Append(PyDateTime_to_us(pydatetime)); } }; diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ab19287b7f9..f402defc9b0 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -98,7 +98,7 @@ def array(object obj, type=None, mask=None, If both type and size 
are specified may be a single use iterable. If not strongly-typed, Arrow type will be inferred for resulting array mask : array (boolean), optional - Indicate which values are null (True) or not null (False) + Indicate which values are null (True) or not null (False). type : pyarrow.DataType Explicit type to attempt to coerce to, otherwise will be inferred from the data @@ -114,7 +114,9 @@ def array(object obj, type=None, mask=None, exact size specifying it correctly will give you better performance). from_pandas : boolean, default False Use pandas's semantics for inferring nulls from values in ndarray-like - data + data. If passed, the mask takes precedence, but if a value is unmasked + (not null) yet still null according to pandas semantics, then it is + null Notes ----- diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index ece7b82a897..c77be98054c 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -69,6 +69,15 @@ def test_type_list(): assert str(l2) == 'list' +def test_type_comparisons(): + val = pa.int32() + assert val == pa.int32() + assert val == 'int32' + + with pytest.raises(TypeError): + val == 5 + + def test_type_for_alias(): cases = [ ('i1', pa.int8()), diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index a3ff6bc76f3..316e09a6efd 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -import re - # These are imprecise because the type (in pandas 0.x) depends on the presence # of nulls cdef dict _pandas_type_map = { @@ -77,6 +75,8 @@ cdef class DataType: def __richcmp__(DataType self, object other, int op): cdef DataType other_type if not isinstance(other, DataType): + if not isinstance(other, six.string_types): + raise TypeError(other) other_type = type_for_alias(other) else: other_type = other
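
For reviewers trying the branch: a minimal sketch of the null-handling semantics this series introduces, assuming a local build of these patches. The null_count values follow from the new tests (e.g. test_array_conversions_no_sentinel_values) and are illustrative rather than captured output.

>>> import numpy as np
>>> import pyarrow as pa
>>> values = np.array([1.0, np.nan, 3.0], dtype='float64')
>>> pa.array(values).null_count                    # default: NaN stays an ordinary value
0
>>> pa.array(values, from_pandas=True).null_count  # pandas sentinel becomes a null
1
>>> pa.array(values, mask=np.array([True, False, False])).null_count  # explicit mask
1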
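
Since Array.from_pandas is now a thin wrapper over pa.array with from_pandas=True, the two spellings below should produce equal arrays. A sketch under the same assumptions (pandas installed; Array.equals used purely for illustration):

>>> import pandas as pd
>>> s = pd.Series([1.0, None, 3.0])
>>> pa.Array.from_pandas(s).equals(pa.array(s, from_pandas=True))
True
>>> pa.array(s).null_count  # without the flag, the Series' NaN survives as a value
0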
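
Likewise for the alias handling in the last two patches: any string accepted by type_for_alias can stand in for a DataType in pa.array, Array.cast, and DataType comparisons, and comparing a DataType against anything that is neither a DataType nor a string raises TypeError. A sketch, same assumptions:

>>> pa.int32() == 'int32'
True
>>> arr = pa.array([0, 1, 2, 3], type='int8')   # string alias instead of pa.int8()
>>> arr.cast('int32').type == 'int32'
True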