From da96a382cb9b4bb68d87cfb3a9c3b0d888bbfe68 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Apr 2017 00:16:36 -0400 Subject: [PATCH 1/4] Add failing Parquet test case. Enable same-type-size cases in pandas_convert.cc Change-Id: I8b88d398e08c921b7990b96ffffbc40f58ac6938 --- cpp/src/arrow/python/pandas_convert.cc | 2 +- cpp/src/arrow/python/type_traits.h | 48 ++++++++++++++++++++++++++ python/pyarrow/tests/test_ipc.py | 3 +- python/pyarrow/tests/test_parquet.py | 34 ++++++++++++++++++ 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 636a3fd15c0..9f65af41bb2 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -444,7 +444,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* data) { // Handle LONGLONG->INT64 and other fun things int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num); - if (traits::npy_type != type_num_compat) { + if (numpy_type_size(traits::npy_type) != numpy_type_size(type_num_compat)) { return Status::NotImplemented("NumPy type casts not yet implemented"); } diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index 26b15bdc9f4..b6761ae0d26 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +// Internal header + #include "arrow/python/platform.h" #include @@ -24,6 +26,7 @@ #include "arrow/builder.h" #include "arrow/type.h" +#include "arrow/util/logging.h" namespace arrow { namespace py { @@ -224,5 +227,50 @@ struct arrow_traits { static constexpr bool supports_nulls = true; }; +static inline int numpy_type_size(int npy_type) { + switch (npy_type) { + case NPY_BOOL: + return 1; + case NPY_INT8: + return 1; + case NPY_INT16: + return 2; + case NPY_INT32: + return 4; + case NPY_INT64: + return 8; +#if (NPY_INT64 != NPY_LONGLONG) + case NPY_LONGLONG: + return 8; +#endif + case NPY_UINT8: + return 1; + case NPY_UINT16: + return 2; + case NPY_UINT32: + return 4; + case NPY_UINT64: + return 8; +#if (NPY_UINT64 != NPY_ULONGLONG) + case NPY_ULONGLONG: + return 8; +#endif + case NPY_FLOAT16: + return 2; + case NPY_FLOAT32: + return 4; + case NPY_FLOAT64: + return 8; + case NPY_DATETIME: + return 8; + case NPY_OBJECT: + return sizeof(void*); + default: + DCHECK(false) << "unhandled numpy type"; + break; + } + return -1; +} + } // namespace py } // namespace arrow diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 81213ede315..02040678958 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -158,7 +158,8 @@ def run(self): connection.close() def get_result(self): - return(self._schema, self._table if self._do_read_all else self._batches) + return(self._schema, self._table if self._do_read_all + else self._batches) def setUp(self): # NOTE: must start and stop server in test diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 268e87af7dd..5a6f9fcb4c2 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -348,6 +348,40 @@ def test_column_of_lists(tmpdir): tm.assert_frame_equal(df, df_read) +@parquet +def test_date_time_types(tmpdir): + buf = io.BytesIO() + + t1 = pa.date32() + data1 = np.array([17259, 17260, 17261], dtype='int32') + a1 = pa.Array.from_pandas(data1, type=t1) + + t2 = pa.date64() + data2 = data1.astype('int64') * 86400000 + a2 = pa.Array.from_pandas(data2, type=t2) + + t3 = pa.timestamp('us') + start = pd.Timestamp('2000-01-01').value / 1000 + data3 = np.array([start, start + 1, start + 2], dtype='int64') + a3 = pa.Array.from_pandas(data3, type=t3) + + t4 = pa.time32('s') + data4 = np.arange(3, dtype='i4') + a4 = pa.Array.from_pandas(data4, type=t4) + + t5 = pa.time64('us') + a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5) + + table = pa.Table.from_arrays([a1, a2, a3, a4, a5], + ['date32', 'date64', 'timestamp[us]', + 'time32[s]', 'time32[us]']) + pq.write_table(table, buf, version="2.0") + buf.seek(0) + + result = pq.read_table(buf) + assert result.equals(table) + + @parquet def test_multithreaded_read(): df = alltypes_sample(size=10000) From fad3934ae774326cfcf78d7a370d82372cca32bb Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Apr 2017 08:29:58 -0400 Subject: [PATCH 2/4] Update test case Change-Id: Ie32591cd8f4fd36f5572d535926fd0ab48b14271 --- python/pyarrow/tests/test_parquet.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 5a6f9fcb4c2..a285b27b1b1 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -374,12 +374,18 @@ def test_date_time_types(tmpdir): table = pa.Table.from_arrays([a1, a2, a3, a4, a5], ['date32', 'date64', 'timestamp[us]', - 'time32[s]', 'time32[us]']) + 'time32[s]', 'time64[us]']) + + # date64 as date32 + expected = pa.Table.from_arrays([a1, a1, a3, a4, a5], + ['date32', 'date64', 'timestamp[us]', + 'time32[s]', 'time64[us]']) + pq.write_table(table, buf, version="2.0") buf.seek(0) result = pq.read_table(buf) - assert result.equals(table) + assert result.equals(expected) @parquet From 475fa3f04df7ebe5ca7a13135e57960040fa14b8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Apr 2017 19:42:16 -0400 Subject: [PATCH 3/4] Fix test case Change-Id: I2c5c3ecacc50b6faa63e5241119cf0b6914ae864 --- cpp/src/arrow/util/stl.h | 2 +- python/pyarrow/tests/test_parquet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/stl.h b/cpp/src/arrow/util/stl.h index bfce111ff8a..d58689b7488 100644 --- a/cpp/src/arrow/util/stl.h +++ b/cpp/src/arrow/util/stl.h @@ -20,7 +20,7 @@ #include -#include +#include "arrow/util/logging.h" namespace arrow { diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a285b27b1b1..9b6bd356f88 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -365,7 +365,7 @@ def test_date_time_types(tmpdir): data3 = np.array([start, start + 1, start + 2], dtype='int64') a3 = pa.Array.from_pandas(data3, type=t3) - t4 = pa.time32('s') + t4 = pa.time32('ms') data4 = np.arange(3, dtype='i4') a4 = pa.Array.from_pandas(data4, type=t4) From db169409aebb964e329f48b1c02df3e5905554ea Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Apr 2017 20:23:35 -0400 Subject: [PATCH 4/4] Add tests for auto-casted types, and unsupported nanosecond time Change-Id: Ib95a2c36482e8f95f2ca21c48c2f66c35e87d837 --- python/pyarrow/tests/test_parquet.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 9b6bd356f88..8c446af03fc 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -372,14 +372,21 @@ def test_date_time_types(tmpdir): t5 = pa.time64('us') a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5) - table = pa.Table.from_arrays([a1, a2, a3, a4, a5], + t6 = pa.time32('s') + a6 = pa.Array.from_pandas(data4, type=t6) + + ex_t6 = pa.time32('ms') + ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6) + + table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6], ['date32', 'date64', 'timestamp[us]', - 'time32[s]', 'time64[us]']) + 'time32[s]', 'time64[us]', 'time32[s]']) # date64 as date32 - expected = pa.Table.from_arrays([a1, a1, a3, a4, a5], + # time32[s] to time32[ms] + expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6], ['date32', 'date64', 'timestamp[us]', - 'time32[s]', 'time64[us]']) + 'time32[s]', 'time64[us]', 'time32[s]']) pq.write_table(table, buf, version="2.0") buf.seek(0) @@ -387,6 +394,19 @@ def test_date_time_types(tmpdir): result = pq.read_table(buf) assert result.equals(expected) + # Unsupported stuff + def _assert_unsupported(array): + table = pa.Table.from_arrays([array], ['unsupported']) + buf = io.BytesIO() + + with pytest.raises(NotImplementedError): + pq.write_table(table, buf, version="2.0") + + t7 = pa.time64('ns') + a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7) + + _assert_unsupported(a7) + @parquet def test_multithreaded_read():