From d4a83912cd215857f90a29c3a16dbe6e437ddd93 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 12 May 2017 17:26:08 -0400 Subject: [PATCH 1/5] Reorganize Cython code in the style of lxml so make declaring a public C API easier Change-Id: I3879a8f0546c88959f4468f69f41044455c7978a --- python/CMakeLists.txt | 6 +- python/pyarrow/__init__.pxd | 33 +++++ python/pyarrow/__init__.py | 108 ++++++++-------- python/pyarrow/_error.pxd | 20 --- python/pyarrow/_io.pxd | 50 -------- python/pyarrow/_memory.pxd | 30 ----- python/pyarrow/_parquet.pyx | 1 - python/pyarrow/_table.pxd | 62 --------- python/pyarrow/{_array.pyx => array.pxi} | 140 +++++---------------- python/pyarrow/{_error.pyx => error.pxi} | 0 python/pyarrow/filesystem.py | 14 +-- python/pyarrow/includes/libarrow.pxd | 55 +++++++- python/pyarrow/includes/pyarrow.pxd | 75 ----------- python/pyarrow/{_io.pyx => io.pxi} | 34 ++--- python/pyarrow/ipc.py | 10 +- python/pyarrow/{_array.pxd => lib.pxd} | 92 +++++++++++++- python/pyarrow/lib.pyx | 59 +++++++++ python/pyarrow/{_memory.pyx => memory.pxi} | 11 +- python/pyarrow/public-api.pxi | 97 ++++++++++++++ python/pyarrow/{_table.pyx => table.pxi} | 96 ++++---------- python/setup.py | 8 +- 21 files changed, 468 insertions(+), 533 deletions(-) create mode 100644 python/pyarrow/__init__.pxd delete mode 100644 python/pyarrow/_error.pxd delete mode 100644 python/pyarrow/_io.pxd delete mode 100644 python/pyarrow/_memory.pxd delete mode 100644 python/pyarrow/_table.pxd rename python/pyarrow/{_array.pyx => array.pxi} (91%) rename python/pyarrow/{_error.pyx => error.pxi} (100%) delete mode 100644 python/pyarrow/includes/pyarrow.pxd rename python/pyarrow/{_io.pyx => io.pxi} (97%) rename python/pyarrow/{_array.pxd => lib.pxd} (68%) create mode 100644 python/pyarrow/lib.pyx rename python/pyarrow/{_memory.pyx => memory.pxi} (82%) create mode 100644 python/pyarrow/public-api.pxi rename python/pyarrow/{_table.pyx => table.pxi} (90%) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index d8287108d4c..44b5aaf7063 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -265,12 +265,8 @@ if (UNIX) endif() set(CYTHON_EXTENSIONS - _array _config - _error - _io - _memory - _table + lib ) set(LINK_LIBS diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd new file mode 100644 index 00000000000..be2603ac9a5 --- /dev/null +++ b/python/pyarrow/__init__.pxd @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from libcpp.memory cimport shared_ptr +from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, + CRecordBatch, CSchema, + CTable, CTensor) + +cdef extern from "pyarrow.lib_api.h": + cdef int import_pyarrow__lib() except -1 + + cdef object wrap_data_type(const shared_ptr[CDataType]& type) + cdef object wrap_field(const shared_ptr[CField]& field) + cdef object wrap_schema(const shared_ptr[CSchema]& schema) + cdef object wrap_array(const shared_ptr[CArray]& sp_array) + cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor) + cdef object wrap_column(const shared_ptr[CColumn]& ccolumn) + cdef object wrap_table(const shared_ptr[CTable]& ctable) + cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4d8da9f5a10..7017103f93a 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -28,50 +28,50 @@ import pyarrow._config from pyarrow._config import cpu_count, set_cpu_count -from pyarrow._array import (null, bool_, - int8, int16, int32, int64, - uint8, uint16, uint32, uint64, - time32, time64, timestamp, date32, date64, - float16, float32, float64, - binary, string, decimal, - list_, struct, dictionary, field, - DataType, - DecimalType, - DictionaryType, - FixedSizeBinaryType, - TimestampType, - Time32Type, - Time64Type, - Field, - Schema, - schema, - Array, Tensor, - array, - from_numpy_dtype, - NullArray, - NumericArray, IntegerArray, FloatingPointArray, - BooleanArray, - Int8Array, UInt8Array, - Int16Array, UInt16Array, - Int32Array, UInt32Array, - Int64Array, UInt64Array, - ListArray, - BinaryArray, StringArray, - FixedSizeBinaryArray, - DictionaryArray, - Date32Array, Date64Array, - TimestampArray, Time32Array, Time64Array, - DecimalArray, - ArrayValue, Scalar, NA, NAType, - BooleanValue, - Int8Value, Int16Value, Int32Value, Int64Value, - UInt8Value, UInt16Value, UInt32Value, UInt64Value, - FloatValue, DoubleValue, ListValue, - BinaryValue, StringValue, FixedSizeBinaryValue, - DecimalValue, - Date32Value, Date64Value, TimestampValue) +from pyarrow.lib import (null, bool_, + int8, int16, int32, int64, + uint8, uint16, uint32, uint64, + time32, time64, timestamp, date32, date64, + float16, float32, float64, + binary, string, decimal, + list_, struct, dictionary, field, + DataType, + DecimalType, + DictionaryType, + FixedSizeBinaryType, + TimestampType, + Time32Type, + Time64Type, + Field, + Schema, + schema, + Array, Tensor, + array, + from_numpy_dtype, + NullArray, + NumericArray, IntegerArray, FloatingPointArray, + BooleanArray, + Int8Array, UInt8Array, + Int16Array, UInt16Array, + Int32Array, UInt32Array, + Int64Array, UInt64Array, + ListArray, + BinaryArray, StringArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, Date64Array, + TimestampArray, Time32Array, Time64Array, + DecimalArray, + ArrayValue, Scalar, NA, NAType, + BooleanValue, + Int8Value, Int16Value, Int32Value, Int64Value, + UInt8Value, UInt16Value, UInt32Value, UInt64Value, + FloatValue, DoubleValue, ListValue, + BinaryValue, StringValue, FixedSizeBinaryValue, + DecimalValue, + Date32Value, Date64Value, TimestampValue) -from pyarrow._io import (HdfsFile, NativeFile, PythonFile, +from pyarrow.lib import (HdfsFile, NativeFile, PythonFile, Buffer, BufferReader, InMemoryOutputStream, OSFile, MemoryMappedFile, memory_map, frombuffer, read_tensor, write_tensor, @@ -79,17 +79,17 @@ get_record_batch_size, get_tensor_size, have_libhdfs, have_libhdfs3) -from pyarrow._memory import (MemoryPool, total_allocated_bytes, - set_memory_pool, default_memory_pool) -from pyarrow._table import (ChunkedArray, Column, RecordBatch, Table, - concat_tables) -from pyarrow._error import (ArrowException, - ArrowKeyError, - ArrowInvalid, - ArrowIOError, - ArrowMemoryError, - ArrowNotImplementedError, - ArrowTypeError) +from pyarrow.lib import (MemoryPool, total_allocated_bytes, + set_memory_pool, default_memory_pool) +from pyarrow.lib import (ChunkedArray, Column, RecordBatch, Table, + concat_tables) +from pyarrow.lib import (ArrowException, + ArrowKeyError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError) def jemalloc_memory_pool(): diff --git a/python/pyarrow/_error.pxd b/python/pyarrow/_error.pxd deleted file mode 100644 index 4fb46c25faf..00000000000 --- a/python/pyarrow/_error.pxd +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow.includes.libarrow cimport CStatus - -cdef int check_status(const CStatus& status) nogil except -1 diff --git a/python/pyarrow/_io.pxd b/python/pyarrow/_io.pxd deleted file mode 100644 index 0c37a09add5..00000000000 --- a/python/pyarrow/_io.pxd +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# distutils: language = c++ - -from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport * - - -cdef class Buffer: - cdef: - shared_ptr[CBuffer] buffer - Py_ssize_t shape[1] - Py_ssize_t strides[1] - - cdef init(self, const shared_ptr[CBuffer]& buffer) - - -cdef class NativeFile: - cdef: - shared_ptr[RandomAccessFile] rd_file - shared_ptr[OutputStream] wr_file - bint is_readable - bint is_writeable - bint is_open - bint own_file - - # By implementing these "virtual" functions (all functions in Cython - # extension classes are technically virtual in the C++ sense) we can expose - # the arrow::io abstract file interfaces to other components throughout the - # suite of Arrow C++ libraries - cdef read_handle(self, shared_ptr[RandomAccessFile]* file) - cdef write_handle(self, shared_ptr[OutputStream]* file) - -cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader) -cdef get_writer(object source, shared_ptr[OutputStream]* writer) diff --git a/python/pyarrow/_memory.pxd b/python/pyarrow/_memory.pxd deleted file mode 100644 index bb1af85c8ea..00000000000 --- a/python/pyarrow/_memory.pxd +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow.includes.libarrow cimport CMemoryPool, CLoggingMemoryPool - - -cdef class MemoryPool: - cdef: - CMemoryPool* pool - - cdef init(self, CMemoryPool* pool) - -cdef class LoggingMemoryPool(MemoryPool): - pass - -cdef CMemoryPool* maybe_unbox_memory_pool(MemoryPool memory_pool) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index c06eab26302..b0a029e6960 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -22,7 +22,6 @@ from cython.operator cimport dereference as deref from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * -cimport pyarrow.includes.pyarrow as pyarrow from pyarrow._array cimport Array, Schema, box_schema from pyarrow._error cimport check_status from pyarrow._memory cimport MemoryPool, maybe_unbox_memory_pool diff --git a/python/pyarrow/_table.pxd b/python/pyarrow/_table.pxd deleted file mode 100644 index e61e90d7462..00000000000 --- a/python/pyarrow/_table.pxd +++ /dev/null @@ -1,62 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pyarrow.includes.common cimport shared_ptr -from pyarrow.includes.libarrow cimport (CChunkedArray, CColumn, CTable, - CRecordBatch) -from pyarrow._array cimport Schema - - -cdef class ChunkedArray: - cdef: - shared_ptr[CChunkedArray] sp_chunked_array - CChunkedArray* chunked_array - - cdef init(self, const shared_ptr[CChunkedArray]& chunked_array) - cdef _check_nullptr(self) - - -cdef class Column: - cdef: - shared_ptr[CColumn] sp_column - CColumn* column - - cdef init(self, const shared_ptr[CColumn]& column) - cdef _check_nullptr(self) - - -cdef class Table: - cdef: - shared_ptr[CTable] sp_table - CTable* table - - cdef init(self, const shared_ptr[CTable]& table) - cdef _check_nullptr(self) - - -cdef class RecordBatch: - cdef: - shared_ptr[CRecordBatch] sp_batch - CRecordBatch* batch - Schema _schema - - cdef init(self, const shared_ptr[CRecordBatch]& table) - cdef _check_nullptr(self) - -cdef object box_column(const shared_ptr[CColumn]& ccolumn) -cdef api object table_from_ctable(const shared_ptr[CTable]& ctable) -cdef api object batch_from_cbatch(const shared_ptr[CRecordBatch]& cbatch) diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/array.pxi similarity index 91% rename from python/pyarrow/_array.pyx rename to python/pyarrow/array.pxi index f01cff6cc99..a115cfdb36c 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/array.pxi @@ -15,31 +15,7 @@ # specific language governing permissions and limitations # under the License. -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True - -from cython.operator cimport dereference as deref from pyarrow.includes.libarrow cimport * -from pyarrow.includes.common cimport PyObject_to_object -cimport pyarrow.includes.pyarrow as pyarrow -from pyarrow._error cimport check_status -from pyarrow._memory cimport MemoryPool, maybe_unbox_memory_pool -cimport cpython as cp - - -import datetime -import decimal as _pydecimal -import numpy as np -import six -import pyarrow._config -from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical - - -cdef _pandas(): - import pandas as pd - return pd - # These are imprecise because the type (in pandas 0.x) depends on the presence # of nulls @@ -186,7 +162,7 @@ cdef class Field: cdef init(self, const shared_ptr[CField]& field): self.sp_field = field self.field = field.get() - self.type = box_data_type(field.get().type()) + self.type = wrap_data_type(field.get().type()) def equals(self, Field other): """ @@ -244,7 +220,7 @@ cdef class Field: with nogil: check_status(self.field.AddMetadata(c_meta, &new_field)) - return box_field(new_field) + return wrap_field(new_field) def remove_metadata(self): """ @@ -257,7 +233,7 @@ cdef class Field: cdef shared_ptr[CField] new_field with nogil: new_field = self.field.RemoveMetadata() - return box_field(new_field) + return wrap_field(new_field) cdef class Schema: @@ -274,7 +250,7 @@ cdef class Schema: cdef Field result = Field() result.init(self.schema.field(i)) - result.type = box_data_type(result.field.type()) + result.type = wrap_data_type(result.field.type()) return result @@ -322,7 +298,7 @@ cdef class Schema: ------- field: pyarrow.Field """ - return box_field(self.schema.GetFieldByName(tobytes(name))) + return wrap_field(self.schema.GetFieldByName(tobytes(name))) def add_metadata(self, dict metadata): """ @@ -344,7 +320,7 @@ cdef class Schema: with nogil: check_status(self.schema.AddMetadata(c_meta, &new_schema)) - return box_schema(new_schema) + return wrap_schema(new_schema) def remove_metadata(self): """ @@ -357,7 +333,7 @@ cdef class Schema: cdef shared_ptr[CSchema] new_schema with nogil: new_schema = self.schema.RemoveMetadata() - return box_schema(new_schema) + return wrap_schema(new_schema) def __str__(self): return frombytes(self.schema.ToString()) @@ -383,7 +359,7 @@ cdef DataType primitive_type(Type type): return _type_cache[type] cdef DataType out = DataType() - out.init(pyarrow.GetPrimitiveType(type)) + out.init(GetPrimitiveType(type)) _type_cache[type] = out return out @@ -604,7 +580,7 @@ def float64(): cpdef DataType decimal(int precision, int scale=0): cdef shared_ptr[CDataType] decimal_type decimal_type.reset(new CDecimalType(precision, scale)) - return box_data_type(decimal_type) + return wrap_data_type(decimal_type) def string(): @@ -629,7 +605,7 @@ def binary(int length=-1): cdef shared_ptr[CDataType] fixed_size_binary_type fixed_size_binary_type.reset(new CFixedSizeBinaryType(length)) - return box_data_type(fixed_size_binary_type) + return wrap_data_type(fixed_size_binary_type) def list_(DataType value_type): @@ -695,49 +671,15 @@ def schema(fields): return result -cdef DataType box_data_type(const shared_ptr[CDataType]& type): - cdef: - DataType out - - if type.get() == NULL: - return None - - if type.get().id() == _Type_DICTIONARY: - out = DictionaryType() - elif type.get().id() == _Type_TIMESTAMP: - out = TimestampType() - elif type.get().id() == _Type_FIXED_SIZE_BINARY: - out = FixedSizeBinaryType() - elif type.get().id() == _Type_DECIMAL: - out = DecimalType() - else: - out = DataType() - - out.init(type) - return out - -cdef Field box_field(const shared_ptr[CField]& field): - if field.get() == NULL: - return None - cdef Field out = Field() - out.init(field) - return out - -cdef Schema box_schema(const shared_ptr[CSchema]& type): - cdef Schema out = Schema() - out.init_schema(type) - return out - - def from_numpy_dtype(object dtype): """ Convert NumPy dtype to pyarrow.DataType """ cdef shared_ptr[CDataType] c_type with nogil: - check_status(pyarrow.NumPyDtypeToArrow(dtype, &c_type)) + check_status(NumPyDtypeToArrow(dtype, &c_type)) - return box_data_type(c_type) + return wrap_data_type(c_type) NA = None @@ -960,7 +902,7 @@ cdef class ListValue(ArrayValue): cdef void _set_array(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() - self.value_type = box_data_type(self.ap.value_type()) + self.value_type = wrap_data_type(self.ap.value_type()) cdef getitem(self, int64_t i): cdef int64_t j = self.ap.value_offset(self.index) + i @@ -1076,15 +1018,15 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None): pool = maybe_unbox_memory_pool(memory_pool) if type is None: - check_status(pyarrow.ConvertPySequence(sequence, pool, &sp_array)) + check_status(ConvertPySequence(sequence, pool, &sp_array)) else: check_status( - pyarrow.ConvertPySequence( + ConvertPySequence( sequence, pool, &sp_array, type.sp_type ) ) - return box_array(sp_array) + return wrap_array(sp_array) @@ -1093,7 +1035,7 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() - self.type = box_data_type(self.sp_array.get().type()) + self.type = wrap_data_type(self.sp_array.get().type()) @staticmethod def from_pandas(obj, mask=None, DataType type=None, @@ -1172,22 +1114,22 @@ cdef class Array: if type is not None: c_type = type.sp_type with nogil: - check_status(pyarrow.PandasObjectsToArrow( + check_status(PandasObjectsToArrow( pool, values, mask, c_type, &out)) else: values, type = maybe_coerce_datetime64( values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) if type is None: - check_status(pyarrow.NumPyDtypeToArrow(values.dtype, &c_type)) + check_status(NumPyDtypeToArrow(values.dtype, &c_type)) else: c_type = type.sp_type with nogil: - check_status(pyarrow.PandasToArrow( + check_status(PandasToArrow( pool, values, mask, c_type, &out)) - return box_array(out) + return wrap_array(out) property null_count: @@ -1271,7 +1213,7 @@ cdef class Array: else: result = self.ap.Slice(offset, length) - return box_array(result) + return wrap_array(result) def to_pandas(self): """ @@ -1287,8 +1229,7 @@ cdef class Array: PyObject* out with nogil: - check_status( - pyarrow.ConvertArrayToPandas(self.sp_array, self, &out)) + check_status(ConvertArrayToPandas(self.sp_array, self, &out)) return wrap_array_output(out) def to_pylist(self): @@ -1303,7 +1244,7 @@ cdef class Tensor: cdef init(self, const shared_ptr[CTensor]& sp_tensor): self.sp_tensor = sp_tensor self.tp = sp_tensor.get() - self.type = box_data_type(self.tp.type()) + self.type = wrap_data_type(self.tp.type()) def __repr__(self): return """ @@ -1314,9 +1255,8 @@ strides: {2}""".format(self.type, self.shape, self.strides) @staticmethod def from_numpy(obj): cdef shared_ptr[CTensor] ctensor - check_status(pyarrow.NdarrayToTensor(default_memory_pool(), - obj, &ctensor)) - return box_tensor(ctensor) + check_status(NdarrayToTensor(c_default_memory_pool(), obj, &ctensor)) + return wrap_tensor(ctensor) def to_numpy(self): """ @@ -1325,7 +1265,7 @@ strides: {2}""".format(self.type, self.shape, self.strides) cdef: PyObject* out - check_status(pyarrow.TensorToNdarray(deref(self.tp), self, &out)) + check_status(TensorToNdarray(deref(self.tp), self, &out)) return PyObject_to_object(out) def equals(self, Tensor other): @@ -1502,7 +1442,7 @@ cdef class DictionaryArray(Array): cdef CDictionaryArray* darr = (self.ap) if self._dictionary is None: - self._dictionary = box_array(darr.dictionary()) + self._dictionary = wrap_array(darr.dictionary()) return self._dictionary @@ -1512,7 +1452,7 @@ cdef class DictionaryArray(Array): cdef CDictionaryArray* darr = (self.ap) if self._indices is None: - self._indices = box_array(darr.indices()) + self._indices = wrap_array(darr.indices()) return self._indices @@ -1597,28 +1537,6 @@ cdef dict _array_classes = { _Type_DECIMAL: DecimalArray, } -cdef object box_array(const shared_ptr[CArray]& sp_array): - if sp_array.get() == NULL: - raise ValueError('Array was NULL') - - cdef CDataType* data_type = sp_array.get().type().get() - - if data_type == NULL: - raise ValueError('Array data type was NULL') - - cdef Array arr = _array_classes[data_type.id()]() - arr.init(sp_array) - return arr - - -cdef object box_tensor(const shared_ptr[CTensor]& sp_tensor): - if sp_tensor.get() == NULL: - raise ValueError('Tensor was NULL') - - cdef Tensor tensor = Tensor() - tensor.init(sp_tensor) - return tensor - cdef object get_series_values(object obj): if isinstance(obj, PandasSeries): diff --git a/python/pyarrow/_error.pyx b/python/pyarrow/error.pxi similarity index 100% rename from python/pyarrow/_error.pyx rename to python/pyarrow/error.pxi diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 92dd91ce9de..ac37fd87294 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -19,7 +19,7 @@ import os from pyarrow.util import implements -import pyarrow._io as io +import pyarrow.lib as lib class Filesystem(object): @@ -133,7 +133,7 @@ def open(self, path, mode='rb'): return open(path, mode=mode) -class HdfsClient(io._HdfsClient, Filesystem): +class HdfsClient(lib._HdfsClient, Filesystem): """ Connect to an HDFS cluster. All parameters are optional and should only be set if the defaults need to be overridden. @@ -168,19 +168,19 @@ def __init__(self, host="default", port=0, user=None, kerb_ticket=None, @implements(Filesystem.isdir) def isdir(self, path): - return io._HdfsClient.isdir(self, path) + return lib._HdfsClient.isdir(self, path) @implements(Filesystem.isfile) def isfile(self, path): - return io._HdfsClient.isfile(self, path) + return lib._HdfsClient.isfile(self, path) @implements(Filesystem.delete) def delete(self, path, recursive=False): - return io._HdfsClient.delete(self, path, recursive) + return lib._HdfsClient.delete(self, path, recursive) @implements(Filesystem.mkdir) def mkdir(self, path, create_parents=True): - return io._HdfsClient.mkdir(self, path) + return lib._HdfsClient.mkdir(self, path) def ls(self, path, full_info=False): """ @@ -196,4 +196,4 @@ def ls(self, path, full_info=False): ------- result : list of dicts (full_info=True) or strings (full_info=False) """ - return io._HdfsClient.ls(self, path, full_info) + return lib._HdfsClient.ls(self, path, full_info) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8a730b39884..9ed13b3e898 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -149,7 +149,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: PoolBuffer() PoolBuffer(CMemoryPool*) - cdef CMemoryPool* default_memory_pool() + cdef CMemoryPool* c_default_memory_pool" arrow::default_memory_pool"() cdef cppclass CListType" arrow::ListType"(CDataType): CListType(const shared_ptr[CDataType]& value_type) @@ -625,3 +625,56 @@ cdef extern from "arrow/ipc/feather.h" namespace "arrow::ipc::feather" nogil: CStatus GetColumn(int i, shared_ptr[CColumn]* out) c_string GetColumnName(int i) + + +cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: + shared_ptr[CDataType] GetPrimitiveType(Type type) + shared_ptr[CDataType] GetTimestampType(TimeUnit unit) + CStatus ConvertPySequence(object obj, CMemoryPool* pool, + shared_ptr[CArray]* out) + CStatus ConvertPySequence(object obj, CMemoryPool* pool, + shared_ptr[CArray]* out, + const shared_ptr[CDataType]& type) + + CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) + + CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo, + const shared_ptr[CDataType]& type, + shared_ptr[CArray]* out) + + CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo, + const shared_ptr[CDataType]& type, + shared_ptr[CArray]* out) + + CStatus NdarrayToTensor(CMemoryPool* pool, object ao, + shared_ptr[CTensor]* out); + + CStatus TensorToNdarray(const CTensor& tensor, object base, + PyObject** out) + + CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, + object py_ref, PyObject** out) + + CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, + object py_ref, PyObject** out) + + CStatus ConvertTableToPandas(const shared_ptr[CTable]& table, + int nthreads, PyObject** out) + + void c_set_default_memory_pool \ + " arrow::py::set_default_memory_pool"(CMemoryPool* pool)\ + + CMemoryPool* c_get_memory_pool \ + " arrow::py::get_memory_pool"() + + cdef cppclass PyBuffer(CBuffer): + PyBuffer(object o) + + cdef cppclass PyReadableFile(RandomAccessFile): + PyReadableFile(object fo) + + cdef cppclass PyOutputStream(OutputStream): + PyOutputStream(object fo) + + cdef cppclass PyBytesReader(CBufferReader): + PyBytesReader(object fo) diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd deleted file mode 100644 index 35c71107f8d..00000000000 --- a/python/pyarrow/includes/pyarrow.pxd +++ /dev/null @@ -1,75 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# distutils: language = c++ - -from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType, - CTable, CTensor, CStatus, Type, - CMemoryPool, TimeUnit, - RandomAccessFile, OutputStream, - CBufferReader) - - -cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: - shared_ptr[CDataType] GetPrimitiveType(Type type) - shared_ptr[CDataType] GetTimestampType(TimeUnit unit) - CStatus ConvertPySequence(object obj, CMemoryPool* pool, - shared_ptr[CArray]* out) - CStatus ConvertPySequence(object obj, CMemoryPool* pool, - shared_ptr[CArray]* out, - const shared_ptr[CDataType]& type) - - CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) - - CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CArray]* out) - - CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CArray]* out) - - CStatus NdarrayToTensor(CMemoryPool* pool, object ao, - shared_ptr[CTensor]* out); - - CStatus TensorToNdarray(const CTensor& tensor, object base, - PyObject** out) - - CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, - object py_ref, PyObject** out) - - CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, - object py_ref, PyObject** out) - - CStatus ConvertTableToPandas(const shared_ptr[CTable]& table, - int nthreads, PyObject** out) - - void set_default_memory_pool(CMemoryPool* pool) - CMemoryPool* get_memory_pool() - - cdef cppclass PyBuffer(CBuffer): - PyBuffer(object o) - - cdef cppclass PyReadableFile(RandomAccessFile): - PyReadableFile(object fo) - - cdef cppclass PyOutputStream(OutputStream): - PyOutputStream(object fo) - - cdef cppclass PyBytesReader(CBufferReader): - PyBytesReader(object fo) diff --git a/python/pyarrow/_io.pyx b/python/pyarrow/io.pxi similarity index 97% rename from python/pyarrow/_io.pyx rename to python/pyarrow/io.pxi index e9e2ba01c06..13e8b638723 100644 --- a/python/pyarrow/_io.pyx +++ b/python/pyarrow/io.pxi @@ -18,22 +18,7 @@ # Cython wrappers for IO interfaces defined in arrow::io and messaging in # arrow::ipc -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True - -from cython.operator cimport dereference as deref from libc.stdlib cimport malloc, free -from pyarrow.includes.libarrow cimport * -cimport pyarrow.includes.pyarrow as pyarrow -from pyarrow._array cimport Array, Tensor, box_tensor, Schema -from pyarrow._error cimport check_status -from pyarrow._memory cimport MemoryPool, maybe_unbox_memory_pool -from pyarrow._table cimport (Column, RecordBatch, batch_from_cbatch, - table_from_ctable) -cimport cpython as cp - -import pyarrow._config from pyarrow.compat import frombytes, tobytes, encode_file_path import re @@ -52,6 +37,7 @@ cdef extern from "Python.h": PyObject* PyBytes_FromStringAndSizeNative" PyBytes_FromStringAndSize"( char *v, Py_ssize_t len) except NULL + cdef class NativeFile: def __cinit__(self): @@ -315,11 +301,11 @@ cdef class PythonFile(NativeFile): self.handle = handle if mode.startswith('w'): - self.wr_file.reset(new pyarrow.PyOutputStream(handle)) + self.wr_file.reset(new PyOutputStream(handle)) self.is_readable = 0 self.is_writeable = 1 elif mode.startswith('r'): - self.rd_file.reset(new pyarrow.PyReadableFile(handle)) + self.rd_file.reset(new PyReadableFile(handle)) self.is_readable = 1 self.is_writeable = 0 else: @@ -554,7 +540,7 @@ cdef class BufferReader(NativeFile): Parameters ---------- - obj : Python bytes or pyarrow.io.Buffer + obj : Python bytes or pyarrow.Buffer """ cdef: Buffer buffer @@ -579,7 +565,7 @@ def frombuffer(object obj): cdef shared_ptr[CBuffer] buf try: memoryview(obj) - buf.reset(new pyarrow.PyBuffer(obj)) + buf.reset(new PyBuffer(obj)) return wrap_buffer(buf) except TypeError: raise ValueError('Must pass object that implements buffer protocol') @@ -1007,7 +993,7 @@ cdef class _StreamReader: if batch.get() == NULL: raise StopIteration - return batch_from_cbatch(batch) + return wrap_batch(batch) def read_all(self): """ @@ -1027,7 +1013,7 @@ cdef class _StreamReader: check_status(CTable.FromRecordBatches(batches, &table)) - return table_from_ctable(table) + return wrap_table(table) cdef class _FileWriter(_StreamWriter): @@ -1080,7 +1066,7 @@ cdef class _FileReader: with nogil: check_status(self.reader.get().GetRecordBatch(i, &batch)) - return batch_from_cbatch(batch) + return wrap_batch(batch) # TODO(wesm): ARROW-503: Function was renamed. Remove after a period of # time has passed @@ -1103,7 +1089,7 @@ cdef class _FileReader: check_status(self.reader.get().GetRecordBatch(i, &batches[i])) check_status(CTable.FromRecordBatches(batches, &table)) - return table_from_ctable(table) + return wrap_table(table) #---------------------------------------------------------------------- @@ -1271,4 +1257,4 @@ def read_tensor(NativeFile source): with nogil: check_status(ReadTensor(offset, source.rd_file.get(), &sp_tensor)) - return box_tensor(sp_tensor) + return wrap_tensor(sp_tensor) diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index f96ead3b923..c37a1ce7df1 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -17,10 +17,10 @@ # Arrow file and stream reader/writer classes, and other messaging tools -import pyarrow._io as _io +import pyarrow.lib as lib -class StreamReader(_io._StreamReader): +class StreamReader(lib._StreamReader): """ Reader for the Arrow streaming binary format @@ -37,7 +37,7 @@ def __iter__(self): yield self.get_next_batch() -class StreamWriter(_io._StreamWriter): +class StreamWriter(lib._StreamWriter): """ Writer for the Arrow streaming binary format @@ -52,7 +52,7 @@ def __init__(self, sink, schema): self._open(sink, schema) -class FileReader(_io._FileReader): +class FileReader(lib._FileReader): """ Class for reading Arrow record batch data from the Arrow binary file format @@ -68,7 +68,7 @@ def __init__(self, source, footer_offset=None): self._open(source, footer_offset=footer_offset) -class FileWriter(_io._FileWriter): +class FileWriter(lib._FileWriter): """ Writer to create the Arrow binary file format diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/lib.pxd similarity index 68% rename from python/pyarrow/_array.pxd rename to python/pyarrow/lib.pxd index 464de316f04..08d4a5d0e70 100644 --- a/python/pyarrow/_array.pxd +++ b/python/pyarrow/lib.pxd @@ -17,13 +17,32 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * - from cpython cimport PyObject cdef extern from "Python.h": int PySlice_Check(object) +from pyarrow.includes.libarrow cimport CStatus + + +cdef int check_status(const CStatus& status) nogil except -1 + + +cdef class MemoryPool: + cdef: + CMemoryPool* pool + + cdef init(self, CMemoryPool* pool) + + +cdef class LoggingMemoryPool(MemoryPool): + pass + + +cdef CMemoryPool* maybe_unbox_memory_pool(MemoryPool memory_pool) + + cdef class DataType: cdef: shared_ptr[CDataType] sp_type @@ -237,11 +256,72 @@ cdef class DictionaryArray(Array): cdef wrap_array_output(PyObject* output) -cdef DataType box_data_type(const shared_ptr[CDataType]& type) -cdef Field box_field(const shared_ptr[CField]& field) -cdef Schema box_schema(const shared_ptr[CSchema]& schema) -cdef object box_array(const shared_ptr[CArray]& sp_array) -cdef object box_tensor(const shared_ptr[CTensor]& sp_tensor) cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array, int64_t index) + + +cdef class ChunkedArray: + cdef: + shared_ptr[CChunkedArray] sp_chunked_array + CChunkedArray* chunked_array + + cdef init(self, const shared_ptr[CChunkedArray]& chunked_array) + cdef _check_nullptr(self) + + +cdef class Column: + cdef: + shared_ptr[CColumn] sp_column + CColumn* column + + cdef init(self, const shared_ptr[CColumn]& column) + cdef _check_nullptr(self) + + +cdef class Table: + cdef: + shared_ptr[CTable] sp_table + CTable* table + + cdef init(self, const shared_ptr[CTable]& table) + cdef _check_nullptr(self) + + +cdef class RecordBatch: + cdef: + shared_ptr[CRecordBatch] sp_batch + CRecordBatch* batch + Schema _schema + + cdef init(self, const shared_ptr[CRecordBatch]& table) + cdef _check_nullptr(self) + + +cdef class Buffer: + cdef: + shared_ptr[CBuffer] buffer + Py_ssize_t shape[1] + Py_ssize_t strides[1] + + cdef init(self, const shared_ptr[CBuffer]& buffer) + + +cdef class NativeFile: + cdef: + shared_ptr[RandomAccessFile] rd_file + shared_ptr[OutputStream] wr_file + bint is_readable + bint is_writeable + bint is_open + bint own_file + + # By implementing these "virtual" functions (all functions in Cython + # extension classes are technically virtual in the C++ sense) we can expose + # the arrow::io abstract file interfaces to other components throughout the + # suite of Arrow C++ libraries + cdef read_handle(self, shared_ptr[RandomAccessFile]* file) + cdef write_handle(self, shared_ptr[OutputStream]* file) + +cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader) +cdef get_writer(object source, shared_ptr[OutputStream]* writer) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx new file mode 100644 index 00000000000..b47630459e4 --- /dev/null +++ b/python/pyarrow/lib.pyx @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from cython.operator cimport dereference as deref +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.common cimport PyObject_to_object +cimport pyarrow.includes.libarrow as libarrow +cimport cpython as cp + + +import datetime +import decimal as _pydecimal +import numpy as np +import six +import pyarrow._config +from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical + + +cdef _pandas(): + import pandas as pd + return pd + +# Exception types +include "error.pxi" + +# Memory pools and allocation +include "memory.pxi" + +# Array types +include "array.pxi" + +# Column, Table, Record Batch +include "table.pxi" + +# File IO, IPC +include "io.pxi" + +#---------------------------------------------------------------------- +# Public API + +include "public-api.pxi" diff --git a/python/pyarrow/_memory.pyx b/python/pyarrow/memory.pxi similarity index 82% rename from python/pyarrow/_memory.pyx rename to python/pyarrow/memory.pxi index 8b73a17553e..15d59d237ad 100644 --- a/python/pyarrow/_memory.pyx +++ b/python/pyarrow/memory.pxi @@ -19,9 +19,6 @@ # distutils: language = c++ # cython: embedsignature = True -from pyarrow.includes.libarrow cimport CMemoryPool, CLoggingMemoryPool -from pyarrow.includes.pyarrow cimport set_default_memory_pool, get_memory_pool - cdef class MemoryPool: cdef init(self, CMemoryPool* pool): @@ -33,7 +30,7 @@ cdef class MemoryPool: cdef CMemoryPool* maybe_unbox_memory_pool(MemoryPool memory_pool): if memory_pool is None: - return get_memory_pool() + return c_get_memory_pool() else: return memory_pool.pool @@ -45,14 +42,14 @@ cdef class LoggingMemoryPool(MemoryPool): def default_memory_pool(): cdef: MemoryPool pool = MemoryPool() - pool.init(get_memory_pool()) + pool.init(c_get_memory_pool()) return pool def set_memory_pool(MemoryPool pool): - set_default_memory_pool(pool.pool) + c_set_default_memory_pool(pool.pool) def total_allocated_bytes(): - cdef CMemoryPool* pool = get_memory_pool() + cdef CMemoryPool* pool = c_get_memory_pool() return pool.bytes_allocated() diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi new file mode 100644 index 00000000000..32b27ac93a0 --- /dev/null +++ b/python/pyarrow/public-api.pxi @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from libcpp.memory cimport shared_ptr +from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, + CRecordBatch, CSchema, + CTable, CTensor) + +cdef public api object wrap_data_type(const shared_ptr[CDataType]& type): + cdef: + DataType out + + if type.get() == NULL: + return None + + if type.get().id() == _Type_DICTIONARY: + out = DictionaryType() + elif type.get().id() == _Type_TIMESTAMP: + out = TimestampType() + elif type.get().id() == _Type_FIXED_SIZE_BINARY: + out = FixedSizeBinaryType() + elif type.get().id() == _Type_DECIMAL: + out = DecimalType() + else: + out = DataType() + + out.init(type) + return out + + +cdef public api object wrap_field(const shared_ptr[CField]& field): + if field.get() == NULL: + return None + cdef Field out = Field() + out.init(field) + return out + + +cdef public api object wrap_schema(const shared_ptr[CSchema]& type): + cdef Schema out = Schema() + out.init_schema(type) + return out + + +cdef public api object wrap_array(const shared_ptr[CArray]& sp_array): + if sp_array.get() == NULL: + raise ValueError('Array was NULL') + + cdef CDataType* data_type = sp_array.get().type().get() + + if data_type == NULL: + raise ValueError('Array data type was NULL') + + cdef Array arr = _array_classes[data_type.id()]() + arr.init(sp_array) + return arr + + +cdef public api object wrap_tensor(const shared_ptr[CTensor]& sp_tensor): + if sp_tensor.get() == NULL: + raise ValueError('Tensor was NULL') + + cdef Tensor tensor = Tensor() + tensor.init(sp_tensor) + return tensor + + +cdef public api object wrap_column(const shared_ptr[CColumn]& ccolumn): + cdef Column column = Column() + column.init(ccolumn) + return column + + +cdef public api object wrap_table(const shared_ptr[CTable]& ctable): + cdef Table table = Table() + table.init(ctable) + return table + + +cdef public api object wrap_batch(const shared_ptr[CRecordBatch]& cbatch): + cdef RecordBatch batch = RecordBatch() + batch.init(cbatch) + return batch diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/table.pxi similarity index 90% rename from python/pyarrow/_table.pyx rename to python/pyarrow/table.pxi index 223fe27ea98..a78817b4aff 100644 --- a/python/pyarrow/_table.pyx +++ b/python/pyarrow/table.pxi @@ -15,33 +15,9 @@ # specific language governing permissions and limitations # under the License. -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True - -from cython.operator cimport dereference as deref - -from pyarrow.includes.libarrow cimport * -from pyarrow.includes.common cimport * -cimport pyarrow.includes.pyarrow as pyarrow -from pyarrow._array cimport (Array, box_array, wrap_array_output, - box_data_type, box_schema, DataType, Field) -from pyarrow._error cimport check_status -cimport cpython - -import pyarrow._config -from pyarrow._error import ArrowException -from pyarrow._array import field -from pyarrow.compat import frombytes, tobytes - from collections import OrderedDict -cdef _pandas(): - import pandas as pd - return pd - - cdef class ChunkedArray: """ Array backed via one or more memory chunks. @@ -104,10 +80,10 @@ cdef class ChunkedArray: Returns ------- - pyarrow.array.Array + pyarrow.Array """ self._check_nullptr() - return box_array(self.chunked_array.chunk(i)) + return wrap_array(self.chunked_array.chunk(i)) def iterchunks(self): for i in range(self.num_chunks): @@ -150,7 +126,7 @@ cdef class Column: cdef shared_ptr[CColumn] sp_column sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array)) - return box_column(sp_column) + return wrap_column(sp_column) def to_pandas(self): """ @@ -163,8 +139,8 @@ cdef class Column: cdef: PyObject* out - check_status(pyarrow.ConvertColumnToPandas(self.sp_column, - self, &out)) + check_status(libarrow.ConvertColumnToPandas(self.sp_column, + self, &out)) return _pandas().Series(wrap_array_output(out), name=self.name) @@ -254,9 +230,9 @@ cdef class Column: Returns ------- - pyarrow.schema.DataType + pyarrow.DataType """ - return box_data_type(self.column.type()) + return wrap_data_type(self.column.type()) @property def data(self): @@ -265,7 +241,7 @@ cdef class Column: Returns ------- - pyarrow.table.ChunkedArray + pyarrow.ChunkedArray """ cdef ChunkedArray chunked_array = ChunkedArray() chunked_array.init(self.column.data()) @@ -396,7 +372,7 @@ cdef class RecordBatch: Returns ------- - pyarrow.schema.Schema + pyarrow.Schema """ cdef Schema schema self._check_nullptr() @@ -408,7 +384,7 @@ cdef class RecordBatch: return self._schema def __getitem__(self, i): - return box_array(self.batch.column(i)) + return wrap_array(self.batch.column(i)) def slice(self, offset=0, length=None): """ @@ -436,7 +412,7 @@ cdef class RecordBatch: else: result = self.batch.Slice(offset, length) - return batch_from_cbatch(result) + return wrap_batch(result) def equals(self, RecordBatch other): cdef: @@ -492,7 +468,7 @@ cdef class RecordBatch: Returns ------- - pyarrow.table.RecordBatch + pyarrow.RecordBatch """ names, arrays, metadata = _dataframe_to_arrays(df, False, schema) return cls.from_arrays(arrays, names, metadata) @@ -511,7 +487,7 @@ cdef class RecordBatch: Returns ------- - pyarrow.table.RecordBatch + pyarrow.RecordBatch """ cdef: Array arr @@ -534,7 +510,7 @@ cdef class RecordBatch: c_arrays.push_back(arr.sp_array) batch.reset(new CRecordBatch(schema, num_rows, c_arrays)) - return batch_from_cbatch(batch) + return wrap_batch(batch) cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads): @@ -548,8 +524,8 @@ cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads): from pyarrow.compat import DatetimeTZDtype with nogil: - check_status(pyarrow.ConvertTableToPandas(table, nthreads, - &result_obj)) + check_status(libarrow.ConvertTableToPandas(table, nthreads, + &result_obj)) result = PyObject_to_object(result_obj) @@ -652,7 +628,7 @@ cdef class Table: Returns ------- - pyarrow.table.Table + pyarrow.Table Examples -------- @@ -664,7 +640,7 @@ cdef class Table: ... 'str': ['a', 'b'] ... }) >>> pa.Table.from_pandas(df) - + """ names, arrays, metadata = _dataframe_to_arrays(df, timestamps_to_ms=timestamps_to_ms, @@ -686,7 +662,7 @@ cdef class Table: Returns ------- - pyarrow.table.Table + pyarrow.Table """ cdef: @@ -713,7 +689,7 @@ cdef class Table: raise ValueError(type(arrays[i])) table.reset(new CTable(schema, columns)) - return table_from_ctable(table) + return wrap_table(table) @staticmethod def from_batches(batches): @@ -737,7 +713,7 @@ cdef class Table: with nogil: check_status(CTable.FromRecordBatches(c_batches, &c_table)) - return table_from_ctable(c_table) + return wrap_table(c_table) def to_pandas(self, nthreads=None): """ @@ -782,9 +758,9 @@ cdef class Table: Returns ------- - pyarrow.schema.Schema + pyarrow.Schema """ - return box_schema(self.table.schema()) + return wrap_schema(self.table.schema()) def column(self, index): """ @@ -796,7 +772,7 @@ cdef class Table: Returns ------- - pyarrow.table.Column + pyarrow.Column """ self._check_nullptr() cdef Column column = Column() @@ -863,7 +839,7 @@ cdef class Table: with nogil: check_status(self.table.AddColumn(i, column.sp_column, &c_table)) - return table_from_ctable(c_table) + return wrap_table(c_table) def append_column(self, Column column): """ @@ -880,7 +856,7 @@ cdef class Table: with nogil: check_status(self.table.RemoveColumn(i, &c_table)) - return table_from_ctable(c_table) + return wrap_table(c_table) def concat_tables(tables): @@ -905,22 +881,4 @@ def concat_tables(tables): with nogil: check_status(ConcatenateTables(c_tables, &c_result)) - return table_from_ctable(c_result) - - -cdef object box_column(const shared_ptr[CColumn]& ccolumn): - cdef Column column = Column() - column.init(ccolumn) - return column - - -cdef api object table_from_ctable(const shared_ptr[CTable]& ctable): - cdef Table table = Table() - table.init(ctable) - return table - - -cdef api object batch_from_cbatch(const shared_ptr[CRecordBatch]& cbatch): - cdef RecordBatch batch = RecordBatch() - batch.init(cbatch) - return batch + return wrap_table(c_result) diff --git a/python/setup.py b/python/setup.py index 148224afb05..e592f3c924e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -106,14 +106,10 @@ def initialize_options(self): os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) CYTHON_MODULE_NAMES = [ - '_array', + 'lib', '_config', - '_error', - '_io', '_jemalloc', - '_memory', - '_parquet', - '_table'] + '_parquet'] def _run_cmake(self): # The directory containing this setup.py From ff1b5e519a1a607011478224adbd725d43b7bc49 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 12 May 2017 17:28:14 -0400 Subject: [PATCH 2/5] Rename things a bit Change-Id: Ifc6b97f7bae539c7c072e6c4ddbba8b8dbd06bc5 --- python/pyarrow/__init__.pxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd index be2603ac9a5..b5ae1f44c85 100644 --- a/python/pyarrow/__init__.pxd +++ b/python/pyarrow/__init__.pxd @@ -20,9 +20,9 @@ from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, CRecordBatch, CSchema, CTable, CTensor) -cdef extern from "pyarrow.lib_api.h": - cdef int import_pyarrow__lib() except -1 +cdef extern from "arrow/python/pyarrow_api.h" namespace "arrow::py": + cdef int import_pyarrow() except -1 cdef object wrap_data_type(const shared_ptr[CDataType]& type) cdef object wrap_field(const shared_ptr[CField]& field) cdef object wrap_schema(const shared_ptr[CSchema]& schema) From b39d19cdf2e39b2bcb3e10cfaefd4eb21f5e62b2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 12 May 2017 17:35:34 -0400 Subject: [PATCH 3/5] Fix test suite. Move _config into lib Change-Id: I8eb4db35868420d374e03ebb9305e2884b3cac44 --- python/CMakeLists.txt | 1 - python/pyarrow/__init__.pxd | 2 +- python/pyarrow/__init__.py | 4 +--- python/pyarrow/feather.py | 6 ++--- python/pyarrow/formatting.py | 4 ++-- python/pyarrow/includes/libarrow.pxd | 6 +++++ python/pyarrow/lib.pyx | 33 ++++++++++++++++++++++++++-- python/pyarrow/table.pxi | 2 +- python/pyarrow/tests/test_feather.py | 2 +- python/setup.py | 1 - 10 files changed, 46 insertions(+), 15 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 44b5aaf7063..123dd5d8d7a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -265,7 +265,6 @@ if (UNIX) endif() set(CYTHON_EXTENSIONS - _config lib ) diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd index b5ae1f44c85..44305e01ca7 100644 --- a/python/pyarrow/__init__.pxd +++ b/python/pyarrow/__init__.pxd @@ -21,7 +21,7 @@ from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, CTable, CTensor) -cdef extern from "arrow/python/pyarrow_api.h" namespace "arrow::py": +cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py": cdef int import_pyarrow() except -1 cdef object wrap_data_type(const shared_ptr[CDataType]& type) cdef object wrap_field(const shared_ptr[CField]& field) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 7017103f93a..7d79811d988 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -25,9 +25,7 @@ pass -import pyarrow._config -from pyarrow._config import cpu_count, set_cpu_count - +from pyarrow.lib import cpu_count, set_cpu_count from pyarrow.lib import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index c7b118e60a4..3754aec7372 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -22,9 +22,9 @@ import pandas as pd from pyarrow.compat import pdapi -from pyarrow._io import FeatherError # noqa -from pyarrow._table import Table -import pyarrow._io as ext +from pyarrow.lib import FeatherError # noqa +from pyarrow.lib import Table +import pyarrow.lib as ext if LooseVersion(pd.__version__) < '0.17.0': diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py index c3583448d6e..0af2873b293 100644 --- a/python/pyarrow/formatting.py +++ b/python/pyarrow/formatting.py @@ -17,7 +17,7 @@ # Pretty-printing and other formatting utilities for Arrow data structures -import pyarrow._array as _array +import pyarrow.lib as lib def array_format(arr, window=None): @@ -42,7 +42,7 @@ def array_format(arr, window=None): def value_format(x, indent_level=0): - if isinstance(x, _array.ListValue): + if isinstance(x, lib.ListValue): contents = ',\n'.join(value_format(item) for item in x) return '[{0}]'.format(_indent(contents, 1).strip()) else: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9ed13b3e898..3d56c14bae2 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -678,3 +678,9 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyBytesReader(CBufferReader): PyBytesReader(object fo) + +cdef extern from 'arrow/python/init.h': + int arrow_init_numpy() except -1 + +cdef extern from 'arrow/python/config.h' namespace 'arrow::py': + void set_numpy_nan(object o) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index b47630459e4..ae311aca8d0 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -30,14 +30,43 @@ import datetime import decimal as _pydecimal import numpy as np import six -import pyarrow._config from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical - cdef _pandas(): import pandas as pd return pd + +arrow_init_numpy() + +import numpy as np +set_numpy_nan(np.nan) + +import multiprocessing +import os +cdef int CPU_COUNT = int( + os.environ.get('OMP_NUM_THREADS', + max(multiprocessing.cpu_count() // 2, 1))) + + +def cpu_count(): + """ + Returns + ------- + count : Number of CPUs to use by default in parallel operations. Default is + max(1, multiprocessing.cpu_count() / 2), but can be overridden by the + OMP_NUM_THREADS environment variable. For the default, we divide the CPU + count by 2 because most modern computers have hyperthreading turned on, + so doubling the CPU count beyond the number of physical cores does not + help. + """ + return CPU_COUNT + +def set_cpu_count(count): + global CPU_COUNT + CPU_COUNT = max(int(count), 1) + + # Exception types include "error.pxi" diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index a78817b4aff..4ce0fb25b66 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -731,7 +731,7 @@ cdef class Table: pandas.DataFrame """ if nthreads is None: - nthreads = pyarrow._config.cpu_count() + nthreads = cpu_count() mgr = table_to_blockmanager(self.sp_table, nthreads) return _pandas().DataFrame(mgr) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 7a8abf486f4..69c32be5f3d 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -27,7 +27,7 @@ from pyarrow.compat import guid from pyarrow.feather import (read_feather, write_feather, FeatherReader) -from pyarrow._io import FeatherWriter +from pyarrow.lib import FeatherWriter def random_path(): diff --git a/python/setup.py b/python/setup.py index e592f3c924e..b38fca47b23 100644 --- a/python/setup.py +++ b/python/setup.py @@ -107,7 +107,6 @@ def initialize_options(self): CYTHON_MODULE_NAMES = [ 'lib', - '_config', '_jemalloc', '_parquet'] From cff757debe34c1223d142122445847f798fff801 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 12 May 2017 18:07:55 -0400 Subject: [PATCH 4/5] Expose pyarrow C API in arrow/python/pyarrow.h Change-Id: I21e6a311273cda7eb793f8430581ba06b2393913 --- cpp/CMakeLists.txt | 4 +- cpp/src/arrow/python/CMakeLists.txt | 2 + cpp/src/arrow/python/pyarrow.cc | 75 +++++++++++++++ cpp/src/arrow/python/pyarrow.h | 55 +++++++++++ cpp/src/arrow/python/pyarrow_api.h | 143 ++++++++++++++++++++++++++++ python/pyarrow/__init__.pxd | 5 +- python/pyarrow/array.pxi | 38 ++++---- python/pyarrow/io.pxi | 25 ++--- python/pyarrow/public-api.pxi | 26 +++-- python/pyarrow/table.pxi | 24 ++--- 10 files changed, 339 insertions(+), 58 deletions(-) create mode 100644 cpp/src/arrow/python/pyarrow.cc create mode 100644 cpp/src/arrow/python/pyarrow.h create mode 100644 cpp/src/arrow/python/pyarrow_api.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 21463794879..6b2ceec3277 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -852,7 +852,8 @@ if (UNIX) ) FOREACH(item ${LINT_FILES}) - IF(NOT (item MATCHES "_generated.h")) + IF(NOT ((item MATCHES "_generated.h") OR + (item MATCHES "pyarrow_api.h"))) LIST(APPEND FILTERED_LINT_FILES ${item}) ENDIF() ENDFOREACH(item ${LINT_FILES}) @@ -878,6 +879,7 @@ if (${CLANG_FORMAT_FOUND}) `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g' | sed -e '/windows_compatibility.h/g' | + sed -e '/pyarrow_api.h/g' | sed -e '/config.h/g' | # python/config.h sed -e '/platform.h/g'` # python/platform.h ) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index c5cbc50845d..30852291d1a 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -50,6 +50,7 @@ set(ARROW_PYTHON_SRCS io.cc numpy_convert.cc pandas_convert.cc + pyarrow.cc ) set(ARROW_PYTHON_SHARED_LINK_LIBS @@ -90,6 +91,7 @@ install(FILES numpy_interop.h pandas_convert.h platform.h + pyarrow.h type_traits.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python") diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc new file mode 100644 index 00000000000..56c0381957f --- /dev/null +++ b/cpp/src/arrow/python/pyarrow.cc @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/pyarrow.h" + +#include + +#include "arrow/array.h" +#include "arrow/table.h" +#include "arrow/tensor.h" +#include "arrow/type.h" + +namespace { +#include "arrow/python/pyarrow_api.h" +} + +namespace arrow { +namespace py { + +int import_pyarrow() { + return ::import_pyarrow__lib(); +} + +PyObject* wrap_buffer(const std::shared_ptr& buffer) { + return ::pyarrow_wrap_buffer(buffer); +} + +PyObject* wrap_data_type(const std::shared_ptr& type) { + return ::pyarrow_wrap_data_type(type); +} + +PyObject* wrap_field(const std::shared_ptr& field) { + return ::pyarrow_wrap_field(field); +} + +PyObject* wrap_schema(const std::shared_ptr& schema) { + return ::pyarrow_wrap_schema(schema); +} + +PyObject* wrap_array(const std::shared_ptr& array) { + return ::pyarrow_wrap_array(array); +} + +PyObject* wrap_tensor(const std::shared_ptr& tensor) { + return ::pyarrow_wrap_tensor(tensor); +} + +PyObject* wrap_column(const std::shared_ptr& column) { + return ::pyarrow_wrap_column(column); +} + +PyObject* wrap_table(const std::shared_ptr& table) { + return ::pyarrow_wrap_table(table); +} + +PyObject* wrap_record_batch(const std::shared_ptr& batch) { + return ::pyarrow_wrap_batch(batch); +} + +} // namespace py +} // namespace arrow diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h new file mode 100644 index 00000000000..7c618ce1925 --- /dev/null +++ b/cpp/src/arrow/python/pyarrow.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_PYARROW_H +#define ARROW_PYTHON_PYARROW_H + +#include "arrow/python/platform.h" + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Buffer; +class Column; +class DataType; +class Field; +class RecordBatch; +class Schema; +class Table; +class Tensor; + +namespace py { + +ARROW_EXPORT int import_pyarrow(); +ARROW_EXPORT PyObject* wrap_buffer(const std::shared_ptr& buffer); +ARROW_EXPORT PyObject* wrap_data_type(const std::shared_ptr& type); +ARROW_EXPORT PyObject* wrap_field(const std::shared_ptr& field); +ARROW_EXPORT PyObject* wrap_schema(const std::shared_ptr& schema); +ARROW_EXPORT PyObject* wrap_array(const std::shared_ptr& array); +ARROW_EXPORT PyObject* wrap_tensor(const std::shared_ptr& tensor); +ARROW_EXPORT PyObject* wrap_column(const std::shared_ptr& column); +ARROW_EXPORT PyObject* wrap_table(const std::shared_ptr
& table); +ARROW_EXPORT PyObject* wrap_record_batch(const std::shared_ptr& batch); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_PYARROW_H diff --git a/cpp/src/arrow/python/pyarrow_api.h b/cpp/src/arrow/python/pyarrow_api.h new file mode 100644 index 00000000000..7b708446610 --- /dev/null +++ b/cpp/src/arrow/python/pyarrow_api.h @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// DO NOT EDIT THIS FILE. Update from pyarrow/lib_api.h after pyarrow build + +/* Generated by Cython 0.25.2 */ + +#ifndef __PYX_HAVE_API__pyarrow__lib +#define __PYX_HAVE_API__pyarrow__lib +#include "Python.h" + +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0; +#define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType> const &) = 0; +#define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field> const &) = 0; +#define pyarrow_wrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema)(std::shared_ptr< arrow::Schema> const &) = 0; +#define pyarrow_wrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array> const &) = 0; +#define pyarrow_wrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0; +#define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column)(std::shared_ptr< arrow::Column> const &) = 0; +#define pyarrow_wrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr< arrow::Table> const &) = 0; +#define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch> const &) = 0; +#define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch +#if !defined(__Pyx_PyIdentifier_FromString) +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s) +#else + #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s) +#endif +#endif + +#ifndef __PYX_HAVE_RT_ImportModule +#define __PYX_HAVE_RT_ImportModule +static PyObject *__Pyx_ImportModule(const char *name) { + PyObject *py_name = 0; + PyObject *py_module = 0; + py_name = __Pyx_PyIdentifier_FromString(name); + if (!py_name) + goto bad; + py_module = PyImport_Import(py_name); + Py_DECREF(py_name); + return py_module; +bad: + Py_XDECREF(py_name); + return 0; +} +#endif + +#ifndef __PYX_HAVE_RT_ImportFunction +#define __PYX_HAVE_RT_ImportFunction +static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig) { + PyObject *d = 0; + PyObject *cobj = 0; + union { + void (*fp)(void); + void *p; + } tmp; + d = PyObject_GetAttrString(module, (char *)"__pyx_capi__"); + if (!d) + goto bad; + cobj = PyDict_GetItemString(d, funcname); + if (!cobj) { + PyErr_Format(PyExc_ImportError, + "%.200s does not export expected C function %.200s", + PyModule_GetName(module), funcname); + goto bad; + } +#if PY_VERSION_HEX >= 0x02070000 + if (!PyCapsule_IsValid(cobj, sig)) { + PyErr_Format(PyExc_TypeError, + "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", + PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj)); + goto bad; + } + tmp.p = PyCapsule_GetPointer(cobj, sig); +#else + {const char *desc, *s1, *s2; + desc = (const char *)PyCObject_GetDesc(cobj); + if (!desc) + goto bad; + s1 = desc; s2 = sig; + while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; } + if (*s1 != *s2) { + PyErr_Format(PyExc_TypeError, + "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", + PyModule_GetName(module), funcname, sig, desc); + goto bad; + } + tmp.p = PyCObject_AsVoidPtr(cobj);} +#endif + *f = tmp.fp; + if (!(*f)) + goto bad; + Py_DECREF(d); + return 0; +bad: + Py_XDECREF(d); + return -1; +} +#endif + + +static int import_pyarrow__lib(void) { + PyObject *module = 0; + module = __Pyx_ImportModule("pyarrow.lib"); + if (!module) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column, "PyObject *(std::shared_ptr< arrow::Column> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad; + Py_DECREF(module); module = 0; + return 0; + bad: + Py_XDECREF(module); + return -1; +} + +#endif /* !__PYX_HAVE_API__pyarrow__lib */ diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd index 44305e01ca7..4f434557915 100644 --- a/python/pyarrow/__init__.pxd +++ b/python/pyarrow/__init__.pxd @@ -16,13 +16,14 @@ # under the License. from libcpp.memory cimport shared_ptr -from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, - CRecordBatch, CSchema, +from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType, + CField, CRecordBatch, CSchema, CTable, CTensor) cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py": cdef int import_pyarrow() except -1 + cdef object wrap_buffer(const shared_ptr[CBuffer]& buffer) cdef object wrap_data_type(const shared_ptr[CDataType]& type) cdef object wrap_field(const shared_ptr[CField]& field) cdef object wrap_schema(const shared_ptr[CSchema]& schema) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a115cfdb36c..46e94b4f4b3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -162,7 +162,7 @@ cdef class Field: cdef init(self, const shared_ptr[CField]& field): self.sp_field = field self.field = field.get() - self.type = wrap_data_type(field.get().type()) + self.type = pyarrow_wrap_data_type(field.get().type()) def equals(self, Field other): """ @@ -220,7 +220,7 @@ cdef class Field: with nogil: check_status(self.field.AddMetadata(c_meta, &new_field)) - return wrap_field(new_field) + return pyarrow_wrap_field(new_field) def remove_metadata(self): """ @@ -233,7 +233,7 @@ cdef class Field: cdef shared_ptr[CField] new_field with nogil: new_field = self.field.RemoveMetadata() - return wrap_field(new_field) + return pyarrow_wrap_field(new_field) cdef class Schema: @@ -250,7 +250,7 @@ cdef class Schema: cdef Field result = Field() result.init(self.schema.field(i)) - result.type = wrap_data_type(result.field.type()) + result.type = pyarrow_wrap_data_type(result.field.type()) return result @@ -298,7 +298,7 @@ cdef class Schema: ------- field: pyarrow.Field """ - return wrap_field(self.schema.GetFieldByName(tobytes(name))) + return pyarrow_wrap_field(self.schema.GetFieldByName(tobytes(name))) def add_metadata(self, dict metadata): """ @@ -320,7 +320,7 @@ cdef class Schema: with nogil: check_status(self.schema.AddMetadata(c_meta, &new_schema)) - return wrap_schema(new_schema) + return pyarrow_wrap_schema(new_schema) def remove_metadata(self): """ @@ -333,7 +333,7 @@ cdef class Schema: cdef shared_ptr[CSchema] new_schema with nogil: new_schema = self.schema.RemoveMetadata() - return wrap_schema(new_schema) + return pyarrow_wrap_schema(new_schema) def __str__(self): return frombytes(self.schema.ToString()) @@ -580,7 +580,7 @@ def float64(): cpdef DataType decimal(int precision, int scale=0): cdef shared_ptr[CDataType] decimal_type decimal_type.reset(new CDecimalType(precision, scale)) - return wrap_data_type(decimal_type) + return pyarrow_wrap_data_type(decimal_type) def string(): @@ -605,7 +605,7 @@ def binary(int length=-1): cdef shared_ptr[CDataType] fixed_size_binary_type fixed_size_binary_type.reset(new CFixedSizeBinaryType(length)) - return wrap_data_type(fixed_size_binary_type) + return pyarrow_wrap_data_type(fixed_size_binary_type) def list_(DataType value_type): @@ -679,7 +679,7 @@ def from_numpy_dtype(object dtype): with nogil: check_status(NumPyDtypeToArrow(dtype, &c_type)) - return wrap_data_type(c_type) + return pyarrow_wrap_data_type(c_type) NA = None @@ -902,7 +902,7 @@ cdef class ListValue(ArrayValue): cdef void _set_array(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() - self.value_type = wrap_data_type(self.ap.value_type()) + self.value_type = pyarrow_wrap_data_type(self.ap.value_type()) cdef getitem(self, int64_t i): cdef int64_t j = self.ap.value_offset(self.index) + i @@ -1026,7 +1026,7 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None): ) ) - return wrap_array(sp_array) + return pyarrow_wrap_array(sp_array) @@ -1035,7 +1035,7 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() - self.type = wrap_data_type(self.sp_array.get().type()) + self.type = pyarrow_wrap_data_type(self.sp_array.get().type()) @staticmethod def from_pandas(obj, mask=None, DataType type=None, @@ -1129,7 +1129,7 @@ cdef class Array: check_status(PandasToArrow( pool, values, mask, c_type, &out)) - return wrap_array(out) + return pyarrow_wrap_array(out) property null_count: @@ -1213,7 +1213,7 @@ cdef class Array: else: result = self.ap.Slice(offset, length) - return wrap_array(result) + return pyarrow_wrap_array(result) def to_pandas(self): """ @@ -1244,7 +1244,7 @@ cdef class Tensor: cdef init(self, const shared_ptr[CTensor]& sp_tensor): self.sp_tensor = sp_tensor self.tp = sp_tensor.get() - self.type = wrap_data_type(self.tp.type()) + self.type = pyarrow_wrap_data_type(self.tp.type()) def __repr__(self): return """ @@ -1256,7 +1256,7 @@ strides: {2}""".format(self.type, self.shape, self.strides) def from_numpy(obj): cdef shared_ptr[CTensor] ctensor check_status(NdarrayToTensor(c_default_memory_pool(), obj, &ctensor)) - return wrap_tensor(ctensor) + return pyarrow_wrap_tensor(ctensor) def to_numpy(self): """ @@ -1442,7 +1442,7 @@ cdef class DictionaryArray(Array): cdef CDictionaryArray* darr = (self.ap) if self._dictionary is None: - self._dictionary = wrap_array(darr.dictionary()) + self._dictionary = pyarrow_wrap_array(darr.dictionary()) return self._dictionary @@ -1452,7 +1452,7 @@ cdef class DictionaryArray(Array): cdef CDictionaryArray* darr = (self.ap) if self._indices is None: - self._indices = wrap_array(darr.indices()) + self._indices = pyarrow_wrap_array(darr.indices()) return self._indices diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 13e8b638723..a0a96e72864 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -163,7 +163,7 @@ cdef class NativeFile: with nogil: check_status(self.rd_file.get().ReadB(c_nbytes, &output)) - return wrap_buffer(output) + return pyarrow_wrap_buffer(output) def download(self, stream_or_path, buffer_size=None): """ @@ -483,7 +483,7 @@ cdef class Buffer: if parent_buf.get() == NULL: return None else: - return wrap_buffer(parent_buf) + return pyarrow_wrap_buffer(parent_buf) def __getitem__(self, key): # TODO(wesm): buffer slicing @@ -531,7 +531,7 @@ cdef class InMemoryOutputStream(NativeFile): def get_result(self): check_status(self.wr_file.get().Close()) self.is_open = False - return wrap_buffer( self.buffer) + return pyarrow_wrap_buffer( self.buffer) cdef class BufferReader(NativeFile): @@ -566,18 +566,11 @@ def frombuffer(object obj): try: memoryview(obj) buf.reset(new PyBuffer(obj)) - return wrap_buffer(buf) + return pyarrow_wrap_buffer(buf) except TypeError: raise ValueError('Must pass object that implements buffer protocol') - -cdef Buffer wrap_buffer(const shared_ptr[CBuffer]& buf): - cdef Buffer result = Buffer() - result.init(buf) - return result - - cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader): cdef NativeFile nf @@ -993,7 +986,7 @@ cdef class _StreamReader: if batch.get() == NULL: raise StopIteration - return wrap_batch(batch) + return pyarrow_wrap_batch(batch) def read_all(self): """ @@ -1013,7 +1006,7 @@ cdef class _StreamReader: check_status(CTable.FromRecordBatches(batches, &table)) - return wrap_table(table) + return pyarrow_wrap_table(table) cdef class _FileWriter(_StreamWriter): @@ -1066,7 +1059,7 @@ cdef class _FileReader: with nogil: check_status(self.reader.get().GetRecordBatch(i, &batch)) - return wrap_batch(batch) + return pyarrow_wrap_batch(batch) # TODO(wesm): ARROW-503: Function was renamed. Remove after a period of # time has passed @@ -1089,7 +1082,7 @@ cdef class _FileReader: check_status(self.reader.get().GetRecordBatch(i, &batches[i])) check_status(CTable.FromRecordBatches(batches, &table)) - return wrap_table(table) + return pyarrow_wrap_table(table) #---------------------------------------------------------------------- @@ -1257,4 +1250,4 @@ def read_tensor(NativeFile source): with nogil: check_status(ReadTensor(offset, source.rd_file.get(), &sp_tensor)) - return wrap_tensor(sp_tensor) + return pyarrow_wrap_tensor(sp_tensor) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 32b27ac93a0..7b556512433 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -20,7 +20,15 @@ from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, CRecordBatch, CSchema, CTable, CTensor) -cdef public api object wrap_data_type(const shared_ptr[CDataType]& type): + +cdef public api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): + cdef Buffer result = Buffer() + result.init(buf) + return result + + +cdef public api object pyarrow_wrap_data_type( + const shared_ptr[CDataType]& type): cdef: DataType out @@ -42,7 +50,7 @@ cdef public api object wrap_data_type(const shared_ptr[CDataType]& type): return out -cdef public api object wrap_field(const shared_ptr[CField]& field): +cdef public api object pyarrow_wrap_field(const shared_ptr[CField]& field): if field.get() == NULL: return None cdef Field out = Field() @@ -50,13 +58,13 @@ cdef public api object wrap_field(const shared_ptr[CField]& field): return out -cdef public api object wrap_schema(const shared_ptr[CSchema]& type): +cdef public api object pyarrow_wrap_schema(const shared_ptr[CSchema]& type): cdef Schema out = Schema() out.init_schema(type) return out -cdef public api object wrap_array(const shared_ptr[CArray]& sp_array): +cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): if sp_array.get() == NULL: raise ValueError('Array was NULL') @@ -70,7 +78,8 @@ cdef public api object wrap_array(const shared_ptr[CArray]& sp_array): return arr -cdef public api object wrap_tensor(const shared_ptr[CTensor]& sp_tensor): +cdef public api object pyarrow_wrap_tensor( + const shared_ptr[CTensor]& sp_tensor): if sp_tensor.get() == NULL: raise ValueError('Tensor was NULL') @@ -79,19 +88,20 @@ cdef public api object wrap_tensor(const shared_ptr[CTensor]& sp_tensor): return tensor -cdef public api object wrap_column(const shared_ptr[CColumn]& ccolumn): +cdef public api object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn): cdef Column column = Column() column.init(ccolumn) return column -cdef public api object wrap_table(const shared_ptr[CTable]& ctable): +cdef public api object pyarrow_wrap_table(const shared_ptr[CTable]& ctable): cdef Table table = Table() table.init(ctable) return table -cdef public api object wrap_batch(const shared_ptr[CRecordBatch]& cbatch): +cdef public api object pyarrow_wrap_batch( + const shared_ptr[CRecordBatch]& cbatch): cdef RecordBatch batch = RecordBatch() batch.init(cbatch) return batch diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 4ce0fb25b66..8dd18cf4136 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -83,7 +83,7 @@ cdef class ChunkedArray: pyarrow.Array """ self._check_nullptr() - return wrap_array(self.chunked_array.chunk(i)) + return pyarrow_wrap_array(self.chunked_array.chunk(i)) def iterchunks(self): for i in range(self.num_chunks): @@ -126,7 +126,7 @@ cdef class Column: cdef shared_ptr[CColumn] sp_column sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array)) - return wrap_column(sp_column) + return pyarrow_wrap_column(sp_column) def to_pandas(self): """ @@ -232,7 +232,7 @@ cdef class Column: ------- pyarrow.DataType """ - return wrap_data_type(self.column.type()) + return pyarrow_wrap_data_type(self.column.type()) @property def data(self): @@ -384,7 +384,7 @@ cdef class RecordBatch: return self._schema def __getitem__(self, i): - return wrap_array(self.batch.column(i)) + return pyarrow_wrap_array(self.batch.column(i)) def slice(self, offset=0, length=None): """ @@ -412,7 +412,7 @@ cdef class RecordBatch: else: result = self.batch.Slice(offset, length) - return wrap_batch(result) + return pyarrow_wrap_batch(result) def equals(self, RecordBatch other): cdef: @@ -510,7 +510,7 @@ cdef class RecordBatch: c_arrays.push_back(arr.sp_array) batch.reset(new CRecordBatch(schema, num_rows, c_arrays)) - return wrap_batch(batch) + return pyarrow_wrap_batch(batch) cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads): @@ -689,7 +689,7 @@ cdef class Table: raise ValueError(type(arrays[i])) table.reset(new CTable(schema, columns)) - return wrap_table(table) + return pyarrow_wrap_table(table) @staticmethod def from_batches(batches): @@ -713,7 +713,7 @@ cdef class Table: with nogil: check_status(CTable.FromRecordBatches(c_batches, &c_table)) - return wrap_table(c_table) + return pyarrow_wrap_table(c_table) def to_pandas(self, nthreads=None): """ @@ -760,7 +760,7 @@ cdef class Table: ------- pyarrow.Schema """ - return wrap_schema(self.table.schema()) + return pyarrow_wrap_schema(self.table.schema()) def column(self, index): """ @@ -839,7 +839,7 @@ cdef class Table: with nogil: check_status(self.table.AddColumn(i, column.sp_column, &c_table)) - return wrap_table(c_table) + return pyarrow_wrap_table(c_table) def append_column(self, Column column): """ @@ -856,7 +856,7 @@ cdef class Table: with nogil: check_status(self.table.RemoveColumn(i, &c_table)) - return wrap_table(c_table) + return pyarrow_wrap_table(c_table) def concat_tables(tables): @@ -881,4 +881,4 @@ def concat_tables(tables): with nogil: check_status(ConcatenateTables(c_tables, &c_result)) - return wrap_table(c_result) + return pyarrow_wrap_table(c_result) From 9e6ee24673e9fcb1f0410aa2eb5e04d1fb0b11cd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 13 May 2017 15:19:00 -0400 Subject: [PATCH 5/5] Fix up optional extensions Change-Id: Icad57e6d5e9ee5302e8623664c3c58ac363bdd69 --- python/pyarrow/_jemalloc.pyx | 2 +- python/pyarrow/_parquet.pyx | 21 +++++++++++---------- python/pyarrow/lib.pxd | 10 ++++++++++ python/pyarrow/parquet.py | 13 ++++++------- 4 files changed, 28 insertions(+), 18 deletions(-) diff --git a/python/pyarrow/_jemalloc.pyx b/python/pyarrow/_jemalloc.pyx index 3b41964a39c..6f00c9d2dde 100644 --- a/python/pyarrow/_jemalloc.pyx +++ b/python/pyarrow/_jemalloc.pyx @@ -20,7 +20,7 @@ # cython: embedsignature = True from pyarrow.includes.libarrow_jemalloc cimport CJemallocMemoryPool -from pyarrow._memory cimport MemoryPool +from pyarrow.lib cimport MemoryPool def default_pool(): cdef MemoryPool pool = MemoryPool() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index b0a029e6960..51bd938c79a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -22,15 +22,16 @@ from cython.operator cimport dereference as deref from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * -from pyarrow._array cimport Array, Schema, box_schema -from pyarrow._error cimport check_status -from pyarrow._memory cimport MemoryPool, maybe_unbox_memory_pool -from pyarrow._table cimport Table, table_from_ctable -from pyarrow._io cimport NativeFile, get_reader, get_writer +from pyarrow.lib cimport (Array, Schema, + check_status, + MemoryPool, maybe_unbox_memory_pool, + Table, + pyarrow_wrap_schema, + pyarrow_wrap_table, + NativeFile, get_reader, get_writer) from pyarrow.compat import tobytes, frombytes -from pyarrow._error import ArrowException -from pyarrow._io import NativeFile +from pyarrow.lib import ArrowException, NativeFile import six @@ -212,7 +213,7 @@ cdef class ParquetSchema: with nogil: check_status(FromParquetSchema(self.schema, &sp_arrow_schema)) - return box_schema(sp_arrow_schema) + return pyarrow_wrap_schema(sp_arrow_schema) def equals(self, ParquetSchema other): """ @@ -425,7 +426,7 @@ cdef class ParquetReader: with nogil: check_status(self.reader.get() .ReadRowGroup(i, &ctable)) - return table_from_ctable(ctable) + return pyarrow_wrap_table(ctable) def read_all(self, column_indices=None): cdef: @@ -444,7 +445,7 @@ cdef class ParquetReader: with nogil: check_status(self.reader.get() .ReadTable(&ctable)) - return table_from_ctable(ctable) + return pyarrow_wrap_table(ctable) def column_name_idx(self, column_name): """ diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 08d4a5d0e70..d3d03aaaefa 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -325,3 +325,13 @@ cdef class NativeFile: cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader) cdef get_writer(object source, shared_ptr[OutputStream]* writer) + +cdef public object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf) +cdef public object pyarrow_wrap_data_type(const shared_ptr[CDataType]& type) +cdef public object pyarrow_wrap_field(const shared_ptr[CField]& field) +cdef public object pyarrow_wrap_schema(const shared_ptr[CSchema]& type) +cdef public object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array) +cdef public object pyarrow_wrap_tensor(const shared_ptr[CTensor]& sp_tensor) +cdef public object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn) +cdef public object pyarrow_wrap_table(const shared_ptr[CTable]& ctable) +cdef public object pyarrow_wrap_batch(const shared_ptr[CRecordBatch]& cbatch) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 21359f137f2..050ec3176d7 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -24,8 +24,7 @@ RowGroupMetaData, ParquetSchema, ParquetWriter) import pyarrow._parquet as _parquet # noqa -import pyarrow._array as _array -import pyarrow._table as _table +import pyarrow.lib as lib # ---------------------------------------------------------------------- @@ -241,8 +240,8 @@ def read(self, columns=None, nthreads=1, partitions=None, # manifest, so ['a', 'b', 'c'] as in our example above. dictionary = partitions.levels[i].dictionary - arr = _array.DictionaryArray.from_arrays(indices, dictionary) - col = _table.Column.from_array(name, arr) + arr = lib.DictionaryArray.from_arrays(indices, dictionary) + col = lib.Column.from_array(name, arr) table = table.append_column(col) return table @@ -298,9 +297,9 @@ def dictionary(self): # Only integer and string partition types are supported right now try: integer_keys = [int(x) for x in self.keys] - dictionary = _array.array(integer_keys) + dictionary = lib.array(integer_keys) except ValueError: - dictionary = _array.array(self.keys) + dictionary = lib.array(self.keys) self._dictionary = dictionary return dictionary @@ -539,7 +538,7 @@ def read(self, columns=None, nthreads=1): open_file_func=open_file) tables.append(table) - all_data = _table.concat_tables(tables) + all_data = lib.concat_tables(tables) return all_data def _get_open_file_func(self):