From 20fbdc15095b36fe0afe9b39dbcd5661c2ceda93 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Mar 2016 19:21:21 -0800 Subject: [PATCH 1/2] Draft scalar box types, no tests yet --- cpp/src/arrow/types/list.h | 6 ++- python/arrow/array.pyx | 3 +- python/arrow/includes/arrow.pxd | 18 ++++++-- python/arrow/scalar.pxd | 25 +++++++++-- python/arrow/scalar.pyx | 79 +++++++++++++++++++++++++++++++++ python/arrow/schema.pxd | 2 + python/arrow/schema.pyx | 14 ++++++ 7 files changed, 138 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f40a8245362..210c76a046c 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -63,7 +63,11 @@ class ListArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. - const ArrayPtr& values() const {return values_;} + const std::shared_ptr<Array>& values() const {return values_;} + + const std::shared_ptr<DataType>& value_type() const { + return values_->type(); + } const int32_t* offsets() const { return offsets_;} diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 3a3210d6cc1..6b47b5e7c31 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -25,6 +25,7 @@ cimport arrow.includes.pyarrow as pyarrow from arrow.compat import frombytes, tobytes from arrow.error cimport check_status +cimport arrow.scalar as scalar from arrow.scalar import NA def total_allocated_bytes(): @@ -79,7 +80,7 @@ cdef class Array: return self._getitem(key) cdef _getitem(self, int i): - raise NotImplementedError + return scalar.box_arrow_scalar(self.type, self.sp_array, i) def slice(self, start, end): pass diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index fde5de91091..0bc9a1fcac6 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -84,13 +84,23 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsNull(int i) cdef cppclass CUInt8Array"
arrow::UInt8Array"(CArray): - pass + uint8_t Value(int i) cdef cppclass CInt8Array" arrow::Int8Array"(CArray): - pass + int8_t Value(int i) + + cdef cppclass CUInt64Array" arrow::UInt64Array"(CArray): + uint64_t Value(int i) + + cdef cppclass CInt64Array" arrow::Int64Array"(CArray): + int64_t Value(int i) cdef cppclass CListArray" arrow::ListArray"(CArray): - pass + const int32_t* offsets() + int32_t offset(int i) + int32_t value_length(int i) + const shared_ptr[CArray]& values() + const shared_ptr[CDataType]& value_type() cdef cppclass CStringArray" arrow::StringArray"(CListArray): - pass + c_string GetString(int i) diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd index e193c09cd69..15cdc956a25 100644 --- a/python/arrow/scalar.pxd +++ b/python/arrow/scalar.pxd @@ -16,7 +16,7 @@ # under the License. from arrow.includes.common cimport * -from arrow.includes.arrow cimport CArray, CListArray +from arrow.includes.arrow cimport * from arrow.schema cimport DataType @@ -31,17 +31,36 @@ cdef class NAType(Scalar): cdef class ArrayValue(Scalar): cdef: - shared_ptr[CArray] array + shared_ptr[CArray] sp_array int index + cdef void init(self, DataType type, + const shared_ptr[CArray]& sp_array, int index) + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array) + cdef class Int8Value(ArrayValue): pass -cdef class ListValue(ArrayValue): +cdef class Int64Value(ArrayValue): pass +cdef class ListValue(ArrayValue): + cdef readonly: + DataType value_type + + cdef: + CListArray* ap + + cdef _getitem(self, int i) + + cdef class StringValue(ArrayValue): pass + +cdef object box_arrow_scalar(DataType type, + const shared_ptr[CArray]& sp_array, + int index) diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx index 78dadecf9b4..f9c220fd5dd 100644 --- a/python/arrow/scalar.pyx +++ b/python/arrow/scalar.pyx @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. 
+from arrow.schema cimport DataType, box_data_type + +from arrow.compat import frombytes import arrow.schema as schema cdef class NAType(Scalar): @@ -25,4 +28,80 @@ cdef class NAType(Scalar): def __repr__(self): return 'NA' + def as_py(self): + return None + NA = NAType() + +cdef class ArrayValue(Scalar): + + cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array, + int index): + self.type = type + self.index = index + self._set_array(sp_array) + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + + def __repr__(self): + if hasattr(self, 'as_py'): + return repr(self.as_py()) + else: + return Scalar.__repr__(self) + + +cdef class Int64Value(ArrayValue): + + def as_py(self): + cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class StringValue(ArrayValue): + + def as_py(self): + cdef CStringArray* ap = <CStringArray*> self.sp_array.get() + return frombytes(ap.GetString(self.index)) + + +cdef class ListValue(ArrayValue): + + def __len__(self): + return self.ap.value_length(self.index) + + def __getitem__(self, i): + return self._getitem(i) + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = <CListArray*> sp_array.get() + self.value_type = box_data_type(self.ap.value_type()) + + cdef _getitem(self, int i): + cdef int j = self.ap.offset(self.index) + i + return box_arrow_scalar(self.value_type, self.ap.values(), j) + + def as_py(self): + cdef: + int j + list result = [] + + for j in range(len(self)): + result.append(self._getitem(j).as_py()) + + return result + + +cdef dict _scalar_classes = { + LogicalType_INT64: Int64Value, + LogicalType_LIST: ListValue, + LogicalType_STRING: StringValue +} + +cdef object box_arrow_scalar(DataType type, + const shared_ptr[CArray]& sp_array, + int index): + cdef ArrayValue val = _scalar_classes[type.type.type]() + val.init(type, sp_array, index) + return val diff --git a/python/arrow/schema.pxd b/python/arrow/schema.pxd
index 487c246f44a..8cc244aaba3 100644 --- a/python/arrow/schema.pxd +++ b/python/arrow/schema.pxd @@ -37,3 +37,5 @@ cdef class Schema: cdef: shared_ptr[CSchema] sp_schema CSchema* schema + +cdef DataType box_data_type(const shared_ptr[CDataType]& type) diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index 63cd6e888ab..3001531eb60 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -85,6 +85,14 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True): def field(name, type): return Field(name, type) +cdef set PRIMITIVE_TYPES = set([ + LogicalType_NA, LogicalType_BOOL, + LogicalType_UINT8, LogicalType_INT8, + LogicalType_UINT16, LogicalType_INT16, + LogicalType_UINT32, LogicalType_INT32, + LogicalType_UINT64, LogicalType_INT64, + LogicalType_FLOAT, LogicalType_DOUBLE]) + def null(): return primitive_type(LogicalType_NA) @@ -148,3 +156,9 @@ def struct(fields, c_bool nullable=True): out.init(shared_ptr[CDataType]( new CStructType(c_fields, nullable))) return out + + +cdef DataType box_data_type(const shared_ptr[CDataType]& type): + cdef DataType out = DataType() + out.init(type) + return out From df06ba1792e38d41e9ee351432a25cb2da9339d6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Mar 2016 20:38:46 -0800 Subject: [PATCH 2/2] Add tests for scalars proxying implemented Python list type conversions, fix associated bugs --- python/arrow/__init__.py | 6 +- python/arrow/array.pxd | 1 - python/arrow/array.pyx | 16 ++--- python/arrow/compat.py | 6 ++ python/arrow/includes/arrow.pxd | 18 +++++ python/arrow/scalar.pyx | 92 +++++++++++++++++++++++++- python/arrow/tests/test_scalars.py | 82 +++++++++++++++++++++++ python/src/pyarrow/adapters/builtin.cc | 2 +- 8 files changed, 208 insertions(+), 15 deletions(-) create mode 100644 python/arrow/tests/test_scalars.py diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index 3c049b85e8c..3507ea0235a 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py 
@@ -24,7 +24,11 @@ from arrow.error import ArrowException -from arrow.scalar import ArrayValue, NA, Scalar +from arrow.scalar import (ArrayValue, Scalar, NA, NAType, + BooleanValue, + Int8Value, Int16Value, Int32Value, Int64Value, + UInt8Value, UInt16Value, UInt32Value, UInt64Value, + FloatValue, DoubleValue, ListValue, StringValue) from arrow.schema import (null, bool_, int8, int16, int32, int64, diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd index e32d27769b5..04dd8d182bc 100644 --- a/python/arrow/array.pxd +++ b/python/arrow/array.pxd @@ -34,7 +34,6 @@ cdef class Array: DataType type cdef init(self, const shared_ptr[CArray]& sp_array) - cdef _getitem(self, int i) cdef class BooleanArray(Array): diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 6b47b5e7c31..8ebd01d1dbe 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -74,13 +74,7 @@ cdef class Array: while key < 0: key += len(self) - if self.ap.IsNull(key): - return NA - else: - return self._getitem(key) - - cdef _getitem(self, int i): - return scalar.box_arrow_scalar(self.type, self.sp_array, i) + return scalar.box_arrow_scalar(self.type, self.sp_array, key) def slice(self, start, end): pass @@ -169,12 +163,16 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): return arr -def from_pylist(object list_obj, type=None): +def from_pylist(object list_obj, DataType type=None): """ Convert Python list to Arrow array """ cdef: shared_ptr[CArray] sp_array - check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + if type is None: + check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + else: + raise NotImplementedError + return box_arrow_array(sp_array) diff --git a/python/arrow/compat.py b/python/arrow/compat.py index 2ac41ac8abf..08f0f237967 100644 --- a/python/arrow/compat.py +++ b/python/arrow/compat.py @@ -54,6 +54,9 @@ def dict_values(x): range = xrange long = long + def u(s): + return unicode(s, "unicode_escape") + def tobytes(o): if 
isinstance(o, unicode): return o.encode('utf8') @@ -73,6 +76,9 @@ def dict_values(x): from decimal import Decimal range = range + def u(s): + return s + def tobytes(o): if isinstance(o, str): return o.encode('utf8') diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index 0bc9a1fcac6..0cc44c06cb6 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -89,12 +89,30 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CInt8Array" arrow::Int8Array"(CArray): int8_t Value(int i) + cdef cppclass CUInt16Array" arrow::UInt16Array"(CArray): + uint16_t Value(int i) + + cdef cppclass CInt16Array" arrow::Int16Array"(CArray): + int16_t Value(int i) + + cdef cppclass CUInt32Array" arrow::UInt32Array"(CArray): + uint32_t Value(int i) + + cdef cppclass CInt32Array" arrow::Int32Array"(CArray): + int32_t Value(int i) + cdef cppclass CUInt64Array" arrow::UInt64Array"(CArray): uint64_t Value(int i) cdef cppclass CInt64Array" arrow::Int64Array"(CArray): int64_t Value(int i) + cdef cppclass CFloatArray" arrow::FloatArray"(CArray): + float Value(int i) + + cdef cppclass CDoubleArray" arrow::DoubleArray"(CArray): + double Value(int i) + cdef cppclass CListArray" arrow::ListArray"(CArray): const int32_t* offsets() int32_t offset(int i) diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx index f9c220fd5dd..951ede28776 100644 --- a/python/arrow/scalar.pyx +++ b/python/arrow/scalar.pyx @@ -20,9 +20,15 @@ from arrow.schema cimport DataType, box_data_type from arrow.compat import frombytes import arrow.schema as schema +NA = None + cdef class NAType(Scalar): def __cinit__(self): + global NA + if NA is not None: + raise Exception('Cannot create multiple NAType instances') + self.type = schema.null() def __repr__(self): @@ -51,6 +57,52 @@ cdef class ArrayValue(Scalar): return Scalar.__repr__(self) +cdef class BooleanValue(ArrayValue): + pass + + +cdef class Int8Value(ArrayValue): + + def as_py(self): + cdef 
CInt8Array* ap = <CInt8Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class UInt8Value(ArrayValue): + + def as_py(self): + cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class Int16Value(ArrayValue): + + def as_py(self): + cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class UInt16Value(ArrayValue): + + def as_py(self): + cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class Int32Value(ArrayValue): + + def as_py(self): + cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class UInt32Value(ArrayValue): + + def as_py(self): + cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get() + return ap.Value(self.index) + + cdef class Int64Value(ArrayValue): def as_py(self): @@ -58,6 +110,27 @@ cdef class Int64Value(ArrayValue): return ap.Value(self.index) +cdef class UInt64Value(ArrayValue): + + def as_py(self): + cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class FloatValue(ArrayValue): + + def as_py(self): + cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get() + return ap.Value(self.index) + + +cdef class DoubleValue(ArrayValue): + + def as_py(self): + cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get() + return ap.Value(self.index) + + cdef class StringValue(ArrayValue): def as_py(self): @@ -94,7 +167,16 @@ cdef class ListValue(ArrayValue): cdef dict _scalar_classes = { + LogicalType_UINT8: UInt8Value, + LogicalType_UINT16: UInt16Value, + LogicalType_UINT32: UInt32Value, + LogicalType_UINT64: UInt64Value, + LogicalType_INT8: Int8Value, + LogicalType_INT16: Int16Value, + LogicalType_INT32: Int32Value, LogicalType_INT64: Int64Value, + LogicalType_FLOAT: FloatValue, + LogicalType_DOUBLE: DoubleValue, LogicalType_LIST: ListValue, LogicalType_STRING: StringValue } @@ -102,6 +184,10 @@ cdef dict _scalar_classes = { cdef object box_arrow_scalar(DataType type, const shared_ptr[CArray]& sp_array, int index): - cdef ArrayValue val =
_scalar_classes[type.type.type]() - val.init(type, sp_array, index) - return val + cdef ArrayValue val + if sp_array.get().IsNull(index): + return NA + else: + val = _scalar_classes[type.type.type]() + val.init(type, sp_array, index) + return val diff --git a/python/arrow/tests/test_scalars.py b/python/arrow/tests/test_scalars.py new file mode 100644 index 00000000000..951380bd981 --- /dev/null +++ b/python/arrow/tests/test_scalars.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from arrow.compat import unittest, u +import arrow + + +class TestScalars(unittest.TestCase): + + def test_null_singleton(self): + with self.assertRaises(Exception): + arrow.NAType() + + def test_bool(self): + pass + + def test_int64(self): + arr = arrow.from_pylist([1, 2, None]) + + v = arr[0] + assert isinstance(v, arrow.Int64Value) + assert repr(v) == "1" + assert v.as_py() == 1 + + assert arr[2] is arrow.NA + + def test_double(self): + arr = arrow.from_pylist([1.5, None, 3]) + + v = arr[0] + assert isinstance(v, arrow.DoubleValue) + assert repr(v) == "1.5" + assert v.as_py() == 1.5 + + assert arr[1] is arrow.NA + + v = arr[2] + assert v.as_py() == 3.0 + + def test_string(self): + arr = arrow.from_pylist(['foo', None, u('bar')]) + + v = arr[0] + assert isinstance(v, arrow.StringValue) + assert repr(v) == "'foo'" + assert v.as_py() == 'foo' + + assert arr[1] is arrow.NA + + v = arr[2].as_py() + assert v == 'bar' + assert isinstance(v, str) + + def test_list(self): + arr = arrow.from_pylist([['foo', None], None, ['bar'], []]) + + v = arr[0] + assert len(v) == 2 + assert isinstance(v, arrow.ListValue) + assert repr(v) == "['foo', None]" + assert v.as_py() == ['foo', None] + assert v[0].as_py() == 'foo' + assert v[1] is arrow.NA + + assert arr[1] is arrow.NA + + v = arr[3] + assert len(v) == 0 diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index ae84fa12b0d..60d6248842e 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -276,7 +276,7 @@ class Int64Converter : public TypedConverter { class DoubleConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { - int64_t val; + double val; Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i));