From ea2f3ec4a827eabcf2b8a5612049cecd56bf3386 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 5 Mar 2016 21:33:28 -0800 Subject: [PATCH 01/21] Bootstrap end-to-end exposure in Python, wrap DataType and Field types --- cpp/CMakeLists.txt | 82 ++++++++---------- cpp/src/arrow/api.h | 18 ++++ cpp/src/arrow/table/CMakeLists.txt | 15 ---- cpp/src/arrow/table/schema-test.cc | 8 +- cpp/src/arrow/type.cc | 1 + cpp/src/arrow/type.h | 45 +++++----- cpp/src/arrow/types/CMakeLists.txt | 21 ----- cpp/src/arrow/types/json.cc | 5 +- cpp/src/arrow/types/list-test.cc | 7 +- cpp/src/arrow/types/list.cc | 6 +- cpp/src/arrow/types/list.h | 1 + cpp/src/arrow/types/string.h | 7 +- cpp/src/arrow/types/struct-test.cc | 2 +- cpp/src/arrow/types/struct.cc | 2 +- cpp/src/arrow/util/CMakeLists.txt | 20 +---- python/CMakeLists.txt | 19 +++-- python/arrow/array.pxd | 24 ++++++ python/arrow/array.pyx | 30 +++++++ python/arrow/includes/arrow.pxd | 61 +++++++++++++- python/arrow/includes/pyarrow.pxd | 6 +- python/arrow/schema.pxd | 39 +++++++++ python/arrow/schema.pyx | 84 +++++++++++++++++++ python/setup.py | 2 +- python/src/pyarrow/helpers.cc | 51 +++++++++++ .../null.h => python/src/pyarrow/helpers.h | 21 ++--- 25 files changed, 417 insertions(+), 160 deletions(-) create mode 100644 python/arrow/array.pxd create mode 100644 python/arrow/array.pyx create mode 100644 python/arrow/schema.pxd create mode 100644 python/arrow/schema.pyx create mode 100644 python/src/pyarrow/helpers.cc rename cpp/src/arrow/types/null.h => python/src/pyarrow/helpers.h (77%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8042661533e..1e31265216e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,18 +37,17 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -# Enable using a custom GCC toolchain to build Arrow -if (NOT "$ENV{ARROW_GCC_ROOT}" STREQUAL "") - set(GCC_ROOT $ENV{ARROW_GCC_ROOT}) - set(CMAKE_C_COMPILER ${GCC_ROOT}/bin/gcc) - 
set(CMAKE_CXX_COMPILER ${GCC_ROOT}/bin/g++) -endif() - if(APPLE) # In newer versions of CMake, this is the default setting set(CMAKE_MACOSX_RPATH 1) endif() +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + # ---------------------------------------------------------------------- # cmake options @@ -126,38 +125,16 @@ endif () # Add common flags set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") -# Required to avoid static linking errors with dependencies -add_definitions(-fPIC) - # Determine compiler version include(CompilerInfo) if ("${COMPILER_FAMILY}" STREQUAL "clang") - # Clang helpfully provides a few extensions from C++11 such as the 'override' - # keyword on methods. This doesn't change behavior, and we selectively enable - # it in src/gutil/port.h only on clang. So, we can safely use it, and don't want - # to trigger warnings when we do so. - # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-extensions") - # Using Clang with ccache causes a bunch of spurious warnings that are # purportedly fixed in the next version of ccache. See the following for details: # # http://petereisentraut.blogspot.com/2011/05/ccache-and-clang.html # http://petereisentraut.blogspot.com/2011/09/ccache-and-clang-part-2.html set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") - - # Only hardcode -fcolor-diagnostics if stderr is opened on a terminal. Otherwise - # the color codes show up as noisy artifacts. - # - # This test is imperfect because 'cmake' and 'make' can be run independently - # (with different terminal options), and we're testing during the former. 
- execute_process(COMMAND test -t 2 RESULT_VARIABLE ARROW_IS_TTY) - if ((${ARROW_IS_TTY} EQUAL 0) AND (NOT ("$ENV{TERM}" STREQUAL "dumb"))) - message("Running in a controlling terminal") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") - else() - message("Running without a controlling terminal or in a dumb terminal") - endif() endif() # Sanity check linking option. @@ -278,12 +255,6 @@ set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") include_directories(src) -############################################################ -# Visibility -############################################################ -# For generate_export_header() and add_compiler_export_flags(). -include(GenerateExportHeader) - ############################################################ # Testing ############################################################ @@ -456,21 +427,33 @@ endif() # Subdirectories ############################################################ -add_subdirectory(src/arrow) -add_subdirectory(src/arrow/util) -add_subdirectory(src/arrow/table) -add_subdirectory(src/arrow/types) - -set(LINK_LIBS - arrow_util - arrow_table - arrow_types) +set(LIBARROW_LINK_LIBS +) set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc src/arrow/field.cc src/arrow/type.cc + + src/arrow/table/column.cc + src/arrow/table/schema.cc + src/arrow/table/table.cc + + src/arrow/types/construct.cc + src/arrow/types/floating.cc + src/arrow/types/integer.cc + src/arrow/types/json.cc + src/arrow/types/list.cc + src/arrow/types/primitive.cc + src/arrow/types/string.cc + src/arrow/types/struct.cc + src/arrow/types/union.cc + + src/arrow/util/bit-util.cc + src/arrow/util/buffer.cc + src/arrow/util/memory-pool.cc + src/arrow/util/status.cc ) set(LIBARROW_LINKAGE "SHARED") @@ -479,8 +462,15 @@ add_library(arrow ${LIBARROW_LINKAGE} ${ARROW_SRCS} ) -target_link_libraries(arrow ${LINK_LIBS}) -set_target_properties(arrow PROPERTIES 
LINKER_LANGUAGE CXX) +set_target_properties(arrow + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") +target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) + +add_subdirectory(src/arrow) +add_subdirectory(src/arrow/util) +add_subdirectory(src/arrow/table) +add_subdirectory(src/arrow/types) install(TARGETS arrow LIBRARY DESTINATION lib diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 899e8aae19c..377c8a4a257 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -15,7 +15,25 @@ // specific language governing permissions and limitations // under the License. +// Coarse public API while the library is in development + #ifndef ARROW_API_H #define ARROW_API_H +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/field.h" +#include "arrow/type.h" + +#include "arrow/table/column.h" +#include "arrow/table/schema.h" +#include "arrow/table/table.h" + +#include "arrow/types/boolean.h" +#include "arrow/types/floating.h" +#include "arrow/types/integer.h" +#include "arrow/types/list.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" + #endif // ARROW_API_H diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt index 68bf3148a98..26d843d853b 100644 --- a/cpp/src/arrow/table/CMakeLists.txt +++ b/cpp/src/arrow/table/CMakeLists.txt @@ -19,21 +19,6 @@ # arrow_table ####################################### -set(TABLE_SRCS - column.cc - schema.cc - table.cc -) - -set(TABLE_LIBS -) - -add_library(arrow_table STATIC - ${TABLE_SRCS} -) -target_link_libraries(arrow_table ${TABLE_LIBS}) -SET_TARGET_PROPERTIES(arrow_table PROPERTIES LINKER_LANGUAGE CXX) - # Headers: top level install(FILES column.h diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/table/schema-test.cc index 0cf1b3c5f9a..5b5393efecc 100644 --- a/cpp/src/arrow/table/schema-test.cc +++ b/cpp/src/arrow/table/schema-test.cc @@ -97,10 +97,10 @@ TEST_F(TestSchema, ToString) { auto schema = std::make_shared(fields); 
std::string result = schema->ToString(); - std::string expected = R"(f0 ?int32 -f1 uint8 -f2 ?string -f3 ?list + std::string expected = R"(f0 int32 +f1 uint8 not null +f2 string +f3 list )"; ASSERT_EQ(expected, result); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ff145e2c1e3..d8d2a4e98c1 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -19,6 +19,7 @@ namespace arrow { +const std::shared_ptr NA = std::make_shared(); const std::shared_ptr BOOL = std::make_shared(); const std::shared_ptr UINT8 = std::make_shared(); const std::shared_ptr UINT16 = std::make_shared(); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4193a0e8bc8..264f08269f5 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -71,49 +71,46 @@ struct LogicalType { UINT64 = 7, INT64 = 8, - // A boolean value represented as 1 byte - BOOL = 9, - // A boolean value represented as 1 bit - BIT = 10, + BOOL = 9, // 4-byte floating point value - FLOAT = 11, + FLOAT = 10, // 8-byte floating point value - DOUBLE = 12, + DOUBLE = 11, // CHAR(N): fixed-length UTF8 string with length N - CHAR = 13, + CHAR = 12, // UTF8 variable-length string as List - STRING = 14, + STRING = 13, // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1) - VARCHAR = 15, + VARCHAR = 14, // Variable-length bytes (no guarantee of UTF8-ness) - BINARY = 16, + BINARY = 15, // By default, int32 days since the UNIX epoch - DATE = 17, + DATE = 16, // Exact timestamp encoded with int64 since UNIX epoch // Default unit millisecond - TIMESTAMP = 18, + TIMESTAMP = 17, // Timestamp as double seconds since the UNIX epoch - TIMESTAMP_DOUBLE = 19, + TIMESTAMP_DOUBLE = 18, // Exact time encoded with int64, default unit millisecond - TIME = 20, + TIME = 19, // Precision- and scale-based decimal type. Storage type depends on the // parameters. 
- DECIMAL = 21, + DECIMAL = 20, // Decimal value encoded as a text string - DECIMAL_TEXT = 22, + DECIMAL_TEXT = 21, // A list of some logical data type LIST = 30, @@ -141,7 +138,9 @@ struct DataType { type(type), nullable(nullable) {} - virtual bool Equals(const DataType* other) { + virtual ~DataType() {} + + bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses return this == other || (this->type == other->type && this->nullable == other->nullable); @@ -184,11 +183,10 @@ struct PrimitiveType : public DataType { : DataType(Derived::type_enum, nullable) {} virtual std::string ToString() const { - std::string result; - if (nullable) { - result.append("?"); + std::string result(static_cast(this)->name()); + if (!nullable) { + result.append(" not null"); } - result.append(static_cast(this)->name()); return result; } }; @@ -205,6 +203,10 @@ struct PrimitiveType : public DataType { return NAME; \ } +struct NullType : public PrimitiveType { + PRIMITIVE_DECL(NullType, void, NA, 0, "null"); +}; + struct BooleanType : public PrimitiveType { PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool"); }; @@ -249,6 +251,7 @@ struct DoubleType : public PrimitiveType { PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); }; +extern const std::shared_ptr NA; extern const std::shared_ptr BOOL; extern const std::shared_ptr UINT8; extern const std::shared_ptr UINT16; diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index e090aead1f8..bae4b6235ff 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -19,27 +19,6 @@ # arrow_types ####################################### -set(TYPES_SRCS - construct.cc - floating.cc - integer.cc - json.cc - list.cc - primitive.cc - string.cc - struct.cc - union.cc -) - -set(TYPES_LIBS -) - -add_library(arrow_types STATIC - ${TYPES_SRCS} -) -target_link_libraries(arrow_types ${TYPES_LIBS}) -SET_TARGET_PROPERTIES(arrow_types PROPERTIES 
LINKER_LANGUAGE CXX) - # Headers: top level install(FILES boolean.h diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc index b29b95715fe..168e370d51a 100644 --- a/cpp/src/arrow/types/json.cc +++ b/cpp/src/arrow/types/json.cc @@ -19,10 +19,7 @@ #include -#include "arrow/types/boolean.h" -#include "arrow/types/integer.h" -#include "arrow/types/floating.h" -#include "arrow/types/null.h" +#include "arrow/type.h" #include "arrow/types/string.h" #include "arrow/types/union.h" diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index b4bbd2841a8..cec13995142 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -47,17 +47,18 @@ TEST(TypesTest, TestListType) { ASSERT_EQ(list_type.type, LogicalType::LIST); ASSERT_EQ(list_type.name(), string("list")); - ASSERT_EQ(list_type.ToString(), string("?list")); + ASSERT_EQ(list_type.ToString(), string("list")); ASSERT_EQ(list_type.value_type->type, vt->type); ASSERT_EQ(list_type.value_type->type, vt->type); std::shared_ptr st = std::make_shared(false); std::shared_ptr lt = std::make_shared(st, false); - ASSERT_EQ(lt->ToString(), string("list")); + ASSERT_EQ(lt->ToString(), string("list not null")); ListType lt2(lt, false); - ASSERT_EQ(lt2.ToString(), string("list>")); + ASSERT_EQ(lt2.ToString(), + string("list not null> not null")); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 577d71d0b28..5a47ce3187c 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -24,10 +24,10 @@ namespace arrow { std::string ListType::ToString() const { std::stringstream s; - if (this->nullable) { - s << "?"; - } s << "list<" << value_type->ToString() << ">"; + if (!this->nullable) { + s << " not null"; + } return s.str(); } diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f39fe5c4d81..0539ac87e0c 100644 --- 
a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -43,6 +43,7 @@ struct ListType : public DataType { explicit ListType(const TypePtr& value_type, bool nullable = true) : DataType(LogicalType::LIST, nullable), value_type(value_type) {} + virtual ~ListType() {} static char const *name() { return "list"; diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 5795cfed577..7daf62fadf5 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -84,11 +84,10 @@ struct StringType : public DataType { } virtual std::string ToString() const { - std::string result; - if (nullable) { - result.append("?"); + std::string result(name()); + if (!nullable) { + result.append(" not null"); } - result.append(name()); return result; } }; diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index df615710479..1a9fc6be4a5 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -49,7 +49,7 @@ TEST(TestStructType, Basics) { ASSERT_TRUE(struct_type.field(1).Equals(f1)); ASSERT_TRUE(struct_type.field(2).Equals(f2)); - ASSERT_EQ(struct_type.ToString(), "?struct"); + ASSERT_EQ(struct_type.ToString(), "struct"); // TODO: out of bounds for field(...) 
} diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 6b233bc372a..ee2a41c56c9 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -26,7 +26,6 @@ namespace arrow { std::string StructType::ToString() const { std::stringstream s; - if (nullable) s << "?"; s << "struct<"; for (size_t i = 0; i < fields_.size(); ++i) { if (i > 0) s << ", "; @@ -34,6 +33,7 @@ std::string StructType::ToString() const { s << field.name << ": " << field.type->ToString(); } s << ">"; + if (!nullable) s << " not null"; return s.str(); } diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index c53f307c9f5..4272ce42854 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -19,22 +19,6 @@ # arrow_util ####################################### -set(UTIL_SRCS - bit-util.cc - buffer.cc - memory-pool.cc - status.cc -) - -set(UTIL_LIBS -) - -add_library(arrow_util STATIC - ${UTIL_SRCS} -) -target_link_libraries(arrow_util ${UTIL_LIBS}) -SET_TARGET_PROPERTIES(arrow_util PROPERTIES LINKER_LANGUAGE CXX) - # Headers: top level install(FILES bit-util.h @@ -50,7 +34,7 @@ install(FILES add_library(arrow_test_util) target_link_libraries(arrow_test_util - arrow_util) +) SET_TARGET_PROPERTIES(arrow_test_util PROPERTIES LINKER_LANGUAGE CXX) @@ -64,7 +48,6 @@ add_library(arrow_test_main if (APPLE) target_link_libraries(arrow_test_main gtest - arrow_util arrow_test_util dl) set_target_properties(arrow_test_main @@ -72,7 +55,6 @@ if (APPLE) else() target_link_libraries(arrow_test_main gtest - arrow_util arrow_test_util pthread dl diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index df55bfac9eb..2ea79042bc5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,6 +45,12 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY 
RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + ############################################################ # Compiler flags ############################################################ @@ -389,6 +395,7 @@ add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) set(PYARROW_SRCS + src/pyarrow/helpers.cc src/pyarrow/init.cc ) @@ -410,11 +417,12 @@ endif() # Setup and build Cython modules ############################################################ -foreach(pyx_api_file - arrow/config.pyx - arrow/parquet.pyx) - set_source_files_properties(${pyx_api_file} PROPERTIES CYTHON_API 1) -endforeach(pyx_api_file) +# foreach(pyx_api_file +# arrow/config.pyx +# arrow/parquet.pyx +# arrow/schema.pyx) +# set_source_files_properties(${pyx_api_file} PROPERTIES CYTHON_API 1) +# endforeach(pyx_api_file) set(USE_RELATIVE_RPATH ON) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) @@ -422,6 +430,7 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CYTHON_EXTENSIONS config parquet + schema ) foreach(module ${CYTHON_EXTENSIONS}) diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd new file mode 100644 index 00000000000..07e7fd335b3 --- /dev/null +++ b/python/arrow/array.pxd @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.common cimport shared_ptr +from arrow.includes.arrow cimport CArray + +cdef class Array: + cdef: + shared_ptr[CArray] sp_array + CArray* array diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx new file mode 100644 index 00000000000..1f05493f84f --- /dev/null +++ b/python/arrow/array.pyx @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from arrow.compat import frombytes, tobytes +from arrow.includes.arrow cimport * + + +def from_list(list_obj, type=None): + """ + Convert Python list to Arrow array + """ + pass diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index 3635ceb8685..8b71f22cc08 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -20,4 +20,63 @@ from arrow.includes.common cimport * cdef extern from "arrow/api.h" namespace "arrow" nogil: - pass + + enum LogicalType" arrow::LogicalType::type": + LogicalType_BOOL" arrow::LogicalType::BOOL" + + LogicalType_UINT8" arrow::LogicalType::UINT8" + LogicalType_INT8" arrow::LogicalType::INT8" + LogicalType_UINT16" arrow::LogicalType::UINT16" + LogicalType_INT16" arrow::LogicalType::INT16" + LogicalType_UINT32" arrow::LogicalType::UINT32" + LogicalType_INT32" arrow::LogicalType::INT32" + LogicalType_UINT64" arrow::LogicalType::UINT64" + LogicalType_INT64" arrow::LogicalType::INT64" + + LogicalType_FLOAT" arrow::LogicalType::FLOAT" + LogicalType_DOUBLE" arrow::LogicalType::DOUBLE" + + LogicalType_STRING" arrow::LogicalType::STRING" + + cdef cppclass CDataType" arrow::DataType": + LogicalType type + c_bool nullable + + string ToString() + + cdef cppclass CListType" arrow::ListType"(CDataType): + CListType(const shared_ptr[CDataType]& value_type, + c_bool nullable) + + cdef cppclass CStringType" arrow::StringType"(CDataType): + pass + + cdef cppclass CField" arrow::Field": + string name + shared_ptr[CDataType] type + + CField(const string& name, const shared_ptr[CDataType]& type) + + cdef cppclass CSchema" arrow::Schema": + pass + + cdef cppclass CArray" arrow::Array": + const shared_ptr[CDataType]& type() + + int32_t length() + int32_t null_count() + LogicalType logical_type() + + c_bool IsNull(int i) + + cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): + pass + + cdef cppclass CInt8Array" 
arrow::Int8Array"(CArray): + pass + + cdef cppclass CListArray" arrow::ListArray"(CArray): + pass + + cdef cppclass CStringArray" arrow::StringArray"(CListArray): + pass diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd index dcef663f389..5b6d87a841a 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/arrow/includes/pyarrow.pxd @@ -18,6 +18,10 @@ # distutils: language = c++ from arrow.includes.common cimport * +from arrow.includes.arrow cimport LogicalType, CDataType -cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: +cdef extern from "pyarrow/api.h" namespace "arrow::py" nogil: pass + +cdef extern from "pyarrow/helpers.h" namespace "arrow::py" nogil: + shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable); diff --git a/python/arrow/schema.pxd b/python/arrow/schema.pxd new file mode 100644 index 00000000000..487c246f44a --- /dev/null +++ b/python/arrow/schema.pxd @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from arrow.includes.common cimport shared_ptr +from arrow.includes.arrow cimport CDataType, CField, CSchema + +cdef class DataType: + cdef: + shared_ptr[CDataType] sp_type + CDataType* type + + cdef init(self, const shared_ptr[CDataType]& type) + +cdef class Field: + cdef: + shared_ptr[CField] sp_field + CField* field + + cdef readonly: + DataType type + +cdef class Schema: + cdef: + shared_ptr[CSchema] sp_schema + CSchema* schema diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx new file mode 100644 index 00000000000..6f6a2dff354 --- /dev/null +++ b/python/arrow/schema.pyx @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +######################################## +# Data types, fields, schemas, and so forth + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from arrow.compat import frombytes, tobytes +from arrow.includes.arrow cimport * +cimport arrow.includes.pyarrow as pyarrow + +cdef class DataType: + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CDataType]& type): + self.sp_type = type + self.type = type.get() + + def __repr__(self): + return 'DataType({0})'.format(self._type_repr()) + + def _type_repr(self): + return frombytes(self.type.ToString()) + +cdef class Field: + + def __cinit__(self, object name, DataType type): + self.type = type + self.sp_field.reset(new CField(tobytes(name), type.sp_type)) + self.field = self.sp_field.get() + + def __repr__(self): + return 'Field({0}, type={1})'.format(self.name, + self.type._type_repr()) + + property name: + + def __get__(self): + return frombytes(self.field.name) + + +cdef DataType primitive_type(LogicalType type, bint nullable=True): + cdef DataType out = DataType() + out.init(pyarrow.GetPrimitiveType(type, nullable)) + return out + +#------------------------------------------------------------ +# Type factory functions + +def uint32(c_bool nullable=True): + return primitive_type(LogicalType_UINT32, nullable) + +def int32(c_bool nullable=True): + return primitive_type(LogicalType_INT32, nullable) + +def list(DataType value_type, c_bool nullable=True): + cdef DataType out = DataType() + + cdef shared_ptr[CDataType] tp + tp.reset( new CListType(value_type.sp_type, nullable)) + out.init(tp) + return out + +def struct(fields, c_bool nullable=True): + pass diff --git a/python/setup.py b/python/setup.py index f6b0a4bee83..c67351ede29 100644 --- a/python/setup.py +++ b/python/setup.py @@ -207,7 +207,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['config', 'parquet'] + return ['config', 'parquet', 'schema'] def 
get_names(self): return self._found_names diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc new file mode 100644 index 00000000000..651e77ed22f --- /dev/null +++ b/python/src/pyarrow/helpers.cc @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "pyarrow/helpers.h" + +#include + +namespace arrow { + +namespace py { + +#define GET_PRIMITIVE_TYPE(NAME, Type) \ + case LogicalType::NAME: \ + if (nullable) { \ + return NAME; \ + } else { \ + return std::make_shared(nullable); \ + } \ + break; + +std::shared_ptr GetPrimitiveType(LogicalType::type type, + bool nullable) { + switch (type) { + case LogicalType::NA: + return NA; + GET_PRIMITIVE_TYPE(UINT8, UInt8Type); + GET_PRIMITIVE_TYPE(INT8, Int8Type); + GET_PRIMITIVE_TYPE(UINT32, UInt32Type); + GET_PRIMITIVE_TYPE(INT32, Int32Type); + default: + return nullptr; + } +} + +} // namespace py + +} // namespace arrow diff --git a/cpp/src/arrow/types/null.h b/python/src/pyarrow/helpers.h similarity index 77% rename from cpp/src/arrow/types/null.h rename to python/src/pyarrow/helpers.h index c67f752d409..d4ab13eb557 100644 --- a/cpp/src/arrow/types/null.h +++ b/python/src/pyarrow/helpers.h @@ -15,20 +15,21 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_TYPES_NULL_H -#define ARROW_TYPES_NULL_H +#ifndef PYARROW_HELPERS_H +#define PYARROW_HELPERS_H -#include -#include - -#include "arrow/type.h" +#include +#include namespace arrow { -struct NullType : public PrimitiveType { - PRIMITIVE_DECL(NullType, void, NA, 0, "null"); -}; +namespace py { + +std::shared_ptr GetPrimitiveType(LogicalType::type type, + bool nullable); + +} // namespace py } // namespace arrow -#endif // ARROW_TYPES_NULL_H +#endif // PYARROW_HELPERS_H From 8f7edaf77c14342cee84f65839cbad132edf5706 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 5 Mar 2016 22:08:49 -0800 Subject: [PATCH 02/21] Consolidate Field and data type subclasses. 
Add more Python stubs --- cpp/CMakeLists.txt | 1 - cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/api.h | 1 - cpp/src/arrow/field.cc | 31 ---------- cpp/src/arrow/field.h | 63 -------------------- cpp/src/arrow/table/column-test.cc | 1 - cpp/src/arrow/table/column.cc | 2 +- cpp/src/arrow/table/column.h | 2 +- cpp/src/arrow/table/schema-test.cc | 1 - cpp/src/arrow/table/schema.cc | 2 +- cpp/src/arrow/table/schema.h | 1 - cpp/src/arrow/table/table-test.cc | 1 - cpp/src/arrow/table/table.cc | 2 +- cpp/src/arrow/table/test-common.h | 1 - cpp/src/arrow/type.cc | 32 ++++++++++ cpp/src/arrow/type.h | 96 +++++++++++++++++++++++++++++- cpp/src/arrow/types/list.cc | 12 ---- cpp/src/arrow/types/list.h | 16 ----- cpp/src/arrow/types/string.h | 21 ------- cpp/src/arrow/types/struct-test.cc | 17 +++--- cpp/src/arrow/types/struct.cc | 18 ------ cpp/src/arrow/types/struct.h | 21 +------ python/arrow/__init__.py | 24 ++++++++ python/arrow/includes/arrow.pxd | 12 ++-- python/arrow/includes/common.pxd | 4 +- python/arrow/schema.pyx | 56 +++++++++++++++-- python/src/pyarrow/helpers.cc | 8 +++ 27 files changed, 234 insertions(+), 213 deletions(-) delete mode 100644 cpp/src/arrow/field.cc delete mode 100644 cpp/src/arrow/field.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1e31265216e..e8cb88c0b4d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -433,7 +433,6 @@ set(LIBARROW_LINK_LIBS set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc - src/arrow/field.cc src/arrow/type.cc src/arrow/table/column.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 102a8a1853f..77326ce38d7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -20,7 +20,6 @@ install(FILES api.h array.h builder.h - field.h type.h DESTINATION include/arrow) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 377c8a4a257..7620450d96f 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -22,7 +22,6 @@ #include 
"arrow/array.h" #include "arrow/builder.h" -#include "arrow/field.h" #include "arrow/type.h" #include "arrow/table/column.h" diff --git a/cpp/src/arrow/field.cc b/cpp/src/arrow/field.cc deleted file mode 100644 index 4568d905c29..00000000000 --- a/cpp/src/arrow/field.cc +++ /dev/null @@ -1,31 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/field.h" - -#include -#include - -namespace arrow { - -std::string Field::ToString() const { - std::stringstream ss; - ss << this->name << " " << this->type->ToString(); - return ss.str(); -} - -} // namespace arrow diff --git a/cpp/src/arrow/field.h b/cpp/src/arrow/field.h deleted file mode 100644 index 89a450c66f2..00000000000 --- a/cpp/src/arrow/field.h +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_FIELD_H -#define ARROW_FIELD_H - -#include - -#include "arrow/type.h" - -namespace arrow { - -// A field is a piece of metadata that includes (for now) a name and a data -// type - -struct Field { - // Field name - std::string name; - - // The field's data type - TypePtr type; - - Field(const std::string& name, const TypePtr& type) : - name(name), - type(type) {} - - bool operator==(const Field& other) const { - return this->Equals(other); - } - - bool operator!=(const Field& other) const { - return !this->Equals(other); - } - - bool Equals(const Field& other) const { - return (this == &other) || (this->name == other.name && - this->type->Equals(other.type.get())); - } - - bool nullable() const { - return this->type->nullable; - } - - std::string ToString() const; -}; - -} // namespace arrow - -#endif // ARROW_FIELD_H diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/table/column-test.cc index 4959b82c6e2..bf95932916c 100644 --- a/cpp/src/arrow/table/column-test.cc +++ b/cpp/src/arrow/table/column-test.cc @@ -21,7 +21,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" #include "arrow/table/test-common.h" diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/table/column.cc index d68b491fb99..573e6508759 100644 --- a/cpp/src/arrow/table/column.cc +++ b/cpp/src/arrow/table/column.cc @@ -20,7 +20,7 @@ #include #include -#include "arrow/field.h" +#include "arrow/type.h" #include "arrow/util/status.h" namespace arrow { diff --git 
a/cpp/src/arrow/table/column.h b/cpp/src/arrow/table/column.h index 64423bf9561..dfc7516e26a 100644 --- a/cpp/src/arrow/table/column.h +++ b/cpp/src/arrow/table/column.h @@ -23,7 +23,7 @@ #include #include "arrow/array.h" -#include "arrow/field.h" +#include "arrow/type.h" namespace arrow { diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/table/schema-test.cc index 5b5393efecc..d6725cc08c0 100644 --- a/cpp/src/arrow/table/schema-test.cc +++ b/cpp/src/arrow/table/schema-test.cc @@ -20,7 +20,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/schema.h" #include "arrow/type.h" #include "arrow/types/string.h" diff --git a/cpp/src/arrow/table/schema.cc b/cpp/src/arrow/table/schema.cc index fb3b4d6f292..d49d0a713e7 100644 --- a/cpp/src/arrow/table/schema.cc +++ b/cpp/src/arrow/table/schema.cc @@ -22,7 +22,7 @@ #include #include -#include "arrow/field.h" +#include "arrow/type.h" namespace arrow { diff --git a/cpp/src/arrow/table/schema.h b/cpp/src/arrow/table/schema.h index d04e3f628c1..103f01b26e3 100644 --- a/cpp/src/arrow/table/schema.h +++ b/cpp/src/arrow/table/schema.h @@ -22,7 +22,6 @@ #include #include -#include "arrow/field.h" #include "arrow/type.h" namespace arrow { diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table/table-test.cc index dd4f74cd16f..c4fdb062db8 100644 --- a/cpp/src/arrow/table/table-test.cc +++ b/cpp/src/arrow/table/table-test.cc @@ -21,7 +21,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" #include "arrow/table/table.h" diff --git a/cpp/src/arrow/table/table.cc b/cpp/src/arrow/table/table.cc index 4cefc924ed3..0c788b8fe3f 100644 --- a/cpp/src/arrow/table/table.cc +++ b/cpp/src/arrow/table/table.cc @@ -20,9 +20,9 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" +#include "arrow/type.h" #include "arrow/util/status.h" namespace arrow { diff --git 
a/cpp/src/arrow/table/test-common.h b/cpp/src/arrow/table/test-common.h index efe2f228cd0..50a5f6a2f50 100644 --- a/cpp/src/arrow/table/test-common.h +++ b/cpp/src/arrow/table/test-common.h @@ -21,7 +21,6 @@ #include #include -#include "arrow/field.h" #include "arrow/table/column.h" #include "arrow/table/schema.h" #include "arrow/table/table.h" diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index d8d2a4e98c1..7c0210e2498 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -17,8 +17,17 @@ #include "arrow/type.h" +#include +#include + namespace arrow { +std::string Field::ToString() const { + std::stringstream ss; + ss << this->name << " " << this->type->ToString(); + return ss.str(); +} + const std::shared_ptr NA = std::make_shared(); const std::shared_ptr BOOL = std::make_shared(); const std::shared_ptr UINT8 = std::make_shared(); @@ -31,5 +40,28 @@ const std::shared_ptr INT32 = std::make_shared(); const std::shared_ptr INT64 = std::make_shared(); const std::shared_ptr FLOAT = std::make_shared(); const std::shared_ptr DOUBLE = std::make_shared(); +const std::shared_ptr STRING = std::make_shared(); + +std::string ListType::ToString() const { + std::stringstream s; + s << "list<" << value_type->ToString() << ">"; + if (!this->nullable) { + s << " not null"; + } + return s.str(); +} + +std::string StructType::ToString() const { + std::stringstream s; + s << "struct<"; + for (size_t i = 0; i < fields_.size(); ++i) { + if (i > 0) s << ", "; + const std::shared_ptr& field = fields_[i]; + s << field->name << ": " << field->type->ToString(); + } + s << ">"; + if (!nullable) s << " not null"; + return s.str(); +} } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 264f08269f5..2890e02cd89 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -20,6 +20,7 @@ #include #include +#include namespace arrow { @@ -153,10 +154,45 @@ struct DataType { virtual std::string ToString() const = 0; }; - typedef 
std::shared_ptr LayoutPtr; typedef std::shared_ptr TypePtr; +// A field is a piece of metadata that includes (for now) a name and a data +// type +struct Field { + // Field name + std::string name; + + // The field's data type + TypePtr type; + + Field(const std::string& name, const TypePtr& type) : + name(name), + type(type) {} + + bool operator==(const Field& other) const { + return this->Equals(other); + } + + bool operator!=(const Field& other) const { + return !this->Equals(other); + } + + bool Equals(const Field& other) const { + return (this == &other) || (this->name == other.name && + this->type->Equals(other.type.get())); + } + + bool Equals(const std::shared_ptr& other) const { + return Equals(*other.get()); + } + + bool nullable() const { + return this->type->nullable; + } + + std::string ToString() const; +}; struct BytesType : public LayoutType { int size; @@ -251,6 +287,63 @@ struct DoubleType : public PrimitiveType { PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); }; +struct ListType : public DataType { + // List can contain any other logical value type + TypePtr value_type; + + explicit ListType(const TypePtr& value_type, bool nullable = true) + : DataType(LogicalType::LIST, nullable), + value_type(value_type) {} + virtual ~ListType() {} + + static char const *name() { + return "list"; + } + + virtual std::string ToString() const; +}; + +// String is a logical type consisting of a physical list of 1-byte values +struct StringType : public DataType { + explicit StringType(bool nullable = true) + : DataType(LogicalType::STRING, nullable) {} + + StringType(const StringType& other) + : StringType() {} + + static char const *name() { + return "string"; + } + + virtual std::string ToString() const { + std::string result(name()); + if (!nullable) { + result.append(" not null"); + } + return result; + } +}; + +struct StructType : public DataType { + std::vector > fields_; + + explicit StructType(const std::vector >& fields, + bool nullable = true) 
+ : DataType(LogicalType::STRUCT, nullable) { + fields_ = fields; + } + + const std::shared_ptr& field(int i) const { + return fields_[i]; + } + + int num_children() const { + return fields_.size(); + } + + virtual std::string ToString() const; +}; + extern const std::shared_ptr NA; extern const std::shared_ptr BOOL; extern const std::shared_ptr UINT8; @@ -263,6 +356,7 @@ extern const std::shared_ptr INT32; extern const std::shared_ptr INT64; extern const std::shared_ptr FLOAT; extern const std::shared_ptr DOUBLE; +extern const std::shared_ptr STRING; } // namespace arrow diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 5a47ce3187c..69a79a77fab 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -17,18 +17,6 @@ #include "arrow/types/list.h" -#include -#include - namespace arrow { -std::string ListType::ToString() const { - std::stringstream s; - s << "list<" << value_type->ToString() << ">"; - if (!this->nullable) { - s << " not null"; - } - return s.str(); -} - } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 0539ac87e0c..6df842fdeaf 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -36,22 +36,6 @@ namespace arrow { class MemoryPool; -struct ListType : public DataType { - // List can contain any other logical value type - TypePtr value_type; - - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(LogicalType::LIST, nullable), - value_type(value_type) {} - virtual ~ListType() {} - - static char const *name() { - return "list"; - } - - virtual std::string ToString() const; -}; - class ListArray : public Array { public: ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 7daf62fadf5..94069c7ae93 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -71,27 +71,6 @@ struct VarcharType : public 
DataType { static const LayoutPtr byte1(new BytesType(1)); static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); -// String is a logical type consisting of a physical list of 1-byte values -struct StringType : public DataType { - explicit StringType(bool nullable = true) - : DataType(LogicalType::STRING, nullable) {} - - StringType(const StringType& other) - : StringType() {} - - static char const *name() { - return "string"; - } - - virtual std::string ToString() const { - std::string result(name()); - if (!nullable) { - result.append(" not null"); - } - return result; - } -}; - // TODO: add a BinaryArray layer in between class StringArray : public ListArray { public: diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 1a9fc6be4a5..9a4777e8b98 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -17,15 +17,16 @@ #include +#include #include #include -#include "arrow/field.h" #include "arrow/type.h" #include "arrow/types/integer.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +using std::shared_ptr; using std::string; using std::vector; @@ -33,21 +34,21 @@ namespace arrow { TEST(TestStructType, Basics) { TypePtr f0_type = TypePtr(new Int32Type()); - Field f0("f0", f0_type); + auto f0 = std::make_shared("f0", f0_type); TypePtr f1_type = TypePtr(new StringType()); - Field f1("f1", f1_type); + auto f1 = std::make_shared("f1", f1_type); TypePtr f2_type = TypePtr(new UInt8Type()); - Field f2("f2", f2_type); + auto f2 = std::make_shared("f2", f2_type); - vector fields = {f0, f1, f2}; + vector > fields = {f0, f1, f2}; StructType struct_type(fields); - ASSERT_TRUE(struct_type.field(0).Equals(f0)); - ASSERT_TRUE(struct_type.field(1).Equals(f1)); - ASSERT_TRUE(struct_type.field(2).Equals(f2)); + ASSERT_TRUE(struct_type.field(0)->Equals(f0)); + ASSERT_TRUE(struct_type.field(1)->Equals(f1)); + ASSERT_TRUE(struct_type.field(2)->Equals(f2)); 
ASSERT_EQ(struct_type.ToString(), "struct"); diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index ee2a41c56c9..02af600b017 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -17,24 +17,6 @@ #include "arrow/types/struct.h" -#include -#include -#include -#include - namespace arrow { -std::string StructType::ToString() const { - std::stringstream s; - s << "struct<"; - for (size_t i = 0; i < fields_.size(); ++i) { - if (i > 0) s << ", "; - const Field& field = fields_[i]; - s << field.name << ": " << field.type->ToString(); - } - s << ">"; - if (!nullable) s << " not null"; - return s.str(); -} - } // namespace arrow diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index e575c31287c..5842534d35b 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -18,33 +18,14 @@ #ifndef ARROW_TYPES_STRUCT_H #define ARROW_TYPES_STRUCT_H +#include #include #include -#include "arrow/field.h" #include "arrow/type.h" namespace arrow { -struct StructType : public DataType { - std::vector fields_; - - explicit StructType(const std::vector& fields, bool nullable = true) - : DataType(LogicalType::STRUCT, nullable) { - fields_ = fields; - } - - const Field& field(int i) const { - return fields_[i]; - } - - int num_children() const { - return fields_.size(); - } - - virtual std::string ToString() const; -}; - } // namespace arrow #endif // ARROW_TYPES_STRUCT_H diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index e69de29bb2d..08f933d3d40 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +from arrow.schema import (bool_, int8, int16, int32, int64, + uint8, uint16, uint32, uint64, + float_, double, string, + list_, struct, + DataType, Field, Schema) diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index 8b71f22cc08..2385eb98eb3 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -42,7 +42,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: LogicalType type c_bool nullable - string ToString() + c_string ToString() cdef cppclass CListType" arrow::ListType"(CDataType): CListType(const shared_ptr[CDataType]& value_type, @@ -52,13 +52,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: pass cdef cppclass CField" arrow::Field": - string name + c_string name shared_ptr[CDataType] type - CField(const string& name, const shared_ptr[CDataType]& type) + CField(const c_string& name, const shared_ptr[CDataType]& type) + + cdef cppclass CStructType" arrow::StructType"(CDataType): + CStructType(const vector[shared_ptr[CField]]& fields, + c_bool nullable) cdef cppclass CSchema" arrow::Schema": - pass + CSchema(const shared_ptr[CField]& fields) cdef cppclass CArray" arrow::Array": const shared_ptr[CDataType]& type() diff --git a/python/arrow/includes/common.pxd b/python/arrow/includes/common.pxd index f2fc826625e..839427a6990 100644 --- a/python/arrow/includes/common.pxd +++ 
b/python/arrow/includes/common.pxd @@ -19,7 +19,7 @@ from libc.stdint cimport * from libcpp cimport bool as c_bool -from libcpp.string cimport string +from libcpp.string cimport string as c_string from libcpp.vector cimport vector # This must be included for cerr and other things to work @@ -29,6 +29,8 @@ cdef extern from "": cdef extern from "" namespace "std" nogil: cdef cppclass shared_ptr[T]: + shared_ptr() + shared_ptr(T*) T* get() void reset() void reset(T* p) diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index 6f6a2dff354..ada46761128 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -66,19 +66,63 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True): #------------------------------------------------------------ # Type factory functions +def bool_(c_bool nullable=True): + return primitive_type(LogicalType_BOOL, nullable) + +def uint8(c_bool nullable=True): + return primitive_type(LogicalType_UINT8, nullable) + +def int8(c_bool nullable=True): + return primitive_type(LogicalType_INT8, nullable) + +def uint16(c_bool nullable=True): + return primitive_type(LogicalType_UINT16, nullable) + +def int16(c_bool nullable=True): + return primitive_type(LogicalType_INT16, nullable) + def uint32(c_bool nullable=True): return primitive_type(LogicalType_UINT32, nullable) def int32(c_bool nullable=True): return primitive_type(LogicalType_INT32, nullable) -def list(DataType value_type, c_bool nullable=True): - cdef DataType out = DataType() +def uint64(c_bool nullable=True): + return primitive_type(LogicalType_UINT64, nullable) + +def int64(c_bool nullable=True): + return primitive_type(LogicalType_INT64, nullable) + +def float_(c_bool nullable=True): + return primitive_type(LogicalType_FLOAT, nullable) - cdef shared_ptr[CDataType] tp - tp.reset( new CListType(value_type.sp_type, nullable)) - out.init(tp) +def double(c_bool nullable=True): + return primitive_type(LogicalType_DOUBLE, nullable) + +def string(c_bool 
nullable=True): + """ + UTF8 string + """ + return primitive_type(LogicalType_STRING, nullable) + +def list_(DataType value_type, c_bool nullable=True): + cdef DataType out = DataType() + out.init(shared_ptr[CDataType]( + new CListType(value_type.sp_type, nullable))) return out def struct(fields, c_bool nullable=True): - pass + """ + + """ + cdef: + DataType out = DataType() + Field field + vector[shared_ptr[CField]] c_fields + + for field in fields: + c_fields.push_back(field.sp_field) + + out.init(shared_ptr[CDataType]( + new CStructType(c_fields, nullable))) + return out diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index 651e77ed22f..4f63b47ed46 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -39,8 +39,16 @@ std::shared_ptr GetPrimitiveType(LogicalType::type type, return NA; GET_PRIMITIVE_TYPE(UINT8, UInt8Type); GET_PRIMITIVE_TYPE(INT8, Int8Type); + GET_PRIMITIVE_TYPE(UINT16, UInt16Type); + GET_PRIMITIVE_TYPE(INT16, Int16Type); GET_PRIMITIVE_TYPE(UINT32, UInt32Type); GET_PRIMITIVE_TYPE(INT32, Int32Type); + GET_PRIMITIVE_TYPE(UINT64, UInt64Type); + GET_PRIMITIVE_TYPE(INT64, Int64Type); + GET_PRIMITIVE_TYPE(BOOL, BooleanType); + GET_PRIMITIVE_TYPE(FLOAT, FloatType); + GET_PRIMITIVE_TYPE(DOUBLE, DoubleType); + GET_PRIMITIVE_TYPE(STRING, StringType); default: return nullptr; } From ac8c796f397425c06ad2076265794b9a316ab8b6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 5 Mar 2016 22:15:17 -0800 Subject: [PATCH 03/21] Cache primitive data type instances --- python/arrow/schema.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index ada46761128..c5bb61a3e6f 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -57,10 +57,16 @@ cdef class Field: def __get__(self): return frombytes(self.field.name) +cdef dict _type_cache = {} cdef DataType primitive_type(LogicalType type, bint nullable=True): + if (type, nullable) in 
_type_cache: + return _type_cache[type, nullable] + cdef DataType out = DataType() out.init(pyarrow.GetPrimitiveType(type, nullable)) + + _type_cache[type, nullable] = out return out #------------------------------------------------------------ From 55e69a2ef520d56de96a8cc728f731639ac90bcb Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 5 Mar 2016 23:23:23 -0800 Subject: [PATCH 04/21] Typed array stubs --- python/arrow/array.pxd | 49 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd index 07e7fd335b3..52f6d2082de 100644 --- a/python/arrow/array.pxd +++ b/python/arrow/array.pxd @@ -16,9 +16,56 @@ # under the License. from arrow.includes.common cimport shared_ptr -from arrow.includes.arrow cimport CArray +from arrow.includes.arrow cimport CArray, LogicalType cdef class Array: cdef: shared_ptr[CArray] sp_array CArray* array + + def __len__(self): + return self.array.length() + + +cdef class BooleanArray(Array): + pass + + +cdef class NumericArray(Array): + pass + + +cdef class Int8Array(NumericArray): + pass + + +cdef class UInt8Array(NumericArray): + pass + + +cdef class Int16Array(NumericArray): + pass + + +cdef class UInt16Array(NumericArray): + pass + + +cdef class Int32Array(NumericArray): + pass + + +cdef class UInt32Array(NumericArray): + pass + + +cdef class Int64Array(NumericArray): + pass + + +cdef class UInt64Array(NumericArray): + pass + + +cdef class StringArray(NumericArray): + pass From 4132bda76b462dbe4864ff5218dad0894a804a52 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 15:36:04 -0800 Subject: [PATCH 05/21] Essential scaffolding -- error handling, memory pools, etc. 
-- to work toward converting Python lists to Arrow arrays --- python/CMakeLists.txt | 12 +- python/arrow/config.pyx | 2 +- python/arrow/error.pxd | 20 +++ python/arrow/error.pyx | 28 +++ python/arrow/includes/pyarrow.pxd | 22 ++- python/arrow/scalar.pxd | 39 +++++ python/setup.py | 2 +- python/src/pyarrow/adapters/builtin.cc | 226 +++++++++++++++++++++++++ python/src/pyarrow/adapters/builtin.h | 40 +++++ python/src/pyarrow/api.h | 2 + python/src/pyarrow/common.cc | 71 ++++++++ python/src/pyarrow/common.h | 61 +++++++ python/src/pyarrow/helpers.cc | 8 +- python/src/pyarrow/helpers.h | 9 +- python/src/pyarrow/init.cc | 8 +- python/src/pyarrow/init.h | 8 +- python/src/pyarrow/status.cc | 38 +++++ python/src/pyarrow/status.h | 144 ++++++++++++++++ 18 files changed, 706 insertions(+), 34 deletions(-) create mode 100644 python/arrow/error.pxd create mode 100644 python/arrow/error.pyx create mode 100644 python/arrow/scalar.pxd create mode 100644 python/src/pyarrow/adapters/builtin.cc create mode 100644 python/src/pyarrow/adapters/builtin.h create mode 100644 python/src/pyarrow/common.cc create mode 100644 python/src/pyarrow/common.h create mode 100644 python/src/pyarrow/status.cc create mode 100644 python/src/pyarrow/status.h diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 2ea79042bc5..e6069adc06b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -395,8 +395,12 @@ add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) set(PYARROW_SRCS + src/pyarrow/common.cc src/pyarrow/helpers.cc src/pyarrow/init.cc + src/pyarrow/status.cc + + src/pyarrow/adapters/builtin.cc ) set(LINK_LIBS @@ -417,18 +421,12 @@ endif() # Setup and build Cython modules ############################################################ -# foreach(pyx_api_file -# arrow/config.pyx -# arrow/parquet.pyx -# arrow/schema.pyx) -# set_source_files_properties(${pyx_api_file} PROPERTIES CYTHON_API 1) -# endforeach(pyx_api_file) - set(USE_RELATIVE_RPATH ON) 
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CYTHON_EXTENSIONS config + error parquet schema ) diff --git a/python/arrow/config.pyx b/python/arrow/config.pyx index 8f10beb3a2e..521bc066cd4 100644 --- a/python/arrow/config.pyx +++ b/python/arrow/config.pyx @@ -2,7 +2,7 @@ # distutils: language = c++ # cython: embedsignature = True -cdef extern from 'pyarrow/init.h' namespace 'arrow::py': +cdef extern from 'pyarrow/init.h' namespace 'pyarrow': void pyarrow_init() pyarrow_init() diff --git a/python/arrow/error.pxd b/python/arrow/error.pxd new file mode 100644 index 00000000000..c18cb3efffc --- /dev/null +++ b/python/arrow/error.pxd @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.pyarrow cimport * + +cdef check_status(const Status& status) diff --git a/python/arrow/error.pyx b/python/arrow/error.pyx new file mode 100644 index 00000000000..7c301e5d872 --- /dev/null +++ b/python/arrow/error.pyx @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.common cimport c_string + +class ArrowException(Exception): + pass + +cdef check_status(const Status& status): + if status.ok(): + return + + cdef c_string c_message = status.ToString() + return ArrowException(c_message) diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd index 5b6d87a841a..ff591c81b84 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/arrow/includes/pyarrow.pxd @@ -20,8 +20,24 @@ from arrow.includes.common cimport * from arrow.includes.arrow cimport LogicalType, CDataType -cdef extern from "pyarrow/api.h" namespace "arrow::py" nogil: - pass +cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: + # We can later add more of the common status factory methods as needed + cdef Status Status_OK "Status::OK"() -cdef extern from "pyarrow/helpers.h" namespace "arrow::py" nogil: + cdef cppclass Status: + Status() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsTypeError() + c_bool IsIOError() + c_bool IsValueError() + c_bool IsNotImplemented() + c_bool IsArrowError() + + +cdef extern from "pyarrow/helpers.h" namespace "pyarrow" nogil: shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable); diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd new file mode 100644 index 00000000000..671490ee3bf --- /dev/null +++ b/python/arrow/scalar.pxd @@ -0,0 
+1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.common cimport * +from arrow.includes.arrow cimport CArray, CListArray + +from arrow.schema cimport DataType + +cdef class ScalarValue: + cdef readonly: + shared_ptr[CArray] array + int index + DataType type + + +cdef class Int8Value: + pass + + +cdef class ListValue: + pass + + +cdef class StringValue: + pass diff --git a/python/setup.py b/python/setup.py index c67351ede29..e0b032e4767 100644 --- a/python/setup.py +++ b/python/setup.py @@ -207,7 +207,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['config', 'parquet', 'schema'] + return ['config', 'error', 'parquet', 'schema'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc new file mode 100644 index 00000000000..f50c3faf58c --- /dev/null +++ b/python/src/pyarrow/adapters/builtin.cc @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "pyarrow/adapters/builtin.h" + +#include + +#include "pyarrow/status.h" + +namespace pyarrow { + +using arrow::DataType; +using arrow::LogicalType; + +static inline bool IsPyInteger(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyLong_Check(obj) || PyInt_Check(obj); +#else + return PyLong_Check(obj); +#endif +} + +static inline bool IsPyBaseString(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyString_Check(obj) || PyUnicode_Check(obj); +#else + return PyUnicode_Check(obj); +#endif +} + +class ScalarTypeInfer { + public: + ScalarTypeInfer() : + none_count_(0), + bool_count_(0), + int_count_(0), + float_count_(0), + string_count_(0) {} + + void Visit(PyObject* obj) { + if (obj == Py_None) { + ++none_count_; + } else if (PyFloat_Check(obj)) { + ++float_count_; + } else if (IsPyInteger(obj)) { + ++int_count_; + } else if (IsPyBaseString(obj)) { + ++string_count_; + } else { + // TODO(wesm): accumulate error information somewhere + } + } + + std::shared_ptr GetType() { + // TODO(wesm): handling mixed-type cases + + if (float_count_) { + return arrow::DOUBLE; + } else if (int_count_) { + // TODO(wesm): tighter type later + return arrow::INT64; + } else if (bool_count_) { + // TODO(wesm): tighter type later + return arrow::BOOL; + } else if (string_count_) { + return arrow::STRING; + } else { + return arrow::NA; + } + } + + private: + int64_t none_count_; + int64_t 
bool_count_; + int64_t int_count_; + int64_t float_count_; + int64_t string_count_; + + // Place to accumulate errors + // std::vector errors_; +}; + +// Non-exhaustive type inference +static Status InferArrowType(PyObject* obj, int64_t* size, + std::shared_ptr* out_type) { + *size = PySequence_Size(obj); + if (PyErr_Occurred()) { + // Not a sequence + PyErr_Clear(); + return Status::TypeError("Object is not a sequence"); + } + + // For 0-length sequences, refuse to guess + if (*size == 0) { + *out_type = arrow::NA; + } + + ScalarTypeInfer inferer; + + for (int64_t i = 0; i < *size; ++i) { + // TODO(wesm): Error checking? + // TODO(wesm): Specialize for PyList_GET_ITEM? + OwnedRef item_ref(PySequence_GetItem(obj, i)); + PyObject* item = item_ref.obj(); + + if (PyList_Check(item) || PyDict_Check(item)) { + // TODO(wesm): inferring types for collections + return Status::NotImplemented("No type inference for collections"); + } else { + inferer.Visit(obj); + } + } + + *out_type = inferer.GetType(); + return Status::OK(); +} + +// Marshal Python sequence (list, tuple, etc.) 
to Arrow array +class SeqConverter { + public: + SeqConverter(); + + virtual Status AppendData(PyObject* seq) = 0; + + private: + // Borrowed reference for now + PyObject* obj_; +}; + +class BooleanConverter : SeqConverter { + public: + + Status AppendData(PyObject* obj) override { + return Status::OK(); + } +}; + +template +class IntegerConverter : SeqConverter { + public: + + Status AppendData(PyObject* obj) override { + return Status::OK(); + } +}; + +template +class FloatingConverter : SeqConverter { + public: + + Status AppendData(PyObject* obj) override { + return Status::OK(); + } +}; + +class StringConverter : SeqConverter { + public: + + Status AppendData(PyObject* obj) override { + return Status::OK(); + } + + private: + arrow::StringBuilder builder_; +}; + +class ListConverter : SeqConverter { + public: + + Status AppendData(PyObject* obj) override { + return Status::OK(); + } + + private: + arrow::ListBuilder builder_; +}; +
+// Selects the sequence converter for the given Arrow type.
+//
+// No converter is constructed for any type yet, so every call resets *out
+// and returns NotImplemented. Reporting success with *out left unset would
+// make the caller dereference a null pointer.
+Status GetConverter(const std::shared_ptr<DataType>& type,
+    std::shared_ptr<SeqConverter>* out) {
+  // Never leave a stale converter behind on failure
+  out->reset();
+
+  switch (type->type) {
+    case LogicalType::BOOL:
+    case LogicalType::INT64:
+    case LogicalType::DOUBLE:
+    case LogicalType::STRING:
+      // TODO: construct the matching converter here once one exists
+    case LogicalType::LIST:
+    case LogicalType::STRUCT:
+    default:
+      return Status::NotImplemented("No type converter implemented");
+  }
+}
+
+// Infers the Arrow type of the Python sequence obj, then feeds obj to the
+// converter chosen for that type. Returns the first non-OK status from
+// inference, converter lookup or AppendData.
+//
+// NOTE(review): `out` is never assigned in this function -- presumably
+// pending the converter implementations; confirm before relying on it.
+Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
+  std::shared_ptr<DataType> type;
+  int64_t size;
+  RETURN_NOT_OK(InferArrowType(obj, &size, &type));
+
+  std::shared_ptr<SeqConverter> converter;
+  RETURN_NOT_OK(GetConverter(type, &converter));
+  if (converter == nullptr) {
+    // Guard the dereference below even if GetConverter reported success
+    return Status::NotImplemented("No type converter implemented");
+  }
+  RETURN_NOT_OK(converter->AppendData(obj));
+
+  return Status::OK();
+}
+
+} // namespace pyarrow
diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h new file mode 100644 index 00000000000..24886f4970d --- /dev/null +++ b/python/src/pyarrow/adapters/builtin.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between CPython built-in data structures and Arrow +// data structures + +#ifndef PYARROW_ADAPTERS_BUILTIN_H +#define PYARROW_ADAPTERS_BUILTIN_H + +#include + +#include + +#include "pyarrow/common.h" + +namespace arrow { class Array; } + +namespace pyarrow { + +class Status; + +Status ConvertPySequence(PyObject* obj, std::shared_ptr* out); + +} // namespace pyarrow + +#endif // PYARROW_ADAPTERS_BUILTIN_H diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h index c2285de77bf..4e7516fdd6f 100644 --- a/python/src/pyarrow/api.h +++ b/python/src/pyarrow/api.h @@ -18,4 +18,6 @@ #ifndef PYARROW_API_H #define PYARROW_API_H +#include "pyarrow/status.h" + #endif // PYARROW_API_H diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc new file mode 100644 index 00000000000..a2748f99b67 --- /dev/null +++ b/python/src/pyarrow/common.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "pyarrow/common.h" + +#include +#include +#include + +#include +#include + +#include "pyarrow/status.h" + +namespace pyarrow { +
+// arrow::MemoryPool backed by std::malloc / std::free that keeps a running
+// total of the bytes it has handed out. Every member function takes
+// pool_lock_ before touching the counter.
+class PyArrowMemoryPool : public arrow::MemoryPool {
+ public:
+  PyArrowMemoryPool() : bytes_allocated_(0) {}
+  virtual ~PyArrowMemoryPool() {}
+
+  // Allocates size bytes and stores the pointer in *out. Returns
+  // OutOfMemory when a non-empty request cannot be satisfied.
+  arrow::Status Allocate(int64_t size, uint8_t** out) override {
+    std::lock_guard<std::mutex> guard(pool_lock_);
+    *out = static_cast<uint8_t*>(std::malloc(size));
+    // malloc(0) is allowed to return a null pointer, so a null result only
+    // means failure when bytes were actually requested
+    if (*out == nullptr && size != 0) {
+      std::stringstream ss;
+      ss << "malloc of size " << size << " failed";
+      return arrow::Status::OutOfMemory(ss.str());
+    }
+
+    bytes_allocated_ += size;
+
+    return arrow::Status::OK();
+  }
+
+  // Returns the number of bytes currently accounted to this pool.
+  int64_t bytes_allocated() const override {
+    std::lock_guard<std::mutex> guard(pool_lock_);
+    return bytes_allocated_;
+  }
+
+  // Releases buffer and subtracts size from the running total; size is
+  // trusted to match the original request.
+  void Free(uint8_t* buffer, int64_t size) override {
+    std::lock_guard<std::mutex> guard(pool_lock_);
+    std::free(buffer);
+    bytes_allocated_ -= size;
+  }
+
+ private:
+  mutable std::mutex pool_lock_;
+  int64_t bytes_allocated_;
+};
+
+// Returns the process-wide pool, created on first use.
+arrow::MemoryPool* GetMemoryPool() {
+  static PyArrowMemoryPool memory_pool;
+  return &memory_pool;
+}
+
+} // namespace pyarrow
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h new file mode 100644 index 00000000000..a3daf938922 --- /dev/null +++ b/python/src/pyarrow/common.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_COMMON_H +#define PYARROW_COMMON_H + +#include + +namespace arrow { class MemoryPool; } + +namespace pyarrow { +
+// True when building against Python 2. Parenthesized so the comparison
+// keeps its meaning when the macro is used inside a larger expression.
+#define PYARROW_IS_PY2 (PY_MAJOR_VERSION < 3)
+
+// C-string view of a Python str object, for either major version
+#if PYARROW_IS_PY2
+#define PYARROW_PYSTR_AS_CSTRING(obj) PyString_AsString(obj)
+#else
+#define PYARROW_PYSTR_AS_CSTRING(obj) PyUnicode_AsUTF8(obj)
+#endif
+
+// If a Python exception is pending, consumes it and returns
+// Status::UnknownError from the enclosing function. The message is str()
+// of the exception value, or a generic text when that is unavailable.
+// PyErr_Fetch may hand back NULL for the value and the traceback, hence
+// the NULL checks and Py_XDECREF.
+#define RETURN_IF_PYERROR() \
+  if (PyErr_Occurred()) { \
+    PyObject *exc_type, *exc_value, *traceback; \
+    PyErr_Fetch(&exc_type, &exc_value, &traceback); \
+    std::string message("Unknown Python error"); \
+    PyObject* exc_str = \
+        (exc_value != NULL) ? PyObject_Str(exc_value) : NULL; \
+    const char* exc_cstr = \
+        (exc_str != NULL) ? PYARROW_PYSTR_AS_CSTRING(exc_str) : NULL; \
+    if (exc_cstr != NULL) { \
+      message = exc_cstr; \
+    } \
+    PyErr_Clear(); \
+    Py_XDECREF(exc_str); \
+    Py_XDECREF(exc_type); \
+    Py_XDECREF(exc_value); \
+    Py_XDECREF(traceback); \
+    return Status::UnknownError(message); \
+  }
+
+// Scoped holder of one Python reference: the destructor releases it with
+// Py_XDECREF, so a null pointer is acceptable.
+class OwnedRef {
+ public:
+  OwnedRef(PyObject* obj) :
+      obj_(obj) {}
+
+  ~OwnedRef() {
+    Py_XDECREF(obj_);
+  }
+
+  // A copy would release the same reference a second time
+  OwnedRef(const OwnedRef&) = delete;
+  OwnedRef& operator=(const OwnedRef&) = delete;
+
+  // Returns the held pointer without transferring ownership.
+  PyObject* obj() const {
+    return obj_;
+  }
+
+ private:
+  PyObject* obj_;
+};
+
+arrow::MemoryPool* GetMemoryPool();
+
+} // namespace pyarrow
+
+#endif // PYARROW_COMMON_H
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index 4f63b47ed46..d0969dacc21 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -19,9 +19,9 @@ #include -namespace arrow { +using namespace arrow; -namespace py { +namespace pyarrow { #define GET_PRIMITIVE_TYPE(NAME, Type) \ case LogicalType::NAME: \ @@ -54,6 +54,4 @@ std::shared_ptr GetPrimitiveType(LogicalType::type type, } } -} // namespace py - -} // namespace
arrow +} // namespace pyarrow diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index d4ab13eb557..1a24f056feb 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -21,15 +21,14 @@ #include #include -namespace arrow { +namespace pyarrow { -namespace py { +using arrow::DataType; +using arrow::LogicalType; std::shared_ptr GetPrimitiveType(LogicalType::type type, bool nullable); -} // namespace py - -} // namespace arrow +} // namespace pyarrow #endif // PYARROW_HELPERS_H diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/init.cc index c36f4137255..acd851e1687 100644 --- a/python/src/pyarrow/init.cc +++ b/python/src/pyarrow/init.cc @@ -17,13 +17,9 @@ #include "pyarrow/init.h" -namespace arrow { - -namespace py { +namespace pyarrow { void pyarrow_init() { } -} // namespace py - -} // namespace arrow +} // namespace pyarrow diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/init.h index 1fc9f101026..71e67a20c1c 100644 --- a/python/src/pyarrow/init.h +++ b/python/src/pyarrow/init.h @@ -18,14 +18,10 @@ #ifndef PYARROW_INIT_H #define PYARROW_INIT_H -namespace arrow { - -namespace py { +namespace pyarrow { void pyarrow_init(); -} // namespace py - -} // namespace arrow +} // namespace pyarrow #endif // PYARROW_INIT_H diff --git a/python/src/pyarrow/status.cc b/python/src/pyarrow/status.cc new file mode 100644 index 00000000000..36ed8c6e47c --- /dev/null +++ b/python/src/pyarrow/status.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. 
+// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#include "pyarrow/status.h" + +#include + +namespace pyarrow { +
+// Builds the heap-allocated error record for a non-OK status: a 4-byte
+// message length, a 1-byte code, a 2-byte posix code, then the message
+// bytes (not NUL-terminated).
+Status::Status(StatusCode code, const std::string& msg, int16_t posix_code) {
+  assert(code != StatusCode::OK);
+  const uint32_t msg_len = msg.size();
+  char* record = new char[msg_len + 7];
+
+  char* cursor = record;
+  memcpy(cursor, &msg_len, sizeof(msg_len));
+  cursor += sizeof(msg_len);
+  *cursor = static_cast<char>(code);
+  cursor += 1;
+  memcpy(cursor, &posix_code, sizeof(posix_code));
+  cursor += sizeof(posix_code);
+  memcpy(cursor, msg.c_str(), msg.size());
+
+  state_ = record;
+}
+
+// Returns a freshly allocated copy of an error record, using the length
+// stored in its first four bytes.
+const char* Status::CopyState(const char* state) {
+  uint32_t msg_len;
+  memcpy(&msg_len, state, sizeof(msg_len));
+
+  const uint32_t record_len = msg_len + 7;
+  char* duplicate = new char[record_len];
+  memcpy(duplicate, state, record_len);
+  return duplicate;
+}
+
+} // namespace pyarrow
diff --git a/python/src/pyarrow/status.h b/python/src/pyarrow/status.h new file mode 100644 index 00000000000..cb8c8add210 --- /dev/null +++ b/python/src/pyarrow/status.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization.
+ +#ifndef PYARROW_STATUS_H_ +#define PYARROW_STATUS_H_ + +#include +#include +#include + +namespace pyarrow { +
+// Evaluates s once and, if the resulting Status is not OK, returns it from
+// the enclosing function. Expands to exactly one statement with no
+// trailing semicolon, so `if (c) PY_RETURN_NOT_OK(x); else ...` parses as
+// written; the call site supplies the semicolon.
+#define PY_RETURN_NOT_OK(s) do { \
+    Status _s = (s); \
+    if (!_s.ok()) return _s; \
+  } while (0)
+ +enum class StatusCode: char { + OK = 0, + OutOfMemory = 1, + KeyError = 2, + TypeError = 3, + ValueError = 4, + IOError = 5, + NotImplemented = 6, + + ArrowError = 7, + + UnknownError = 10 +}; + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. + static Status OutOfMemory(const std::string& msg, int16_t posix_code = -1) { + return Status(StatusCode::OutOfMemory, msg, posix_code); + } + + static Status KeyError(const std::string& msg) { + return Status(StatusCode::KeyError, msg, -1); + } + + static Status TypeError(const std::string& msg) { + return Status(StatusCode::TypeError, msg, -1); + } + + static Status IOError(const std::string& msg) { + return Status(StatusCode::IOError, msg, -1); + } + + static Status ValueError(const std::string& msg) { + return Status(StatusCode::ValueError, msg, -1); + } + + static Status NotImplemented(const std::string& msg) { + return Status(StatusCode::NotImplemented, msg, -1); + } + + static Status UnknownError(const std::string& msg) { + return Status(StatusCode::UnknownError, msg, -1); + } + + static Status ArrowError(const std::string& msg) { + return Status(StatusCode::ArrowError, msg, -1); + } + + // Returns true iff the status indicates success.
+ bool ok() const { return (state_ == NULL); } + + bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } + bool IsKeyError() const { return code() == StatusCode::KeyError; } + bool IsIOError() const { return code() == StatusCode::IOError; } + bool IsTypeError() const { return code() == StatusCode::TypeError; } + bool IsValueError() const { return code() == StatusCode::ValueError; } + + bool IsUnknownError() const { return code() == StatusCode::UnknownError; } + + bool IsArrowError() const { return code() == StatusCode::ArrowError; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + // Return a string representation of the status code, without the message + // text or posix code information. + std::string CodeAsString() const; + + // Get the POSIX code associated with this Status, or -1 if there is none. + int16_t posix_code() const; + + private: + // OK status has a NULL state_. Otherwise, state_ is a new[] array + // of the following form: + // state_[0..3] == length of message + // state_[4] == code + // state_[5..6] == posix_code + // state_[7..] == message + const char* state_; + + StatusCode code() const { + return ((state_ == NULL) ? + StatusCode::OK : static_cast(state_[4])); + } + + Status(StatusCode code, const std::string& msg, int16_t posix_code); + static const char* CopyState(const char* s); +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); +} + +inline void Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == NULL) ? 
NULL : CopyState(s.state_); + } +} + +} // namespace pyarrow + +#endif // PYARROW_STATUS_H_ From d5655bad5652c0b68d5487c752983e0f20bad00d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 16:15:57 -0800 Subject: [PATCH 06/21] Clean up array builder API to return shared_ptr --- cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/types/list-test.cc | 7 +++---- cpp/src/arrow/types/list.h | 23 ++++++++++------------- cpp/src/arrow/types/primitive-test.cc | 22 +++++++++++----------- cpp/src/arrow/types/primitive.h | 20 ++++---------------- cpp/src/arrow/types/string-test.cc | 6 ++---- cpp/src/arrow/types/string.h | 7 ++----- python/src/pyarrow/adapters/builtin.cc | 2 -- 8 files changed, 33 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 491b9133d2c..b7c3935d2ac 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -78,7 +78,7 @@ class ArrayBuilder { // Creates new array object to hold the contents of the builder and transfers // ownership of the data - virtual Status ToArray(Array** out) = 0; + virtual std::shared_ptr Finish() = 0; protected: MemoryPool* pool_; diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index cec13995142..ee0919b6b34 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -32,6 +32,7 @@ #include "arrow/types/test-common.h" #include "arrow/util/status.h" +using std::shared_ptr; using std::string; using std::unique_ptr; using std::vector; @@ -78,9 +79,7 @@ class TestListBuilder : public TestBuilder { } void Done() { - Array* out; - ASSERT_OK(builder_->ToArray(&out)); - result_.reset(static_cast(out)); + result_ = std::dynamic_pointer_cast(builder_->Finish()); } protected: @@ -88,7 +87,7 @@ class TestListBuilder : public TestBuilder { TypePtr type_; unique_ptr builder_; - unique_ptr result_; + shared_ptr result_; }; diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 
6df842fdeaf..f9306fcc2fe 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -132,30 +132,27 @@ class ListBuilder : public Int32Builder { return Status::OK(); } - // Initialize an array type instance with the results of this builder - // Transfers ownership of all buffers template - Status Transfer(Container* out) { - Array* child_values; - RETURN_NOT_OK(value_builder_->ToArray(&child_values)); + std::shared_ptr Transfer() { + auto result = std::make_shared(); + + std::shared_ptr items = value_builder_->Finish(); // Add final offset if the length is non-zero if (length_) { - raw_buffer()[length_] = child_values->length(); + raw_buffer()[length_] = items->length(); } - out->Init(type_, length_, values_, ArrayPtr(child_values), + result->Init(type_, length_, values_, items, null_count_, nulls_); values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; - return Status::OK(); + + return result; } - virtual Status ToArray(Array** out) { - ListArray* result = new ListArray(); - RETURN_NOT_OK(Transfer(result)); - *out = static_cast(result); - return Status::OK(); + std::shared_ptr Finish() override { + return Transfer(); } // Start a new variable-length list slot diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 02eaaa7542b..bbf851950bf 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -112,7 +112,6 @@ class TestPrimitiveBuilder : public TestBuilder { } void CheckNullable() { - ArrayType result; ArrayType expected; int size = builder_->length(); @@ -125,7 +124,9 @@ class TestPrimitiveBuilder : public TestBuilder { int32_t ex_null_count = null_count(nulls_); expected.Init(size, ex_data, ex_null_count, ex_nulls); - ASSERT_OK(builder_->Transfer(&result)); + + std::shared_ptr result = std::dynamic_pointer_cast( + builder_->Finish()); // Builder is now reset ASSERT_EQ(0, builder_->length()); @@ -133,12 +134,11 @@ class TestPrimitiveBuilder : public 
TestBuilder { ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, builder_->buffer()); - ASSERT_TRUE(result.Equals(expected)); - ASSERT_EQ(ex_null_count, result.null_count()); + ASSERT_TRUE(result->Equals(expected)); + ASSERT_EQ(ex_null_count, result->null_count()); } void CheckNonNullable() { - ArrayType result; ArrayType expected; int size = builder_nn_->length(); @@ -146,15 +146,17 @@ class TestPrimitiveBuilder : public TestBuilder { size * sizeof(T)); expected.Init(size, ex_data); - ASSERT_OK(builder_nn_->Transfer(&result)); + + std::shared_ptr result = std::dynamic_pointer_cast( + builder_nn_->Finish()); // Builder is now reset ASSERT_EQ(0, builder_nn_->length()); ASSERT_EQ(0, builder_nn_->capacity()); ASSERT_EQ(nullptr, builder_nn_->buffer()); - ASSERT_TRUE(result.Equals(expected)); - ASSERT_EQ(0, result.null_count()); + ASSERT_TRUE(result->Equals(expected)); + ASSERT_EQ(0, result->null_count()); } protected: @@ -225,9 +227,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { ASSERT_OK(this->builder_->AppendNull()); } - Array* result; - ASSERT_OK(this->builder_->ToArray(&result)); - unique_ptr holder(result); + auto result = this->builder_->Finish(); for (int i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)); diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 09d43e7ec8b..a55ac068a3b 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -197,24 +197,12 @@ class PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } - // Initialize an array type instance with the results of this builder - // Transfers ownership of all buffers - Status Transfer(PrimitiveArray* out) { - out->Init(type_, length_, values_, null_count_, nulls_); + std::shared_ptr Finish() override { + std::shared_ptr result = std::make_shared(); + result->PrimitiveArray::Init(type_, length_, values_, null_count_, nulls_); values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; - return Status::OK(); - 
} - - Status Transfer(ArrayType* out) { - return Transfer(static_cast(out)); - } - - virtual Status ToArray(Array** out) { - ArrayType* result = new ArrayType(); - RETURN_NOT_OK(Transfer(result)); - *out = static_cast(result); - return Status::OK(); + return result; } value_type* raw_buffer() { diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 9af66729502..120932d167a 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -173,16 +173,14 @@ class TestStringBuilder : public TestBuilder { } void Done() { - Array* out; - ASSERT_OK(builder_->ToArray(&out)); - result_.reset(static_cast(out)); + result_ = std::dynamic_pointer_cast(builder_->Finish()); } protected: TypePtr type_; std::unique_ptr builder_; - std::unique_ptr result_; + std::shared_ptr result_; }; TEST_F(TestStringBuilder, TestScalarAppend) { diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 94069c7ae93..85771b86202 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -146,11 +146,8 @@ class StringBuilder : public ListBuilder { Status Append(const std::vector& values, uint8_t* null_bytes); - virtual Status ToArray(Array** out) { - StringArray* result = new StringArray(); - RETURN_NOT_OK(ListBuilder::Transfer(result)); - *out = static_cast(result); - return Status::OK(); + std::shared_ptr Finish() override { + return ListBuilder::Transfer(); } protected: diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index f50c3faf58c..8b85deceafd 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -69,14 +69,12 @@ class ScalarTypeInfer { std::shared_ptr GetType() { // TODO(wesm): handling mixed-type cases - if (float_count_) { return arrow::DOUBLE; } else if (int_count_) { // TODO(wesm): tighter type later return arrow::INT64; } else if (bool_count_) { - // TODO(wesm): tighter type later return 
arrow::BOOL; } else if (string_count_) { return arrow::STRING; From bdb02e717114ddec07b9252a73d715cc5457d59b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 16:29:23 -0800 Subject: [PATCH 07/21] Use shared_ptr with dynamic make_builder too --- cpp/src/arrow/types/construct.cc | 21 ++++++++++----------- cpp/src/arrow/types/construct.h | 6 ++++-- cpp/src/arrow/types/list-test.cc | 6 +++--- cpp/src/arrow/types/list.h | 9 ++++----- cpp/src/arrow/types/primitive-test.cc | 11 ++++++----- cpp/src/arrow/types/string-test.cc | 5 +---- cpp/src/arrow/types/string.h | 3 +-- 7 files changed, 29 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 05d6b270fc3..5a46ef605f1 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -32,13 +32,13 @@ class ArrayBuilder; // Initially looked at doing this with vtables, but shared pointers makes it // difficult -#define BUILDER_CASE(ENUM, BuilderType) \ - case LogicalType::ENUM: \ - *out = static_cast(new BuilderType(pool, type)); \ +#define BUILDER_CASE(ENUM, BuilderType) \ + case LogicalType::ENUM: \ + out->reset(new BuilderType(pool, type)); \ return Status::OK(); -Status make_builder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder** out) { +Status make_builder(MemoryPool* pool, const std::shared_ptr& type, + std::shared_ptr* out) { switch (type->type) { BUILDER_CASE(UINT8, UInt8Builder); BUILDER_CASE(INT8, Int8Builder); @@ -58,13 +58,12 @@ Status make_builder(MemoryPool* pool, const TypePtr& type, case LogicalType::LIST: { - ListType* list_type = static_cast(type.get()); - ArrayBuilder* value_builder; - RETURN_NOT_OK(make_builder(pool, list_type->value_type, &value_builder)); + std::shared_ptr value_builder; - // The ListBuilder takes ownership of the value_builder - ListBuilder* builder = new ListBuilder(pool, type, value_builder); - *out = static_cast(builder); + const std::shared_ptr& value_type = static_cast( + 
type.get())->value_type; + RETURN_NOT_OK(make_builder(pool, value_type, &value_builder)); + out->reset(new ListBuilder(pool, type, value_builder)); return Status::OK(); } // BUILDER_CASE(CHAR, CharBuilder); diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index b5ba436f787..39d6342743a 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -18,6 +18,8 @@ #ifndef ARROW_TYPES_CONSTRUCT_H #define ARROW_TYPES_CONSTRUCT_H +#include + #include "arrow/type.h" namespace arrow { @@ -26,8 +28,8 @@ class ArrayBuilder; class MemoryPool; class Status; -Status make_builder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder** out); +Status make_builder(MemoryPool* pool, const std::shared_ptr& type, + std::shared_ptr* out); } // namespace arrow diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index ee0919b6b34..6de95a31abb 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -73,9 +73,9 @@ class TestListBuilder : public TestBuilder { value_type_ = TypePtr(new Int32Type()); type_ = TypePtr(new ListType(value_type_)); - ArrayBuilder* tmp; + std::shared_ptr tmp; ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_.reset(static_cast(tmp)); + builder_ = std::dynamic_pointer_cast(tmp); } void Done() { @@ -86,7 +86,7 @@ class TestListBuilder : public TestBuilder { TypePtr value_type_; TypePtr type_; - unique_ptr builder_; + shared_ptr builder_; shared_ptr result_; }; diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f9306fcc2fe..cdd1e5a0b1c 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -91,10 +91,9 @@ class ListArray : public Array { class ListBuilder : public Int32Builder { public: ListBuilder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder* value_builder) - : Int32Builder(pool, type) { - value_builder_.reset(value_builder); - } + std::shared_ptr value_builder) + : Int32Builder(pool, type), + 
value_builder_(value_builder) {} Status Init(int32_t elements) { // One more than requested. @@ -183,7 +182,7 @@ class ListBuilder : public Int32Builder { ArrayBuilder* value_builder() const { return value_builder_.get();} protected: - std::unique_ptr value_builder_; + std::shared_ptr value_builder_; }; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index bbf851950bf..e77aec10a42 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -37,6 +37,7 @@ #include "arrow/util/status.h" using std::string; +using std::shared_ptr; using std::unique_ptr; using std::vector; @@ -98,12 +99,12 @@ class TestPrimitiveBuilder : public TestBuilder { type_ = Attrs::type(); - ArrayBuilder* tmp; + std::shared_ptr tmp; ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_.reset(static_cast(tmp)); + builder_ = std::dynamic_pointer_cast(tmp); ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_nn_.reset(static_cast(tmp)); + builder_nn_ = std::dynamic_pointer_cast(tmp); } void RandomData(int N, double pct_null = 0.1) { @@ -162,8 +163,8 @@ class TestPrimitiveBuilder : public TestBuilder { protected: TypePtr type_; TypePtr type_nn_; - unique_ptr builder_; - unique_ptr builder_nn_; + shared_ptr builder_; + shared_ptr builder_nn_; vector draws_; vector nulls_; diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 120932d167a..8e82fd95dd8 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -166,10 +166,7 @@ class TestStringBuilder : public TestBuilder { void SetUp() { TestBuilder::SetUp(); type_ = TypePtr(new StringType()); - - ArrayBuilder* tmp; - ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_.reset(static_cast(tmp)); + builder_.reset(new StringBuilder(pool_, type_)); } void Done() { diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 85771b86202..a4d1522210f 100644 --- 
a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -131,8 +131,7 @@ class StringArray : public ListArray { class StringBuilder : public ListBuilder { public: explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : - ListBuilder(pool, type, - static_cast(new UInt8Builder(pool, value_type_))) { + ListBuilder(pool, type, std::make_shared(pool, value_type_)) { byte_builder_ = static_cast(value_builder_.get()); } From 94f122f982ee7d12d35caeef591a24eea825d90d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 17:32:16 -0800 Subject: [PATCH 08/21] Basic object model for sequence->arrow conversions --- cpp/src/arrow/api.h | 1 + cpp/src/arrow/builder.h | 6 +- cpp/src/arrow/types/CMakeLists.txt | 1 + cpp/src/arrow/types/boolean.h | 3 +- cpp/src/arrow/types/construct.cc | 4 +- cpp/src/arrow/types/construct.h | 2 +- cpp/src/arrow/types/list-test.cc | 2 +- cpp/src/arrow/types/primitive-test.cc | 4 +- python/src/pyarrow/adapters/builtin.cc | 88 +++++++++++++++++--------- python/src/pyarrow/common.h | 7 ++ 10 files changed, 79 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 7620450d96f..282b9ff2c9f 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -29,6 +29,7 @@ #include "arrow/table/table.h" #include "arrow/types/boolean.h" +#include "arrow/types/construct.h" #include "arrow/types/floating.h" #include "arrow/types/integer.h" #include "arrow/types/list.h" diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index b7c3935d2ac..fafee91f928 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -80,10 +80,14 @@ class ArrayBuilder { // ownership of the data virtual std::shared_ptr Finish() = 0; + const std::shared_ptr& type() const { + return type_; + } + protected: MemoryPool* pool_; - TypePtr type_; + std::shared_ptr type_; // When nulls are first appended to the builder, the null bitmap is allocated std::shared_ptr nulls_; diff --git 
a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index bae4b6235ff..57cabdefd25 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -23,6 +23,7 @@ install(FILES boolean.h collection.h + construct.h datetime.h decimal.h floating.h diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h index 8fc9cfd19c0..a5023d7b368 100644 --- a/cpp/src/arrow/types/boolean.h +++ b/cpp/src/arrow/types/boolean.h @@ -24,7 +24,8 @@ namespace arrow { typedef PrimitiveArrayImpl BooleanArray; -// typedef PrimitiveBuilder BooleanBuilder; +class BooleanBuilder : public ArrayBuilder { +}; } // namespace arrow diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 5a46ef605f1..43f01a30513 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -37,7 +37,7 @@ class ArrayBuilder; out->reset(new BuilderType(pool, type)); \ return Status::OK(); -Status make_builder(MemoryPool* pool, const std::shared_ptr& type, +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out) { switch (type->type) { BUILDER_CASE(UINT8, UInt8Builder); @@ -62,7 +62,7 @@ Status make_builder(MemoryPool* pool, const std::shared_ptr& type, const std::shared_ptr& value_type = static_cast( type.get())->value_type; - RETURN_NOT_OK(make_builder(pool, value_type, &value_builder)); + RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); out->reset(new ListBuilder(pool, type, value_builder)); return Status::OK(); } diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 39d6342743a..59ebe1acddc 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -28,7 +28,7 @@ class ArrayBuilder; class MemoryPool; class Status; -Status make_builder(MemoryPool* pool, const std::shared_ptr& type, +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); } // namespace arrow 
diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 6de95a31abb..516008b7763 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -74,7 +74,7 @@ class TestListBuilder : public TestBuilder { type_ = TypePtr(new ListType(value_type_)); std::shared_ptr tmp; - ASSERT_OK(make_builder(pool_, type_, &tmp)); + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); builder_ = std::dynamic_pointer_cast(tmp); } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index e77aec10a42..e25729dfb67 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -100,10 +100,10 @@ class TestPrimitiveBuilder : public TestBuilder { type_ = Attrs::type(); std::shared_ptr tmp; - ASSERT_OK(make_builder(pool_, type_, &tmp)); + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); builder_ = std::dynamic_pointer_cast(tmp); - ASSERT_OK(make_builder(pool_, type_, &tmp)); + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); builder_nn_ = std::dynamic_pointer_cast(tmp); } diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 8b85deceafd..fb3b07ccffc 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include "pyarrow/adapters/builtin.h" @@ -23,11 +24,12 @@ #include "pyarrow/status.h" -namespace pyarrow { - +using arrow::ArrayBuilder; using arrow::DataType; using arrow::LogicalType; +namespace pyarrow { + static inline bool IsPyInteger(PyObject* obj) { #if PYARROW_IS_PY2 return PyLong_Check(obj) || PyInt_Check(obj); @@ -132,80 +134,93 @@ static Status InferArrowType(PyObject* obj, int64_t* size, // Marshal Python sequence (list, tuple, etc.) 
to Arrow array class SeqConverter { public: - SeqConverter(); + virtual Status Init(const std::shared_ptr& builder) { + builder_ = builder; + return Status::OK(); + } virtual Status AppendData(PyObject* seq) = 0; - private: - // Borrowed reference for now - PyObject* obj_; + protected: + std::shared_ptr builder_; }; -class BooleanConverter : SeqConverter { +template +class TypedConverter : public SeqConverter { public: + Status Init(const std::shared_ptr& builder) override { + builder_ = builder; + typed_builder_ = static_cast(builder.get()); + return Status::OK(); + } + + protected: + BuilderType* typed_builder_; +}; +class BoolConverter : public TypedConverter { + public: Status AppendData(PyObject* obj) override { return Status::OK(); } }; -template -class IntegerConverter : SeqConverter { +class Int64Converter : public TypedConverter { public: - Status AppendData(PyObject* obj) override { return Status::OK(); } }; -template -class FloatingConverter : SeqConverter { +class DoubleConverter : public TypedConverter { public: - Status AppendData(PyObject* obj) override { return Status::OK(); } }; -class StringConverter : SeqConverter { +class StringConverter : public TypedConverter { public: - Status AppendData(PyObject* obj) override { return Status::OK(); } - - private: - arrow::StringBuilder builder_; }; -class ListConverter : SeqConverter { +class ListConverter : public TypedConverter { public: + Status Init(const std::shared_ptr& builder) override; Status AppendData(PyObject* obj) override { return Status::OK(); } - - private: - arrow::ListBuilder builder_; + protected: + std::shared_ptr value_converter_; }; -Status GetConverter(const std::shared_ptr& type, - std::shared_ptr* out) { +// Dynamic constructor for sequence converters +std::shared_ptr GetConverter(const std::shared_ptr& type) { switch (type->type) { case LogicalType::BOOL: - break; + return std::make_shared(); case LogicalType::INT64: - break; + return std::make_shared(); case LogicalType::DOUBLE: - 
break; + return std::make_shared(); case LogicalType::STRING: - break; + return std::make_shared(); case LogicalType::LIST: + return std::make_shared(); case LogicalType::STRUCT: default: - return Status::NotImplemented("No type converter implemetned"); + return nullptr; break; } +} + +Status ListConverter::Init(const std::shared_ptr& builder) { + builder_ = builder; + typed_builder_ = static_cast(builder.get()); + value_converter_ = GetConverter(builder->type()); return Status::OK(); } @@ -214,8 +229,19 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { int64_t size; RETURN_NOT_OK(InferArrowType(obj, &size, &type)); - std::shared_ptr converter; - RETURN_NOT_OK(GetConverter(type, &converter)); + std::shared_ptr converter = GetConverter(type); + if (converter == nullptr) { + std::stringstream ss; + ss << "No type converter implemented for " + << type->ToString(); + return Status::NotImplemented(ss.str()); + } + + // Give the sequence converter an array builder + std::shared_ptr builder; + RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder)); + converter->Init(builder); + RETURN_NOT_OK(converter->AppendData(obj)); return Status::OK(); diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index a3daf938922..4f23b6a4445 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -37,6 +37,13 @@ namespace pyarrow { return Status::UnknownError(message); \ } +#define RETURN_ARROW_NOT_OK(s) do { \ + arrow::Status _s = (s); \ + if (!_s.ok()) { \ + return Status::ArrowError(s.ToString()); \ + } \ + } while (0); + class OwnedRef { public: OwnedRef(PyObject* obj) : From 102ed36a3077018e660e4b414f8f4164b1930dd2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 18:04:06 -0800 Subject: [PATCH 09/21] Cython array box scaffold builds --- python/CMakeLists.txt | 1 + python/arrow/array.pxd | 10 ++-- python/arrow/array.pyx | 84 +++++++++++++++++++++++++++++-- python/arrow/includes/arrow.pxd | 3 ++ 
python/arrow/includes/pyarrow.pxd | 7 ++- python/setup.py | 2 +- python/src/pyarrow/api.h | 5 ++ python/src/pyarrow/common.h | 1 + 8 files changed, 101 insertions(+), 12 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index e6069adc06b..72b8c607f93 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -425,6 +425,7 @@ set(USE_RELATIVE_RPATH ON) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CYTHON_EXTENSIONS + array config error parquet diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd index 52f6d2082de..2274f55262e 100644 --- a/python/arrow/array.pxd +++ b/python/arrow/array.pxd @@ -21,10 +21,8 @@ from arrow.includes.arrow cimport CArray, LogicalType cdef class Array: cdef: shared_ptr[CArray] sp_array - CArray* array - def __len__(self): - return self.array.length() + cdef init(self, const shared_ptr[CArray]& sp_array) cdef class BooleanArray(Array): @@ -67,5 +65,9 @@ cdef class UInt64Array(NumericArray): pass -cdef class StringArray(NumericArray): +cdef class ListArray(Array): + pass + + +cdef class StringArray(Array): pass diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 1f05493f84f..d0f2af4a797 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -19,12 +19,90 @@ # distutils: language = c++ # cython: embedsignature = True -from arrow.compat import frombytes, tobytes from arrow.includes.arrow cimport * +cimport arrow.includes.pyarrow as pyarrow + +from arrow.compat import frombytes, tobytes +from arrow.error cimport check_status + +cdef class Array: + + cdef init(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + + def __len__(self): + return self.array.length() + + +cdef class BooleanArray(Array): + pass + + +cdef class NumericArray(Array): + pass + + +cdef class Int8Array(NumericArray): + pass + + +cdef class UInt8Array(NumericArray): + pass + + +cdef class Int16Array(NumericArray): + pass + + +cdef class UInt16Array(NumericArray): + pass -def from_list(list_obj, 
type=None): +cdef class Int32Array(NumericArray): + pass + + +cdef class UInt32Array(NumericArray): + pass + + +cdef class Int64Array(NumericArray): + pass + + +cdef class UInt64Array(NumericArray): + pass + + +cdef class ListArray(Array): + pass + + +cdef class StringArray(Array): + pass + + +cdef dict _array_classes = { + LogicalType_BOOL: BooleanArray, + LogicalType_INT64: Int64Array, + LogicalType_LIST: ListArray, + LogicalType_STRING: StringArray, +} + +cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): + cdef LogicalType type = sp_array.get().type().get().type + + cdef Array arr = _array_classes[type]() + arr.init(sp_array) + return arr + + +def from_list(object list_obj, type=None): """ Convert Python list to Arrow array """ - pass + cdef: + shared_ptr[CArray] sp_array + + check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + return box_arrow_array(sp_array) diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index 2385eb98eb3..666bafc3dad 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -38,6 +38,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: LogicalType_STRING" arrow::LogicalType::STRING" + LogicalType_LIST" arrow::LogicalType::LIST" + LogicalType_STRUCT" arrow::LogicalType::STRUCT" + cdef cppclass CDataType" arrow::DataType": LogicalType type c_bool nullable diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd index ff591c81b84..165d1e7f63e 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/arrow/includes/pyarrow.pxd @@ -18,7 +18,7 @@ # distutils: language = c++ from arrow.includes.common cimport * -from arrow.includes.arrow cimport LogicalType, CDataType +from arrow.includes.arrow cimport CArray, CDataType, LogicalType cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed @@ -38,6 +38,5 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" 
nogil: c_bool IsNotImplemented() c_bool IsArrowError() - -cdef extern from "pyarrow/helpers.h" namespace "pyarrow" nogil: - shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable); + shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable) + Status ConvertPySequence(object obj, shared_ptr[CArray]* out) diff --git a/python/setup.py b/python/setup.py index e0b032e4767..d7338a97d9d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -207,7 +207,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['config', 'error', 'parquet', 'schema'] + return ['array', 'config', 'error', 'parquet', 'schema'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h index 4e7516fdd6f..72be6afe02c 100644 --- a/python/src/pyarrow/api.h +++ b/python/src/pyarrow/api.h @@ -20,4 +20,9 @@ #include "pyarrow/status.h" +#include "pyarrow/helpers.h" + +#include "pyarrow/adapters/builtin.h" +#include "pyarrow/adapters/pandas.h" + #endif // PYARROW_API_H diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index 4f23b6a4445..7847912b68c 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -26,6 +26,7 @@ namespace pyarrow { #define PYARROW_IS_PY2 PY_MAJOR_VERSION < 2 +// TODO(wesm): We can just let errors pass through. 
To be explored later #define RETURN_IF_PYERROR() \ if (PyErr_Occurred()) { \ PyObject *exc_type, *exc_value, *traceback; \ From 4e206fc3e2655304732d0188af38850757c74ff1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 18:04:21 -0800 Subject: [PATCH 10/21] Add pandas converter placeholder --- python/src/pyarrow/adapters/pandas.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 python/src/pyarrow/adapters/pandas.h diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h new file mode 100644 index 00000000000..a4f41638087 --- /dev/null +++ b/python/src/pyarrow/adapters/pandas.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures + +#ifndef PYARROW_ADAPTERS_PANDAS_H +#define PYARROW_ADAPTERS_PANDAS_H + +namespace pyarrow { + +} // namespace pyarrow + +#endif // PYARROW_ADAPTERS_PANDAS_H From 3a774fb35e3c9f766e944b96ef75010c6d4df043 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 18:51:25 -0800 Subject: [PATCH 11/21] Add Status::ToString impls. 
Unit test stub --- cpp/src/arrow/util/status.cc | 40 ++++++++++++++++++++++++++ python/arrow/__init__.py | 1 + python/arrow/array.pyx | 5 ++++ python/src/pyarrow/status.cc | 54 ++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+) diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc index c64b8a3d5f8..c6e113ebea5 100644 --- a/cpp/src/arrow/util/status.cc +++ b/cpp/src/arrow/util/status.cc @@ -35,4 +35,44 @@ const char* Status::CopyState(const char* state) { return result; } +std::string Status::CodeAsString() const { + if (state_ == NULL) { + return "OK"; + } + + const char* type; + switch (code()) { + case StatusCode::OK: + type = "OK"; + break; + case StatusCode::OutOfMemory: + type = "Out of memory"; + break; + case StatusCode::KeyError: + type = "Key error"; + break; + case StatusCode::Invalid: + type = "Invalid"; + break; + case StatusCode::NotImplemented: + type = "NotImplemented"; + break; + } + return std::string(type); +} + +std::string Status::ToString() const { + std::string result(CodeAsString()); + if (state_ == NULL) { + return result; + } + + result.append(": "); + + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(reinterpret_cast(state_ + 7), length); + return result; +} + } // namespace arrow diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index 08f933d3d40..61066ba369d 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -17,6 +17,7 @@ # flake8: noqa +from arrow.array import Array, from_list from arrow.schema import (bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float_, double, string, diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index d0f2af4a797..905d11d9ab2 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -30,6 +30,11 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array + property null_count: + + def __get__(self): + return 
self.sp_array.get().null_count() + def __len__(self): return self.array.length() diff --git a/python/src/pyarrow/status.cc b/python/src/pyarrow/status.cc index 36ed8c6e47c..1cd54f6a785 100644 --- a/python/src/pyarrow/status.cc +++ b/python/src/pyarrow/status.cc @@ -13,6 +13,8 @@ #include "pyarrow/status.h" #include +#include +#include namespace pyarrow { @@ -35,4 +37,56 @@ const char* Status::CopyState(const char* state) { return result; } +std::string Status::CodeAsString() const { + if (state_ == NULL) { + return "OK"; + } + + const char* type; + switch (code()) { + case StatusCode::OK: + type = "OK"; + break; + case StatusCode::OutOfMemory: + type = "Out of memory"; + break; + case StatusCode::KeyError: + type = "Key error"; + break; + case StatusCode::TypeError: + type = "Value error"; + break; + case StatusCode::ValueError: + type = "Value error"; + break; + case StatusCode::IOError: + type = "IO error"; + break; + case StatusCode::NotImplemented: + type = "Not implemented"; + break; + case StatusCode::ArrowError: + type = "Arrow C++ error"; + break; + case StatusCode::UnknownError: + type = "Unknown error"; + break; + } + return std::string(type); +} + +std::string Status::ToString() const { + std::string result(CodeAsString()); + if (state_ == NULL) { + return result; + } + + result.append(": "); + + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(reinterpret_cast(state_ + 7), length); + return result; +} + } // namespace pyarrow From 07c1379657912ef8a8b69cc1d5b9f5993fe35205 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 19:43:01 -0800 Subject: [PATCH 12/21] Move some bits from arrow/type.h to type.cc --- cpp/src/arrow/type.cc | 51 ++++++++++++++++++++++++++++++----------- cpp/src/arrow/type.h | 29 ++++++----------------- python/arrow/schema.pyx | 8 +++---- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 7c0210e2498..cfed238dc41 100644 --- 
a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -28,19 +28,30 @@ std::string Field::ToString() const { return ss.str(); } -const std::shared_ptr NA = std::make_shared(); -const std::shared_ptr BOOL = std::make_shared(); -const std::shared_ptr UINT8 = std::make_shared(); -const std::shared_ptr UINT16 = std::make_shared(); -const std::shared_ptr UINT32 = std::make_shared(); -const std::shared_ptr UINT64 = std::make_shared(); -const std::shared_ptr INT8 = std::make_shared(); -const std::shared_ptr INT16 = std::make_shared(); -const std::shared_ptr INT32 = std::make_shared(); -const std::shared_ptr INT64 = std::make_shared(); -const std::shared_ptr FLOAT = std::make_shared(); -const std::shared_ptr DOUBLE = std::make_shared(); -const std::shared_ptr STRING = std::make_shared(); +DataType::~DataType() {} + +template +inline std::string PrimitiveType::ToString() const { + std::string result(static_cast(this)->name()); + if (!nullable) { + result.append(" not null"); + } + return result; +} + +StringType::StringType(bool nullable) + : DataType(LogicalType::STRING, nullable) {} + +StringType::StringType(const StringType& other) + : StringType(other.nullable) {} + +std::string StringType::ToString() const { + std::string result(name()); + if (!nullable) { + result.append(" not null"); + } + return result; +} std::string ListType::ToString() const { std::stringstream s; @@ -64,4 +75,18 @@ std::string StructType::ToString() const { return s.str(); } +const std::shared_ptr NA = std::make_shared(); +const std::shared_ptr BOOL = std::make_shared(); +const std::shared_ptr UINT8 = std::make_shared(); +const std::shared_ptr UINT16 = std::make_shared(); +const std::shared_ptr UINT32 = std::make_shared(); +const std::shared_ptr UINT64 = std::make_shared(); +const std::shared_ptr INT8 = std::make_shared(); +const std::shared_ptr INT16 = std::make_shared(); +const std::shared_ptr INT32 = std::make_shared(); +const std::shared_ptr INT64 = std::make_shared(); +const 
std::shared_ptr FLOAT = std::make_shared(); +const std::shared_ptr DOUBLE = std::make_shared(); +const std::shared_ptr STRING = std::make_shared(); + } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 2890e02cd89..e078e2e656b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -139,7 +139,7 @@ struct DataType { type(type), nullable(nullable) {} - virtual ~DataType() {} + virtual ~DataType(); bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses @@ -218,13 +218,7 @@ struct PrimitiveType : public DataType { explicit PrimitiveType(bool nullable = true) : DataType(Derived::type_enum, nullable) {} - virtual std::string ToString() const { - std::string result(static_cast(this)->name()); - if (!nullable) { - result.append(" not null"); - } - return result; - } + std::string ToString() const override; }; #define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ @@ -294,34 +288,25 @@ struct ListType : public DataType { explicit ListType(const TypePtr& value_type, bool nullable = true) : DataType(LogicalType::LIST, nullable), value_type(value_type) {} - virtual ~ListType() {} static char const *name() { return "list"; } - virtual std::string ToString() const; + std::string ToString() const override; }; // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - explicit StringType(bool nullable = true) - : DataType(LogicalType::STRING, nullable) {} + explicit StringType(bool nullable = true); - StringType(const StringType& other) - : StringType() {} + StringType(const StringType& other); static char const *name() { return "string"; } - virtual std::string ToString() const { - std::string result(name()); - if (!nullable) { - result.append(" not null"); - } - return result; - } + std::string ToString() const override; }; struct StructType : public DataType { @@ -341,7 +326,7 @@ struct StructType : public DataType { return fields_.size(); } 
- virtual std::string ToString() const; + std::string ToString() const override; }; extern const std::shared_ptr NA; diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index c5bb61a3e6f..04e233a9e7b 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -35,12 +35,12 @@ cdef class DataType: self.sp_type = type self.type = type.get() - def __repr__(self): - return 'DataType({0})'.format(self._type_repr()) - - def _type_repr(self): + def __str__(self): return frombytes(self.type.ToString()) + def __repr__(self): + return 'DataType({0})'.format(str(self)) + cdef class Field: def __cinit__(self, object name, DataType type): From 47fd78eb1c7cc8c35c880126da33d1661601dea6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 19:43:39 -0800 Subject: [PATCH 13/21] Add unit test stub --- python/arrow/tests/test_convert_builtin.py | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 python/arrow/tests/test_convert_builtin.py diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py new file mode 100644 index 00000000000..f5e83159342 --- /dev/null +++ b/python/arrow/tests/test_convert_builtin.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.compat import unittest +import arrow + + +class TestConvertList(unittest.TestCase): + + def test_list_convert(self): + pass From edb451cd84338f61a5ca1d074904f34ce1cce4c9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 19:50:25 -0800 Subject: [PATCH 14/21] Add a few data type smoke tests --- python/arrow/__init__.py | 2 +- python/arrow/schema.pyx | 6 ++-- python/arrow/tests/test_schema.py | 51 +++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 python/arrow/tests/test_schema.py diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index 61066ba369d..41bc8971d39 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -21,5 +21,5 @@ from arrow.schema import (bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float_, double, string, - list_, struct, + list_, struct, field, DataType, Field, Schema) diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index 04e233a9e7b..ffb159ab3d4 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -49,8 +49,7 @@ cdef class Field: self.field = self.sp_field.get() def __repr__(self): - return 'Field({0}, type={1})'.format(self.name, - self.type._type_repr()) + return 'Field({0!r}, type={1})'.format(self.name, str(self.type)) property name: @@ -72,6 +71,9 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True): #------------------------------------------------------------ # Type factory functions +def field(name, type): + return Field(name, type) + def bool_(c_bool nullable=True): return primitive_type(LogicalType_BOOL, nullable) diff --git a/python/arrow/tests/test_schema.py b/python/arrow/tests/test_schema.py new file mode 100644 index 00000000000..a89edd74a0a --- /dev/null +++ b/python/arrow/tests/test_schema.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software 
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.compat import unittest +import arrow + + +class TestTypes(unittest.TestCase): + + def test_integers(self): + dtypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64'] + + for name in dtypes: + factory = getattr(arrow, name) + t = factory() + t_required = factory(False) + + assert str(t) == name + assert str(t_required) == '{0} not null'.format(name) + + def test_list(self): + value_type = arrow.int32() + list_type = arrow.list_(value_type) + assert str(list_type) == 'list' + + def test_string(self): + t = arrow.string() + assert str(t) == 'string' + + def test_field(self): + t = arrow.string() + f = arrow.field('foo', t) + + assert f.name == 'foo' + assert f.type is t + assert repr(f) == "Field('foo', type=string)" From b5b5b825b7da84ad66378700cd451e4f28f03004 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 19:58:14 -0800 Subject: [PATCH 15/21] Failing test stubs, raise on null array --- python/arrow/array.pyx | 10 ++++++++-- python/arrow/tests/test_convert_builtin.py | 16 +++++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 905d11d9ab2..cf396da0ef9 100644 --- 
a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -95,9 +95,15 @@ cdef dict _array_classes = { } cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): - cdef LogicalType type = sp_array.get().type().get().type + if sp_array.get() == NULL: + raise ValueError('Array was NULL') - cdef Array arr = _array_classes[type]() + cdef CDataType* data_type = sp_array.get().type().get() + + if data_type == NULL: + raise ValueError('Array data type was NULL') + + cdef Array arr = _array_classes[data_type.type]() arr.init(sp_array) return arr diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index f5e83159342..f88e3cdad5e 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -21,5 +21,19 @@ class TestConvertList(unittest.TestCase): - def test_list_convert(self): + def test_boolean(self): + pass + + def test_integer(self): + arr = arrow.from_list([1, 2, 3]) + assert len(arr) == 3 + assert arr.type == arrow.int64() + + def test_double(self): + pass + + def test_string(self): + pass + + def test_list_of_int(self): pass From 731544a04d7d45d23a307e33c60d285292286111 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 20:10:44 -0800 Subject: [PATCH 16/21] Move PrimitiveType::ToString template back to type.h --- cpp/src/arrow/type.cc | 9 --------- cpp/src/arrow/type.h | 9 +++++++++ python/arrow/__init__.py | 3 ++- python/arrow/array.pyx | 5 +++++ python/arrow/includes/arrow.pxd | 2 ++ python/arrow/schema.pyx | 3 +++ python/arrow/tests/test_convert_builtin.py | 5 +++++ python/src/pyarrow/adapters/builtin.cc | 8 ++++++++ 8 files changed, 34 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index cfed238dc41..265770822ce 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -30,15 +30,6 @@ std::string Field::ToString() const { DataType::~DataType() {} -template -inline std::string 
PrimitiveType::ToString() const { - std::string result(static_cast(this)->name()); - if (!nullable) { - result.append(" not null"); - } - return result; -} - StringType::StringType(bool nullable) : DataType(LogicalType::STRING, nullable) {} diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index e078e2e656b..e78e4949119 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -221,6 +221,15 @@ struct PrimitiveType : public DataType { std::string ToString() const override; }; +template +inline std::string PrimitiveType::ToString() const { + std::string result(static_cast(this)->name()); + if (!nullable) { + result.append(" not null"); + } + return result; +} + #define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ typedef C_TYPE c_type; \ static constexpr LogicalType::type type_enum = LogicalType::ENUM; \ diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index 41bc8971d39..e59c6fda40b 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -18,7 +18,8 @@ # flake8: noqa from arrow.array import Array, from_list -from arrow.schema import (bool_, int8, int16, int32, int64, +from arrow.schema import (null, bool_, + int8, int16, int32, int64, uint8, uint16, uint32, uint64, float_, double, string, list_, struct, field, diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index cf396da0ef9..2845c9bdec1 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -39,6 +39,10 @@ cdef class Array: return self.array.length() +cdef class NullArray(Array): + pass + + cdef class BooleanArray(Array): pass @@ -88,6 +92,7 @@ cdef class StringArray(Array): cdef dict _array_classes = { + LogicalType_NA: NullArray, LogicalType_BOOL: BooleanArray, LogicalType_INT64: Int64Array, LogicalType_LIST: ListArray, diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index 666bafc3dad..a1a8c25467a 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -22,6 +22,8 @@ from 
arrow.includes.common cimport * cdef extern from "arrow/api.h" namespace "arrow" nogil: enum LogicalType" arrow::LogicalType::type": + LogicalType_NA" arrow::LogicalType::NA" + LogicalType_BOOL" arrow::LogicalType::BOOL" LogicalType_UINT8" arrow::LogicalType::UINT8" diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index ffb159ab3d4..d22c3937efc 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -74,6 +74,9 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True): def field(name, type): return Field(name, type) +def null(): + return primitive_type(LogicalType_NA) + def bool_(c_bool nullable=True): return primitive_type(LogicalType_BOOL, nullable) diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index f88e3cdad5e..68875d5b39d 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -24,6 +24,11 @@ class TestConvertList(unittest.TestCase): def test_boolean(self): pass + def test_empty_list(self): + arr = arrow.from_list([]) + assert len(arr) == 0 + assert arr.type == arrow.null() + def test_integer(self): arr = arrow.from_list([1, 2, 3]) assert len(arr) == 3 diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index fb3b07ccffc..e8429fd4912 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -229,6 +229,12 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { int64_t size; RETURN_NOT_OK(InferArrowType(obj, &size, &type)); + // Handle NA / NullType case + if (type->type == LogicalType::NA) { + out->reset(new arrow::Array(type, size)); + return Status::OK(); + } + std::shared_ptr converter = GetConverter(type); if (converter == nullptr) { std::stringstream ss; @@ -244,6 +250,8 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { RETURN_NOT_OK(converter->AppendData(obj)); + *out = builder->Finish(); + return 
Status::OK(); } From c28bf09513edba9d63f31597b00a9a96cb36e0b4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 21:03:06 -0800 Subject: [PATCH 17/21] Build array successfully, without validating contents --- python/arrow/array.pxd | 5 +++ python/arrow/array.pyx | 4 +- python/arrow/includes/arrow.pxd | 2 + python/arrow/schema.pyx | 11 +++++ python/arrow/tests/test_convert_builtin.py | 12 +++++- python/setup.py | 5 ++- python/src/pyarrow/adapters/builtin.cc | 29 +++++++++---- python/src/pyarrow/common.h | 50 ++++++++++++++++------ 8 files changed, 93 insertions(+), 25 deletions(-) diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd index 2274f55262e..1abb4fe5855 100644 --- a/python/arrow/array.pxd +++ b/python/arrow/array.pxd @@ -18,10 +18,15 @@ from arrow.includes.common cimport shared_ptr from arrow.includes.arrow cimport CArray, LogicalType +from arrow.schema cimport DataType + cdef class Array: cdef: shared_ptr[CArray] sp_array + cdef readonly: + DataType type + cdef init(self, const shared_ptr[CArray]& sp_array) diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 2845c9bdec1..3f1efe79be1 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -29,6 +29,8 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array + self.type = DataType() + self.type.init(self.sp_array.get().type()) property null_count: @@ -36,7 +38,7 @@ cdef class Array: return self.sp_array.get().null_count() def __len__(self): - return self.array.length() + return self.sp_array.get().length() cdef class NullArray(Array): diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index a1a8c25467a..a67c3bf5e0a 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -47,6 +47,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: LogicalType type c_bool nullable + c_bool Equals(const CDataType* other) + c_string ToString() cdef cppclass CListType" 
arrow::ListType"(CDataType): diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx index d22c3937efc..63cd6e888ab 100644 --- a/python/arrow/schema.pyx +++ b/python/arrow/schema.pyx @@ -26,6 +26,8 @@ from arrow.compat import frombytes, tobytes from arrow.includes.arrow cimport * cimport arrow.includes.pyarrow as pyarrow +cimport cpython + cdef class DataType: def __cinit__(self): @@ -41,6 +43,15 @@ cdef class DataType: def __repr__(self): return 'DataType({0})'.format(str(self)) + def __richcmp__(DataType self, DataType other, int op): + if op == cpython.Py_EQ: + return self.type.Equals(other.type) + elif op == cpython.Py_NE: + return not self.type.Equals(other.type) + else: + raise TypeError('Invalid comparison') + + cdef class Field: def __cinit__(self, object name, DataType type): diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index 68875d5b39d..1a926acf24d 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -27,11 +27,19 @@ def test_boolean(self): def test_empty_list(self): arr = arrow.from_list([]) assert len(arr) == 0 + assert arr.null_count == 0 + assert arr.type == arrow.null() + + def test_all_none(self): + arr = arrow.from_list([None, None]) + assert len(arr) == 2 + assert arr.null_count == 2 assert arr.type == arrow.null() def test_integer(self): - arr = arrow.from_list([1, 2, 3]) - assert len(arr) == 3 + arr = arrow.from_list([1, None, 3, None]) + assert len(arr) == 4 + assert arr.null_count == 2 assert arr.type == arrow.int64() def test_double(self): diff --git a/python/setup.py b/python/setup.py index d7338a97d9d..2da24297051 100644 --- a/python/setup.py +++ b/python/setup.py @@ -124,7 +124,10 @@ def _run_cmake(self): static_lib_option, source] self.spawn(cmake_command) - self.spawn(['make']) + args = ['make'] + if 'PYARROW_PARALLEL' in os.environ: + args.append('-j{0}'.format(os.environ['PYARROW_PARALLEL'])) + self.spawn(args) else: 
import shlex cmake_generator = 'Visual Studio 14 2015' diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index e8429fd4912..d5560594f85 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -123,7 +123,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size, // TODO(wesm): inferring types for collections return Status::NotImplemented("No type inference for collections"); } else { - inferer.Visit(obj); + inferer.Visit(item); } } @@ -139,7 +139,7 @@ class SeqConverter { return Status::OK(); } - virtual Status AppendData(PyObject* seq) = 0; + virtual Status AppendData(PyObject* seq, int64_t size) = 0; protected: std::shared_ptr builder_; @@ -160,28 +160,39 @@ class TypedConverter : public SeqConverter { class BoolConverter : public TypedConverter { public: - Status AppendData(PyObject* obj) override { + Status AppendData(PyObject* seq, int64_t size) override { return Status::OK(); } }; class Int64Converter : public TypedConverter { public: - Status AppendData(PyObject* obj) override { + Status AppendData(PyObject* obj, int64_t size) override { + int64_t val; + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(obj, i)); + if (item.obj() == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + } else { + val = PyLong_AsLongLong(item.obj()); + RETURN_IF_PYERROR(); + RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + } + } return Status::OK(); } }; class DoubleConverter : public TypedConverter { public: - Status AppendData(PyObject* obj) override { + Status AppendData(PyObject* seq, int64_t size) override { return Status::OK(); } }; class StringConverter : public TypedConverter { public: - Status AppendData(PyObject* obj) override { + Status AppendData(PyObject* seq, int64_t size) override { return Status::OK(); } }; @@ -190,7 +201,7 @@ class ListConverter : public TypedConverter { public: Status Init(const std::shared_ptr& builder) override; 
- Status AppendData(PyObject* obj) override { + Status AppendData(PyObject* seq, int64_t size) override { return Status::OK(); } protected: @@ -231,7 +242,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { // Handle NA / NullType case if (type->type == LogicalType::NA) { - out->reset(new arrow::Array(type, size)); + out->reset(new arrow::Array(type, size, size)); return Status::OK(); } @@ -248,7 +259,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder)); converter->Init(builder); - RETURN_NOT_OK(converter->AppendData(obj)); + RETURN_NOT_OK(converter->AppendData(obj, size)); *out = builder->Finish(); diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index 7847912b68c..a43e4d28c89 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -26,18 +26,6 @@ namespace pyarrow { #define PYARROW_IS_PY2 PY_MAJOR_VERSION < 2 -// TODO(wesm): We can just let errors pass through. 
To be explored later -#define RETURN_IF_PYERROR() \ - if (PyErr_Occurred()) { \ - PyObject *exc_type, *exc_value, *traceback; \ - PyErr_Fetch(&exc_type, &exc_value, &traceback); \ - std::string message(PyString_AsString(exc_value)); \ - Py_DECREF(exc_type); \ - Py_DECREF(exc_value); \ - Py_DECREF(traceback); \ - return Status::UnknownError(message); \ - } - #define RETURN_ARROW_NOT_OK(s) do { \ arrow::Status _s = (s); \ if (!_s.ok()) { \ @@ -47,6 +35,8 @@ namespace pyarrow { class OwnedRef { public: + OwnedRef() : obj_(nullptr) {} + OwnedRef(PyObject* obj) : obj_(obj) {} @@ -54,6 +44,13 @@ class OwnedRef { Py_XDECREF(obj_); } + void reset(PyObject* obj) { + if (obj_ != nullptr) { + Py_XDECREF(obj_); + } + obj_ = obj; + } + PyObject* obj() const{ return obj_; } @@ -62,6 +59,35 @@ class OwnedRef { PyObject* obj_; }; +struct PyObjectStringify { + OwnedRef tmp_obj; + const char* bytes; + + PyObjectStringify(PyObject* obj) { + PyObject* bytes_obj; + if (PyUnicode_Check(obj)) { + bytes_obj = PyUnicode_AsUTF8String(obj); + tmp_obj.reset(bytes_obj); + } else { + bytes_obj = obj; + } + bytes = PyBytes_AsString(bytes_obj); + } +}; + +// TODO(wesm): We can just let errors pass through. 
To be explored later +#define RETURN_IF_PYERROR() \ + if (PyErr_Occurred()) { \ + PyObject *exc_type, *exc_value, *traceback; \ + PyErr_Fetch(&exc_type, &exc_value, &traceback); \ + PyObjectStringify stringified(exc_value); \ + std::string message(stringified.bytes); \ + Py_DECREF(exc_type); \ + Py_DECREF(exc_value); \ + Py_DECREF(traceback); \ + return Status::UnknownError(message); \ + } + arrow::MemoryPool* GetMemoryPool(); } // namespace pyarrow From 8c3891c908758632b779fc3697b532dd5261920e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 21:53:35 -0800 Subject: [PATCH 18/21] Smoke test that array garbage collection deallocates memory --- cpp/src/arrow/api.h | 3 +++ cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/types/primitive-test.cc | 27 ++++++++++++++++++++-- cpp/src/arrow/types/primitive.h | 2 ++ cpp/src/arrow/util/buffer.cc | 8 +++++++ cpp/src/arrow/util/buffer.h | 2 ++ python/arrow/__init__.py | 2 +- python/arrow/array.pyx | 6 +++++ python/arrow/includes/arrow.pxd | 3 +++ python/arrow/includes/pyarrow.pxd | 5 +++- python/arrow/tests/test_convert_builtin.py | 7 ++++++ 11 files changed, 62 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 282b9ff2c9f..c73d4b386cf 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -36,4 +36,7 @@ #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + #endif // ARROW_API_H diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index fafee91f928..8cc689c3e81 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -32,7 +32,7 @@ class Array; class MemoryPool; class PoolBuffer; -static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8; +static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5; // Base class for all data array builders class ArrayBuilder { diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 
e25729dfb67..f35a258e2cb 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -235,6 +235,29 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { } } +TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { + DECL_T(); + + int size = 10000; + + vector& draws = this->draws_; + vector& nulls = this->nulls_; + + int64_t memory_before = this->pool_->bytes_allocated(); + + this->RandomData(size); + + int i; + for (i = 0; i < size; ++i) { + ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + } + + do { + std::shared_ptr result = this->builder_->Finish(); + } while (false); + + ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); +} TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); @@ -332,11 +355,11 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { - int n = 100; - ASSERT_OK(this->builder_->Reserve(n)); + ASSERT_OK(this->builder_->Reserve(10)); ASSERT_EQ(0, this->builder_->length()); ASSERT_EQ(MIN_BUILDER_CAPACITY, this->builder_->capacity()); + ASSERT_OK(this->builder_->Reserve(90)); ASSERT_OK(this->builder_->Advance(100)); ASSERT_OK(this->builder_->Reserve(MIN_BUILDER_CAPACITY)); diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index a55ac068a3b..1073bb6e1c3 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -64,6 +64,8 @@ class PrimitiveArrayImpl : public PrimitiveArray { PrimitiveArrayImpl() : PrimitiveArray() {} + virtual ~PrimitiveArrayImpl() {} + PrimitiveArrayImpl(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 3f3807d4e20..50f4716769d 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -31,6 +31,8 @@ Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, parent_ = parent; } +Buffer::~Buffer() {} + 
std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } @@ -43,6 +45,12 @@ PoolBuffer::PoolBuffer(MemoryPool* pool) : pool_ = pool; } +PoolBuffer::~PoolBuffer() { + if (mutable_data_ != nullptr) { + pool_->Free(mutable_data_, capacity_); + } +} + Status PoolBuffer::Reserve(int64_t new_capacity) { if (!mutable_data_ || new_capacity > capacity_) { uint8_t* new_data; diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 8704723eb0a..0c3e210abd9 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -39,6 +39,7 @@ class Buffer : public std::enable_shared_from_this { Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + virtual ~Buffer(); // An offset into data that is owned by another buffer, but we want to be // able to retain a valid pointer to it even after other shared_ptr's to the @@ -136,6 +137,7 @@ class ResizableBuffer : public MutableBuffer { class PoolBuffer : public ResizableBuffer { public: explicit PoolBuffer(MemoryPool* pool = nullptr); + virtual ~PoolBuffer(); virtual Status Resize(int64_t new_size); virtual Status Reserve(int64_t new_capacity); diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index e59c6fda40b..f3b8f4659af 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -17,7 +17,7 @@ # flake8: noqa -from arrow.array import Array, from_list +from arrow.array import Array, from_list, total_allocated_bytes from arrow.schema import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 3f1efe79be1..3d6df4965d0 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -25,6 +25,12 @@ cimport arrow.includes.pyarrow as pyarrow from arrow.compat import frombytes, tobytes from arrow.error cimport check_status + +def total_allocated_bytes(): + cdef MemoryPool* pool = pyarrow.GetMemoryPool() + return 
pool.bytes_allocated() + + cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index a67c3bf5e0a..fde5de91091 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -51,6 +51,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string ToString() + cdef cppclass MemoryPool" arrow::MemoryPool": + int64_t bytes_allocated() + cdef cppclass CListType" arrow::ListType"(CDataType): CListType(const shared_ptr[CDataType]& value_type, c_bool nullable) diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd index 165d1e7f63e..3eed5b85424 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/arrow/includes/pyarrow.pxd @@ -18,7 +18,8 @@ # distutils: language = c++ from arrow.includes.common cimport * -from arrow.includes.arrow cimport CArray, CDataType, LogicalType +from arrow.includes.arrow cimport (CArray, CDataType, LogicalType, + MemoryPool) cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed @@ -40,3 +41,5 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable) Status ConvertPySequence(object obj, shared_ptr[CArray]* out) + + MemoryPool* GetMemoryPool() diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index 1a926acf24d..731ed2c0eb7 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -42,6 +42,13 @@ def test_integer(self): assert arr.null_count == 2 assert arr.type == arrow.int64() + def test_garbage_collection(self): + import gc + bytes_before = arrow.total_allocated_bytes() + arrow.from_list([1, None, 3, None]) + gc.collect() + assert arrow.total_allocated_bytes() == bytes_before + def test_double(self): pass From 
b02b2967911a2deb806865dc4f0aa040b48c1a88 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 6 Mar 2016 23:52:56 -0800 Subject: [PATCH 19/21] Type inference for lists and lists-of-lists --- python/CMakeLists.txt | 1 + python/arrow/__init__.py | 10 +- python/arrow/array.pxd | 7 ++ python/arrow/array.pyx | 40 ++++++- python/arrow/error.pyx | 4 +- python/arrow/scalar.pxd | 18 ++- python/arrow/scalar.pyx | 28 +++++ python/arrow/tests/test_array.py | 26 +++++ python/arrow/tests/test_convert_builtin.py | 28 ++++- python/setup.py | 2 +- python/src/pyarrow/adapters/builtin.cc | 129 +++++++++++++++++---- 11 files changed, 259 insertions(+), 34 deletions(-) create mode 100644 python/arrow/scalar.pyx create mode 100644 python/arrow/tests/test_array.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 72b8c607f93..8fdd829010e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -429,6 +429,7 @@ set(CYTHON_EXTENSIONS config error parquet + scalar schema ) diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index f3b8f4659af..3c049b85e8c 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -17,7 +17,15 @@ # flake8: noqa -from arrow.array import Array, from_list, total_allocated_bytes +from arrow.array import (Array, from_pylist, total_allocated_bytes, + BooleanArray, NumericArray, + Int8Array, UInt8Array, + ListArray, StringArray) + +from arrow.error import ArrowException + +from arrow.scalar import ArrayValue, NA, Scalar + from arrow.schema import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd index 1abb4fe5855..e32d27769b5 100644 --- a/python/arrow/array.pxd +++ b/python/arrow/array.pxd @@ -18,16 +18,23 @@ from arrow.includes.common cimport shared_ptr from arrow.includes.arrow cimport CArray, LogicalType +from arrow.scalar import NA + from arrow.schema cimport DataType +cdef extern from "Python.h": + int PySlice_Check(object) + 
cdef class Array: cdef: shared_ptr[CArray] sp_array + CArray* ap cdef readonly: DataType type cdef init(self, const shared_ptr[CArray]& sp_array) + cdef _getitem(self, int i) cdef class BooleanArray(Array): diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 3d6df4965d0..15639f7b4eb 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -25,6 +25,7 @@ cimport arrow.includes.pyarrow as pyarrow from arrow.compat import frombytes, tobytes from arrow.error cimport check_status +from arrow.scalar import NA def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.GetMemoryPool() @@ -35,6 +36,7 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array + self.ap = sp_array.get() self.type = DataType() self.type.init(self.sp_array.get().type()) @@ -46,6 +48,42 @@ cdef class Array: def __len__(self): return self.sp_array.get().length() + def isnull(self): + raise NotImplemented + + def __getitem__(self, key): + cdef: + Py_ssize_t n = len(self) + + if PySlice_Check(key): + start = key.start or 0 + while start < 0: + start += n + + stop = key.stop if key.stop is not None else n + while stop < 0: + stop += n + + step = key.step or 1 + if step != 1: + raise NotImplementedError + else: + return self.slice(start, stop) + + while key < 0: + key += len(self) + + if self.ap.IsNull(key): + return NA + else: + return self._getitem(key) + + cdef _getitem(self, int i): + raise NotImplementedError + + def slice(self, start, end): + pass + cdef class NullArray(Array): pass @@ -121,7 +159,7 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): return arr -def from_list(object list_obj, type=None): +def from_pylist(object list_obj, type=None): """ Convert Python list to Arrow array """ diff --git a/python/arrow/error.pyx b/python/arrow/error.pyx index 7c301e5d872..f1d51635881 100644 --- a/python/arrow/error.pyx +++ b/python/arrow/error.pyx @@ -17,6 +17,8 @@ from arrow.includes.common cimport c_string +from 
arrow.compat import frombytes + class ArrowException(Exception): pass @@ -25,4 +27,4 @@ cdef check_status(const Status& status): return cdef c_string c_message = status.ToString() - return ArrowException(c_message) + raise ArrowException(frombytes(c_message)) diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd index 671490ee3bf..e193c09cd69 100644 --- a/python/arrow/scalar.pxd +++ b/python/arrow/scalar.pxd @@ -20,20 +20,28 @@ from arrow.includes.arrow cimport CArray, CListArray from arrow.schema cimport DataType -cdef class ScalarValue: +cdef class Scalar: cdef readonly: + DataType type + + +cdef class NAType(Scalar): + pass + + +cdef class ArrayValue(Scalar): + cdef: shared_ptr[CArray] array int index - DataType type -cdef class Int8Value: +cdef class Int8Value(ArrayValue): pass -cdef class ListValue: +cdef class ListValue(ArrayValue): pass -cdef class StringValue: +cdef class StringValue(ArrayValue): pass diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx new file mode 100644 index 00000000000..78dadecf9b4 --- /dev/null +++ b/python/arrow/scalar.pyx @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import arrow.schema as schema + +cdef class NAType(Scalar): + + def __cinit__(self): + self.type = schema.null() + + def __repr__(self): + return 'NA' + +NA = NAType() diff --git a/python/arrow/tests/test_array.py b/python/arrow/tests/test_array.py new file mode 100644 index 00000000000..8eaa5335206 --- /dev/null +++ b/python/arrow/tests/test_array.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from arrow.compat import unittest +import arrow + + +class TestArrayAPI(unittest.TestCase): + + def test_getitem_NA(self): + arr = arrow.from_pylist([1, None, 2]) + assert arr[1] is arrow.NA diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index 731ed2c0eb7..bc30191d90f 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -25,19 +25,19 @@ def test_boolean(self): pass def test_empty_list(self): - arr = arrow.from_list([]) + arr = arrow.from_pylist([]) assert len(arr) == 0 assert arr.null_count == 0 assert arr.type == arrow.null() def test_all_none(self): - arr = arrow.from_list([None, None]) + arr = arrow.from_pylist([None, None]) assert len(arr) == 2 assert arr.null_count == 2 assert arr.type == arrow.null() def test_integer(self): - arr = arrow.from_list([1, None, 3, None]) + arr = arrow.from_pylist([1, None, 3, None]) assert len(arr) == 4 assert arr.null_count == 2 assert arr.type == arrow.int64() @@ -45,7 +45,7 @@ def test_integer(self): def test_garbage_collection(self): import gc bytes_before = arrow.total_allocated_bytes() - arrow.from_list([1, None, 3, None]) + arrow.from_pylist([1, None, 3, None]) gc.collect() assert arrow.total_allocated_bytes() == bytes_before @@ -56,4 +56,22 @@ def test_string(self): pass def test_list_of_int(self): - pass + data = [[1, 2, 3], [], None, [1, 2]] + arr = arrow.from_pylist(data) + # assert len(arr) == 4 + # assert arr.null_count == 1 + assert arr.type == arrow.list_(arrow.int64()) + + def test_mixed_nesting_levels(self): + arrow.from_pylist([1, 2, None]) + arrow.from_pylist([[1], [2], None]) + arrow.from_pylist([[1], [2], [None]]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([1, 2, [1]]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([1, 2, []]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([[1], [2], [None, [1]]]) diff --git a/python/setup.py 
b/python/setup.py index 2da24297051..9a0de071a9c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -210,7 +210,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['array', 'config', 'error', 'parquet', 'schema'] + return ['array', 'config', 'error', 'parquet', 'scalar', 'schema'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index d5560594f85..234f126a875 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -46,9 +46,10 @@ static inline bool IsPyBaseString(PyObject* obj) { #endif } -class ScalarTypeInfer { +class ScalarVisitor { public: - ScalarTypeInfer() : + ScalarVisitor() : + total_count_(0), none_count_(0), bool_count_(0), int_count_(0), @@ -56,6 +57,7 @@ class ScalarTypeInfer { string_count_(0) {} void Visit(PyObject* obj) { + ++total_count_; if (obj == Py_None) { ++none_count_; } else if (PyFloat_Check(obj)) { @@ -85,7 +87,12 @@ class ScalarTypeInfer { } } + int64_t total_count() const { + return total_count_; + } + private: + int64_t total_count_; int64_t none_count_; int64_t bool_count_; int64_t int_count_; @@ -96,6 +103,100 @@ class ScalarTypeInfer { // std::vector errors_; }; +static constexpr int MAX_NESTING_LEVELS = 32; + +class SeqVisitor { + public: + SeqVisitor() : + max_nesting_level_(0) { + memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); + } + + Status Visit(PyObject* obj, int level=0) { + Py_ssize_t size = PySequence_Size(obj); + + if (level > max_nesting_level_) { + max_nesting_level_ = level; + } + + for (int64_t i = 0; i < size; ++i) { + // TODO(wesm): Error checking? + // TODO(wesm): Specialize for PyList_GET_ITEM? 
+ OwnedRef item_ref(PySequence_GetItem(obj, i)); + PyObject* item = item_ref.obj(); + + if (PyList_Check(item)) { + PY_RETURN_NOT_OK(Visit(item, level + 1)); + } else if (PyDict_Check(item)) { + return Status::NotImplemented("No type inference for dicts"); + } else { + // We permit nulls at any level of nesting + if (item == Py_None) { + // TODO + } else { + ++nesting_histogram_[level]; + scalars_.Visit(item); + } + } + } + return Status::OK(); + } + + std::shared_ptr GetType() { + if (scalars_.total_count() == 0) { + if (max_nesting_level_ == 0) { + return arrow::NA; + } else { + return nullptr; + } + } else { + std::shared_ptr result = scalars_.GetType(); + for (int i = 0; i < max_nesting_level_; ++i) { + result = std::make_shared(result); + } + return result; + } + } + + Status Validate() const { + if (scalars_.total_count() > 0) { + if (num_nesting_levels() > 1) { + return Status::ValueError("Mixed nesting levels not supported"); + } else if (max_observed_level() < max_nesting_level_) { + return Status::ValueError("Mixed nesting levels not supported"); + } + } + return Status::OK(); + } + + int max_observed_level() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { + result = i; + } + } + return result; + } + + int num_nesting_levels() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { + ++result; + } + } + return result; + } + + private: + ScalarVisitor scalars_; + + // Track observed + int max_nesting_level_; + int nesting_histogram_[MAX_NESTING_LEVELS]; +}; + // Non-exhaustive type inference static Status InferArrowType(PyObject* obj, int64_t* size, std::shared_ptr* out_type) { @@ -111,23 +212,11 @@ static Status InferArrowType(PyObject* obj, int64_t* size, *out_type = arrow::NA; } - ScalarTypeInfer inferer; - - for (int64_t i = 0; i < *size; ++i) { - // TODO(wesm): Error checking? - // TODO(wesm): Specialize for PyList_GET_ITEM? 
- OwnedRef item_ref(PySequence_GetItem(obj, i)); - PyObject* item = item_ref.obj(); - - if (PyList_Check(item) || PyDict_Check(item)) { - // TODO(wesm): inferring types for collections - return Status::NotImplemented("No type inference for collections"); - } else { - inferer.Visit(item); - } - } + SeqVisitor seq_visitor; + PY_RETURN_NOT_OK(seq_visitor.Visit(obj)); + PY_RETURN_NOT_OK(seq_visitor.Validate()); - *out_type = inferer.GetType(); + *out_type = seq_visitor.GetType(); return Status::OK(); } @@ -238,7 +327,7 @@ Status ListConverter::Init(const std::shared_ptr& builder) { Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { std::shared_ptr type; int64_t size; - RETURN_NOT_OK(InferArrowType(obj, &size, &type)); + PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type)); // Handle NA / NullType case if (type->type == LogicalType::NA) { @@ -259,7 +348,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder)); converter->Init(builder); - RETURN_NOT_OK(converter->AppendData(obj, size)); + PY_RETURN_NOT_OK(converter->AppendData(obj, size)); *out = builder->Finish(); From 1d4618ba9f28909acdaf208a3c55b1f3e4db9505 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Mar 2016 11:00:57 -0800 Subject: [PATCH 20/21] Prototype string and double converters --- cpp/src/arrow/types/string.h | 9 ++--- python/arrow/array.pyx | 9 +++++ python/arrow/tests/test_convert_builtin.py | 12 +++++-- python/src/pyarrow/adapters/builtin.cc | 41 ++++++++++++++++++++-- 4 files changed, 63 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index a4d1522210f..8ccc0a9698a 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -136,12 +136,13 @@ class StringBuilder : public ListBuilder { } Status Append(const std::string& value) { - RETURN_NOT_OK(ListBuilder::Append()); - return 
byte_builder_->Append(reinterpret_cast(value.c_str()), - value.size()); + return Append(value.c_str(), value.size()); } - Status Append(const uint8_t* value, int32_t length); + Status Append(const char* value, int32_t length) { + RETURN_NOT_OK(ListBuilder::Append()); + return byte_builder_->Append(reinterpret_cast(value), length); + } Status Append(const std::vector& values, uint8_t* null_bytes); diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 15639f7b4eb..3a3210d6cc1 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -129,6 +129,14 @@ cdef class UInt64Array(NumericArray): pass +cdef class FloatArray(NumericArray): + pass + + +cdef class DoubleArray(NumericArray): + pass + + cdef class ListArray(Array): pass @@ -141,6 +149,7 @@ cdef dict _array_classes = { LogicalType_NA: NullArray, LogicalType_BOOL: BooleanArray, LogicalType_INT64: Int64Array, + LogicalType_DOUBLE: DoubleArray, LogicalType_LIST: ListArray, LogicalType_STRING: StringArray, } diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index bc30191d90f..d651fbe357a 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -50,10 +50,18 @@ def test_garbage_collection(self): assert arrow.total_allocated_bytes() == bytes_before def test_double(self): - pass + data = [1.5, 1, None, 2.5, None, None] + arr = arrow.from_pylist(data) + assert len(arr) == 6 + assert arr.null_count == 3 + assert arr.type == arrow.double() def test_string(self): - pass + data = ['foo', b'bar', None, 'arrow'] + arr = arrow.from_pylist(data) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == arrow.string() def test_list_of_int(self): data = [[1, 2, 3], [], None, [1, 2]] diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 234f126a875..0b689113ad4 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ 
b/python/src/pyarrow/adapters/builtin.cc @@ -256,10 +256,10 @@ class BoolConverter : public TypedConverter { class Int64Converter : public TypedConverter { public: - Status AppendData(PyObject* obj, int64_t size) override { + Status AppendData(PyObject* seq, int64_t size) override { int64_t val; for (int64_t i = 0; i < size; ++i) { - OwnedRef item(PySequence_GetItem(obj, i)); + OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); } else { @@ -275,6 +275,17 @@ class Int64Converter : public TypedConverter { class DoubleConverter : public TypedConverter { public: Status AppendData(PyObject* seq, int64_t size) override { + int64_t val; + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + } else { + val = PyFloat_AsDouble(item.obj()); + RETURN_IF_PYERROR(); + RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + } + } return Status::OK(); } }; @@ -282,6 +293,32 @@ class DoubleConverter : public TypedConverter { class StringConverter : public TypedConverter { public: Status AppendData(PyObject* seq, int64_t size) override { + PyObject* item; + PyObject* bytes_obj; + OwnedRef tmp; + const char* bytes; + int32_t length; + for (int64_t i = 0; i < size; ++i) { + item = PySequence_GetItem(seq, i); + OwnedRef holder(item); + + if (item == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + continue; + } else if (PyUnicode_Check(item)) { + tmp.reset(PyUnicode_AsUTF8String(item)); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + } else if (PyBytes_Check(item)) { + bytes_obj = item; + } else { + return Status::TypeError("Non-string value encountered"); + } + // No error checking + length = PyBytes_GET_SIZE(bytes_obj); + bytes = PyBytes_AS_STRING(bytes_obj); + RETURN_ARROW_NOT_OK(typed_builder_->Append(bytes, length)); + } return Status::OK(); } }; From 
234554164a29e84350d43ad986926cbd59a9da4d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Mar 2016 11:33:08 -0800 Subject: [PATCH 21/21] Test basic conversion of nested lists --- cpp/src/arrow/types/list-test.cc | 2 +- cpp/src/arrow/types/list.h | 4 ++- python/arrow/tests/test_convert_builtin.py | 14 ++++----- python/src/pyarrow/adapters/builtin.cc | 36 +++++++++++++++++----- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 516008b7763..02991de2648 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -116,7 +116,7 @@ TEST_F(TestListBuilder, TestBasics) { vector lengths = {3, 0, 4}; vector is_null = {0, 1, 0}; - Int32Builder* vb = static_cast(builder_->value_builder()); + Int32Builder* vb = static_cast(builder_->value_builder().get()); int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index cdd1e5a0b1c..f40a8245362 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -179,7 +179,9 @@ class ListBuilder : public Int32Builder { return Append(true); } - ArrayBuilder* value_builder() const { return value_builder_.get();} + const std::shared_ptr& value_builder() const { + return value_builder_; + } protected: std::shared_ptr value_builder_; diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index d651fbe357a..57e6ab9f0e7 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -63,13 +63,6 @@ def test_string(self): assert arr.null_count == 1 assert arr.type == arrow.string() - def test_list_of_int(self): - data = [[1, 2, 3], [], None, [1, 2]] - arr = arrow.from_pylist(data) - # assert len(arr) == 4 - # assert arr.null_count == 1 - assert arr.type == arrow.list_(arrow.int64()) - def test_mixed_nesting_levels(self): arrow.from_pylist([1, 2, 
None]) arrow.from_pylist([[1], [2], None]) @@ -83,3 +76,10 @@ def test_mixed_nesting_levels(self): with self.assertRaises(arrow.ArrowException): arrow.from_pylist([[1], [2], [None, [1]]]) + + def test_list_of_int(self): + data = [[1, 2, 3], [], None, [1, 2]] + arr = arrow.from_pylist(data) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == arrow.list_(arrow.int64()) diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 0b689113ad4..ae84fa12b0d 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -228,7 +228,7 @@ class SeqConverter { return Status::OK(); } - virtual Status AppendData(PyObject* seq, int64_t size) = 0; + virtual Status AppendData(PyObject* seq) = 0; protected: std::shared_ptr builder_; @@ -249,15 +249,16 @@ class TypedConverter : public SeqConverter { class BoolConverter : public TypedConverter { public: - Status AppendData(PyObject* seq, int64_t size) override { + Status AppendData(PyObject* seq) override { return Status::OK(); } }; class Int64Converter : public TypedConverter { public: - Status AppendData(PyObject* seq, int64_t size) override { + Status AppendData(PyObject* seq) override { int64_t val; + Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -274,8 +275,9 @@ class Int64Converter : public TypedConverter { class DoubleConverter : public TypedConverter { public: - Status AppendData(PyObject* seq, int64_t size) override { + Status AppendData(PyObject* seq) override { int64_t val; + Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -292,12 +294,13 @@ class DoubleConverter : public TypedConverter { class StringConverter : public TypedConverter { public: - Status AppendData(PyObject* seq, int64_t size) override { + Status 
AppendData(PyObject* seq) override { PyObject* item; PyObject* bytes_obj; OwnedRef tmp; const char* bytes; int32_t length; + Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { item = PySequence_GetItem(seq, i); OwnedRef holder(item); @@ -327,7 +330,17 @@ class ListConverter : public TypedConverter { public: Status Init(const std::shared_ptr& builder) override; - Status AppendData(PyObject* seq, int64_t size) override { + Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + } else { + typed_builder_->Append(); + PY_RETURN_NOT_OK(value_converter_->AppendData(item.obj())); + } + } return Status::OK(); } protected: @@ -357,7 +370,14 @@ std::shared_ptr GetConverter(const std::shared_ptr& type Status ListConverter::Init(const std::shared_ptr& builder) { builder_ = builder; typed_builder_ = static_cast(builder.get()); - value_converter_ = GetConverter(builder->type()); + + value_converter_ = GetConverter(static_cast( + builder->type().get())->value_type); + if (value_converter_ == nullptr) { + return Status::NotImplemented("value type not implemented"); + } + + value_converter_->Init(typed_builder_->value_builder()); return Status::OK(); } @@ -385,7 +405,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder)); converter->Init(builder); - PY_RETURN_NOT_OK(converter->AppendData(obj, size)); + PY_RETURN_NOT_OK(converter->AppendData(obj)); *out = builder->Finish();