diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index a7f1d275634..65171a73557 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -90,12 +90,12 @@ fi if [ $TRAVIS_OS_NAME == "linux" ]; then cmake $CMAKE_COMMON_FLAGS \ $CMAKE_LINUX_FLAGS \ - -DARROW_CXXFLAGS="-Wconversion -Wno-sign-conversion -Werror" \ + -DBUILD_WARNING_LEVEL=CHECKIN \ $ARROW_CPP_DIR else cmake $CMAKE_COMMON_FLAGS \ $CMAKE_OSX_FLAGS \ - -DARROW_CXXFLAGS=-Werror \ + -DBUILD_WARNING_LEVEL=CHECKIN \ $ARROW_CPP_DIR fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 972132f29e9..0930ae4ced9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -56,6 +56,14 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND) set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() +find_package(InferTools) +if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR INFER_FOUND) + # Generate a Clang compile_commands.json "compilation database" file for use + # with various development tools, such as Vim's YouCompleteMe plugin. + # See http://clang.llvm.org/docs/JSONCompilationDatabase.html + set(CMAKE_EXPORT_COMPILE_COMMANDS 1) +endif() + find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_FOUND}) @@ -223,12 +231,13 @@ else() set(ARROW_BOOST_HEADER_ONLY 1) endif() -include(BuildUtils) - ############################################################ # Compiler flags ############################################################ +# Determine compiler version +include(CompilerInfo) + if (ARROW_NO_DEPRECATED_API) add_definitions(-DARROW_NO_DEPRECATED_API) endif() @@ -245,6 +254,9 @@ include(SetupCxxFlags) add_custom_target(arrow_dependencies) +include(BuildUtils) +enable_testing() + include(ThirdpartyToolchain) # Add common flags @@ -253,9 +265,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARROW_CXXFLAGS}") message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") -# Determine compiler version -include(CompilerInfo) - if ("${COMPILER_FAMILY}" STREQUAL "clang") # Using Clang with ccache causes a bunch of spurious warnings that are # purportedly fixed in the next version of ccache. See the following for details: @@ -354,153 +363,6 @@ if (PARQUET_BUILD_SHARED) endif() endif() -############################################################ -# Benchmarking -############################################################ -# Add a new micro benchmark, with or without an executable that should be built. -# If benchmarks are enabled then they will be run along side unit tests with ctest. -# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, -# respectively. -# -# REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component -# (e.g. monotime-benchmark) or contain additional components (e.g. -# net/net_util-benchmark). Either way, the last component must be a globally -# unique name. - -# The benchmark will registered as unit test with ctest with a label -# of 'benchmark'. -# -# Arguments after the test name will be passed to set_tests_properties(). -function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) - if(NO_BENCHMARKS) - return() - endif() - get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) - - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc) - # This benchmark has a corresponding .cc file, set it up as an executable. 
- set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") - add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc") - target_link_libraries(${BENCHMARK_NAME} ${ARROW_BENCHMARK_LINK_LIBS}) - add_dependencies(runbenchmark ${BENCHMARK_NAME}) - set(NO_COLOR "--color_print=false") - else() - # No executable, just invoke the benchmark (probably a script) directly. - set(BENCHMARK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}) - set(NO_COLOR "") - endif() - - add_test(${BENCHMARK_NAME} - ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) - set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") - if(ARGN) - set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN}) - endif() -endfunction() - -# A wrapper for add_dependencies() that is compatible with NO_BENCHMARKS. -function(ADD_ARROW_BENCHMARK_DEPENDENCIES REL_BENCHMARK_NAME) - if(NO_BENCHMARKS) - return() - endif() - get_filename_component(BENCMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) - - add_dependencies(${BENCHMARK_NAME} ${ARGN}) -endfunction() - -# A wrapper for target_link_libraries() that is compatible with NO_BENCHMARKS. -function(ARROW_BENCHMARK_LINK_LIBRARIES REL_BENCHMARK_NAME) - if(NO_BENCHMARKS) - return() - endif() - get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) - - target_link_libraries(${BENCHMARK_NAME} ${ARGN}) -endfunction() - - -############################################################ -# Testing -############################################################ -# Add a new test case, with or without an executable that should be built. -# -# REL_TEST_NAME is the name of the test. It may be a single component -# (e.g. monotime-test) or contain additional components (e.g. -# net/net_util-test). Either way, the last component must be a globally -# unique name. -# -# The unit test is added with a label of "unittest" to support filtering with -# ctest. -# -# Arguments after the test name will be passed to set_tests_properties(). -function(ADD_ARROW_TEST REL_TEST_NAME) - set(options) - set(single_value_args) - set(multi_value_args STATIC_LINK_LIBS) - cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) - if(ARG_UNPARSED_ARGUMENTS) - message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") - endif() - - if(NO_TESTS OR NOT ARROW_BUILD_STATIC) - return() - endif() - get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) - - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}.cc) - # This test has a corresponding .cc file, set it up as an executable. - set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") - add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc") - - if (ARG_STATIC_LINK_LIBS) - # Customize link libraries - target_link_libraries(${TEST_NAME} ${ARG_STATIC_LINK_LIBS}) - else() - target_link_libraries(${TEST_NAME} ${ARROW_TEST_LINK_LIBS}) - endif() - add_dependencies(unittest ${TEST_NAME}) - else() - # No executable, just invoke the test (probably a script) directly. 
- set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}) - endif() - - if (ARROW_TEST_MEMCHECK) - SET_PROPERTY(TARGET ${TEST_NAME} - APPEND_STRING PROPERTY - COMPILE_FLAGS " -DARROW_VALGRIND") - add_test(${TEST_NAME} - bash -c "cd ${EXECUTABLE_OUTPUT_PATH}; valgrind --tool=memcheck --leak-check=full --leak-check-heuristics=stdstring --error-exitcode=1 ${TEST_PATH}") - elseif(MSVC) - add_test(${TEST_NAME} ${TEST_PATH}) - else() - add_test(${TEST_NAME} - ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) - endif() - set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest") -endfunction() - -# A wrapper for add_dependencies() that is compatible with NO_TESTS. -function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME) - if(NO_TESTS) - return() - endif() - get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) - - add_dependencies(${TEST_NAME} ${ARGN}) -endfunction() - -# A wrapper for target_link_libraries() that is compatible with NO_TESTS. -function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME) - if(NO_TESTS) - return() - endif() - get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) - - target_link_libraries(${TEST_NAME} ${ARGN}) -endfunction() - -enable_testing() - ############################################################ # "make ctags" target ############################################################ @@ -596,6 +458,19 @@ if (${CLANG_TIDY_FOUND}) endif() +############################################################ +# "make infer" target +############################################################ + +if (${INFER_FOUND}) + # runs infer capture + add_custom_target(infer ${BUILD_SUPPORT_DIR}/run-infer.sh ${INFER_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1) + # runs infer analyze + add_custom_target(infer-analyze ${BUILD_SUPPORT_DIR}/run-infer.sh ${INFER_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 2) + # runs infer report + add_custom_target(infer-report ${BUILD_SUPPORT_DIR}/run-infer.sh ${INFER_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 3) +endif() + ############################################################ # "make iwyu" target ############################################################ @@ -733,121 +608,6 @@ if(NOT WIN32 AND ARROW_PLASMA) endif() add_subdirectory(src/arrow) -add_subdirectory(src/arrow/io) - -set(ARROW_SRCS - src/arrow/array.cc - src/arrow/buffer.cc - src/arrow/builder.cc - src/arrow/compare.cc - src/arrow/memory_pool.cc - src/arrow/pretty_print.cc - src/arrow/status.cc - src/arrow/table.cc - src/arrow/tensor.cc - src/arrow/type.cc - src/arrow/visitor.cc - - src/arrow/compute/cast.cc - src/arrow/compute/context.cc - - src/arrow/io/file.cc - src/arrow/io/interfaces.cc - src/arrow/io/memory.cc - - src/arrow/util/bit-util.cc - src/arrow/util/compression.cc - src/arrow/util/cpu-info.cc - src/arrow/util/decimal.cc - src/arrow/util/key_value_metadata.cc -) - -if (ARROW_COMPUTE) - add_subdirectory(src/arrow/compute) - set(ARROW_SRCS ${ARROW_SRCS} - src/arrow/compute/cast.cc - src/arrow/compute/context.cc - ) -endif() - -if (ARROW_GPU) - # IPC extensions required to build the GPU library - set(ARROW_IPC ON) - add_subdirectory(src/arrow/gpu) -endif() - -if (ARROW_IPC) - add_subdirectory(src/arrow/ipc) - add_dependencies(arrow_dependencies metadata_fbs) -endif() - -if (ARROW_WITH_BROTLI) - add_definitions(-DARROW_WITH_BROTLI) - SET(ARROW_SRCS src/arrow/util/compression_brotli.cc ${ARROW_SRCS}) -endif() - -if (ARROW_WITH_LZ4) - add_definitions(-DARROW_WITH_LZ4) - SET(ARROW_SRCS src/arrow/util/compression_lz4.cc ${ARROW_SRCS}) -endif() 
- -if (ARROW_WITH_SNAPPY) - add_definitions(-DARROW_WITH_SNAPPY) - SET(ARROW_SRCS src/arrow/util/compression_snappy.cc ${ARROW_SRCS}) -endif() - -if (ARROW_WITH_ZLIB) - add_definitions(-DARROW_WITH_ZLIB) - SET(ARROW_SRCS src/arrow/util/compression_zlib.cc ${ARROW_SRCS}) -endif() - -if (ARROW_WITH_ZSTD) - add_definitions(-DARROW_WITH_ZSTD) - SET(ARROW_SRCS src/arrow/util/compression_zstd.cc ${ARROW_SRCS}) -endif() - -if (NOT ARROW_BOOST_HEADER_ONLY) - set(ARROW_SRCS ${ARROW_SRCS} - src/arrow/io/hdfs.cc - src/arrow/io/hdfs-internal.cc - ) -endif() - -if (ARROW_IPC) - set(ARROW_SRCS ${ARROW_SRCS} - src/arrow/ipc/dictionary.cc - src/arrow/ipc/feather.cc - src/arrow/ipc/json.cc - src/arrow/ipc/json-internal.cc - src/arrow/ipc/message.cc - src/arrow/ipc/metadata-internal.cc - src/arrow/ipc/reader.cc - src/arrow/ipc/writer.cc - ) -endif() - - -if(NOT APPLE AND NOT MSVC) - # Localize thirdparty symbols using a linker version script. This hides them - # from the client application. The OS X linker does not support the - # version-script option. - set(ARROW_SHARED_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/src/arrow/symbols.map") -endif() - -set(ARROW_ALL_SRCS - ${ARROW_SRCS}) - -ADD_ARROW_LIB(arrow - SOURCES ${ARROW_ALL_SRCS} - DEPENDENCIES arrow_dependencies - SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} - SHARED_LINK_LIBS ${ARROW_LINK_LIBS} - SHARED_PRIVATE_LINK_LIBS ${ARROW_SHARED_PRIVATE_LINK_LIBS} - STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS} - STATIC_PRIVATE_LINK_LIBS ${ARROW_STATIC_PRIVATE_LINK_LIBS} -) - -add_subdirectory(src/arrow/util) if(ARROW_PYTHON) find_package(PythonLibsNew REQUIRED) diff --git a/cpp/README.md b/cpp/README.md index 4a515079ddc..6e29e6f78f4 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -238,6 +238,13 @@ build failures by running the following checks before submitting your pull reque # before running it. make format # requires clang-format is installed +We run our CI builds with more compiler warnings enabled for the Clang +compiler. Please run CMake with + +`-DBUILD_WARNING_LEVEL=CHECKIN` + +to avoid failures due to compiler warnings. + Note that the clang-tidy target may take a while to run. You might consider running clang-tidy separately on the files you have added/changed before invoking the make target to reduce iteration time. Also, it might generate warnings diff --git a/cpp/build-support/run-infer.sh b/cpp/build-support/run-infer.sh new file mode 100755 index 00000000000..823685aef14 --- /dev/null +++ b/cpp/build-support/run-infer.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Runs infer in the given directory +# Arguments: +# $1 - Path to the infer binary +# $2 - Path to the compile_commands.json to use +# $3 - Apply infer step (1=capture, 2=analyze, 3=report) +# +INFER=$1 +shift +COMPILE_COMMANDS=$1 +shift +APPLY_STEP=$1 +shift + +if [ "$APPLY_STEP" == "1" ]; then + $INFER capture --compilation-database $COMPILE_COMMANDS + echo "" + echo "Run 'make infer-analyze' next." 
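+  # A typical full sequence, assuming the infer/infer-analyze/infer-report
+  # targets defined in cpp/CMakeLists.txt and an existing
+  # compile_commands.json in the build directory:
+  #
+  #   make infer && make infer-analyze && make infer-report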
+elif [ "$APPLY_STEP" == "2" ]; then + # infer's analyze step can take a very long time to complete + $INFER analyze + echo "" + echo "Run 'make infer-report' next." + echo "See: http://fbinfer.com/docs/steps-for-ci.html" +elif [ "$APPLY_STEP" == "3" ]; then + $INFER report --issues-csv ./infer-out/report.csv 1> /dev/null + $INFER report --issues-txt ./infer-out/report.txt 1> /dev/null + $INFER report --issues-json ./infer-out/report.json 1> /dev/null + echo "" + echo "Reports (report.txt, report.csv, report.json) can be found in the infer-out subdirectory." +else + echo "" + echo "See: http://fbinfer.com/docs/steps-for-ci.html" +fi diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 8f92d73baea..e398dc1827c 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -188,3 +188,149 @@ function(ADD_ARROW_LIB LIB_NAME) endif() endfunction() + + +############################################################ +# Benchmarking +############################################################ +# Add a new micro benchmark, with or without an executable that should be built. +# If benchmarks are enabled then they will be run along side unit tests with ctest. +# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# respectively. +# +# REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component +# (e.g. monotime-benchmark) or contain additional components (e.g. +# net/net_util-benchmark). Either way, the last component must be a globally +# unique name. + +# The benchmark will registered as unit test with ctest with a label +# of 'benchmark'. +# +# Arguments after the test name will be passed to set_tests_properties(). +function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) + if(NO_BENCHMARKS) + return() + endif() + get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc) + # This benchmark has a corresponding .cc file, set it up as an executable. + set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") + add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc") + target_link_libraries(${BENCHMARK_NAME} ${ARROW_BENCHMARK_LINK_LIBS}) + add_dependencies(runbenchmark ${BENCHMARK_NAME}) + set(NO_COLOR "--color_print=false") + else() + # No executable, just invoke the benchmark (probably a script) directly. + set(BENCHMARK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}) + set(NO_COLOR "") + endif() + + add_test(${BENCHMARK_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) + set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") + if(ARGN) + set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN}) + endif() +endfunction() + +# A wrapper for add_dependencies() that is compatible with NO_BENCHMARKS. +function(ADD_ARROW_BENCHMARK_DEPENDENCIES REL_BENCHMARK_NAME) + if(NO_BENCHMARKS) + return() + endif() + get_filename_component(BENCMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + + add_dependencies(${BENCHMARK_NAME} ${ARGN}) +endfunction() + +# A wrapper for target_link_libraries() that is compatible with NO_BENCHMARKS. 
+function(ARROW_BENCHMARK_LINK_LIBRARIES REL_BENCHMARK_NAME)
+  if(NO_BENCHMARKS)
+    return()
+  endif()
+  get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE)
+
+  target_link_libraries(${BENCHMARK_NAME} ${ARGN})
+endfunction()
+
+
+############################################################
+# Testing
+############################################################
+# Add a new test case, with or without an executable that should be built.
+#
+# REL_TEST_NAME is the name of the test. It may be a single component
+# (e.g. monotime-test) or contain additional components (e.g.
+# net/net_util-test). Either way, the last component must be a globally
+# unique name.
+#
+# The unit test is added with a label of "unittest" to support filtering with
+# ctest.
+#
+# Arguments after the test name will be passed to set_tests_properties().
+function(ADD_ARROW_TEST REL_TEST_NAME)
+  set(options)
+  set(single_value_args)
+  set(multi_value_args STATIC_LINK_LIBS)
+  cmake_parse_arguments(ARG "${options}" "${single_value_args}" "${multi_value_args}" ${ARGN})
+  if(ARG_UNPARSED_ARGUMENTS)
+    message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
+  endif()
+
+  if(NO_TESTS OR NOT ARROW_BUILD_STATIC)
+    return()
+  endif()
+  get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE)
+
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}.cc)
+    # This test has a corresponding .cc file, set it up as an executable.
+    set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}")
+    add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc")
+
+    if (ARG_STATIC_LINK_LIBS)
+      # Customize link libraries
+      target_link_libraries(${TEST_NAME} ${ARG_STATIC_LINK_LIBS})
+    else()
+      target_link_libraries(${TEST_NAME} ${ARROW_TEST_LINK_LIBS})
+    endif()
+    add_dependencies(unittest ${TEST_NAME})
+  else()
+    # No executable, just invoke the test (probably a script) directly.
+    set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME})
+  endif()
+
+  if (ARROW_TEST_MEMCHECK)
+    SET_PROPERTY(TARGET ${TEST_NAME}
+      APPEND_STRING PROPERTY
+      COMPILE_FLAGS " -DARROW_VALGRIND")
+    add_test(${TEST_NAME}
+      bash -c "cd ${EXECUTABLE_OUTPUT_PATH}; valgrind --tool=memcheck --leak-check=full --leak-check-heuristics=stdstring --error-exitcode=1 ${TEST_PATH}")
+  elseif(MSVC)
+    add_test(${TEST_NAME} ${TEST_PATH})
+  else()
+    add_test(${TEST_NAME}
+      ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH})
+  endif()
+  set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest")
+endfunction()
+
+# A wrapper for add_dependencies() that is compatible with NO_TESTS.
+function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME)
+  if(NO_TESTS)
+    return()
+  endif()
+  get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE)
+
+  add_dependencies(${TEST_NAME} ${ARGN})
+endfunction()
+
+# A wrapper for target_link_libraries() that is compatible with NO_TESTS.
+function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME)
+  if(NO_TESTS)
+    return()
+  endif()
+  get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE)
+
+  target_link_libraries(${TEST_NAME} ${ARGN})
+endfunction()
diff --git a/cpp/cmake_modules/CompilerInfo.cmake b/cpp/cmake_modules/CompilerInfo.cmake
index a1b470182a8..5ff1d861415 100644
--- a/cpp/cmake_modules/CompilerInfo.cmake
+++ b/cpp/cmake_modules/CompilerInfo.cmake
@@ -63,6 +63,11 @@ elseif("${COMPILER_VERSION_FULL}" MATCHES ".*clang-7")
   set(COMPILER_FAMILY "clang")
   set(COMPILER_VERSION "3.7.0svn")
+# clang on Mac OS X, XCode 8.3 (clang-802).
+elseif("${COMPILER_VERSION_FULL}" MATCHES ".*clang-802") + set(COMPILER_FAMILY "clang") + set(COMPILER_VERSION "3.9.0svn") + # clang on Mac OS X, XCode 8. elseif("${COMPILER_VERSION_FULL}" MATCHES ".*clang-8") set(COMPILER_FAMILY "clang") diff --git a/cpp/cmake_modules/FindInferTools.cmake b/cpp/cmake_modules/FindInferTools.cmake new file mode 100644 index 00000000000..00c6709c677 --- /dev/null +++ b/cpp/cmake_modules/FindInferTools.cmake @@ -0,0 +1,45 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Tries to find the infer module +# +# Usage of this module as follows: +# +# find_package(InferTools) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# InferTools_PATH - +# When set, this path is inspected instead of standard library binary locations +# to find infer +# +# This module defines +# INFER_BIN, The path to the clang tidy binary +# INFER_FOUND, Whether clang tidy was found + +find_program(INFER_BIN + NAMES infer + PATHS ${InferTools_PATH} $ENV{INFER_TOOLS_PATH} /usr/local/bin /usr/bin + /usr/local/homebrew/bin + /opt/local/bin + NO_DEFAULT_PATH +) + +if ( "${INFER_BIN}" STREQUAL "INFER_BIN-NOTFOUND" ) + set(INFER_FOUND 0) + message("infer not found") +else() + set(INFER_FOUND 1) + message("infer found at ${INFER_BIN}") +endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 6e92c4b1c1b..77bfac83555 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -48,7 +48,110 @@ if (MSVC) set(CXX_COMMON_FLAGS "/W3") endif() else() - set(CXX_COMMON_FLAGS "-Wall -std=c++11") + # Common flags set below with warning level + set(CXX_COMMON_FLAGS "") +endif() + +# Build warning level (CHECKIN, EVERYTHING, etc.) 
+
+# if no build warning level is specified, default to the production warning level
+if (NOT BUILD_WARNING_LEVEL)
+  set(BUILD_WARNING_LEVEL Production)
+endif(NOT BUILD_WARNING_LEVEL)
+
+string(TOUPPER ${BUILD_WARNING_LEVEL} UPPERCASE_BUILD_WARNING_LEVEL)
+
+if ("${UPPERCASE_BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN")
+  # Pre-checkin builds
+  if ("${COMPILER_FAMILY}" STREQUAL "msvc")
+    string(REPLACE "/W3" "" CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS}")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /W3")
+    # Treat all compiler warnings as errors
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /WX")
+  elseif ("${COMPILER_FAMILY}" STREQUAL "clang")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Weverything -Wno-c++98-compat \
+-Wno-c++98-compat-pedantic -Wno-deprecated -Wno-weak-vtables -Wno-padded \
+-Wno-unused-parameter -Wno-undef \
+-Wno-shadow -Wno-switch-enum -Wno-exit-time-destructors \
+-Wno-global-constructors -Wno-weak-template-vtables -Wno-undefined-reinterpret-cast \
+-Wno-implicit-fallthrough -Wno-unreachable-code-return \
+-Wno-float-equal -Wno-missing-prototypes \
+-Wno-old-style-cast -Wno-covered-switch-default \
+-Wno-cast-align -Wno-vla-extension -Wno-shift-sign-overflow \
+-Wno-used-but-marked-unused -Wno-missing-variable-declarations \
+-Wno-gnu-zero-variadic-macro-arguments -Wconversion -Wno-sign-conversion \
+-Wno-disabled-macro-expansion -Wno-shorten-64-to-32")
+
+    # Version numbers where warnings are introduced
+    if ("${COMPILER_VERSION}" VERSION_GREATER "3.3")
+      set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-gnu-folding-constant")
+    endif()
+    if ("${COMPILER_VERSION}" VERSION_GREATER "3.6")
+      set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-reserved-id-macro")
+      set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-range-loop-analysis")
+    endif()
+    if ("${COMPILER_VERSION}" VERSION_GREATER "3.7")
+      set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-double-promotion")
+    endif()
+    if ("${COMPILER_VERSION}" VERSION_GREATER "3.8")
+      set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-undefined-func-template")
+    endif()
+
+    # Treat all compiler warnings as errors
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unknown-warning-option -Werror")
+  elseif ("${COMPILER_FAMILY}" STREQUAL "gcc")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall -Wconversion -Wno-sign-conversion")
+    # Treat all compiler warnings as errors
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unknown-warning-option -Werror")
+  else()
+    message(FATAL_ERROR "Unknown compiler.
Version info:\n${COMPILER_VERSION_FULL}")
+  endif()
+elseif ("${UPPERCASE_BUILD_WARNING_LEVEL}" STREQUAL "EVERYTHING")
+  # Pedantic builds for fixing warnings
+  if ("${COMPILER_FAMILY}" STREQUAL "msvc")
+    string(REPLACE "/W3" "" CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS}")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall")
+    # https://docs.microsoft.com/en-us/cpp/build/reference/compiler-option-warning-level
+    # /wdnnnn disables a warning where "nnnn" is a warning number
+    # Treat all compiler warnings as errors
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /WX")
+  elseif ("${COMPILER_FAMILY}" STREQUAL "clang")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic")
+    # Treat all compiler warnings as errors
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Werror")
+  elseif ("${COMPILER_FAMILY}" STREQUAL "gcc")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall -Wpedantic -Wextra -Wno-unused-parameter")
+    # Treat all compiler warnings as errors
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Werror")
+  else()
+    message(FATAL_ERROR "Unknown compiler. Version info:\n${COMPILER_VERSION_FULL}")
+  endif()
+else()
+  # Production builds (warnings are not treated as errors)
+  if ("${COMPILER_FAMILY}" STREQUAL "msvc")
+    # https://docs.microsoft.com/en-us/cpp/build/reference/compiler-option-warning-level
+    # TODO: Enable /Wall and disable individual warnings until build compiles without errors
+    # /wdnnnn disables a warning where "nnnn" is a warning number
+    string(REPLACE "/W3" "" CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS}")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /W3")
+  elseif ("${COMPILER_FAMILY}" STREQUAL "clang")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall")
+  elseif ("${COMPILER_FAMILY}" STREQUAL "gcc")
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall")
+  else()
+    message(FATAL_ERROR "Unknown compiler. Version info:\n${COMPILER_VERSION_FULL}")
+  endif()
+endif()
+
+# if build warning flags are set, add them to CXX_COMMON_FLAGS
+if (BUILD_WARNING_FLAGS)
+  # Use BUILD_WARNING_FLAGS with BUILD_WARNING_LEVEL=everything to disable
+  # warnings (use with Clang's -Weverything flag to find potential errors)
+  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${BUILD_WARNING_FLAGS}")
+endif(BUILD_WARNING_FLAGS)
+
+if (NOT ("${COMPILER_FAMILY}" STREQUAL "msvc"))
+  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -std=c++11")
 endif()
 
 # Only enable additional instruction sets if they are supported
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 194156cc656..98186d01415 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -126,6 +126,7 @@ endif()
 set(Boost_DEBUG TRUE)
 set(Boost_USE_MULTITHREADED ON)
 set(Boost_ADDITIONAL_VERSIONS
+  "1.65.0" "1.65"
   "1.64.0" "1.64"
   "1.63.0" "1.63"
   "1.62.0" "1.61"
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 6963b11e3ff..5c903314181 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -15,6 +15,121 @@
 # specific language governing permissions and limitations
 # under the License.
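+
+# Core Arrow library sources. These were previously listed in the top-level
+# cpp/CMakeLists.txt; paths below are now relative to src/arrow/.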
+set(ARROW_SRCS + array.cc + buffer.cc + builder.cc + compare.cc + memory_pool.cc + pretty_print.cc + status.cc + table.cc + tensor.cc + type.cc + visitor.cc + + io/file.cc + io/interfaces.cc + io/memory.cc + + util/bit-util.cc + util/compression.cc + util/cpu-info.cc + util/decimal.cc + util/key_value_metadata.cc +) + +if ("${COMPILER_FAMILY}" STREQUAL "clang") + set_property(SOURCE io/file.cc + APPEND_STRING + PROPERTY COMPILE_FLAGS + " -Wno-unused-macros ") +endif() + +if (ARROW_COMPUTE) + add_subdirectory(compute) + set(ARROW_SRCS ${ARROW_SRCS} + compute/cast.cc + compute/context.cc + ) +endif() + +if (ARROW_GPU) + # IPC extensions required to build the GPU library + set(ARROW_IPC ON) + add_subdirectory(gpu) +endif() + +if (ARROW_IPC) + add_subdirectory(ipc) + add_dependencies(arrow_dependencies metadata_fbs) +endif() + +if (ARROW_WITH_BROTLI) + add_definitions(-DARROW_WITH_BROTLI) + SET(ARROW_SRCS util/compression_brotli.cc ${ARROW_SRCS}) +endif() + +if (ARROW_WITH_LZ4) + add_definitions(-DARROW_WITH_LZ4) + SET(ARROW_SRCS util/compression_lz4.cc ${ARROW_SRCS}) +endif() + +if (ARROW_WITH_SNAPPY) + add_definitions(-DARROW_WITH_SNAPPY) + SET(ARROW_SRCS util/compression_snappy.cc ${ARROW_SRCS}) +endif() + +if (ARROW_WITH_ZLIB) + add_definitions(-DARROW_WITH_ZLIB) + SET(ARROW_SRCS util/compression_zlib.cc ${ARROW_SRCS}) +endif() + +if (ARROW_WITH_ZSTD) + add_definitions(-DARROW_WITH_ZSTD) + SET(ARROW_SRCS util/compression_zstd.cc ${ARROW_SRCS}) +endif() + +if (NOT ARROW_BOOST_HEADER_ONLY) + set(ARROW_SRCS ${ARROW_SRCS} + io/hdfs.cc + io/hdfs-internal.cc + ) +endif() + +if (ARROW_IPC) + set(ARROW_SRCS ${ARROW_SRCS} + ipc/dictionary.cc + ipc/feather.cc + ipc/json.cc + ipc/json-internal.cc + ipc/message.cc + ipc/metadata-internal.cc + ipc/reader.cc + ipc/writer.cc + ) +endif() + +if(NOT APPLE AND NOT MSVC) + # Localize thirdparty symbols using a linker version script. This hides them + # from the client application. The OS X linker does not support the + # version-script option. 
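+  # (Illustrative check: after linking with the version script, `nm -D
+  # libarrow.so` should list only exported Arrow symbols; thirdparty
+  # symbols become local.)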
+ set(ARROW_SHARED_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map") +endif() + +set(ARROW_ALL_SRCS + ${ARROW_SRCS}) + +ADD_ARROW_LIB(arrow + SOURCES ${ARROW_ALL_SRCS} + DEPENDENCIES arrow_dependencies + SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} + SHARED_LINK_LIBS ${ARROW_LINK_LIBS} + SHARED_PRIVATE_LINK_LIBS ${ARROW_SHARED_PRIVATE_LINK_LIBS} + STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS} + STATIC_PRIVATE_LINK_LIBS ${ARROW_STATIC_PRIVATE_LINK_LIBS} +) + # Headers: top level install(FILES allocator.h @@ -60,3 +175,6 @@ ADD_ARROW_TEST(tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) + +add_subdirectory(io) +add_subdirectory(util) diff --git a/cpp/src/arrow/allocator-test.cc b/cpp/src/arrow/allocator-test.cc index e02741ec6aa..88f4f5e8a54 100644 --- a/cpp/src/arrow/allocator-test.cc +++ b/cpp/src/arrow/allocator-test.cc @@ -59,17 +59,16 @@ TEST(stl_allocator, FreeLargeMemory) { } TEST(stl_allocator, MaxMemory) { - DefaultMemoryPool pool; + auto pool = default_memory_pool(); - ASSERT_EQ(0, pool.max_memory()); - stl_allocator alloc(&pool); - uint8_t* data = alloc.allocate(100); - uint8_t* data2 = alloc.allocate(100); + stl_allocator alloc(pool); + uint8_t* data = alloc.allocate(1000); + uint8_t* data2 = alloc.allocate(1000); - alloc.deallocate(data, 100); - alloc.deallocate(data2, 100); + alloc.deallocate(data, 1000); + alloc.deallocate(data2, 1000); - ASSERT_EQ(200, pool.max_memory()); + ASSERT_EQ(2000, pool->max_memory()); } #endif // ARROW_VALGRIND diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 97310830ddc..4ecf0f92b2a 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -283,7 +283,7 @@ class TestPrimitiveBuilder : public TestBuilder { #define PINT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ - PTYPE_DECL(CapType, c_type); \ + PTYPE_DECL(CapType, c_type) \ static void draw(int64_t N, vector* draws) { \ test::randint(N, LOWER, UPPER, draws); \ } \ @@ -291,7 +291,7 @@ class TestPrimitiveBuilder : public TestBuilder { #define PFLOAT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ - PTYPE_DECL(CapType, c_type); \ + PTYPE_DECL(CapType, c_type) \ static void draw(int64_t N, vector* draws) { \ test::random_real(N, 0, LOWER, UPPER, draws); \ } \ @@ -311,7 +311,7 @@ PFLOAT_DECL(Float, float, -1000, 1000); PFLOAT_DECL(Double, double, -1000, 1000); struct PBoolean { - PTYPE_DECL(Boolean, uint8_t); + PTYPE_DECL(Boolean, uint8_t) }; template <> @@ -378,8 +378,6 @@ TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); #define DECL_TYPE() typedef typename TestFixture::Type Type; -#define DECL_ARRAYTYPE() typedef typename TestFixture::ArrayType ArrayType; - TYPED_TEST(TestPrimitiveBuilder, TestInit) { DECL_TYPE(); @@ -1200,7 +1198,7 @@ class TestFWBinaryArray : public ::testing::Test { }; TEST_F(TestFWBinaryArray, Builder) { - const int32_t byte_width = 10; + constexpr int32_t byte_width = 10; int64_t length = 4096; int64_t nbytes = length * byte_width; @@ -1215,8 +1213,7 @@ TEST_F(TestFWBinaryArray, Builder) { std::shared_ptr result; - auto CheckResult = [this, &length, &is_valid, &raw_data, - &byte_width](const Array& result) { + auto CheckResult = [&length, &is_valid, &raw_data, byte_width](const Array& result) { // Verify output const auto& fw_result = static_cast(result); @@ -1847,6 +1844,93 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array)); } +TEST(TestDecimalDictionaryBuilder, Basic) 
{
+  // Build the dictionary Array
+  const auto& decimal_type = arrow::decimal(2, 0);
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+  // Test data
+  std::vector<Decimal128> test{12, 12, 11, 12};
+  for (const auto& value : test) {
+    ASSERT_OK(builder.Append(value.ToBytes().data()));
+  }
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  FixedSizeBinaryBuilder decimal_builder(decimal_type);
+  ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes()));
+  ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes()));
+
+  std::shared_ptr<Array> decimal_array;
+  ASSERT_OK(decimal_builder.Finish(&decimal_array));
+  auto dtype = arrow::dictionary(int8(), decimal_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append({0, 0, 1, 0}));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
+  const auto& decimal_type = arrow::decimal(21, 0);
+
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(decimal_type);
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    const uint8_t bytes[] = {0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             12,
+                             12,
+                             static_cast<uint8_t>(i / 128),
+                             static_cast<uint8_t>(i % 128)};
+    ASSERT_OK(builder.Append(bytes));
+    ASSERT_OK(fsb_builder.Append(bytes));
+    ASSERT_OK(int_builder.Append(static_cast<int16_t>(i)));
+  }
+  // Fill with an already existing value
+  const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append(known_value));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+  auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
 // ----------------------------------------------------------------------
 // List tests
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 80188a14a95..12922ae7b0a 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -194,7 +194,8 @@ ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
   SetData(internal_data);
 }
 
-Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+Status ListArray::FromArrays(const Array& offsets, const Array& values,
+                             MemoryPool* ARROW_ARG_UNUSED(pool),
                              std::shared_ptr<Array>* out) {
   if (ARROW_PREDICT_FALSE(offsets.length() == 0)) {
     return Status::Invalid("List offsets must have non-zero length");
@@ -223,6 +224,8 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
 
 void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
   this->Array::SetData(data);
+  DCHECK_EQ(data->buffers.size(), 2);
+
   auto value_offsets = data->buffers[1];
   raw_value_offsets_ = value_offsets == nullptr ?
nullptr
@@ -239,15 +242,13 @@ std::shared_ptr<Array> ListArray::values() const { return values_; }
 
 // ----------------------------------------------------------------------
 // String and binary
 
-static std::shared_ptr<DataType> kBinary = std::make_shared<BinaryType>();
-static std::shared_ptr<DataType> kString = std::make_shared<StringType>();
-
 BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
   DCHECK_EQ(data->type->id(), Type::BINARY);
   SetData(data);
 }
 
 void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
+  DCHECK_EQ(data->buffers.size(), 3);
   auto value_offsets = data->buffers[1];
   auto value_data = data->buffers[2];
   this->Array::SetData(data);
@@ -261,8 +262,8 @@ BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_of
                          const std::shared_ptr<Buffer>& data,
                          const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
                          int64_t offset)
-    : BinaryArray(kBinary, length, value_offsets, data, null_bitmap, null_count, offset) {
-}
+    : BinaryArray(binary(), length, value_offsets, data, null_bitmap, null_count,
+                  offset) {}
 
 BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
                          const std::shared_ptr<Buffer>& value_offsets,
@@ -283,8 +284,7 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of
                          const std::shared_ptr<Buffer>& data,
                          const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
                          int64_t offset)
-    : BinaryArray(kString, length, value_offsets, data, null_bitmap, null_count, offset) {
-}
+    : BinaryArray(utf8(), length, value_offsets, data, null_bitmap, null_count, offset) {}
 
 // ----------------------------------------------------------------------
 // Fixed width binary
@@ -345,6 +345,7 @@ std::shared_ptr<Array> StructArray::field(int i) const {
   if (!boxed_fields_[i]) {
     DCHECK(MakeArray(data_->child_data[i], &boxed_fields_[i]).ok());
   }
+  DCHECK(boxed_fields_[i]);
   return boxed_fields_[i];
 }
 
@@ -354,6 +355,8 @@ std::shared_ptr<Array> StructArray::field(int i) const {
 
 void UnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
   this->Array::SetData(data);
+  DCHECK_EQ(data->buffers.size(), 3);
+
   auto type_ids = data_->buffers[1];
   auto value_offsets = data_->buffers[2];
   raw_type_ids_ =
@@ -388,6 +391,7 @@ std::shared_ptr<Array> UnionArray::child(int i) const {
   if (!boxed_fields_[i]) {
     DCHECK(MakeArray(data_->child_data[i], &boxed_fields_[i]).ok());
   }
+  DCHECK(boxed_fields_[i]);
   return boxed_fields_[i];
 }
 
@@ -437,13 +441,13 @@ Status Array::Accept(ArrayVisitor* visitor) const {
 namespace internal {
 
 struct ValidateVisitor {
-  Status Visit(const NullArray& array) { return Status::OK(); }
+  Status Visit(const NullArray&) { return Status::OK(); }
 
-  Status Visit(const PrimitiveArray& array) { return Status::OK(); }
+  Status Visit(const PrimitiveArray&) { return Status::OK(); }
 
-  Status Visit(const DecimalArray& array) { return Status::OK(); }
+  Status Visit(const DecimalArray&) { return Status::OK(); }
 
-  Status Visit(const BinaryArray& array) {
+  Status Visit(const BinaryArray&) {
     // TODO(wesm): what to do here?
return Status::OK(); } @@ -585,7 +589,7 @@ class ArrayDataWrapper { : data_(data), out_(out) {} template - Status Visit(const T& type) { + Status Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; *out_ = std::make_shared(data_); return Status::OK(); @@ -597,10 +601,11 @@ class ArrayDataWrapper { } // namespace internal -// Remove enclosing namespace after 0.7.0 Status MakeArray(const std::shared_ptr& data, std::shared_ptr* out) { internal::ArrayDataWrapper wrapper_visitor(data, out); - return VisitTypeInline(*data->type, &wrapper_visitor); + RETURN_NOT_OK(VisitTypeInline(*data->type, &wrapper_visitor)); + DCHECK(out); + return Status::OK(); } #ifndef ARROW_NO_DEPRECATED_API diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index e801b3586df..4ad60eb77f4 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -84,7 +84,7 @@ struct Decimal; /// input array and replace them with newly-allocated data, changing the output /// data type as well. struct ARROW_EXPORT ArrayData { - ArrayData() {} + ArrayData() : length(0) {} ArrayData(const std::shared_ptr& type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0) @@ -674,7 +674,7 @@ ARROW_EXPORT Status ValidateArray(const Array& array); #ifndef ARROW_NO_DEPRECATED_API -// \deprecated Since 0.7.0 +// \note Deprecated since 0.7.0 /// Create new arrays for logical types that are backed by primitive arrays. ARROW_EXPORT diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc index 334ad7bf714..5fd2706f046 100644 --- a/cpp/src/arrow/buffer-test.cc +++ b/cpp/src/arrow/buffer-test.cc @@ -32,9 +32,7 @@ using std::string; namespace arrow { -class TestBuffer : public ::testing::Test {}; - -TEST_F(TestBuffer, IsMutableFlag) { +TEST(TestBuffer, IsMutableFlag) { Buffer buf(nullptr, 0); ASSERT_FALSE(buf.is_mutable()); @@ -46,7 +44,15 @@ TEST_F(TestBuffer, IsMutableFlag) { ASSERT_TRUE(pbuf.is_mutable()); } -TEST_F(TestBuffer, Resize) { +TEST(TestBuffer, FromStdString) { + std::string val = "hello, world"; + + Buffer buf(val); + ASSERT_EQ(0, memcmp(buf.data(), val.c_str(), val.size())); + ASSERT_EQ(static_cast(val.size()), buf.size()); +} + +TEST(TestBuffer, Resize) { PoolBuffer buf; ASSERT_EQ(0, buf.size()); @@ -69,7 +75,7 @@ TEST_F(TestBuffer, Resize) { ASSERT_EQ(128, buf.capacity()); } -TEST_F(TestBuffer, TypedResize) { +TEST(TestBuffer, TypedResize) { PoolBuffer buf; ASSERT_EQ(0, buf.size()); @@ -88,7 +94,7 @@ TEST_F(TestBuffer, TypedResize) { ASSERT_EQ(832, buf.capacity()); } -TEST_F(TestBuffer, ResizeOOM) { +TEST(TestBuffer, ResizeOOM) { // This test doesn't play nice with AddressSanitizer #ifndef ADDRESS_SANITIZER // realloc fails, even though there may be no explicit limit @@ -99,7 +105,7 @@ TEST_F(TestBuffer, ResizeOOM) { #endif } -TEST_F(TestBuffer, EqualsWithSameContent) { +TEST(TestBuffer, EqualsWithSameContent) { MemoryPool* pool = default_memory_pool(); const int32_t bufferSize = 128 * 1024; uint8_t* rawBuffer1; @@ -123,7 +129,7 @@ TEST_F(TestBuffer, EqualsWithSameContent) { pool->Free(rawBuffer3, bufferSize); } -TEST_F(TestBuffer, EqualsWithSameBuffer) { +TEST(TestBuffer, EqualsWithSameBuffer) { MemoryPool* pool = default_memory_pool(); const int32_t bufferSize = 128 * 1024; uint8_t* rawBuffer; @@ -142,7 +148,7 @@ TEST_F(TestBuffer, EqualsWithSameBuffer) { pool->Free(rawBuffer, bufferSize); } -TEST_F(TestBuffer, Copy) { +TEST(TestBuffer, Copy) { std::string data_str = "some data to copy"; auto data = reinterpret_cast(data_str.c_str()); @@ -157,7 +163,7 @@ 
TEST_F(TestBuffer, Copy) { ASSERT_TRUE(out->Equals(expected)); } -TEST_F(TestBuffer, SliceBuffer) { +TEST(TestBuffer, SliceBuffer) { std::string data_str = "some data to slice"; auto data = reinterpret_cast(data_str.c_str()); @@ -171,7 +177,7 @@ TEST_F(TestBuffer, SliceBuffer) { ASSERT_EQ(2, buf.use_count()); } -TEST_F(TestBuffer, SliceMutableBuffer) { +TEST(TestBuffer, SliceMutableBuffer) { std::string data_str = "some data to slice"; auto data = reinterpret_cast(data_str.c_str()); diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index cf533eb3be4..e308ed26049 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -132,6 +132,8 @@ Status AllocateBuffer(MemoryPool* pool, const int64_t size, return Status::OK(); } +#ifndef ARROW_NO_DEPRECATED_API + Status AllocateBuffer(MemoryPool* pool, const int64_t size, std::shared_ptr* out) { std::shared_ptr buffer; @@ -140,6 +142,8 @@ Status AllocateBuffer(MemoryPool* pool, const int64_t size, return Status::OK(); } +#endif + Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size, std::shared_ptr* out) { auto buffer = std::make_shared(pool); diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index d2152677860..5f61ade956b 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -47,9 +47,25 @@ class MemoryPool; /// The following invariant is always true: Size < Capacity class ARROW_EXPORT Buffer { public: + /// \brief Construct from buffer and size without copying memory + /// + /// \param[in] data a memory buffer + /// \param[in] size buffer size + /// + /// \note The passed memory must be kept alive through some other means Buffer(const uint8_t* data, int64_t size) : is_mutable_(false), data_(data), size_(size), capacity_(size) {} + /// \brief Construct from std::string without copying memory + /// + /// \param[in] data a std::string object + /// + /// \note The std::string must stay alive for the lifetime of the Buffer, so + /// temporary rvalue strings must be stored in an lvalue somewhere + explicit Buffer(const std::string& data) + : Buffer(reinterpret_cast(data.c_str()), + static_cast(data.size())) {} + virtual ~Buffer() = default; /// An offset into data that is owned by another buffer, but we want to be @@ -69,6 +85,8 @@ class ARROW_EXPORT Buffer { /// Return true if both buffers are the same size and contain the same bytes /// up to the number of compared bytes bool Equals(const Buffer& other, int64_t nbytes) const; + + /// Return true if both buffers are the same size and contain the same bytes bool Equals(const Buffer& other) const; /// Copy a section of the buffer into a new Buffer. @@ -101,17 +119,6 @@ class ARROW_EXPORT Buffer { ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); }; -/// \brief Create Buffer referencing std::string memory -/// -/// Warning: string instance must stay alive -/// -/// \param str std::string instance -/// \return std::shared_ptr -static inline std::shared_ptr GetBufferFromString(const std::string& str) { - return std::make_shared(reinterpret_cast(str.c_str()), - static_cast(str.size())); -} - /// Construct a view on passed buffer at the indicated offset and length. 
This /// function cannot fail and does not error checking (except in debug builds) static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, @@ -194,7 +201,7 @@ class ARROW_EXPORT BufferBuilder { if (elements == 0) { return Status::OK(); } - if (capacity_ == 0) { + if (buffer_ == nullptr) { buffer_ = std::make_shared(pool_); } int64_t old_capacity = capacity_; @@ -331,11 +338,24 @@ Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size, std::shared_ptr* out); #ifndef ARROW_NO_DEPRECATED_API + /// \deprecated Since 0.7.0 ARROW_EXPORT Status AllocateBuffer(MemoryPool* pool, const int64_t size, std::shared_ptr* out); -#endif + +/// \brief Create Buffer referencing std::string memory +/// \deprecated Since 0.8.0 +/// +/// Warning: string instance must stay alive +/// +/// \param str std::string instance +/// \return std::shared_ptr +static inline std::shared_ptr GetBufferFromString(const std::string& str) { + return std::make_shared(str); +} + +#endif // ARROW_NO_DEPRECATED_API } // namespace arrow diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index a194ab74572..0479dc55592 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -831,11 +831,11 @@ DictionaryBuilder::DictionaryBuilder( hash_table_(new PoolBuffer(pool)), hash_slots_(nullptr), dict_builder_(type, pool), - values_builder_(pool) { + values_builder_(pool), + byte_width_(static_cast(*type).byte_width()) { if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); } - byte_width_ = static_cast(*type).byte_width(); } #ifndef ARROW_NO_DEPRECATED_API @@ -921,7 +921,7 @@ Status DictionaryBuilder::Append(const Scalar& value) { template Status DictionaryBuilder::AppendArray(const Array& array) { - const NumericArray& numeric_array = static_cast&>(array); + const auto& numeric_array = static_cast&>(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { RETURN_NOT_OK(AppendNull()); @@ -938,8 +938,7 @@ Status DictionaryBuilder::AppendArray(const Array& array) { return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); } - const FixedSizeBinaryArray& numeric_array = - static_cast(array); + const auto& numeric_array = static_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { RETURN_NOT_OK(AppendNull()); @@ -1493,6 +1492,7 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder); DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder); DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder); + DICTIONARY_BUILDER_CASE(DECIMAL, DictionaryBuilder); default: return Status::NotImplemented(type->ToString()); } @@ -1528,6 +1528,7 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool, DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder); DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder); DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder); + DICTIONARY_ARRAY_CASE(DECIMAL, DictionaryBuilder); default: std::stringstream ss; ss << "Cannot encode array of type " << type->ToString(); diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 28f3cb97232..da7386aafef 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -937,8 +937,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { public: - using DictionaryBuilder::DictionaryBuilder; using DictionaryBuilder::Append; + using 
DictionaryBuilder::DictionaryBuilder; Status Append(const uint8_t* value, int32_t length) { return Append(internal::WrappedBinary(value, length)); @@ -958,8 +958,8 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { public: - using DictionaryBuilder::DictionaryBuilder; using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; Status Append(const uint8_t* value, int32_t length) { return Append(internal::WrappedBinary(value, length)); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2aeb03b8b9a..515b8f62f56 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -584,7 +584,7 @@ class TypeEqualsVisitor { typename std::enable_if::value || std::is_base_of::value, Status>::type - Visit(const T& type) { + Visit(const T&) { result_ = true; return Status::OK(); } diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index ee838fa38f4..149cc36b615 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -490,7 +490,10 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input if (can_pre_allocate_values) { std::shared_ptr out_data; - if (!(is_primitive(out->type->id()) || out->type->id() == Type::FIXED_SIZE_BINARY)) { + const Type::type type_id = out->type->id(); + + if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY || + type_id == Type::DECIMAL)) { std::stringstream ss; ss << "Cannot pre-allocate memory for type: " << out->type->ToString(); return Status::NotImplemented(ss.str()); @@ -614,6 +617,7 @@ class CastKernel : public UnaryKernel { FN(IN_TYPE, FloatType); \ FN(IN_TYPE, DoubleType); \ FN(IN_TYPE, FixedSizeBinaryType); \ + FN(IN_TYPE, DecimalType); \ FN(IN_TYPE, BinaryType); \ FN(IN_TYPE, StringType); diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 081cdd90832..7a07512b2ad 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -46,11 +46,11 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& const CastOptions& options, std::unique_ptr* kernel); /// \brief Cast from one array type to another -/// \param[in] context -/// \param[in] array -/// \param[in] to_type -/// \param[in] options -/// \param[out] out +/// \param[in] context the FunctionContext +/// \param[in] array array to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[out] out resulting array /// /// \since 0.7.0 /// \note API not yet finalized diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index ca536321ba3..234bd540e33 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -384,6 +384,8 @@ class OSFile { FileMode::type mode() const { return mode_; } + std::mutex& lock() { return lock_; } + protected: Status SetFileName(const std::string& file_name) { #if defined(_MSC_VER) @@ -461,6 +463,20 @@ Status ReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { return impl_->Read(nbytes, bytes_read, out); } +Status ReadableFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* out) { + std::lock_guard guard(impl_->lock()); + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, bytes_read, out); +} + +Status ReadableFile::ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) { + std::lock_guard guard(impl_->lock()); + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, out); +} + Status ReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { return 
impl_->ReadBuffer(nbytes, out); } @@ -590,6 +606,8 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { int fd() const { return file_->fd(); } + std::mutex& lock() { return file_->lock(); } + private: std::unique_ptr file_; int64_t position_; @@ -671,10 +689,24 @@ Status MemoryMappedFile::Read(int64_t nbytes, std::shared_ptr* out) { return Status::OK(); } +Status MemoryMappedFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* out) { + std::lock_guard guard(memory_map_->lock()); + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, bytes_read, out); +} + +Status MemoryMappedFile::ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) { + std::lock_guard guard(memory_map_->lock()); + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, out); +} + bool MemoryMappedFile::supports_zero_copy() const { return true; } Status MemoryMappedFile::WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { - std::lock_guard guard(lock_); + std::lock_guard guard(memory_map_->lock()); if (!memory_map_->opened() || !memory_map_->writable()) { return Status::IOError("Unable to write"); @@ -685,7 +717,7 @@ Status MemoryMappedFile::WriteAt(int64_t position, const uint8_t* data, int64_t } Status MemoryMappedFile::Write(const uint8_t* data, int64_t nbytes) { - std::lock_guard guard(lock_); + std::lock_guard guard(memory_map_->lock()); if (!memory_map_->opened() || !memory_map_->writable()) { return Status::IOError("Unable to write"); diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index 1b1bbe0e495..47bed346c01 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -86,7 +86,7 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile { /// \param[in] pool a MemoryPool for memory allocations /// \param[out] file ReadableFile instance /// Open file with one's own memory pool for memory allocations - static Status Open(const std::string& path, MemoryPool* memory_pool, + static Status Open(const std::string& path, MemoryPool* pool, std::shared_ptr* file); Status Close() override; @@ -96,6 +96,12 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile { Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* out) override; + + /// Default implementation is thread-safe + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + Status GetSize(int64_t* size) override; Status Seek(int64_t position) override; @@ -139,6 +145,12 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { // Zero copy read. Not thread-safe Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* out) override; + + /// Default implementation is thread-safe + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + bool supports_zero_copy() const override; /// Write data at the current position in the file. 
Thread-safe diff --git a/cpp/src/arrow/io/hdfs-internal.cc b/cpp/src/arrow/io/hdfs-internal.cc index e6d0487a9db..9cd1c5052fe 100644 --- a/cpp/src/arrow/io/hdfs-internal.cc +++ b/cpp/src/arrow/io/hdfs-internal.cc @@ -44,6 +44,7 @@ #include // NOLINT #include "arrow/status.h" +#include "arrow/util/logging.h" namespace fs = boost::filesystem; @@ -346,6 +347,7 @@ bool LibHdfsShim::HasPread() { tSize LibHdfsShim::Pread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length) { GET_SYMBOL(this, hdfsPread); + DCHECK(this->hdfsPread); return this->hdfsPread(fs, file, position, buffer, length); } diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc index 694575b5f06..58fcf7a8145 100644 --- a/cpp/src/arrow/io/interfaces.cc +++ b/cpp/src/arrow/io/interfaces.cc @@ -30,20 +30,6 @@ FileInterface::~FileInterface() {} RandomAccessFile::RandomAccessFile() { set_mode(FileMode::READ); } -Status RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) { - std::lock_guard guard(lock_); - RETURN_NOT_OK(Seek(position)); - return Read(nbytes, bytes_read, out); -} - -Status RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, - std::shared_ptr* out) { - std::lock_guard guard(lock_); - RETURN_NOT_OK(Seek(position)); - return Read(nbytes, out); -} - Status Writeable::Write(const std::string& data) { return Write(reinterpret_cast(data.c_str()), static_cast(data.size())); diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index 11441794e18..1af35efe20c 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -20,7 +20,6 @@ #include #include -#include #include #include @@ -86,11 +85,14 @@ class ARROW_EXPORT FileInterface { class ARROW_EXPORT Seekable { public: + virtual ~Seekable() = default; virtual Status Seek(int64_t position) = 0; }; class ARROW_EXPORT Writeable { public: + virtual ~Writeable() = default; + virtual Status Write(const uint8_t* data, int64_t nbytes) = 0; /// \brief Flush buffered bytes, if any @@ -101,6 +103,8 @@ class ARROW_EXPORT Writeable { class ARROW_EXPORT Readable { public: + virtual ~Readable() = default; + virtual Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) = 0; // Does not copy if not necessary @@ -128,16 +132,13 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable { /// /// Default implementation is thread-safe virtual Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out); + uint8_t* out) = 0; /// Default implementation is thread-safe - virtual Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out); - - std::mutex& lock() { return lock_; } + virtual Status ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) = 0; protected: - std::mutex lock_; - RandomAccessFile(); }; diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc index 636fbd842d0..ee3beabd9a8 100644 --- a/cpp/src/arrow/io/io-file-test.cc +++ b/cpp/src/arrow/io/io-file-test.cc @@ -393,9 +393,9 @@ TEST_F(TestReadableFile, ThreadSafety) { ASSERT_OK(ReadableFile::Open(path_, &pool, &file_)); std::atomic correct_count(0); - const int niter = 10000; + constexpr int niter = 10000; - auto ReadData = [&correct_count, &data, niter, this]() { + auto ReadData = [&correct_count, &data, this, niter]() { std::shared_ptr buffer; for (int i = 0; i < niter; ++i) { @@ -587,9 +587,9 @@ TEST_F(TestMemoryMappedFile, ThreadSafety) { static_cast(data.size()))); std::atomic correct_count(0); - 
const int niter = 10000; + constexpr int niter = 10000; - auto ReadData = [&correct_count, &data, niter, &file]() { + auto ReadData = [&correct_count, &data, &file, niter]() { std::shared_ptr<Buffer> buffer; for (int i = 0; i < niter; ++i) { diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc index eaf638f2002..5305b477462 100644 --- a/cpp/src/arrow/io/io-hdfs-test.cc +++ b/cpp/src/arrow/io/io-hdfs-test.cc @@ -437,7 +437,7 @@ TYPED_TEST(TestHadoopFileSystem, ThreadSafety) { ASSERT_OK(this->client_->OpenReadable(src_path, &file)); std::atomic<int> correct_count(0); - const int niter = 1000; + constexpr int niter = 1000; auto ReadData = [&file, &correct_count, &data, niter]() { for (int i = 0; i < niter; ++i) { diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index 0b91ab5186f..370d3e9566a 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -20,6 +20,7 @@ #include #include #include +#include <mutex> #include "arrow/buffer.h" #include "arrow/status.h" @@ -127,67 +128,109 @@ static constexpr int kMemcopyDefaultNumThreads = 1; static constexpr int64_t kMemcopyDefaultBlocksize = 64; static constexpr int64_t kMemcopyDefaultThreshold = 1024 * 1024; -/// Input buffer must be mutable, will abort if not -FixedSizeBufferWriter::FixedSizeBufferWriter(const std::shared_ptr<Buffer>& buffer) - : memcopy_num_threads_(kMemcopyDefaultNumThreads), - memcopy_blocksize_(kMemcopyDefaultBlocksize), - memcopy_threshold_(kMemcopyDefaultThreshold) { - DCHECK(buffer) << "Buffer was nullptr"; - buffer_ = buffer; - DCHECK(buffer->is_mutable()) << "Must pass mutable buffer"; - mutable_data_ = buffer->mutable_data(); - size_ = buffer->size(); - position_ = 0; -} +class FixedSizeBufferWriter::FixedSizeBufferWriterImpl { + public: + /// Input buffer must be mutable, will abort if not + explicit FixedSizeBufferWriterImpl(const std::shared_ptr<Buffer>& buffer) + : memcopy_num_threads_(kMemcopyDefaultNumThreads), + memcopy_blocksize_(kMemcopyDefaultBlocksize), + memcopy_threshold_(kMemcopyDefaultThreshold) { + buffer_ = buffer; + DCHECK(buffer->is_mutable()) << "Must pass mutable buffer"; + mutable_data_ = buffer->mutable_data(); + size_ = buffer->size(); + position_ = 0; + } -FixedSizeBufferWriter::~FixedSizeBufferWriter() {} + ~FixedSizeBufferWriterImpl() {} -Status FixedSizeBufferWriter::Close() { - // no-op - return Status::OK(); -} + Status Close() { + // No-op + return Status::OK(); + } -Status FixedSizeBufferWriter::Seek(int64_t position) { - if (position < 0 || position >= size_) { - return Status::IOError("position out of bounds"); + Status Seek(int64_t position) { + if (position < 0 || position >= size_) { + return Status::IOError("position out of bounds"); + } + position_ = position; + return Status::OK(); } - position_ = position; - return Status::OK(); -} + + Status Tell(int64_t* position) { + *position = position_; + return Status::OK(); + } + + Status Write(const uint8_t* data, int64_t nbytes) { + if (nbytes > memcopy_threshold_ && memcopy_num_threads_ > 1) { + internal::parallel_memcopy(mutable_data_ + position_, data, nbytes, + memcopy_blocksize_, memcopy_num_threads_); + } else { + memcpy(mutable_data_ + position_, data, nbytes); + } + position_ += nbytes; + return Status::OK(); + } + + Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { + std::lock_guard<std::mutex> guard(lock_); + RETURN_NOT_OK(Seek(position)); + return Write(data, nbytes); + } + + void set_memcopy_threads(int num_threads) { 
memcopy_num_threads_ = num_threads; } + + void set_memcopy_blocksize(int64_t blocksize) { memcopy_blocksize_ = blocksize; } + + void set_memcopy_threshold(int64_t threshold) { memcopy_threshold_ = threshold; } + + private: + std::mutex lock_; + std::shared_ptr buffer_; + uint8_t* mutable_data_; + int64_t size_; + int64_t position_; + + int memcopy_num_threads_; + int64_t memcopy_blocksize_; + int64_t memcopy_threshold_; +}; + +FixedSizeBufferWriter::~FixedSizeBufferWriter() {} + +FixedSizeBufferWriter::FixedSizeBufferWriter(const std::shared_ptr& buffer) + : impl_(new FixedSizeBufferWriterImpl(buffer)) {} + +Status FixedSizeBufferWriter::Close() { return impl_->Close(); } + +Status FixedSizeBufferWriter::Seek(int64_t position) { return impl_->Seek(position); } Status FixedSizeBufferWriter::Tell(int64_t* position) const { - *position = position_; - return Status::OK(); + return impl_->Tell(position); } Status FixedSizeBufferWriter::Write(const uint8_t* data, int64_t nbytes) { - if (nbytes > memcopy_threshold_ && memcopy_num_threads_ > 1) { - internal::parallel_memcopy(mutable_data_ + position_, data, nbytes, - memcopy_blocksize_, memcopy_num_threads_); - } else { - memcpy(mutable_data_ + position_, data, nbytes); - } - position_ += nbytes; - return Status::OK(); + return impl_->Write(data, nbytes); } Status FixedSizeBufferWriter::WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { - std::lock_guard guard(lock_); - RETURN_NOT_OK(Seek(position)); - return Write(data, nbytes); + return impl_->WriteAt(position, data, nbytes); } void FixedSizeBufferWriter::set_memcopy_threads(int num_threads) { - memcopy_num_threads_ = num_threads; + impl_->set_memcopy_threads(num_threads); } void FixedSizeBufferWriter::set_memcopy_blocksize(int64_t blocksize) { - memcopy_blocksize_ = blocksize; + impl_->set_memcopy_blocksize(blocksize); } void FixedSizeBufferWriter::set_memcopy_threshold(int64_t threshold) { - memcopy_threshold_ = threshold; + impl_->set_memcopy_threshold(threshold); } // ---------------------------------------------------------------------- @@ -233,6 +276,18 @@ Status BufferReader::Read(int64_t nbytes, std::shared_ptr* out) { return Status::OK(); } +Status BufferReader::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + uint8_t* out) { + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, bytes_read, out); +} + +Status BufferReader::ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) { + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, out); +} + Status BufferReader::GetSize(int64_t* size) { *size = size_; return Status::OK(); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 563000f77b4..978c198c2dd 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -22,7 +22,6 @@ #include #include -#include #include "arrow/io/interfaces.h" #include "arrow/util/visibility.h" @@ -99,15 +98,8 @@ class ARROW_EXPORT FixedSizeBufferWriter : public WriteableFile { void set_memcopy_threshold(int64_t threshold); protected: - std::mutex lock_; - std::shared_ptr buffer_; - uint8_t* mutable_data_; - int64_t size_; - int64_t position_; - - int memcopy_num_threads_; - int64_t memcopy_blocksize_; - int64_t memcopy_threshold_; + class FixedSizeBufferWriterImpl; + std::unique_ptr impl_; }; /// \class BufferReader @@ -125,6 +117,12 @@ class ARROW_EXPORT BufferReader : public RandomAccessFile { // Zero copy read Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status ReadAt(int64_t position, int64_t nbytes, int64_t* 
bytes_read, + uint8_t* out) override; + + /// Default implementation is thread-safe + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + Status GetSize(int64_t* size) override; Status Seek(int64_t position) override; diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 31dc0e73e0e..9d244f11544 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -477,7 +477,6 @@ fbs::Type ToFlatbufferType(Type::type type) { return fbs::Type_INT64; default: DCHECK(false) << "Cannot reach this code"; - break; } // prevent compiler warning return fbs::Type_MIN; @@ -632,19 +631,19 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { #define VISIT_PRIMITIVE(TYPE) \ Status Visit(const TYPE& values) override { return WritePrimitiveValues(values); } - VISIT_PRIMITIVE(BooleanArray); - VISIT_PRIMITIVE(Int8Array); - VISIT_PRIMITIVE(Int16Array); - VISIT_PRIMITIVE(Int32Array); - VISIT_PRIMITIVE(Int64Array); - VISIT_PRIMITIVE(UInt8Array); - VISIT_PRIMITIVE(UInt16Array); - VISIT_PRIMITIVE(UInt32Array); - VISIT_PRIMITIVE(UInt64Array); - VISIT_PRIMITIVE(FloatArray); - VISIT_PRIMITIVE(DoubleArray); - VISIT_PRIMITIVE(BinaryArray); - VISIT_PRIMITIVE(StringArray); + VISIT_PRIMITIVE(BooleanArray) + VISIT_PRIMITIVE(Int8Array) + VISIT_PRIMITIVE(Int16Array) + VISIT_PRIMITIVE(Int32Array) + VISIT_PRIMITIVE(Int64Array) + VISIT_PRIMITIVE(UInt8Array) + VISIT_PRIMITIVE(UInt16Array) + VISIT_PRIMITIVE(UInt32Array) + VISIT_PRIMITIVE(UInt64Array) + VISIT_PRIMITIVE(FloatArray) + VISIT_PRIMITIVE(DoubleArray) + VISIT_PRIMITIVE(BinaryArray) + VISIT_PRIMITIVE(StringArray) #undef VISIT_PRIMITIVE diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 7855aeafeb0..f2dd9e74e33 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -279,8 +279,7 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) { std::unique_ptr reader; - auto buffer = std::make_shared(reinterpret_cast(result.c_str()), - static_cast(result.size())); + auto buffer = std::make_shared(result); ASSERT_OK(JsonReader::Open(buffer, &reader)); ASSERT_TRUE(reader->schema()->Equals(*schema)); @@ -395,8 +394,7 @@ void CheckRoundtrip(const RecordBatch& batch) { std::string result; ASSERT_OK(writer->Finish(&result)); - auto buffer = std::make_shared(reinterpret_cast(result.c_str()), - static_cast(result.size())); + auto buffer = std::make_shared(result); std::unique_ptr reader; ASSERT_OK(JsonReader::Open(buffer, &reader)); diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index ad3af0fb69f..d454d59b285 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -63,10 +63,10 @@ TEST(TestMessage, Equals) { std::string metadata = "foo"; std::string body = "bar"; - auto b1 = GetBufferFromString(metadata); - auto b2 = GetBufferFromString(metadata); - auto b3 = GetBufferFromString(body); - auto b4 = GetBufferFromString(body); + auto b1 = std::make_shared(metadata); + auto b2 = std::make_shared(metadata); + auto b3 = std::make_shared(body); + auto b4 = std::make_shared(body); Message msg1(b1, b3); Message msg2(b2, b4); diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 082c92556b7..0c587ab7b23 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -29,6 +29,7 @@ #include "arrow/ipc/Schema_generated.h" #include "arrow/ipc/metadata-internal.h" #include "arrow/status.h" +#include "arrow/util/logging.h" 
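Before continuing into message.cc, it is worth noting that the memory.h/memory.cc hunks above are a textbook pimpl (pointer-to-implementation) refactor: FixedSizeBufferWriter's state, including its mutex, moves into FixedSizeBufferWriterImpl in the .cc file, which is what lets the public header drop #include <mutex>. A minimal sketch of the idiom with hypothetical names (Writer/Impl are illustrative, not the Arrow classes):

// writer.h -- public surface only; the implementation's layout stays hidden.
#include <memory>

class Writer {
 public:
  explicit Writer(int capacity);
  ~Writer();  // declared here, defined where Impl is a complete type
  void Put(int value);

 private:
  class Impl;                   // forward declaration only
  std::unique_ptr<Impl> impl_;
};

// writer.cc -- all state, including synchronization, lives here.
#include <mutex>
#include <vector>

class Writer::Impl {
 public:
  explicit Impl(int capacity) { values_.reserve(static_cast<size_t>(capacity)); }
  void Put(int value) {
    std::lock_guard<std::mutex> guard(lock_);
    values_.push_back(value);
  }

 private:
  std::mutex lock_;
  std::vector<int> values_;
};

Writer::Writer(int capacity) : impl_(new Impl(capacity)) {}
Writer::~Writer() = default;  // unique_ptr<Impl> destroyed with Impl complete
void Writer::Put(int value) { impl_->Put(value); }

The out-of-line destructor is the subtle part: std::unique_ptr<Impl> cannot be destroyed where Impl is incomplete, which is why the diff defines FixedSizeBufferWriter::~FixedSizeBufferWriter() in memory.cc rather than in the header.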
namespace arrow { namespace ipc { @@ -194,9 +195,18 @@ std::string FormatMessageType(Message::Type type) { Status ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile* file, std::unique_ptr<Message>* message) { + DCHECK_GT(static_cast<size_t>(metadata_length), sizeof(int32_t)); + std::shared_ptr<Buffer> buffer; RETURN_NOT_OK(file->ReadAt(offset, metadata_length, &buffer)); + if (buffer->size() < metadata_length) { + std::stringstream ss; + ss << "Expected to read " << metadata_length << " metadata bytes but got " + << buffer->size(); + return Status::Invalid(ss.str()); + } + int32_t flatbuffer_size = *reinterpret_cast<const int32_t*>(buffer->data()); if (flatbuffer_size + static_cast<int>(sizeof(int32_t)) > metadata_length) { diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 4bc4384f64e..522b3bdcdbb 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -82,11 +82,10 @@ class ARROW_EXPORT Message { static Status ReadFrom(const std::shared_ptr<Buffer>& metadata, io::InputStream* stream, std::unique_ptr<Message>* out); - /// \brief Write length-prefixed metadata and body to output stream + /// \brief Return true if message type and contents are equal /// - /// \param[in] file output stream to write to - /// \param[out] output_length the number of bytes written - /// \return Status + /// \param other another message + /// \return true if contents equal bool Equals(const Message& other) const; /// \brief the Message metadata diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 09def6ea6ed..e6ba50e742f 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -349,7 +349,6 @@ Status ReadDictionary(const Buffer& metadata, const DictionaryTypeMap& dictionar reinterpret_cast<const flatbuf::RecordBatch*>(dictionary_batch->data()); RETURN_NOT_OK( ReadRecordBatch(batch_meta, dummy_schema, kMaxNestingDepth, file, &batch)); - if (batch->num_columns() != 1) { return Status::Invalid("Dictionary record batch must only contain one field"); } @@ -526,6 +525,13 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { int file_end_size = static_cast<int>(magic_size + sizeof(int32_t)); RETURN_NOT_OK(file_->ReadAt(footer_offset_ - file_end_size, file_end_size, &buffer)); + const int64_t expected_footer_size = magic_size + sizeof(int32_t); + if (buffer->size() < expected_footer_size) { + std::stringstream ss; + ss << "Unable to read " << expected_footer_size << " bytes from end of file"; + return Status::Invalid(ss.str()); + } + if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) { return Status::Invalid("Not an Arrow file"); } diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 54174f9ca17..e90dc1e5234 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -54,8 +54,8 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { /// Create batch reader from generic MessageReader /// - /// \param(in) message_reader a MessageReader implementation - /// \param(out) out the created RecordBatchReader object + /// \param[in] message_reader a MessageReader implementation + /// \param[out] out the created RecordBatchReader object /// \return Status static Status Open(std::unique_ptr<MessageReader> message_reader, std::shared_ptr<RecordBatchReader>* out); @@ -72,9 +72,9 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { /// \brief Record batch stream reader from InputStream /// - /// \param(in) stream an input stream instance. 
Must stay alive throughout /// lifetime of stream reader - /// \param(out) out the created RecordBatchStreamReader object + /// \param[out] out the created RecordBatchStreamReader object /// \return Status static Status Open(io::InputStream* stream, std::shared_ptr<RecordBatchStreamReader>* out); @@ -112,8 +112,8 @@ class ARROW_EXPORT RecordBatchFileReader { /// metadata footer). The metadata must have been written with memory offsets /// relative to the start of the containing file /// - /// @param file: the data source - /// @param footer_offset: the position of the end of the Arrow "file" + /// @param file the data source + /// @param footer_offset the position of the end of the Arrow "file" static Status Open(io::RandomAccessFile* file, int64_t footer_offset, std::shared_ptr<RecordBatchFileReader>* reader); @@ -138,8 +138,8 @@ class ARROW_EXPORT RecordBatchFileReader { /// Read a record batch from the file. Does not copy memory if the input /// source supports zero-copy. /// - /// \param(in) i the index of the record batch to return - /// \param(out) batch the read batch + /// \param[in] i the index of the record batch to return + /// \param[out] batch the read batch /// \return Status Status ReadRecordBatch(int i, std::shared_ptr<RecordBatch>* batch); @@ -165,20 +165,19 @@ Status ReadSchema(io::InputStream* stream, std::shared_ptr<Schema>* out); /// Read record batch as encapsulated IPC message with metadata size prefix and /// header /// -/// \param(in) schema the record batch schema -/// \param(in) offset the file location of the start of the message -/// \param(in) file the file where the batch is located -/// \param(out) out the read record batch +/// \param[in] schema the record batch schema +/// \param[in] stream the input stream to read the batch from +/// \param[out] out the read record batch ARROW_EXPORT Status ReadRecordBatch(const std::shared_ptr<Schema>& schema, io::InputStream* stream, std::shared_ptr<RecordBatch>* out); /// \brief Read record batch from file given metadata and schema /// -/// \param(in) metadata a Message containing the record batch metadata -/// \param(in) schema the record batch schema -/// \param(in) file a random access file -/// \param(out) out the read record batch +/// \param[in] metadata a Message containing the record batch metadata +/// \param[in] schema the record batch schema +/// \param[in] file a random access file +/// \param[out] out the read record batch ARROW_EXPORT Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr<Schema>& schema, io::RandomAccessFile* file, std::shared_ptr<RecordBatch>* out); @@ -186,7 +185,7 @@ Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr<Schema>& sc /// \brief Read record batch from fully encapsulated Message /// /// \param[in] message a message instance containing metadata and body -/// \param[in] schema +/// \param[in] schema the record batch schema /// \param[out] out the resulting RecordBatch /// \return Status ARROW_EXPORT @@ -195,11 +194,11 @@ Status ReadRecordBatch(const Message& message, const std::shared_ptr<Schema>& sc /// Read record batch from file given metadata and schema /// -/// \param(in) metadata a Message containing the record batch metadata -/// \param(in) schema the record batch schema -/// \param(in) file a random access file -/// \param(in) max_recursion_depth the maximum permitted nesting depth -/// \param(out) out the read record batch +/// \param[in] metadata a Message containing the record batch metadata +/// \param[in] schema the record batch schema +/// \param[in] file a random access file +/// \param[in] max_recursion_depth the maximum permitted nesting depth +/// 
\param[out] out the read record batch ARROW_EXPORT Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& schema, int max_recursion_depth, io::RandomAccessFile* file, @@ -207,9 +206,9 @@ Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& sc /// EXPERIMENTAL: Read arrow::Tensor as encapsulated IPC message in file /// -/// \param(in) offset the file location of the start of the message -/// \param(in) file the file where the batch is located -/// \param(out) out the read tensor +/// \param[in] offset the file location of the start of the message +/// \param[in] file the file where the batch is located +/// \param[out] out the read tensor ARROW_EXPORT Status ReadTensor(int64_t offset, io::RandomAccessFile* file, std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 9f557f6659e..9531fd77a24 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -324,24 +324,24 @@ class RecordBatchSerializer : public ArrayVisitor { #define VISIT_FIXED_WIDTH(TYPE) \ Status Visit(const TYPE& array) override { return VisitFixedWidth(array); } - VISIT_FIXED_WIDTH(Int8Array); - VISIT_FIXED_WIDTH(Int16Array); - VISIT_FIXED_WIDTH(Int32Array); - VISIT_FIXED_WIDTH(Int64Array); - VISIT_FIXED_WIDTH(UInt8Array); - VISIT_FIXED_WIDTH(UInt16Array); - VISIT_FIXED_WIDTH(UInt32Array); - VISIT_FIXED_WIDTH(UInt64Array); - VISIT_FIXED_WIDTH(HalfFloatArray); - VISIT_FIXED_WIDTH(FloatArray); - VISIT_FIXED_WIDTH(DoubleArray); - VISIT_FIXED_WIDTH(Date32Array); - VISIT_FIXED_WIDTH(Date64Array); - VISIT_FIXED_WIDTH(TimestampArray); - VISIT_FIXED_WIDTH(Time32Array); - VISIT_FIXED_WIDTH(Time64Array); - VISIT_FIXED_WIDTH(FixedSizeBinaryArray); - VISIT_FIXED_WIDTH(DecimalArray); + VISIT_FIXED_WIDTH(Int8Array) + VISIT_FIXED_WIDTH(Int16Array) + VISIT_FIXED_WIDTH(Int32Array) + VISIT_FIXED_WIDTH(Int64Array) + VISIT_FIXED_WIDTH(UInt8Array) + VISIT_FIXED_WIDTH(UInt16Array) + VISIT_FIXED_WIDTH(UInt32Array) + VISIT_FIXED_WIDTH(UInt64Array) + VISIT_FIXED_WIDTH(HalfFloatArray) + VISIT_FIXED_WIDTH(FloatArray) + VISIT_FIXED_WIDTH(DoubleArray) + VISIT_FIXED_WIDTH(Date32Array) + VISIT_FIXED_WIDTH(Date64Array) + VISIT_FIXED_WIDTH(TimestampArray) + VISIT_FIXED_WIDTH(Time32Array) + VISIT_FIXED_WIDTH(Time64Array) + VISIT_FIXED_WIDTH(FixedSizeBinaryArray) + VISIT_FIXED_WIDTH(DecimalArray) #undef VISIT_FIXED_WIDTH diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index df03f4f1381..89d3f5f2684 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -61,7 +61,7 @@ class ARROW_EXPORT RecordBatchWriter { virtual Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) = 0; /// \brief Write possibly-chunked table by creating sequence of record batches - /// \param[in] table + /// \param[in] table table to write /// \return Status Status WriteTable(const Table& table); @@ -87,9 +87,9 @@ class ARROW_EXPORT RecordBatchStreamWriter : public RecordBatchWriter { /// Create a new writer from stream sink and schema. User is responsible for /// closing the actual OutputStream. 
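As an aside, a hedged usage sketch for this factory-plus-out-parameter style of API (names follow the declarations in this header; the RETURN_NOT_OK macro is assumed to be in scope, and per the note above the caller still owns and closes the sink):

// Hypothetical helper: serialize one batch as an IPC stream.
Status WriteOneBatch(io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
                     const RecordBatch& batch) {
  std::shared_ptr<ipc::RecordBatchStreamWriter> writer;
  RETURN_NOT_OK(ipc::RecordBatchStreamWriter::Open(sink, schema, &writer));
  RETURN_NOT_OK(writer->WriteRecordBatch(batch));
  return writer->Close();  // closes the writer, not the underlying OutputStream
}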
/// - /// \param(in) sink output stream to write to - /// \param(in) schema the schema of the record batches to be written - /// \param(out) out the created stream writer + /// \param[in] sink output stream to write to + /// \param[in] schema the schema of the record batches to be written + /// \param[out] out the created stream writer /// \return Status static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, std::shared_ptr* out); @@ -121,9 +121,9 @@ class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter { /// Create a new writer from stream sink and schema /// - /// \param(in) sink output stream to write to - /// \param(in) schema the schema of the record batches to be written - /// \param(out) out the created stream writer + /// \param[in] sink output stream to write to + /// \param[in] schema the schema of the record batches to be written + /// \param[out] out the created stream writer /// \return Status static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, std::shared_ptr* out); @@ -155,13 +155,13 @@ class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter { /// to the end of the body and end of the metadata / data header (suffixed by /// the header size) is returned in out-variables /// -/// \param(in) buffer_start_offset the start offset to use in the buffer metadata, +/// \param[in] buffer_start_offset the start offset to use in the buffer metadata, /// default should be 0 -/// \param(in) allow_64bit permit field lengths exceeding INT32_MAX. May not be +/// \param[in] allow_64bit permit field lengths exceeding INT32_MAX. May not be /// readable by other Arrow implementations -/// \param(out) metadata_length: the size of the length-prefixed flatbuffer +/// \param[out] metadata_length the size of the length-prefixed flatbuffer /// including padding to a 64-byte boundary -/// \param(out) body_length: the size of the contiguous buffer block plus +/// \param[out] body_length the size of the contiguous buffer block plus /// padding bytes /// \return Status /// @@ -198,7 +198,7 @@ Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool, /// \brief Serialize schema using stream writer as a sequence of one or more /// IPC messages /// -/// \param[in] scheam the schema to write +/// \param[in] schema the schema to write /// \param[in] pool a MemoryPool to allocate memory from /// \param[out] out the serialized schema /// \return Status diff --git a/cpp/src/arrow/memory_pool-test.cc b/cpp/src/arrow/memory_pool-test.cc index 552c79b5ae7..0a4785d5229 100644 --- a/cpp/src/arrow/memory_pool-test.cc +++ b/cpp/src/arrow/memory_pool-test.cc @@ -59,39 +59,36 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { } TEST(DefaultMemoryPoolDeathTest, MaxMemory) { - DefaultMemoryPool pool; - - ASSERT_EQ(0, pool.max_memory()); + MemoryPool* pool = default_memory_pool(); uint8_t* data; - ASSERT_OK(pool.Allocate(100, &data)); + ASSERT_OK(pool->Allocate(100, &data)); uint8_t* data2; - ASSERT_OK(pool.Allocate(100, &data2)); + ASSERT_OK(pool->Allocate(100, &data2)); - pool.Free(data, 100); - pool.Free(data2, 100); + pool->Free(data, 100); + pool->Free(data2, 100); - ASSERT_EQ(200, pool.max_memory()); + ASSERT_EQ(200, pool->max_memory()); } #endif // ARROW_VALGRIND TEST(LoggingMemoryPool, Logging) { - DefaultMemoryPool pool; - LoggingMemoryPool lp(&pool); + MemoryPool* pool = default_memory_pool(); - ASSERT_EQ(0, lp.max_memory()); + LoggingMemoryPool lp(pool); uint8_t* data; - ASSERT_OK(pool.Allocate(100, &data)); + 
ASSERT_OK(pool->Allocate(100, &data)); uint8_t* data2; - ASSERT_OK(pool.Allocate(100, &data2)); + ASSERT_OK(pool->Allocate(100, &data2)); - pool.Free(data, 100); - pool.Free(data2, 100); + pool->Free(data, 100); + pool->Free(data2, 100); - ASSERT_EQ(200, pool.max_memory()); + ASSERT_EQ(200, pool->max_memory()); } } // namespace arrow diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index d86fb08be89..3496636a405 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -85,73 +85,82 @@ MemoryPool::~MemoryPool() {} int64_t MemoryPool::max_memory() const { return -1; } -DefaultMemoryPool::DefaultMemoryPool() : bytes_allocated_(0) { max_memory_ = 0; } +class DefaultMemoryPool : public MemoryPool { + public: + DefaultMemoryPool() : bytes_allocated_(0) { max_memory_ = 0; } -Status DefaultMemoryPool::Allocate(int64_t size, uint8_t** out) { - RETURN_NOT_OK(AllocateAligned(size, out)); - bytes_allocated_ += size; + ~DefaultMemoryPool() {} - { - std::lock_guard guard(lock_); - if (bytes_allocated_ > max_memory_) { - max_memory_ = bytes_allocated_.load(); + Status Allocate(int64_t size, uint8_t** out) override { + RETURN_NOT_OK(AllocateAligned(size, out)); + bytes_allocated_ += size; + + { + std::lock_guard guard(lock_); + if (bytes_allocated_ > max_memory_) { + max_memory_ = bytes_allocated_.load(); + } } + return Status::OK(); } - return Status::OK(); -} -Status DefaultMemoryPool::Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) { + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { #ifdef ARROW_JEMALLOC - *ptr = reinterpret_cast(rallocx(*ptr, new_size, MALLOCX_ALIGN(kAlignment))); - if (*ptr == NULL) { - std::stringstream ss; - ss << "realloc of size " << new_size << " failed"; - return Status::OutOfMemory(ss.str()); - } + *ptr = reinterpret_cast(rallocx(*ptr, new_size, MALLOCX_ALIGN(kAlignment))); + if (*ptr == NULL) { + std::stringstream ss; + ss << "realloc of size " << new_size << " failed"; + return Status::OutOfMemory(ss.str()); + } #else - // Note: We cannot use realloc() here as it doesn't guarantee alignment. - - // Allocate new chunk - uint8_t* out; - RETURN_NOT_OK(AllocateAligned(new_size, &out)); - // Copy contents and release old memory chunk - memcpy(out, *ptr, static_cast(std::min(new_size, old_size))); + // Note: We cannot use realloc() here as it doesn't guarantee alignment. 
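The note ending the hunk above is the crux of Reallocate(): since realloc() cannot preserve the pool's 64-byte alignment guarantee, the pool must allocate a fresh aligned block, copy the overlapping prefix, and free the old block. A rough POSIX sketch of the same strategy (posix_memalign stands in for the pool's real allocator; kAlignment mirrors Arrow's constant, but the helper itself is hypothetical):

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>

static constexpr size_t kAlignment = 64;

// Returns nullptr on failure; on success the old block is freed and the
// first min(old_size, new_size) bytes are preserved at the new location.
uint8_t* ReallocateAligned(uint8_t* old_ptr, size_t old_size, size_t new_size) {
  void* fresh = nullptr;
  if (posix_memalign(&fresh, kAlignment, new_size) != 0) {
    return nullptr;  // allocation failed; old block untouched
  }
  std::memcpy(fresh, old_ptr, std::min(old_size, new_size));
  std::free(old_ptr);
  return static_cast<uint8_t*>(fresh);
}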
+ + // Allocate new chunk + uint8_t* out = nullptr; + RETURN_NOT_OK(AllocateAligned(new_size, &out)); + DCHECK(out); + // Copy contents and release old memory chunk + memcpy(out, *ptr, static_cast(std::min(new_size, old_size))); #ifdef _MSC_VER - _aligned_free(*ptr); + _aligned_free(*ptr); #else - std::free(*ptr); + std::free(*ptr); #endif // defined(_MSC_VER) - *ptr = out; + *ptr = out; #endif // defined(ARROW_JEMALLOC) - bytes_allocated_ += new_size - old_size; - { - std::lock_guard guard(lock_); - if (bytes_allocated_ > max_memory_) { - max_memory_ = bytes_allocated_.load(); + bytes_allocated_ += new_size - old_size; + { + std::lock_guard guard(lock_); + if (bytes_allocated_ > max_memory_) { + max_memory_ = bytes_allocated_.load(); + } } - } - return Status::OK(); -} + return Status::OK(); + } -int64_t DefaultMemoryPool::bytes_allocated() const { return bytes_allocated_.load(); } + int64_t bytes_allocated() const override { return bytes_allocated_.load(); } -void DefaultMemoryPool::Free(uint8_t* buffer, int64_t size) { - DCHECK_GE(bytes_allocated_, size); + void Free(uint8_t* buffer, int64_t size) override { + DCHECK_GE(bytes_allocated_, size); #ifdef _MSC_VER - _aligned_free(buffer); + _aligned_free(buffer); #elif defined(ARROW_JEMALLOC) - dallocx(buffer, MALLOCX_ALIGN(kAlignment)); + dallocx(buffer, MALLOCX_ALIGN(kAlignment)); #else - std::free(buffer); + std::free(buffer); #endif - bytes_allocated_ -= size; -} + bytes_allocated_ -= size; + } -int64_t DefaultMemoryPool::max_memory() const { return max_memory_.load(); } + int64_t max_memory() const override { return max_memory_.load(); } -DefaultMemoryPool::~DefaultMemoryPool() {} + private: + mutable std::mutex lock_; + std::atomic bytes_allocated_; + std::atomic max_memory_; +}; MemoryPool* default_memory_pool() { static DefaultMemoryPool default_memory_pool_; diff --git a/cpp/src/arrow/memory_pool.h b/cpp/src/arrow/memory_pool.h index 5bb2b5669ed..52ec67fee8d 100644 --- a/cpp/src/arrow/memory_pool.h +++ b/cpp/src/arrow/memory_pool.h @@ -20,7 +20,6 @@ #include #include -#include #include "arrow/util/visibility.h" @@ -69,26 +68,6 @@ class ARROW_EXPORT MemoryPool { MemoryPool(); }; -class ARROW_EXPORT DefaultMemoryPool : public MemoryPool { - public: - DefaultMemoryPool(); - virtual ~DefaultMemoryPool(); - - Status Allocate(int64_t size, uint8_t** out) override; - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override; - - void Free(uint8_t* buffer, int64_t size) override; - - int64_t bytes_allocated() const override; - - int64_t max_memory() const override; - - private: - mutable std::mutex lock_; - std::atomic bytes_allocated_; - std::atomic max_memory_; -}; - class ARROW_EXPORT LoggingMemoryPool : public MemoryPool { public: explicit LoggingMemoryPool(MemoryPool* pool); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index cc1acf4befd..31b93442402 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -199,7 +199,7 @@ class ArrayPrinter : public PrettyPrinter { } } - Status Visit(const NullArray& array) { return Status::OK(); } + Status Visit(const NullArray&) { return Status::OK(); } template typename std::enable_if::value || @@ -213,7 +213,7 @@ class ArrayPrinter : public PrettyPrinter { return Status::OK(); } - Status Visit(const IntervalArray& array) { return Status::NotImplemented("interval"); } + Status Visit(const IntervalArray&) { return Status::NotImplemented("interval"); } Status WriteValidityBitmap(const Array& array); diff --git 
a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 84aad82e2a9..af53a1631f7 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -40,7 +40,7 @@ endif() set(ARROW_PYTHON_MIN_TEST_LIBS arrow_python_test_main - arrow_python_shared + arrow_python_static arrow_shared) set(ARROW_PYTHON_TEST_LINK_LIBS ${ARROW_PYTHON_MIN_TEST_LIBS}) @@ -57,7 +57,7 @@ set(ARROW_PYTHON_SRCS init.cc io.cc numpy_convert.cc - pandas_to_arrow.cc + numpy_to_arrow.cc python_to_arrow.cc pyarrow.cc ) @@ -100,7 +100,7 @@ install(FILES io.h numpy_convert.h numpy_interop.h - pandas_to_arrow.h + numpy_to_arrow.h python_to_arrow.h platform.h pyarrow.h diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index 4ceb3f1a45d..a000ac5fa5a 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -25,7 +25,7 @@ #include "arrow/python/helpers.h" #include "arrow/python/io.h" #include "arrow/python/numpy_convert.h" -#include "arrow/python/pandas_to_arrow.h" +#include "arrow/python/numpy_to_arrow.h" #include "arrow/python/python_to_arrow.h" #endif // ARROW_PYTHON_API_H diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index be738e7f922..88b594cac94 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -615,8 +615,8 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, PyAcquireGIL lock; OwnedRef decimal_ref; OwnedRef Decimal_ref; - RETURN_NOT_OK(ImportModule("decimal", &decimal_ref)); - RETURN_NOT_OK(ImportFromModule(decimal_ref, "Decimal", &Decimal_ref)); + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_ref)); + RETURN_NOT_OK(internal::ImportFromModule(decimal_ref, "Decimal", &Decimal_ref)); PyObject* Decimal = Decimal_ref.obj(); for (int c = 0; c < data.num_chunks(); c++) { @@ -633,7 +633,8 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, const uint8_t* raw_value = arr->GetValue(i); std::string decimal_string; RETURN_NOT_OK(RawDecimalToString(raw_value, precision, scale, &decimal_string)); - RETURN_NOT_OK(DecimalFromString(Decimal, decimal_string, out_values++)); + *out_values++ = internal::DecimalFromString(Decimal, decimal_string); + RETURN_IF_PYERROR(); } } } diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc index a281fe3c629..b4f4a41f440 100644 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -59,6 +59,10 @@ Status DeserializeDict(PyObject* context, const Array& array, int64_t start_idx, const auto& data = static_cast(array); ScopedRef keys, vals; ScopedRef result(PyDict_New()); + RETURN_IF_PYERROR(); + + DCHECK_EQ(2, data.num_fields()); + RETURN_NOT_OK(DeserializeList(context, *data.field(0), start_idx, stop_idx, base, tensors, keys.ref())); RETURN_NOT_OK(DeserializeList(context, *data.field(1), start_idx, stop_idx, base, diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index 3650c058efd..7509f30eb4e 100644 --- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -54,7 +54,7 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); /// _serialize_callback method for serialization and a _deserialize_callback /// method for deserialization. If context is None, no custom serialization /// will be attempted. 
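One detail worth pausing on in the arrow_to_python.cc hunk above: RETURN_IF_PYERROR() is added immediately after PyDict_New(). Object-returning CPython calls report failure by returning nullptr with a pending exception, and the safest pattern is to surface that as a Status right away rather than let a later call trip over it. A hedged sketch of the convention (MakeEmptyDict is a hypothetical helper, not part of the API):

#include <Python.h>
#include "arrow/status.h"

arrow::Status MakeEmptyDict(PyObject** out) {
  PyObject* dict = PyDict_New();  // nullptr + pending exception on failure
  if (dict == nullptr) {
    PyErr_Clear();  // a fuller version would capture the exception text first
    return arrow::Status::OutOfMemory("PyDict_New failed");
  }
  *out = dict;  // caller owns the new reference
  return arrow::Status::OK();
}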
-/// \param[in] object +/// \param[in] object object to deserialize /// \param[in] base a Python object holding the underlying data that any NumPy /// arrays will reference, to avoid premature deallocation /// \param[out] out the returned object diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 747b872af0a..69a19e7a367 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -359,7 +360,11 @@ class TypedConverterVisitor : public TypedConverter { if (PySequence_Check(obj)) { for (int64_t i = 0; i < size; ++i) { OwnedRef ref(PySequence_GetItem(obj, i)); - RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + if (ref.obj() == Py_None) { + RETURN_NOT_OK(this->typed_builder_->AppendNull()); + } else { + RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + } } } else if (PyObject_HasAttrString(obj, "__iter__")) { PyObject* iter = PyObject_GetIter(obj); @@ -370,7 +375,11 @@ class TypedConverterVisitor : public TypedConverter { // consuming at size. while ((item = PyIter_Next(iter)) && i < size) { OwnedRef ref(item); - RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + if (ref.obj() == Py_None) { + RETURN_NOT_OK(this->typed_builder_->AppendNull()); + } else { + RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + } ++i; } if (size != i) { @@ -388,52 +397,136 @@ class TypedConverterVisitor : public TypedConverter { class NullConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - return Status::Invalid("NullConverter: passed non-None value"); - } + return Status::Invalid("NullConverter: passed non-None value"); } }; class BoolConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - if (item.obj() == Py_True) { - return typed_builder_->Append(true); - } else { - return typed_builder_->Append(false); - } + return typed_builder_->Append(item.obj() == Py_True); + } +}; + +class Int8Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class Int16Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); + } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class Int32Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); + } + RETURN_IF_PYERROR(); + return 
typed_builder_->Append(static_cast(val)); } }; class Int64Converter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - int64_t val; - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - val = static_cast(PyLong_AsLongLong(item.obj())); - RETURN_IF_PYERROR(); - return typed_builder_->Append(val); + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); + return typed_builder_->Append(val); + } +}; + +class UInt8Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); } }; -class DateConverter : public TypedConverterVisitor { +class UInt16Converter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - PyDateTime_Date* pydate = reinterpret_cast(item.obj()); - return typed_builder_->Append(PyDate_to_ms(pydate)); + uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class UInt32Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + + if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || + val < std::numeric_limits::min())) { + return Status::Invalid( + "Cannot coerce values to array type that would " + "lose data"); + } + RETURN_IF_PYERROR(); + return typed_builder_->Append(static_cast(val)); + } +}; + +class UInt64Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); + return typed_builder_->Append(val); + } +}; + +class DateConverter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + auto pydate = reinterpret_cast(item.obj()); + return typed_builder_->Append(PyDate_to_ms(pydate)); } }; @@ -441,27 +534,17 @@ class TimestampConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - PyDateTime_DateTime* pydatetime = - reinterpret_cast(item.obj()); - return typed_builder_->Append(PyDateTime_to_us(pydatetime)); - } + auto pydatetime = reinterpret_cast(item.obj()); + return typed_builder_->Append(PyDateTime_to_us(pydatetime)); } }; class DoubleConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { - double val; - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - val = PyFloat_AsDouble(item.obj()); - RETURN_IF_PYERROR(); - return typed_builder_->Append(val); - } + double val = PyFloat_AsDouble(item.obj()); + RETURN_IF_PYERROR(); + return typed_builder_->Append(val); } }; @@ -473,10 +556,7 @@ class BytesConverter : public 
TypedConverterVisitor<BinaryBuilder, BytesConverter> PyObject* bytes_obj; OwnedRef tmp; - if (item.obj() == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - return Status::OK(); - } else if (PyUnicode_Check(item.obj())) { + if (PyUnicode_Check(item.obj())) { tmp.reset(PyUnicode_AsUTF8String(item.obj())); RETURN_IF_PYERROR(); bytes_obj = tmp.obj(); @@ -504,10 +584,7 @@ class FixedWidthBytesConverter Py_ssize_t expected_length = std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type()) ->byte_width(); - if (item.obj() == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - return Status::OK(); - } else if (PyUnicode_Check(item.obj())) { + if (PyUnicode_Check(item.obj())) { tmp.reset(PyUnicode_AsUTF8String(item.obj())); RETURN_IF_PYERROR(); bytes_obj = tmp.obj(); @@ -535,9 +612,7 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter> Py_ssize_t length; PyObject* obj = item.obj(); - if (obj == Py_None) { - return typed_builder_->AppendNull(); - } else if (PyBytes_Check(obj)) { + if (PyBytes_Check(obj)) { tmp.reset( PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj))); RETURN_IF_PYERROR(); @@ -565,14 +640,10 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> { Status Init(ArrayBuilder* builder) override; inline Status AppendItem(const OwnedRef& item) override { - if (item.obj() == Py_None) { - return typed_builder_->AppendNull(); - } else { - RETURN_NOT_OK(typed_builder_->Append()); - PyObject* item_obj = item.obj(); - int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj)); - return value_converter_->AppendData(item_obj, list_size); - } + RETURN_NOT_OK(typed_builder_->Append()); + PyObject* item_obj = item.obj(); + int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj)); + return value_converter_->AppendData(item_obj, list_size); } protected: @@ -584,16 +655,12 @@ class DecimalConverter public: inline Status AppendItem(const OwnedRef& item) { /// TODO(phillipc): Check for nan? 
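The converter hunks here all share one mechanical change: the Py_None test that every AppendItem() used to repeat has been hoisted into TypedConverterVisitor::AppendData() (see the builtin_convert.cc hunk earlier in this diff), so the per-type converters only ever see non-null items. Condensed from this diff's own code, with the class boilerplate elided:

// Inside TypedConverterVisitor<BuilderType, Derived>::AppendData(...):
for (int64_t i = 0; i < size; ++i) {
  OwnedRef ref(PySequence_GetItem(obj, i));
  if (ref.obj() == Py_None) {
    RETURN_NOT_OK(this->typed_builder_->AppendNull());  // centralized null path
  } else {
    // Derived converters can now assume a non-null item.
    RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
  }
}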
- if (item.obj() != Py_None) { - std::string string; - RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string)); - - Decimal128 value; - RETURN_NOT_OK(Decimal128::FromString(string, &value)); - return typed_builder_->Append(value); - } + std::string string; + RETURN_NOT_OK(internal::PythonDecimalToString(item.obj(), &string)); - return typed_builder_->AppendNull(); + Decimal128 value; + RETURN_NOT_OK(Decimal128::FromString(string, &value)); + return typed_builder_->Append(value); } }; @@ -604,8 +671,22 @@ std::shared_ptr GetConverter(const std::shared_ptr& type return std::make_shared(); case Type::BOOL: return std::make_shared(); + case Type::INT8: + return std::make_shared(); + case Type::INT16: + return std::make_shared(); + case Type::INT32: + return std::make_shared(); case Type::INT64: return std::make_shared(); + case Type::UINT8: + return std::make_shared(); + case Type::UINT16: + return std::make_shared(); + case Type::UINT32: + return std::make_shared(); + case Type::UINT64: + return std::make_shared(); case Type::DATE64: return std::make_shared(); case Type::TIMESTAMP: diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index fb2fed7f0ca..ad6a7f125e0 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -55,6 +55,8 @@ std::shared_ptr GetPrimitiveType(Type::type type) { } } +namespace internal { + Status ImportModule(const std::string& module_name, OwnedRef* ref) { PyObject* module = PyImport_ImportModule(module_name.c_str()); RETURN_IF_PYERROR(); @@ -106,10 +108,9 @@ Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int* precision, return Decimal128::FromString(c_string, nullptr, precision, scale); } -Status DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string, - PyObject** out) { +PyObject* DecimalFromString(PyObject* decimal_constructor, + const std::string& decimal_string) { DCHECK_NE(decimal_constructor, nullptr); - DCHECK_NE(out, nullptr); auto string_size = decimal_string.size(); DCHECK_GT(string_size, 0); @@ -117,11 +118,10 @@ Status DecimalFromString(PyObject* decimal_constructor, const std::string& decim auto string_bytes = decimal_string.c_str(); DCHECK_NE(string_bytes, nullptr); - *out = PyObject_CallFunction(decimal_constructor, const_cast("s#"), string_bytes, + return PyObject_CallFunction(decimal_constructor, const_cast("s#"), string_bytes, string_size); - RETURN_IF_PYERROR(); - return Status::OK(); } +} // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 8b8c6673c8e..01ab91657d9 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -28,26 +28,28 @@ #include "arrow/util/visibility.h" namespace arrow { - namespace py { class OwnedRef; -ARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); +ARROW_EXPORT +std::shared_ptr GetPrimitiveType(Type::type type); + +namespace internal { -Status ARROW_EXPORT ImportModule(const std::string& module_name, OwnedRef* ref); -Status ARROW_EXPORT ImportFromModule(const OwnedRef& module, - const std::string& module_name, OwnedRef* ref); +Status ImportModule(const std::string& module_name, OwnedRef* ref); +Status ImportFromModule(const OwnedRef& module, const std::string& module_name, + OwnedRef* ref); -Status ARROW_EXPORT PythonDecimalToString(PyObject* python_decimal, std::string* out); +Status PythonDecimalToString(PyObject* python_decimal, std::string* out); -Status ARROW_EXPORT 
InferDecimalPrecisionAndScale(PyObject* python_decimal, - int* precision = nullptr, - int* scale = nullptr); +Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int* precision = nullptr, + int* scale = nullptr); -Status ARROW_EXPORT DecimalFromString(PyObject* decimal_constructor, - const std::string& decimal_string, PyObject** out); +PyObject* DecimalFromString(PyObject* decimal_constructor, + const std::string& decimal_string); +} // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc similarity index 87% rename from cpp/src/arrow/python/pandas_to_arrow.cc rename to cpp/src/arrow/python/numpy_to_arrow.cc index dc5b67f53e4..c0ce61cca55 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -19,10 +19,9 @@ #define ARROW_NO_DEFAULT_MEMORY_POOL +#include "arrow/python/numpy_to_arrow.h" #include "arrow/python/numpy_interop.h" -#include "arrow/python/pandas_to_arrow.h" - #include #include #include @@ -60,10 +59,14 @@ namespace py { using internal::NumPyTypeSize; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max(); + // ---------------------------------------------------------------------- // Conversion utilities -static inline bool PyFloat_isnan(const PyObject* obj) { +namespace { + +inline bool PyFloat_isnan(const PyObject* obj) { if (PyFloat_Check(obj)) { double val = PyFloat_AS_DOUBLE(obj); return val != val; @@ -71,11 +74,12 @@ static inline bool PyFloat_isnan(const PyObject* obj) { return false; } } -static inline bool PandasObjectIsNull(const PyObject* obj) { + +inline bool PandasObjectIsNull(const PyObject* obj) { return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj); } -static inline bool PyObject_is_string(const PyObject* obj) { +inline bool PyObject_is_string(const PyObject* obj) { #if PY_MAJOR_VERSION >= 3 return PyUnicode_Check(obj) || PyBytes_Check(obj); #else @@ -83,14 +87,14 @@ static inline bool PyObject_is_string(const PyObject* obj) { #endif } -static inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); } +inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); } -static inline bool PyObject_is_integer(const PyObject* obj) { +inline bool PyObject_is_integer(const PyObject* obj) { return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); } template -static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { +inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { typedef internal::npy_traits traits; typedef typename traits::value_type T; @@ -109,7 +113,7 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { } // Returns null count -static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { +int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { int64_t null_count = 0; Ndarray1DIndexer mask_values(mask); @@ -123,29 +127,6 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap return null_count; } -template -static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { - typedef internal::npy_traits traits; - typedef typename traits::value_type T; - - // TODO(wesm): Vector append when not strided - Ndarray1DIndexer values(array); - if (traits::supports_nulls) { - for (int64_t i = 0; i < values.size(); ++i) { - if (traits::isnull(values[i])) { - RETURN_NOT_OK(builder->AppendNull()); - } else { - 
RETURN_NOT_OK(builder->Append(values[i])); - } - } - } else { - for (int64_t i = 0; i < values.size(); ++i) { - RETURN_NOT_OK(builder->Append(values[i])); - } - } - return Status::OK(); -} - Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { if (PyArray_NDIM(numpy_array) != 1) { return Status::Invalid("only handle 1-dimensional arrays"); @@ -162,13 +143,13 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { return Status::OK(); } -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max(); +} // namespace /// Append as many string objects from NumPy arrays to a `StringBuilder` as we /// can fit /// /// \param[in] offset starting offset for appending -/// \param[out] values_consumed ending offset where we stopped appending. Will +/// \param[out] end_offset ending offset where we stopped appending. Will /// be length of arr if fully consumed /// \param[out] have_bytes true if we encountered any PyBytes object static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64_t offset, @@ -272,14 +253,15 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas // ---------------------------------------------------------------------- // Conversion from NumPy-in-Pandas to Arrow -class PandasConverter { +class NumPyConverter { public: - PandasConverter(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type) + NumPyConverter(MemoryPool* pool, PyObject* ao, PyObject* mo, + const std::shared_ptr& type, bool use_pandas_null_sentinels) : pool_(pool), type_(type), arr_(reinterpret_cast(ao)), - mask_(nullptr) { + mask_(nullptr), + use_pandas_null_sentinels_(use_pandas_null_sentinels) { if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast(mo); } @@ -291,6 +273,39 @@ class PandasConverter { return astrides[0] != PyArray_DESCR(arr_)->elsize; } + Status Convert(); + + const std::vector>& result() const { return out_arrays_; } + + template + typename std::enable_if::value || + std::is_same::value, + Status>::type + Visit(const T& type) { + return VisitNative(); + } + + Status Visit(const Date32Type& type) { return VisitNative(); } + Status Visit(const Date64Type& type) { return VisitNative(); } + Status Visit(const TimestampType& type) { return VisitNative(); } + Status Visit(const Time32Type& type) { return VisitNative(); } + Status Visit(const Time64Type& type) { return VisitNative(); } + + Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const FixedSizeBinaryType& type) { + return TypeNotImplemented(type.ToString()); + } + + Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); } + + Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); } + + protected: Status InitNullBitmap() { int64_t null_bytes = BitUtil::BytesForBits(length_); @@ -317,6 +332,32 @@ class PandasConverter { return Status::OK(); } + template + Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) { + typedef internal::npy_traits traits; + typedef typename traits::value_type T; + + const bool null_sentinels_possible = + (use_pandas_null_sentinels_ && traits::supports_nulls); + + // TODO(wesm): Vector append when not strided + Ndarray1DIndexer values(array); + if (null_sentinels_possible) { + for (int64_t i = 0; i < 
values.size(); ++i) { + if (traits::isnull(values[i])) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->Append(values[i])); + } + } + } else { + for (int64_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(builder->Append(values[i])); + } + } + return Status::OK(); + } + Status PushArray(const std::shared_ptr& data) { std::shared_ptr result; RETURN_NOT_OK(MakeArray(data, &result)); @@ -328,7 +369,10 @@ class PandasConverter { Status VisitNative() { using traits = internal::arrow_traits; - if (mask_ != nullptr || traits::supports_nulls) { + const bool null_sentinels_possible = + (use_pandas_null_sentinels_ && traits::supports_nulls); + + if (mask_ != nullptr || null_sentinels_possible) { RETURN_NOT_OK(InitNullBitmap()); } @@ -338,7 +382,7 @@ class PandasConverter { int64_t null_count = 0; if (mask_ != nullptr) { null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); - } else if (traits::supports_nulls) { + } else if (null_sentinels_possible) { // TODO(wesm): this presumes the NumPy C type and arrow C type are the // same null_count = ValuesToBitmap(arr_, null_bitmap_data_); @@ -350,58 +394,17 @@ class PandasConverter { return PushArray(arr_data); } - template - typename std::enable_if::value || - std::is_same::value, - Status>::type - Visit(const T& type) { - return VisitNative(); - } - - Status Visit(const Date32Type& type) { return VisitNative(); } - Status Visit(const Date64Type& type) { return VisitNative(); } - Status Visit(const TimestampType& type) { return VisitNative(); } - Status Visit(const Time32Type& type) { return VisitNative(); } - Status Visit(const Time64Type& type) { return VisitNative(); } - Status TypeNotImplemented(std::string type_name) { std::stringstream ss; - ss << "PandasConverter doesn't implement <" << type_name << "> conversion. "; + ss << "NumPyConverter doesn't implement <" << type_name << "> conversion. 
"; return Status::NotImplemented(ss.str()); } - Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const FixedSizeBinaryType& type) { - return TypeNotImplemented(type.ToString()); - } - - Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); } - - Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); } - - Status Convert() { - if (PyArray_NDIM(arr_) != 1) { - return Status::Invalid("only handle 1-dimensional arrays"); - } - - if (type_ == nullptr) { - return Status::Invalid("Must pass data type"); - } - - // Visit the type to perform conversion - return VisitTypeInline(*type_, this); - } - - const std::vector>& result() const { return out_arrays_; } - // ---------------------------------------------------------------------- // Conversion logic for various object dtype arrays + Status ConvertObjects(); + template Status ConvertTypedLists(const std::shared_ptr& type, ListBuilder* builder, PyObject* list); @@ -419,17 +422,17 @@ class PandasConverter { PyObject* list); Status ConvertDecimals(); Status ConvertTimes(); - Status ConvertObjects(); Status ConvertObjectsInfer(); Status ConvertObjectsInferAndCast(); - protected: MemoryPool* pool_; std::shared_ptr type_; PyArrayObject* arr_; PyArrayObject* mask_; int64_t length_; + bool use_pandas_null_sentinels_; + // Used in visitor pattern std::vector> out_arrays_; @@ -437,6 +440,23 @@ class PandasConverter { uint8_t* null_bitmap_data_; }; +Status NumPyConverter::Convert() { + if (PyArray_NDIM(arr_) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + if (PyArray_DESCR(arr_)->type_num == NPY_OBJECT) { + return ConvertObjects(); + } + + if (type_ == nullptr) { + return Status::Invalid("Must pass data type for non-object arrays"); + } + + // Visit the type to perform conversion + return VisitTypeInline(*type_, this); +} + template void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) { // Passing input_data as non-const is a concession to PyObject* @@ -482,7 +502,7 @@ static Status CastBuffer(const std::shared_ptr& input, const int64_t len } template -inline Status PandasConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { using traits = internal::arrow_traits; using T = typename traits::T; @@ -513,7 +533,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* data) { } template <> -inline Status PandasConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { // Handle LONGLONG->INT64 and other fun things int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num); int type_size = NumPyTypeSize(type_num_compat); @@ -552,7 +572,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* } template <> -inline Status PandasConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { int64_t nbytes = BitUtil::BytesForBits(length_); auto buffer = std::make_shared(pool_); RETURN_NOT_OK(buffer->Resize(nbytes)); @@ -590,7 +610,7 @@ struct UnboxDate { }; template -Status PandasConverter::ConvertDates() { +Status NumPyConverter::ConvertDates() { PyAcquireGIL lock; using BuilderType = typename TypeTraits::BuilderType; @@ 
-626,14 +646,14 @@ Status PandasConverter::ConvertDates() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertDecimals() { +Status NumPyConverter::ConvertDecimals() { PyAcquireGIL lock; // Import the decimal module and Decimal class OwnedRef decimal; OwnedRef Decimal; - RETURN_NOT_OK(ImportModule("decimal", &decimal)); - RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); + RETURN_NOT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); Ndarray1DIndexer objects(arr_); PyObject* object = objects[0]; @@ -641,7 +661,7 @@ Status PandasConverter::ConvertDecimals() { int precision; int scale; - RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale)); + RETURN_NOT_OK(internal::InferDecimalPrecisionAndScale(object, &precision, &scale)); type_ = std::make_shared(precision, scale); @@ -652,7 +672,7 @@ Status PandasConverter::ConvertDecimals() { object = objects[i]; if (PyObject_IsInstance(object, Decimal.obj())) { std::string string; - RETURN_NOT_OK(PythonDecimalToString(object, &string)); + RETURN_NOT_OK(internal::PythonDecimalToString(object, &string)); Decimal128 value; RETURN_NOT_OK(Decimal128::FromString(string, &value)); @@ -669,7 +689,7 @@ Status PandasConverter::ConvertDecimals() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertTimes() { +Status NumPyConverter::ConvertTimes() { // Convert array of datetime.time objects to Arrow PyAcquireGIL lock; PyDateTime_IMPORT; @@ -697,7 +717,7 @@ Status PandasConverter::ConvertTimes() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertObjectStrings() { +Status NumPyConverter::ConvertObjectStrings() { PyAcquireGIL lock; // The output type at this point is inconclusive because there may be bytes @@ -729,7 +749,7 @@ Status PandasConverter::ConvertObjectStrings() { return Status::OK(); } -Status PandasConverter::ConvertObjectFloats() { +Status NumPyConverter::ConvertObjectFloats() { PyAcquireGIL lock; Ndarray1DIndexer objects(arr_); @@ -764,7 +784,7 @@ Status PandasConverter::ConvertObjectFloats() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertObjectIntegers() { +Status NumPyConverter::ConvertObjectIntegers() { PyAcquireGIL lock; Int64Builder builder(pool_); @@ -799,11 +819,11 @@ Status PandasConverter::ConvertObjectIntegers() { return PushBuilderResult(&builder); } -Status PandasConverter::ConvertObjectFixedWidthBytes( +Status NumPyConverter::ConvertObjectFixedWidthBytes( const std::shared_ptr& type) { PyAcquireGIL lock; - int32_t byte_width = static_cast(*type).byte_width(); + const int32_t byte_width = static_cast(*type).byte_width(); // The output type at this point is inconclusive because there may be bytes // and unicode mixed in the object array @@ -822,7 +842,7 @@ Status PandasConverter::ConvertObjectFixedWidthBytes( return Status::OK(); } -Status PandasConverter::ConvertBooleans() { +Status NumPyConverter::ConvertBooleans() { PyAcquireGIL lock; Ndarray1DIndexer objects(arr_); @@ -864,7 +884,7 @@ Status PandasConverter::ConvertBooleans() { return Status::OK(); } -Status PandasConverter::ConvertObjectsInfer() { +Status NumPyConverter::ConvertObjectsInfer() { Ndarray1DIndexer objects; PyAcquireGIL lock; @@ -873,8 +893,8 @@ Status PandasConverter::ConvertObjectsInfer() { OwnedRef decimal; OwnedRef Decimal; - RETURN_NOT_OK(ImportModule("decimal", &decimal)); - RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + 
RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); + RETURN_NOT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); for (int64_t i = 0; i < length_; ++i) { PyObject* obj = objects[i]; @@ -912,10 +932,10 @@ Status PandasConverter::ConvertObjectsInfer() { return Status::OK(); } -Status PandasConverter::ConvertObjectsInferAndCast() { +Status NumPyConverter::ConvertObjectsInferAndCast() { size_t position = out_arrays_.size(); RETURN_NOT_OK(ConvertObjectsInfer()); - + DCHECK_EQ(position + 1, out_arrays_.size()); std::shared_ptr arr = out_arrays_[position]; // Perform cast @@ -932,7 +952,7 @@ Status PandasConverter::ConvertObjectsInferAndCast() { return Status::OK(); } -Status PandasConverter::ConvertObjects() { +Status NumPyConverter::ConvertObjects() { // Python object arrays are annoying, since we could have one of: // // * Strings @@ -1005,8 +1025,8 @@ Status LoopPySequence(PyObject* sequence, T func) { } template -inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr& type, - ListBuilder* builder, PyObject* list) { +inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr& type, + ListBuilder* builder, PyObject* list) { typedef internal::npy_traits traits; typedef typename traits::BuilderClass BuilderT; @@ -1050,7 +1070,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr } template <> -inline Status PandasConverter::ConvertTypedLists( +inline Status NumPyConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { PyAcquireGIL lock; @@ -1091,7 +1111,7 @@ inline Status PandasConverter::ConvertTypedLists( } template <> -inline Status PandasConverter::ConvertTypedLists( +inline Status NumPyConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { PyAcquireGIL lock; // TODO: If there are bytes involed, convert to Binary representation @@ -1145,8 +1165,8 @@ inline Status PandasConverter::ConvertTypedLists( return ConvertTypedLists(type, builder, list); \ } -Status PandasConverter::ConvertLists(const std::shared_ptr& type, - ListBuilder* builder, PyObject* list) { +Status NumPyConverter::ConvertLists(const std::shared_ptr& type, + ListBuilder* builder, PyObject* list) { switch (type->id()) { LIST_CASE(NA, NPY_OBJECT, NullType) LIST_CASE(UINT8, NPY_UINT8, UInt8Type) @@ -1162,10 +1182,10 @@ Status PandasConverter::ConvertLists(const std::shared_ptr& type, LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) LIST_CASE(STRING, NPY_OBJECT, StringType) case Type::LIST: { - const ListType& list_type = static_cast(*type); + const auto& list_type = static_cast(*type); auto value_builder = static_cast(builder->value_builder()); - auto foreach_item = [&](PyObject* object) { + auto foreach_item = [this, &builder, &value_builder, &list_type](PyObject* object) { if (PandasObjectIsNull(object)) { return builder->AppendNull(); } else { @@ -1185,7 +1205,7 @@ Status PandasConverter::ConvertLists(const std::shared_ptr& type, } } -Status PandasConverter::ConvertLists(const std::shared_ptr& type) { +Status NumPyConverter::ConvertLists(const std::shared_ptr& type) { std::unique_ptr array_builder; RETURN_NOT_OK(MakeBuilder(pool_, arrow::list(type), &array_builder)); ListBuilder* list_builder = static_cast(array_builder.get()); @@ -1193,21 +1213,15 @@ Status PandasConverter::ConvertLists(const std::shared_ptr& type) { return PushBuilderResult(list_builder); } -Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out) { - 
PandasConverter converter(pool, ao, mo, type); +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + bool use_pandas_null_sentinels, + const std::shared_ptr& type, + std::shared_ptr* out) { + NumPyConverter converter(pool, ao, mo, type, use_pandas_null_sentinels); RETURN_NOT_OK(converter.Convert()); - *out = converter.result()[0]; - DCHECK(*out); - return Status::OK(); -} - -Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, - std::shared_ptr* out) { - PandasConverter converter(pool, ao, mo, type); - RETURN_NOT_OK(converter.ConvertObjects()); - *out = std::make_shared(converter.result()); + const auto& output_arrays = converter.result(); + DCHECK_GT(output_arrays.size(), 0); + *out = std::make_shared(output_arrays); return Status::OK(); } diff --git a/cpp/src/arrow/python/pandas_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h similarity index 69% rename from cpp/src/arrow/python/pandas_to_arrow.h rename to cpp/src/arrow/python/numpy_to_arrow.h index 3e655ba3fee..4a870fff9fc 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -17,8 +17,8 @@ // Converting from pandas memory representation to Arrow data structures -#ifndef ARROW_PYTHON_PANDAS_TO_ARROW_H -#define ARROW_PYTHON_PANDAS_TO_ARROW_H +#ifndef ARROW_PYTHON_NUMPY_TO_ARROW_H +#define ARROW_PYTHON_NUMPY_TO_ARROW_H #include "arrow/python/platform.h" @@ -36,24 +36,21 @@ class Status; namespace py { -ARROW_EXPORT -Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, std::shared_ptr* out); - -/// Convert dtype=object arrays. If target data type is not known, pass a type -/// with nullptr +/// Convert NumPy arrays to Arrow. If target data type is not known, pass a +/// type with nullptr /// /// \param[in] pool Memory pool for any memory allocations /// \param[in] ao an ndarray with the array data /// \param[in] mo an ndarray with a null mask (True is null), optional -/// \param[in] type +/// \param[in] type a specific type to cast to, may be null /// \param[out] out a ChunkedArray, to accommodate chunked output ARROW_EXPORT -Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, - const std::shared_ptr& type, - std::shared_ptr* out); +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, + bool use_pandas_null_sentinels, + const std::shared_ptr& type, + std::shared_ptr* out); } // namespace py } // namespace arrow -#endif // ARROW_PYTHON_PANDAS_TO_ARROW_H +#endif // ARROW_PYTHON_NUMPY_TO_ARROW_H diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index e1796c097d6..86391a18598 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -39,10 +39,10 @@ TEST(DecimalTest, TestPythonDecimalToString) { OwnedRef decimal; OwnedRef Decimal; - ASSERT_OK(ImportModule("decimal", &decimal)); + ASSERT_OK(internal::ImportModule("decimal", &decimal)); ASSERT_NE(decimal.obj(), nullptr); - ASSERT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + ASSERT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); ASSERT_NE(Decimal.obj(), nullptr); std::string decimal_string("-39402950693754869342983"); @@ -61,7 +61,7 @@ TEST(DecimalTest, TestPythonDecimalToString) { ASSERT_NE(python_object, nullptr); std::string string_result; - ASSERT_OK(PythonDecimalToString(python_object, &string_result)); + ASSERT_OK(internal::PythonDecimalToString(python_object, &string_result)); } 
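As a reading aid for the NdarrayToArrow entry point introduced above: a minimal caller sketch, not part of the patch. The template arguments (stripped by the formatting above), the arrow::py namespace, and the use of arrow::default_memory_pool() are inferred from the surrounding signatures; error handling is elided.

#include "arrow/python/numpy_to_arrow.h"

// Hypothetical helper: convert one 1-D ndarray (plus an optional boolean
// null mask) into an arrow::ChunkedArray, letting the converter infer the
// Arrow type from the NumPy dtype.
arrow::Status ConvertNdarray(PyObject* ndarray, PyObject* mask,
                             std::shared_ptr<arrow::ChunkedArray>* out) {
  return arrow::py::NdarrayToArrow(arrow::default_memory_pool(), ndarray, mask,
                                   /*use_pandas_null_sentinels=*/true,
                                   /*type=*/nullptr, out);
}

Passing a null type requests inference (dtype=object arrays are routed through ConvertObjects(), per NumPyConverter::Convert() above), while a non-null type casts to that type; the result is a ChunkedArray because object conversion may produce more than one output array.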
TEST(PandasConversionTest, TestObjectBlockWriteFails) { diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index c57091fc069..9ba78213b2c 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -297,12 +297,11 @@ class DictBuilder { /// Construct an Arrow StructArray representing the dictionary. /// Contains a field "keys" for the keys and "vals" for the values. - - /// \param list_data + /// \param val_list_data /// List containing the data from nested lists in the value /// list of the dictionary /// - /// \param dict_data + /// \param val_dict_data /// List containing the data from nested dictionaries in the /// value list of the dictionary Status Finish(const Array* key_tuple_data, const Array* key_dict_data, @@ -543,7 +542,11 @@ Status SerializeSequences(PyObject* context, std::vector sequences, ScopedRef iterator(PyObject_GetIter(sequence)); RETURN_IF_PYERROR(); ScopedRef item; - while (item.reset(PyIter_Next(iterator.get())), item.get()) { + while (true) { + item.reset(PyIter_Next(iterator.get())); + if (!item.get()) { + break; + } RETURN_NOT_OK(Append(context, item.get(), &builder, &sublists, &subtuples, &subdicts, &subsets, tensors_out)); } diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index de751510151..4ebef720f7a 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -70,11 +70,14 @@ static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit, } static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { - struct tm date = {0}; + struct tm date; + memset(&date, 0, sizeof(struct tm)); date.tm_year = PyDateTime_GET_YEAR(pydate) - 1900; date.tm_mon = PyDateTime_GET_MONTH(pydate) - 1; date.tm_mday = PyDateTime_GET_DAY(pydate); - struct tm epoch = {0}; + struct tm epoch; + memset(&epoch, 0, sizeof(struct tm)); + epoch.tm_year = 70; epoch.tm_mday = 1; #ifdef _MSC_VER @@ -88,7 +91,8 @@ static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { } static inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) { - struct tm datetime = {0}; + struct tm datetime; + memset(&datetime, 0, sizeof(struct tm)); datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900; datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1; datetime.tm_mday = PyDateTime_GET_DAY(pydatetime); @@ -96,7 +100,8 @@ static inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) { datetime.tm_min = PyDateTime_DATE_GET_MINUTE(pydatetime); datetime.tm_sec = PyDateTime_DATE_GET_SECOND(pydatetime); int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); - struct tm epoch = {0}; + struct tm epoch; + memset(&epoch, 0, sizeof(struct tm)); epoch.tm_year = 70; epoch.tm_mday = 1; #ifdef _MSC_VER diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index b0aeed1925e..b490310c26a 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -140,10 +140,6 @@ TEST_F(TestColumn, BasicAPI) { ASSERT_EQ(300, column_->length()); ASSERT_EQ(30, column_->null_count()); ASSERT_EQ(3, column_->data()->num_chunks()); - - // nullptr array should not break - column_.reset(new Column(f0, std::shared_ptr(nullptr))); - ASSERT_NE(column_.get(), nullptr); } TEST_F(TestColumn, ChunksInhomogeneous) { diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 3d3ecd2734e..009b5cf6373 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -42,6 +42,8 @@ ChunkedArray::ChunkedArray(const 
ArrayVector& chunks) : chunks_(chunks) { } } +std::shared_ptr ChunkedArray::type() const { return chunks_[0]->type(); } + bool ChunkedArray::Equals(const ChunkedArray& other) const { if (length_ != other.length()) { return false; @@ -105,10 +107,10 @@ Column::Column(const std::shared_ptr& field, const ArrayVector& chunks) Column::Column(const std::shared_ptr& field, const std::shared_ptr& data) : field_(field) { - if (data) { - data_ = std::make_shared(ArrayVector({data})); - } else { + if (!data) { data_ = std::make_shared(ArrayVector({})); + } else { + data_ = std::make_shared(ArrayVector({data})); } } @@ -153,13 +155,6 @@ Status Column::ValidateData() { // ---------------------------------------------------------------------- // RecordBatch methods -void AssertBatchValid(const RecordBatch& batch) { - Status s = batch.Validate(); - if (!s.ok()) { - DCHECK(false) << s.ToString(); - } -} - RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows) : schema_(schema), num_rows_(num_rows) { boxed_columns_.resize(schema->num_fields()); @@ -199,6 +194,7 @@ std::shared_ptr RecordBatch::column(int i) const { if (!boxed_columns_[i]) { DCHECK(MakeArray(columns_[i], &boxed_columns_[i]).ok()); } + DCHECK(boxed_columns_[i]); return boxed_columns_[i]; } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index a66772e5a71..324112bfb3e 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -53,7 +53,7 @@ class ARROW_EXPORT ChunkedArray { const ArrayVector& chunks() const { return chunks_; } - std::shared_ptr type() const { return chunks_[0]->type(); } + std::shared_ptr type() const; bool Equals(const ChunkedArray& other) const; bool Equals(const std::shared_ptr& other) const; @@ -113,7 +113,7 @@ class ARROW_EXPORT Column { /// sequence of fields, each a contiguous Arrow array class ARROW_EXPORT RecordBatch { public: - /// \param[in] schema + /// \param[in] schema The record batch schema /// \param[in] num_rows length of fields in the record batch. Each array /// should have the same length as num_rows /// \param[in] columns the record batch fields as vector of arrays @@ -209,16 +209,16 @@ class ARROW_EXPORT Table { public: /// \brief Construct Table from schema and columns /// If columns is zero-length, the table's number of rows is zero - /// \param schema - /// \param columns - /// \param number of rows in table, -1 (default) to infer from columns + /// \param schema The table schema (column types) + /// \param columns The table's columns + /// \param num_rows number of rows in table, -1 (default) to infer from columns Table(const std::shared_ptr& schema, const std::vector>& columns, int64_t num_rows = -1); /// \brief Construct Table from schema and arrays - /// \param schema - /// \param arrays - /// \param number of rows in table, -1 (default) to infer from columns + /// \param schema The table schema (column types) + /// \param arrays The table's columns as arrays + /// \param num_rows number of rows in table, -1 (default) to infer from columns Table(const std::shared_ptr& schema, const std::vector>& arrays, int64_t num_rows = -1); @@ -231,7 +231,7 @@ class ARROW_EXPORT Table { /// \return the table's schema std::shared_ptr schema() const { return schema_; } - /// \param[i] i column index, does not boundscheck + /// \param[in] i column index, does not boundscheck /// \return the i-th column std::shared_ptr column(int i) const { return columns_[i]; } @@ -283,7 +283,7 @@ class ARROW_EXPORT RecordBatchReader { /// Read the next record batch in the stream. 
Return nullptr for batch when /// reaching end of stream /// - /// \param(out) batch the next loaded batch, nullptr at end of stream + /// \param[out] batch the next loaded batch, nullptr at end of stream /// \return Status virtual Status ReadNext(std::shared_ptr* batch) = 0; diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 710344ab0d2..3242fadd50d 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -68,8 +68,7 @@ TEST(TestField, TestAddMetadata) { new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); auto f0 = field("f0", int32()); auto f1 = field("f0", int32(), true, metadata); - std::shared_ptr f2; - ASSERT_OK(f0->AddMetadata(metadata, &f2)); + std::shared_ptr f2 = f0->AddMetadata(metadata); ASSERT_FALSE(f2->Equals(*f0)); ASSERT_TRUE(f2->Equals(*f1)); @@ -184,8 +183,7 @@ TEST_F(TestSchema, TestAddMetadata) { auto metadata = std::shared_ptr( new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); auto schema = std::make_shared(fields); - std::shared_ptr new_schema; - ASSERT_OK(schema->AddMetadata(metadata, &new_schema)); + std::shared_ptr new_schema = schema->AddMetadata(metadata); ASSERT_TRUE(metadata->Equals(*new_schema->metadata())); // Not copied diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 87bb73b1e7c..b9e314440f5 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -36,12 +36,16 @@ std::shared_ptr Field::AddMetadata( return std::make_shared(name_, type_, nullable_, metadata); } +#ifndef ARROW_NO_DEPRECATED_API + Status Field::AddMetadata(const std::shared_ptr& metadata, std::shared_ptr* out) const { *out = AddMetadata(metadata); return Status::OK(); } +#endif + std::shared_ptr Field::RemoveMetadata() const { return std::make_shared(name_, type_, nullable_); } @@ -307,12 +311,16 @@ std::shared_ptr Schema::AddMetadata( return std::make_shared(fields_, metadata); } +#ifndef ARROW_NO_DEPRECATED_API + Status Schema::AddMetadata(const std::shared_ptr& metadata, std::shared_ptr* out) const { *out = AddMetadata(metadata); return Status::OK(); } +#endif + std::shared_ptr Schema::metadata() const { return metadata_; } std::shared_ptr Schema::RemoveMetadata() const { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 7630f48151b..c1da6e27043 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -227,9 +227,11 @@ class ARROW_EXPORT Field { std::shared_ptr metadata() const { return metadata_; } - /// \deprecated +#ifndef ARROW_NO_DEPRECATED_API + /// \note Deprecated since 0.8.0 Status AddMetadata(const std::shared_ptr& metadata, std::shared_ptr* out) const; +#endif std::shared_ptr AddMetadata( const std::shared_ptr& metadata) const; @@ -749,9 +751,11 @@ class ARROW_EXPORT Schema { std::shared_ptr* out) const; Status RemoveField(int i, std::shared_ptr* out) const; - /// \deprecated +#ifndef ARROW_NO_DEPRECATED_API + /// \note Deprecated since 0.8.0 Status AddMetadata(const std::shared_ptr& metadata, std::shared_ptr* out) const; +#endif /// \brief Replace key-value metadata with new metadata /// diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 15bf3595a96..e0116cc567b 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -31,6 +31,7 @@ #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" namespace arrow { @@ -48,9 +49,9 @@ Status BitUtil::BytesToBits(const std::vector& bytes, MemoryPool* pool, std::shared_ptr buffer; RETURN_NOT_OK(AllocateBuffer(pool, 
bit_length, &buffer)); - - memset(buffer->mutable_data(), 0, static_cast<size_t>(bit_length)); - FillBitsFromBytes(bytes, buffer->mutable_data()); + uint8_t* out_buf = buffer->mutable_data(); + memset(out_buf, 0, static_cast<size_t>(bit_length)); + FillBitsFromBytes(bytes, out_buf); *out = buffer; return Status::OK(); diff --git a/cpp/src/arrow/util/compression_brotli.cc b/cpp/src/arrow/util/compression_brotli.cc index 1aaec11d9ca..196c19a660b 100644 --- a/cpp/src/arrow/util/compression_brotli.cc +++ b/cpp/src/arrow/util/compression_brotli.cc @@ -25,6 +25,7 @@ #include #include "arrow/status.h" +#include "arrow/util/macros.h" namespace arrow { @@ -41,7 +42,8 @@ Status BrotliCodec::Decompress(int64_t input_len, const uint8_t* input, return Status::OK(); } -int64_t BrotliCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { +int64_t BrotliCodec::MaxCompressedLen(int64_t input_len, + const uint8_t* ARROW_ARG_UNUSED(input)) { return BrotliEncoderMaxCompressedSize(input_len); } diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index cda40ad8c3a..001edeb01b6 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -22,6 +22,7 @@ #include #include "arrow/status.h" +#include "arrow/util/macros.h" namespace arrow { @@ -39,7 +40,8 @@ Status Lz4Codec::Decompress(int64_t input_len, const uint8_t* input, int64_t out return Status::OK(); } -int64_t Lz4Codec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { +int64_t Lz4Codec::MaxCompressedLen(int64_t input_len, + const uint8_t* ARROW_ARG_UNUSED(input)) { return LZ4_compressBound(static_cast<int>(input_len)); } diff --git a/cpp/src/arrow/util/compression_snappy.cc b/cpp/src/arrow/util/compression_snappy.cc index e284bd4358d..2edaef7cf4b 100644 --- a/cpp/src/arrow/util/compression_snappy.cc +++ b/cpp/src/arrow/util/compression_snappy.cc @@ -33,7 +33,8 @@ namespace arrow { // Snappy implementation Status SnappyCodec::Decompress(int64_t input_len, const uint8_t* input, - int64_t output_len, uint8_t* output_buffer) { + int64_t ARROW_ARG_UNUSED(output_len), + uint8_t* output_buffer) { if (!snappy::RawUncompress(reinterpret_cast<const char*>(input), static_cast<size_t>(input_len), reinterpret_cast<char*>(output_buffer))) { @@ -42,13 +43,14 @@ Status SnappyCodec::Decompress(int64_t input_len, const uint8_t* input, return Status::OK(); } -int64_t SnappyCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { +int64_t SnappyCodec::MaxCompressedLen(int64_t input_len, + const uint8_t* ARROW_ARG_UNUSED(input)) { return snappy::MaxCompressedLength(input_len); } Status SnappyCodec::Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, - int64_t* output_length) { + int64_t ARROW_ARG_UNUSED(output_buffer_len), + uint8_t* output_buffer, int64_t* output_length) { size_t output_len; snappy::RawCompress(reinterpret_cast<const char*>(input), static_cast<size_t>(input_len), diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 0656fd64b5c..3a520240ec6 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -165,7 +165,7 @@ class GZipCodec::GZipCodecImpl { return Status::OK(); } - int64_t MaxCompressedLen(int64_t input_length, const uint8_t* input) { + int64_t MaxCompressedLen(int64_t input_length, const uint8_t* ARROW_ARG_UNUSED(input)) { // Must be in compression mode if (!compressor_initialized_) { Status s = InitCompressor(); diff --git a/cpp/src/arrow/util/compression_zstd.cc 
b/cpp/src/arrow/util/compression_zstd.cc index d19ac43449b..20306f48efb 100644 --- a/cpp/src/arrow/util/compression_zstd.cc +++ b/cpp/src/arrow/util/compression_zstd.cc @@ -23,6 +23,7 @@ #include #include "arrow/status.h" +#include "arrow/util/macros.h" using std::size_t; @@ -42,7 +43,8 @@ Status ZSTDCodec::Decompress(int64_t input_len, const uint8_t* input, int64_t ou return Status::OK(); } -int64_t ZSTDCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { +int64_t ZSTDCodec::MaxCompressedLen(int64_t input_len, + const uint8_t* ARROW_ARG_UNUSED(input)) { return ZSTD_compressBound(input_len); } diff --git a/cpp/src/arrow/util/cpu-info.cc b/cpp/src/arrow/util/cpu-info.cc index 639f02e482b..822fcaea74e 100644 --- a/cpp/src/arrow/util/cpu-info.cc +++ b/cpp/src/arrow/util/cpu-info.cc @@ -76,6 +76,8 @@ static struct { }; static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); +namespace { + // Helper function to parse for hardware flags. // values contains a list of space-seperated flags. check to see if the flags we // care about are present. @@ -90,6 +92,8 @@ int64_t ParseCPUFlags(const string& values) { return flags; } +} // namespace + #ifdef _WIN32 bool RetrieveCacheSize(int64_t* cache_sizes) { if (!cache_sizes) { diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index a0dea09216a..58496a874f1 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -75,8 +75,8 @@ class ARROW_EXPORT Decimal128 { /// -21 / 5 -> -4, -1 /// 21 / -5 -> -4, 1 /// -21 / -5 -> 4, -1 - /// @param right the number to divide by - /// @param remainder the remainder after the division + /// \param divisor the number to divide by + /// \param remainder the remainder after the division Status Divide(const Decimal128& divisor, Decimal128* result, Decimal128* remainder) const; diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h index d5fb212f33a..3bba07bb20f 100644 --- a/cpp/src/arrow/util/hash-util.h +++ b/cpp/src/arrow/util/hash-util.h @@ -199,7 +199,7 @@ class HashUtil { static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) { // IMPALA-2270: this function should never be used for zero-byte inputs. DCHECK_GT(bytes, 0); - uint64_t hash_u64 = hash | ((uint64_t)hash << 32); + uint64_t hash_u64 = hash | (static_cast(hash) << 32); hash_u64 = FnvHash64(data, bytes, hash_u64); return static_cast((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF)); } diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index cf74ddf37b4..4f379537c48 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -89,12 +89,14 @@ int64_t KeyValueMetadata::size() const { std::string KeyValueMetadata::key(int64_t i) const { DCHECK_GE(i, 0); - return keys_[static_cast(i)]; + DCHECK_LT(static_cast(i), keys_.size()); + return keys_[i]; } std::string KeyValueMetadata::value(int64_t i) const { DCHECK_GE(i, 0); - return values_[static_cast(i)]; + DCHECK_LT(static_cast(i), values_.size()); + return values_[i]; } std::shared_ptr KeyValueMetadata::Copy() const { diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index b7e2ceea9f3..39815f30315 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -45,7 +45,7 @@ namespace arrow { #define ARROW_CHECK(condition) \ (condition) ? 
0 \ : ::arrow::internal::FatalLog(ARROW_FATAL) \ - << __FILE__ << __LINE__ << " Check failed: " #condition " " + << __FILE__ << ":" << __LINE__ << " Check failed: " #condition " " #ifdef NDEBUG #define ARROW_DFATAL ARROW_WARNING @@ -90,7 +90,7 @@ namespace internal { class NullLog { public: template - NullLog& operator<<(const T& t) { + NullLog& operator<<(const T& ARROW_ARG_UNUSED(t)) { return *this; } }; diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 2f1db0924a1..a5f6e57079a 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -26,7 +26,7 @@ #endif #define ARROW_UNUSED(x) (void)x - +#define ARROW_ARG_UNUSED(x) // // GCC can be told that a certain branch is not likely to be taken (for // instance, a CHECK failure), and use that information in static analysis. diff --git a/cpp/src/arrow/util/sse-util.h b/cpp/src/arrow/util/sse-util.h index a0ec8a2e939..32ac43f919b 100644 --- a/cpp/src/arrow/util/sse-util.h +++ b/cpp/src/arrow/util/sse-util.h @@ -176,27 +176,27 @@ static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) return 0; } -static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { +static inline uint32_t SSE4_crc32_u8(uint32_t, uint8_t) { DCHECK(false) << "CPU doesn't support SSE 4.2"; return 0; } -static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { +static inline uint32_t SSE4_crc32_u16(uint32_t, uint16_t) { DCHECK(false) << "CPU doesn't support SSE 4.2"; return 0; } -static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { +static inline uint32_t SSE4_crc32_u32(uint32_t, uint32_t) { DCHECK(false) << "CPU doesn't support SSE 4.2"; return 0; } -static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { +static inline uint32_t SSE4_crc32_u64(uint32_t, uint64_t) { DCHECK(false) << "CPU doesn't support SSE 4.2"; return 0; } -static inline int64_t POPCNT_popcnt_u64(uint64_t a) { +static inline int64_t POPCNT_popcnt_u64(uint64_t) { DCHECK(false) << "CPU doesn't support SSE 4.2"; return 0; } @@ -205,27 +205,27 @@ static inline int64_t POPCNT_popcnt_u64(uint64_t a) { #else -static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { +static inline uint32_t SSE4_crc32_u8(uint32_t, uint8_t) { DCHECK(false) << "SSE support is not enabled"; return 0; } -static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { +static inline uint32_t SSE4_crc32_u16(uint32_t, uint16_t) { DCHECK(false) << "SSE support is not enabled"; return 0; } -static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { +static inline uint32_t SSE4_crc32_u32(uint32_t, uint32_t) { DCHECK(false) << "SSE support is not enabled"; return 0; } -static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { +static inline uint32_t SSE4_crc32_u64(uint32_t, uint64_t) { DCHECK(false) << "SSE support is not enabled"; return 0; } -static inline int64_t POPCNT_popcnt_u64(uint64_t a) { +static inline int64_t POPCNT_popcnt_u64(uint64_t) { DCHECK(false) << "SSE support is not enabled"; return 0; } diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index 53a0fee0aae..a7b01b0f631 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -56,10 +56,7 @@ ARRAY_VISITOR_DEFAULT(ListArray); ARRAY_VISITOR_DEFAULT(StructArray); ARRAY_VISITOR_DEFAULT(UnionArray); ARRAY_VISITOR_DEFAULT(DictionaryArray); - -Status ArrayVisitor::Visit(const DecimalArray& array) { - return Status::NotImplemented("decimal"); -} +ARRAY_VISITOR_DEFAULT(DecimalArray); #undef ARRAY_VISITOR_DEFAULT diff --git 
a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 7e91202623e..c9339545fca 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -94,7 +94,14 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") set_property(SOURCE malloc.cc APPEND_STRING PROPERTY COMPILE_FLAGS - " -Wno-parentheses-equality -Wno-shorten-64-to-32") + " -Wno-parentheses-equality \ +-Wno-shorten-64-to-32 \ +-Wno-unused-macros ") + + set_property(SOURCE thirdparty/xxhash.cc + APPEND_STRING + PROPERTY COMPILE_FLAGS + "-Wno-unused-macros") endif() if ("${COMPILER_FAMILY}" STREQUAL "gcc") @@ -111,6 +118,7 @@ target_link_libraries(plasma_store plasma_static) install(FILES common.h common_generated.h + compat.h client.h events.h plasma.h diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 5e28d4f2af7..e57a2a6f300 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -286,7 +286,6 @@ Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, /// calls will not do anything. The client will only send a message to the store /// releasing the object when the client is truly done with the object. /// -/// @param conn The plasma connection. /// @param object_id The object ID to attempt to release. Status PlasmaClient::PerformRelease(const ObjectID& object_id) { // Decrement the count of the number of instances of this object that are @@ -357,6 +356,7 @@ Status PlasmaClient::Contains(const ObjectID& object_id, bool* has_object) { std::vector buffer; RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType_PlasmaContainsReply, &buffer)); ObjectID object_id2; + DCHECK_GT(buffer.size(), 0); RETURN_NOT_OK( ReadContainsReply(buffer.data(), buffer.size(), &object_id2, has_object)); } @@ -401,7 +401,8 @@ static inline bool compute_object_hash_parallel(XXH64_state_t* hash_state, } } - XXH64_update(hash_state, (unsigned char*)threadhash, sizeof(threadhash)); + XXH64_update(hash_state, reinterpret_cast(threadhash), + sizeof(threadhash)); return true; } @@ -409,12 +410,14 @@ static uint64_t compute_object_hash(const ObjectBuffer& obj_buffer) { XXH64_state_t hash_state; XXH64_reset(&hash_state, XXH64_DEFAULT_SEED); if (obj_buffer.data_size >= kBytesInMB) { - compute_object_hash_parallel(&hash_state, (unsigned char*)obj_buffer.data, + compute_object_hash_parallel(&hash_state, + reinterpret_cast(obj_buffer.data), obj_buffer.data_size); } else { - XXH64_update(&hash_state, (unsigned char*)obj_buffer.data, obj_buffer.data_size); + XXH64_update(&hash_state, reinterpret_cast(obj_buffer.data), + obj_buffer.data_size); } - XXH64_update(&hash_state, (unsigned char*)obj_buffer.metadata, + XXH64_update(&hash_state, reinterpret_cast(obj_buffer.metadata), obj_buffer.metadata_size); return XXH64_digest(&hash_state); } @@ -548,8 +551,6 @@ Status PlasmaClient::Disconnect() { return Status::OK(); } -#define h_addr h_addr_list[0] - Status PlasmaClient::Transfer(const char* address, int port, const ObjectID& object_id) { return SendDataRequest(manager_conn_, object_id, address, port); } diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 50ec55f5ec8..145942441c9 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -82,15 +82,15 @@ class ARROW_EXPORT PlasmaClient { /// Connect to the local plasma store and plasma manager. Return /// the resulting connection. /// - /// @param store_socket_name The name of the UNIX domain socket to use to + /// \param store_socket_name The name of the UNIX domain socket to use to /// connect to the Plasma store. 
- /// @param manager_socket_name The name of the UNIX domain socket to use to + /// \param manager_socket_name The name of the UNIX domain socket to use to /// connect to the local Plasma manager. If this is "", then this /// function will not connect to a manager. - /// @param release_delay Number of released objects that are kept around + /// \param release_delay Number of released objects that are kept around /// and not evicted to avoid too many munmaps. - /// @param num_retries number of attempts to connect to IPC socket, default 50 - /// @return The return status. + /// \param num_retries number of attempts to connect to IPC socket, default 50 + /// \return The return status. Status Connect(const std::string& store_socket_name, const std::string& manager_socket_name, int release_delay, int num_retries = -1); @@ -98,17 +98,17 @@ class ARROW_EXPORT PlasmaClient { /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. /// - /// @param object_id The ID to use for the newly created object. - /// @param data_size The size in bytes of the space to be allocated for this + /// \param object_id The ID to use for the newly created object. + /// \param data_size The size in bytes of the space to be allocated for this /// object's /// data (this does not include space used for metadata). - /// @param metadata The object's metadata. If there is no metadata, this + /// \param metadata The object's metadata. If there is no metadata, this /// pointer /// should be NULL. - /// @param metadata_size The size in bytes of the metadata. If there is no + /// \param metadata_size The size in bytes of the metadata. If there is no /// metadata, this should be 0. - /// @param data The address of the newly created object will be written here. - /// @return The return status. + /// \param data The address of the newly created object will be written here. + /// \return The return status. Status Create(const ObjectID& object_id, int64_t data_size, uint8_t* metadata, int64_t metadata_size, uint8_t** data); @@ -119,14 +119,14 @@ class ARROW_EXPORT PlasmaClient { /// but /// the caller should not release objects that were not retrieved. /// - /// @param object_ids The IDs of the objects to get. - /// @param num_object_ids The number of object IDs to get. - /// @param timeout_ms The amount of time in milliseconds to wait before this + /// \param object_ids The IDs of the objects to get. + /// \param num_objects The number of object IDs to get. + /// \param timeout_ms The amount of time in milliseconds to wait before this /// request times out. If this value is -1, then no timeout is set. - /// @param object_buffers An array where the results will be stored. If the + /// \param object_buffers An array where the results will be stored. If the /// data /// size field is -1, then the object was not retrieved. - /// @return The return status. + /// \return The return status. Status Get(const ObjectID* object_ids, int64_t num_objects, int64_t timeout_ms, ObjectBuffer* object_buffers); @@ -136,8 +136,8 @@ class ARROW_EXPORT PlasmaClient { /// the address returned by Get is no longer valid. This should be called /// once for each call to Get (with the same object ID). /// - /// @param object_id The ID of the object that is no longer needed. - /// @return The return status. + /// \param object_id The ID of the object that is no longer needed. + /// \return The return status. 
Status Release(const ObjectID& object_id); /// Check if the object store contains a particular object and the object has @@ -146,18 +146,18 @@ class ARROW_EXPORT PlasmaClient { /// @todo: We may want to indicate if the object has been created but not /// sealed. /// - /// @param object_id The ID of the object whose presence we are checking. - /// @param has_object The function will write true at this address if + /// \param object_id The ID of the object whose presence we are checking. + /// \param has_object The function will write true at this address if /// the object is present and false if it is not present. - /// @return The return status. + /// \return The return status. Status Contains(const ObjectID& object_id, bool* has_object); /// Seal an object in the object store. The object will be immutable after /// this /// call. /// - /// @param object_id The ID of the object to seal. - /// @return The return status. + /// \param object_id The ID of the object to seal. + /// \return The return status. Status Seal(const ObjectID& object_id); /// Delete an object from the object store. This currently assumes that the @@ -166,52 +166,51 @@ class ARROW_EXPORT PlasmaClient { /// @todo We may want to allow the deletion of objects that are not present or /// haven't been sealed. /// - /// @param object_id The ID of the object to delete. - /// @return The return status. + /// \param object_id The ID of the object to delete. + /// \return The return status. Status Delete(const ObjectID& object_id); /// Delete objects until we have freed up num_bytes bytes or there are no more /// released objects that can be deleted. /// - /// @param num_bytes The number of bytes to try to free up. - /// @param num_bytes_evicted Out parameter for total number of bytes of space + /// \param num_bytes The number of bytes to try to free up. + /// \param num_bytes_evicted Out parameter for total number of bytes of space /// retrieved. - /// @return The return status. + /// \return The return status. Status Evict(int64_t num_bytes, int64_t& num_bytes_evicted); /// Compute the hash of an object in the object store. /// - /// @param conn The object containing the connection state. - /// @param object_id The ID of the object we want to hash. - /// @param digest A pointer at which to return the hash digest of the object. + /// \param object_id The ID of the object we want to hash. + /// \param digest A pointer at which to return the hash digest of the object. /// The pointer must have at least kDigestSize bytes allocated. - /// @return The return status. + /// \return The return status. Status Hash(const ObjectID& object_id, uint8_t* digest); /// Subscribe to notifications when objects are sealed in the object store. /// Whenever an object is sealed, a message will be written to the client /// socket that is returned by this method. /// - /// @param fd Out parameter for the file descriptor the client should use to + /// \param fd Out parameter for the file descriptor the client should use to /// read notifications /// from the object store about sealed objects. - /// @return The return status. + /// \return The return status. Status Subscribe(int* fd); /// Receive next object notification for this client if Subscribe has been called. /// - /// @param fd The file descriptor we are reading the notification from. - /// @param object_id Out parameter, the object_id of the object that was sealed. - /// @param data_size Out parameter, the data size of the object that was sealed. 
- /// @param metadata_size Out parameter, the metadata size of the object that was sealed. - /// @return The return status. + /// \param fd The file descriptor we are reading the notification from. + /// \param object_id Out parameter, the object_id of the object that was sealed. + /// \param data_size Out parameter, the data size of the object that was sealed. + /// \param metadata_size Out parameter, the metadata size of the object that was sealed. + /// \return The return status. Status GetNotification(int fd, ObjectID* object_id, int64_t* data_size, int64_t* metadata_size); /// Disconnect from the local plasma instance, including the local store and /// manager. /// - /// @return The return status. + /// \return The return status. Status Disconnect(); /// Attempt to initiate the transfer of some objects from remote Plasma @@ -236,17 +235,17 @@ class ARROW_EXPORT PlasmaClient { /// This method is idempotent in the sense that it is ok to call it multiple /// times. /// - /// @param num_object_ids The number of object IDs fetch is being called on. - /// @param object_ids The IDs of the objects that fetch is being called on. - /// @return The return status. + /// \param num_object_ids The number of object IDs fetch is being called on. + /// \param object_ids The IDs of the objects that fetch is being called on. + /// \return The return status. Status Fetch(int num_object_ids, const ObjectID* object_ids); /// Wait for (1) a specified number of objects to be available (sealed) in the /// local Plasma Store or in a remote Plasma Store, or (2) for a timeout to /// expire. This is a blocking call. /// - /// @param num_object_requests Size of the object_requests array. - /// @param object_requests Object event array. Each element contains a request + /// \param num_object_requests Size of the object_requests array. + /// \param object_requests Object event array. Each element contains a request /// for a particular object_id. The type of request is specified in the /// "type" field. /// - A PLASMA_QUERY_LOCAL request is satisfied when object_id becomes @@ -260,36 +259,34 @@ class ARROW_EXPORT PlasmaClient { /// available either at the local Plasma Store or on a remote Plasma /// Store. In this case, the functions sets the "status" field to /// ObjectStatus_Local or ObjectStatus_Remote. - /// @param num_ready_objects The number of requests in object_requests array + /// \param num_ready_objects The number of requests in object_requests array /// that /// must be satisfied before the function returns, unless it timeouts. /// The num_ready_objects should be no larger than num_object_requests. - /// @param timeout_ms Timeout value in milliseconds. If this timeout expires + /// \param timeout_ms Timeout value in milliseconds. If this timeout expires /// before min_num_ready_objects of requests are satisfied, the /// function /// returns. - /// @param num_objects_ready Out parameter for number of satisfied requests in + /// \param num_objects_ready Out parameter for number of satisfied requests in /// the object_requests list. If the returned number is less than /// min_num_ready_objects this means that timeout expired. - /// @return The return status. + /// \return The return status. Status Wait(int64_t num_object_requests, ObjectRequest* object_requests, int num_ready_objects, int64_t timeout_ms, int* num_objects_ready); /// Transfer local object to a different plasma manager. /// - /// @param conn The object containing the connection state. 
- /// @param addr IP address of the plasma manager we are transfering to. - /// @param port Port of the plasma manager we are transfering to. - /// @object_id ObjectID of the object we are transfering. - /// @return The return status. + /// \param addr IP address of the plasma manager we are transferring to. + /// \param port Port of the plasma manager we are transferring to. + /// \param object_id ObjectID of the object we are transferring. + /// \return The return status. Status Transfer(const char* addr, int port, const ObjectID& object_id); /// Return the status of a given object. This method may query the object /// table. /// - /// @param conn The object containing the connection state. - /// @param object_id The ID of the object whose status we query. - /// @param object_status Out parameter for object status. Can take the + /// \param object_id The ID of the object whose status we query. + /// \param object_status Out parameter for object status. Can take the /// following values. /// - PLASMA_CLIENT_LOCAL, if object is stored in the local Plasma /// Store. @@ -300,13 +297,12 @@ class ARROW_EXPORT PlasmaClient { /// Plasma Store. /// - PLASMA_CLIENT_DOES_NOT_EXIST, if the object doesn’t exist in the /// system. - /// @return The return status. + /// \return The return status. Status Info(const ObjectID& object_id, int* object_status); /// Get the file descriptor for the socket connection to the plasma manager. /// - /// @param conn The plasma connection. - /// @return The file descriptor for the manager connection. If there is no + /// \return The file descriptor for the manager connection. If there is no /// connection to the manager, this is -1. int get_manager_fd(); diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 66d5f3069d0..cc67ffe4686 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -26,6 +26,8 @@ #define __STDC_FORMAT_MACROS #endif +#include "plasma/compat.h" + #include "arrow/status.h" #include "arrow/util/logging.h" diff --git a/cpp/src/plasma/compat.h b/cpp/src/plasma/compat.h new file mode 100644 index 00000000000..ce751da1d71 --- /dev/null +++ b/cpp/src/plasma/compat.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PLASMA_COMPAT_H +#define PLASMA_COMPAT_H + +// Workaround for multithreading on XCode 9, see +// https://issues.apache.org/jira/browse/ARROW-1622 and +// https://github.com/tensorflow/tensorflow/issues/13220#issuecomment-331579775 +// This should be a short-term fix until the problem is fixed upstream. 
+#ifdef __APPLE__ +#ifndef _MACH_PORT_T +#define _MACH_PORT_T +#include /* __darwin_mach_port_t */ +typedef __darwin_mach_port_t mach_port_t; +#include +mach_port_t pthread_mach_thread_np(pthread_t); +#endif /* _MACH_PORT_T */ +#endif /* __APPLE__ */ + +#endif // PLASMA_COMPAT_H diff --git a/cpp/src/plasma/events.h b/cpp/src/plasma/events.h index 42914848645..419c2ebfba7 100644 --- a/cpp/src/plasma/events.h +++ b/cpp/src/plasma/events.h @@ -66,7 +66,6 @@ class EventLoop { /// Remove a file event handler from the event loop. /// /// @param fd The file descriptor of the event handler. - /// @return Void. void RemoveFileEvent(int fd); /// Register a handler that will be called after a time slice of @@ -84,8 +83,6 @@ class EventLoop { int RemoveTimer(int64_t timer_id); /// \brief Run the event loop. - /// - /// @return Void. void Start(); /// \brief Stop the event loop diff --git a/cpp/src/plasma/eviction_policy.h b/cpp/src/plasma/eviction_policy.h index dd1c873466e..de33dabcbaf 100644 --- a/cpp/src/plasma/eviction_policy.h +++ b/cpp/src/plasma/eviction_policy.h @@ -70,7 +70,6 @@ class EvictionPolicy { /// cache. /// /// @param object_id The object ID of the object that was created. - /// @return Void. void object_created(const ObjectID& object_id); /// This method will be called when the Plasma store needs more space, perhaps @@ -94,7 +93,6 @@ class EvictionPolicy { /// @param object_id The ID of the object that is now being used. /// @param objects_to_evict The object IDs that were chosen for eviction will /// be stored into this vector. - /// @return Void. void begin_object_access(const ObjectID& object_id, std::vector* objects_to_evict); @@ -106,7 +104,6 @@ class EvictionPolicy { /// @param object_id The ID of the object that is no longer being used. /// @param objects_to_evict The object IDs that were chosen for eviction will /// be stored into this vector. - /// @return Void. void end_object_access(const ObjectID& object_id, std::vector* objects_to_evict); diff --git a/cpp/src/plasma/fling.cc b/cpp/src/plasma/fling.cc index 79da4f43a19..b84648b25a9 100644 --- a/cpp/src/plasma/fling.cc +++ b/cpp/src/plasma/fling.cc @@ -37,6 +37,9 @@ int send_fd(int conn, int fd) { init_msg(&msg, &iov, buf, sizeof(buf)); struct cmsghdr* header = CMSG_FIRSTHDR(&msg); + if (header == nullptr) { + return -1; + } header->cmsg_level = SOL_SOCKET; header->cmsg_type = SCM_RIGHTS; header->cmsg_len = CMSG_LEN(sizeof(int)); @@ -64,8 +67,9 @@ int recv_fd(int conn) { for (struct cmsghdr* header = CMSG_FIRSTHDR(&msg); header != NULL; header = CMSG_NXTHDR(&msg, header)) if (header->cmsg_level == SOL_SOCKET && header->cmsg_type == SCM_RIGHTS) { - ssize_t count = - (header->cmsg_len - (CMSG_DATA(header) - (unsigned char*)header)) / sizeof(int); + ssize_t count = (header->cmsg_len - + (CMSG_DATA(header) - reinterpret_cast(header))) / + sizeof(int); for (int i = 0; i < count; ++i) { int fd = (reinterpret_cast(CMSG_DATA(header)))[i]; if (found_fd == -1) { diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index 9bb43399082..afe7053329b 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -26,10 +26,6 @@ using arrow::Status; -/* Number of times we try binding to a socket. */ -#define NUM_BIND_ATTEMPTS 5 -#define BIND_TIMEOUT_MS 100 - /* Number of times we try connecting to a socket. 
*/ #define NUM_CONNECT_ATTEMPTS 50 #define CONNECT_TIMEOUT_MS 100 @@ -134,7 +130,8 @@ int bind_ipc_sock(const std::string& pathname, bool shall_listen) { } strncpy(socket_address.sun_path, pathname.c_str(), pathname.size() + 1); - if (bind(socket_fd, (struct sockaddr*)&socket_address, sizeof(socket_address)) != 0) { + if (bind(socket_fd, reinterpret_cast(&socket_address), + sizeof(socket_address)) != 0) { ARROW_LOG(ERROR) << "Bind failed for pathname " << pathname; close(socket_fd); return -1; @@ -197,8 +194,8 @@ int connect_ipc_sock(const std::string& pathname) { } strncpy(socket_address.sun_path, pathname.c_str(), pathname.size() + 1); - if (connect(socket_fd, (struct sockaddr*)&socket_address, sizeof(socket_address)) != - 0) { + if (connect(socket_fd, reinterpret_cast(&socket_address), + sizeof(socket_address)) != 0) { close(socket_fd); return -1; } @@ -227,6 +224,7 @@ uint8_t* read_message_async(int sock) { uint8_t* message = reinterpret_cast(malloc(size)); s = ReadBytes(sock, message, size); if (!s.ok()) { + free(message); /* The other side has closed the socket. */ ARROW_LOG(DEBUG) << "Socket has been closed, or some other error has occurred."; close(sock); diff --git a/cpp/src/plasma/io.h b/cpp/src/plasma/io.h index ef96c06ccea..4beb1346be4 100644 --- a/cpp/src/plasma/io.h +++ b/cpp/src/plasma/io.h @@ -27,6 +27,7 @@ #include #include "arrow/status.h" +#include "plasma/compat.h" // TODO(pcm): Replace our own custom message header (message type, // message length, plasma protocol verion) with one that is serialized diff --git a/cpp/src/plasma/plasma.h b/cpp/src/plasma/plasma.h index 476002f68c0..603ff8a4fac 100644 --- a/cpp/src/plasma/plasma.h +++ b/cpp/src/plasma/plasma.h @@ -31,6 +31,8 @@ #include #include +#include "plasma/compat.h" + #include "arrow/status.h" #include "arrow/util/logging.h" #include "plasma/common.h" diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index 305e3d5d9ce..2261b6a624a 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -79,8 +79,9 @@ Status SendCreateReply(int sock, ObjectID object_id, PlasmaObject* object, PlasmaObjectSpec plasma_object(object->handle.store_fd, object->handle.mmap_size, object->data_offset, object->data_size, object->metadata_offset, object->metadata_size); - auto message = CreatePlasmaCreateReply(fbb, fbb.CreateString(object_id.binary()), - &plasma_object, (PlasmaError)error_code); + auto message = + CreatePlasmaCreateReply(fbb, fbb.CreateString(object_id.binary()), &plasma_object, + static_cast(error_code)); return PlasmaSend(sock, MessageType_PlasmaCreateReply, &fbb, message); } @@ -123,7 +124,7 @@ Status ReadSealRequest(uint8_t* data, size_t size, ObjectID* object_id, Status SendSealReply(int sock, ObjectID object_id, int error) { flatbuffers::FlatBufferBuilder fbb; auto message = CreatePlasmaSealReply(fbb, fbb.CreateString(object_id.binary()), - (PlasmaError)error); + static_cast(error)); return PlasmaSend(sock, MessageType_PlasmaSealReply, &fbb, message); } @@ -154,7 +155,7 @@ Status ReadReleaseRequest(uint8_t* data, size_t size, ObjectID* object_id) { Status SendReleaseReply(int sock, ObjectID object_id, int error) { flatbuffers::FlatBufferBuilder fbb; auto message = CreatePlasmaReleaseReply(fbb, fbb.CreateString(object_id.binary()), - (PlasmaError)error); + static_cast(error)); return PlasmaSend(sock, MessageType_PlasmaReleaseReply, &fbb, message); } @@ -185,7 +186,7 @@ Status ReadDeleteRequest(uint8_t* data, size_t size, ObjectID* object_id) { Status SendDeleteReply(int sock, 
ObjectID object_id, int error) { flatbuffers::FlatBufferBuilder fbb; auto message = CreatePlasmaDeleteReply(fbb, fbb.CreateString(object_id.binary()), - (PlasmaError)error); + static_cast(error)); return PlasmaSend(sock, MessageType_PlasmaDeleteReply, &fbb, message); } @@ -526,8 +527,8 @@ Status ReadDataReply(uint8_t* data, size_t size, ObjectID* object_id, auto message = flatbuffers::GetRoot(data); DCHECK(verify_flatbuffer(message, data, size)); *object_id = ObjectID::from_binary(message->object_id()->str()); - *object_size = (int64_t)message->object_size(); - *metadata_size = (int64_t)message->metadata_size(); + *object_size = static_cast(message->object_size()); + *metadata_size = static_cast(message->metadata_size()); return Status::OK(); } diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index aaa2bad67c3..210cce16238 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -458,8 +458,7 @@ void PlasmaStore::disconnect_client(int client_fd) { /// be /// buffered, and this will be called again when the send buffer has room. /// -/// @param client The client to send the notification to. -/// @return Void. +/// @param client_fd The client to send the notification to. void PlasmaStore::send_notifications(int client_fd) { auto it = pending_notifications_.find(client_fd); @@ -666,7 +665,7 @@ class PlasmaStoreRunner { std::unique_ptr store_; }; -static PlasmaStoreRunner* g_runner = nullptr; +static std::unique_ptr g_runner = nullptr; void HandleSignal(int signal) { if (signal == SIGTERM) { @@ -684,10 +683,9 @@ void start_server(char* socket_name, int64_t system_memory, std::string plasma_d // to a client that has already died, the store could die. signal(SIGPIPE, SIG_IGN); - PlasmaStoreRunner runner; - g_runner = &runner; + g_runner.reset(new PlasmaStoreRunner()); signal(SIGTERM, HandleSignal); - runner.Start(socket_name, system_memory, plasma_directory, hugepages_enabled); + g_runner->Start(socket_name, system_memory, plasma_directory, hugepages_enabled); } } // namespace plasma diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index 61a3a245610..d03d11f4ef0 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -77,7 +77,6 @@ class PlasmaStore { /// be called on objects that are returned by the eviction policy to evict. /// /// @param object_ids Object IDs of the objects to be deleted. - /// @return Void. void delete_objects(const std::vector& object_ids); /// Process a get request from a client. This method assumes that we will @@ -91,7 +90,6 @@ class PlasmaStore { /// @param client The client making this request. /// @param object_ids Object IDs of the objects to be gotten. /// @param timeout_ms The timeout for the get request in milliseconds. - /// @return Void. void process_get_request(Client* client, const std::vector& object_ids, int64_t timeout_ms); @@ -101,7 +99,6 @@ class PlasmaStore { /// @param digest The digest of the object. This is used to tell if two /// objects /// with the same object ID are the same. - /// @return Void. void seal_object(const ObjectID& object_id, unsigned char digest[]); /// Check if the plasma store contains an object: @@ -115,25 +112,21 @@ class PlasmaStore { /// /// @param object_id The object ID of the object that is being released. /// @param client The client making this request. - /// @param Void. void release_object(const ObjectID& object_id, Client* client); /// Subscribe a file descriptor to updates about new sealed objects. /// /// @param client The client making this request. 
- /// @return Void. void subscribe_to_updates(Client* client); /// Connect a new client to the PlasmaStore. /// /// @param listener_sock The socket that is listening to incoming connections. - /// @return Void. void connect_client(int listener_sock); /// Disconnect a client from the PlasmaStore. /// /// @param client_fd The client file descriptor that is disconnected. - /// @return Void. void disconnect_client(int client_fd); void send_notifications(int client_fd); diff --git a/cpp/src/plasma/test/serialization_tests.cc b/cpp/src/plasma/test/serialization_tests.cc index c76f5ce1092..7c9d90133a6 100644 --- a/cpp/src/plasma/test/serialization_tests.cc +++ b/cpp/src/plasma/test/serialization_tests.cc @@ -43,7 +43,7 @@ int create_temp_file(void) { * Seek to the beginning of a file and read a message from it. * * @param fd File descriptor of the file. - * @param message type Message type that we expect in the file. + * @param message_type Message type that we expect in the file. * * @return Pointer to the content of the message. Needs to be freed by the * caller. @@ -226,7 +226,7 @@ TEST(PlasmaSerialization, DeleteReply) { TEST(PlasmaSerialization, StatusRequest) { int fd = create_temp_file(); - int64_t num_objects = 2; + constexpr int64_t num_objects = 2; ObjectID object_ids[num_objects]; object_ids[0] = ObjectID::from_random(); object_ids[1] = ObjectID::from_random(); @@ -249,10 +249,11 @@ TEST(PlasmaSerialization, StatusReply) { ARROW_CHECK_OK(SendStatusReply(fd, object_ids, object_statuses, 2)); std::vector<uint8_t> data = read_message_from_file(fd, MessageType_PlasmaStatusReply); int64_t num_objects = ReadStatusReply_num_objects(data.data(), data.size()); - ObjectID object_ids_read[num_objects]; - int object_statuses_read[num_objects]; - ARROW_CHECK_OK(ReadStatusReply(data.data(), data.size(), object_ids_read, - object_statuses_read, num_objects)); + + std::vector<ObjectID> object_ids_read(num_objects); + std::vector<int> object_statuses_read(num_objects); + ARROW_CHECK_OK(ReadStatusReply(data.data(), data.size(), object_ids_read.data(), + object_statuses_read.data(), num_objects)); ASSERT_EQ(object_ids[0], object_ids_read[0]); ASSERT_EQ(object_ids[1], object_ids_read[1]); ASSERT_EQ(object_statuses[0], object_statuses_read[0]); diff --git a/cpp/src/plasma/thirdparty/ae/ae_epoll.c b/cpp/src/plasma/thirdparty/ae/ae_epoll.c index 410aac70dc5..2f70550a980 100644 --- a/cpp/src/plasma/thirdparty/ae/ae_epoll.c +++ b/cpp/src/plasma/thirdparty/ae/ae_epoll.c @@ -72,7 +72,8 @@ static void aeApiFree(aeEventLoop *eventLoop) { static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { aeApiState *state = eventLoop->apidata; - struct epoll_event ee = {0}; /* avoid valgrind warning */ + struct epoll_event ee; + memset(&ee, 0, sizeof(struct epoll_event)); // avoid valgrind warning /* If the fd was already monitored for some event, we need a MOD * operation. Otherwise we need an ADD operation. */ int op = eventLoop->events[fd].mask == AE_NONE ?
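For orientation, the Plasma request/reply handlers patched above are what a client drives in practice. A minimal Python sketch of that flow, assuming a pyarrow build with the plasma bindings enabled and a store already listening on /tmp/plasma (started e.g. with `plasma_store -m 1000000000 -s /tmp/plasma`); the exact connect() signature has varied across pyarrow versions, so treat this as illustrative only:

    import pyarrow.plasma as plasma

    # put() drives the create/seal request-reply cycle patched above;
    # get() exercises the data-reply path that now uses static_cast.
    client = plasma.connect("/tmp/plasma")
    object_id = client.put(b"some bytes")
    assert client.get(object_id) == b"some bytes"
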
@@ -89,7 +90,8 @@ static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) { aeApiState *state = eventLoop->apidata; - struct epoll_event ee = {0}; /* avoid valgrind warning */ + struct epoll_event ee; + memset(&ee, 0, sizeof(struct epoll_event)); // avoid valgrind warning int mask = eventLoop->events[fd].mask & (~delmask); ee.events = 0; diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index c528937bfdc..6877c18f624 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -140,7 +140,7 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta return existingLedger; } - final BufferLedger ledger = new BufferLedger(allocator, new ReleaseListener(allocator)); + final BufferLedger ledger = new BufferLedger(allocator); if (retain) { ledger.inc(); } @@ -151,54 +151,41 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta } } - /** * The way that a particular BufferLedger communicates back to the AllocationManager that it * now longer needs to hold * a reference to particular piece of memory. + * Can only be called when you already hold the writeLock. */ - private class ReleaseListener { - - private final BufferAllocator allocator; - - public ReleaseListener(BufferAllocator allocator) { - this.allocator = allocator; - } - - /** - * Can only be called when you already hold the writeLock. - */ - public void release() { - allocator.assertOpen(); + private void release(final BufferLedger ledger) { + final BaseAllocator allocator = ledger.getAllocator(); + allocator.assertOpen(); - final BufferLedger oldLedger = map.remove(allocator); - oldLedger.allocator.dissociateLedger(oldLedger); + final BufferLedger oldLedger = map.remove(allocator); + oldLedger.allocator.dissociateLedger(oldLedger); - if (oldLedger == owningLedger) { - if (map.isEmpty()) { - // no one else owns, lets release. - oldLedger.allocator.releaseBytes(size); - underlying.release(); - amDestructionTime = System.nanoTime(); - owningLedger = null; - } else { - // we need to change the owning allocator. we've been removed so we'll get whatever is - // top of list - BufferLedger newLedger = map.values().iterator().next(); - - // we'll forcefully transfer the ownership and not worry about whether we exceeded the - // limit - // since this consumer can't do anything with this. - oldLedger.transferBalance(newLedger); - } + if (oldLedger == owningLedger) { + if (map.isEmpty()) { + // no one else owns, lets release. + oldLedger.allocator.releaseBytes(size); + underlying.release(); + amDestructionTime = System.nanoTime(); + owningLedger = null; } else { - if (map.isEmpty()) { - throw new IllegalStateException("The final removal of a ledger should be connected to " + - "the owning ledger."); - } + // we need to change the owning allocator. we've been removed so we'll get whatever is + // top of list + BufferLedger newLedger = map.values().iterator().next(); + + // we'll forcefully transfer the ownership and not worry about whether we exceeded the + // limit + // since this consumer can't do anything with this. 
+ oldLedger.transferBalance(newLedger); + } + } else { + if (map.isEmpty()) { + throw new IllegalStateException("The final removal of a ledger should be connected to " + + "the owning ledger."); } - - } } @@ -221,16 +208,22 @@ public class BufferLedger { // correctly private final long lCreationTime = System.nanoTime(); private final BaseAllocator allocator; - private final ReleaseListener listener; private final HistoricalLog historicalLog = BaseAllocator.DEBUG ? new HistoricalLog (BaseAllocator.DEBUG_LOG_LENGTH, "BufferLedger[%d]", 1) : null; private volatile long lDestructionTime = 0; - private BufferLedger(BaseAllocator allocator, ReleaseListener listener) { + private BufferLedger(BaseAllocator allocator) { this.allocator = allocator; - this.listener = listener; + } + + /** + * Get the allocator for this ledger + * @return allocator + */ + private BaseAllocator getAllocator() { + return allocator; } /** @@ -340,7 +333,7 @@ public int decrement(int decrement) { outcome = bufRefCnt.addAndGet(-decrement); if (outcome == 0) { lDestructionTime = System.nanoTime(); - listener.release(); + release(this); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 470317f8c07..6511efcb7d5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -368,7 +368,7 @@ public UnionVector promoteToUnion() { return vector; } - private int lastSet; + private int lastSet = 0; public class Accessor extends BaseRepeatedAccessor { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index 8bb0f26d978..c6ebd61aa07 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -44,6 +44,8 @@ import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullableVarBinaryVector; +import org.apache.arrow.vector.NullableVarCharVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeMilliVector; @@ -63,10 +65,10 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.ValueVector.Mutator; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.schema.ArrowVectorType; @@ -84,7 +86,6 @@ import com.google.common.base.Objects; public class JsonFileReader implements AutoCloseable, DictionaryProvider { - private final File inputFile; private final JsonParser parser; private final BufferAllocator allocator; private Schema schema; @@ -93,7 +94,6 @@ public class JsonFileReader implements AutoCloseable, DictionaryProvider { public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException { super(); - this.inputFile = inputFile; this.allocator = allocator; 
MappingJsonFactory jsonFactory = new MappingJsonFactory(); this.parser = jsonFactory.createParser(inputFile); @@ -216,10 +216,9 @@ public VectorSchemaRoot read() throws IOException { } } - /* - * TODO: This method doesn't load some vectors correctly. For instance, it doesn't initialize - * `lastSet` in ListVector, VarCharVector, NullableVarBinaryVector A better way of implementing - * this function is to use `loadFieldBuffers` methods in FieldVector. + /** + * TODO: A better way of implementing this function is to use `loadFieldBuffers` methods in + * FieldVector to set the inner-vector data as done in `ArrowFileReader`. */ private void readVector(Field field, FieldVector vector) throws JsonParseException, IOException { List<ArrowVectorType> vectorTypes = field.getTypeLayout().getVectorTypes(); @@ -234,29 +233,42 @@ private void readVector(Field field, FieldVector vector) throws JsonParseExcepti if (started && !Objects.equal(field.getName(), name)) { throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name); } + + // Initialize the vector with required capacity int count = readNextField("count", Integer.class); + vector.setInitialCapacity(count); vector.allocateNew(); - vector.getMutator().setValueCount(count); + + // Read inner vectors for (int v = 0; v < vectorTypes.size(); v++) { ArrowVectorType vectorType = vectorTypes.get(v); - BufferBacked innerVector = fieldInnerVectors.get(v); + ValueVector valueVector = (ValueVector) fieldInnerVectors.get(v); nextFieldIs(vectorType.getName()); readToken(START_ARRAY); - ValueVector valueVector = (ValueVector) innerVector; - int innerVectorCount = vectorType.equals(OFFSET) ? count + 1 : count; - valueVector.setInitialCapacity(innerVectorCount); - valueVector.allocateNew(); - for (int i = 0; i < innerVectorCount; i++) { parser.nextToken(); setValueFromParser(valueVector, i); } - Mutator mutator = valueVector.getMutator(); - mutator.setValueCount(innerVectorCount); readToken(END_ARRAY); } - // if children + + // Set lastSet before valueCount to prevent setValueCount from filling empty values + switch (vector.getMinorType()) { + case LIST: + // ListVector starts lastSet from index 0, so lastSet value is always last index written + 1 + ((ListVector) vector).getMutator().setLastSet(count); + break; + case VARBINARY: + ((NullableVarBinaryVector) vector).getMutator().setLastSet(count - 1); + break; + case VARCHAR: + ((NullableVarCharVector) vector).getMutator().setLastSet(count - 1); + break; + } + vector.getMutator().setValueCount(count); + + // read child vectors, if any List<Field> fields = field.getChildren(); if (!fields.isEmpty()) { List<FieldVector> vectorChildren = vector.getChildrenFromFields(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java index c05d5904977..ba62de0a6d9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java @@ -32,6 +32,7 @@ import org.apache.arrow.vector.NullableDecimalVector; import org.apache.arrow.vector.NullableIntVector; import org.apache.arrow.vector.NullableTimeMilliVector; +import org.apache.arrow.vector.NullableVarBinaryVector; import org.apache.arrow.vector.NullableVarCharVector; import org.apache.arrow.vector.ValueVector.Accessor; import org.apache.arrow.vector.VectorSchemaRoot; @@ -541,4 +542,52 @@ public void writeUnionData(int count, NullableMapVector parent) { writer.setValueCount(count);
varchar.release(); } + + protected void writeVarBinaryData(int count, NullableMapVector parent) { + Assert.assertTrue(count < 100); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + ListWriter listWriter = rootWriter.list("list"); + ArrowBuf varbin = allocator.buffer(count); + for (int i = 0; i < count; i++) { + varbin.setByte(i, i); + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varBinary().writeVarBinary(0, i + 1, varbin); + } + listWriter.endList(); + } + writer.setValueCount(count); + varbin.release(); + } + + protected void validateVarBinary(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + ListVector listVector = (ListVector) root.getVector("list"); + byte[] expectedArray = new byte[count]; + int numVarBinaryValues = 0; + for (int i = 0; i < count; i++) { + expectedArray[i] = (byte) i; + Object obj = listVector.getAccessor().getObject(i); + List objList = (List) obj; + if (i % 3 == 0) { + Assert.assertTrue(objList.isEmpty()); + } else { + byte[] expected = Arrays.copyOfRange(expectedArray, 0, i + 1); + for (int j = 0; j < i % 3; j++) { + byte[] result = (byte[]) objList.get(j); + Assert.assertArrayEquals(result, expected); + numVarBinaryValues++; + } + } + } + + // ListVector lastSet should be the index of last value + 1 + Assert.assertEquals(listVector.getMutator().getLastSet(), count); + + // NullableVarBinaryVector lastSet should be the index of last value + NullableVarBinaryVector binaryVector = (NullableVarBinaryVector) listVector.getChildrenFromFields().get(0); + Assert.assertEquals(binaryVector.getMutator().getLastSet(), numVarBinaryValues - 1); + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index c483ba7de91..81e58989fcc 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -622,6 +622,47 @@ public void testWriteReadFixedSizeList() throws IOException { } } + @Test + public void testWriteReadVarBin() throws IOException { + File file = new File("target/mytest_varbin.arrow"); + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + int count = COUNT; + + // write + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NullableMapVector parent = NullableMapVector.empty("parent", vectorAllocator)) { + writeVarBinaryData(count, parent); + VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); + validateVarBinary(count, root); + write(parent.getChild("root"), file, stream); + } + + // read from file + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(file); + ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator)) { + VectorSchemaRoot root = arrowReader.getVectorSchemaRoot(); + Schema schema = root.getSchema(); + LOGGER.debug("reading schema: " + schema); + Assert.assertTrue(arrowReader.loadNextBatch()); + validateVarBinary(count, root); + } + + // read from stream + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + ByteArrayInputStream input = new ByteArrayInputStream(stream.toByteArray()); + ArrowStreamReader 
arrowReader = new ArrowStreamReader(input, readerAllocator)) { + VectorSchemaRoot root = arrowReader.getVectorSchemaRoot(); + Schema schema = root.getSchema(); + LOGGER.debug("reading schema: " + schema); + Assert.assertTrue(arrowReader.loadNextBatch()); + validateVarBinary(count, root); + } + } + /** * Writes the contents of parents to file. If outStream is non-null, also writes it diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java index 960567fc870..ee90d340d7c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java @@ -285,4 +285,33 @@ public void testSetStructLength() throws IOException { } } + @Test + public void testWriteReadVarBinJSON() throws IOException { + File file = new File("target/mytest_varbin.json"); + int count = COUNT; + + // write + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NullableMapVector parent = NullableMapVector.empty("parent", vectorAllocator)) { + writeVarBinaryData(count, parent); + VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); + validateVarBinary(count, root); + writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE)) { + JsonFileReader reader = new JsonFileReader(file, readerAllocator); + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateVarBinary(count, root); + } + reader.close(); + } + } } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index a636d51b420..d148d1105b6 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -72,8 +72,8 @@ endif(CCACHE_FOUND) ############################################################ include(BuildUtils) -include(SetupCxxFlags) include(CompilerInfo) +include(SetupCxxFlags) # Add common flags set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") diff --git a/python/manylinux1/scripts/build_boost.sh b/python/manylinux1/scripts/build_boost.sh index 3c11f3aeb6f..4650cde9532 100755 --- a/python/manylinux1/scripts/build_boost.sh +++ b/python/manylinux1/scripts/build_boost.sh @@ -16,10 +16,13 @@ # specific language governing permissions and limitations # under the License. 
-wget --no-check-certificate http://downloads.sourceforge.net/project/boost/boost/1.60.0/boost_1_60_0.tar.gz -O /boost_1_60_0.tar.gz -tar xf boost_1_60_0.tar.gz -pushd /boost_1_60_0 +BOOST_VERSION=1.65.1 +BOOST_VERSION_UNDERSCORE=${BOOST_VERSION//\./_} + +wget --no-check-certificate https://dl.bintray.com/boostorg/release/${BOOST_VERSION}/source/boost_${BOOST_VERSION_UNDERSCORE}.tar.gz -O /boost_${BOOST_VERSION_UNDERSCORE}.tar.gz +tar xf boost_${BOOST_VERSION_UNDERSCORE}.tar.gz +pushd /boost_${BOOST_VERSION_UNDERSCORE} ./bootstrap.sh ./bjam cxxflags=-fPIC cflags=-fPIC --prefix=/usr --with-filesystem --with-date_time --with-system --with-regex install popd -rm -rf boost_1_60_0.tar.gz boost_1_60_0 +rm -rf boost_${BOOST_VERSION_UNDERSCORE}.tar.gz boost_${BOOST_VERSION_UNDERSCORE} diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 0d76a35f4ed..ac069482274 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -36,7 +36,7 @@ time32, time64, timestamp, date32, date64, float16, float32, float64, binary, string, decimal, - list_, struct, dictionary, field, + list_, struct, dictionary, field, type_for_alias, DataType, NAType, Field, Schema, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index eec6180165c..f402defc9b0 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -16,58 +16,161 @@ # under the License. -def array(object sequence, DataType type=None, MemoryPool memory_pool=None, - size=None): +cdef _sequence_to_array(object sequence, object size, DataType type, + CMemoryPool* pool): + cdef shared_ptr[CArray] out + cdef int64_t c_size + if type is None: + with nogil: + check_status(ConvertPySequence(sequence, pool, &out)) + else: + if size is None: + with nogil: + check_status( + ConvertPySequence( + sequence, pool, &out, type.sp_type + ) + ) + else: + c_size = size + with nogil: + check_status( + ConvertPySequence( + sequence, pool, &out, type.sp_type, c_size + ) + ) + + return pyarrow_wrap_array(out) + + +cdef _is_array_like(obj): + try: + import pandas + return isinstance(obj, (np.ndarray, pd.Series, pd.Index, Categorical)) + except: + return isinstance(obj, np.ndarray) + + +cdef _ndarray_to_array(object values, object mask, DataType type, + c_bool use_pandas_null_sentinels, + CMemoryPool* pool): + cdef shared_ptr[CChunkedArray] chunked_out + cdef shared_ptr[CDataType] c_type + + dtype = values.dtype + + if type is None and dtype != object: + with nogil: + check_status(NumPyDtypeToArrow(dtype, &c_type)) + + if type is not None: + c_type = type.sp_type + + with nogil: + check_status(NdarrayToArrow(pool, values, mask, + use_pandas_null_sentinels, + c_type, &chunked_out)) + + if chunked_out.get().num_chunks() > 1: + return pyarrow_wrap_chunked_array(chunked_out) + else: + return pyarrow_wrap_array(chunked_out.get().chunk(0)) + + +cdef DataType _ensure_type(object type): + if type is None: + return None + elif not isinstance(type, DataType): + return type_for_alias(type) + else: + return type + + +def array(object obj, type=None, mask=None, + MemoryPool memory_pool=None, size=None, + from_pandas=False): """ - Create pyarrow.Array instance from a Python sequence + Create pyarrow.Array instance from a Python object Parameters ---------- - sequence : sequence-like or iterable object of Python objects. - If both type and size are specified may be a single use iterable. 
- type : pyarrow.DataType, optional - If not passed, will be inferred from the data + obj : sequence, iterable, ndarray or Series + If both type and size are specified may be a single use iterable. If + not strongly-typed, Arrow type will be inferred for resulting array + mask : array (boolean), optional + Indicate which values are null (True) or not null (False). + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool + size : int64, optional Size of the elements. If the input is larger than size bail at this length. For iterators, if size is larger than the input iterator this will be treated as a "max size", but will involve an initial allocation of size followed by a resize to the actual size (so if you know the exact size specifying it correctly will give you better performance). + from_pandas : boolean, default False + Use pandas's semantics for inferring nulls from values in ndarray-like + data. If passed, the mask takes precedence, but if a value is unmasked + (not-null), but still null according to pandas semantics, then it is + null + + Notes + ----- + Localized timestamps will currently be returned as UTC (pandas's native + representation). Timezone-naive data will be implicitly interpreted as + UTC. + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.array(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], + ... dtype=bool)) + + [ + 1, + NA + ] Returns ------- - array : pyarrow.Array + array : pyarrow.Array or pyarrow.ChunkedArray (if object data + overflowed binary storage) """ - cdef: - shared_ptr[CArray] sp_array - CMemoryPool* pool - int64_t c_size + type = _ensure_type(type) + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) - pool = maybe_unbox_memory_pool(memory_pool) - if type is None: - with nogil: - check_status(ConvertPySequence(sequence, pool, &sp_array)) - else: - if size is None: - with nogil: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type - ) - ) - else: - c_size = size - with nogil: - check_status( - ConvertPySequence( - sequence, pool, &sp_array, type.sp_type, c_size - ) - ) + if _is_array_like(obj): + if mask is not None: + mask = get_series_values(mask) + + values = get_series_values(obj) - return pyarrow_wrap_array(sp_array) + if isinstance(values, Categorical): + return DictionaryArray.from_arrays( + values.codes, values.categories.values, + mask=mask, ordered=values.ordered, + memory_pool=memory_pool) + else: + values, type = pdcompat.get_datetimetz_type(values, obj.dtype, + type) + return _ndarray_to_array(values, mask, type, from_pandas, pool) + else: + if mask is not None: + raise ValueError("Masks only supported with ndarray-like inputs") + return _sequence_to_array(obj, size, type, pool) def _normalize_slice(object arrow_obj, slice key): @@ -112,7 +215,7 @@ cdef class Array: with nogil: check_status(DebugPrint(deref(self.ap), 0)) - def cast(self, DataType target_type, safe=True): + def cast(self, object target_type, safe=True): """ Cast array values to another data type @@ -130,42 +233,37 @@ cdef class Array: cdef: CCastOptions options shared_ptr[CArray] result + DataType type + + type = _ensure_type(target_type) if not safe: options.allow_int_overflow = 1 with nogil: - check_status(Cast(_context(), self.ap[0], target_type.sp_type, +
check_status(Cast(_context(), self.ap[0], type.sp_type, options, &result)) return pyarrow_wrap_array(result) @staticmethod - def from_pandas(obj, mask=None, DataType type=None, - timestamps_to_ms=False, - MemoryPool memory_pool=None): + def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None): """ - Convert pandas.Series to an Arrow Array. + Convert pandas.Series to an Arrow Array, using pandas's semantics about + what values indicate nulls. See pyarrow.array for more general + conversion from arrays or sequences to Arrow arrays Parameters ---------- - series : pandas.Series or numpy.ndarray - - mask : pandas.Series or numpy.ndarray, optional - boolean mask if the object is null (True) or valid (False) - + sequence : ndarray, Index, Series + mask : array (boolean), optional + Indicate which values are null (True) or not null (False) type : pyarrow.DataType - Explicit type to attempt to coerce to - - timestamps_to_ms : bool, optional - Convert datetime columns to ms resolution. This is needed for - compatibility with other functionality like Parquet I/O which - only supports milliseconds. - - .. deprecated:: 0.7.0 - - memory_pool: MemoryPool, optional - Specific memory pool to use to allocate the resulting Arrow array. + Explicit type to attempt to coerce to, otherwise will be inferred + from the data + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool Notes ----- @@ -173,78 +271,13 @@ cdef class Array: representation). Timezone-naive data will be implicitly interpreted as UTC. - Examples - -------- - - >>> import pandas as pd - >>> import pyarrow as pa - >>> pa.Array.from_pandas(pd.Series([1, 2])) - - [ - 1, - 2 - ] - - >>> import numpy as np - >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1], - ...
dtype=bool)) - - [ - 1, - NA - ] - Returns ------- array : pyarrow.Array or pyarrow.ChunkedArray (if object data - overflowed binary storage) + overflows binary buffer) """ - cdef: - shared_ptr[CArray] out - shared_ptr[CChunkedArray] chunked_out - shared_ptr[CDataType] c_type - CMemoryPool* pool - - if mask is not None: - mask = get_series_values(mask) - - values = get_series_values(obj) - pool = maybe_unbox_memory_pool(memory_pool) - - if isinstance(values, Categorical): - return DictionaryArray.from_arrays( - values.codes, values.categories.values, - mask=mask, ordered=values.ordered, - memory_pool=memory_pool) - elif values.dtype == object: - # Object dtype undergoes a different conversion path as more type - # inference may be needed - if type is not None: - c_type = type.sp_type - with nogil: - check_status(PandasObjectsToArrow( - pool, values, mask, c_type, &chunked_out)) - - if chunked_out.get().num_chunks() > 1: - return pyarrow_wrap_chunked_array(chunked_out) - else: - out = chunked_out.get().chunk(0) - else: - values, type = pdcompat.maybe_coerce_datetime64( - values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms) - - if type is None: - dtype = values.dtype - with nogil: - check_status(NumPyDtypeToArrow(dtype, &c_type)) - else: - c_type = type.sp_type - - with nogil: - check_status(PandasToArrow( - pool, values, mask, c_type, &out)) - - return pyarrow_wrap_array(out) + return array(obj, mask=mask, type=type, memory_pool=memory_pool, + from_pandas=True) property null_count: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 5e6708871e6..fc17d1c06ae 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -766,13 +766,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) - CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CArray]* out) - - CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CChunkedArray]* out) + CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo, + c_bool use_pandas_null_sentinels, + const shared_ptr[CDataType]& type, + shared_ptr[CChunkedArray]* out) CStatus NdarrayToTensor(CMemoryPool* pool, object ao, shared_ptr[CTensor]* out) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index d1e6f5a8096..be48aeb442d 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -203,7 +203,7 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types): } -def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): +def dataframe_to_arrays(df, schema, preserve_index): names = [] arrays = [] index_columns = [] @@ -223,15 +223,13 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): field = schema.field_by_name(name) type = getattr(field, "type", None) - array = pa.Array.from_pandas( - col, type=type, timestamps_to_ms=timestamps_to_ms - ) + array = pa.array(col, from_pandas=True, type=type) arrays.append(array) names.append(name) types.append(array.type) for i, column in enumerate(index_columns): - array = pa.Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms) + array = pa.array(column) arrays.append(array) names.append(index_level_name(column, i)) types.append(array.type) @@ -242,25 +240,15 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, 
preserve_index): return names, arrays, metadata -def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False): - if timestamps_to_ms: - import warnings - warnings.warn('timestamps_to_ms=True is deprecated', FutureWarning) - +def get_datetimetz_type(values, dtype, type_): from pyarrow.compat import DatetimeTZDtype if values.dtype.type != np.datetime64: return values, type_ - coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]' - - if coerce_ms: - values = values.astype('datetime64[ms]') - type_ = pa.timestamp('ms') - if isinstance(dtype, DatetimeTZDtype): tz = dtype.tz - unit = 'ms' if coerce_ms else dtype.unit + unit = dtype.unit type_ = pa.timestamp(unit, tz) elif type_ is None: # Trust the NumPy dtype diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 3a847f77c4f..c37ed3b200e 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -348,10 +348,10 @@ cdef class StructValue(ArrayValue): cdef dict _scalar_classes = { _Type_BOOL: BooleanValue, - _Type_UINT8: Int8Value, - _Type_UINT16: Int16Value, - _Type_UINT32: Int32Value, - _Type_UINT64: Int64Value, + _Type_UINT8: UInt8Value, + _Type_UINT16: UInt16Value, + _Type_UINT32: UInt32Value, + _Type_UINT64: UInt64Value, _Type_INT8: Int8Value, _Type_INT16: Int16Value, _Type_INT32: Int32Value, diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 028797e45b8..e5422a5beca 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -575,7 +575,7 @@ cdef class RecordBatch: pyarrow.RecordBatch """ names, arrays, metadata = pdcompat.dataframe_to_arrays( - df, False, schema, preserve_index + df, schema, preserve_index ) return cls.from_arrays(arrays, names, metadata) @@ -714,21 +714,13 @@ cdef class Table: return result @classmethod - def from_pandas(cls, df, bint timestamps_to_ms=False, - Schema schema=None, bint preserve_index=True): + def from_pandas(cls, df, Schema schema=None, bint preserve_index=True): """ Convert pandas.DataFrame to an Arrow Table Parameters ---------- df : pandas.DataFrame - timestamps_to_ms : bool - Convert datetime columns to ms resolution. This is needed for - compability with other functionality like Parquet I/O which - only supports milliseconds. - - .. deprecated:: 0.7.0 - schema : pyarrow.Schema, optional The expected schema of the Arrow Table. This can be used to indicate the type of columns if we cannot infer it automatically. 
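Taken together, the Python-side changes above consolidate conversion into a single pa.array() entry point, make Array.from_pandas() a thin wrapper around it, and accept string aliases wherever a DataType is expected. An illustrative sketch of how the new pieces fit together (names taken from this diff; not part of the patch itself):

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    # pa.array() now accepts plain sequences and ndarray-like inputs alike
    arr = pa.array([1, 2, None])                     # type inferred, None -> null
    masked = pa.array(np.array([1, 2]),
                      mask=np.array([False, True]))  # True marks a null
    assert masked.null_count == 1

    # pandas null sentinels (NaN/NaT) are honored only when requested
    assert pa.array(pd.Series([1.0, np.nan]), from_pandas=True).null_count == 1

    # string aliases resolve through type_for_alias and DataType comparison
    assert pa.type_for_alias('f8') == pa.float64()
    assert pa.array([1, 2]).cast('int8').type == 'int8'

    # Table.from_pandas no longer takes timestamps_to_ms
    table = pa.Table.from_pandas(pd.DataFrame({'a': [1, 2]}),
                                 preserve_index=True)
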
@@ -754,7 +746,6 @@ cdef class Table: """ names, arrays, metadata = pdcompat.dataframe_to_arrays( df, - timestamps_to_ms=timestamps_to_ms, schema=schema, preserve_index=preserve_index ) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f316417caaf..3bf392686f0 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -149,6 +149,14 @@ def test_array_factory_invalid_type(): pa.array(arr) +def test_array_ref_to_ndarray_base(): + arr = np.array([1, 2, 3]) + + refcount = sys.getrefcount(arr) + arr2 = pa.array(arr) # noqa + assert sys.getrefcount(arr) == (refcount + 1) + + def test_dictionary_from_numpy(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) @@ -170,8 +178,8 @@ def test_dictionary_from_boxed_arrays(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) - iarr = pa.Array.from_pandas(indices) - darr = pa.Array.from_pandas(dictionary) + iarr = pa.array(indices) + darr = pa.array(dictionary) d1 = pa.DictionaryArray.from_arrays(iarr, darr) @@ -201,9 +209,9 @@ def test_dictionary_with_pandas(): def test_list_from_arrays(): offsets_arr = np.array([0, 2, 5, 8], dtype='i4') - offsets = pa.Array.from_pandas(offsets_arr, type=pa.int32()) + offsets = pa.array(offsets_arr, type='int32') pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] - values = pa.array(pyvalues, type=pa.binary()) + values = pa.array(pyvalues, type='binary') result = pa.ListArray.from_arrays(offsets, values) expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]]) @@ -214,22 +222,22 @@ def test_list_from_arrays(): def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case - in_arr = pa.Array.from_pandas(in_data, type=in_type) + in_arr = pa.array(in_data, type=in_type) casted = in_arr.cast(out_type, safe=safe) - expected = pa.Array.from_pandas(out_data, type=out_type) + expected = pa.array(out_data, type=out_type) assert casted.equals(expected) def test_cast_integers_safe(): safe_cases = [ - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='i4'), pa.int32()), - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()), - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()), - (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', np.array([0, 1, 2, 3], dtype='f8'), pa.float64()) ] @@ -237,13 +245,13 @@ def test_cast_integers_safe(): _check_cast_case(case) unsafe_cases = [ - (np.array([50000], dtype='i4'), pa.int32(), pa.int16()), - (np.array([70000], dtype='i4'), pa.int32(), pa.uint16()), - (np.array([-1], dtype='i4'), pa.int32(), pa.uint16()), - (np.array([50000], dtype='u2'), pa.uint16(), pa.int16()) + (np.array([50000], dtype='i4'), 'int32', 'int16'), + (np.array([70000], dtype='i4'), 'int32', 'uint16'), + (np.array([-1], dtype='i4'), 'int32', 'uint16'), + (np.array([50000], dtype='u2'), 'uint16', 'int16') ] for in_data, in_type, out_type in unsafe_cases: - in_arr = pa.Array.from_pandas(in_data, type=in_type) + in_arr = pa.array(in_data, type=in_type) with pytest.raises(pa.ArrowInvalid): in_arr.cast(out_type) @@ -252,11 +260,11 @@ def test_cast_integers_safe(): def test_cast_integers_unsafe(): # We let NumPy do the unsafe casting 
unsafe_cases = [ - (np.array([50000], dtype='i4'), pa.int32(), + (np.array([50000], dtype='i4'), 'int32', np.array([50000], dtype='i2'), pa.int16()), - (np.array([70000], dtype='i4'), pa.int32(), + (np.array([70000], dtype='i4'), 'int32', np.array([70000], dtype='u2'), pa.uint16()), - (np.array([-1], dtype='i4'), pa.int32(), + (np.array([-1], dtype='i4'), 'int32', np.array([-1], dtype='u2'), pa.uint16()), (np.array([50000], dtype='u2'), pa.uint16(), np.array([50000], dtype='i2'), pa.int16()) @@ -315,3 +323,17 @@ def test_simple_type_construction(): ) def test_logical_type(type, expected): assert get_logical_type(type) == expected + + +def test_array_conversions_no_sentinel_values(): + arr = np.array([1, 2, 3, 4], dtype='int8') + refcount = sys.getrefcount(arr) + arr2 = pa.array(arr) # noqa + assert sys.getrefcount(arr) == (refcount + 1) + + assert arr2.type == 'int8' + + arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'), + type='float32') + assert arr3.type == 'float32' + assert arr3.null_count == 0 diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 5d56cde7d48..182f3afc7e6 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -18,7 +18,7 @@ from collections import OrderedDict -from datetime import datetime, date, time +from datetime import date, time import unittest import decimal import json @@ -82,7 +82,7 @@ def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, tm.assert_frame_equal(result, expected, check_dtype=check_dtype) def _check_series_roundtrip(self, s, type_=None): - arr = pa.Array.from_pandas(s, type=type_) + arr = pa.array(s, from_pandas=True, type=type_) result = pd.Series(arr.to_pandas(), name=s.name) if isinstance(arr.type, pa.TimestampType) and arr.type.tz is not None: @@ -93,7 +93,7 @@ def _check_series_roundtrip(self, s, type_=None): def _check_array_roundtrip(self, values, expected=None, mask=None, type=None): - arr = pa.Array.from_pandas(values, mask=mask, type=type) + arr = pa.array(values, from_pandas=True, mask=mask, type=type) result = arr.to_pandas() values_nulls = pd.isnull(values) @@ -152,7 +152,7 @@ def test_float_nulls(self): for name, arrow_dtype in dtypes: values = np.random.randn(num_values).astype(name) - arr = pa.Array.from_pandas(values, null_mask) + arr = pa.array(values, from_pandas=True, mask=null_mask) arrays.append(arr) fields.append(pa.field(name, arrow_dtype)) values[null_mask] = np.nan @@ -223,7 +223,7 @@ def test_integer_with_nulls(self): for name in int_dtypes: values = np.random.randint(0, 100, size=num_values) - arr = pa.Array.from_pandas(values, null_mask) + arr = pa.array(values, mask=null_mask) arrays.append(arr) expected = values.astype('f8') @@ -244,8 +244,8 @@ def test_array_from_pandas_type_cast(self): target_type = pa.int8() - result = pa.Array.from_pandas(arr, type=target_type) - expected = pa.Array.from_pandas(arr.astype('int8')) + result = pa.array(arr, type=target_type) + expected = pa.array(arr.astype('int8')) assert result.equals(expected) def test_boolean_no_nulls(self): @@ -266,7 +266,7 @@ def test_boolean_nulls(self): mask = np.random.randint(0, 10, size=num_values) < 3 values = np.random.randint(0, 10, size=num_values) < 5 - arr = pa.Array.from_pandas(values, mask) + arr = pa.array(values, mask=mask) expected = values.astype(object) expected[mask] = None @@ -292,7 +292,7 @@ def test_all_nulls_cast_numeric(self): arr = np.array([None], dtype=object) def _check_type(t): - a2 = 
pa.Array.from_pandas(arr, type=t) + a2 = pa.array(arr, type=t) assert a2.type == t assert a2[0].as_py() is None @@ -325,7 +325,7 @@ def test_bytes_exceed_2gb(self): df = pd.DataFrame({ 'strings': np.array([val] * 4000, dtype=object) }) - arr = pa.Array.from_pandas(df['strings']) + arr = pa.array(df['strings']) assert isinstance(arr, pa.ChunkedArray) assert arr.num_chunks == 2 arr = None @@ -365,19 +365,6 @@ def test_timestamps_notimezone_no_nulls(self): expected_schema=schema, ) - def test_timestamps_to_ms_explicit_schema(self): - # ARROW-1328 - df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]}) - pa_type = pa.from_numpy_dtype(df['datetime'].dtype) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - arr = pa.Array.from_pandas(df['datetime'], type=pa_type, - timestamps_to_ms=True) - - tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'), - arr.to_pandas()) - def test_timestamps_notimezone_nulls(self): df = pd.DataFrame({ 'datetime64': np.array([ @@ -450,11 +437,11 @@ def test_date_objects_typed(self): t32 = pa.date32() t64 = pa.date64() - a32 = pa.Array.from_pandas(arr, type=t32) - a64 = pa.Array.from_pandas(arr, type=t64) + a32 = pa.array(arr, type=t32) + a64 = pa.array(arr, type=t64) - a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32) - a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64) + a32_expected = pa.array(arr_i4, mask=mask, type=t32) + a64_expected = pa.array(arr_i8, mask=mask, type=t64) assert a32.equals(a32_expected) assert a64.equals(a64_expected) @@ -481,8 +468,8 @@ def test_dates_from_integers(self): arr = np.array([17259, 17260, 17261], dtype='int32') arr2 = arr.astype('int64') * 86400000 - a1 = pa.Array.from_pandas(arr, type=t1) - a2 = pa.Array.from_pandas(arr2, type=t2) + a1 = pa.array(arr, type=t1) + a2 = pa.array(arr2, type=t2) expected = date(2017, 4, 3) assert a1[0].as_py() == expected @@ -520,7 +507,7 @@ def test_column_of_arrays_to_py(self): np.arange(1, dtype=dtype) ]) type_ = pa.list_(pa.int8()) - parr = pa.Array.from_pandas(arr, type=type_) + parr = pa.array(arr, type=type_) assert parr[0].as_py() == list(range(10)) assert parr[1].as_py() == list(range(5)) @@ -592,7 +579,7 @@ def test_column_of_lists_strided(self): def test_nested_lists_all_none(self): data = np.array([[None, None], None], dtype=object) - arr = pa.Array.from_pandas(data) + arr = pa.array(data) expected = pa.array(list(data)) assert arr.equals(expected) assert arr.type == pa.list_(pa.null()) @@ -600,7 +587,7 @@ def test_nested_lists_all_none(self): data2 = np.array([None, None, [None, None], np.array([None, None], dtype=object)], dtype=object) - arr = pa.Array.from_pandas(data2) + arr = pa.array(data2) expected = pa.array([None, None, [None, None], [None, None]]) assert arr.equals(expected) @@ -760,7 +747,7 @@ def test_pytime_from_pandas(self): t1 = pa.time64('us') aobjs = np.array(pytimes + [None], dtype=object) - parr = pa.Array.from_pandas(aobjs) + parr = pa.array(aobjs) assert parr.type == t1 assert parr[0].as_py() == pytimes[0] assert parr[1].as_py() == pytimes[1] @@ -775,18 +762,18 @@ def test_pytime_from_pandas(self): arr = np.array([_pytime_to_micros(v) for v in pytimes], dtype='int64') - a1 = pa.Array.from_pandas(arr, type=pa.time64('us')) + a1 = pa.array(arr, type=pa.time64('us')) assert a1[0].as_py() == pytimes[0] - a2 = pa.Array.from_pandas(arr * 1000, type=pa.time64('ns')) + a2 = pa.array(arr * 1000, type=pa.time64('ns')) assert a2[0].as_py() == pytimes[0] - a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), - 
type=pa.time32('ms')) + a3 = pa.array((arr / 1000).astype('i4'), + type=pa.time32('ms')) assert a3[0].as_py() == pytimes[0].replace(microsecond=1000) - a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), - type=pa.time32('s')) + a4 = pa.array((arr / 1000000).astype('i4'), + type=pa.time32('s')) assert a4[0].as_py() == pytimes[0].replace(microsecond=0) def test_arrow_time_to_pandas(self): @@ -809,14 +796,14 @@ def test_arrow_time_to_pandas(self): null_mask = np.array([False, False, True], dtype=bool) - a1 = pa.Array.from_pandas(arr, mask=null_mask, type=pa.time64('us')) - a2 = pa.Array.from_pandas(arr * 1000, mask=null_mask, - type=pa.time64('ns')) + a1 = pa.array(arr, mask=null_mask, type=pa.time64('us')) + a2 = pa.array(arr * 1000, mask=null_mask, + type=pa.time64('ns')) - a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), mask=null_mask, - type=pa.time32('ms')) - a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), mask=null_mask, - type=pa.time32('s')) + a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask, + type=pa.time32('ms')) + a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask, + type=pa.time32('s')) names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]'] batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names) @@ -841,8 +828,8 @@ def test_arrow_time_to_pandas(self): tm.assert_frame_equal(df, expected_df) - def _check_numpy_array_roundtrip(self, np_array): - arr = pa.Array.from_pandas(np_array) + def _check_array_from_pandas_roundtrip(self, np_array): + arr = pa.array(np_array, from_pandas=True) result = arr.to_pandas() npt.assert_array_equal(result, np_array) @@ -853,7 +840,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - self._check_numpy_array_roundtrip(datetime64_ns) + self._check_array_from_pandas_roundtrip(datetime64_ns) datetime64_us = np.array([ '2007-07-13T01:23:34.123456', @@ -861,7 +848,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432539', '2010-08-13T05:46:57.437699'], dtype='datetime64[us]') - self._check_numpy_array_roundtrip(datetime64_us) + self._check_array_from_pandas_roundtrip(datetime64_us) datetime64_ms = np.array([ '2007-07-13T01:23:34.123', @@ -869,7 +856,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - self._check_numpy_array_roundtrip(datetime64_ms) + self._check_array_from_pandas_roundtrip(datetime64_ms) datetime64_s = np.array([ '2007-07-13T01:23:34', @@ -877,7 +864,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56', '2010-08-13T05:46:57'], dtype='datetime64[s]') - self._check_numpy_array_roundtrip(datetime64_s) + self._check_array_from_pandas_roundtrip(datetime64_s) datetime64_d = np.array([ '2007-07-13', @@ -885,11 +872,11 @@ def test_numpy_datetime64_columns(self): '2006-01-15', '2010-08-19'], dtype='datetime64[D]') - self._check_numpy_array_roundtrip(datetime64_d) + self._check_array_from_pandas_roundtrip(datetime64_d) def test_all_nones(self): def _check_series(s): - converted = pa.Array.from_pandas(s) + converted = pa.array(s) assert isinstance(converted, pa.NullArray) assert len(converted) == 3 assert converted.null_count == 3 diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index eb23894f480..b0593fe885d 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -457,8 +457,26 @@ def test_column_of_arrays(tmpdir): @parquet def 
test_coerce_timestamps(tmpdir): + from collections import OrderedDict # ARROW-622 - df, schema = dataframe_with_arrays() + arrays = OrderedDict() + fields = [pa.field('datetime64', + pa.list_(pa.timestamp('ms')))] + arrays['datetime64'] = [ + np.array(['2007-07-13T01:23:34.123456789', + None, + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ms]'), + None, + None, + np.array(['2007-07-13T02', + None, + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ms]'), + ] + + df = pd.DataFrame(arrays) + schema = pa.schema(fields) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df, schema=schema) @@ -497,41 +515,41 @@ def test_column_of_lists(tmpdir): def test_date_time_types(): t1 = pa.date32() data1 = np.array([17259, 17260, 17261], dtype='int32') - a1 = pa.Array.from_pandas(data1, type=t1) + a1 = pa.array(data1, type=t1) t2 = pa.date64() data2 = data1.astype('int64') * 86400000 - a2 = pa.Array.from_pandas(data2, type=t2) + a2 = pa.array(data2, type=t2) t3 = pa.timestamp('us') start = pd.Timestamp('2000-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') - a3 = pa.Array.from_pandas(data3, type=t3) + a3 = pa.array(data3, type=t3) t4 = pa.time32('ms') data4 = np.arange(3, dtype='i4') - a4 = pa.Array.from_pandas(data4, type=t4) + a4 = pa.array(data4, type=t4) t5 = pa.time64('us') - a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5) + a5 = pa.array(data4.astype('int64'), type=t5) t6 = pa.time32('s') - a6 = pa.Array.from_pandas(data4, type=t6) + a6 = pa.array(data4, type=t6) ex_t6 = pa.time32('ms') - ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6) + ex_a6 = pa.array(data4 * 1000, type=ex_t6) t7 = pa.timestamp('ns') start = pd.Timestamp('2001-01-01').value data7 = np.array([start, start + 1000, start + 2000], dtype='int64') - a7 = pa.Array.from_pandas(data7, type=t7) + a7 = pa.array(data7, type=t7) t7_us = pa.timestamp('us') start = pd.Timestamp('2001-01-01').value data7_us = np.array([start, start + 1000, start + 2000], dtype='int64') // 1000 - a7_us = pa.Array.from_pandas(data7_us, type=t7_us) + a7_us = pa.array(data7_us, type=t7_us) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], ['date32', 'date64', 'timestamp[us]', @@ -575,7 +593,7 @@ def _assert_unsupported(array): _write_table(table, buf, version="2.0") t7 = pa.time64('ns') - a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7) + a7 = pa.array(data4.astype('int64'), type=t7) _assert_unsupported(a7) @@ -1295,7 +1313,7 @@ def test_large_table_int32_overflow(): arr = np.ones(size, dtype='uint8') - parr = pa.Array.from_pandas(arr, type=pa.uint8()) + parr = pa.array(arr, type=pa.uint8()) table = pa.Table.from_arrays([parr], names=['one']) f = io.BytesIO() diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 4bb6a5af7dc..c77be98054c 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -69,6 +69,56 @@ def test_type_list(): assert str(l2) == 'list' +def test_type_comparisons(): + val = pa.int32() + assert val == pa.int32() + assert val == 'int32' + + with pytest.raises(TypeError): + val == 5 + + +def test_type_for_alias(): + cases = [ + ('i1', pa.int8()), + ('int8', pa.int8()), + ('i2', pa.int16()), + ('int16', pa.int16()), + ('i4', pa.int32()), + ('int32', pa.int32()), + ('i8', pa.int64()), + ('int64', pa.int64()), + ('u1', pa.uint8()), + ('uint8', pa.uint8()), + ('u2', pa.uint16()), + ('uint16', pa.uint16()), + ('u4', pa.uint32()), + ('uint32', pa.uint32()), + ('u8', 
pa.uint64()), + ('uint64', pa.uint64()), + ('f4', pa.float32()), + ('float32', pa.float32()), + ('f8', pa.float64()), + ('float64', pa.float64()), + ('date32', pa.date32()), + ('date64', pa.date64()), + ('string', pa.string()), + ('str', pa.string()), + ('binary', pa.binary()), + ('time32[s]', pa.time32('s')), + ('time32[ms]', pa.time32('ms')), + ('time64[us]', pa.time64('us')), + ('time64[ns]', pa.time64('ns')), + ('timestamp[s]', pa.timestamp('s')), + ('timestamp[ms]', pa.timestamp('ms')), + ('timestamp[us]', pa.timestamp('us')), + ('timestamp[ns]', pa.timestamp('ns')), + ] + + for val, expected in cases: + assert pa.type_for_alias(val) == expected + + def test_type_string(): t = pa.string() assert str(t) == 'string' diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index b298e740250..316e09a6efd 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -72,11 +72,19 @@ cdef class DataType: def __repr__(self): return '{0.__class__.__name__}({0})'.format(self) - def __richcmp__(DataType self, DataType other, int op): + def __richcmp__(DataType self, object other, int op): + cdef DataType other_type + if not isinstance(other, DataType): + if not isinstance(other, six.string_types): + raise TypeError(other) + other_type = type_for_alias(other) + else: + other_type = other + if op == cp.Py_EQ: - return self.type.Equals(deref(other.type)) + return self.type.Equals(deref(other_type.type)) elif op == cp.Py_NE: - return not self.type.Equals(deref(other.type)) + return not self.type.Equals(deref(other_type.type)) else: raise TypeError('Invalid comparison') @@ -922,6 +930,64 @@ def struct(fields): return pyarrow_wrap_data_type(struct_type) +cdef dict _type_aliases = { + 'null': null, + 'i1': int8, + 'int8': int8, + 'i2': int16, + 'int16': int16, + 'i4': int32, + 'int32': int32, + 'i8': int64, + 'int64': int64, + 'u1': uint8, + 'uint8': uint8, + 'u2': uint16, + 'uint16': uint16, + 'u4': uint32, + 'uint32': uint32, + 'u8': uint64, + 'uint64': uint64, + 'f4': float32, + 'float32': float32, + 'f8': float64, + 'float64': float64, + 'string': string, + 'str': string, + 'utf8': string, + 'binary': binary, + 'date32': date32, + 'date64': date64, + 'time32[s]': time32('s'), + 'time32[ms]': time32('ms'), + 'time64[us]': time64('us'), + 'time64[ns]': time64('ns'), + 'timestamp[s]': timestamp('s'), + 'timestamp[ms]': timestamp('ms'), + 'timestamp[us]': timestamp('us'), + 'timestamp[ns]': timestamp('ns'), +} + + +def type_for_alias(name): + """ + Return DataType given a string alias if one exists + + Returns + ------- + type : DataType + """ + name = name.lower() + try: + alias = _type_aliases[name] + except KeyError: + raise ValueError('No type alias for {0}'.format(name)) + + if isinstance(alias, DataType): + return alias + return alias() + + def schema(fields): """ Construct pyarrow.Schema from collection of fields