diff --git a/.gitmodules b/.gitmodules index 71722b21777..824b44e1100 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,7 @@ [submodule "cpp/submodules/parquet-testing"] path = cpp/submodules/parquet-testing url = https://github.com/apache/parquet-testing.git + +[submodule "cpp/thirdparty/abseil/abseil-cpp"] + path = cpp/thirdparty/abseil/abseil-cpp + url = https://github.com/abseil/abseil-cpp diff --git a/appveyor.yml b/appveyor.yml index 18ad9f5f56c..eae3e64fae7 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -75,6 +75,8 @@ install: - call generated_changes.bat # Decide to exit if current job doesn't exercise affected topics - call ci\appveyor-filter-changes.bat + # AppVeyor doesn't checkout submodules + - git submodule update --init --recursive before_build: - call ci\appveyor-install.bat diff --git a/ci/cpp-python-msvc-build.bat b/ci/cpp-python-msvc-build.bat index 3237e1880ac..642424b9e6e 100644 --- a/ci/cpp-python-msvc-build.bat +++ b/ci/cpp-python-msvc-build.bat @@ -17,6 +17,11 @@ @echo on +set CMAKE_COMMON_FLAGS= +if "%USE_CLCACHE%" == "true" ( + set CMAKE_COMMON_FLAGS=%CMAKE_COMMON_FLAGS% -DCMAKE_CXX_COMPILER=clcache +) + if "%JOB%" == "Static_Crt_Build" ( @rem Since we link the CRT statically, we should also disable building @rem the Arrow shared library to link the tests statically, otherwise @@ -26,7 +31,7 @@ if "%JOB%" == "Static_Crt_Build" ( mkdir cpp\build-debug pushd cpp\build-debug - cmake -G "%GENERATOR%" ^ + cmake -G "%GENERATOR%" %CMAKE_COMMON_FLAGS% ^ -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ @@ -41,7 +46,7 @@ if "%JOB%" == "Static_Crt_Build" ( mkdir cpp\build-release pushd cpp\build-release - cmake -G "%GENERATOR%" ^ + cmake -G "%GENERATOR%" %CMAKE_COMMON_FLAGS% ^ -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ @@ -65,7 +70,7 @@ if "%JOB%" == "Build_Debug" ( mkdir cpp\build-debug pushd cpp\build-debug - cmake -G "%GENERATOR%" ^ + cmake -G "%GENERATOR%" %CMAKE_COMMON_FLAGS% ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ @@ -119,7 +124,7 @@ set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data mkdir cpp\build pushd cpp\build -cmake -G "%GENERATOR%" ^ +cmake -G "%GENERATOR%" %CMAKE_COMMON_FLAGS% ^ -DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ diff --git a/ci/detect-changes.py b/ci/detect-changes.py index df041b921c8..741a6348624 100644 --- a/ci/detect-changes.py +++ b/ci/detect-changes.py @@ -189,7 +189,11 @@ def run_from_travis(): else: # Test affected topics affected_files = list_travis_affected_files() - perr("Affected files:", affected_files) + if len(affected_files) > 100: + perr("Affected files: too many to list ({0})" + .format(len(affected_files))) + else: + perr("Affected files:", affected_files) affected = get_affected_topics(affected_files) assert set(affected) <= set(ALL_TOPICS), affected diff --git a/ci/travis_script_manylinux.sh b/ci/travis_script_manylinux.sh index 386a68488a1..08f2e6ef937 100755 --- a/ci/travis_script_manylinux.sh +++ b/ci/travis_script_manylinux.sh @@ -21,7 +21,7 @@ set -ex pushd python/manylinux1 -git clone ../../ arrow +git clone --recurse-submodules ../../ arrow docker build -t arrow-base-x86_64 -f Dockerfile-x86_64 . docker run --shm-size=2g --rm -e PYARROW_PARALLEL=3 -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh diff --git a/cpp/.gitignore b/cpp/.gitignore index ec846b35ba6..04dce373cfd 100644 --- a/cpp/.gitignore +++ b/cpp/.gitignore @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -thirdparty/ CMakeFiles/ CMakeCache.txt CTestTestfile.cmake @@ -24,6 +23,8 @@ cmake_install.cmake build/ *-build/ Testing/ +*_ep-build +*_ep-install ######################################### # Editor temporary/working/backup files # diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35114856f1a..261cdfce220 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -261,9 +261,9 @@ Pass multiple labels by dividing with semicolons") ON) if (MSVC) - option(ARROW_USE_CLCACHE - "Use clcache if available" - ON) +# option(ARROW_USE_CLCACHE +# "Use clcache if available" +# ON) set(BROTLI_MSVC_STATIC_LIB_SUFFIX "-static" CACHE STRING "Brotli static lib suffix used on Windows with MSVC (default -static)") @@ -341,14 +341,15 @@ if (ARROW_TENSORFLOW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() -if (MSVC AND ARROW_USE_CLCACHE AND - (("${CMAKE_GENERATOR}" STREQUAL "NMake Makefiles") OR - ("${CMAKE_GENERATOR}" STREQUAL "Ninja"))) - find_program(CLCACHE_FOUND clcache) - if(CLCACHE_FOUND) - set(CMAKE_CXX_COMPILER ${CLCACHE_FOUND}) - endif(CLCACHE_FOUND) -endif() +# if (MSVC AND ARROW_USE_CLCACHE AND +# (NOT ${CMAKE_CXX_COMPILER} MATCHES "clcache") AND +# (("${CMAKE_GENERATOR}" STREQUAL "NMake Makefiles") OR +# ("${CMAKE_GENERATOR}" STREQUAL "Ninja"))) +# find_program(CLCACHE_FOUND clcache) +# if(CLCACHE_FOUND) +# set(CMAKE_CXX_COMPILER ${CLCACHE_FOUND}) +# endif(CLCACHE_FOUND) +# endif() ############################################################ # Compiler flags @@ -609,6 +610,8 @@ set(ARROW_LINK_LIBS) # Libraries to link statically with libarrow.so set(ARROW_STATIC_LINK_LIBS) +set(ARROW_STATIC_LINK_LIBS absl::strings ${ARROW_STATIC_LINK_LIBS}) + if (ARROW_WITH_BROTLI) SET(ARROW_STATIC_LINK_LIBS brotli_dec diff --git a/cpp/README.md b/cpp/README.md index 33670d25f22..a36d7e45d42 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -45,7 +45,7 @@ sudo apt-get install cmake \ On macOS, you can use [Homebrew][1]: ```shell -git clone https://github.com/apache/arrow.git +git clone --recurse-submodules https://github.com/apache/arrow.git cd arrow brew update && brew bundle --file=c_glib/Brewfile ``` @@ -56,7 +56,7 @@ If you are developing on Windows, see the [Windows developer guide][2]. Simple debug build: - git clone https://github.com/apache/arrow.git + git clone --recurse-submodules https://github.com/apache/arrow.git cd arrow/cpp mkdir debug cd debug @@ -65,7 +65,7 @@ Simple debug build: Simple release build: - git clone https://github.com/apache/arrow.git + git clone --recurse-submodules https://github.com/apache/arrow.git cd arrow/cpp mkdir release cd release @@ -398,4 +398,4 @@ both of these options would be used rarely. Current known uses-cases when they a [2]: https://github.com/apache/arrow/blob/master/cpp/apidoc/Windows.md [3]: https://google.github.io/styleguide/cppguide.html [4]: https://github.com/include-what-you-use/include-what-you-use -[5]: https://github.com/apache/arrow/blob/master/cpp/thirdparty/README.md \ No newline at end of file +[5]: https://github.com/apache/arrow/blob/master/cpp/thirdparty/README.md diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index e25f954d296..adf3fab23be 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -273,10 +273,13 @@ endif() if (NOT MSVC) # Set -fPIC on all external projects - set(EP_CXX_FLAGS "${EP_CXX_FLAGS} -fPIC") - set(EP_C_FLAGS "${EP_C_FLAGS} -fPIC") + set(EP_EXTRA_CXX_FLAGS "-fPIC") + set(EP_EXTRA_C_FLAGS "-fPIC") endif() +set(EP_CXX_FLAGS "${EP_CXX_FLAGS} ${EP_EXTRA_CXX_FLAGS}") +set(EP_C_FLAGS "${EP_C_FLAGS} ${EP_EXTRA_C_FLAGS}") + # Ensure that a default make is set if ("${MAKE}" STREQUAL "") if (NOT MSVC) @@ -294,6 +297,37 @@ else() message(STATUS "Found pthread: ${PTHREAD_LIBRARY}") endif() + +# ---------------------------------------------------------------------- +# Add abseil dependency (mandatory) + +# We only use a private abseil-cpp as we want to control its version. + +# XXX FetchContent needs CMake 3.11+, use a git submodule in the meantime +# FetchContent_Declare( +# abseil +# URL "https://github.com/abseil/abseil-cpp/archive/${ABSEIL_VERSION}.tar.gz" +# ) +# FetchContent_Populate(abseil) + +set(abseil_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/abseil/abseil-cpp) +set(abseil_BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/abseil_ep-build) + +set(ABSL_COMPILE_CXXFLAGS ${EP_EXTRA_CXX_FLAGS}) + +# add_subdirectory() is the "natural" way to incorporate Abseil. +# Another way is to use ExternalProject_Add, see example at +# https://github.com/googlecartographer/cartographer/blob/master/cmake/modules/FindAbseil.cmake +add_subdirectory(${abseil_SOURCE_DIR} ${abseil_BINARY_DIR}) + +# Abseil currently doesn't support installing (https://github.com/abseil/abseil-cpp/issues/111), +# still we need to install the Abseil headers since we expose Abseil types +# in our public APIs. +install(DIRECTORY ${abseil_SOURCE_DIR}/ DESTINATION include + FILES_MATCHING PATTERN "*.h") + +include_directories(SYSTEM ${abseil_SOURCE_DIR}) + # ---------------------------------------------------------------------- # Add Boost dependencies (code adapted from Apache Kudu (incubating)) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index bec5a99947d..db4bd89dacd 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1189,7 +1189,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { result_->GetValue(i, &length); ASSERT_EQ(pos, result_->value_offset(i)); ASSERT_EQ(static_cast(strings[i % N].size()), length); - ASSERT_EQ(strings[i % N], result_->GetString(i)); + ASSERT_EQ(strings[i % N], result_->GetView(i)); pos += length; } @@ -1220,7 +1220,7 @@ TEST_F(TestStringBuilder, TestAppendVector) { result_->GetValue(i, &length); ASSERT_EQ(pos, result_->value_offset(i)); ASSERT_EQ(static_cast(strings[i % N].size()), length); - ASSERT_EQ(strings[i % N], result_->GetString(i)); + ASSERT_EQ(strings[i % N], result_->GetView(i)); pos += length; } else { @@ -1254,7 +1254,7 @@ TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) { result_->GetValue(i, &length); ASSERT_EQ(pos, result_->value_offset(i)); ASSERT_EQ(static_cast(strlen(string)), length); - ASSERT_EQ(strings[i % N], result_->GetString(i)); + ASSERT_EQ(strings[i % N], result_->GetView(i)); pos += length; } else { @@ -1286,7 +1286,7 @@ TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) { result_->GetValue(i, &length); ASSERT_EQ(pos, result_->value_offset(i)); ASSERT_EQ(static_cast(strlen(strings[i % N])), length); - ASSERT_EQ(strings[i % N], result_->GetString(i)); + ASSERT_EQ(strings[i % N], result_->GetView(i)); pos += length; } else { @@ -1375,9 +1375,8 @@ TEST_F(TestBinaryArray, TestGetValue) { if (valid_bytes_[i] == 0) { ASSERT_TRUE(strings_->IsNull(i)); } else { - int32_t len = -1; - const uint8_t* bytes = strings_->GetValue(i, &len); - ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len)); + ASSERT_FALSE(strings_->IsNull(i)); + ASSERT_EQ(strings_->GetView(i), expected_[i]); } } } @@ -1387,9 +1386,8 @@ TEST_F(TestBinaryArray, TestNullValuesInitialized) { if (valid_bytes_[i] == 0) { ASSERT_TRUE(strings_->IsNull(i)); } else { - int32_t len = -1; - const uint8_t* bytes = strings_->GetValue(i, &len); - ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len)); + ASSERT_FALSE(strings_->IsNull(i)); + ASSERT_EQ(strings_->GetView(i), expected_[i]); } } TestInitialized(*strings_); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index a2d1d5ecf2f..1fe9ebc76da 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -25,6 +25,9 @@ #include #include +#include +#include + #include "arrow/buffer.h" #include "arrow/type.h" #include "arrow/type_fwd.h" @@ -484,27 +487,33 @@ class ARROW_EXPORT BinaryArray : public FlatArray { const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = 0, int64_t offset = 0); - // Return the pointer to the given elements bytes - // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy - // pointer + offset + /// Return the pointer to the given elements bytes + // XXX should GetValue(int64_t i) return a string_view? const uint8_t* GetValue(int64_t i, int32_t* out_length) const { // Account for base offset i += data_->offset; - const int32_t pos = raw_value_offsets_[i]; *out_length = raw_value_offsets_[i + 1] - pos; return raw_data_ + pos; } + /// \brief Get binary value as a absl::string_view + /// + /// \param i the value index + /// \return the view over the selected value + absl::string_view GetView(int64_t i) const { + // Account for base offset + i += data_->offset; + const int32_t pos = raw_value_offsets_[i]; + return absl::string_view(reinterpret_cast(raw_data_ + pos), + raw_value_offsets_[i + 1] - pos); + } + /// \brief Get binary value as a std::string /// /// \param i the value index /// \return the value copied into a std::string - std::string GetString(int64_t i) const { - int32_t length = 0; - const uint8_t* bytes = GetValue(i, &length); - return std::string(reinterpret_cast(bytes), static_cast(length)); - } + std::string GetString(int64_t i) const { return std::string(GetView(i)); } /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[1]; } @@ -550,14 +559,6 @@ class ARROW_EXPORT StringArray : public BinaryArray { const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = 0, int64_t offset = 0); - - // Construct a std::string - // TODO: std::bad_alloc possibility - std::string GetString(int64_t i) const { - int32_t nchars; - const uint8_t* str = GetValue(i, &nchars); - return std::string(reinterpret_cast(str), nchars); - } }; // ---------------------------------------------------------------------- @@ -577,6 +578,10 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const uint8_t* GetValue(int64_t i) const; const uint8_t* Value(int64_t i) const { return GetValue(i); } + absl::string_view GetView(int64_t i) const { + return absl::string_view(reinterpret_cast(GetValue(i)), byte_width()); + } + int32_t byte_width() const { return byte_width_; } const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 677f2fd256c..8bbfe8c567a 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -760,7 +760,6 @@ Status BooleanBuilder::AppendValues(const std::vector& values) { // DictionaryBuilder using internal::DictionaryScalar; -using internal::WrappedBinary; namespace { @@ -809,32 +808,28 @@ struct DictionaryHashHelper> { using Scalar = typename DictionaryScalar::type; static Scalar GetDictionaryValue(const Builder& builder, int64_t index) { - int32_t v_length; - const uint8_t* v_ptr = builder.GetValue(index, &v_length); - return WrappedBinary(v_ptr, v_length); + return builder.GetView(index); } static int64_t HashValue(const Scalar& value, int byte_width) { - return HashUtil::Hash(value.ptr_, value.length_, 0); + return HashUtil::Hash(value.data(), static_cast(value.length()), + 0); } static bool SlotDifferent(const Builder& builder, int64_t index, const Scalar& value) { - int32_t other_length; - const uint8_t* other_ptr = builder.GetValue(index, &other_length); - return value.length_ != other_length || - memcmp(value.ptr_, other_ptr, other_length) != 0; + const Scalar other = GetDictionaryValue(builder, index); + return value.length() != other.length() || + memcmp(value.data(), other.data(), other.length()) != 0; } static Status AppendValue(Builder& builder, const Scalar& value) { - return builder.Append(value.ptr_, value.length_); + return builder.Append(value); } static Status AppendArray(Builder& builder, const Array& in_array) { const auto& array = checked_cast(in_array); for (uint64_t index = 0, limit = array.length(); index < limit; ++index) { - int32_t length; - const uint8_t* ptr = array.GetValue(index, &length); - RETURN_NOT_OK(builder.Append(ptr, length)); + RETURN_NOT_OK(builder.Append(array.GetView(index))); } return Status::OK(); } @@ -1033,12 +1028,12 @@ Status DictionaryBuilder::AppendArray(const Array& array) { return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); } - const auto& numeric_array = checked_cast(array); + const auto& typed_array = checked_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { RETURN_NOT_OK(AppendNull()); } else { - RETURN_NOT_OK(Append(numeric_array.Value(i))); + RETURN_NOT_OK(Append(typed_array.GetValue(i))); } } return Status::OK(); @@ -1087,21 +1082,20 @@ Status DictionaryBuilder::FinishInternal(std::shared_ptr* o // StringType and BinaryType specializations // -#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ - \ - template <> \ - Status DictionaryBuilder::AppendArray(const Array& array) { \ - const BinaryArray& binary_array = checked_cast(array); \ - WrappedBinary value(nullptr, 0); \ - for (int64_t i = 0; i < array.length(); i++) { \ - if (array.IsNull(i)) { \ - RETURN_NOT_OK(AppendNull()); \ - } else { \ - value.ptr_ = binary_array.GetValue(i, &value.length_); \ - RETURN_NOT_OK(Append(value)); \ - } \ - } \ - return Status::OK(); \ +#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ + \ + template <> \ + Status DictionaryBuilder::AppendArray(const Array& array) { \ + using ArrayType = typename TypeTraits::ArrayType; \ + const ArrayType& binary_array = checked_cast(array); \ + for (int64_t i = 0; i < array.length(); i++) { \ + if (array.IsNull(i)) { \ + RETURN_NOT_OK(AppendNull()); \ + } else { \ + RETURN_NOT_OK(Append(binary_array.GetView(i))); \ + } \ + } \ + return Status::OK(); \ } BINARY_DICTIONARY_SPECIALIZATIONS(StringType); @@ -1314,6 +1308,19 @@ const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { return value_data_builder_.data() + offset; } +absl::string_view BinaryBuilder::GetView(int64_t i) const { + const int32_t* offsets = offsets_builder_.data(); + int32_t offset = offsets[i]; + int32_t value_length; + if (i == (length_ - 1)) { + value_length = static_cast(value_data_builder_.length()) - offset; + } else { + value_length = offsets[i + 1] - offset; + } + return absl::string_view( + reinterpret_cast(value_data_builder_.data() + offset), value_length); +} + StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} Status StringBuilder::AppendValues(const std::vector& values, @@ -1455,6 +1462,12 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { return data_ptr + i * byte_width_; } +absl::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { + const uint8_t* data_ptr = byte_builder_.data(); + return absl::string_view(reinterpret_cast(data_ptr + i * byte_width_), + byte_width_); +} + // ---------------------------------------------------------------------- // Struct diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 96741974f4a..a2d057a33f4 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -27,6 +27,9 @@ #include #include +#include +#include + #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" @@ -846,8 +849,8 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { return Append(reinterpret_cast(value), length); } - Status Append(const std::string& value) { - return Append(value.c_str(), static_cast(value.size())); + Status Append(absl::string_view value) { + return Append(value.data(), static_cast(value.size())); } Status AppendNull(); @@ -871,6 +874,11 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { /// This pointer becomes invalid on the next modifying operation. const uint8_t* GetValue(int64_t i, int32_t* out_length) const; + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + absl::string_view GetView(int64_t i) const; + protected: TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; @@ -953,6 +961,11 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { /// This pointer becomes invalid on the next modifying operation. const uint8_t* GetValue(int64_t i) const; + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + absl::string_view GetView(int64_t i) const; + protected: int32_t byte_width_; BufferBuilder byte_builder_; @@ -1024,10 +1037,14 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { namespace internal { -// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here. +// TODO deprecate this struct WrappedBinary { WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), length_(length) {} + operator absl::string_view() const { + return absl::string_view(reinterpret_cast(ptr_), length_); + } + const uint8_t* ptr_; int32_t length_; }; @@ -1039,12 +1056,12 @@ struct DictionaryScalar { template <> struct DictionaryScalar { - using type = WrappedBinary; + using type = absl::string_view; }; template <> struct DictionaryScalar { - using type = WrappedBinary; + using type = absl::string_view; }; template <> @@ -1154,17 +1171,11 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder(value), length); } Status Append(const char* value, int32_t length) { - return Append( - internal::WrappedBinary(reinterpret_cast(value), length)); - } - - Status Append(const std::string& value) { - return Append(internal::WrappedBinary(reinterpret_cast(value.c_str()), - static_cast(value.size()))); + return Append(absl::string_view(value, length)); } }; @@ -1175,17 +1186,11 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder(value), length); } Status Append(const char* value, int32_t length) { - return Append( - internal::WrappedBinary(reinterpret_cast(value), length)); - } - - Status Append(const std::string& value) { - return Append(internal::WrappedBinary(reinterpret_cast(value.c_str()), - static_cast(value.size()))); + return Append(absl::string_view(value, length)); } }; diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 639f3f85c47..f41f30fa110 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -660,9 +660,7 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { - int32_t length; - const uint8_t* value = dictionary.GetValue(in[i], &length); - RETURN_NOT_OK(binary_builder->Append(value, length)); + RETURN_NOT_OK(binary_builder->Append(dictionary.GetView(in[i]))); } else { RETURN_NOT_OK(binary_builder->AppendNull()); } @@ -670,9 +668,7 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, } } else { for (int64_t i = 0; i < indices.length(); ++i) { - int32_t length; - const uint8_t* value = dictionary.GetValue(in[i], &length); - RETURN_NOT_OK(binary_builder->Append(value, length)); + RETURN_NOT_OK(binary_builder->Append(dictionary.GetView(in[i]))); } } @@ -804,10 +800,8 @@ struct CastFunctor> { continue; } - int32_t length = -1; - auto str = input_array.GetValue(i, &length); - if (!converter(reinterpret_cast(str), static_cast(length), - out_data)) { + auto str = input_array.GetView(i); + if (!converter(str.data(), str.length(), out_data)) { std::stringstream ss; ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); @@ -836,11 +830,9 @@ struct CastFunctor(str), static_cast(length), - &value)) { + auto str = input_array.GetView(i); + if (!converter(str.data(), str.length(), &value)) { std::stringstream ss; ss << "Failed to cast String '" << input_array.GetString(i) << "' into " << output->type->ToString(); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 141b0a447c6..2874245daae 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -150,11 +150,7 @@ class ArrayPrinter : public PrettyPrinter { template inline typename std::enable_if::value, Status>::type WriteDataValues(const T& array) { - WriteValues(array, [&](int64_t i) { - int32_t length; - const char* buf = reinterpret_cast(array.GetValue(i, &length)); - (*sink_) << "\"" << std::string(buf, length) << "\""; - }); + WriteValues(array, [&](int64_t i) { (*sink_) << "\"" << array.GetView(i) << "\""; }); return Status::OK(); } @@ -162,11 +158,7 @@ class ArrayPrinter : public PrettyPrinter { template inline typename std::enable_if::value, Status>::type WriteDataValues(const T& array) { - WriteValues(array, [&](int64_t i) { - int32_t length; - const uint8_t* buf = array.GetValue(i, &length); - (*sink_) << HexEncode(buf, length); - }); + WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); }); return Status::OK(); } @@ -174,9 +166,7 @@ class ArrayPrinter : public PrettyPrinter { inline typename std::enable_if::value, Status>::type WriteDataValues(const T& array) { - int32_t width = array.byte_width(); - WriteValues(array, - [&](int64_t i) { (*sink_) << HexEncode(array.GetValue(i), width); }); + WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); }); return Status::OK(); } diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 0e55ee498bf..a0f46f5f66c 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -70,7 +70,7 @@ ADD_ARROW_LIB(arrow_python OUTPUTS ARROW_PYTHON_LIBRARIES SHARED_LINK_FLAGS "" SHARED_LINK_LIBS ${ARROW_PYTHON_SHARED_LINK_LIBS} - STATIC_LINK_LIBS "${PYTHON_OTHER_LIBS}" + STATIC_LINK_LIBS ${PYTHON_OTHER_LIBS} EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" ) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 6a142cd7e35..08835bc2241 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -72,22 +72,22 @@ struct WrapBytes {}; template <> struct WrapBytes { - static inline PyObject* Wrap(const uint8_t* data, int64_t length) { - return PyUnicode_FromStringAndSize(reinterpret_cast(data), length); + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyUnicode_FromStringAndSize(data, length); } }; template <> struct WrapBytes { - static inline PyObject* Wrap(const uint8_t* data, int64_t length) { - return PyBytes_FromStringAndSize(reinterpret_cast(data), length); + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyBytes_FromStringAndSize(data, length); } }; template <> struct WrapBytes { - static inline PyObject* Wrap(const uint8_t* data, int64_t length) { - return PyBytes_FromStringAndSize(reinterpret_cast(data), length); + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyBytes_FromStringAndSize(data, length); } }; @@ -396,21 +396,18 @@ inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); - const uint8_t* data_ptr; - int32_t length; const bool has_nulls = data.null_count() > 0; for (int64_t i = 0; i < arr.length(); ++i) { if (has_nulls && arr.IsNull(i)) { Py_INCREF(Py_None); *out_values = Py_None; } else { - data_ptr = arr.GetValue(i, &length); - *out_values = WrapBytes::Wrap(data_ptr, length); + auto view = arr.GetView(i); + *out_values = WrapBytes::Wrap(view.data(), view.length()); if (*out_values == nullptr) { PyErr_Clear(); std::stringstream ss; - ss << "Wrapping " - << std::string(reinterpret_cast(data_ptr), length) << " failed"; + ss << "Wrapping " << view << " failed"; return Status::UnknownError(ss.str()); } } @@ -436,37 +433,6 @@ inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -inline Status ConvertFixedSizeBinary(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { - PyAcquireGIL lock; - for (int c = 0; c < data.num_chunks(); c++) { - auto arr = checked_cast(data.chunk(c).get()); - - const uint8_t* data_ptr; - int32_t length = - std::dynamic_pointer_cast(arr->type())->byte_width(); - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr->length(); ++i) { - if (has_nulls && arr->IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - data_ptr = arr->GetValue(i); - *out_values = WrapBytes::Wrap(data_ptr, length); - if (*out_values == nullptr) { - PyErr_Clear(); - std::stringstream ss; - ss << "Wrapping " - << std::string(reinterpret_cast(data_ptr), length) << " failed"; - return Status::UnknownError(ss.str()); - } - } - ++out_values; - } - } - return Status::OK(); -} - inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; @@ -763,7 +729,7 @@ class ObjectBlock : public PandasBlock { } else if (type == Type::STRING) { RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::FIXED_SIZE_BINARY) { - RETURN_NOT_OK(ConvertFixedSizeBinary(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::DATE32) { RETURN_NOT_OK(ConvertDates(options_, data, out_buffer)); } else if (type == Type::DATE64) { @@ -1809,20 +1775,20 @@ class ArrowDeserializer { return func(options_, data_, out_values); } - // UTF8 strings + // Strings and binary template typename std::enable_if::value, Status>::type Visit( const Type& type) { return VisitObjects(ConvertBinaryLike); } - Status Visit(const NullType& type) { return VisitObjects(ConvertNulls); } - // Fixed length binary strings Status Visit(const FixedSizeBinaryType& type) { - return VisitObjects(ConvertFixedSizeBinary); + return VisitObjects(ConvertBinaryLike); } + Status Visit(const NullType& type) { return VisitObjects(ConvertNulls); } + Status Visit(const Decimal128Type& type) { return VisitObjects(ConvertDecimals); } Status Visit(const Time32Type& type) { return VisitObjects(ConvertTimes); } diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc index 3bf23d75c71..ee53c2f544e 100644 --- a/cpp/src/arrow/python/deserialize.cc +++ b/cpp/src/arrow/python/deserialize.cc @@ -125,15 +125,13 @@ Status GetValue(PyObject* context, const UnionArray& parent, const Array& arr, return Status::OK(); } case Type::BINARY: { - int32_t nchars; - const uint8_t* str = checked_cast(arr).GetValue(index, &nchars); - *result = PyBytes_FromStringAndSize(reinterpret_cast(str), nchars); + auto view = checked_cast(arr).GetView(index); + *result = PyBytes_FromStringAndSize(view.data(), view.length()); return CheckPyError(); } case Type::STRING: { - int32_t nchars; - const uint8_t* str = checked_cast(arr).GetValue(index, &nchars); - *result = PyUnicode_FromStringAndSize(reinterpret_cast(str), nchars); + auto view = checked_cast(arr).GetView(index); + *result = PyUnicode_FromStringAndSize(view.data(), view.length()); return CheckPyError(); } case Type::HALF_FLOAT: { diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index a2af87caf59..d7b8bacd376 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -21,16 +21,18 @@ #include #include +#include + #include "arrow/status.h" namespace arrow { static const char* kAsciiTable = "0123456789ABCDEF"; -static inline std::string HexEncode(const uint8_t* data, int32_t length) { +static inline std::string HexEncode(const char* data, size_t length) { std::string hex_string; hex_string.reserve(length * 2); - for (int32_t j = 0; j < length; ++j) { + for (size_t j = 0; j < length; ++j) { // Convert to 2 base16 digits hex_string.push_back(kAsciiTable[data[j] >> 4]); hex_string.push_back(kAsciiTable[data[j] & 15]); @@ -38,6 +40,14 @@ static inline std::string HexEncode(const uint8_t* data, int32_t length) { return hex_string; } +static inline std::string HexEncode(const uint8_t* data, int32_t length) { + return HexEncode(reinterpret_cast(data), length); +} + +static inline std::string HexEncode(absl::string_view str) { + return HexEncode(str.data(), str.size()); +} + static inline Status ParseHexValue(const char* data, uint8_t* out) { char c1 = data[0]; char c2 = data[1]; diff --git a/cpp/thirdparty/abseil/README.md b/cpp/thirdparty/abseil/README.md new file mode 100644 index 00000000000..61dd4a7e4d9 --- /dev/null +++ b/cpp/thirdparty/abseil/README.md @@ -0,0 +1,26 @@ + + +This directory contains a vendored git commit of the abseil-cpp (*) library +as a git submodule. +(*) https://github.com/abseil/abseil-cpp + +To bump the version: +1. git checkout the relevant version inside the gitsubmodule +2. update ABSEIL_VERSION in cpp/thirdparty/versions.txt diff --git a/cpp/thirdparty/abseil/abseil-cpp b/cpp/thirdparty/abseil/abseil-cpp new file mode 160000 index 00000000000..6c7de165d1c --- /dev/null +++ b/cpp/thirdparty/abseil/abseil-cpp @@ -0,0 +1 @@ +Subproject commit 6c7de165d1c82684359ccb630bb5f83263fa5ebc diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 381282bd801..09cc368df1f 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -17,20 +17,21 @@ # Toolchain library versions +ABSEIL_VERSION=20180600 BOOST_VERSION=1.67.0 -GTEST_VERSION=1.8.0 -GFLAGS_VERSION=v2.2.0 -GBENCHMARK_VERSION=v1.4.1 +BROTLI_VERSION=v0.6.0 FLATBUFFERS_VERSION=02a7807dd8d26f5668ffbbec0360dc107bbfabd5 -RAPIDJSON_VERSION=v1.1.0 +GBENCHMARK_VERSION=v1.4.1 +GFLAGS_VERSION=v2.2.0 +GLOG_VERSION=v0.3.5 +GRPC_VERSION=1.14.1 +GTEST_VERSION=1.8.0 JEMALLOC_VERSION=17c897976c60b0e6e4f4a365c751027244dada7a -SNAPPY_VERSION=1.1.3 -BROTLI_VERSION=v0.6.0 LZ4_VERSION=v1.7.5 -ZLIB_VERSION=1.2.8 -ZSTD_VERSION=v1.2.0 -PROTOBUF_VERSION=v3.6.1 -GRPC_VERSION=1.14.1 ORC_VERSION=1.5.1 +PROTOBUF_VERSION=v3.6.1 +RAPIDJSON_VERSION=v1.1.0 +SNAPPY_VERSION=1.1.3 THRIFT_VERSION=0.11.0 -GLOG_VERSION=v0.3.5 +ZLIB_VERSION=1.2.8 +ZSTD_VERSION=v1.2.0 diff --git a/dev/container/README.md b/dev/container/README.md index d8636e95d76..20201cd2db5 100644 --- a/dev/container/README.md +++ b/dev/container/README.md @@ -38,7 +38,7 @@ directory, `/io`. ``` $ mkdir -p io/arrow -$ git clone https://github.com/apache/arrow.git io/arrow +$ git clone --recurse-submodules https://github.com/apache/arrow.git io/arrow $ mkdir -p io/parquet-cpp $ git clone https://github.com/apache/parquet-cpp.git io/parquet-cpp ``` diff --git a/dev/tasks/nightlies.sample.yml b/dev/tasks/nightlies.sample.yml index b15fa9caa77..f43c3069b8e 100644 --- a/dev/tasks/nightlies.sample.yml +++ b/dev/tasks/nightlies.sample.yml @@ -66,7 +66,7 @@ script: # to build against a specific branch of a fork # git clone -b https://github.com//arrow - pushd .. - - git clone -b master https://github.com/apache/arrow + - git clone -b master --recurse-submodules https://github.com/apache/arrow # submit packaging tasks - | diff --git a/dev/tasks/python-wheels/travis.linux.yml b/dev/tasks/python-wheels/travis.linux.yml index 8ee82d44aca..4eec435eb39 100644 --- a/dev/tasks/python-wheels/travis.linux.yml +++ b/dev/tasks/python-wheels/travis.linux.yml @@ -41,7 +41,7 @@ script: - mkdir -p dist - pushd arrow/python/manylinux1 - - git clone ../../ arrow + - git clone --recurse-submodules ../../ arrow - docker build -t arrow-base-x86_64 -f Dockerfile-x86_64 . - docker run --shm-size=2g --rm -e SETUPTOOLS_SCM_PRETEND_VERSION=$PYARROW_VERSION diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8ec5b1a5b1e..f30de6abc05 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -300,6 +300,7 @@ endfunction() # Always bundle includes file(COPY ${ARROW_INCLUDE_DIR}/arrow DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include) +file(COPY ${ARROW_INCLUDE_DIR}/absl DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include) if (PYARROW_BUNDLE_ARROW_CPP) # arrow diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst index eefd9761ba4..a2e11cef126 100644 --- a/python/doc/source/development.rst +++ b/python/doc/source/development.rst @@ -60,7 +60,7 @@ First, let's clone the Arrow git repository: mkdir repos cd repos - git clone https://github.com/apache/arrow.git + git clone --recurse-submodules https://github.com/apache/arrow.git You should now see @@ -264,7 +264,7 @@ First, starting from fresh clones of Apache Arrow: .. code-block:: shell - git clone https://github.com/apache/arrow.git + git clone --recurse-submodules https://github.com/apache/arrow.git .. code-block:: shell diff --git a/python/manylinux1/README.md b/python/manylinux1/README.md index 2ec662d502e..8fe4112f2ab 100644 --- a/python/manylinux1/README.md +++ b/python/manylinux1/README.md @@ -33,7 +33,7 @@ for all supported Python versions and place them in the `dist` folder. ```bash # Create a clean copy of the arrow source tree -git clone ../../ arrow +git clone --recurse-submodules ../../ arrow # Build the native baseimage docker build -t arrow-base-x86_64 -f Dockerfile-x86_64 . # Build the python packages