From b7adbce0f4cb2ed3ca04680915091759e292e90b Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Tue, 14 Apr 2020 05:08:20 +0000 Subject: [PATCH 1/4] ARROW-8440: [C++] Refine SIMD header files - Include all necessary SIMD header files in a single header "simd.h". - Simplify architecture dependent CRC code in hash_util.h. - Remove SSEUtil namespace which contains some SSE constants. These codes are not used and I can't find proper place to hold them. - Remove sse_util.h and neon_util.h. - Remove ARROW_SIMD_LEVEL=NONE which duplicates ARROW_USE_SIMD=OFF. --- cpp/cmake_modules/DefineOptions.cmake | 1 - cpp/src/arrow/io/memory_benchmark.cc | 13 ++- cpp/src/arrow/json/rapidjson_defs.h | 3 - cpp/src/arrow/util/byte_stream_split.h | 6 +- cpp/src/arrow/util/hash_util.h | 68 ++++++------- cpp/src/arrow/util/{neon_util.h => simd.h} | 42 ++++---- cpp/src/arrow/util/sse_util.h | 108 --------------------- 7 files changed, 54 insertions(+), 187 deletions(-) rename cpp/src/arrow/util/{neon_util.h => simd.h} (65%) delete mode 100644 cpp/src/arrow/util/sse_util.h diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 533e2e45af4..234ad8f7dfc 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -101,7 +101,6 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option_string(ARROW_SIMD_LEVEL "SIMD compiler optimization level" "SSE4_2" # default to SSE4.2 - "NONE" "SSE4_2" "AVX2" "AVX512") diff --git a/cpp/src/arrow/io/memory_benchmark.cc b/cpp/src/arrow/io/memory_benchmark.cc index 7e6ba781a01..eefcdb3df0e 100644 --- a/cpp/src/arrow/io/memory_benchmark.cc +++ b/cpp/src/arrow/io/memory_benchmark.cc @@ -22,8 +22,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" -#include "arrow/util/neon_util.h" -#include "arrow/util/sse_util.h" +#include "arrow/util/simd.h" #include "benchmark/benchmark.h" @@ -45,7 +44,7 @@ using BufferPtr = std::shared_ptr; #ifdef ARROW_HAVE_SSE4_2 -#ifdef ARROW_AVX512 +#ifdef ARROW_HAVE_AVX512 using VectorType = __m512i; #define VectorSet _mm512_set1_epi32 @@ -59,7 +58,7 @@ using VectorType = __m512i; #else -#ifdef ARROW_AVX2 +#ifdef ARROW_HAVE_AVX2 using VectorType = __m256i; #define VectorSet _mm256_set1_epi32 @@ -71,7 +70,7 @@ using VectorType = __m256i; asm volatile("vmovntdqa %[src], %[dst]" : [ dst ] "=v"(DST) : [ src ] "m"(SRC) :) #define VectorStreamWrite _mm256_stream_si256 -#else // ARROW_AVX2 not set +#else // ARROW_HAVE_AVX2 not set using VectorType = __m128i; #define VectorSet _mm_set1_epi32 @@ -83,8 +82,8 @@ using VectorType = __m128i; asm volatile("movntdqa %[src], %[dst]" : [ dst ] "=x"(DST) : [ src ] "m"(SRC) :) #define VectorStreamWrite _mm_stream_si128 -#endif // ARROW_AVX2 -#endif // ARROW_AVX512 +#endif // ARROW_HAVE_AVX2 +#endif // ARROW_HAVE_AVX512 static void Read(void* src, void* dst, size_t size) { const auto simd = static_cast(src); diff --git a/cpp/src/arrow/json/rapidjson_defs.h b/cpp/src/arrow/json/rapidjson_defs.h index 5b52669a5e8..9ed81d000c5 100644 --- a/cpp/src/arrow/json/rapidjson_defs.h +++ b/cpp/src/arrow/json/rapidjson_defs.h @@ -32,9 +32,6 @@ } \ } -#include "arrow/util/neon_util.h" -#include "arrow/util/sse_util.h" - // enable SIMD whitespace skipping, if available #if defined(ARROW_HAVE_SSE4_2) #define RAPIDJSON_SSE2 1 diff --git a/cpp/src/arrow/util/byte_stream_split.h b/cpp/src/arrow/util/byte_stream_split.h index 46be2e78d60..bab50746064 100644 --- a/cpp/src/arrow/util/byte_stream_split.h +++ b/cpp/src/arrow/util/byte_stream_split.h @@ -17,16 +17,12 @@ #pragma once -#include "arrow/util/sse_util.h" +#include "arrow/util/simd.h" #include "arrow/util/ubsan.h" #include #include -#ifdef ARROW_HAVE_AVX2 -#include -#endif // ARROW_HAVE_AVX2 - #ifdef ARROW_HAVE_SSE4_2 // Enable the SIMD for ByteStreamSplit Encoder/Decoder #define ARROW_HAVE_SIMD_SPLIT diff --git a/cpp/src/arrow/util/hash_util.h b/cpp/src/arrow/util/hash_util.h index c5b870143b4..39eb6f02693 100644 --- a/cpp/src/arrow/util/hash_util.h +++ b/cpp/src/arrow/util/hash_util.h @@ -27,39 +27,27 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/neon_util.h" -#include "arrow/util/sse_util.h" +#include "arrow/util/simd.h" -static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) { +#ifdef ARROW_HAVE_SSE4_2 +constexpr auto HW_crc32_u8 = _mm_crc32_u8; +constexpr auto HW_crc32_u16 = _mm_crc32_u16; +constexpr auto HW_crc32_u32 = _mm_crc32_u32; +constexpr auto HW_crc32_u64 = _mm_crc32_u64; +#elif defined(ARROW_HAVE_ARMV8_CRC) +constexpr auto HW_crc32_u8 = __crc32cb; +constexpr auto HW_crc32_u16 = __crc32ch; +constexpr auto HW_crc32_u32 = __crc32cw; +constexpr auto HW_crc32_u64 = __crc32cd; +#else +static inline uint32_t _hw_crc32_nope(uint32_t crc, uint64_t v) { DCHECK(false) << "Hardware CRC support is not enabled"; return 0; } - -#ifdef ARROW_HAVE_SSE4_2 -#define HW_crc32_u8 SSE4_crc32_u8 -#define HW_crc32_u16 SSE4_crc32_u16 -#define HW_crc32_u32 SSE4_crc32_u32 -#define HW_crc32_u64 SSE4_crc32_u64 -#elif defined(ARROW_HAVE_ARMV8_CRC) -#define HW_crc32_u8 ARMCE_crc32_u8 -#define HW_crc32_u16 ARMCE_crc32_u16 -#define HW_crc32_u32 ARMCE_crc32_u32 -#define HW_crc32_u64 ARMCE_crc32_u64 +constexpr auto HW_crc32_u8 = _hw_crc32_nope; +constexpr auto HW_crc32_u16 = _hw_crc32_nope; +constexpr auto HW_crc32_u32 = _hw_crc32_nope; +constexpr auto HW_crc32_u64 = _hw_crc32_nope; #endif namespace arrow { @@ -99,29 +87,29 @@ class HashUtil { uint32_t k0 = 0xe417f38a, k1 = 0x8f158014; /* First 8 byte for better pipelining */ - crc0 = ARMCE_crc32_u64(crc, *buf64++); + crc0 = HW_crc32_u64(crc, *buf64++); /* 3 blocks crc32c parallel computation * * 42 * 8 * 3 = 1008 (bytes) */ for (int i = 0; i < BLK_LENGTH; i++, buf64++) { - crc0 = ARMCE_crc32_u64(crc0, *buf64); - crc1 = ARMCE_crc32_u64(crc1, *(buf64 + BLK_LENGTH)); - crc2 = ARMCE_crc32_u64(crc2, *(buf64 + (BLK_LENGTH * 2))); + crc0 = HW_crc32_u64(crc0, *buf64); + crc1 = HW_crc32_u64(crc1, *(buf64 + BLK_LENGTH)); + crc2 = HW_crc32_u64(crc2, *(buf64 + (BLK_LENGTH * 2))); } buf64 += (BLK_LENGTH * 2); /* Last 8 bytes */ - crc = ARMCE_crc32_u64(crc2, *buf64++); + crc = HW_crc32_u64(crc2, *buf64++); t0 = (uint64_t)vmull_p64(crc0, k0); t1 = (uint64_t)vmull_p64(crc1, k1); /* Merge (crc0, crc1, crc2) -> crc */ - crc1 = ARMCE_crc32_u64(0, t1); + crc1 = HW_crc32_u64(0, t1); crc ^= crc1; - crc0 = ARMCE_crc32_u64(0, t0); + crc0 = HW_crc32_u64(0, t0); crc ^= crc0; length -= 1024; @@ -129,25 +117,25 @@ class HashUtil { buf8 = reinterpret_cast(buf64); while (length >= 8) { - crc = ARMCE_crc32_u64(crc, *reinterpret_cast(buf8)); + crc = HW_crc32_u64(crc, *reinterpret_cast(buf8)); buf8 += 8; length -= 8; } /* The following is more efficient than the straight loop */ if (length >= 4) { - crc = ARMCE_crc32_u32(crc, *reinterpret_cast(buf8)); + crc = HW_crc32_u32(crc, *reinterpret_cast(buf8)); buf8 += 4; length -= 4; } if (length >= 2) { - crc = ARMCE_crc32_u16(crc, *reinterpret_cast(buf8)); + crc = HW_crc32_u16(crc, *reinterpret_cast(buf8)); buf8 += 2; length -= 2; } - if (length >= 1) crc = ARMCE_crc32_u8(crc, *(buf8)); + if (length >= 1) crc = HW_crc32_u8(crc, *(buf8)); return crc; } diff --git a/cpp/src/arrow/util/neon_util.h b/cpp/src/arrow/util/simd.h similarity index 65% rename from cpp/src/arrow/util/neon_util.h rename to cpp/src/arrow/util/simd.h index a82c2f65bb6..84c93a825cf 100644 --- a/cpp/src/arrow/util/neon_util.h +++ b/cpp/src/arrow/util/simd.h @@ -17,6 +17,24 @@ #pragma once +#ifdef _MSC_VER +// MSVC x86_64/arm64 + +#if defined(_M_AMD64) || defined(_M_X64) +#include +#elif defined(_M_ARM64) +#include +#endif + +#else +// gcc/clang (possibly others) + +#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_AVX512) +#include +#elif defined(ARROW_HAVE_SSE4_2) +#include +#endif + #ifdef ARROW_HAVE_NEON #include #endif @@ -25,26 +43,4 @@ #include #endif -namespace arrow { - -#ifdef ARROW_HAVE_ARMV8_CRC - -static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t v) { - return __crc32cb(crc, v); -} - -static inline uint32_t ARMCE_crc32_u16(uint32_t crc, uint16_t v) { - return __crc32ch(crc, v); -} - -static inline uint32_t ARMCE_crc32_u32(uint32_t crc, uint32_t v) { - return __crc32cw(crc, v); -} - -static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t v) { - return __crc32cd(crc, v); -} - -#endif // ARROW_HAVE_ARMV8_CRC - -} // namespace arrow +#endif diff --git a/cpp/src/arrow/util/sse_util.h b/cpp/src/arrow/util/sse_util.h deleted file mode 100644 index d8b3f227cf4..00000000000 --- a/cpp/src/arrow/util/sse_util.h +++ /dev/null @@ -1,108 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29. Pared down to a minimal set of -// functions needed for parquet-cpp - -#pragma once - -#include "arrow/util/macros.h" - -#ifdef ARROW_HAVE_SSE4_2 - -// MSVC x86-64 -#if (defined(_M_AMD64) || defined(_M_X64)) -#include -#else -// gcc/clang (possibly others) -#include -#endif - -#endif // ARROW_HAVE_SSE4_2 - -namespace arrow { - -/// This class contains constants useful for text processing with SSE4.2 intrinsics. -namespace SSEUtil { -/// Number of characters that fit in 64/128 bit register. SSE provides instructions -/// for loading 64 or 128 bits into a register at a time. -static const int CHARS_PER_64_BIT_REGISTER = 8; -static const int CHARS_PER_128_BIT_REGISTER = 16; - -/// SSE4.2 adds instructions for text processing. The instructions have a control -/// byte that determines some of functionality of the instruction. (Equivalent to -/// GCC's _SIDD_CMP_EQUAL_ANY, etc). -static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr -static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp -static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) -static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. - -/// In this mode, SSE text processing functions will return a mask of all the -/// characters that matched. -static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; - -/// In this mode, SSE text processing functions will return the number of -/// bytes that match consecutively from the beginning. -static const int STRCMP_MODE = - PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY; - -/// Precomputed mask values up to 16 bits. -static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, - 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, -}; -} // namespace SSEUtil - -#ifdef ARROW_HAVE_SSE4_2 - -/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen -/// IR load time) that the processor supports SSE 4.2 before calling these. These are -/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros. - -template -static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestrm(str1, len1, str2, len2, MODE); -} - -template -static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestri(str1, len1, str2, len2, MODE); -} - -static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { - return _mm_crc32_u8(crc, v); -} - -static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { - return _mm_crc32_u16(crc, v); -} - -static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { - return _mm_crc32_u32(crc, v); -} - -static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { -#if ARROW_BITNESS == 32 - return 0; -#else - return static_cast(_mm_crc32_u64(crc, v)); -#endif -} - -#endif // ARROW_HAVE_SSE4_2 - -} // namespace arrow From 11d95d1f83d79e61171076856747b604be7f8621 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Mon, 20 Apr 2020 02:56:25 +0000 Subject: [PATCH 2/4] Restore ARROW_SIMD_LEVEL=NONE --- cpp/cmake_modules/DefineOptions.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 234ad8f7dfc..533e2e45af4 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -101,6 +101,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option_string(ARROW_SIMD_LEVEL "SIMD compiler optimization level" "SSE4_2" # default to SSE4.2 + "NONE" "SSE4_2" "AVX2" "AVX512") From 479b992a8455b55f7a35676aed8b06c3d5dfd614 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Wed, 22 Apr 2020 02:28:28 +0000 Subject: [PATCH 3/4] Remove ARROW_USE_SIMD --- cpp/cmake_modules/DefineOptions.cmake | 3 --- cpp/cmake_modules/SetupCxxFlags.cmake | 8 +++----- docs/source/developers/benchmarks.rst | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 533e2e45af4..c8d82523269 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -95,9 +95,6 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_USE_PRECOMPILED_HEADERS "Use precompiled headers when compiling" OFF) - # Disable this option to exercise non-SIMD fallbacks - define_option(ARROW_USE_SIMD "Build with SIMD optimizations" ON) - define_option_string(ARROW_SIMD_LEVEL "SIMD compiler optimization level" "SSE4_2" # default to SSE4.2 diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 6110a5aa525..ae8c8d0b21a 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -305,7 +305,7 @@ if(BUILD_WARNING_FLAGS) endif(BUILD_WARNING_FLAGS) # Only enable additional instruction sets if they are supported -if(ARROW_CPU_FLAG STREQUAL "x86" AND ARROW_USE_SIMD) +if(ARROW_CPU_FLAG STREQUAL "x86") if(ARROW_SIMD_LEVEL STREQUAL "AVX512") if(NOT CXX_SUPPORTS_AVX512) message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") @@ -327,7 +327,7 @@ if(ARROW_CPU_FLAG STREQUAL "x86" AND ARROW_USE_SIMD) endif() endif() -if(ARROW_CPU_FLAG STREQUAL "ppc" AND ARROW_USE_SIMD) +if(ARROW_CPU_FLAG STREQUAL "ppc") if(CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ALTIVEC_FLAG}") endif() @@ -342,9 +342,7 @@ if(ARROW_CPU_FLAG STREQUAL "armv8") endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") - if(ARROW_USE_SIMD) - add_definitions(-DARROW_HAVE_NEON) - endif() + add_definitions(-DARROW_HAVE_NEON) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4") diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst index 2ae767bdda6..d85dc2baf98 100644 --- a/docs/source/developers/benchmarks.rst +++ b/docs/source/developers/benchmarks.rst @@ -59,7 +59,7 @@ Sometimes, it is required to pass custom CMake flags, e.g. .. code-block:: shell export CC=clang-8 CXX=clang++8 - archery benchmark run --cmake-extras="-DARROW_USE_SIMD=ON" + archery benchmark run --cmake-extras="-DARROW_SIMD_LEVEL=NONE" Comparison ========== From f0813fb1f1088bfa4f3ddcfbaa2346264c5448bd Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Mon, 27 Apr 2020 01:32:41 +0000 Subject: [PATCH 4/4] Remove hash_util.h --- cpp/src/arrow/util/hash_util.h | 238 --------------------------------- 1 file changed, 238 deletions(-) delete mode 100644 cpp/src/arrow/util/hash_util.h diff --git a/cpp/src/arrow/util/hash_util.h b/cpp/src/arrow/util/hash_util.h deleted file mode 100644 index 39eb6f02693..00000000000 --- a/cpp/src/arrow/util/hash_util.h +++ /dev/null @@ -1,238 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala (incubating) as of 2016-02-22 - -// XXX(ARROW-6468): this header is now unused. We keep CRC hash implementations -// around in case they're useful some day (Parquet checksumming?). - -#pragma once - -#include -#include - -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/util/simd.h" - -#ifdef ARROW_HAVE_SSE4_2 -constexpr auto HW_crc32_u8 = _mm_crc32_u8; -constexpr auto HW_crc32_u16 = _mm_crc32_u16; -constexpr auto HW_crc32_u32 = _mm_crc32_u32; -constexpr auto HW_crc32_u64 = _mm_crc32_u64; -#elif defined(ARROW_HAVE_ARMV8_CRC) -constexpr auto HW_crc32_u8 = __crc32cb; -constexpr auto HW_crc32_u16 = __crc32ch; -constexpr auto HW_crc32_u32 = __crc32cw; -constexpr auto HW_crc32_u64 = __crc32cd; -#else -static inline uint32_t _hw_crc32_nope(uint32_t crc, uint64_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} -constexpr auto HW_crc32_u8 = _hw_crc32_nope; -constexpr auto HW_crc32_u16 = _hw_crc32_nope; -constexpr auto HW_crc32_u32 = _hw_crc32_nope; -constexpr auto HW_crc32_u64 = _hw_crc32_nope; -#endif - -namespace arrow { - -/// Utility class to compute hash values. -class HashUtil { - public: -#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARMV8_CRC) - static constexpr bool have_hardware_crc32 = true; -#else - static constexpr bool have_hardware_crc32 = false; -#endif - -#ifdef ARROW_HAVE_ARMV8_CRYPTO -/* Crc32c Parallel computation - * Algorithm comes from Intel whitepaper: - * crc-iscsi-polynomial-crc32-instruction-paper - * - * Input data is divided into three equal-sized blocks - * Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes - * One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes - */ -#define BLK_LENGTH 42 - static uint32_t Armv8CrcHashParallel(const void* data, int32_t nbytes, uint32_t crc) { - const uint8_t* buf8; - const uint64_t* buf64 = reinterpret_cast(data); - int32_t length = nbytes; - - while (length >= 1024) { - uint64_t t0, t1; - uint32_t crc0 = 0, crc1 = 0, crc2 = 0; - - /* parallel computation params: - * k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1)); - * k1 = CRC32(x ^ (42 * 8 * 8 - 1)); - */ - uint32_t k0 = 0xe417f38a, k1 = 0x8f158014; - - /* First 8 byte for better pipelining */ - crc0 = HW_crc32_u64(crc, *buf64++); - - /* 3 blocks crc32c parallel computation - * - * 42 * 8 * 3 = 1008 (bytes) - */ - for (int i = 0; i < BLK_LENGTH; i++, buf64++) { - crc0 = HW_crc32_u64(crc0, *buf64); - crc1 = HW_crc32_u64(crc1, *(buf64 + BLK_LENGTH)); - crc2 = HW_crc32_u64(crc2, *(buf64 + (BLK_LENGTH * 2))); - } - buf64 += (BLK_LENGTH * 2); - - /* Last 8 bytes */ - crc = HW_crc32_u64(crc2, *buf64++); - - t0 = (uint64_t)vmull_p64(crc0, k0); - t1 = (uint64_t)vmull_p64(crc1, k1); - - /* Merge (crc0, crc1, crc2) -> crc */ - crc1 = HW_crc32_u64(0, t1); - crc ^= crc1; - crc0 = HW_crc32_u64(0, t0); - crc ^= crc0; - - length -= 1024; - } - - buf8 = reinterpret_cast(buf64); - while (length >= 8) { - crc = HW_crc32_u64(crc, *reinterpret_cast(buf8)); - buf8 += 8; - length -= 8; - } - - /* The following is more efficient than the straight loop */ - if (length >= 4) { - crc = HW_crc32_u32(crc, *reinterpret_cast(buf8)); - buf8 += 4; - length -= 4; - } - - if (length >= 2) { - crc = HW_crc32_u16(crc, *reinterpret_cast(buf8)); - buf8 += 2; - length -= 2; - } - - if (length >= 1) crc = HW_crc32_u8(crc, *(buf8)); - - return crc; - } -#endif - - /// Compute the Crc32 hash for data using SSE4/ArmCRC instructions. The input hash - /// parameter is the current hash/seed value. - /// This should only be called if SSE/ArmCRC is supported. - /// This is ~4x faster than Fnv/Boost Hash. - /// TODO: crc32 hashes with different seeds do not result in different hash functions. - /// The resulting hashes are correlated. - static uint32_t CrcHash(const void* data, int32_t nbytes, uint32_t hash) { - const uint8_t* p = reinterpret_cast(data); - const uint8_t* end = p + nbytes; - -#if ARROW_BITNESS >= 64 - while (p <= end - 8) { - hash = HW_crc32_u64(hash, *reinterpret_cast(p)); - p += 8; - } -#endif - - while (p <= end - 4) { - hash = HW_crc32_u32(hash, *reinterpret_cast(p)); - p += 4; - } - while (p < end) { - hash = HW_crc32_u8(hash, *p); - ++p; - } - - // The lower half of the CRC hash has has poor uniformity, so swap the halves - // for anyone who only uses the first several bits of the hash. - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// A variant of CRC32 hashing that computes two independent running CRCs - /// over interleaved halves of the input, giving out a 64-bit integer. - /// The result's quality should be improved by a finalization step. - /// - /// In addition to producing more bits of output, this should be twice - /// faster than CrcHash on CPUs that can overlap several independent - /// CRC computations. - static uint64_t DoubleCrcHash(const void* data, int32_t nbytes, uint64_t hash) { - const uint8_t* p = reinterpret_cast(data); - - uint32_t h1 = static_cast(hash >> 32); - uint32_t h2 = static_cast(hash); - -#if ARROW_BITNESS >= 64 - while (nbytes >= 16) { - h1 = HW_crc32_u64(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u64(h2, *reinterpret_cast(p + 8)); - nbytes -= 16; - p += 16; - } - if (nbytes >= 8) { - h1 = HW_crc32_u32(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u32(h2, *reinterpret_cast(p + 4)); - nbytes -= 8; - p += 8; - } -#else - while (nbytes >= 8) { - h1 = HW_crc32_u32(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u32(h2, *reinterpret_cast(p + 4)); - nbytes -= 8; - p += 8; - } -#endif - - if (nbytes >= 4) { - h1 = HW_crc32_u16(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u16(h2, *reinterpret_cast(p + 2)); - nbytes -= 4; - p += 4; - } - switch (nbytes) { - case 3: - h1 = HW_crc32_u8(h1, p[2]); - // fallthrough - case 2: - h2 = HW_crc32_u8(h2, p[1]); - // fallthrough - case 1: - h1 = HW_crc32_u8(h1, p[0]); - // fallthrough - case 0: - break; - default: - assert(0); - } - - // A finalization step is recommended to mix up the result's bits - return (static_cast(h1) << 32) + h2; - } -}; - -} // namespace arrow