From 7b3f9314df13fceee9dcc1f96f2b27ee97246d9e Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 15 Oct 2020 15:31:34 +0200 Subject: [PATCH 1/4] ARROW-10313: [C++] Faster UTF8 validation for small strings This improves CSV string conversion by about 30%. --- cpp/src/arrow/util/utf8.h | 55 ++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index d5875c4590b..89101b113ed 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -87,8 +87,9 @@ ARROW_EXPORT void InitializeUTF8(); inline bool ValidateUTF8(const uint8_t* data, int64_t size) { static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL; - // For some reason, defining this variable outside the loop helps clang - uint64_t mask; + static constexpr uint32_t high_bits_32 = 0x80808080UL; + static constexpr uint16_t high_bits_16 = 0x8080U; + static constexpr uint8_t high_bits_8 = 0x80U; #ifndef NDEBUG internal::CheckUTF8Initialized(); @@ -98,8 +99,9 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) { // XXX This is doing an unaligned access. Contemporary architectures // (x86-64, AArch64, PPC64) support it natively and often have good // performance nevertheless. 
- memcpy(&mask, data, 8); - if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) { + uint64_t mask64; + memcpy(&mask64, data, 8); + if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) { // 8 bytes of pure ASCII, move forward size -= 8; data += 8; @@ -154,13 +156,52 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) { return false; } - // Validate string tail one byte at a time + // Check if string tail is full ASCII (common case, fast) + if (size >= 4) { + uint32_t mask1, mask2; + memcpy(&mask2, data + size - 4, 4); + memcpy(&mask1, data, 4); + if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_32) == 0)) { + return true; + } + } else if (size >= 2) { + uint16_t mask1, mask2; + memcpy(&mask2, data + size - 2, 2); + memcpy(&mask1, data, 2); + if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_16) == 0)) { + return true; + } + } else if (size == 1) { + if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) { + return true; + } + } else { + /* size == 0 */ + return true; + } + + // Fall back to UTF8 validation of tail string. // Note the state table is designed so that, once in the reject state, // we remain in that state until the end. So we needn't check for // rejection at each char (we don't gain much by short-circuiting here). 
uint16_t state = internal::kUTF8ValidateAccept; - while (size-- > 0) { - state = internal::ValidateOneUTF8Byte(*data++, state); + switch (size) { + case 7: + state = internal::ValidateOneUTF8Byte(data[size - 7], state); + case 6: + state = internal::ValidateOneUTF8Byte(data[size - 6], state); + case 5: + state = internal::ValidateOneUTF8Byte(data[size - 5], state); + case 4: + state = internal::ValidateOneUTF8Byte(data[size - 4], state); + case 3: + state = internal::ValidateOneUTF8Byte(data[size - 3], state); + case 2: + state = internal::ValidateOneUTF8Byte(data[size - 2], state); + case 1: + state = internal::ValidateOneUTF8Byte(data[size - 1], state); + default: + break; } return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept); } From aa883274eab376e53156c70d10b52d9ed0e2e0c7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 15 Oct 2020 19:30:54 +0200 Subject: [PATCH 2/4] Use SafeLoadAs --- cpp/src/arrow/util/utf8.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 89101b113ed..42d2ab5c42a 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -27,6 +27,7 @@ #include "arrow/util/macros.h" #include "arrow/util/simd.h" #include "arrow/util/string_view.h" +#include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" namespace arrow { @@ -158,17 +159,15 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) { // Check if string tail is full ASCII (common case, fast) if (size >= 4) { - uint32_t mask1, mask2; - memcpy(&mask2, data + size - 4, 4); - memcpy(&mask1, data, 4); - if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_32) == 0)) { + uint32_t tail_mask = SafeLoadAs(data + size - 4); + uint32_t head_mask = SafeLoadAs(data); + if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) { return true; } } else if (size >= 2) { - uint16_t mask1, mask2; - memcpy(&mask2, data + size - 2, 2); - memcpy(&mask1, data, 2); 
- if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_16) == 0)) { + uint16_t tail_mask = SafeLoadAs(data + size - 2); + uint16_t head_mask = SafeLoadAs(data); + if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) { return true; } } else if (size == 1) { From bdb11ec048d2101169933630b43fa0733dd4eb97 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 15 Oct 2020 21:14:26 +0200 Subject: [PATCH 3/4] Fix SafeLoadAs usage Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/util/utf8.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 42d2ab5c42a..a20115eafa0 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -165,8 +165,8 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) { return true; } } else if (size >= 2) { - uint16_t tail_mask = SafeLoadAs(data + size - 2); - uint16_t head_mask = SafeLoadAs(data); + uint16_t tail_mask = SafeLoadAs(data + size - 2); + uint16_t head_mask = SafeLoadAs(data); if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) { return true; } From 30de08893ce8022dd61968f5d3225391fc471824 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 15 Oct 2020 21:14:47 +0200 Subject: [PATCH 4/4] Use SafeLoadAs instead of memcpy Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/util/utf8.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index a20115eafa0..c089fa7fff6 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -100,8 +100,7 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) { // XXX This is doing an unaligned access. Contemporary architectures // (x86-64, AArch64, PPC64) support it natively and often have good // performance nevertheless. 
- uint64_t mask64; - memcpy(&mask64, data, 8); + uint64_t mask64 = SafeLoadAs<uint64_t>(data); if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) { // 8 bytes of pure ASCII, move forward size -= 8;