From 7b3f9314df13fceee9dcc1f96f2b27ee97246d9e Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Thu, 15 Oct 2020 15:31:34 +0200
Subject: [PATCH 1/4] ARROW-10313: [C++] Faster UTF8 validation for small
 strings

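After the 8-byte-at-a-time loop, check whether the remaining tail
(fewer than 8 bytes) is pure ASCII using two overlapping 4-byte or
2-byte loads, and only fall back to the byte-wise state machine
(now unrolled with a switch) when a non-ASCII byte is present.

This speeds up CSV string conversion by about 30%.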
---
 cpp/src/arrow/util/utf8.h | 55 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index d5875c4590b..89101b113ed 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -87,8 +87,9 @@ ARROW_EXPORT void InitializeUTF8();
 
 inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
   static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL;
-  // For some reason, defining this variable outside the loop helps clang
-  uint64_t mask;
+  static constexpr uint32_t high_bits_32 = 0x80808080UL;
+  static constexpr uint16_t high_bits_16 = 0x8080U;
+  static constexpr uint8_t high_bits_8 = 0x80U;
 
 #ifndef NDEBUG
   internal::CheckUTF8Initialized();
@@ -98,8 +99,9 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
     // XXX This is doing an unaligned access.  Contemporary architectures
     // (x86-64, AArch64, PPC64) support it natively and often have good
     // performance nevertheless.
-    memcpy(&mask, data, 8);
-    if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) {
+    uint64_t mask64;
+    memcpy(&mask64, data, 8);
+    if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
       // 8 bytes of pure ASCII, move forward
       size -= 8;
       data += 8;
@@ -154,13 +156,52 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
     return false;
   }
 
-  // Validate string tail one byte at a time
+  // Check if string tail is full ASCII (common case, fast)
+  if (size >= 4) {
+    uint32_t mask1, mask2;
+    memcpy(&mask2, data + size - 4, 4);
+    memcpy(&mask1, data, 4);
+    if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_32) == 0)) {
+      return true;
+    }
+  } else if (size >= 2) {
+    uint16_t mask1, mask2;
+    memcpy(&mask2, data + size - 2, 2);
+    memcpy(&mask1, data, 2);
+    if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_16) == 0)) {
+      return true;
+    }
+  } else if (size == 1) {
+    if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) {
+      return true;
+    }
+  } else {
+    /* size == 0 */
+    return true;
+  }
+
+  // Fall back to UTF8 validation of tail string.
   // Note the state table is designed so that, once in the reject state,
   // we remain in that state until the end.  So we needn't check for
   // rejection at each char (we don't gain much by short-circuiting here).
   uint16_t state = internal::kUTF8ValidateAccept;
-  while (size-- > 0) {
-    state = internal::ValidateOneUTF8Byte(*data++, state);
+  switch (size) {
+    case 7:
+      state = internal::ValidateOneUTF8Byte(data[size - 7], state);
+    case 6:
+      state = internal::ValidateOneUTF8Byte(data[size - 6], state);
+    case 5:
+      state = internal::ValidateOneUTF8Byte(data[size - 5], state);
+    case 4:
+      state = internal::ValidateOneUTF8Byte(data[size - 4], state);
+    case 3:
+      state = internal::ValidateOneUTF8Byte(data[size - 3], state);
+    case 2:
+      state = internal::ValidateOneUTF8Byte(data[size - 2], state);
+    case 1:
+      state = internal::ValidateOneUTF8Byte(data[size - 1], state);
+    default:
+      break;
   }
   return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
 }

From aa883274eab376e53156c70d10b52d9ed0e2e0c7 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Thu, 15 Oct 2020 19:30:54 +0200
Subject: [PATCH 2/4] Use SafeLoadAs

---
 cpp/src/arrow/util/utf8.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 89101b113ed..42d2ab5c42a 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -27,6 +27,7 @@
 #include "arrow/util/macros.h"
 #include "arrow/util/simd.h"
 #include "arrow/util/string_view.h"
+#include "arrow/util/ubsan.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -158,17 +159,15 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
 
   // Check if string tail is full ASCII (common case, fast)
   if (size >= 4) {
-    uint32_t mask1, mask2;
-    memcpy(&mask2, data + size - 4, 4);
-    memcpy(&mask1, data, 4);
-    if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_32) == 0)) {
+    uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
+    uint32_t head_mask = SafeLoadAs<uint32_t>(data);
+    if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
       return true;
     }
   } else if (size >= 2) {
-    uint16_t mask1, mask2;
-    memcpy(&mask2, data + size - 2, 2);
-    memcpy(&mask1, data, 2);
-    if (ARROW_PREDICT_TRUE(((mask1 | mask2) & high_bits_16) == 0)) {
+    uint16_t tail_mask = SafeLoadAs<uint32_t>(data + size - 2);
+    uint16_t head_mask = SafeLoadAs<uint32_t>(data);
+    if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
       return true;
     }
   } else if (size == 1) {

From bdb11ec048d2101169933630b43fa0733dd4eb97 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Thu, 15 Oct 2020 21:14:26 +0200
Subject: [PATCH 3/4] Fix SafeLoadAs usage

Co-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
---
 cpp/src/arrow/util/utf8.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 42d2ab5c42a..a20115eafa0 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -165,8 +165,8 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
       return true;
     }
   } else if (size >= 2) {
-    uint16_t tail_mask = SafeLoadAs<uint32_t>(data + size - 2);
-    uint16_t head_mask = SafeLoadAs<uint32_t>(data);
+    uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
+    uint16_t head_mask = SafeLoadAs<uint16_t>(data);
     if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
       return true;
     }

From 30de08893ce8022dd61968f5d3225391fc471824 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Thu, 15 Oct 2020 21:14:47 +0200
Subject: [PATCH 4/4] Use SafeLoadAs instead of memcpy

Co-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
---
 cpp/src/arrow/util/utf8.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index a20115eafa0..c089fa7fff6 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -100,8 +100,7 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
     // XXX This is doing an unaligned access.  Contemporary architectures
     // (x86-64, AArch64, PPC64) support it natively and often have good
     // performance nevertheless.
-    uint64_t mask64;
-    memcpy(&mask64, data, 8);
+    uint64_t mask64 = SafeLoadAs<uint64_t>(data);
     if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
       // 8 bytes of pure ASCII, move forward
       size -= 8;