From 2cd1d8bd57294f8a9472b7ed4cfa87e9428c0a52 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Tue, 7 Jul 2020 13:35:42 +0200
Subject: [PATCH 01/28] ARROW-9268: [C++] add string_is{alpnum,alpha...,upper}
 kernels

---
 .../arrow/compute/kernels/scalar_string.cc    | 497 +++++++++++++++++-
 .../kernels/scalar_string_benchmark.cc        |  10 +
 .../compute/kernels/scalar_string_test.cc     | 140 +++++
 cpp/src/arrow/util/utf8.h                     |  19 +
 python/pyarrow/compute.py                     |  22 +
 python/pyarrow/tests/test_compute.py          |  44 ++
 6 files changed, 730 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index a9cf01467f6..d44c0eae217 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -50,6 +50,11 @@ static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
                                                               : utf8_code_unit;
 }
 
+template <typename T>
+static inline bool IsAsciiCharacter(T character) {
+  return character < 128;
+}
+
 // TODO: optional ascii validation
 
 struct AsciiLength {
@@ -183,6 +188,54 @@ struct UTF8Transform {
   }
 };
 
+template <typename StringType, typename Derived>
+struct BinaryToBoolean {
+  using offset_type = typename StringType::offset_type;
+  using ArrayType = typename TypeTraits<StringType>::ArrayType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::ARRAY) {
+      EnsureLookupTablesFilled();
+      const ArrayData& input = *batch[0].array();
+      ArrayType input_boxed(batch[0].array());
+      ArrayData* out_arr = out->mutable_array();
+
+      // offset_type input_ncodeunits = input_boxed.total_values_length();
+      offset_type input_nstrings = static_cast<offset_type>(input.length);
+
+      FirstTimeBitmapWriter bitmap_writer(out_arr->buffers[1]->mutable_data(),
+                                          out_arr->offset, input.length);
+      for (int64_t i = 0; i < input_nstrings; i++) {
+        offset_type input_string_ncodeunits;
+        const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
+        offset_type encoded_nbytes;
+        bool boolean_result =
+            Derived::Predicate(ctx, input_string, input_string_ncodeunits);
+        if (!ctx->status().ok()) {
+          // UTF decoding can lead to issues
+          return;
+        }
+        if (boolean_result) {
+          bitmap_writer.Set();
+        }
+        bitmap_writer.Next();
+      }
+      bitmap_writer.Finish();
+    } else {
+      const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+      if (input.is_valid) {
+        offset_type data_nbytes = static_cast<offset_type>(input.value->size());
+        bool boolean_result = Derived::Predicate(ctx, input.value->data(), data_nbytes);
+        if (!ctx->status().ok()) {
+          // UTF decoding can lead to issues
+          return;
+        }
+        out->value = std::make_shared<BooleanScalar>(boolean_result);
+      }
+    }
+  }
+};
+
 template <typename Type>
 struct UTF8Upper : UTF8Transform<Type, UTF8Upper<Type>> {
   inline static uint32_t TransformCodepoint(uint32_t codepoint) {
@@ -322,9 +375,7 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
   } else {
     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
     if (input.is_valid) {
-      auto result = checked_pointer_cast<BooleanScalar>(MakeNullScalar(out->type()));
       uint8_t result_value = 0;
-      result->is_valid = true;
       std::array<offset_type, 2> offsets{0,
                                          static_cast<offset_type>(input.value->size())};
       transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
@@ -409,6 +460,413 @@ void AddBinaryContainsExact(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// IsAlpha/Digit etc
+
+static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
+  uint32_t general_category = 1 << utf8proc_category(codepoint);
+  // for e.g. undefined (but valid) codepoints, general_category == 0
+  return (general_category != 0) && ((general_category & mask) != 0);
+}
+
+template <typename... Categories>
+static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask,
+                                                utf8proc_category_t category,
+                                                Categories... categories) {
+  return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...);
+}
+
+template <typename... Categories>
+static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint,
+                                                utf8proc_category_t category,
+                                                Categories... categories) {
+  return HasAnyUnicodeGeneralCategory(codepoint, static_cast<uint32_t>(1u << category),
+                                      categories...);
+}
+
+static inline bool IsUpperCaseCharacterRoman(uint32_t codepoint) {
+  // Roman letter Ⅰ to Ⅿ are seen as capital (see 4.2 of Unicode spec)
+  // DerivedCoreProperties.txt should have this information, but it is not stored in
+  // the utf8proc library.
+  return (codepoint >= 0x2160) && (codepoint <= 0x216f);
+}
+
+static inline bool IsUpperCaseCharacterCircled(uint32_t codepoint) {
+  // Circled letters Ⓐ-Ⓩ are seen as capital (see 4.2 of Unicode spec)
+  // DerivedCoreProperties.txt should have this information, but it is not stored in
+  // the utf8proc library.
+  return (codepoint >= 0x24b6) && (codepoint <= 0x24cf);
+}
+
+static inline bool IsCasedCharacterUnicode(uint32_t codepoint) {
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
+                                      UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) ||
+         IsUpperCaseCharacterRoman(codepoint) || IsUpperCaseCharacterCircled(codepoint);
+  ;
+}
+
+static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) {
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL);
+}
+
+static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) {
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) ||
+         IsUpperCaseCharacterRoman(codepoint) || IsUpperCaseCharacterCircled(codepoint);
+}
+
+static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
+                                      UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
+                                      UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO);
+}
+
+static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
+  return (ascii_character >= 'a') && (ascii_character <= 'z');
+}
+
+static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
+  return (ascii_character >= 'A') && (ascii_character <= 'Z');
+}
+
+static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
+  return IsLowerCaseCharacterAscii(ascii_character) ||
+         IsUpperCaseCharacterAscii(ascii_character);
+}
+
+static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
+  return IsCasedCharacterAscii(ascii_character);  // same
+}
+
+static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
+}
+
+static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
+  return ((ascii_character >= '0') && (ascii_character <= '9'));
+}
+
+static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
+  // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
+  // utf8proc has no support for this, this is the best we can do:
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
+}
+
+static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {
+  // Formally this is not correct, but utf8proc does not allow us to query for Numerical
+  // properties, e.g. Numeric_Value and Numeric_Type
+  // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or
+  // Numeric_Type=Numeric.
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND,
+                                      UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
+}
+
+static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) {
+  auto property = utf8proc_get_property(codepoint);
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) ||
+         property->bidi_class == UTF8PROC_BIDI_CLASS_WS ||
+         property->bidi_class == UTF8PROC_BIDI_CLASS_B ||
+         property->bidi_class == UTF8PROC_BIDI_CLASS_S;
+}
+
+static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
+  return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) ||
+         (ascii_character == ' ');
+}
+
+static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
+  uint32_t general_category = utf8proc_category(codepoint);
+  return (general_category != 0) &&
+         !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC,
+                                       UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS,
+                                       UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS,
+                                       UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP);
+}
+
+static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
+  return ((ascii_character >= ' ') && (ascii_character <= '~'));
+}
+
+template <typename StringType, typename Derived, bool allow_empty = false>
+struct CharacterPredicateUnicode
+    : BinaryToBoolean<StringType,
+                      CharacterPredicateUnicode<StringType, Derived, allow_empty>> {
+  using offset_type = typename StringType::offset_type;
+  static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
+                               offset_type input_string_ncodeunits) {
+    if (allow_empty && input_string_ncodeunits == 0) {
+      return true;
+    }
+    bool all;
+    bool any = false;
+    if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
+            input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
+              any |= Derived::PredicateCharacterAny(codepoint);
+              return Derived::PredicateCharacterAll(codepoint);
+            }))) {
+      ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+      return false;
+    }
+    return all & any;
+  }
+  static inline bool PredicateCharacterAny(uint32_t) {
+    return true;  // default condition make sure there is at least 1 charachter
+  }
+};
+
+template <typename StringType, typename Derived, bool allow_empty = false>
+struct CharacterPredicateAscii
+    : BinaryToBoolean<StringType,
+                      CharacterPredicateAscii<StringType, Derived, allow_empty>> {
+  using offset_type = typename StringType::offset_type;
+  static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
+                               offset_type input_string_ncodeunits) {
+    if (allow_empty && input_string_ncodeunits == 0) {
+      return true;
+    }
+    bool any = false;
+    bool all = std::all_of(input, input + input_string_ncodeunits,
+                           [&any](uint8_t ascii_character) {
+                             any |= Derived::PredicateCharacterAny(ascii_character);
+                             return Derived::PredicateCharacterAll(ascii_character);
+                           });
+    return all & any;
+  }
+  static inline bool PredicateCharacterAny(uint8_t) {
+    return true;  // default condition make sure there is at least 1 charachter
+  }
+};
+
+template <typename StringType>
+struct IsAlphaNumericUnicode
+    : CharacterPredicateUnicode<StringType, IsAlphaNumericUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return IsAlphaCharacterUnicode(codepoint) || IsDecimalCharacterUnicode(codepoint) ||
+           IsNumericCharacterUnicode(codepoint) || IsDigitCharacterUnicode(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsAlphaNumericAscii
+    : CharacterPredicateAscii<StringType, IsAlphaNumericAscii<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return IsAlphaCharacterAscii(codepoint) || IsDecimalCharacterAscii(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsAlphaUnicode
+    : CharacterPredicateUnicode<StringType, IsAlphaUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return IsAlphaCharacterUnicode(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsAlphaAscii : CharacterPredicateAscii<StringType, IsAlphaAscii<StringType>> {
+  static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+    return IsAlphaCharacterAscii(ascii_character);
+  }
+};
+
+template <typename StringType>
+struct IsDecimalUnicode
+    : CharacterPredicateUnicode<StringType, IsDecimalUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return IsDecimalCharacterUnicode(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsDecimalAscii : CharacterPredicateAscii<StringType, IsDecimalAscii<StringType>> {
+  static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+    return IsDecimalCharacterAscii(ascii_character);
+  }
+};
+
+template <typename StringType>
+struct IsDigitUnicode
+    : CharacterPredicateUnicode<StringType, IsDigitUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return IsDigitCharacterUnicode(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsNumericUnicode
+    : CharacterPredicateUnicode<StringType, IsNumericUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return IsNumericCharacterUnicode(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsAscii : BinaryToBoolean<StringType, IsAscii<StringType>> {
+  using offset_type = typename StringType::offset_type;
+  static bool Predicate(KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_nascii_characters) {
+    return std::all_of(input, input + input_string_nascii_characters,
+                       IsAsciiCharacter<uint8_t>);
+  }
+};
+
+template <typename StringType>
+struct IsLowerUnicode
+    : CharacterPredicateUnicode<StringType, IsLowerUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    // Only for cased character it needs to be lower case
+    return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
+  }
+  static inline bool PredicateCharacterAny(uint32_t codepoint) {
+    return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
+  }
+};
+
+template <typename StringType>
+struct IsLowerAscii : CharacterPredicateAscii<StringType, IsLowerAscii<StringType>> {
+  static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+    // Only for cased character it needs to be lower case
+    return !IsCasedCharacterAscii(ascii_character) ||
+           IsLowerCaseCharacterAscii(ascii_character);
+  }
+  static inline bool PredicateCharacterAny(uint8_t ascii_character) {
+    return IsCasedCharacterAscii(ascii_character);  // at least 1 cased character
+  }
+};
+
+template <typename StringType>
+struct IsPrintableUnicode
+    : CharacterPredicateUnicode<StringType, IsPrintableUnicode<StringType>,
+                                /*allow_empty=*/true> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsPrintableAscii
+    : CharacterPredicateAscii<StringType, IsPrintableAscii<StringType>,
+                              /*allow_empty=*/true> {
+  static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+    return IsPrintableCharacterAscii(ascii_character);
+  }
+};
+
+template <typename StringType>
+struct IsSpaceUnicode
+    : CharacterPredicateUnicode<StringType, IsSpaceUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    return IsSpaceCharacterUnicode(codepoint);
+  }
+};
+
+template <typename StringType>
+struct IsSpaceAscii : CharacterPredicateAscii<StringType, IsSpaceAscii<StringType>> {
+  static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+    return IsSpaceCharacterAscii(ascii_character);
+  }
+};
+
+template <typename StringType>
+struct IsTitleUnicode : BinaryToBoolean<StringType, IsTitleUnicode<StringType>> {
+  using offset_type = typename StringType::offset_type;
+  static bool Predicate(KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_ncodeunits) {
+    // rules:
+    // * 1: lower case follows cased
+    // * 2: upper case follows uncased
+    // * 3: at least 1 cased character (which logically should be upper/title)
+    bool rules_1_and_2;
+    bool previous_cased = false;  // in LL, LU or LT
+    bool rule_3 = false;
+    bool status =
+        arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
+                               [&previous_cased, &rule_3](uint32_t codepoint) {
+                                 if (IsLowerCaseCharacterUnicode(codepoint)) {
+                                   if (!previous_cased) return false;  // rule 1 broken
+                                   previous_cased = true;
+                                 } else if (IsCasedCharacterUnicode(codepoint)) {
+                                   if (previous_cased) return false;  // rule 2 broken
+                                   // next should be a lower case or uncased
+                                   previous_cased = true;
+                                   rule_3 = true;  // rule 3 obeyed
+                                 } else {
+                                   // a non-cased char, like _ or 1
+                                   // next should be upper case or more uncased
+                                   previous_cased = false;
+                                 }
+                                 return true;
+                               });
+    if (!ARROW_PREDICT_TRUE(status)) {
+      ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+      return false;
+    }
+    return rules_1_and_2 & rule_3;
+  }
+};
+
+template <typename StringType>
+struct IsTitleAscii : BinaryToBoolean<StringType, IsTitleAscii<StringType>> {
+  using offset_type = typename StringType::offset_type;
+  static bool Predicate(KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_ncodeunits) {
+    // rules:
+    // * 1: lower case follows cased
+    // * 2: upper case follows uncased
+    // * 3: at least 1 cased character (which logically should be upper/title)
+    bool rules_1_and_2 = true;
+    bool previous_cased = false;  // in LL, LU or LT
+    bool rule_3 = false;
+    // we cannot rely on std::all_of because we need guaranteed order
+    for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
+      if (IsLowerCaseCharacterAscii(*c)) {
+        if (!previous_cased) {
+          // rule 1 broken
+          rules_1_and_2 = false;
+          break;
+        }
+        previous_cased = true;
+      } else if (IsCasedCharacterAscii(*c)) {
+        if (previous_cased) {
+          // rule 2 broken
+          rules_1_and_2 = false;
+          break;
+        }
+        // next should be a lower case or uncased
+        previous_cased = true;
+        rule_3 = true;  // rule 3 obeyed
+      } else {
+        // a non-cased char, like _ or 1
+        // next should be upper case or more uncased
+        previous_cased = false;
+      }
+    }
+    return rules_1_and_2 & rule_3;
+  }
+};
+
+template <typename StringType>
+struct IsUpperUnicode
+    : CharacterPredicateUnicode<StringType, IsUpperUnicode<StringType>> {
+  static inline bool PredicateCharacterAll(uint32_t codepoint) {
+    // Only for cased character it needs to be lower case
+    return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
+  }
+  static inline bool PredicateCharacterAny(uint32_t codepoint) {
+    return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
+  }
+};
+
+template <typename StringType>
+struct IsUpperAscii : CharacterPredicateAscii<StringType, IsUpperAscii<StringType>> {
+  static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+    // Only for cased character it needs to be lower case
+    return !IsCasedCharacterAscii(ascii_character) ||
+           IsUpperCaseCharacterAscii(ascii_character);
+  }
+  static inline bool PredicateCharacterAny(uint8_t ascii_character) {
+    return IsCasedCharacterAscii(ascii_character);  // at least 1 cased character
+  }
+};
+
 // ----------------------------------------------------------------------
 // strptime string parsing
 
@@ -477,6 +935,16 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+template <template <typename> class Transformer>
+void AddUnaryString(std::string name, FunctionRegistry* registry) {
+  auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
+  ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
+  ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
+  DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32));
+  DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), exec_64));
+  DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
 #endif
 
 }  // namespace
@@ -484,10 +952,35 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
 void RegisterScalarStringAscii(FunctionRegistry* registry) {
   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);
+
+  AddUnaryString<IsAscii>("binary_isascii", registry);
+
+  AddUnaryString<IsAlphaNumericAscii>("string_isalnum_ascii", registry);
+  AddUnaryString<IsAlphaAscii>("string_isalpha_ascii", registry);
+  AddUnaryString<IsDecimalAscii>("string_isdecimal_ascii", registry);
+  // no isdigic for ascii, since it is the same as isdecimal
+  AddUnaryString<IsLowerAscii>("string_islower_ascii", registry);
+  // no isnumeric for ascii, since it is the same as isdecimal
+  AddUnaryString<IsPrintableAscii>("string_isprintable_ascii", registry);
+  AddUnaryString<IsSpaceAscii>("string_isspace_ascii", registry);
+  AddUnaryString<IsTitleAscii>("string_istitle_ascii", registry);
+  AddUnaryString<IsUpperAscii>("string_isupper_ascii", registry);
 #ifdef ARROW_WITH_UTF8PROC
   MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry);
   MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry);
+  AddUnaryString<IsAlphaNumericUnicode>("string_isalnum_unicode", registry);
+  AddUnaryString<IsAlphaUnicode>("string_isalpha_unicode", registry);
+  AddUnaryString<IsDecimalUnicode>("string_isdecimal_unicode", registry);
+  AddUnaryString<IsDigitUnicode>("string_isdigit_unicode", registry);
+  AddUnaryString<IsLowerUnicode>("string_islower_unicode", registry);
+  AddUnaryString<IsNumericUnicode>("string_isnumeric_unicode", registry);
+  AddUnaryString<IsPrintableUnicode>("string_isprintable_unicode", registry);
+  AddUnaryString<IsSpaceUnicode>("string_isspace_unicode", registry);
+  AddUnaryString<IsTitleUnicode>("string_istitle_unicode", registry);
+  AddUnaryString<IsUpperUnicode>("string_isupper_unicode", registry);
+
 #endif
+
   AddAsciiLength(registry);
   AddBinaryContainsExact(registry);
   AddStrptime(registry);
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
index 46ee129b03c..e65d2dca2be 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
@@ -57,6 +57,10 @@ static void AsciiUpper(benchmark::State& state) {
   UnaryStringBenchmark(state, "ascii_upper");
 }
 
+static void IsAlphaAscii(benchmark::State& state) {
+  UnaryStringBenchmark(state, "string_isalpha_ascii");
+}
+
 static void BinaryContainsExact(benchmark::State& state) {
   BinaryContainsExactOptions options("abac");
   UnaryStringBenchmark(state, "binary_contains_exact", &options);
@@ -70,14 +74,20 @@ static void Utf8Upper(benchmark::State& state) {
 static void Utf8Lower(benchmark::State& state) {
   UnaryStringBenchmark(state, "utf8_lower");
 }
+
+static void IsAlphaUnicode(benchmark::State& state) {
+  UnaryStringBenchmark(state, "string_isalpha_unicode");
+}
 #endif
 
 BENCHMARK(AsciiLower);
 BENCHMARK(AsciiUpper);
+BENCHMARK(IsAlphaAscii);
 BENCHMARK(BinaryContainsExact);
 #ifdef ARROW_WITH_UTF8PROC
 BENCHMARK(Utf8Lower);
 BENCHMARK(Utf8Upper);
+BENCHMARK(IsAlphaUnicode);
 #endif
 
 }  // namespace compute
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 0989401d034..67f24d1ef07 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -145,8 +145,148 @@ TYPED_TEST(TestStringKernels, Utf8Lower) {
                                   CallFunction("utf8_lower", {invalid_input}));
 }
 
+TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
+  // U+08BE (utf8: 	\xE0\xA2\xBE) is undefined, but utf8proc things it is
+  // UTF8PROC_CATEGORY_LO
+  this->CheckUnary("string_isalnum_unicode", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
+                   boolean(), "[true, null, true, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsAlphaUnicode) {
+  // U+08BE (utf8: 	\xE0\xA2\xBE) is undefined, but utf8proc things it is
+  // UTF8PROC_CATEGORY_LO
+  this->CheckUnary("string_isalpha_unicode", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]",
+                   boolean(), "[true, null, false, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsAscii) {
+  this->CheckUnary("binary_isascii", "[\"azAZ~\", null, \"Ɑ\", \"\"]", boolean(),
+                   "[true, null, false, true]");
+}
+
+TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
+  // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
+  this->CheckUnary("string_isdecimal_unicode",
+                   "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(),
+                   "[true, null, true, false, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsNumericUnicode) {
+  // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
+  this->CheckUnary("string_isnumeric_unicode",
+                   "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(),
+                   "[true, null, true, true, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsLowerUnicode) {
+  // ٣ is arabic 3 (decimal), Φ capital
+  this->CheckUnary("string_islower_unicode",
+                   "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\"]", boolean(),
+                   "[false, null, true, false, true, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsPrintableUnicode) {
+  // U+2008 (utf8: \xe2\x80\x88) is punctuaction space, it is NOT printable
+  // U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category
+  this->CheckUnary(
+      "string_isprintable_unicode",
+      "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]", boolean(),
+      "[true, null, false, true, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsSpaceUnicode) {
+  // U+2008 (utf8: \xe2\x80\x88) is punctuaction space
+  this->CheckUnary("string_isspace_unicode", "[\" \", null, \"  \", \"\\t\\r\"]",
+                   boolean(), "[true, null, true, true]");
+  this->CheckUnary("string_isspace_unicode",
+                   "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", boolean(),
+                   "[false, null, false, false, true]");
+}
+
+TYPED_TEST(TestStringKernels, IsTitleUnicode) {
+  // ٣ is arabic 3 (decimal), Φ capital
+  this->CheckUnary("string_istitle_unicode",
+                   "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_Ǆ\", \"Φ\", \"Ǆ\"]",
+                   boolean(), "[true, null, true, true, true, true, true]");
+  this->CheckUnary(
+      "string_istitle_unicode",
+      "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsǄ\", \"ΦΦ\", \"ǆ\", \"_\"]",
+      boolean(), "[false, null, false, false, false, false, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsUpperUnicode) {
+  // ٣ is arabic 3 (decimal), Φ capital
+  // Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ
+  this->CheckUnary(
+      "string_isupper_unicode",
+      "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\", \"Ⓐ\", \"Ⓩ\"]",
+      boolean(), "[false, null, false, true, true, true, false, true, true, true, true]");
+}
+
 #endif  // ARROW_WITH_UTF8PROC
 
+TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {
+  this->CheckUnary("string_isalnum_ascii", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
+                   boolean(), "[false, null, false, false, false]");
+  this->CheckUnary("string_isalnum_ascii",
+                   "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]", boolean(),
+                   "[true, null, true, true, true, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsAlphaAscii) {
+  this->CheckUnary("string_isalpha_ascii",
+                   "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]", boolean(),
+                   "[false, true, null, false, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsDecimalAscii) {
+  // ٣ is arabic 3
+  this->CheckUnary("string_isdecimal_ascii", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
+                   boolean(), "[true, null, false, false, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsLowerAscii) {
+  // ٣ is arabic 3 (decimal), φ lower greek
+  this->CheckUnary("string_islower_ascii",
+                   "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]", boolean(),
+                   "[false, null, true, false, true, false, false]");
+}
+TYPED_TEST(TestStringKernels, IsPrintableAscii) {
+  // \xe2\x80\x88 is punctuaction space
+  this->CheckUnary("string_isprintable_ascii",
+                   "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]", boolean(),
+                   "[true, null, false, true, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsSpaceAscii) {
+  // \xe2\x80\x88 is punctuaction space
+  // Note: for ascii version, the non-ascii chars are seen as non-cased
+  this->CheckUnary("string_isspace_ascii", "[\" \", null, \"  \", \"\\t\\r\"]", boolean(),
+                   "[true, null, true, true]");
+  this->CheckUnary("string_isspace_ascii",
+                   "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", boolean(),
+                   "[false, null, false, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsTitleAscii) {
+  // ٣ is arabic 3 (decimal), Φ capital
+  // Note: for ascii version, the non-ascii chars are seen as non-cased
+  this->CheckUnary("string_istitle_ascii",
+                   "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_Ǆ\", \"Φ\", \"Ǆ\"]",
+                   boolean(), "[true, null, true, true, true, false, false]");
+  this->CheckUnary(
+      "string_istitle_ascii",
+      "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsǄ\", \"ΦΦ\", \"ǆ\", \"_\"]",
+      boolean(), "[false, null, false, false, true, false, false, false]");
+}
+
+TYPED_TEST(TestStringKernels, IsUpperAscii) {
+  // ٣ is arabic 3 (decimal), Φ capital greek
+  this->CheckUnary("string_isupper_ascii",
+                   "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]", boolean(),
+                   "[false, null, false, true, true, false, false]");
+}
+
 TYPED_TEST(TestStringKernels, BinaryContainsExact) {
   BinaryContainsExactOptions options{"ab"};
   this->CheckUnary("binary_contains_exact", "[]", boolean(), "[]", &options);
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 1775b19d0fe..d5875c4590b 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -366,5 +366,24 @@ static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
   return true;
 }
 
+template <class UnaryPredicate>
+static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* result,
+                             UnaryPredicate&& predicate) {
+  const uint8_t* i = first;
+  while (i < last) {
+    uint32_t codepoint = 0;
+    if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+      return false;
+    }
+
+    if (!predicate(codepoint)) {
+      *result = false;
+      return true;
+    }
+  }
+  *result = true;
+  return true;
+}
+
 }  // namespace util
 }  // namespace arrow
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index ae7dae86d18..bfe484518ef 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -103,6 +103,28 @@ def func(left, right):
 utf8_upper = _simple_unary_function('utf8_upper')
 utf8_lower = _simple_unary_function('utf8_lower')
 
+binary_isascii = _simple_unary_function('binary_isascii')
+
+string_isalnum_ascii = _simple_unary_function('string_isalnum_ascii')
+string_isalnum_unicode = _simple_unary_function('string_isalnum_unicode')
+string_isalpha_ascii = _simple_unary_function('string_isalpha_ascii')
+string_isalpha_unicode = _simple_unary_function('string_isalpha_unicode')
+string_isascii = binary_isascii
+string_isdecimal_ascii = _simple_unary_function('string_isdecimal_ascii')
+string_isdecimal_unicode = _simple_unary_function('string_isdecimal_unicode')
+string_isdigit_unicode = _simple_unary_function('string_isdigit_unicode')
+string_isdigit_ascii = string_isdecimal_ascii # alias
+string_islower_unicode = _simple_unary_function('string_islower_unicode')
+string_islower_ascii = _simple_unary_function('string_islower_ascii')
+string_isnumeric_unicode = _simple_unary_function('string_isnumeric_unicode')
+string_isnumeric_ascii = string_isdecimal_ascii  # alias
+string_isprintable_unicode = _simple_unary_function('string_isprintable_unicode')
+string_isprintable_ascii = _simple_unary_function('string_isprintable_ascii')
+string_istitle_unicode = _simple_unary_function('string_istitle_unicode')
+string_istitle_ascii = _simple_unary_function('string_istitle_ascii')
+string_isupper_unicode = _simple_unary_function('string_isupper_unicode')
+string_isupper_ascii = _simple_unary_function('string_isupper_ascii')
+
 is_valid = _simple_unary_function('is_valid')
 is_null = _simple_unary_function('is_null')
 
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 00486d54149..b197517a91f 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -95,6 +95,50 @@ def test_binary_contains_exact():
     assert expected.equals(result)
 
 
+
+# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
+utf8proc_issue_isalpha =  {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xd04, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0x1cf2, 0x1cf3, 0x1cfa, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xab66, 0xab67, 0xab68, 0xab69, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87, 0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6, }
+# utf8proc claims these are upper case, they are not
+utf8proc_issue_isupper = {0xa7ba, 0xa7bc, 0xa7be, 0xa7c2, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c9, 0xa7f5, }
+# utf8proc misses quite a few, and does some false claims?
+utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x345, 0x37a, 0x1d2c, 0x1d2d, 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f, 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b, 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d, 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69, 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217a, 0x217b, 0x217c, 0x217d, 0x217e, 0x217f, 0x24d0, 0x24d1, 0x24d2, 0x24d3, 0x24d4, 0x24d5, 0x24d6, 0x24d7, 0x24d8, 0x24d9, 0x24da, 0x24db, 0x24dc, 0x24dd, 0x24de, 0x24df, 0x24e0, 0x24e1, 0x24e2, 0x24e3, 0x24e4, 0x24e5, 0x24e6, 0x24e7, 0x24e8, 0x24e9, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7bb, 0xa7bd, 0xa7bf, 0xa7c3, 0xa7c8, 0xa7ca, 0xa7f6, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, 0xab66, 0xab67, 0xab68, }
+# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
+utf8proc_issue_isprintable = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xb55, 0xc77, 0xd04, 0xd81, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0xeba, 0x1abf, 0x1ac0, 0x1cfa, 0x2b97, 0x2bc9, 0x2bff, 0x2e4f, 0x2e50, 0x2e51, 0x2e52, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x32ff, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xa82c, 0xab66, 0xab67, 0xab68, 0xab69, 0xab6a, 0xab6b, 0x1019c, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87, 0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eab, 0x10eac, 0x10ead, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6}
+# utf8proc does not store if a codepoint is numeric
+numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, }
+# utf8proc has no no digit information
+digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476, 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488, 0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f, 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9, 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786, 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e, 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68}
+
+codepoints_ignore = {
+    'isalnum': numeric_info_missing | digit_info_missing | utf8proc_issue_isalpha,
+    'isalpha': utf8proc_issue_isalpha,
+    'isdigit': digit_info_missing,
+    'isupper': utf8proc_issue_isupper,
+    'isprintable': utf8proc_issue_isprintable,
+    'isnumeric': numeric_info_missing,
+    'islower': utf8proc_issue_islower
+}
+
+@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isprintable', 'isspace', 'isupper',])
+@pytest.mark.parametrize('ascii', [False, True])
+def test_string_py_compat_boolean(function_name, ascii):
+    variant = 'ascii' if ascii else 'unicode'
+    arrow_name = f'string_{function_name}_{variant}'
+    py_name = function_name
+    for i in range(128 if ascii else 0x11000):
+        if i in range(0xD800, 0xE000):
+            continue  # bug? pyarrow doesn't allow utf16 surrogates
+        # the issues we know of, we skip
+        if i in codepoints_ignore.get(function_name, []):
+            continue
+        c = chr(i)
+        if hasattr(pc, arrow_name):
+            ar = pa.array([c])
+            cpython_value = getattr(c, py_name)()
+            arrow_value = getattr(pc, arrow_name)(ar)[0]
+            assert arrow_value == cpython_value
+
+
 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
 def test_take(ty, values):
     arr = pa.array(values, type=ty)

From 08e2898b1868f227f7c2450040f8099fde4dd09e Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Tue, 7 Jul 2020 14:20:14 +0200
Subject: [PATCH 02/28] lint python

---
 python/pyarrow/tests/test_compute.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index b197517a91f..0e40dc33cf4 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -95,19 +95,24 @@ def test_binary_contains_exact():
     assert expected.equals(result)
 
 
-
 # utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
-utf8proc_issue_isalpha =  {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xd04, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0x1cf2, 0x1cf3, 0x1cfa, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xab66, 0xab67, 0xab68, 0xab69, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87, 0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6, }
+utf8proc_issue_isalpha = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xd04, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0x1cf2, 0x1cf3, 0x1cfa, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xab66, 0xab67, 0xab68, 0xab69, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87, 0x10e88, 0x10e89,
+                          0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6, }
 # utf8proc claims these are upper case, they are not
-utf8proc_issue_isupper = {0xa7ba, 0xa7bc, 0xa7be, 0xa7c2, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c9, 0xa7f5, }
+utf8proc_issue_isupper = {0xa7ba, 0xa7bc, 0xa7be, 0xa7c2,
+                          0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c9, 0xa7f5, }
 # utf8proc misses quite a few, and does some false claims?
-utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x345, 0x37a, 0x1d2c, 0x1d2d, 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f, 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b, 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d, 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69, 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217a, 0x217b, 0x217c, 0x217d, 0x217e, 0x217f, 0x24d0, 0x24d1, 0x24d2, 0x24d3, 0x24d4, 0x24d5, 0x24d6, 0x24d7, 0x24d8, 0x24d9, 0x24da, 0x24db, 0x24dc, 0x24dd, 0x24de, 0x24df, 0x24e0, 0x24e1, 0x24e2, 0x24e3, 0x24e4, 0x24e5, 0x24e6, 0x24e7, 0x24e8, 0x24e9, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7bb, 0xa7bd, 0xa7bf, 0xa7c3, 0xa7c8, 0xa7ca, 0xa7f6, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, 0xab66, 0xab67, 0xab68, }
+utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x345, 0x37a, 0x1d2c, 0x1d2d, 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f, 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b, 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d, 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69, 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9,
+                          0x1daa, 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217a, 0x217b, 0x217c, 0x217d, 0x217e, 0x217f, 0x24d0, 0x24d1, 0x24d2, 0x24d3, 0x24d4, 0x24d5, 0x24d6, 0x24d7, 0x24d8, 0x24d9, 0x24da, 0x24db, 0x24dc, 0x24dd, 0x24de, 0x24df, 0x24e0, 0x24e1, 0x24e2, 0x24e3, 0x24e4, 0x24e5, 0x24e6, 0x24e7, 0x24e8, 0x24e9, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7bb, 0xa7bd, 0xa7bf, 0xa7c3, 0xa7c8, 0xa7ca, 0xa7f6, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, 0xab66, 0xab67, 0xab68, }
 # utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
-utf8proc_issue_isprintable = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xb55, 0xc77, 0xd04, 0xd81, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0xeba, 0x1abf, 0x1ac0, 0x1cfa, 0x2b97, 0x2bc9, 0x2bff, 0x2e4f, 0x2e50, 0x2e51, 0x2e52, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x32ff, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xa82c, 0xab66, 0xab67, 0xab68, 0xab69, 0xab6a, 0xab6b, 0x1019c, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87, 0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eab, 0x10eac, 0x10ead, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6}
+utf8proc_issue_isprintable = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xb55, 0xc77, 0xd04, 0xd81, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0xeba, 0x1abf, 0x1ac0, 0x1cfa, 0x2b97, 0x2bc9, 0x2bff, 0x2e4f, 0x2e50, 0x2e51, 0x2e52, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x32ff, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xa82c, 0xab66, 0xab67, 0xab68, 0xab69, 0xab6a, 0xab6b, 0x1019c, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87,
+                              0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eab, 0x10eac, 0x10ead, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6}
 # utf8proc does not store if a codepoint is numeric
-numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, }
+numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9,
+                        0x5e7a, 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, }
 # utf8proc has no no digit information
-digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476, 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488, 0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f, 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9, 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786, 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e, 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68}
+digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476, 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488, 0x2489, 0x248a, 0x248b, 0x248c, 0x248d,
+                      0x248e, 0x248f, 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9, 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786, 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e, 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68}
 
 codepoints_ignore = {
     'isalnum': numeric_info_missing | digit_info_missing | utf8proc_issue_isalpha,
@@ -119,7 +124,8 @@ def test_binary_contains_exact():
     'islower': utf8proc_issue_islower
 }
 
-@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isprintable', 'isspace', 'isupper',])
+
+@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isprintable', 'isspace', 'isupper', ])
 @pytest.mark.parametrize('ascii', [False, True])
 def test_string_py_compat_boolean(function_name, ascii):
     variant = 'ascii' if ascii else 'unicode'

From c85837a5ad0e41aa77ab12021f25a947343d1ada Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Tue, 7 Jul 2020 14:20:36 +0200
Subject: [PATCH 03/28] convert arrow scalar to python value

---
 python/pyarrow/tests/test_compute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 0e40dc33cf4..0b578bc9abe 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -141,7 +141,7 @@ def test_string_py_compat_boolean(function_name, ascii):
         if hasattr(pc, arrow_name):
             ar = pa.array([c])
             cpython_value = getattr(c, py_name)()
-            arrow_value = getattr(pc, arrow_name)(ar)[0]
+            arrow_value = getattr(pc, arrow_name)(ar)[0].as_py()
             assert arrow_value == cpython_value
 
 

From afd2207423937a873451b89e900452374cff9877 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Tue, 7 Jul 2020 14:23:33 +0200
Subject: [PATCH 04/28] move utf8proc code to ifdef block

---
 .../arrow/compute/kernels/scalar_string.cc    | 56 ++++++++++---------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index d44c0eae217..c740ed0a904 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -462,6 +462,8 @@ void AddBinaryContainsExact(FunctionRegistry* registry) {
 
 // IsAlpha/Digit etc
 
+#ifdef ARROW_WITH_UTF8PROC
+
 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
   uint32_t general_category = 1 << utf8proc_category(codepoint);
   // for e.g. undefined (but valid) codepoints, general_category == 0
@@ -519,31 +521,10 @@ static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
                                       UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO);
 }
 
-static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
-  return (ascii_character >= 'a') && (ascii_character <= 'z');
-}
-
-static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
-  return (ascii_character >= 'A') && (ascii_character <= 'Z');
-}
-
-static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
-  return IsLowerCaseCharacterAscii(ascii_character) ||
-         IsUpperCaseCharacterAscii(ascii_character);
-}
-
-static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
-  return IsCasedCharacterAscii(ascii_character);  // same
-}
-
 static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
 }
 
-static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
-  return ((ascii_character >= '0') && (ascii_character <= '9'));
-}
-
 static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
   // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
   // utf8proc has no support for this, this is the best we can do:
@@ -567,11 +548,6 @@ static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) {
          property->bidi_class == UTF8PROC_BIDI_CLASS_S;
 }
 
-static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
-  return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) ||
-         (ascii_character == ' ');
-}
-
 static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
   uint32_t general_category = utf8proc_category(codepoint);
   return (general_category != 0) &&
@@ -581,6 +557,34 @@ static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
                                        UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP);
 }
 
+#endif
+
+static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
+  return (ascii_character >= 'a') && (ascii_character <= 'z');
+}
+
+static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
+  return (ascii_character >= 'A') && (ascii_character <= 'Z');
+}
+
+static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
+  return IsLowerCaseCharacterAscii(ascii_character) ||
+         IsUpperCaseCharacterAscii(ascii_character);
+}
+
+static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
+  return IsCasedCharacterAscii(ascii_character);  // same
+}
+
+static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
+  return ((ascii_character >= '0') && (ascii_character <= '9'));
+}
+
+static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
+  return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) ||
+         (ascii_character == ' ');
+}
+
 static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= ' ') && (ascii_character <= '~'));
 }

From ad1c1acad641978a20da0cf04f630fe5ccbd8202 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Tue, 7 Jul 2020 14:47:00 +0200
Subject: [PATCH 05/28] remove unused variable

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index c740ed0a904..3ca0c80b592 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -208,7 +208,6 @@ struct BinaryToBoolean {
       for (int64_t i = 0; i < input_nstrings; i++) {
         offset_type input_string_ncodeunits;
         const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
-        offset_type encoded_nbytes;
         bool boolean_result =
             Derived::Predicate(ctx, input_string, input_string_ncodeunits);
         if (!ctx->status().ok()) {

From a1f99353d14fba0789c005c5f543349ed37afec3 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 11:33:37 +0200
Subject: [PATCH 06/28] use UTF8PROC_CATEGORY_CN instead of 0

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 3ca0c80b592..a42e1da3301 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -465,8 +465,9 @@ void AddBinaryContainsExact(FunctionRegistry* registry) {
 
 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
   uint32_t general_category = 1 << utf8proc_category(codepoint);
-  // for e.g. undefined (but valid) codepoints, general_category == 0
-  return (general_category != 0) && ((general_category & mask) != 0);
+  // for e.g. undefined (but valid) codepoints, general_category == 0 ==
+  // UTF8PROC_CATEGORY_CN
+  return (general_category != UTF8PROC_CATEGORY_CN) && ((general_category & mask) != 0);
 }
 
 template <typename... Categories>
@@ -549,7 +550,7 @@ static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) {
 
 static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
   uint32_t general_category = utf8proc_category(codepoint);
-  return (general_category != 0) &&
+  return (general_category != UTF8PROC_CATEGORY_CN) &&
          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC,
                                        UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS,
                                        UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS,

From ae9a111aecb3fb6b89146dc54457abafb850959a Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 11:34:36 +0200
Subject: [PATCH 07/28] better version of islower/isupper using trick and more
 tests

---
 .../arrow/compute/kernels/scalar_string.cc    | 34 ++++++++-----------
 .../compute/kernels/scalar_string_test.cc     | 22 +++++++++---
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index a42e1da3301..6f6798191b2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -485,34 +485,30 @@ static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint,
                                       categories...);
 }
 
-static inline bool IsUpperCaseCharacterRoman(uint32_t codepoint) {
-  // Roman letter Ⅰ to Ⅿ are seen as capital (see 4.2 of Unicode spec)
-  // DerivedCoreProperties.txt should have this information, but it is not stored in
-  // the utf8proc library.
-  return (codepoint >= 0x2160) && (codepoint <= 0x216f);
-}
-
-static inline bool IsUpperCaseCharacterCircled(uint32_t codepoint) {
-  // Circled letters Ⓐ-Ⓩ are seen as capital (see 4.2 of Unicode spec)
-  // DerivedCoreProperties.txt should have this information, but it is not stored in
-  // the utf8proc library.
-  return (codepoint >= 0x24b6) && (codepoint <= 0x24cf);
-}
-
 static inline bool IsCasedCharacterUnicode(uint32_t codepoint) {
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
                                       UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) ||
-         IsUpperCaseCharacterRoman(codepoint) || IsUpperCaseCharacterCircled(codepoint);
-  ;
+         ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) ||
+          (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint));
 }
 
 static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) {
-  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL);
+  // although this trick seems to work for upper case, this is not enough for lower case
+  // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the
+  // best we can do
+  return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) ||
+          ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) &&
+           (static_cast<uint32_t>(utf8proc_tolower(codepoint)) == codepoint))) &&
+         !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
 }
 
 static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) {
-  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) ||
-         IsUpperCaseCharacterRoman(codepoint) || IsUpperCaseCharacterCircled(codepoint);
+  // this seems to be a good workaround for utf8proc not having case information
+  // https://github.com/JuliaStrings/utf8proc/issues/195
+  return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) ||
+          ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) == codepoint) &&
+           (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint))) &&
+         !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
 }
 
 static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 67f24d1ef07..c65185f45c6 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -183,6 +183,9 @@ TYPED_TEST(TestStringKernels, IsLowerUnicode) {
   this->CheckUnary("string_islower_unicode",
                    "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\"]", boolean(),
                    "[false, null, true, false, true, false, false]");
+  // lower case character utf8proc does not know about
+  // this->CheckUnary("string_islower_unicode", "[\"ª\", \"ₕ\"]", boolean(), "[true,
+  // true]");
 }
 
 TYPED_TEST(TestStringKernels, IsPrintableUnicode) {
@@ -216,11 +219,20 @@ TYPED_TEST(TestStringKernels, IsTitleUnicode) {
 
 TYPED_TEST(TestStringKernels, IsUpperUnicode) {
   // ٣ is arabic 3 (decimal), Φ capital
-  // Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ
-  this->CheckUnary(
-      "string_isupper_unicode",
-      "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\", \"Ⓐ\", \"Ⓩ\"]",
-      boolean(), "[false, null, false, true, true, true, false, true, true, true, true]");
+  this->CheckUnary("string_isupper_unicode",
+                   "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]",
+                   boolean(),
+                   "[false, null, false, true, true, true, false, true, true]");
+  // * Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ
+  // * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no direct lower
+  // case
+  // * U+1F88 - ᾈ - \E1\xBE\x88 - Greek Capital Letter Alpha with Psili and Prosgegrammeni
+  // - title case
+  // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A -  new in unicode 13
+  // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
+  this->CheckUnary("string_isupper_unicode",
+                   "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"Ꞻ\", \"ꞻ\"]", boolean(),
+                   "[true, true, true, false, true, false]");
 }
 
 #endif  // ARROW_WITH_UTF8PROC

From bdb040cb49531ce2f93607737b04078659035132 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 11:35:11 +0200
Subject: [PATCH 08/28] put tests in place that document what we do not support

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index c65185f45c6..7b39a2c1a03 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -171,11 +171,22 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
                    "[true, null, true, false, false, false]");
 }
 
+TYPED_TEST(TestStringKernels, IsDigitUnicode) {
+  // These are digits according to Python, but we don't have the information in
+  // utf8proc for this
+  // this->CheckUnary("string_isdigit_unicode", "[\"²\", \"①\"]", boolean(), "[true,
+  // true]");
+}
+
 TYPED_TEST(TestStringKernels, IsNumericUnicode) {
   // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
   this->CheckUnary("string_isnumeric_unicode",
                    "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(),
                    "[true, null, true, true, false, false]");
+  // These are numerical according to Python, but we don't have the information in
+  // utf8proc for this
+  // this->CheckUnary("string_isnumeric_unicode", "[\"㐅\", \"卌\"]", boolean(),
+  //                  "[true, null, true, true, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsLowerUnicode) {

From 3c59966ca00d2ce415c3566f477bcb3ae49a9441 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 11:47:09 +0200
Subject: [PATCH 09/28] make python tests more robust by ignoring undefined
 codepoints

---
 python/pyarrow/tests/test_compute.py | 120 +++++++++++++++++++++------
 1 file changed, 94 insertions(+), 26 deletions(-)

diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 0b578bc9abe..63dab14120e 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -95,31 +95,100 @@ def test_binary_contains_exact():
     assert expected.equals(result)
 
 
-# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
-utf8proc_issue_isalpha = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xd04, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0x1cf2, 0x1cf3, 0x1cfa, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xab66, 0xab67, 0xab68, 0xab69, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87, 0x10e88, 0x10e89,
-                          0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6, }
-# utf8proc claims these are upper case, they are not
-utf8proc_issue_isupper = {0xa7ba, 0xa7bc, 0xa7be, 0xa7c2,
-                          0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c9, 0xa7f5, }
-# utf8proc misses quite a few, and does some false claims?
-utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x345, 0x37a, 0x1d2c, 0x1d2d, 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f, 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b, 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d, 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69, 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9,
-                          0x1daa, 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217a, 0x217b, 0x217c, 0x217d, 0x217e, 0x217f, 0x24d0, 0x24d1, 0x24d2, 0x24d3, 0x24d4, 0x24d5, 0x24d6, 0x24d7, 0x24d8, 0x24d9, 0x24da, 0x24db, 0x24dc, 0x24dd, 0x24de, 0x24df, 0x24e0, 0x24e1, 0x24e2, 0x24e3, 0x24e4, 0x24e5, 0x24e6, 0x24e7, 0x24e8, 0x24e9, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7bb, 0xa7bd, 0xa7bf, 0xa7c3, 0xa7c8, 0xa7ca, 0xa7f6, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, 0xab66, 0xab67, 0xab68, }
-# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
-utf8proc_issue_isprintable = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 0x8c5, 0x8c6, 0x8c7, 0xb55, 0xc77, 0xd04, 0xd81, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0xeba, 0x1abf, 0x1ac0, 0x1cfa, 0x2b97, 0x2bc9, 0x2bff, 0x2e4f, 0x2e50, 0x2e51, 0x2e52, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x32ff, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xa82c, 0xab66, 0xab67, 0xab68, 0xab69, 0xab6a, 0xab6b, 0x1019c, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87,
-                              0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eab, 0x10eac, 0x10ead, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6}
+# We use isprintable to find about codepoints that Python doesn't know, but
+# utfproc does (or in future version of Python the other way around).
+# These codepoints cannot be compared between Arrow and the Python
+# implementation.
+def _find_new_unicode_codepoints():
+    new = set()
+    for i in range(0x11000):
+        c = chr(i)
+        if i in range(0xD800, 0xE000):
+            continue  # bug? pyarrow doesn't allow utf16 surrogates
+        ar = pa.array([c])
+        if pc.string_isprintable_unicode(ar)[0].as_py() != c.isprintable():
+            new.add(i)
+    return new
+
+
+new_unicode_codepoints = _find_new_unicode_codepoints()
+
+# Python claims there are not alpha, not sure why, they are in
+#  gc='Other Letter': https://graphemica.com/%E1%B3%B2
+unknown_issue_isalpha = {0x1cf2, 0x1cf3}
+# utf8proc does not know if codepoints are lower case
+utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4,
+                          0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0,
+                          0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d,
+                          0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33,
+                          0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39,
+                          0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f,
+                          0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45,
+                          0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b,
+                          0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51,
+                          0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57,
+                          0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d,
+                          0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63,
+                          0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69,
+                          0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e,
+                          0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4,
+                          0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa,
+                          0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0,
+                          0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6,
+                          0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc,
+                          0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090,
+                          0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096,
+                          0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c,
+                          0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8,
+                          0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, }
 # utf8proc does not store if a codepoint is numeric
-numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9,
-                        0x5e7a, 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, }
-# utf8proc has no no digit information
-digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476, 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488, 0x2489, 0x248a, 0x248b, 0x248c, 0x248d,
-                      0x248e, 0x248f, 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9, 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786, 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e, 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68}
+numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
+                        0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
+                        0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
+                        0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
+                        0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
+                        0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
+                        0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
+                        0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
+                        0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
+                        0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
+                        0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5,
+                        0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca,
+                        0x10fcb, }
+# utf8proc has no no digit/numeric information
+digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c,
+                      0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070,
+                      0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080,
+                      0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087,
+                      0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464,
+                      0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476,
+                      0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488,
+                      0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f,
+                      0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9,
+                      0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777,
+                      0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e,
+                      0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786,
+                      0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e,
+                      0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41,
+                      0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63,
+                      0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, }
+numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
+                        0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
+                        0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
+                        0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
+                        0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
+                        0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
+                        0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
+                        0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
+                        0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
+                        0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
+                        0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, }
 
 codepoints_ignore = {
-    'isalnum': numeric_info_missing | digit_info_missing | utf8proc_issue_isalpha,
-    'isalpha': utf8proc_issue_isalpha,
+    'isalnum': numeric_info_missing | digit_info_missing |
+    unknown_issue_isalpha,
+    'isalpha': unknown_issue_isalpha,
     'isdigit': digit_info_missing,
-    'isupper': utf8proc_issue_isupper,
-    'isprintable': utf8proc_issue_isprintable,
     'isnumeric': numeric_info_missing,
     'islower': utf8proc_issue_islower
 }
@@ -129,20 +198,19 @@ def test_binary_contains_exact():
 @pytest.mark.parametrize('ascii', [False, True])
 def test_string_py_compat_boolean(function_name, ascii):
     variant = 'ascii' if ascii else 'unicode'
-    arrow_name = f'string_{function_name}_{variant}'
+    arrow_name = 'string_%s_%s' % (function_name, variant)
     py_name = function_name
     for i in range(128 if ascii else 0x11000):
         if i in range(0xD800, 0xE000):
             continue  # bug? pyarrow doesn't allow utf16 surrogates
         # the issues we know of, we skip
-        if i in codepoints_ignore.get(function_name, []):
+        if i in codepoints_ignore.get(function_name, []) or i in new_unicode_codepoints:
             continue
         c = chr(i)
         if hasattr(pc, arrow_name):
             ar = pa.array([c])
-            cpython_value = getattr(c, py_name)()
-            arrow_value = getattr(pc, arrow_name)(ar)[0].as_py()
-            assert arrow_value == cpython_value
+            assert getattr(pc, arrow_name)(
+                ar)[0].as_py() == getattr(c, py_name)()
 
 
 @pytest.mark.parametrize(('ty', 'values'), all_array_types)

From e01cf196622c7023a1af269eb76834a97508fe10 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 12:02:15 +0200
Subject: [PATCH 10/28] rename string_func_unicode -> utf8_func

---
 .../arrow/compute/kernels/scalar_string.cc    | 36 ++++----
 .../kernels/scalar_string_benchmark.cc        |  4 +-
 .../compute/kernels/scalar_string_test.cc     | 90 +++++++++----------
 python/pyarrow/compute.py                     | 37 ++++----
 python/pyarrow/tests/test_compute.py          | 20 +++--
 5 files changed, 90 insertions(+), 97 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 6f6798191b2..e6406d21cf4 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -955,29 +955,29 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
 
   AddUnaryString<IsAscii>("binary_isascii", registry);
 
-  AddUnaryString<IsAlphaNumericAscii>("string_isalnum_ascii", registry);
-  AddUnaryString<IsAlphaAscii>("string_isalpha_ascii", registry);
-  AddUnaryString<IsDecimalAscii>("string_isdecimal_ascii", registry);
+  AddUnaryString<IsAlphaNumericAscii>("ascii_isalnum", registry);
+  AddUnaryString<IsAlphaAscii>("ascii_isalpha", registry);
+  AddUnaryString<IsDecimalAscii>("ascii_isdecimal", registry);
   // no isdigic for ascii, since it is the same as isdecimal
-  AddUnaryString<IsLowerAscii>("string_islower_ascii", registry);
+  AddUnaryString<IsLowerAscii>("ascii_islower", registry);
   // no isnumeric for ascii, since it is the same as isdecimal
-  AddUnaryString<IsPrintableAscii>("string_isprintable_ascii", registry);
-  AddUnaryString<IsSpaceAscii>("string_isspace_ascii", registry);
-  AddUnaryString<IsTitleAscii>("string_istitle_ascii", registry);
-  AddUnaryString<IsUpperAscii>("string_isupper_ascii", registry);
+  AddUnaryString<IsPrintableAscii>("ascii_isprintable", registry);
+  AddUnaryString<IsSpaceAscii>("ascii_isspace", registry);
+  AddUnaryString<IsTitleAscii>("ascii_istitle", registry);
+  AddUnaryString<IsUpperAscii>("ascii_isupper", registry);
 #ifdef ARROW_WITH_UTF8PROC
   MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry);
   MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry);
-  AddUnaryString<IsAlphaNumericUnicode>("string_isalnum_unicode", registry);
-  AddUnaryString<IsAlphaUnicode>("string_isalpha_unicode", registry);
-  AddUnaryString<IsDecimalUnicode>("string_isdecimal_unicode", registry);
-  AddUnaryString<IsDigitUnicode>("string_isdigit_unicode", registry);
-  AddUnaryString<IsLowerUnicode>("string_islower_unicode", registry);
-  AddUnaryString<IsNumericUnicode>("string_isnumeric_unicode", registry);
-  AddUnaryString<IsPrintableUnicode>("string_isprintable_unicode", registry);
-  AddUnaryString<IsSpaceUnicode>("string_isspace_unicode", registry);
-  AddUnaryString<IsTitleUnicode>("string_istitle_unicode", registry);
-  AddUnaryString<IsUpperUnicode>("string_isupper_unicode", registry);
+  AddUnaryString<IsAlphaNumericUnicode>("utf8_isalnum", registry);
+  AddUnaryString<IsAlphaUnicode>("utf8_isalpha", registry);
+  AddUnaryString<IsDecimalUnicode>("utf8_isdecimal", registry);
+  AddUnaryString<IsDigitUnicode>("utf8_isdigit", registry);
+  AddUnaryString<IsLowerUnicode>("utf8_islower", registry);
+  AddUnaryString<IsNumericUnicode>("utf8_isnumeric", registry);
+  AddUnaryString<IsPrintableUnicode>("utf8_isprintable", registry);
+  AddUnaryString<IsSpaceUnicode>("utf8_isspace", registry);
+  AddUnaryString<IsTitleUnicode>("utf8_istitle", registry);
+  AddUnaryString<IsUpperUnicode>("utf8_isupper", registry);
 
 #endif
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
index e65d2dca2be..67fade71532 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
@@ -58,7 +58,7 @@ static void AsciiUpper(benchmark::State& state) {
 }
 
 static void IsAlphaAscii(benchmark::State& state) {
-  UnaryStringBenchmark(state, "string_isalpha_ascii");
+  UnaryStringBenchmark(state, "ascii_isalpha");
 }
 
 static void BinaryContainsExact(benchmark::State& state) {
@@ -76,7 +76,7 @@ static void Utf8Lower(benchmark::State& state) {
 }
 
 static void IsAlphaUnicode(benchmark::State& state) {
-  UnaryStringBenchmark(state, "string_isalpha_unicode");
+  UnaryStringBenchmark(state, "utf8_isalpha");
 }
 #endif
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 7b39a2c1a03..417f403bcd8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -148,15 +148,15 @@ TYPED_TEST(TestStringKernels, Utf8Lower) {
 TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
   // U+08BE (utf8: 	\xE0\xA2\xBE) is undefined, but utf8proc things it is
   // UTF8PROC_CATEGORY_LO
-  this->CheckUnary("string_isalnum_unicode", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
-                   boolean(), "[true, null, true, false, false]");
+  this->CheckUnary("utf8_isalnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
+                   "[true, null, true, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsAlphaUnicode) {
   // U+08BE (utf8: 	\xE0\xA2\xBE) is undefined, but utf8proc things it is
   // UTF8PROC_CATEGORY_LO
-  this->CheckUnary("string_isalpha_unicode", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]",
-                   boolean(), "[true, null, false, false, false]");
+  this->CheckUnary("utf8_isalpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
+                   "[true, null, false, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsAscii) {
@@ -166,36 +166,33 @@ TYPED_TEST(TestStringKernels, IsAscii) {
 
 TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
   // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
-  this->CheckUnary("string_isdecimal_unicode",
-                   "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(),
-                   "[true, null, true, false, false, false]");
+  this->CheckUnary("utf8_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
+                   boolean(), "[true, null, true, false, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsDigitUnicode) {
   // These are digits according to Python, but we don't have the information in
   // utf8proc for this
-  // this->CheckUnary("string_isdigit_unicode", "[\"²\", \"①\"]", boolean(), "[true,
+  // this->CheckUnary("utf8_isdigit", "[\"²\", \"①\"]", boolean(), "[true,
   // true]");
 }
 
 TYPED_TEST(TestStringKernels, IsNumericUnicode) {
   // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
-  this->CheckUnary("string_isnumeric_unicode",
-                   "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(),
-                   "[true, null, true, true, false, false]");
+  this->CheckUnary("utf8_isnumeric", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
+                   boolean(), "[true, null, true, true, false, false]");
   // These are numerical according to Python, but we don't have the information in
   // utf8proc for this
-  // this->CheckUnary("string_isnumeric_unicode", "[\"㐅\", \"卌\"]", boolean(),
+  // this->CheckUnary("utf8_isnumeric", "[\"㐅\", \"卌\"]", boolean(),
   //                  "[true, null, true, true, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsLowerUnicode) {
   // ٣ is arabic 3 (decimal), Φ capital
-  this->CheckUnary("string_islower_unicode",
-                   "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\"]", boolean(),
-                   "[false, null, true, false, true, false, false]");
+  this->CheckUnary("utf8_islower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\"]",
+                   boolean(), "[false, null, true, false, true, false, false]");
   // lower case character utf8proc does not know about
-  // this->CheckUnary("string_islower_unicode", "[\"ª\", \"ₕ\"]", boolean(), "[true,
+  // this->CheckUnary("utf8_islower", "[\"ª\", \"ₕ\"]", boolean(), "[true,
   // true]");
 }
 
@@ -203,37 +200,35 @@ TYPED_TEST(TestStringKernels, IsPrintableUnicode) {
   // U+2008 (utf8: \xe2\x80\x88) is punctuaction space, it is NOT printable
   // U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category
   this->CheckUnary(
-      "string_isprintable_unicode",
+      "utf8_isprintable",
       "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]", boolean(),
       "[true, null, false, true, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsSpaceUnicode) {
   // U+2008 (utf8: \xe2\x80\x88) is punctuaction space
-  this->CheckUnary("string_isspace_unicode", "[\" \", null, \"  \", \"\\t\\r\"]",
-                   boolean(), "[true, null, true, true]");
-  this->CheckUnary("string_isspace_unicode",
-                   "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", boolean(),
-                   "[false, null, false, false, true]");
+  this->CheckUnary("utf8_isspace", "[\" \", null, \"  \", \"\\t\\r\"]", boolean(),
+                   "[true, null, true, true]");
+  this->CheckUnary("utf8_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
+                   boolean(), "[false, null, false, false, true]");
 }
 
 TYPED_TEST(TestStringKernels, IsTitleUnicode) {
   // ٣ is arabic 3 (decimal), Φ capital
-  this->CheckUnary("string_istitle_unicode",
+  this->CheckUnary("utf8_istitle",
                    "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_Ǆ\", \"Φ\", \"Ǆ\"]",
                    boolean(), "[true, null, true, true, true, true, true]");
   this->CheckUnary(
-      "string_istitle_unicode",
+      "utf8_istitle",
       "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsǄ\", \"ΦΦ\", \"ǆ\", \"_\"]",
       boolean(), "[false, null, false, false, false, false, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsUpperUnicode) {
   // ٣ is arabic 3 (decimal), Φ capital
-  this->CheckUnary("string_isupper_unicode",
-                   "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]",
-                   boolean(),
-                   "[false, null, false, true, true, true, false, true, true]");
+  this->CheckUnary(
+      "utf8_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]",
+      boolean(), "[false, null, false, true, true, true, false, true, true]");
   // * Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ
   // * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no direct lower
   // case
@@ -241,7 +236,7 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
   // - title case
   // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A -  new in unicode 13
   // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
-  this->CheckUnary("string_isupper_unicode",
+  this->CheckUnary("utf8_isupper",
                    "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"Ꞻ\", \"ꞻ\"]", boolean(),
                    "[true, true, true, false, true, false]");
 }
@@ -249,34 +244,31 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
 #endif  // ARROW_WITH_UTF8PROC
 
 TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {
-  this->CheckUnary("string_isalnum_ascii", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
+  this->CheckUnary("ascii_isalnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
                    boolean(), "[false, null, false, false, false]");
-  this->CheckUnary("string_isalnum_ascii",
-                   "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]", boolean(),
-                   "[true, null, true, true, true, false]");
+  this->CheckUnary("ascii_isalnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]",
+                   boolean(), "[true, null, true, true, true, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsAlphaAscii) {
-  this->CheckUnary("string_isalpha_ascii",
-                   "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]", boolean(),
-                   "[false, true, null, false, false, false]");
+  this->CheckUnary("ascii_isalpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]",
+                   boolean(), "[false, true, null, false, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsDecimalAscii) {
   // ٣ is arabic 3
-  this->CheckUnary("string_isdecimal_ascii", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
+  this->CheckUnary("ascii_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
                    boolean(), "[true, null, false, false, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsLowerAscii) {
   // ٣ is arabic 3 (decimal), φ lower greek
-  this->CheckUnary("string_islower_ascii",
-                   "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]", boolean(),
-                   "[false, null, true, false, true, false, false]");
+  this->CheckUnary("ascii_islower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]",
+                   boolean(), "[false, null, true, false, true, false, false]");
 }
 TYPED_TEST(TestStringKernels, IsPrintableAscii) {
   // \xe2\x80\x88 is punctuaction space
-  this->CheckUnary("string_isprintable_ascii",
+  this->CheckUnary("ascii_isprintable",
                    "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]", boolean(),
                    "[true, null, false, true, false]");
 }
@@ -284,30 +276,28 @@ TYPED_TEST(TestStringKernels, IsPrintableAscii) {
 TYPED_TEST(TestStringKernels, IsSpaceAscii) {
   // \xe2\x80\x88 is punctuaction space
   // Note: for ascii version, the non-ascii chars are seen as non-cased
-  this->CheckUnary("string_isspace_ascii", "[\" \", null, \"  \", \"\\t\\r\"]", boolean(),
+  this->CheckUnary("ascii_isspace", "[\" \", null, \"  \", \"\\t\\r\"]", boolean(),
                    "[true, null, true, true]");
-  this->CheckUnary("string_isspace_ascii",
-                   "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", boolean(),
-                   "[false, null, false, false, false]");
+  this->CheckUnary("ascii_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
+                   boolean(), "[false, null, false, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsTitleAscii) {
   // ٣ is arabic 3 (decimal), Φ capital
   // Note: for ascii version, the non-ascii chars are seen as non-cased
-  this->CheckUnary("string_istitle_ascii",
+  this->CheckUnary("ascii_istitle",
                    "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_Ǆ\", \"Φ\", \"Ǆ\"]",
                    boolean(), "[true, null, true, true, true, false, false]");
   this->CheckUnary(
-      "string_istitle_ascii",
+      "ascii_istitle",
       "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsǄ\", \"ΦΦ\", \"ǆ\", \"_\"]",
       boolean(), "[false, null, false, false, true, false, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, IsUpperAscii) {
   // ٣ is arabic 3 (decimal), Φ capital greek
-  this->CheckUnary("string_isupper_ascii",
-                   "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]", boolean(),
-                   "[false, null, false, true, true, false, false]");
+  this->CheckUnary("ascii_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]",
+                   boolean(), "[false, null, false, true, true, false, false]");
 }
 
 TYPED_TEST(TestStringKernels, BinaryContainsExact) {
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index bfe484518ef..165895af7fb 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -105,25 +105,24 @@ def func(left, right):
 
 binary_isascii = _simple_unary_function('binary_isascii')
 
-string_isalnum_ascii = _simple_unary_function('string_isalnum_ascii')
-string_isalnum_unicode = _simple_unary_function('string_isalnum_unicode')
-string_isalpha_ascii = _simple_unary_function('string_isalpha_ascii')
-string_isalpha_unicode = _simple_unary_function('string_isalpha_unicode')
-string_isascii = binary_isascii
-string_isdecimal_ascii = _simple_unary_function('string_isdecimal_ascii')
-string_isdecimal_unicode = _simple_unary_function('string_isdecimal_unicode')
-string_isdigit_unicode = _simple_unary_function('string_isdigit_unicode')
-string_isdigit_ascii = string_isdecimal_ascii # alias
-string_islower_unicode = _simple_unary_function('string_islower_unicode')
-string_islower_ascii = _simple_unary_function('string_islower_ascii')
-string_isnumeric_unicode = _simple_unary_function('string_isnumeric_unicode')
-string_isnumeric_ascii = string_isdecimal_ascii  # alias
-string_isprintable_unicode = _simple_unary_function('string_isprintable_unicode')
-string_isprintable_ascii = _simple_unary_function('string_isprintable_ascii')
-string_istitle_unicode = _simple_unary_function('string_istitle_unicode')
-string_istitle_ascii = _simple_unary_function('string_istitle_ascii')
-string_isupper_unicode = _simple_unary_function('string_isupper_unicode')
-string_isupper_ascii = _simple_unary_function('string_isupper_ascii')
+ascii_isalnum = _simple_unary_function('ascii_isalnum')
+utf8_isalnum = _simple_unary_function('utf8_isalnum')
+ascii_isalpha = _simple_unary_function('ascii_isalpha')
+utf8_isalpha = _simple_unary_function('utf8_isalpha')
+ascii_isdecimal = _simple_unary_function('ascii_isdecimal')
+utf8_isdecimal = _simple_unary_function('utf8_isdecimal')
+ascii_isdigit = ascii_isdecimal  # alias
+utf8_isdigit = _simple_unary_function('utf8_isdigit')
+ascii_islower = _simple_unary_function('ascii_islower')
+utf8_islower = _simple_unary_function('utf8_islower')
+ascii_isnumeric = ascii_isdecimal  # alias
+utf8_isnumeric = _simple_unary_function('utf8_isnumeric')
+ascii_isprintable = _simple_unary_function('ascii_isprintable')
+utf8_isprintable = _simple_unary_function('utf8_isprintable')
+ascii_istitle = _simple_unary_function('ascii_istitle')
+utf8_istitle = _simple_unary_function('utf8_istitle')
+ascii_isupper = _simple_unary_function('ascii_isupper')
+utf8_isupper = _simple_unary_function('utf8_isupper')
 
 is_valid = _simple_unary_function('is_valid')
 is_null = _simple_unary_function('is_null')
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 63dab14120e..52107f70dfa 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -101,12 +101,12 @@ def test_binary_contains_exact():
 # implementation.
 def _find_new_unicode_codepoints():
     new = set()
-    for i in range(0x11000):
+    for i in range(0x80, 0x11000):
         c = chr(i)
         if i in range(0xD800, 0xE000):
             continue  # bug? pyarrow doesn't allow utf16 surrogates
         ar = pa.array([c])
-        if pc.string_isprintable_unicode(ar)[0].as_py() != c.isprintable():
+        if pc.utf8_isprintable(ar)[0].as_py() != c.isprintable():
             new.add(i)
     return new
 
@@ -194,17 +194,21 @@ def _find_new_unicode_codepoints():
 }
 
 
-@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isprintable', 'isspace', 'isupper', ])
-@pytest.mark.parametrize('ascii', [False, True])
-def test_string_py_compat_boolean(function_name, ascii):
-    variant = 'ascii' if ascii else 'unicode'
-    arrow_name = 'string_%s_%s' % (function_name, variant)
+@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii',
+                                           'isdecimal', 'isdigit', 'islower',
+                                           'isnumeric', 'isprintable',
+                                           'isspace', 'isupper', ])
+@pytest.mark.parametrize('variant', ['ascii', 'utf8'])
+def test_string_py_compat_boolean(function_name, variant):
+    arrow_name = variant + "_" + function_name
     py_name = function_name
+    ignore = codepoints_ignore.get(function_name, set()) |\
+        new_unicode_codepoints
     for i in range(128 if ascii else 0x11000):
         if i in range(0xD800, 0xE000):
             continue  # bug? pyarrow doesn't allow utf16 surrogates
         # the issues we know of, we skip
-        if i in codepoints_ignore.get(function_name, []) or i in new_unicode_codepoints:
+        if i in ignore:
             continue
         c = chr(i)
         if hasattr(pc, arrow_name):

From 674aa38474f2f3566f2df8cd4a75d405555e79c4 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 12:08:02 +0200
Subject: [PATCH 11/28] more tests

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 417f403bcd8..6bd245de76d 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -189,8 +189,11 @@ TYPED_TEST(TestStringKernels, IsNumericUnicode) {
 
 TYPED_TEST(TestStringKernels, IsLowerUnicode) {
   // ٣ is arabic 3 (decimal), Φ capital
-  this->CheckUnary("utf8_islower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\"]",
-                   boolean(), "[false, null, true, false, true, false, false]");
+  this->CheckUnary("utf8_islower",
+                   "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\", \"with space\", "
+                   "\"With space\"]",
+                   boolean(),
+                   "[false, null, true, false, true, false, false, true, false]");
   // lower case character utf8proc does not know about
   // this->CheckUnary("utf8_islower", "[\"ª\", \"ₕ\"]", boolean(), "[true,
   // true]");
@@ -244,8 +247,9 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
 #endif  // ARROW_WITH_UTF8PROC
 
 TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {
-  this->CheckUnary("ascii_isalnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
-                   boolean(), "[false, null, false, false, false]");
+  this->CheckUnary("ascii_isalnum",
+                   "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\", \"a space\", \"1 space\"]",
+                   boolean(), "[false, null, false, false, false, false, false]");
   this->CheckUnary("ascii_isalnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]",
                    boolean(), "[true, null, true, true, true, false]");
 }

From 35091f552a1ac7d79cc7e2559551d74566350e17 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 12:21:43 +0200
Subject: [PATCH 12/28] some compilers look at unused templates

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index e6406d21cf4..27f88c787ab 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -635,6 +635,7 @@ struct CharacterPredicateAscii
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsAlphaNumericUnicode
     : CharacterPredicateUnicode<StringType, IsAlphaNumericUnicode<StringType>> {
@@ -643,6 +644,7 @@ struct IsAlphaNumericUnicode
            IsNumericCharacterUnicode(codepoint) || IsDigitCharacterUnicode(codepoint);
   }
 };
+#endif
 
 template <typename StringType>
 struct IsAlphaNumericAscii
@@ -652,6 +654,7 @@ struct IsAlphaNumericAscii
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsAlphaUnicode
     : CharacterPredicateUnicode<StringType, IsAlphaUnicode<StringType>> {
@@ -659,6 +662,7 @@ struct IsAlphaUnicode
     return IsAlphaCharacterUnicode(codepoint);
   }
 };
+#endif
 
 template <typename StringType>
 struct IsAlphaAscii : CharacterPredicateAscii<StringType, IsAlphaAscii<StringType>> {
@@ -667,6 +671,7 @@ struct IsAlphaAscii : CharacterPredicateAscii<StringType, IsAlphaAscii<StringTyp
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsDecimalUnicode
     : CharacterPredicateUnicode<StringType, IsDecimalUnicode<StringType>> {
@@ -674,6 +679,7 @@ struct IsDecimalUnicode
     return IsDecimalCharacterUnicode(codepoint);
   }
 };
+#endif
 
 template <typename StringType>
 struct IsDecimalAscii : CharacterPredicateAscii<StringType, IsDecimalAscii<StringType>> {
@@ -682,6 +688,7 @@ struct IsDecimalAscii : CharacterPredicateAscii<StringType, IsDecimalAscii<Strin
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsDigitUnicode
     : CharacterPredicateUnicode<StringType, IsDigitUnicode<StringType>> {
@@ -689,6 +696,7 @@ struct IsDigitUnicode
     return IsDigitCharacterUnicode(codepoint);
   }
 };
+#endif
 
 template <typename StringType>
 struct IsNumericUnicode
@@ -708,6 +716,7 @@ struct IsAscii : BinaryToBoolean<StringType, IsAscii<StringType>> {
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsLowerUnicode
     : CharacterPredicateUnicode<StringType, IsLowerUnicode<StringType>> {
@@ -719,6 +728,7 @@ struct IsLowerUnicode
     return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
   }
 };
+#endif
 
 template <typename StringType>
 struct IsLowerAscii : CharacterPredicateAscii<StringType, IsLowerAscii<StringType>> {
@@ -732,6 +742,7 @@ struct IsLowerAscii : CharacterPredicateAscii<StringType, IsLowerAscii<StringTyp
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsPrintableUnicode
     : CharacterPredicateUnicode<StringType, IsPrintableUnicode<StringType>,
@@ -740,6 +751,7 @@ struct IsPrintableUnicode
     return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
   }
 };
+#endif
 
 template <typename StringType>
 struct IsPrintableAscii
@@ -750,6 +762,7 @@ struct IsPrintableAscii
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsSpaceUnicode
     : CharacterPredicateUnicode<StringType, IsSpaceUnicode<StringType>> {
@@ -757,6 +770,7 @@ struct IsSpaceUnicode
     return IsSpaceCharacterUnicode(codepoint);
   }
 };
+#endif
 
 template <typename StringType>
 struct IsSpaceAscii : CharacterPredicateAscii<StringType, IsSpaceAscii<StringType>> {
@@ -765,6 +779,7 @@ struct IsSpaceAscii : CharacterPredicateAscii<StringType, IsSpaceAscii<StringTyp
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsTitleUnicode : BinaryToBoolean<StringType, IsTitleUnicode<StringType>> {
   using offset_type = typename StringType::offset_type;
@@ -802,6 +817,7 @@ struct IsTitleUnicode : BinaryToBoolean<StringType, IsTitleUnicode<StringType>>
     return rules_1_and_2 & rule_3;
   }
 };
+#endif
 
 template <typename StringType>
 struct IsTitleAscii : BinaryToBoolean<StringType, IsTitleAscii<StringType>> {
@@ -843,6 +859,7 @@ struct IsTitleAscii : BinaryToBoolean<StringType, IsTitleAscii<StringType>> {
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
 template <typename StringType>
 struct IsUpperUnicode
     : CharacterPredicateUnicode<StringType, IsUpperUnicode<StringType>> {
@@ -854,6 +871,7 @@ struct IsUpperUnicode
     return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
   }
 };
+#endif
 
 template <typename StringType>
 struct IsUpperAscii : CharacterPredicateAscii<StringType, IsUpperAscii<StringType>> {

From 876e1b892bcd6caef0a3521711ae1803380fce39 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 12:46:43 +0200
Subject: [PATCH 13/28] misplaced ifdefs

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 27f88c787ab..d2ec7c99980 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -696,7 +696,6 @@ struct IsDigitUnicode
     return IsDigitCharacterUnicode(codepoint);
   }
 };
-#endif
 
 template <typename StringType>
 struct IsNumericUnicode
@@ -705,6 +704,7 @@ struct IsNumericUnicode
     return IsNumericCharacterUnicode(codepoint);
   }
 };
+#endif
 
 template <typename StringType>
 struct IsAscii : BinaryToBoolean<StringType, IsAscii<StringType>> {
@@ -953,6 +953,8 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+#endif
+
 template <template <typename> class Transformer>
 void AddUnaryString(std::string name, FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
@@ -963,8 +965,6 @@ void AddUnaryString(std::string name, FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
-#endif
-
 }  // namespace
 
 void RegisterScalarStringAscii(FunctionRegistry* registry) {

From 52270230c7262c5c8b95fc2a87503fead93968fa Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 8 Jul 2020 13:07:44 +0200
Subject: [PATCH 14/28] use lookup table and use isalnum for benchmark

---
 cpp/src/arrow/compute/kernels/scalar_string.cc       | 10 ++++++++--
 .../arrow/compute/kernels/scalar_string_benchmark.cc | 12 ++++++------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index d2ec7c99980..9b30cf0bf79 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -71,6 +71,7 @@ constexpr uint32_t kMaxCodepointLookup =
     0xffff;  // up to this codepoint is in a lookup table
 std::vector<uint32_t> lut_upper_codepoint;
 std::vector<uint32_t> lut_lower_codepoint;
+std::vector<utf8proc_category_t> lut_category;
 std::once_flag flag_case_luts;
 
 void EnsureLookupTablesFilled() {
@@ -80,6 +81,7 @@ void EnsureLookupTablesFilled() {
     for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) {
       lut_upper_codepoint.push_back(utf8proc_toupper(i));
       lut_lower_codepoint.push_back(utf8proc_tolower(i));
+      lut_category.push_back(utf8proc_category(i));
     }
   });
 }
@@ -464,10 +466,14 @@ void AddBinaryContainsExact(FunctionRegistry* registry) {
 #ifdef ARROW_WITH_UTF8PROC
 
 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
-  uint32_t general_category = 1 << utf8proc_category(codepoint);
+  utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup
+                                             ? lut_category[codepoint]
+                                             : utf8proc_category(codepoint);
+  uint32_t general_category_bit = 1 << utf8proc_category(codepoint);
   // for e.g. undefined (but valid) codepoints, general_category == 0 ==
   // UTF8PROC_CATEGORY_CN
-  return (general_category != UTF8PROC_CATEGORY_CN) && ((general_category & mask) != 0);
+  return (general_category != UTF8PROC_CATEGORY_CN) &&
+         ((general_category_bit & mask) != 0);
 }
 
 template <typename... Categories>
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
index 67fade71532..01a32c71f34 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
@@ -57,8 +57,8 @@ static void AsciiUpper(benchmark::State& state) {
   UnaryStringBenchmark(state, "ascii_upper");
 }
 
-static void IsAlphaAscii(benchmark::State& state) {
-  UnaryStringBenchmark(state, "ascii_isalpha");
+static void IsAlphaNumericAscii(benchmark::State& state) {
+  UnaryStringBenchmark(state, "ascii_isalnum");
 }
 
 static void BinaryContainsExact(benchmark::State& state) {
@@ -75,19 +75,19 @@ static void Utf8Lower(benchmark::State& state) {
   UnaryStringBenchmark(state, "utf8_lower");
 }
 
-static void IsAlphaUnicode(benchmark::State& state) {
-  UnaryStringBenchmark(state, "utf8_isalpha");
+static void IsAlphaNumericUnicode(benchmark::State& state) {
+  UnaryStringBenchmark(state, "utf8_isalnum");
 }
 #endif
 
 BENCHMARK(AsciiLower);
 BENCHMARK(AsciiUpper);
-BENCHMARK(IsAlphaAscii);
+BENCHMARK(IsAlphaNumericAscii);
 BENCHMARK(BinaryContainsExact);
 #ifdef ARROW_WITH_UTF8PROC
 BENCHMARK(Utf8Lower);
 BENCHMARK(Utf8Upper);
-BENCHMARK(IsAlphaUnicode);
+BENCHMARK(IsAlphaNumericUnicode);
 #endif
 
 }  // namespace compute

From 65d674df77edd6687cc57344facd03d72a8bdd4d Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Fri, 10 Jul 2020 12:06:22 +0200
Subject: [PATCH 15/28] use utf8 sequence in string (maybe the compiler does
 not like the literal utf8 char if it does not recognize it?)

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 6bd245de76d..9447e86c7c9 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -240,8 +240,8 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
   // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A -  new in unicode 13
   // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
   this->CheckUnary("utf8_isupper",
-                   "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"Ꞻ\", \"ꞻ\"]", boolean(),
-                   "[true, true, true, false, true, false]");
+                   "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"\xEA\x9E\xBB\"]",
+                   boolean(), "[true, true, true, false, true, false]");
 }
 
 #endif  // ARROW_WITH_UTF8PROC

From cd426d4f43f1f182065011ff37f46989f5ca87b4 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Fri, 10 Jul 2020 13:16:21 +0200
Subject: [PATCH 16/28] Fix: was not using lut

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 9b30cf0bf79..9f03740d8e8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -469,7 +469,7 @@ static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mas
   utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup
                                              ? lut_category[codepoint]
                                              : utf8proc_category(codepoint);
-  uint32_t general_category_bit = 1 << utf8proc_category(codepoint);
+  uint32_t general_category_bit = 1 << general_category;
   // for e.g. undefined (but valid) codepoints, general_category == 0 ==
   // UTF8PROC_CATEGORY_CN
   return (general_category != UTF8PROC_CATEGORY_CN) &&

From b435b94f79448e8890c017c8d300116a72617e44 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Fri, 10 Jul 2020 13:18:01 +0200
Subject: [PATCH 17/28] performance increase: ~15-20%

---
 .../arrow/compute/kernels/scalar_string.cc    | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 9f03740d8e8..d14f157c87a 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -517,6 +517,13 @@ static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) {
          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
 }
 
+static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) {
+  return HasAnyUnicodeGeneralCategory(
+      codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
+      UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND,
+      UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
+}
+
 static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
                                       UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
@@ -578,6 +585,12 @@ static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
   return IsCasedCharacterAscii(ascii_character);  // same
 }
 
+static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) {
+  return ((ascii_character >= '0') && (ascii_character <= '9')) ||
+         ((ascii_character >= 'a') && (ascii_character <= 'z')) ||
+         ((ascii_character >= 'A') && (ascii_character <= 'Z'));
+}
+
 static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= '0') && (ascii_character <= '9'));
 }
@@ -629,6 +642,8 @@ struct CharacterPredicateAscii
       return true;
     }
     bool any = false;
+    // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
+    // benchmark. I don't consider that worth it.
     bool all = std::all_of(input, input + input_string_ncodeunits,
                            [&any](uint8_t ascii_character) {
                              any |= Derived::PredicateCharacterAny(ascii_character);
@@ -646,8 +661,7 @@ template <typename StringType>
 struct IsAlphaNumericUnicode
     : CharacterPredicateUnicode<StringType, IsAlphaNumericUnicode<StringType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
-    return IsAlphaCharacterUnicode(codepoint) || IsDecimalCharacterUnicode(codepoint) ||
-           IsNumericCharacterUnicode(codepoint) || IsDigitCharacterUnicode(codepoint);
+    return IsAlphaNumericCharacterUnicode(codepoint);
   }
 };
 #endif
@@ -655,8 +669,8 @@ struct IsAlphaNumericUnicode
 template <typename StringType>
 struct IsAlphaNumericAscii
     : CharacterPredicateAscii<StringType, IsAlphaNumericAscii<StringType>> {
-  static inline bool PredicateCharacterAll(uint32_t codepoint) {
-    return IsAlphaCharacterAscii(codepoint) || IsDecimalCharacterAscii(codepoint);
+  static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+    return IsAlphaNumericCharacterAscii(ascii_character);
   }
 };
 

From cb9365c757e4be0d12368dbe144a8280b2d4ec23 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Fri, 10 Jul 2020 15:02:50 +0200
Subject: [PATCH 18/28] use different high codepoint that does not require
 unicode 13

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 9447e86c7c9..f8b4ea2f3d2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -237,10 +237,12 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
   // case
   // * U+1F88 - ᾈ - \E1\xBE\x88 - Greek Capital Letter Alpha with Psili and Prosgegrammeni
   // - title case
+  // U+10400 - 𐐀 - \xF0x90x90x80 - Deseret Capital Letter Long - upper case
   // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A -  new in unicode 13
+  // (not tested since it depends on the version of libutf8proc)
   // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
   this->CheckUnary("utf8_isupper",
-                   "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"\xEA\x9E\xBB\"]",
+                   "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"xF0x90x90x80\"]",
                    boolean(), "[true, true, true, false, true, false]");
 }
 

From f7a4931d30970ae5b1b5348b8e815e5840180e18 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm@apache.org>
Date: Fri, 10 Jul 2020 16:24:22 -0500
Subject: [PATCH 19/28] Use 'ArrowType' for template parameter instead of
 clashing 'StringType'

---
 .../arrow/compute/kernels/scalar_string.cc    | 112 +++++++++---------
 1 file changed, 53 insertions(+), 59 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index d14f157c87a..9e97bb3fa84 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -190,10 +190,10 @@ struct UTF8Transform {
   }
 };
 
-template <typename StringType, typename Derived>
+template <typename ArrowType, typename Derived>
 struct BinaryToBoolean {
-  using offset_type = typename StringType::offset_type;
-  using ArrayType = typename TypeTraits<StringType>::ArrayType;
+  using offset_type = typename ArrowType::offset_type;
+  using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
 
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     if (batch[0].kind() == Datum::ARRAY) {
@@ -604,11 +604,11 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= ' ') && (ascii_character <= '~'));
 }
 
-template <typename StringType, typename Derived, bool allow_empty = false>
+template <typename ArrowType, typename Derived, bool allow_empty = false>
 struct CharacterPredicateUnicode
-    : BinaryToBoolean<StringType,
-                      CharacterPredicateUnicode<StringType, Derived, allow_empty>> {
-  using offset_type = typename StringType::offset_type;
+    : BinaryToBoolean<ArrowType,
+                      CharacterPredicateUnicode<ArrowType, Derived, allow_empty>> {
+  using offset_type = typename ArrowType::offset_type;
   static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
                                offset_type input_string_ncodeunits) {
     if (allow_empty && input_string_ncodeunits == 0) {
@@ -631,11 +631,11 @@ struct CharacterPredicateUnicode
   }
 };
 
-template <typename StringType, typename Derived, bool allow_empty = false>
+template <typename ArrowType, typename Derived, bool allow_empty = false>
 struct CharacterPredicateAscii
-    : BinaryToBoolean<StringType,
-                      CharacterPredicateAscii<StringType, Derived, allow_empty>> {
-  using offset_type = typename StringType::offset_type;
+    : BinaryToBoolean<ArrowType,
+                      CharacterPredicateAscii<ArrowType, Derived, allow_empty>> {
+  using offset_type = typename ArrowType::offset_type;
   static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
                                offset_type input_string_ncodeunits) {
     if (allow_empty && input_string_ncodeunits == 0) {
@@ -657,78 +657,76 @@ struct CharacterPredicateAscii
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
+template <typename ArrowType>
 struct IsAlphaNumericUnicode
-    : CharacterPredicateUnicode<StringType, IsAlphaNumericUnicode<StringType>> {
+    : CharacterPredicateUnicode<ArrowType, IsAlphaNumericUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaNumericCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename StringType>
+template <typename ArrowType>
 struct IsAlphaNumericAscii
-    : CharacterPredicateAscii<StringType, IsAlphaNumericAscii<StringType>> {
+    : CharacterPredicateAscii<ArrowType, IsAlphaNumericAscii<ArrowType>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaNumericCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
-struct IsAlphaUnicode
-    : CharacterPredicateUnicode<StringType, IsAlphaUnicode<StringType>> {
+template <typename ArrowType>
+struct IsAlphaUnicode : CharacterPredicateUnicode<ArrowType, IsAlphaUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename StringType>
-struct IsAlphaAscii : CharacterPredicateAscii<StringType, IsAlphaAscii<StringType>> {
+template <typename ArrowType>
+struct IsAlphaAscii : CharacterPredicateAscii<ArrowType, IsAlphaAscii<ArrowType>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
+template <typename ArrowType>
 struct IsDecimalUnicode
-    : CharacterPredicateUnicode<StringType, IsDecimalUnicode<StringType>> {
+    : CharacterPredicateUnicode<ArrowType, IsDecimalUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDecimalCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename StringType>
-struct IsDecimalAscii : CharacterPredicateAscii<StringType, IsDecimalAscii<StringType>> {
+template <typename ArrowType>
+struct IsDecimalAscii : CharacterPredicateAscii<ArrowType, IsDecimalAscii<ArrowType>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsDecimalCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
-struct IsDigitUnicode
-    : CharacterPredicateUnicode<StringType, IsDigitUnicode<StringType>> {
+template <typename ArrowType>
+struct IsDigitUnicode : CharacterPredicateUnicode<ArrowType, IsDigitUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDigitCharacterUnicode(codepoint);
   }
 };
 
-template <typename StringType>
+template <typename ArrowType>
 struct IsNumericUnicode
-    : CharacterPredicateUnicode<StringType, IsNumericUnicode<StringType>> {
+    : CharacterPredicateUnicode<ArrowType, IsNumericUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsNumericCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename StringType>
-struct IsAscii : BinaryToBoolean<StringType, IsAscii<StringType>> {
-  using offset_type = typename StringType::offset_type;
+template <typename ArrowType>
+struct IsAscii : BinaryToBoolean<ArrowType, IsAscii<ArrowType>> {
+  using offset_type = typename ArrowType::offset_type;
   static bool Predicate(KernelContext* ctx, const uint8_t* input,
                         offset_type input_string_nascii_characters) {
     return std::all_of(input, input + input_string_nascii_characters,
@@ -737,9 +735,8 @@ struct IsAscii : BinaryToBoolean<StringType, IsAscii<StringType>> {
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
-struct IsLowerUnicode
-    : CharacterPredicateUnicode<StringType, IsLowerUnicode<StringType>> {
+template <typename ArrowType>
+struct IsLowerUnicode : CharacterPredicateUnicode<ArrowType, IsLowerUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
@@ -750,8 +747,8 @@ struct IsLowerUnicode
 };
 #endif
 
-template <typename StringType>
-struct IsLowerAscii : CharacterPredicateAscii<StringType, IsLowerAscii<StringType>> {
+template <typename ArrowType>
+struct IsLowerAscii : CharacterPredicateAscii<ArrowType, IsLowerAscii<ArrowType>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||
@@ -763,9 +760,9 @@ struct IsLowerAscii : CharacterPredicateAscii<StringType, IsLowerAscii<StringTyp
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
+template <typename ArrowType>
 struct IsPrintableUnicode
-    : CharacterPredicateUnicode<StringType, IsPrintableUnicode<StringType>,
+    : CharacterPredicateUnicode<ArrowType, IsPrintableUnicode<ArrowType>,
                                 /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
@@ -773,36 +770,34 @@ struct IsPrintableUnicode
 };
 #endif
 
-template <typename StringType>
-struct IsPrintableAscii
-    : CharacterPredicateAscii<StringType, IsPrintableAscii<StringType>,
-                              /*allow_empty=*/true> {
+template <typename ArrowType>
+struct IsPrintableAscii : CharacterPredicateAscii<ArrowType, IsPrintableAscii<ArrowType>,
+                                                  /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsPrintableCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
-struct IsSpaceUnicode
-    : CharacterPredicateUnicode<StringType, IsSpaceUnicode<StringType>> {
+template <typename ArrowType>
+struct IsSpaceUnicode : CharacterPredicateUnicode<ArrowType, IsSpaceUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsSpaceCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename StringType>
-struct IsSpaceAscii : CharacterPredicateAscii<StringType, IsSpaceAscii<StringType>> {
+template <typename ArrowType>
+struct IsSpaceAscii : CharacterPredicateAscii<ArrowType, IsSpaceAscii<ArrowType>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsSpaceCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
-struct IsTitleUnicode : BinaryToBoolean<StringType, IsTitleUnicode<StringType>> {
-  using offset_type = typename StringType::offset_type;
+template <typename ArrowType>
+struct IsTitleUnicode : BinaryToBoolean<ArrowType, IsTitleUnicode<ArrowType>> {
+  using offset_type = typename ArrowType::offset_type;
   static bool Predicate(KernelContext* ctx, const uint8_t* input,
                         offset_type input_string_ncodeunits) {
     // rules:
@@ -839,9 +834,9 @@ struct IsTitleUnicode : BinaryToBoolean<StringType, IsTitleUnicode<StringType>>
 };
 #endif
 
-template <typename StringType>
-struct IsTitleAscii : BinaryToBoolean<StringType, IsTitleAscii<StringType>> {
-  using offset_type = typename StringType::offset_type;
+template <typename ArrowType>
+struct IsTitleAscii : BinaryToBoolean<ArrowType, IsTitleAscii<ArrowType>> {
+  using offset_type = typename ArrowType::offset_type;
   static bool Predicate(KernelContext* ctx, const uint8_t* input,
                         offset_type input_string_ncodeunits) {
     // rules:
@@ -880,9 +875,8 @@ struct IsTitleAscii : BinaryToBoolean<StringType, IsTitleAscii<StringType>> {
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename StringType>
-struct IsUpperUnicode
-    : CharacterPredicateUnicode<StringType, IsUpperUnicode<StringType>> {
+template <typename ArrowType>
+struct IsUpperUnicode : CharacterPredicateUnicode<ArrowType, IsUpperUnicode<ArrowType>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
@@ -893,8 +887,8 @@ struct IsUpperUnicode
 };
 #endif
 
-template <typename StringType>
-struct IsUpperAscii : CharacterPredicateAscii<StringType, IsUpperAscii<StringType>> {
+template <typename ArrowType>
+struct IsUpperAscii : CharacterPredicateAscii<ArrowType, IsUpperAscii<ArrowType>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||

From 1e8c5c529b6bcf75ca0eace4c77fb80d0b2ce7d0 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Sat, 11 Jul 2020 08:14:39 +0200
Subject: [PATCH 20/28] compiler bug workaround

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 9e97bb3fa84..e0077e0cc17 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -726,7 +726,8 @@ struct IsNumericUnicode
 
 template <typename ArrowType>
 struct IsAscii : BinaryToBoolean<ArrowType, IsAscii<ArrowType>> {
-  using offset_type = typename ArrowType::offset_type;
+  using offset_type =
+      typename BinaryToBoolean<StringType, IsAscii<StringType>>::offset_type;
   static bool Predicate(KernelContext* ctx, const uint8_t* input,
                         offset_type input_string_nascii_characters) {
     return std::all_of(input, input + input_string_nascii_characters,
@@ -797,7 +798,8 @@ struct IsSpaceAscii : CharacterPredicateAscii<ArrowType, IsSpaceAscii<ArrowType>
 #ifdef ARROW_WITH_UTF8PROC
 template <typename ArrowType>
 struct IsTitleUnicode : BinaryToBoolean<ArrowType, IsTitleUnicode<ArrowType>> {
-  using offset_type = typename ArrowType::offset_type;
+  using offset_type =
+      typename BinaryToBoolean<ArrowType, IsTitleUnicode<StringType>>::offset_type;
   static bool Predicate(KernelContext* ctx, const uint8_t* input,
                         offset_type input_string_ncodeunits) {
     // rules:

From 68868bd8faf38c38214c44059646ccf341b8e28a Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Sat, 11 Jul 2020 08:34:28 +0200
Subject: [PATCH 21/28] wrong template argument

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index e0077e0cc17..aff9e9a1dfc 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -727,7 +727,7 @@ struct IsNumericUnicode
 template <typename ArrowType>
 struct IsAscii : BinaryToBoolean<ArrowType, IsAscii<ArrowType>> {
   using offset_type =
-      typename BinaryToBoolean<StringType, IsAscii<StringType>>::offset_type;
+      typename BinaryToBoolean<ArrowType, IsAscii<ArrowType>>::offset_type;
   static bool Predicate(KernelContext* ctx, const uint8_t* input,
                         offset_type input_string_nascii_characters) {
     return std::all_of(input, input + input_string_nascii_characters,
@@ -799,7 +799,7 @@ struct IsSpaceAscii : CharacterPredicateAscii<ArrowType, IsSpaceAscii<ArrowType>
 template <typename ArrowType>
 struct IsTitleUnicode : BinaryToBoolean<ArrowType, IsTitleUnicode<ArrowType>> {
   using offset_type =
-      typename BinaryToBoolean<ArrowType, IsTitleUnicode<StringType>>::offset_type;
+      typename BinaryToBoolean<ArrowType, IsTitleUnicode<ArrowType>>::offset_type;
   static bool Predicate(KernelContext* ctx, const uint8_t* input,
                         offset_type input_string_ncodeunits) {
     // rules:

From 3739e6681e7b20745a33210efe2bb86b11ab2d9c Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Sat, 11 Jul 2020 10:01:18 +0200
Subject: [PATCH 22/28] change crtp inheritance

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index aff9e9a1dfc..ca76bb4c3cb 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -605,9 +605,7 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
 }
 
 template <typename ArrowType, typename Derived, bool allow_empty = false>
-struct CharacterPredicateUnicode
-    : BinaryToBoolean<ArrowType,
-                      CharacterPredicateUnicode<ArrowType, Derived, allow_empty>> {
+struct CharacterPredicateUnicode : BinaryToBoolean<ArrowType, Derived> {
   using offset_type = typename ArrowType::offset_type;
   static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
                                offset_type input_string_ncodeunits) {
@@ -632,9 +630,7 @@ struct CharacterPredicateUnicode
 };
 
 template <typename ArrowType, typename Derived, bool allow_empty = false>
-struct CharacterPredicateAscii
-    : BinaryToBoolean<ArrowType,
-                      CharacterPredicateAscii<ArrowType, Derived, allow_empty>> {
+struct CharacterPredicateAscii : BinaryToBoolean<ArrowType, Derived> {
   using offset_type = typename ArrowType::offset_type;
   static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
                                offset_type input_string_ncodeunits) {

From e9628a63a63a9e8dba1cb5f7420303b955f11d1d Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm@apache.org>
Date: Sat, 11 Jul 2020 19:47:15 -0500
Subject: [PATCH 23/28] CRTP -> inline lambdas

---
 .../arrow/compute/kernels/scalar_string.cc    | 389 +++++++++---------
 docker-compose.yml                            |   2 +
 2 files changed, 204 insertions(+), 187 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index ca76bb4c3cb..7c6f6d422b3 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -190,52 +190,50 @@ struct UTF8Transform {
   }
 };
 
-template <typename ArrowType, typename Derived>
-struct BinaryToBoolean {
-  using offset_type = typename ArrowType::offset_type;
-  using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
-
-  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    if (batch[0].kind() == Datum::ARRAY) {
-      EnsureLookupTablesFilled();
-      const ArrayData& input = *batch[0].array();
-      ArrayType input_boxed(batch[0].array());
-      ArrayData* out_arr = out->mutable_array();
+template <typename Type, typename Predicate>
+void BinaryToBoolean(KernelContext* ctx, const ExecBatch& batch, Predicate&& predicate,
+                     Datum* out) {
+  using offset_type = typename Type::offset_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
 
-      // offset_type input_ncodeunits = input_boxed.total_values_length();
-      offset_type input_nstrings = static_cast<offset_type>(input.length);
+  if (batch[0].kind() == Datum::ARRAY) {
+    EnsureLookupTablesFilled();
+    const ArrayData& input = *batch[0].array();
+    ArrayType input_boxed(batch[0].array());
+    ArrayData* out_arr = out->mutable_array();
 
-      FirstTimeBitmapWriter bitmap_writer(out_arr->buffers[1]->mutable_data(),
-                                          out_arr->offset, input.length);
-      for (int64_t i = 0; i < input_nstrings; i++) {
-        offset_type input_string_ncodeunits;
-        const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
-        bool boolean_result =
-            Derived::Predicate(ctx, input_string, input_string_ncodeunits);
-        if (!ctx->status().ok()) {
-          // UTF decoding can lead to issues
-          return;
-        }
-        if (boolean_result) {
-          bitmap_writer.Set();
-        }
-        bitmap_writer.Next();
+    // offset_type input_ncodeunits = input_boxed.total_values_length();
+    offset_type input_nstrings = static_cast<offset_type>(input.length);
+
+    FirstTimeBitmapWriter bitmap_writer(out_arr->buffers[1]->mutable_data(),
+                                        out_arr->offset, input.length);
+    for (int64_t i = 0; i < input_nstrings; i++) {
+      offset_type input_string_ncodeunits;
+      const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
+      bool boolean_result = predicate(ctx, input_string, input_string_ncodeunits);
+      if (!ctx->status().ok()) {
+        // UTF decoding can lead to issues
+        return;
       }
-      bitmap_writer.Finish();
-    } else {
-      const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
-      if (input.is_valid) {
-        offset_type data_nbytes = static_cast<offset_type>(input.value->size());
-        bool boolean_result = Derived::Predicate(ctx, input.value->data(), data_nbytes);
-        if (!ctx->status().ok()) {
-          // UTF decoding can lead to issues
-          return;
-        }
-        out->value = std::make_shared<BooleanScalar>(boolean_result);
+      if (boolean_result) {
+        bitmap_writer.Set();
       }
+      bitmap_writer.Next();
+    }
+    bitmap_writer.Finish();
+  } else {
+    const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+    if (input.is_valid) {
+      offset_type data_nbytes = static_cast<offset_type>(input.value->size());
+      bool boolean_result = predicate(ctx, input.value->data(), data_nbytes);
+      if (!ctx->status().ok()) {
+        // UTF decoding can lead to issues
+        return;
+      }
+      out->value = std::make_shared<BooleanScalar>(boolean_result);
     }
   }
-};
+}
 
 template <typename Type>
 struct UTF8Upper : UTF8Transform<Type, UTF8Upper<Type>> {
@@ -604,136 +602,146 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= ' ') && (ascii_character <= '~'));
 }
 
-template <typename ArrowType, typename Derived, bool allow_empty = false>
-struct CharacterPredicateUnicode : BinaryToBoolean<ArrowType, Derived> {
-  using offset_type = typename ArrowType::offset_type;
-  static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
-                               offset_type input_string_ncodeunits) {
-    if (allow_empty && input_string_ncodeunits == 0) {
-      return true;
-    }
-    bool all;
-    bool any = false;
-    if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
-            input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
-              any |= Derived::PredicateCharacterAny(codepoint);
-              return Derived::PredicateCharacterAll(codepoint);
-            }))) {
-      ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
-      return false;
-    }
-    return all & any;
+template <typename Type, typename Derived, bool allow_empty = false>
+struct CharacterPredicateUnicode {
+  using offset_type = typename Type::offset_type;
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    auto predicate = [](KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_ncodeunits) -> bool {
+      if (allow_empty && input_string_ncodeunits == 0) {
+        return true;
+      }
+      bool all;
+      bool any = false;
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
+              input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
+                any |= Derived::PredicateCharacterAny(codepoint);
+                return Derived::PredicateCharacterAll(codepoint);
+              }))) {
+        ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+        return false;
+      }
+      return all & any;
+    };
+    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
   }
+
   static inline bool PredicateCharacterAny(uint32_t) {
     return true;  // default condition make sure there is at least 1 charachter
   }
 };
 
-template <typename ArrowType, typename Derived, bool allow_empty = false>
-struct CharacterPredicateAscii : BinaryToBoolean<ArrowType, Derived> {
-  using offset_type = typename ArrowType::offset_type;
-  static inline bool Predicate(KernelContext* ctx, const uint8_t* input,
-                               offset_type input_string_ncodeunits) {
-    if (allow_empty && input_string_ncodeunits == 0) {
-      return true;
-    }
-    bool any = false;
-    // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
-    // benchmark. I don't consider that worth it.
-    bool all = std::all_of(input, input + input_string_ncodeunits,
-                           [&any](uint8_t ascii_character) {
-                             any |= Derived::PredicateCharacterAny(ascii_character);
-                             return Derived::PredicateCharacterAll(ascii_character);
-                           });
-    return all & any;
+template <typename Type, typename Derived, bool allow_empty = false>
+struct CharacterPredicateAscii {
+  using offset_type = typename Type::offset_type;
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    auto predicate = [](KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_ncodeunits) -> bool {
+      if (allow_empty && input_string_ncodeunits == 0) {
+        return true;
+      }
+      bool any = false;
+      // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
+      // benchmark. I don't consider that worth it.
+      bool all = std::all_of(input, input + input_string_ncodeunits,
+                             [&any](uint8_t ascii_character) {
+                               any |= Derived::PredicateCharacterAny(ascii_character);
+                               return Derived::PredicateCharacterAll(ascii_character);
+                             });
+      return all & any;
+    };
+    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
   }
+
   static inline bool PredicateCharacterAny(uint8_t) {
     return true;  // default condition make sure there is at least 1 charachter
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
+template <typename Type>
 struct IsAlphaNumericUnicode
-    : CharacterPredicateUnicode<ArrowType, IsAlphaNumericUnicode<ArrowType>> {
+    : CharacterPredicateUnicode<Type, IsAlphaNumericUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaNumericCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename ArrowType>
+template <typename Type>
 struct IsAlphaNumericAscii
-    : CharacterPredicateAscii<ArrowType, IsAlphaNumericAscii<ArrowType>> {
+    : CharacterPredicateAscii<Type, IsAlphaNumericAscii<Type>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaNumericCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
-struct IsAlphaUnicode : CharacterPredicateUnicode<ArrowType, IsAlphaUnicode<ArrowType>> {
+template <typename Type>
+struct IsAlphaUnicode : CharacterPredicateUnicode<Type, IsAlphaUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename ArrowType>
-struct IsAlphaAscii : CharacterPredicateAscii<ArrowType, IsAlphaAscii<ArrowType>> {
+template <typename Type>
+struct IsAlphaAscii : CharacterPredicateAscii<Type, IsAlphaAscii<Type>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
+template <typename Type>
 struct IsDecimalUnicode
-    : CharacterPredicateUnicode<ArrowType, IsDecimalUnicode<ArrowType>> {
+    : CharacterPredicateUnicode<Type, IsDecimalUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDecimalCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename ArrowType>
-struct IsDecimalAscii : CharacterPredicateAscii<ArrowType, IsDecimalAscii<ArrowType>> {
+template <typename Type>
+struct IsDecimalAscii : CharacterPredicateAscii<Type, IsDecimalAscii<Type>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsDecimalCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
-struct IsDigitUnicode : CharacterPredicateUnicode<ArrowType, IsDigitUnicode<ArrowType>> {
+template <typename Type>
+struct IsDigitUnicode : CharacterPredicateUnicode<Type, IsDigitUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDigitCharacterUnicode(codepoint);
   }
 };
 
-template <typename ArrowType>
+template <typename Type>
 struct IsNumericUnicode
-    : CharacterPredicateUnicode<ArrowType, IsNumericUnicode<ArrowType>> {
+    : CharacterPredicateUnicode<Type, IsNumericUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsNumericCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename ArrowType>
-struct IsAscii : BinaryToBoolean<ArrowType, IsAscii<ArrowType>> {
-  using offset_type =
-      typename BinaryToBoolean<ArrowType, IsAscii<ArrowType>>::offset_type;
-  static bool Predicate(KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_nascii_characters) {
-    return std::all_of(input, input + input_string_nascii_characters,
-                       IsAsciiCharacter<uint8_t>);
+template <typename Type>
+struct IsAscii {
+  using offset_type = typename Type::offset_type;
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    auto predicate = [](KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_nascii_characters) -> bool {
+      return std::all_of(input, input + input_string_nascii_characters,
+                         IsAsciiCharacter<uint8_t>);
+    };
+    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
-struct IsLowerUnicode : CharacterPredicateUnicode<ArrowType, IsLowerUnicode<ArrowType>> {
+template <typename Type>
+struct IsLowerUnicode : CharacterPredicateUnicode<Type, IsLowerUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
@@ -744,8 +752,8 @@ struct IsLowerUnicode : CharacterPredicateUnicode<ArrowType, IsLowerUnicode<Arro
 };
 #endif
 
-template <typename ArrowType>
-struct IsLowerAscii : CharacterPredicateAscii<ArrowType, IsLowerAscii<ArrowType>> {
+template <typename Type>
+struct IsLowerAscii : CharacterPredicateAscii<Type, IsLowerAscii<Type>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||
@@ -757,9 +765,9 @@ struct IsLowerAscii : CharacterPredicateAscii<ArrowType, IsLowerAscii<ArrowType>
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
+template <typename Type>
 struct IsPrintableUnicode
-    : CharacterPredicateUnicode<ArrowType, IsPrintableUnicode<ArrowType>,
+    : CharacterPredicateUnicode<Type, IsPrintableUnicode<Type>,
                                 /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
@@ -767,8 +775,8 @@ struct IsPrintableUnicode
 };
 #endif
 
-template <typename ArrowType>
-struct IsPrintableAscii : CharacterPredicateAscii<ArrowType, IsPrintableAscii<ArrowType>,
+template <typename Type>
+struct IsPrintableAscii : CharacterPredicateAscii<Type, IsPrintableAscii<Type>,
                                                   /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsPrintableCharacterAscii(ascii_character);
@@ -776,105 +784,112 @@ struct IsPrintableAscii : CharacterPredicateAscii<ArrowType, IsPrintableAscii<Ar
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
-struct IsSpaceUnicode : CharacterPredicateUnicode<ArrowType, IsSpaceUnicode<ArrowType>> {
+template <typename Type>
+struct IsSpaceUnicode : CharacterPredicateUnicode<Type, IsSpaceUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsSpaceCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename ArrowType>
-struct IsSpaceAscii : CharacterPredicateAscii<ArrowType, IsSpaceAscii<ArrowType>> {
+template <typename Type>
+struct IsSpaceAscii : CharacterPredicateAscii<Type, IsSpaceAscii<Type>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsSpaceCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
-struct IsTitleUnicode : BinaryToBoolean<ArrowType, IsTitleUnicode<ArrowType>> {
-  using offset_type =
-      typename BinaryToBoolean<ArrowType, IsTitleUnicode<ArrowType>>::offset_type;
-  static bool Predicate(KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_ncodeunits) {
-    // rules:
-    // * 1: lower case follows cased
-    // * 2: upper case follows uncased
-    // * 3: at least 1 cased character (which logically should be upper/title)
-    bool rules_1_and_2;
-    bool previous_cased = false;  // in LL, LU or LT
-    bool rule_3 = false;
-    bool status =
-        arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
-                               [&previous_cased, &rule_3](uint32_t codepoint) {
-                                 if (IsLowerCaseCharacterUnicode(codepoint)) {
-                                   if (!previous_cased) return false;  // rule 1 broken
-                                   previous_cased = true;
-                                 } else if (IsCasedCharacterUnicode(codepoint)) {
-                                   if (previous_cased) return false;  // rule 2 broken
-                                   // next should be a lower case or uncased
-                                   previous_cased = true;
-                                   rule_3 = true;  // rule 3 obeyed
-                                 } else {
-                                   // a non-cased char, like _ or 1
-                                   // next should be upper case or more uncased
-                                   previous_cased = false;
-                                 }
-                                 return true;
-                               });
-    if (!ARROW_PREDICT_TRUE(status)) {
-      ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
-      return false;
-    }
-    return rules_1_and_2 & rule_3;
+template <typename Type>
+struct IsTitleUnicode {
+  using offset_type = typename Type::offset_type;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    auto predicate = [](KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_ncodeunits) -> bool {
+      // rules:
+      // * 1: lower case follows cased
+      // * 2: upper case follows uncased
+      // * 3: at least 1 cased character (which logically should be upper/title)
+      bool rules_1_and_2;
+      bool previous_cased = false;  // in LL, LU or LT
+      bool rule_3 = false;
+      bool status =
+      arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
+                             [&previous_cased, &rule_3](uint32_t codepoint) {
+                               if (IsLowerCaseCharacterUnicode(codepoint)) {
+                                 if (!previous_cased) return false;  // rule 1 broken
+                                 previous_cased = true;
+                               } else if (IsCasedCharacterUnicode(codepoint)) {
+                                 if (previous_cased) return false;  // rule 2 broken
+                                 // next should be a lower case or uncased
+                                 previous_cased = true;
+                                 rule_3 = true;  // rule 3 obeyed
+                               } else {
+                                 // a non-cased char, like _ or 1
+                                 // next should be upper case or more uncased
+                                 previous_cased = false;
+                               }
+                               return true;
+                             });
+      if (!ARROW_PREDICT_TRUE(status)) {
+        ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+        return false;
+      }
+      return rules_1_and_2 & rule_3;
+    };
+    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
   }
 };
 #endif
 
-template <typename ArrowType>
-struct IsTitleAscii : BinaryToBoolean<ArrowType, IsTitleAscii<ArrowType>> {
-  using offset_type = typename ArrowType::offset_type;
-  static bool Predicate(KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_ncodeunits) {
-    // rules:
-    // * 1: lower case follows cased
-    // * 2: upper case follows uncased
-    // * 3: at least 1 cased character (which logically should be upper/title)
-    bool rules_1_and_2 = true;
-    bool previous_cased = false;  // in LL, LU or LT
-    bool rule_3 = false;
-    // we cannot rely on std::all_of because we need guaranteed order
-    for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
-      if (IsLowerCaseCharacterAscii(*c)) {
-        if (!previous_cased) {
-          // rule 1 broken
-          rules_1_and_2 = false;
-          break;
-        }
-        previous_cased = true;
-      } else if (IsCasedCharacterAscii(*c)) {
-        if (previous_cased) {
-          // rule 2 broken
-          rules_1_and_2 = false;
-          break;
+template <typename Type>
+struct IsTitleAscii {
+  using offset_type = typename Type::offset_type;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    auto predicate = [](KernelContext* ctx, const uint8_t* input,
+                        offset_type input_string_ncodeunits) -> bool {
+      // rules:
+      // * 1: lower case follows cased
+      // * 2: upper case follows uncased
+      // * 3: at least 1 cased character (which logically should be upper/title)
+      bool rules_1_and_2 = true;
+      bool previous_cased = false;  // in LL, LU or LT
+      bool rule_3 = false;
+      // we cannot rely on std::all_of because we need guaranteed order
+      for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
+        if (IsLowerCaseCharacterAscii(*c)) {
+          if (!previous_cased) {
+            // rule 1 broken
+            rules_1_and_2 = false;
+            break;
+          }
+          previous_cased = true;
+        } else if (IsCasedCharacterAscii(*c)) {
+          if (previous_cased) {
+            // rule 2 broken
+            rules_1_and_2 = false;
+            break;
+          }
+          // next should be a lower case or uncased
+          previous_cased = true;
+          rule_3 = true;  // rule 3 obeyed
+        } else {
+          // a non-cased char, like _ or 1
+          // next should be upper case or more uncased
+          previous_cased = false;
         }
-        // next should be a lower case or uncased
-        previous_cased = true;
-        rule_3 = true;  // rule 3 obeyed
-      } else {
-        // a non-cased char, like _ or 1
-        // next should be upper case or more uncased
-        previous_cased = false;
       }
-    }
-    return rules_1_and_2 & rule_3;
+      return rules_1_and_2 & rule_3;
+    };
+    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename ArrowType>
-struct IsUpperUnicode : CharacterPredicateUnicode<ArrowType, IsUpperUnicode<ArrowType>> {
+template <typename Type>
+struct IsUpperUnicode : CharacterPredicateUnicode<Type, IsUpperUnicode<Type>> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
@@ -885,8 +900,8 @@ struct IsUpperUnicode : CharacterPredicateUnicode<ArrowType, IsUpperUnicode<Arro
 };
 #endif
 
-template <typename ArrowType>
-struct IsUpperAscii : CharacterPredicateAscii<ArrowType, IsUpperAscii<ArrowType>> {
+template <typename Type>
+struct IsUpperAscii : CharacterPredicateAscii<Type, IsUpperAscii<Type>> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||
diff --git a/docker-compose.yml b/docker-compose.yml
index 62ddca0cfcd..89feef5eb21 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -319,6 +319,8 @@ services:
       ARROW_ORC: "OFF"
       ARROW_USE_ASAN: "ON"
       ARROW_USE_UBSAN: "ON"
+      # utf8proc 2.1.0 in Ubuntu Bionic has test failures
+      utf8proc_SOURCE: "BUNDLED"
     command: *cpp-command
 
   fedora-cpp:

From 33bb8bbf6f9f8f9182ce81e6a1f3533605c4b771 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm@apache.org>
Date: Sat, 11 Jul 2020 20:18:14 -0500
Subject: [PATCH 24/28] Instantiate fewer templates

---
 .../arrow/compute/kernels/scalar_string.cc    | 375 ++++++++----------
 1 file changed, 161 insertions(+), 214 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 7c6f6d422b3..cbfe4728a11 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -190,42 +190,30 @@ struct UTF8Transform {
   }
 };
 
-template <typename Type, typename Predicate>
-void BinaryToBoolean(KernelContext* ctx, const ExecBatch& batch, Predicate&& predicate,
-                     Datum* out) {
+using StringPredicate = std::function<bool(KernelContext*, const uint8_t*, size_t)>;
+
+template <typename Type>
+void BinaryToBoolean(KernelContext* ctx, const ExecBatch& batch,
+                     StringPredicate predicate, Datum* out) {
   using offset_type = typename Type::offset_type;
-  using ArrayType = typename TypeTraits<Type>::ArrayType;
 
   if (batch[0].kind() == Datum::ARRAY) {
     EnsureLookupTablesFilled();
     const ArrayData& input = *batch[0].array();
-    ArrayType input_boxed(batch[0].array());
+    ArrayIterator<Type> input_it(input);
     ArrayData* out_arr = out->mutable_array();
-
-    // offset_type input_ncodeunits = input_boxed.total_values_length();
     offset_type input_nstrings = static_cast<offset_type>(input.length);
-
-    FirstTimeBitmapWriter bitmap_writer(out_arr->buffers[1]->mutable_data(),
-                                        out_arr->offset, input.length);
-    for (int64_t i = 0; i < input_nstrings; i++) {
-      offset_type input_string_ncodeunits;
-      const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
-      bool boolean_result = predicate(ctx, input_string, input_string_ncodeunits);
-      if (!ctx->status().ok()) {
-        // UTF decoding can lead to issues
-        return;
-      }
-      if (boolean_result) {
-        bitmap_writer.Set();
-      }
-      bitmap_writer.Next();
-    }
-    bitmap_writer.Finish();
+    ::arrow::internal::GenerateBitsUnrolled(
+        out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
+        [&]() -> bool {
+          util::string_view val = input_it();
+          return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size());
+        });
   } else {
     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
     if (input.is_valid) {
-      offset_type data_nbytes = static_cast<offset_type>(input.value->size());
-      bool boolean_result = predicate(ctx, input.value->data(), data_nbytes);
+      bool boolean_result =
+          predicate(ctx, input.value->data(), static_cast<size_t>(input.value->size()));
       if (!ctx->status().ok()) {
         // UTF decoding can lead to issues
         return;
@@ -602,28 +590,24 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= ' ') && (ascii_character <= '~'));
 }
 
-template <typename Type, typename Derived, bool allow_empty = false>
+template <typename Derived, bool allow_empty = false>
 struct CharacterPredicateUnicode {
-  using offset_type = typename Type::offset_type;
-  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    auto predicate = [](KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_ncodeunits) -> bool {
-      if (allow_empty && input_string_ncodeunits == 0) {
-        return true;
-      }
-      bool all;
-      bool any = false;
-      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
-              input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
-                any |= Derived::PredicateCharacterAny(codepoint);
-                return Derived::PredicateCharacterAll(codepoint);
-              }))) {
-        ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
-        return false;
-      }
-      return all & any;
-    };
-    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
+  static bool Call(KernelContext* ctx, const uint8_t* input,
+                   size_t input_string_ncodeunits) {
+    if (allow_empty && input_string_ncodeunits == 0) {
+      return true;
+    }
+    bool all;
+    bool any = false;
+    if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
+            input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
+              any |= Derived::PredicateCharacterAny(codepoint);
+              return Derived::PredicateCharacterAll(codepoint);
+            }))) {
+      ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+      return false;
+    }
+    return all & any;
   }
 
   static inline bool PredicateCharacterAny(uint32_t) {
@@ -631,26 +615,22 @@ struct CharacterPredicateUnicode {
   }
 };
 
-template <typename Type, typename Derived, bool allow_empty = false>
+template <typename Derived, bool allow_empty = false>
 struct CharacterPredicateAscii {
-  using offset_type = typename Type::offset_type;
-  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    auto predicate = [](KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_ncodeunits) -> bool {
-      if (allow_empty && input_string_ncodeunits == 0) {
-        return true;
-      }
-      bool any = false;
-      // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
-      // benchmark. I don't consider that worth it.
-      bool all = std::all_of(input, input + input_string_ncodeunits,
-                             [&any](uint8_t ascii_character) {
-                               any |= Derived::PredicateCharacterAny(ascii_character);
-                               return Derived::PredicateCharacterAll(ascii_character);
-                             });
-      return all & any;
-    };
-    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
+  static bool Call(KernelContext* ctx, const uint8_t* input,
+                   size_t input_string_ncodeunits) {
+    if (allow_empty && input_string_ncodeunits == 0) {
+      return true;
+    }
+    bool any = false;
+    // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
+    // benchmark. I don't consider that worth it.
+    bool all = std::all_of(input, input + input_string_ncodeunits,
+                           [&any](uint8_t ascii_character) {
+                             any |= Derived::PredicateCharacterAny(ascii_character);
+                             return Derived::PredicateCharacterAll(ascii_character);
+                           });
+    return all & any;
   }
 
   static inline bool PredicateCharacterAny(uint8_t) {
@@ -659,89 +639,71 @@ struct CharacterPredicateAscii {
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
-struct IsAlphaNumericUnicode
-    : CharacterPredicateUnicode<Type, IsAlphaNumericUnicode<Type>> {
+struct IsAlphaNumericUnicode : CharacterPredicateUnicode<IsAlphaNumericUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaNumericCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename Type>
-struct IsAlphaNumericAscii
-    : CharacterPredicateAscii<Type, IsAlphaNumericAscii<Type>> {
+struct IsAlphaNumericAscii : CharacterPredicateAscii<IsAlphaNumericAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaNumericCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
-struct IsAlphaUnicode : CharacterPredicateUnicode<Type, IsAlphaUnicode<Type>> {
+struct IsAlphaUnicode : CharacterPredicateUnicode<IsAlphaUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename Type>
-struct IsAlphaAscii : CharacterPredicateAscii<Type, IsAlphaAscii<Type>> {
+struct IsAlphaAscii : CharacterPredicateAscii<IsAlphaAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
-struct IsDecimalUnicode
-    : CharacterPredicateUnicode<Type, IsDecimalUnicode<Type>> {
+struct IsDecimalUnicode : CharacterPredicateUnicode<IsDecimalUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDecimalCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename Type>
-struct IsDecimalAscii : CharacterPredicateAscii<Type, IsDecimalAscii<Type>> {
+struct IsDecimalAscii : CharacterPredicateAscii<IsDecimalAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsDecimalCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
-struct IsDigitUnicode : CharacterPredicateUnicode<Type, IsDigitUnicode<Type>> {
+struct IsDigitUnicode : CharacterPredicateUnicode<IsDigitUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDigitCharacterUnicode(codepoint);
   }
 };
 
-template <typename Type>
-struct IsNumericUnicode
-    : CharacterPredicateUnicode<Type, IsNumericUnicode<Type>> {
+struct IsNumericUnicode : CharacterPredicateUnicode<IsNumericUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsNumericCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename Type>
 struct IsAscii {
-  using offset_type = typename Type::offset_type;
-  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    auto predicate = [](KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_nascii_characters) -> bool {
-      return std::all_of(input, input + input_string_nascii_characters,
-                         IsAsciiCharacter<uint8_t>);
-    };
-    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
+  static bool Call(KernelContext* ctx, const uint8_t* input,
+                   size_t input_string_nascii_characters) {
+    return std::all_of(input, input + input_string_nascii_characters,
+                       IsAsciiCharacter<uint8_t>);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
-struct IsLowerUnicode : CharacterPredicateUnicode<Type, IsLowerUnicode<Type>> {
+struct IsLowerUnicode : CharacterPredicateUnicode<IsLowerUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
@@ -752,8 +714,7 @@ struct IsLowerUnicode : CharacterPredicateUnicode<Type, IsLowerUnicode<Type>> {
 };
 #endif
 
-template <typename Type>
-struct IsLowerAscii : CharacterPredicateAscii<Type, IsLowerAscii<Type>> {
+struct IsLowerAscii : CharacterPredicateAscii<IsLowerAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||
@@ -765,131 +726,113 @@ struct IsLowerAscii : CharacterPredicateAscii<Type, IsLowerAscii<Type>> {
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
 struct IsPrintableUnicode
-    : CharacterPredicateUnicode<Type, IsPrintableUnicode<Type>,
-                                /*allow_empty=*/true> {
+    : CharacterPredicateUnicode<IsPrintableUnicode, /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename Type>
-struct IsPrintableAscii : CharacterPredicateAscii<Type, IsPrintableAscii<Type>,
-                                                  /*allow_empty=*/true> {
+struct IsPrintableAscii
+    : CharacterPredicateAscii<IsPrintableAscii, /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsPrintableCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
-struct IsSpaceUnicode : CharacterPredicateUnicode<Type, IsSpaceUnicode<Type>> {
+struct IsSpaceUnicode : CharacterPredicateUnicode<IsSpaceUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsSpaceCharacterUnicode(codepoint);
   }
 };
 #endif
 
-template <typename Type>
-struct IsSpaceAscii : CharacterPredicateAscii<Type, IsSpaceAscii<Type>> {
+struct IsSpaceAscii : CharacterPredicateAscii<IsSpaceAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsSpaceCharacterAscii(ascii_character);
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
 struct IsTitleUnicode {
-  using offset_type = typename Type::offset_type;
-
-  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    auto predicate = [](KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_ncodeunits) -> bool {
-      // rules:
-      // * 1: lower case follows cased
-      // * 2: upper case follows uncased
-      // * 3: at least 1 cased character (which logically should be upper/title)
-      bool rules_1_and_2;
-      bool previous_cased = false;  // in LL, LU or LT
-      bool rule_3 = false;
-      bool status =
-      arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
-                             [&previous_cased, &rule_3](uint32_t codepoint) {
-                               if (IsLowerCaseCharacterUnicode(codepoint)) {
-                                 if (!previous_cased) return false;  // rule 1 broken
-                                 previous_cased = true;
-                               } else if (IsCasedCharacterUnicode(codepoint)) {
-                                 if (previous_cased) return false;  // rule 2 broken
-                                 // next should be a lower case or uncased
-                                 previous_cased = true;
-                                 rule_3 = true;  // rule 3 obeyed
-                               } else {
-                                 // a non-cased char, like _ or 1
-                                 // next should be upper case or more uncased
-                                 previous_cased = false;
-                               }
-                               return true;
-                             });
-      if (!ARROW_PREDICT_TRUE(status)) {
-        ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
-        return false;
-      }
-      return rules_1_and_2 & rule_3;
-    };
-    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
+  static bool Call(KernelContext* ctx, const uint8_t* input,
+                   size_t input_string_ncodeunits) {
+    // rules:
+    // * 1: lower case follows cased
+    // * 2: upper case follows uncased
+    // * 3: at least 1 cased character (which logically should be upper/title)
+    bool rules_1_and_2;
+    bool previous_cased = false;  // in LL, LU or LT
+    bool rule_3 = false;
+    bool status =
+        arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
+                               [&previous_cased, &rule_3](uint32_t codepoint) {
+                                 if (IsLowerCaseCharacterUnicode(codepoint)) {
+                                   if (!previous_cased) return false;  // rule 1 broken
+                                   previous_cased = true;
+                                 } else if (IsCasedCharacterUnicode(codepoint)) {
+                                   if (previous_cased) return false;  // rule 2 broken
+                                   // next should be a lower case or uncased
+                                   previous_cased = true;
+                                   rule_3 = true;  // rule 3 obeyed
+                                 } else {
+                                   // a non-cased char, like _ or 1
+                                   // next should be upper case or more uncased
+                                   previous_cased = false;
+                                 }
+                                 return true;
+                               });
+    if (!ARROW_PREDICT_TRUE(status)) {
+      ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+      return false;
+    }
+    return rules_1_and_2 & rule_3;
   }
 };
 #endif
 
-template <typename Type>
 struct IsTitleAscii {
-  using offset_type = typename Type::offset_type;
-
-  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    auto predicate = [](KernelContext* ctx, const uint8_t* input,
-                        offset_type input_string_ncodeunits) -> bool {
-      // rules:
-      // * 1: lower case follows cased
-      // * 2: upper case follows uncased
-      // * 3: at least 1 cased character (which logically should be upper/title)
-      bool rules_1_and_2 = true;
-      bool previous_cased = false;  // in LL, LU or LT
-      bool rule_3 = false;
-      // we cannot rely on std::all_of because we need guaranteed order
-      for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
-        if (IsLowerCaseCharacterAscii(*c)) {
-          if (!previous_cased) {
-            // rule 1 broken
-            rules_1_and_2 = false;
-            break;
-          }
-          previous_cased = true;
-        } else if (IsCasedCharacterAscii(*c)) {
-          if (previous_cased) {
-            // rule 2 broken
-            rules_1_and_2 = false;
-            break;
-          }
-          // next should be a lower case or uncased
-          previous_cased = true;
-          rule_3 = true;  // rule 3 obeyed
-        } else {
-          // a non-cased char, like _ or 1
-          // next should be upper case or more uncased
-          previous_cased = false;
+  static bool Call(KernelContext* ctx, const uint8_t* input,
+                   size_t input_string_ncodeunits) {
+    // rules:
+    // * 1: lower case follows cased
+    // * 2: upper case follows uncased
+    // * 3: at least 1 cased character (which logically should be upper/title)
+    bool rules_1_and_2 = true;
+    bool previous_cased = false;  // in LL, LU or LT
+    bool rule_3 = false;
+    // we cannot rely on std::all_of because we need guaranteed order
+    for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
+      if (IsLowerCaseCharacterAscii(*c)) {
+        if (!previous_cased) {
+          // rule 1 broken
+          rules_1_and_2 = false;
+          break;
+        }
+        previous_cased = true;
+      } else if (IsCasedCharacterAscii(*c)) {
+        if (previous_cased) {
+          // rule 2 broken
+          rules_1_and_2 = false;
+          break;
         }
+        // next should be a lower case or uncased
+        previous_cased = true;
+        rule_3 = true;  // rule 3 obeyed
+      } else {
+        // a non-cased char, like _ or 1
+        // next should be upper case or more uncased
+        previous_cased = false;
       }
-      return rules_1_and_2 & rule_3;
-    };
-    return BinaryToBoolean<Type>(ctx, batch, std::move(predicate), out);
+    }
+    return rules_1_and_2 & rule_3;
   }
 };
 
 #ifdef ARROW_WITH_UTF8PROC
-template <typename Type>
-struct IsUpperUnicode : CharacterPredicateUnicode<Type, IsUpperUnicode<Type>> {
+struct IsUpperUnicode : CharacterPredicateUnicode<IsUpperUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
@@ -900,8 +843,7 @@ struct IsUpperUnicode : CharacterPredicateUnicode<Type, IsUpperUnicode<Type>> {
 };
 #endif
 
-template <typename Type>
-struct IsUpperAscii : CharacterPredicateAscii<Type, IsUpperAscii<Type>> {
+struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||
@@ -982,13 +924,18 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
 
 #endif
 
-template <template <typename> class Transformer>
-void AddUnaryString(std::string name, FunctionRegistry* registry) {
+template <typename Predicate>
+void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
-  ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
-  ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
-  DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32));
-  DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), exec_64));
+  DCHECK_OK(func->AddKernel(
+      {utf8()}, boolean(), [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+        BinaryToBoolean<BinaryType>(ctx, batch, Predicate::Call, out);
+      }));
+  DCHECK_OK(func->AddKernel({large_utf8()}, boolean(),
+                            [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+                              BinaryToBoolean<LargeBinaryType>(ctx, batch,
+                                                               Predicate::Call, out);
+                            }));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
@@ -998,31 +945,31 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);
 
-  AddUnaryString<IsAscii>("binary_isascii", registry);
+  AddUnaryStringPredicate<IsAscii>("binary_isascii", registry);
 
-  AddUnaryString<IsAlphaNumericAscii>("ascii_isalnum", registry);
-  AddUnaryString<IsAlphaAscii>("ascii_isalpha", registry);
-  AddUnaryString<IsDecimalAscii>("ascii_isdecimal", registry);
+  AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_isalnum", registry);
+  AddUnaryStringPredicate<IsAlphaAscii>("ascii_isalpha", registry);
+  AddUnaryStringPredicate<IsDecimalAscii>("ascii_isdecimal", registry);
   // no isdigic for ascii, since it is the same as isdecimal
-  AddUnaryString<IsLowerAscii>("ascii_islower", registry);
+  AddUnaryStringPredicate<IsLowerAscii>("ascii_islower", registry);
   // no isnumeric for ascii, since it is the same as isdecimal
-  AddUnaryString<IsPrintableAscii>("ascii_isprintable", registry);
-  AddUnaryString<IsSpaceAscii>("ascii_isspace", registry);
-  AddUnaryString<IsTitleAscii>("ascii_istitle", registry);
-  AddUnaryString<IsUpperAscii>("ascii_isupper", registry);
+  AddUnaryStringPredicate<IsPrintableAscii>("ascii_isprintable", registry);
+  AddUnaryStringPredicate<IsSpaceAscii>("ascii_isspace", registry);
+  AddUnaryStringPredicate<IsTitleAscii>("ascii_istitle", registry);
+  AddUnaryStringPredicate<IsUpperAscii>("ascii_isupper", registry);
 #ifdef ARROW_WITH_UTF8PROC
   MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry);
   MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry);
-  AddUnaryString<IsAlphaNumericUnicode>("utf8_isalnum", registry);
-  AddUnaryString<IsAlphaUnicode>("utf8_isalpha", registry);
-  AddUnaryString<IsDecimalUnicode>("utf8_isdecimal", registry);
-  AddUnaryString<IsDigitUnicode>("utf8_isdigit", registry);
-  AddUnaryString<IsLowerUnicode>("utf8_islower", registry);
-  AddUnaryString<IsNumericUnicode>("utf8_isnumeric", registry);
-  AddUnaryString<IsPrintableUnicode>("utf8_isprintable", registry);
-  AddUnaryString<IsSpaceUnicode>("utf8_isspace", registry);
-  AddUnaryString<IsTitleUnicode>("utf8_istitle", registry);
-  AddUnaryString<IsUpperUnicode>("utf8_isupper", registry);
+  AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_isalnum", registry);
+  AddUnaryStringPredicate<IsAlphaUnicode>("utf8_isalpha", registry);
+  AddUnaryStringPredicate<IsDecimalUnicode>("utf8_isdecimal", registry);
+  AddUnaryStringPredicate<IsDigitUnicode>("utf8_isdigit", registry);
+  AddUnaryStringPredicate<IsLowerUnicode>("utf8_islower", registry);
+  AddUnaryStringPredicate<IsNumericUnicode>("utf8_isnumeric", registry);
+  AddUnaryStringPredicate<IsPrintableUnicode>("utf8_isprintable", registry);
+  AddUnaryStringPredicate<IsSpaceUnicode>("utf8_isspace", registry);
+  AddUnaryStringPredicate<IsTitleUnicode>("utf8_istitle", registry);
+  AddUnaryStringPredicate<IsUpperUnicode>("utf8_isupper", registry);
 
 #endif
 

From a160803472c98dcc7e0eabbc355ed1e5a6fefe82 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm@apache.org>
Date: Sat, 11 Jul 2020 20:38:12 -0500
Subject: [PATCH 25/28] Some fixes, try something else to see if it fixes the
 gcc compilation issue

---
 .../arrow/compute/kernels/scalar_string.cc    | 80 +++++++++----------
 1 file changed, 38 insertions(+), 42 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index cbfe4728a11..c201c3efc00 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -190,39 +190,6 @@ struct UTF8Transform {
   }
 };
 
-using StringPredicate = std::function<bool(KernelContext*, const uint8_t*, size_t)>;
-
-template <typename Type>
-void BinaryToBoolean(KernelContext* ctx, const ExecBatch& batch,
-                     StringPredicate predicate, Datum* out) {
-  using offset_type = typename Type::offset_type;
-
-  if (batch[0].kind() == Datum::ARRAY) {
-    EnsureLookupTablesFilled();
-    const ArrayData& input = *batch[0].array();
-    ArrayIterator<Type> input_it(input);
-    ArrayData* out_arr = out->mutable_array();
-    offset_type input_nstrings = static_cast<offset_type>(input.length);
-    ::arrow::internal::GenerateBitsUnrolled(
-        out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
-        [&]() -> bool {
-          util::string_view val = input_it();
-          return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size());
-        });
-  } else {
-    const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
-    if (input.is_valid) {
-      bool boolean_result =
-          predicate(ctx, input.value->data(), static_cast<size_t>(input.value->size()));
-      if (!ctx->status().ok()) {
-        // UTF decoding can lead to issues
-        return;
-      }
-      out->value = std::make_shared<BooleanScalar>(boolean_result);
-    }
-  }
-}
-
 template <typename Type>
 struct UTF8Upper : UTF8Transform<Type, UTF8Upper<Type>> {
   inline static uint32_t TransformCodepoint(uint32_t codepoint) {
@@ -924,18 +891,47 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
 
 #endif
 
+using StringPredicate = std::function<bool(KernelContext*, const uint8_t*, size_t)>;
+
+template <typename Type>
+void ApplyPredicate(KernelContext* ctx, const ExecBatch& batch, StringPredicate predicate,
+                    Datum* out) {
+  if (batch[0].kind() == Datum::ARRAY) {
+    EnsureLookupTablesFilled();
+    const ArrayData& input = *batch[0].array();
+    ArrayIterator<Type> input_it(input);
+    ArrayData* out_arr = out->mutable_array();
+    ::arrow::internal::GenerateBitsUnrolled(
+        out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
+        [&]() -> bool {
+          util::string_view val = input_it();
+          return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size());
+        });
+  } else {
+    const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+    if (input.is_valid) {
+      bool boolean_result =
+          predicate(ctx, input.value->data(), static_cast<size_t>(input.value->size()));
+      if (!ctx->status().ok()) {
+        // UTF decoding can lead to issues
+        return;
+      }
+      out->value = std::make_shared<BooleanScalar>(boolean_result);
+    }
+  }
+}
+
 template <typename Predicate>
 void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
-  DCHECK_OK(func->AddKernel(
-      {utf8()}, boolean(), [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-        BinaryToBoolean<BinaryType>(ctx, batch, Predicate::Call, out);
-      }));
-  DCHECK_OK(func->AddKernel({large_utf8()}, boolean(),
-                            [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-                              BinaryToBoolean<LargeBinaryType>(ctx, batch,
-                                                               Predicate::Call, out);
-                            }));
+  auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
+  };
+  auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
+  };
+  DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32)));
+  DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64)));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 

From 2efd9c8effeb6174f44b0fc9b3b90b9d29f0e222 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm@apache.org>
Date: Sat, 11 Jul 2020 20:39:52 -0500
Subject: [PATCH 26/28] Move LUT initialization out of if block

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index c201c3efc00..2364ddc52c8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -104,8 +104,8 @@ struct UTF8Transform {
   }
 
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
     if (batch[0].kind() == Datum::ARRAY) {
-      EnsureLookupTablesFilled();
       const ArrayData& input = *batch[0].array();
       ArrayType input_boxed(batch[0].array());
       ArrayData* output = out->mutable_array();
@@ -896,8 +896,8 @@ using StringPredicate = std::function<bool(KernelContext*, const uint8_t*, size_
 template <typename Type>
 void ApplyPredicate(KernelContext* ctx, const ExecBatch& batch, StringPredicate predicate,
                     Datum* out) {
+  EnsureLookupTablesFilled();
   if (batch[0].kind() == Datum::ARRAY) {
-    EnsureLookupTablesFilled();
     const ArrayData& input = *batch[0].array();
     ArrayIterator<Type> input_it(input);
     ArrayData* out_arr = out->mutable_array();

From ce7869d6722b190270932af953b48920bddb6a2b Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm@apache.org>
Date: Sat, 11 Jul 2020 20:56:13 -0500
Subject: [PATCH 27/28] Compile without utf8proc

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 2364ddc52c8..451dacf904e 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -206,6 +206,10 @@ struct UTF8Lower : UTF8Transform<Type, UTF8Lower<Type>> {
   }
 };
 
+#else
+
+void EnsureLookupTablesFilled() {}
+
 #endif  // ARROW_WITH_UTF8PROC
 
 using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;

From bc5a2ecbd362e0c879fa7d26bd63b3d6b8fb9b2b Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm@apache.org>
Date: Sat, 11 Jul 2020 21:01:58 -0500
Subject: [PATCH 28/28] Skip failing test on older utf8proc

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index f8b4ea2f3d2..88a0258ee5f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -227,6 +227,9 @@ TYPED_TEST(TestStringKernels, IsTitleUnicode) {
       boolean(), "[false, null, false, false, false, false, false, false]");
 }
 
+// Older versions of utf8proc fail
+#if !(UTF8PROC_VERSION_MAJOR <= 2 && UTF8PROC_VERSION_MINOR < 5)
+
 TYPED_TEST(TestStringKernels, IsUpperUnicode) {
   // ٣ is arabic 3 (decimal), Φ capital
   this->CheckUnary(
@@ -246,6 +249,8 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
                    boolean(), "[true, true, true, false, true, false]");
 }
 
+#endif  // UTF8PROC_VERSION_MINOR >= 5
+
 #endif  // ARROW_WITH_UTF8PROC
 
 TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {