From c9d0a478683e920867016aed7fcbef070a05ca6c Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Wed, 4 Aug 2021 08:20:04 -0400
Subject: [PATCH 01/23] initial impl of ASCII title

---
 .../arrow/compute/kernels/scalar_string.cc    | 55 ++++++++++++++-----
 1 file changed, 41 insertions(+), 14 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index f5de81acd67..16bba2aeef4 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -81,6 +81,22 @@ static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
                                                               : utf8_code_unit;
 }
 
+static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) {
+  if (IsLowerCaseCharacterAscii(utf8_code_unit)) {
+    utf8_code_unit -= 32;
+  } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) {
+    utf8_code_unit += 32;
+  }
+  return utf8_code_unit;
+}
+
+// IsAlpha/Digit etc
+
+template <typename T>
+static inline bool IsAsciiCharacter(T character) {
+  return character < 128;
+}
+
 static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
   return (ascii_character >= 'a') && (ascii_character <= 'z');
 }
@@ -117,20 +133,6 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= ' ') && (ascii_character <= '~'));
 }
 
-static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) {
-  if (IsLowerCaseCharacterAscii(utf8_code_unit)) {
-    utf8_code_unit -= 32;
-  } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) {
-    utf8_code_unit += 32;
-  }
-  return utf8_code_unit;
-}
-
-template <typename T>
-static inline bool IsAsciiCharacter(T character) {
-  return character < 128;
-}
-
 struct BinaryLength {
   template <typename OutValue, typename Arg0Value = util::string_view>
   static OutValue Call(KernelContext*, Arg0Value val, Status*) {
@@ -677,6 +679,31 @@ struct AsciiCapitalizeTransform : public StringTransformBase {
 template <typename Type>
 using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 
+struct AsciiTitleTransform : public StringTransformBase {
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) {
+    uint8_t* c = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    while (c <= end) {
+      uint8_t* c_;
+      // Uppercase first caseable character of current word
+      while ((c_ = c++) <= end) {
+        if (IsCasedCharacterAscii(*c_)) {
+          *c_ = ascii_toupper(*c_);
+          break;
+        }
+      }
+      // Lowercase characters until a whitespace is found
+      while ((c_ = c++) <= end) {
+        if (IsSpaceCharacterAscii(*c_)) {
+          break;
+        }
+        *c_ = ascii_tolower(*c_);
+      }
+    }
+    return input_string_ncodeunits;
+  }
+};
+
 // ----------------------------------------------------------------------
 // exact pattern detection
 

From e05fa30503a814afd28fbcc50c3eab51b6c2009d Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Wed, 4 Aug 2021 14:14:42 -0400
Subject: [PATCH 02/23] update ASCII title kernel

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 16bba2aeef4..1e2e974ac32 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -685,7 +685,7 @@ struct AsciiTitleTransform : public StringTransformBase {
     const uint8_t* end = input + input_string_ncodeunits;
     while (c <= end) {
       uint8_t* c_;
-      // Uppercase first caseable character of current word
+      // Uppercase first alpha character of current word
       while ((c_ = c++) <= end) {
         if (IsCasedCharacterAscii(*c_)) {
           *c_ = ascii_toupper(*c_);
@@ -704,6 +704,9 @@ struct AsciiTitleTransform : public StringTransformBase {
   }
 };
 
+template <typename Type>
+using AsciiTitle = StringTransformExec<Type, AsciiTitleTransform>;
+
 // ----------------------------------------------------------------------
 // exact pattern detection
 
@@ -4154,6 +4157,13 @@ const FunctionDoc ascii_capitalize_doc(
      "non-ASCII characters, use \"utf8_capitalize\" instead."),
     {"strings"});
 
+const FunctionDoc ascii_title_doc(
+    "Transform ASCII input to a string where the first alpha character in each word is uppercase and all other characters are lowercase",
+    ("For each string in `strings`, return a title version.\n\n"
+     "This function assumes the input is fully ASCII.  If it may contain\n"
+     "non-ASCII characters, use \"utf8_title\" instead."),
+    {"strings"});
+
 const FunctionDoc ascii_reverse_doc(
     "Reverse ASCII input",
     ("For each ASCII string in `strings`, return a reversed version.\n\n"
@@ -4201,6 +4211,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
       "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE);
   MakeUnaryStringBatchKernel<AsciiCapitalize>("ascii_capitalize", registry,
                                               &ascii_capitalize_doc);
+  MakeUnaryStringBatchKernel<AsciiTitle>("ascii_title", registry, &ascii_title_doc);
   MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
                                                   &ascii_trim_whitespace_doc);
   MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,

From b34e198149c56cf4add9ce6eb7ce2e433e1146a7 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Thu, 5 Aug 2021 07:23:23 -0400
Subject: [PATCH 03/23] add ASCII title kernel

---
 .../arrow/compute/kernels/scalar_string.cc    | 68 ++++++++++---------
 .../compute/kernels/scalar_string_test.cc     | 14 +++-
 2 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 1e2e974ac32..3cf25a6d11c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -66,29 +66,10 @@ Status RegexStatus(const RE2& regex) {
 }
 #endif
 
-// Code units in the range [a-z] can only be an encoding of an ascii
-// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different
-// codepoint. This guaranteed by non-overlap design of the unicode standard. (see
-// section 2.5 of Unicode Standard Core Specification v13.0)
-
-static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
-  return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32)
-                                                              : utf8_code_unit;
-}
-
-static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
-  return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32)
-                                                              : utf8_code_unit;
-}
-
-static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) {
-  if (IsLowerCaseCharacterAscii(utf8_code_unit)) {
-    utf8_code_unit -= 32;
-  } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) {
-    utf8_code_unit += 32;
-  }
-  return utf8_code_unit;
-}
+// Code units in the range [a-z] can only be an encoding of an ASCII
+// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different
+// codepoint. This is guaranteed by the non-overlap design of the Unicode
+// standard. (see section 2.5 of Unicode Standard Core Specification v13.0)
 
 // IsAlpha/Digit etc
 
@@ -149,6 +130,25 @@ struct Utf8Length {
   }
 };
 
+static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
+  return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32)
+                                                              : utf8_code_unit;
+}
+
+static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
+  return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32)
+                                                              : utf8_code_unit;
+}
+
+static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) {
+  if (IsLowerCaseCharacterAscii(utf8_code_unit)) {
+    utf8_code_unit -= 32;
+  } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) {
+    utf8_code_unit += 32;
+  }
+  return utf8_code_unit;
+}
+
 #ifdef ARROW_WITH_UTF8PROC
 
 // Direct lookup tables for unicode properties
@@ -681,23 +681,24 @@ using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 
 struct AsciiTitleTransform : public StringTransformBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) {
-    uint8_t* c = input;
-    const uint8_t* end = input + input_string_ncodeunits;
-    while (c <= end) {
-      uint8_t* c_;
+    const uint8_t* const end = input + input_string_ncodeunits;
+    while (input <= end) {
+      const uint8_t* c_;
       // Uppercase first alpha character of current word
-      while ((c_ = c++) <= end) {
+      while ((c_ = input++) <= end) {
         if (IsCasedCharacterAscii(*c_)) {
-          *c_ = ascii_toupper(*c_);
+          *output++ = ascii_toupper(*c_);
           break;
         }
+        *output++ = *c_;
       }
       // Lowercase characters until a whitespace is found
-      while ((c_ = c++) <= end) {
+      while ((c_ = input++) <= end) {
         if (IsSpaceCharacterAscii(*c_)) {
+          *output++ = *c_;
           break;
         }
-        *c_ = ascii_tolower(*c_);
+        *output++ = ascii_tolower(*c_);
       }
     }
     return input_string_ncodeunits;
@@ -4201,8 +4202,9 @@ const FunctionDoc utf8_reverse_doc(
 }  // namespace
 
 void RegisterScalarStringAscii(FunctionRegistry* registry) {
-  // ascii_upper and ascii_lower are able to reuse the original offsets buffer,
-  // so don't preallocate them in the output.
+  // Some kernels are able to reuse the original offsets buffer, so don't
+  // preallocate them in the output. Only kernels that invoke
+  // "StringDataTransform()" support no preallocation.
   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
                                          MemAllocation::NO_PREALLOCATE);
   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 271ecb0b415..80aeed6ca7d 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -413,6 +413,16 @@ TYPED_TEST(TestStringKernels, AsciiCapitalize) {
                    "\"!hello, world!\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiTitle) {
+  this->CheckUnary("ascii_title", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_title",
+                   "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", "
+                   "\"!hELlo, wORLd!\", \"  a b cd\"]",
+                   this->type(),
+                   "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, World!\", \"$. A3\", "
+                   "\"!Hello, World!\", \"  A B Cd\"]");
+}
+
 TYPED_TEST(TestStringKernels, AsciiReverse) {
   this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
   this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
@@ -522,7 +532,7 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) {
   // test maximum buffer growth
   this->CheckUnary("utf8_swapcase", "[\"ȺȺȺȺ\"]", this->type(), "[\"ⱥⱥⱥⱥ\"]");
 
-  this->CheckUnary("ascii_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(),
+  this->CheckUnary("utf8_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(),
                    "[\"HeLLo, wOrLD!\", \"$. a35?\"]");
 
   // Test invalid data
@@ -532,7 +542,7 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) {
 }
 
 TYPED_TEST(TestStringKernels, Utf8Capitalize) {
-  this->CheckUnary("ascii_capitalize", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_capitalize", "[]", this->type(), "[]");
   this->CheckUnary("utf8_capitalize",
                    "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", "
                    "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]",

From daa4cb7dfa6169be7fe1a47ba34e3f96f1f6ae33 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Thu, 5 Aug 2021 07:27:23 -0400
Subject: [PATCH 04/23] update docs for ASCII title

---
 docs/source/cpp/compute.rst        | 6 ++++--
 docs/source/python/api/compute.rst | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 465500e8dae..26ffb640b21 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -697,7 +697,7 @@ String transforms
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | Function name           | Arity | Input types            | Output type            | Options class                     | Notes |
 +=========================+=======+========================+========================+===================================+=======+
-| ascii_capitalize        | Unary | String-like            | String-like            |                                   |       |
+| ascii_capitalize        | Unary | String-like            | String-like            |                                   | \(1)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | ascii_lower             | Unary | String-like            | String-like            |                                   | \(1)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
@@ -705,6 +705,8 @@ String transforms
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | ascii_swapcase          | Unary | String-like            | String-like            |                                   | \(1)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_title             | Unary | String-like            | String-like            |                                   | \(1)  |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | ascii_upper             | Unary | String-like            | String-like            |                                   | \(1)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | binary_length           | Unary | Binary- or String-like | Int32 or Int64         |                                   | \(3)  |
@@ -715,7 +717,7 @@ String transforms
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | replace_substring_regex | Unary | String-like            | String-like            | :struct:`ReplaceSubstringOptions` | \(6)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_capitalize         | Unary | String-like            | String-like            |                                   |       |
+| utf8_capitalize         | Unary | String-like            | String-like            |                                   | \(8)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | utf8_length             | Unary | String-like            | Int32 or Int64         |                                   | \(7)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index b737439147e..96679babaa9 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -264,6 +264,7 @@ String Transforms
    ascii_rtrim
    ascii_rtrim_whitespace
    ascii_swapcase
+   ascii_title
    ascii_trim
    ascii_upper
    binary_length

From 70945058155245b1d1da1cf1dfba22bfacbaa9c2 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Thu, 5 Aug 2021 08:12:13 -0400
Subject: [PATCH 05/23] fix lint error

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 3cf25a6d11c..44d91a459fa 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -4204,7 +4204,7 @@ const FunctionDoc utf8_reverse_doc(
 void RegisterScalarStringAscii(FunctionRegistry* registry) {
   // Some kernels are able to reuse the original offsets buffer, so don't
   // preallocate them in the output. Only kernels that invoke
-  // "StringDataTransform()" support no preallocation.
+  // "StringDataTransform" support no preallocation.
   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
                                          MemAllocation::NO_PREALLOCATE);
   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,

From 8e4c02735e7bfe5564733fa0601fba56cfa58c81 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Wed, 18 Aug 2021 16:40:49 -0400
Subject: [PATCH 06/23] fix linting errors

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 44d91a459fa..b5f2e978be1 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -680,7 +680,8 @@ template <typename Type>
 using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 
 struct AsciiTitleTransform : public StringTransformBase {
-  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) {
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
     const uint8_t* const end = input + input_string_ncodeunits;
     while (input <= end) {
       const uint8_t* c_;
@@ -4159,7 +4160,8 @@ const FunctionDoc ascii_capitalize_doc(
     {"strings"});
 
 const FunctionDoc ascii_title_doc(
-    "Transform ASCII input to a string where the first alpha character in each word is uppercase and all other characters are lowercase",
+    "Transform ASCII input to a string where the first alpha character in each word is "
+    "uppercase and all other characters are lowercase",
     ("For each string in `strings`, return a title version.\n\n"
      "This function assumes the input is fully ASCII.  If it may contain\n"
      "non-ASCII characters, use \"utf8_title\" instead."),

From 3b8c4ce208c6123adcc9a6ebbf1ca082dbeb8f77 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Fri, 27 Aug 2021 06:36:16 -0400
Subject: [PATCH 07/23] update capitalize/title and add UTF8 title

---
 .../arrow/compute/kernels/scalar_string.cc    | 148 +++++++++++-------
 1 file changed, 95 insertions(+), 53 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index b5f2e978be1..618120ce97d 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -422,17 +422,25 @@ struct StringTransformExecWithState
 
 #ifdef ARROW_WITH_UTF8PROC
 
-template <typename CodepointTransform>
-struct StringTransformCodepoint : public StringTransformBase {
+struct StringTransformCodepointBase : public StringTransformBase {
   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
     EnsureLookupTablesFilled();
     return Status::OK();
   }
 
   int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
-    return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits);
+    // Section 5.18 of the Unicode spec claims that the number of codepoints for case
+    // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
+    // However, since we don't support all casings (SpecialCasing.txt) the growth
+    // in bytes is actually only at max 3/2 (as covered by the unittest).
+    // Note that rounding down the 3/2 is ok, since only codepoints encoded by
+    // two code units (even) can grow to 3 code units.
+    return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
   }
+};
 
+template <typename CodepointTransform>
+struct StringTransformCodepoint : public StringTransformCodepointBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;
@@ -445,20 +453,7 @@ struct StringTransformCodepoint : public StringTransformBase {
   }
 };
 
-// struct CaseMappingMixin {
-struct CaseMappingTransform {
-  static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
-    // Section 5.18 of the Unicode spec claims that the number of codepoints for case
-    // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
-    // However, since we don't support all casings (SpecialCasing.txt) the growth
-    // in bytes is actually only at max 3/2 (as covered by the unittest).
-    // Note that rounding down the 3/2 is ok, since only codepoints encoded by
-    // two code units (even) can grow to 3 code units.
-    return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
-  }
-};
-
-struct UTF8UpperTransform : public CaseMappingTransform {
+struct UTF8UpperTransform : public StringTransformCodepointBase {
   static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
                                             : utf8proc_toupper(codepoint);
@@ -468,7 +463,7 @@ struct UTF8UpperTransform : public CaseMappingTransform {
 template <typename Type>
 using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
 
-struct UTF8LowerTransform : public CaseMappingTransform {
+struct UTF8LowerTransform : public StringTransformCodepointBase {
   static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
                                             : utf8proc_tolower(codepoint);
@@ -478,7 +473,7 @@ struct UTF8LowerTransform : public CaseMappingTransform {
 template <typename Type>
 using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
 
-struct UTF8SwapCaseTransform : public CaseMappingTransform {
+struct UTF8SwapCaseTransform : public StringTransformCodepointBase {
   static uint32_t TransformCodepoint(uint32_t codepoint) {
     if (codepoint <= kMaxCodepointLookup) {
       return lut_swapcase_codepoint[codepoint];
@@ -498,29 +493,20 @@ template <typename Type>
 using UTF8SwapCase =
     StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;
 
-struct Utf8CapitalizeTransform : public StringTransformBase {
+struct Utf8CapitalizeTransform : public StringTransformCodepointBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;
-    if (input_string_ncodeunits > 0) {
-      // Get number of code units in first code point
-      uint32_t codepoint = 0;
-      const uint8_t* i = input;
-      if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&i, &codepoint))) {
-        return kTransformError;
-      }
-      int64_t codepoint_ncodeunits =
-          std::min(static_cast<int64_t>(i - input), input_string_ncodeunits);
-      if (ARROW_PREDICT_FALSE(
-              !util::UTF8Transform(input, input + codepoint_ncodeunits, &output,
-                                   UTF8UpperTransform::TransformCodepoint))) {
-        return kTransformError;
-      }
-      if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
-              input + codepoint_ncodeunits, input + input_string_ncodeunits, &output,
-              UTF8LowerTransform::TransformCodepoint))) {
-        return kTransformError;
-      }
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* next = input;
+    if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) {
+      return kTransformError;
+    }
+    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(input, next, &output, UTF8UpperTransform::TransformCodepoint))) {
+      return kTransformError;
+    }
+    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(next, end, &output, UTF8LowerTransform::TransformCodepoint))) {
+      return kTransformError;
     }
     return output - output_start;
   }
@@ -529,6 +515,53 @@ struct Utf8CapitalizeTransform : public StringTransformBase {
 template <typename Type>
 using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 
+struct Utf8TitleTransform: public StringTransformCodepointBase {
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    uint8_t* output_start = output;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* curr = NULLPTR;
+    uint32_t codepoint = 0;
+
+    do {
+      // Uppercase first alpha character of current word
+      while (input < end) {
+        curr = input;
+        if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) {
+          return kTransformError;
+        }
+        if (IsCasedCharacterUnicode(codepoint)) {
+          output = util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint));
+          input = curr;
+          break;
+        }
+        output = std::copy(input, curr, output);
+        input = curr;
+      }
+
+      // Lowercase characters until a whitespace is found
+      while (input < end) {
+        curr = input;
+        if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) {
+          return kTransformError;
+        }
+        if (IsSpaceCharacterUnicode(codepoint)) {
+          output = std::copy(input, curr, output);
+          input = curr;
+          break;
+        }
+        output = util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint));
+        input = curr;
+      }
+    } while (input < end);
+
+    return output - output_start;
+  }
+};
+
+template <typename Type>
+using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
+
 #endif  // ARROW_WITH_UTF8PROC
 
 struct AsciiReverseTransform : public StringTransformBase {
@@ -669,8 +702,8 @@ struct AsciiCapitalizeTransform : public StringTransformBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     if (input_string_ncodeunits > 0) {
-      *output = ascii_toupper(*input);
-      TransformAsciiLower(input + 1, input_string_ncodeunits - 1, output + 1);
+      *output++ = ascii_toupper(*input++);
+      TransformAsciiLower(input, input_string_ncodeunits - 1, output);
     }
     return input_string_ncodeunits;
   }
@@ -682,26 +715,29 @@ using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 struct AsciiTitleTransform : public StringTransformBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
-    const uint8_t* const end = input + input_string_ncodeunits;
-    while (input <= end) {
-      const uint8_t* c_;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* curr = NULLPTR;
+
+    do {
       // Uppercase first alpha character of current word
-      while ((c_ = input++) <= end) {
-        if (IsCasedCharacterAscii(*c_)) {
-          *output++ = ascii_toupper(*c_);
+      while ((curr = input++) < end) {
+        if (IsCasedCharacterAscii(*curr)) {
+          *output++ = ascii_toupper(*curr);
           break;
         }
-        *output++ = *c_;
+        *output++ = *curr;
       }
+
       // Lowercase characters until a whitespace is found
-      while ((c_ = input++) <= end) {
-        if (IsSpaceCharacterAscii(*c_)) {
-          *output++ = *c_;
+      while ((curr = input++) < end) {
+        if (IsSpaceCharacterAscii(*curr)) {
+          *output++ = *curr;
           break;
         }
-        *output++ = ascii_tolower(*c_);
+        *output++ = ascii_tolower(*curr);
       }
-    }
+    } while (input < end);
+
     return input_string_ncodeunits;
   }
 };
@@ -4193,6 +4229,11 @@ const FunctionDoc utf8_capitalize_doc(
      "with the first character uppercased and the others lowercased."),
     {"strings"});
 
+const FunctionDoc utf8_title_doc(
+    "Transform input to a string where the first alpha character in each word is "
+    "uppercase and all other characters are lowercase",
+    ("For each string in `strings`, return a title version."), {"strings"});
+
 const FunctionDoc utf8_reverse_doc(
     "Reverse input",
     ("For each string in `strings`, return a reversed version.\n\n"
@@ -4263,6 +4304,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
                                                    &utf8_swapcase_doc);
   MakeUnaryStringBatchKernel<Utf8Capitalize>("utf8_capitalize", registry,
                                              &utf8_capitalize_doc);
+  MakeUnaryStringBatchKernel<Utf8Title>("utf8_title", registry, &utf8_title_doc);
   MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
                                                  &utf8_trim_whitespace_doc);
   MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,

From e77abe9debcd31e350b96be29a8831dede2730d7 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Fri, 27 Aug 2021 06:36:34 -0400
Subject: [PATCH 08/23] add UTF8 title tests

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 80aeed6ca7d..a7ac9949be7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -551,6 +551,16 @@ TYPED_TEST(TestStringKernels, Utf8Capitalize) {
                    "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]");
 }
 
+TYPED_TEST(TestStringKernels, Utf8Title) {
+  this->CheckUnary("utf8_title", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_title",
+                   "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", "
+                   "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]",
+                   this->type(),
+                   "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", "
+                   "\"Hello, World!\", \"$. A3\", \"!Ɑɽɽow\"]");
+}
+
 TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
   // U+08BE (utf8: 	\xE0\xA2\xBE) is undefined, but utf8proc things it is
   // UTF8PROC_CATEGORY_LO

From 8fcff85cde57d4da8b5de63d6d8b12c0e0d36f5d Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Fri, 27 Aug 2021 06:40:38 -0400
Subject: [PATCH 09/23] update docs

---
 docs/source/cpp/compute.rst        | 2 ++
 docs/source/python/api/compute.rst | 1 +
 2 files changed, 3 insertions(+)

diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 26ffb640b21..7263d77acf2 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -729,6 +729,8 @@ String transforms
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | utf8_swapcase           | Unary | String-like            | String-like            |                                   | \(8)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_title              | Unary | String-like            | String-like            |                                   | \(8)  |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | utf8_upper              | Unary | String-like            | String-like            |                                   | \(8)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index 96679babaa9..f7f740c24e5 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -284,6 +284,7 @@ String Transforms
    utf8_rtrim
    utf8_rtrim_whitespace
    utf8_swapcase
+   utf8_title
    utf8_trim
    utf8_upper
 

From d0f277cb1104f22fa5e21ce0b9323ad99b80e3c1 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Fri, 27 Aug 2021 06:42:32 -0400
Subject: [PATCH 10/23] apply lint changes

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 618120ce97d..938daffa011 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -502,10 +502,12 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase {
     if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) {
       return kTransformError;
     }
-    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(input, next, &output, UTF8UpperTransform::TransformCodepoint))) {
+    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
+            input, next, &output, UTF8UpperTransform::TransformCodepoint))) {
       return kTransformError;
     }
-    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(next, end, &output, UTF8LowerTransform::TransformCodepoint))) {
+    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
+            next, end, &output, UTF8LowerTransform::TransformCodepoint))) {
       return kTransformError;
     }
     return output - output_start;
@@ -515,7 +517,7 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase {
 template <typename Type>
 using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 
-struct Utf8TitleTransform: public StringTransformCodepointBase {
+struct Utf8TitleTransform : public StringTransformCodepointBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;
@@ -531,7 +533,8 @@ struct Utf8TitleTransform: public StringTransformCodepointBase {
           return kTransformError;
         }
         if (IsCasedCharacterUnicode(codepoint)) {
-          output = util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint));
+          output =
+              util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint));
           input = curr;
           break;
         }
@@ -550,7 +553,8 @@ struct Utf8TitleTransform: public StringTransformCodepointBase {
           input = curr;
           break;
         }
-        output = util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint));
+        output =
+            util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint));
         input = curr;
       }
     } while (input < end);

From 27009136ed2cbd79cf72c6f48b1a7d2071410f5f Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Mon, 30 Aug 2021 08:01:02 -0400
Subject: [PATCH 11/23] update function description, change copy for memcpy,
 add input size check

---
 .../arrow/compute/kernels/scalar_string.cc    | 47 +++++++++++--------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 938daffa011..8ce0f2a767f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -499,16 +499,18 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase {
     uint8_t* output_start = output;
     const uint8_t* end = input + input_string_ncodeunits;
     const uint8_t* next = input;
-    if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) {
-      return kTransformError;
-    }
-    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
-            input, next, &output, UTF8UpperTransform::TransformCodepoint))) {
-      return kTransformError;
-    }
-    if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
-            next, end, &output, UTF8LowerTransform::TransformCodepoint))) {
-      return kTransformError;
+    if (input_string_ncodeunits > 0) {
+      if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) {
+        return kTransformError;
+      }
+      if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
+              input, next, &output, UTF8UpperTransform::TransformCodepoint))) {
+        return kTransformError;
+      }
+      if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
+              next, end, &output, UTF8LowerTransform::TransformCodepoint))) {
+        return kTransformError;
+      }
     }
     return output - output_start;
   }
@@ -522,7 +524,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase {
                     uint8_t* output) {
     uint8_t* output_start = output;
     const uint8_t* end = input + input_string_ncodeunits;
-    const uint8_t* curr = NULLPTR;
+    const uint8_t* curr = nullptr;
     uint32_t codepoint = 0;
 
     do {
@@ -538,7 +540,8 @@ struct Utf8TitleTransform : public StringTransformCodepointBase {
           input = curr;
           break;
         }
-        output = std::copy(input, curr, output);
+        std::memcpy(output, input, curr - input);
+        output += curr - input;
         input = curr;
       }
 
@@ -549,7 +552,8 @@ struct Utf8TitleTransform : public StringTransformCodepointBase {
           return kTransformError;
         }
         if (IsSpaceCharacterUnicode(codepoint)) {
-          output = std::copy(input, curr, output);
+          std::memcpy(output, input, curr - input);
+          output += curr - input;
           input = curr;
           break;
         }
@@ -720,7 +724,7 @@ struct AsciiTitleTransform : public StringTransformBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     const uint8_t* end = input + input_string_ncodeunits;
-    const uint8_t* curr = NULLPTR;
+    const uint8_t* curr = nullptr;
 
     do {
       // Uppercase first alpha character of current word
@@ -4200,9 +4204,10 @@ const FunctionDoc ascii_capitalize_doc(
     {"strings"});
 
 const FunctionDoc ascii_title_doc(
-    "Transform ASCII input to a string where the first alpha character in each word is "
-    "uppercase and all other characters are lowercase",
-    ("For each string in `strings`, return a title version.\n\n"
+    "Titlecase each word of ASCII input",
+    ("For each string in `strings`, return a titlecased version.\n"
+     "Each word in the output will start with an uppercase character and its\n"
+     "remaining characters will be lowercase.\n\n"
      "This function assumes the input is fully ASCII.  If it may contain\n"
      "non-ASCII characters, use \"utf8_title\" instead."),
     {"strings"});
@@ -4234,9 +4239,11 @@ const FunctionDoc utf8_capitalize_doc(
     {"strings"});
 
 const FunctionDoc utf8_title_doc(
-    "Transform input to a string where the first alpha character in each word is "
-    "uppercase and all other characters are lowercase",
-    ("For each string in `strings`, return a title version."), {"strings"});
+    "Titlecase each word of input",
+    ("For each string in `strings`, return a titlecased version.\n"
+     "Each word in the output will start with an uppercase character and its\n"
+     "remaining characters will be lowercase."),
+    {"strings"});
 
 const FunctionDoc utf8_reverse_doc(
     "Reverse input",

From 98b6b336a0555b4c81d8afbdc4e5ce6974ecc6a5 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Mon, 30 Aug 2021 21:32:02 -0400
Subject: [PATCH 12/23] update title implementation

---
 .../arrow/compute/kernels/scalar_string.cc    | 89 +++++++------------
 .../compute/kernels/scalar_string_test.cc     | 25 +++---
 2 files changed, 41 insertions(+), 73 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 8ce0f2a767f..285ee657c2d 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -522,47 +522,27 @@ using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 struct Utf8TitleTransform : public StringTransformCodepointBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
-    uint8_t* output_start = output;
+    const uint8_t* next = input;
     const uint8_t* end = input + input_string_ncodeunits;
-    const uint8_t* curr = nullptr;
-    uint32_t codepoint = 0;
-
-    do {
-      // Uppercase first alpha character of current word
-      while (input < end) {
-        curr = input;
-        if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) {
-          return kTransformError;
-        }
-        if (IsCasedCharacterUnicode(codepoint)) {
-          output =
-              util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint));
-          input = curr;
-          break;
-        }
-        std::memcpy(output, input, curr - input);
-        output += curr - input;
-        input = curr;
+    uint8_t* output_start = output;
+    uint32_t (*TransformCodepoint)(uint32_t) = UTF8UpperTransform::TransformCodepoint;
+    while ((input = next) < end) {
+      uint32_t codepoint;
+      if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) {
+        return kTransformError;
       }
-
-      // Lowercase characters until a whitespace is found
-      while (input < end) {
-        curr = input;
-        if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) {
-          return kTransformError;
-        }
-        if (IsSpaceCharacterUnicode(codepoint)) {
-          std::memcpy(output, input, curr - input);
-          output += curr - input;
-          input = curr;
-          break;
-        }
-        output =
-            util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint));
-        input = curr;
+      if (IsCasedCharacterUnicode(codepoint)) {
+        // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased
+        // codepoints
+        output = util::UTF8Encode(output, TransformCodepoint(codepoint));
+        TransformCodepoint = UTF8LowerTransform::TransformCodepoint;
+      } else {
+        // Copy non-cased character, prepare to uppercase next caseable codepoint
+        std::memcpy(output, input, next - input);
+        output += next - input;
+        TransformCodepoint = UTF8UpperTransform::TransformCodepoint;
       }
-    } while (input < end);
-
+    }
     return output - output_start;
   }
 };
@@ -724,28 +704,19 @@ struct AsciiTitleTransform : public StringTransformBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     const uint8_t* end = input + input_string_ncodeunits;
-    const uint8_t* curr = nullptr;
-
-    do {
-      // Uppercase first alpha character of current word
-      while ((curr = input++) < end) {
-        if (IsCasedCharacterAscii(*curr)) {
-          *output++ = ascii_toupper(*curr);
-          break;
-        }
-        *output++ = *curr;
-      }
-
-      // Lowercase characters until a whitespace is found
-      while ((curr = input++) < end) {
-        if (IsSpaceCharacterAscii(*curr)) {
-          *output++ = *curr;
-          break;
-        }
-        *output++ = ascii_tolower(*curr);
+    uint8_t (*ascii_toxxx)(uint8_t) = ascii_toupper;
+    while (input < end) {
+      if (IsCasedCharacterAscii(*input)) {
+        // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased
+        // codepoints
+        *output++ = ascii_toxxx(*input++);
+        ascii_toxxx = ascii_tolower;
+      } else {
+        // Copy non-cased character, prepare to uppercase next caseable codepoint
+        *output++ = *input++;
+        ascii_toxxx = ascii_toupper;
       }
-    } while (input < end);
-
+    }
     return input_string_ncodeunits;
   }
 };
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index a7ac9949be7..5b93cbae361 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <memory>
+#include <unordered_map>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -414,13 +415,11 @@ TYPED_TEST(TestStringKernels, AsciiCapitalize) {
 }
 
 TYPED_TEST(TestStringKernels, AsciiTitle) {
-  this->CheckUnary("ascii_title", "[]", this->type(), "[]");
-  this->CheckUnary("ascii_title",
-                   "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", "
-                   "\"!hELlo, wORLd!\", \"  a b cd\"]",
-                   this->type(),
-                   "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, World!\", \"$. A3\", "
-                   "\"!Hello, World!\", \"  A B Cd\"]");
+  this->CheckUnary(
+      "ascii_title",
+      R"([null, "", "b", "aAaz;ZeA&", "arRoW", "iI", "a.a.a..A", "hEllO, WoRld!", "foo   baR;heHe0zOP", "!%$^.,;"])",
+      this->type(),
+      R"([null, "", "B", "Aaaz;Zea&", "Arrow", "Ii", "A.A.A..A", "Hello, World!", "Foo   Bar;Hehe0Zop", "!%$^.,;"])");
 }
 
 TYPED_TEST(TestStringKernels, AsciiReverse) {
@@ -552,13 +551,11 @@ TYPED_TEST(TestStringKernels, Utf8Capitalize) {
 }
 
 TYPED_TEST(TestStringKernels, Utf8Title) {
-  this->CheckUnary("utf8_title", "[]", this->type(), "[]");
-  this->CheckUnary("utf8_title",
-                   "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", "
-                   "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]",
-                   this->type(),
-                   "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", "
-                   "\"Hello, World!\", \"$. A3\", \"!Ɑɽɽow\"]");
+  this->CheckUnary(
+      "utf8_title",
+      R"([null, "", "b", "aAaz;ZæÆ&", "ɑɽⱤoW", "ıI", "ⱥ.ⱥ.ⱥ..Ⱥ", "hEllO, WoRld!", "foo   baR;héHé0zOP", "!%$^.,;"])",
+      this->type(),
+      R"([null, "", "B", "Aaaz;Zææ&", "Ɑɽɽow", "Ii", "Ⱥ.Ⱥ.Ⱥ..Ⱥ", "Hello, World!", "Foo   Bar;Héhé0Zop", "!%$^.,;"])");
 }
 
 TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {

From c7f184073fc9083a36f07f14fe82b812b5a0f418 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Tue, 31 Aug 2021 00:21:00 -0400
Subject: [PATCH 13/23] change while to for-loop in kernels

---
 .../arrow/compute/kernels/scalar_string.cc    | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 285ee657c2d..1672af79c3c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -522,22 +522,21 @@ using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 struct Utf8TitleTransform : public StringTransformCodepointBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
-    const uint8_t* next = input;
-    const uint8_t* end = input + input_string_ncodeunits;
     uint8_t* output_start = output;
     uint32_t (*TransformCodepoint)(uint32_t) = UTF8UpperTransform::TransformCodepoint;
-    while ((input = next) < end) {
+    for (const uint8_t *next = input, *end = input + input_string_ncodeunits; next < end;
+         input = next) {
       uint32_t codepoint;
       if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) {
         return kTransformError;
       }
       if (IsCasedCharacterUnicode(codepoint)) {
-        // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased
+        // Lower/uppercase codepoint, prepare to lowercase next consecutive cased
         // codepoints
         output = util::UTF8Encode(output, TransformCodepoint(codepoint));
         TransformCodepoint = UTF8LowerTransform::TransformCodepoint;
       } else {
-        // Copy non-cased character, prepare to uppercase next caseable codepoint
+        // Copy non-cased codepoint, prepare to uppercase next cased codepoint
         std::memcpy(output, input, next - input);
         output += next - input;
         TransformCodepoint = UTF8UpperTransform::TransformCodepoint;
@@ -703,18 +702,17 @@ using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 struct AsciiTitleTransform : public StringTransformBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
-    const uint8_t* end = input + input_string_ncodeunits;
-    uint8_t (*ascii_toxxx)(uint8_t) = ascii_toupper;
-    while (input < end) {
-      if (IsCasedCharacterAscii(*input)) {
-        // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased
-        // codepoints
-        *output++ = ascii_toxxx(*input++);
-        ascii_toxxx = ascii_tolower;
+    uint8_t (*ascii_to_ul)(uint8_t) = ascii_toupper;
+    for (const uint8_t* ch = input; ch < (input + input_string_ncodeunits); ++ch) {
+      if (IsCasedCharacterAscii(*ch)) {
+        // Lower/uppercase character, prepare to lowercase next consecutive cased
+        // characters
+        *output++ = ascii_to_ul(*ch);
+        ascii_to_ul = ascii_tolower;
       } else {
-        // Copy non-cased character, prepare to uppercase next caseable codepoint
-        *output++ = *input++;
-        ascii_toxxx = ascii_toupper;
+        // Copy non-cased character, prepare to uppercase next cased character
+        *output++ = *ch;
+        ascii_to_ul = ascii_toupper;
       }
     }
     return input_string_ncodeunits;

From 6c9b829e12f00d95d7430a17e162882d1917add3 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Tue, 31 Aug 2021 02:45:44 -0400
Subject: [PATCH 14/23] minor change to IsTitle

---
 .../arrow/compute/kernels/scalar_string.cc    | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 1672af79c3c..3390ee9d10d 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -536,7 +536,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase {
         output = util::UTF8Encode(output, TransformCodepoint(codepoint));
         TransformCodepoint = UTF8LowerTransform::TransformCodepoint;
       } else {
-        // Copy non-cased codepoint, prepare to uppercase next cased codepoint
+        // Copy uncased codepoint, prepare to uppercase next cased codepoint
         std::memcpy(output, input, next - input);
         output += next - input;
         TransformCodepoint = UTF8UpperTransform::TransformCodepoint;
@@ -710,7 +710,7 @@ struct AsciiTitleTransform : public StringTransformBase {
         *output++ = ascii_to_ul(*ch);
         ascii_to_ul = ascii_tolower;
       } else {
-        // Copy non-cased character, prepare to uppercase next cased character
+        // Copy uncased character, prepare to uppercase next cased character
         *output++ = *ch;
         ascii_to_ul = ascii_toupper;
       }
@@ -1800,9 +1800,9 @@ struct IsTitleUnicode {
   static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
                    Status* st) {
     // rules:
-    // * 1: lower case follows cased
-    // * 2: upper case follows uncased
-    // * 3: at least 1 cased character (which logically should be upper/title)
+    //   1. lower case follows cased
+    //   2. upper case follows uncased
+    //   3. at least 1 cased character (which logically should be upper/title)
     bool rules_1_and_2;
     bool previous_cased = false;  // in LL, LU or LT
     bool rule_3 = false;
@@ -1811,14 +1811,15 @@ struct IsTitleUnicode {
                                [&previous_cased, &rule_3](uint32_t codepoint) {
                                  if (IsLowerCaseCharacterUnicode(codepoint)) {
                                    if (!previous_cased) return false;  // rule 1 broken
+                                   // next should be more lower case or uncased
                                    previous_cased = true;
-                                 } else if (IsCasedCharacterUnicode(codepoint)) {
+                                 } else if (IsUpperCaseCharacterUnicode(codepoint)) {
                                    if (previous_cased) return false;  // rule 2 broken
                                    // next should be a lower case or uncased
                                    previous_cased = true;
                                    rule_3 = true;  // rule 3 obeyed
                                  } else {
-                                   // a non-cased char, like _ or 1
+                                   // an uncased char, like _ or 1
                                    // next should be upper case or more uncased
                                    previous_cased = false;
                                  }
@@ -1837,13 +1838,13 @@ struct IsTitleAscii {
   static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
                    Status*) {
     // rules:
-    // * 1: lower case follows cased
-    // * 2: upper case follows uncased
-    // * 3: at least 1 cased character (which logically should be upper/title)
+    //   1. lower case follows cased
+    //   2. upper case follows uncased
+    //   3. at least 1 cased character (which logically should be upper/title)
+    // NOTE: Non-ASCII characters are seen as uncased.
     bool rules_1_and_2 = true;
     bool previous_cased = false;  // in LL, LU or LT
     bool rule_3 = false;
-    // we cannot rely on std::all_of because we need guaranteed order
     for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
       if (IsLowerCaseCharacterAscii(*c)) {
         if (!previous_cased) {
@@ -1851,8 +1852,9 @@ struct IsTitleAscii {
           rules_1_and_2 = false;
           break;
         }
+        // next should be more lower case or uncased
         previous_cased = true;
-      } else if (IsCasedCharacterAscii(*c)) {
+      } else if (IsUpperCaseCharacterAscii(*c)) {
         if (previous_cased) {
           // rule 2 broken
           rules_1_and_2 = false;
@@ -1862,7 +1864,7 @@ struct IsTitleAscii {
         previous_cased = true;
         rule_3 = true;  // rule 3 obeyed
       } else {
-        // a non-cased char, like _ or 1
+        // an uncased character, like _ or 1
         // next should be upper case or more uncased
         previous_cased = false;
       }
@@ -4115,7 +4117,7 @@ const auto ascii_is_title_doc = StringPredicateDoc(
     "Classify strings as ASCII titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
-     "follows a non-cased character, and each lowercase character follows\n"
+     "follows a uncased character, and each lowercase character follows\n"
      "an uppercase character.\n"));
 
 const auto utf8_is_alnum_doc =
@@ -4140,7 +4142,7 @@ const auto utf8_is_title_doc = StringPredicateDoc(
     "Classify strings as titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
-     "follows a non-cased character, and each lowercase character follows\n"
+     "follows a uncased character, and each lowercase character follows\n"
      "an uppercase character.\n"));
 
 const FunctionDoc ascii_upper_doc(

From c002e610e662c0348e7375f43b9ad62d215f96d9 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Tue, 31 Aug 2021 12:08:50 -0400
Subject: [PATCH 15/23] remove temp header

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 5b93cbae361..56b10fe2693 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -16,7 +16,6 @@
 // under the License.
 
 #include <memory>
-#include <unordered_map>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>

From a1de3a9a52338e604c25cf07d845373e7831be5a Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Tue, 31 Aug 2021 22:03:13 -0400
Subject: [PATCH 16/23] add kernel to R mapping

---
 r/R/expression.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/r/R/expression.R b/r/R/expression.R
index 57466cc3c71..aa9af9270c9 100644
--- a/r/R/expression.R
+++ b/r/R/expression.R
@@ -51,6 +51,7 @@
   # str_pad is defined in dplyr-functions.R
   # str_sub is defined in dplyr-functions.R
   "str_to_lower" = "utf8_lower",
+  "str_to_title" = "utf8_title",
   "str_to_upper" = "utf8_upper",
   # str_trim is defined in dplyr-functions.R
   "stri_reverse" = "utf8_reverse",

From 021a6f4996ed17804f4ede3e6961e42b2cf8e923 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Tue, 31 Aug 2021 22:38:09 -0400
Subject: [PATCH 17/23] use caseless term

---
 .../arrow/compute/kernels/scalar_string.cc    | 30 +++++++++----------
 .../compute/kernels/scalar_string_test.cc     |  4 +--
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 3390ee9d10d..f7424aeb87e 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -536,7 +536,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase {
         output = util::UTF8Encode(output, TransformCodepoint(codepoint));
         TransformCodepoint = UTF8LowerTransform::TransformCodepoint;
       } else {
-        // Copy uncased codepoint, prepare to uppercase next cased codepoint
+        // Copy caseless codepoint, prepare to uppercase next cased codepoint
         std::memcpy(output, input, next - input);
         output += next - input;
         TransformCodepoint = UTF8UpperTransform::TransformCodepoint;
@@ -710,7 +710,7 @@ struct AsciiTitleTransform : public StringTransformBase {
         *output++ = ascii_to_ul(*ch);
         ascii_to_ul = ascii_tolower;
       } else {
-        // Copy uncased character, prepare to uppercase next cased character
+        // Copy caseless character, prepare to uppercase next cased character
         *output++ = *ch;
         ascii_to_ul = ascii_toupper;
       }
@@ -1801,7 +1801,7 @@ struct IsTitleUnicode {
                    Status* st) {
     // rules:
     //   1. lower case follows cased
-    //   2. upper case follows uncased
+    //   2. upper case follows caseless
     //   3. at least 1 cased character (which logically should be upper/title)
     bool rules_1_and_2;
     bool previous_cased = false;  // in LL, LU or LT
@@ -1811,16 +1811,16 @@ struct IsTitleUnicode {
                                [&previous_cased, &rule_3](uint32_t codepoint) {
                                  if (IsLowerCaseCharacterUnicode(codepoint)) {
                                    if (!previous_cased) return false;  // rule 1 broken
-                                   // next should be more lower case or uncased
+                                   // next should be more lower case or caseless
                                    previous_cased = true;
                                  } else if (IsUpperCaseCharacterUnicode(codepoint)) {
                                    if (previous_cased) return false;  // rule 2 broken
-                                   // next should be a lower case or uncased
+                                   // next should be a lower case or caseless
                                    previous_cased = true;
                                    rule_3 = true;  // rule 3 obeyed
                                  } else {
-                                   // an uncased char, like _ or 1
-                                   // next should be upper case or more uncased
+                                   // a caseless char, like _ or 1
+                                   // next should be upper case or more caseless
                                    previous_cased = false;
                                  }
                                  return true;
@@ -1839,9 +1839,9 @@ struct IsTitleAscii {
                    Status*) {
     // rules:
     //   1. lower case follows cased
-    //   2. upper case follows uncased
+    //   2. upper case follows caseless
     //   3. at least 1 cased character (which logically should be upper/title)
-    // NOTE: Non-ASCII characters are seen as uncased.
+    // NOTE: Non-ASCII characters are seen as caseless.
     bool rules_1_and_2 = true;
     bool previous_cased = false;  // in LL, LU or LT
     bool rule_3 = false;
@@ -1852,7 +1852,7 @@ struct IsTitleAscii {
           rules_1_and_2 = false;
           break;
         }
-        // next should be more lower case or uncased
+        // next should be more lower case or caseless
         previous_cased = true;
       } else if (IsUpperCaseCharacterAscii(*c)) {
         if (previous_cased) {
@@ -1860,12 +1860,12 @@ struct IsTitleAscii {
           rules_1_and_2 = false;
           break;
         }
-        // next should be a lower case or uncased
+        // next should be a lower case or caseless
         previous_cased = true;
         rule_3 = true;  // rule 3 obeyed
       } else {
-        // an uncased character, like _ or 1
-        // next should be upper case or more uncased
+        // a caseless character, like _ or 1
+        // next should be upper case or more caseless
         previous_cased = false;
       }
     }
@@ -4117,7 +4117,7 @@ const auto ascii_is_title_doc = StringPredicateDoc(
     "Classify strings as ASCII titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
-     "follows a uncased character, and each lowercase character follows\n"
+     "follows a caseless character, and each lowercase character follows\n"
      "an uppercase character.\n"));
 
 const auto utf8_is_alnum_doc =
@@ -4142,7 +4142,7 @@ const auto utf8_is_title_doc = StringPredicateDoc(
     "Classify strings as titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
-     "follows a uncased character, and each lowercase character follows\n"
+     "follows a caseless character, and each lowercase character follows\n"
      "an uppercase character.\n"));
 
 const FunctionDoc ascii_upper_doc(
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 56b10fe2693..9d4a6bb0381 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -700,7 +700,7 @@ TYPED_TEST(TestStringKernels, IsPrintableAscii) {
 
 TYPED_TEST(TestStringKernels, IsSpaceAscii) {
   // \xe2\x80\x88 is punctuation space
-  // Note: for ascii version, the non-ascii chars are seen as non-cased
+  // Note: for ascii version, the non-ascii chars are seen as caseless
   this->CheckUnary("ascii_is_space", "[\" \", null, \"  \", \"\\t\\r\"]", boolean(),
                    "[true, null, true, true]");
   this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
@@ -709,7 +709,7 @@ TYPED_TEST(TestStringKernels, IsSpaceAscii) {
 
 TYPED_TEST(TestStringKernels, IsTitleAscii) {
   // ٣ is arabic 3 (decimal), Φ capital
-  // Note: for ascii version, the non-ascii chars are seen as non-cased
+  // Note: for ascii version, the non-ascii chars are seen as caseless
   this->CheckUnary("ascii_is_title",
                    "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_Ǆ\", \"Φ\", \"Ǆ\"]",
                    boolean(), "[true, null, true, true, true, false, false]");

From 983a90c7bead9b0a0ee798590fe183d998bbef14 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Tue, 31 Aug 2021 23:06:27 -0400
Subject: [PATCH 18/23] use uncased term and update some comments

---
 .../arrow/compute/kernels/scalar_string.cc    | 36 +++++++++----------
 .../compute/kernels/scalar_string_test.cc     |  4 +--
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index f7424aeb87e..2fef5fca98b 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -87,12 +87,13 @@ static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
 }
 
 static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
+  // Note: Non-ASCII characters are seen as uncased.
   return IsLowerCaseCharacterAscii(ascii_character) ||
          IsUpperCaseCharacterAscii(ascii_character);
 }
 
 static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
-  return IsCasedCharacterAscii(ascii_character);  // same
+  return IsCasedCharacterAscii(ascii_character);
 }
 
 static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) {
@@ -106,7 +107,7 @@ static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
 }
 
 static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
-  return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) ||
+  return ((ascii_character >= 9) && (ascii_character <= 13)) ||
          (ascii_character == ' ');
 }
 
@@ -536,7 +537,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase {
         output = util::UTF8Encode(output, TransformCodepoint(codepoint));
         TransformCodepoint = UTF8LowerTransform::TransformCodepoint;
       } else {
-        // Copy caseless codepoint, prepare to uppercase next cased codepoint
+        // Copy uncased codepoint, prepare to uppercase next cased codepoint
         std::memcpy(output, input, next - input);
         output += next - input;
         TransformCodepoint = UTF8UpperTransform::TransformCodepoint;
@@ -710,7 +711,7 @@ struct AsciiTitleTransform : public StringTransformBase {
         *output++ = ascii_to_ul(*ch);
         ascii_to_ul = ascii_tolower;
       } else {
-        // Copy caseless character, prepare to uppercase next cased character
+        // Copy uncased character, prepare to uppercase next cased character
         *output++ = *ch;
         ascii_to_ul = ascii_toupper;
       }
@@ -1801,7 +1802,7 @@ struct IsTitleUnicode {
                    Status* st) {
     // rules:
     //   1. lower case follows cased
-    //   2. upper case follows caseless
+    //   2. upper case follows uncased
     //   3. at least 1 cased character (which logically should be upper/title)
     bool rules_1_and_2;
     bool previous_cased = false;  // in LL, LU or LT
@@ -1811,16 +1812,16 @@ struct IsTitleUnicode {
                                [&previous_cased, &rule_3](uint32_t codepoint) {
                                  if (IsLowerCaseCharacterUnicode(codepoint)) {
                                    if (!previous_cased) return false;  // rule 1 broken
-                                   // next should be more lower case or caseless
+                                   // next should be more lower case or uncased
                                    previous_cased = true;
                                  } else if (IsUpperCaseCharacterUnicode(codepoint)) {
                                    if (previous_cased) return false;  // rule 2 broken
-                                   // next should be a lower case or caseless
+                                   // next should be a lower case or uncased
                                    previous_cased = true;
                                    rule_3 = true;  // rule 3 obeyed
                                  } else {
-                                   // a caseless char, like _ or 1
-                                   // next should be upper case or more caseless
+                                   // an uncased char, like _ or 1
+                                   // next should be upper case or more uncased
                                    previous_cased = false;
                                  }
                                  return true;
@@ -1837,11 +1838,10 @@ struct IsTitleUnicode {
 struct IsTitleAscii {
   static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
                    Status*) {
-    // rules:
+    // Rules:
     //   1. lower case follows cased
-    //   2. upper case follows caseless
+    //   2. upper case follows uncased
     //   3. at least 1 cased character (which logically should be upper/title)
-    // NOTE: Non-ASCII characters are seen as caseless.
     bool rules_1_and_2 = true;
     bool previous_cased = false;  // in LL, LU or LT
     bool rule_3 = false;
@@ -1852,7 +1852,7 @@ struct IsTitleAscii {
           rules_1_and_2 = false;
           break;
         }
-        // next should be more lower case or caseless
+        // next should be more lower case or uncased
         previous_cased = true;
       } else if (IsUpperCaseCharacterAscii(*c)) {
         if (previous_cased) {
@@ -1860,12 +1860,12 @@ struct IsTitleAscii {
           rules_1_and_2 = false;
           break;
         }
-        // next should be a lower case or caseless
+        // next should be a lower case or uncased
         previous_cased = true;
         rule_3 = true;  // rule 3 obeyed
       } else {
-        // a caseless character, like _ or 1
-        // next should be upper case or more caseless
+        // an uncased character, like _ or 1
+        // next should be upper case or more uncased
         previous_cased = false;
       }
     }
@@ -4117,7 +4117,7 @@ const auto ascii_is_title_doc = StringPredicateDoc(
     "Classify strings as ASCII titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
-     "follows a caseless character, and each lowercase character follows\n"
+     "follows an uncased character, and each lowercase character follows\n"
      "an uppercase character.\n"));
 
 const auto utf8_is_alnum_doc =
@@ -4142,7 +4142,7 @@ const auto utf8_is_title_doc = StringPredicateDoc(
     "Classify strings as titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
-     "follows a caseless character, and each lowercase character follows\n"
+     "follows an uncased character, and each lowercase character follows\n"
      "an uppercase character.\n"));
 
 const FunctionDoc ascii_upper_doc(
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 9d4a6bb0381..3ac19ead5e8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -700,7 +700,6 @@ TYPED_TEST(TestStringKernels, IsPrintableAscii) {
 
 TYPED_TEST(TestStringKernels, IsSpaceAscii) {
   // \xe2\x80\x88 is punctuation space
-  // Note: for ascii version, the non-ascii chars are seen as caseless
   this->CheckUnary("ascii_is_space", "[\" \", null, \"  \", \"\\t\\r\"]", boolean(),
                    "[true, null, true, true]");
   this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
@@ -708,8 +707,7 @@ TYPED_TEST(TestStringKernels, IsSpaceAscii) {
 }
 
 TYPED_TEST(TestStringKernels, IsTitleAscii) {
-  // ٣ is arabic 3 (decimal), Φ capital
-  // Note: for ascii version, the non-ascii chars are seen as caseless
+  // ٣ is Arabic 3 (decimal), Φ capital
   this->CheckUnary("ascii_is_title",
                    "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_Ǆ\", \"Φ\", \"Ǆ\"]",
                    boolean(), "[true, null, true, true, true, false, false]");

From 2c768289acd8054ba47e614e88332c52c1322b87 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Tue, 31 Aug 2021 23:07:08 -0400
Subject: [PATCH 19/23] compact statement in utf8_length

---
 cpp/src/arrow/util/utf8.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 0c9a368d3dd..c5a100ff267 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -560,8 +560,7 @@ static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* re
 static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) {
   int64_t length = 0;
   while (first != last) {
-    length += ((*first & 0xc0) != 0x80);
-    ++first;
+    length += ((*first++ & 0xc0) != 0x80);
   }
   return length;
 }

From 147f7b026ced9e6550c5f67d74f9d8ef699a3a1c Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Wed, 1 Sep 2021 13:08:15 -0400
Subject: [PATCH 20/23] rename to FunctionalCaseMappingTransform

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 2fef5fca98b..32d7cff4ca8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -423,7 +423,7 @@ struct StringTransformExecWithState
 
 #ifdef ARROW_WITH_UTF8PROC
 
-struct StringTransformCodepointBase : public StringTransformBase {
+struct FunctionalCaseMappingTransform : public StringTransformBase {
   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
     EnsureLookupTablesFilled();
     return Status::OK();
@@ -441,7 +441,7 @@ struct StringTransformCodepointBase : public StringTransformBase {
 };
 
 template <typename CodepointTransform>
-struct StringTransformCodepoint : public StringTransformCodepointBase {
+struct StringTransformCodepoint : public FunctionalCaseMappingTransform {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;
@@ -454,7 +454,7 @@ struct StringTransformCodepoint : public StringTransformCodepointBase {
   }
 };
 
-struct UTF8UpperTransform : public StringTransformCodepointBase {
+struct UTF8UpperTransform : public FunctionalCaseMappingTransform {
   static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
                                             : utf8proc_toupper(codepoint);
@@ -464,7 +464,7 @@ struct UTF8UpperTransform : public StringTransformCodepointBase {
 template <typename Type>
 using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
 
-struct UTF8LowerTransform : public StringTransformCodepointBase {
+struct UTF8LowerTransform : public FunctionalCaseMappingTransform {
   static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
                                             : utf8proc_tolower(codepoint);
@@ -474,7 +474,7 @@ struct UTF8LowerTransform : public StringTransformCodepointBase {
 template <typename Type>
 using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
 
-struct UTF8SwapCaseTransform : public StringTransformCodepointBase {
+struct UTF8SwapCaseTransform : public FunctionalCaseMappingTransform {
   static uint32_t TransformCodepoint(uint32_t codepoint) {
     if (codepoint <= kMaxCodepointLookup) {
       return lut_swapcase_codepoint[codepoint];
@@ -494,7 +494,7 @@ template <typename Type>
 using UTF8SwapCase =
     StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;
 
-struct Utf8CapitalizeTransform : public StringTransformCodepointBase {
+struct Utf8CapitalizeTransform : public FunctionalCaseMappingTransform {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;
@@ -520,7 +520,7 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase {
 template <typename Type>
 using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 
-struct Utf8TitleTransform : public StringTransformCodepointBase {
+struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;

From dbe7229e0840a640c0eef3f34be1adabbc0d42e8 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Wed, 1 Sep 2021 13:37:16 -0400
Subject: [PATCH 21/23] use boolean flag and change for-while loop

---
 .../arrow/compute/kernels/scalar_string.cc    | 47 +++++++++++--------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 32d7cff4ca8..28d0de77573 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -524,23 +524,29 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;
-    uint32_t (*TransformCodepoint)(uint32_t) = UTF8UpperTransform::TransformCodepoint;
-    for (const uint8_t *next = input, *end = input + input_string_ncodeunits; next < end;
-         input = next) {
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* next = input;
+    bool is_next_upper = true;
+    while ((input = next) < end) {
       uint32_t codepoint;
       if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) {
         return kTransformError;
       }
       if (IsCasedCharacterUnicode(codepoint)) {
-        // Lower/uppercase codepoint, prepare to lowercase next consecutive cased
-        // codepoints
-        output = util::UTF8Encode(output, TransformCodepoint(codepoint));
-        TransformCodepoint = UTF8LowerTransform::TransformCodepoint;
+        // Lower/uppercase current codepoint and
+        // prepare to lowercase next consecutive cased codepoints
+        output = is_next_upper
+                     ? util::UTF8Encode(output,
+                                        UTF8UpperTransform::TransformCodepoint(codepoint))
+                     : util::UTF8Encode(
+                           output, UTF8LowerTransform::TransformCodepoint(codepoint));
+        is_next_upper = false;
       } else {
-        // Copy uncased codepoint, prepare to uppercase next cased codepoint
+        // Copy current uncased codepoint and
+        // prepare to uppercase next cased codepoint
         std::memcpy(output, input, next - input);
         output += next - input;
-        TransformCodepoint = UTF8UpperTransform::TransformCodepoint;
+        is_next_upper = true;
       }
     }
     return output - output_start;
@@ -703,17 +709,20 @@ using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 struct AsciiTitleTransform : public StringTransformBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
-    uint8_t (*ascii_to_ul)(uint8_t) = ascii_toupper;
-    for (const uint8_t* ch = input; ch < (input + input_string_ncodeunits); ++ch) {
-      if (IsCasedCharacterAscii(*ch)) {
-        // Lower/uppercase character, prepare to lowercase next consecutive cased
-        // characters
-        *output++ = ascii_to_ul(*ch);
-        ascii_to_ul = ascii_tolower;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* next = input;
+    bool is_next_upper = true;
+    while ((input = next++) < end) {
+      if (IsCasedCharacterAscii(*input)) {
+        // Lower/uppercase current character and
+        // prepare to lowercase next consecutive cased characters
+        *output++ = is_next_upper ? ascii_toupper(*input) : ascii_tolower(*input);
+        is_next_upper = false;
       } else {
-        // Copy uncased character, prepare to uppercase next cased character
-        *output++ = *ch;
-        ascii_to_ul = ascii_toupper;
+        // Copy current uncased character and
+        // prepare to uppercase next cased character
+        *output++ = *input;
+        is_next_upper = true;
       }
     }
     return input_string_ncodeunits;

From 5955c4e8bc5581cbbd431fefe68d990caed2ac5e Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Wed, 1 Sep 2021 13:45:17 -0400
Subject: [PATCH 22/23] revert case call in IsTitle kernels

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 28d0de77573..df0827f67fd 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -1823,7 +1823,7 @@ struct IsTitleUnicode {
                                    if (!previous_cased) return false;  // rule 1 broken
                                    // next should be more lower case or uncased
                                    previous_cased = true;
-                                 } else if (IsUpperCaseCharacterUnicode(codepoint)) {
+                                 } else if (IsCasedCharacterUnicode(codepoint)) {
                                    if (previous_cased) return false;  // rule 2 broken
                                    // next should be a lower case or uncased
                                    previous_cased = true;
@@ -1863,7 +1863,7 @@ struct IsTitleAscii {
         }
         // next should be more lower case or uncased
         previous_cased = true;
-      } else if (IsUpperCaseCharacterAscii(*c)) {
+      } else if (IsCasedCharacterAscii(*c)) {
         if (previous_cased) {
           // rule 2 broken
           rules_1_and_2 = false;

From 23c93385849f9c460c62444406816409958960a6 Mon Sep 17 00:00:00 2001
From: Eduardo Ponce <edponce00@gmail.com>
Date: Wed, 1 Sep 2021 15:12:28 -0400
Subject: [PATCH 23/23] fix lint error

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index df0827f67fd..aa953119d47 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -107,8 +107,7 @@ static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
 }
 
 static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
-  return ((ascii_character >= 9) && (ascii_character <= 13)) ||
-         (ascii_character == ' ');
+  return ((ascii_character >= 9) && (ascii_character <= 13)) || (ascii_character == ' ');
 }
 
 static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {