From 50eb552f99c02ce1c8656adbfdb748211b78ae61 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 12 May 2021 16:59:08 -0400
Subject: [PATCH 01/15] adding basic impl for string reverse

---
 .../arrow/compute/kernels/scalar_string.cc    | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 65196b2a491..ae022ef0e6c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -266,6 +266,22 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct StringReverse : StringTransform<Type, StringReverse<Type>> {
+  using Base = StringTransform<Type, StringReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    // todo: this is not needed for reverse. may be change StringTransform struct to
+    // handle 2 cases. 1. for same size string transform and 2. variable size string
+    // transform
+    *output_written = input_string_ncodeunits;
+    return true;
+  }
+};
+
 using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
 
 // Transform a buffer of offsets to one which begins with 0 and has same
@@ -2317,6 +2333,10 @@ const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
+const FunctionDoc string_reverse_doc(
+    "Reverse input ", ("For each string in `strings`, return a reversed version."),
+    {"strings"});
+
 }  // namespace
 
 void RegisterScalarStringAscii(FunctionRegistry* registry) {
@@ -2332,6 +2352,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
                                                    &ascii_ltrim_whitespace_doc);
   MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
                                                    &ascii_rtrim_whitespace_doc);
+  MakeUnaryStringBatchKernel<StringReverse>("string_reverse", registry,
+                                            &string_reverse_doc);
   MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry,
                                                  &ascii_lower_doc);
   MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,

From 991152938bf01836fa3de0cdc84418db0a81ca19 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 12 May 2021 17:58:49 -0400
Subject: [PATCH 02/15] adding basic test case

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index a59634b7be8..10d77c4a290 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -91,6 +91,14 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, StringReverse) {
+  this->CheckUnary("string_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("string_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"dcba\", null, \"\", \"bbb\"]");
+//  this->CheckUnary("string_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),
+//                   "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]");
+}
+
 TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {
   // 0x7fff * 0xffff is the max a 32 bit string array can hold
   // since the utf8_upper kernel can grow it by 3/2, the max we should accept is is

From f8fe15d5278d6781f374647f96243fe1d9527581 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 13 May 2021 14:33:39 -0400
Subject: [PATCH 03/15] separating ascii and utf8 reverse kernels

---
 .../arrow/compute/kernels/scalar_string.cc    | 51 ++++++++++++++++---
 .../compute/kernels/scalar_string_test.cc     | 20 ++++++--
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index ae022ef0e6c..2b5795a9cfe 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -267,17 +267,48 @@ void EnsureLookupTablesFilled() {}
 #endif  // ARROW_WITH_UTF8PROC
 
 template <typename Type>
-struct StringReverse : StringTransform<Type, StringReverse<Type>> {
-  using Base = StringTransform<Type, StringReverse<Type>>;
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
   using offset_type = typename Base::offset_type;
 
   bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                  uint8_t* output, offset_type* output_written) {
-    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    uint8_t failure = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      failure |= input[i] & 0x80;  // if a utf8 char is found, report to failure
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
     // todo: this is not needed for reverse. may be change StringTransform struct to
     // handle 2 cases. 1. for same size string transform and 2. variable size string
     // transform
     *output_written = input_string_ncodeunits;
+    return failure == 0;
+  }
+};
+
+// todo move to a proper place
+/**
+ * UTF8 codeunit size can be determined by looking at the first 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
     return true;
   }
 };
@@ -2333,9 +2364,13 @@ const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
-const FunctionDoc string_reverse_doc(
-    "Reverse input ", ("For each string in `strings`, return a reversed version."),
-    {"strings"});
+const FunctionDoc ascii_reverse_doc(
+    "Reverse ascii input",
+    ("For each ascii string in `strings`, return a reversed version."), {"strings"});
+
+const FunctionDoc utf8_reverse_doc(
+    "Reverse utf8 input",
+    ("For each utf8 string in `strings`, return a reversed version."), {"strings"});
 
 }  // namespace
 
@@ -2352,8 +2387,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
                                                    &ascii_ltrim_whitespace_doc);
   MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
                                                    &ascii_rtrim_whitespace_doc);
-  MakeUnaryStringBatchKernel<StringReverse>("string_reverse", registry,
-                                            &string_reverse_doc);
+  MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
+  MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
   MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry,
                                                  &ascii_lower_doc);
   MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 10d77c4a290..5a55a953071 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -91,12 +91,22 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
-TYPED_TEST(TestStringKernels, StringReverse) {
-  this->CheckUnary("string_reverse", "[]", this->type(), "[]");
-  this->CheckUnary("string_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(),
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(),
                    "[\"dcba\", null, \"\", \"bbb\"]");
-//  this->CheckUnary("string_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),
-//                   "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]");
+
+  Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
+  FunctionOptions options{};
+  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}, &options));
+}
+
+TYPED_TEST(TestStringKernels, Utf8Reverse) {
+  this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"dcba\", null, \"\", \"bbb\"]");
+  this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]");
 }
 
 TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {

From c84ac035d9a7dd6ec3ed1423f6369ec88af56769 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 13 May 2021 15:33:17 -0400
Subject: [PATCH 04/15] minor changes

---
 cpp/src/arrow/compute/kernels/scalar_string.cc      | 4 ++--
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 2b5795a9cfe..750255e8adc 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -289,7 +289,7 @@ struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
 
 // todo move to a proper place
 /**
- * UTF8 codeunit size can be determined by looking at the first 4 bits of BYTE1
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
  */
 const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
                                                  0, 0, 0, 0, 2, 2, 3, 4};
@@ -303,7 +303,7 @@ struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
                  uint8_t* output, offset_type* output_written) {
     offset_type i = 0;
     while (i < input_string_ncodeunits) {
-      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; // right shift leading 4 bits
       std::copy(input + i, input + (i + offset),
                 output + (input_string_ncodeunits - i - offset));
       i += offset;
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 5a55a953071..7df755b3872 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -97,8 +97,7 @@ TYPED_TEST(TestStringKernels, AsciiReverse) {
                    "[\"dcba\", null, \"\", \"bbb\"]");
 
   Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
-  FunctionOptions options{};
-  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}, &options));
+  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));
 }
 
 TYPED_TEST(TestStringKernels, Utf8Reverse) {

From 124730e77044f3297b9726cdcab6835c709e8c62 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 13 May 2021 17:32:55 -0400
Subject: [PATCH 05/15] making minor changes

---
 cpp/src/arrow/compute/kernels/scalar_string.cc  | 17 ++++++++---------
 .../arrow/compute/kernels/scalar_string_test.cc | 10 ++++++----
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 750255e8adc..b80f50dd5e0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -273,22 +273,21 @@ struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
 
   bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                  uint8_t* output, offset_type* output_written) {
-    uint8_t failure = 0;
+    uint8_t utf8_char_found = 0;
     for (offset_type i = 0; i < input_string_ncodeunits; i++) {
-      failure |= input[i] & 0x80;  // if a utf8 char is found, report to failure
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
       output[input_string_ncodeunits - i - 1] = input[i];
     }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
     //    std::reverse_copy(input, input + input_string_ncodeunits, output);
-    // todo: this is not needed for reverse. may be change StringTransform struct to
-    // handle 2 cases. 1. for same size string transform and 2. variable size string
-    // transform
     *output_written = input_string_ncodeunits;
-    return failure == 0;
+    return utf8_char_found == 0;
   }
 };
 
-// todo move to a proper place
-/**
+/*
  * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
  */
 const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
@@ -303,7 +302,7 @@ struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
                  uint8_t* output, offset_type* output_written) {
     offset_type i = 0;
     while (i < input_string_ncodeunits) {
-      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; // right shift leading 4 bits
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
       std::copy(input + i, input + (i + offset),
                 output + (input_string_ncodeunits - i - offset));
       i += offset;
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 7df755b3872..a10549a49a7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -93,8 +93,8 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
 
 TYPED_TEST(TestStringKernels, AsciiReverse) {
   this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
-  this->CheckUnary("ascii_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(),
-                   "[\"dcba\", null, \"\", \"bbb\"]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
 
   Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
   ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));
@@ -102,10 +102,12 @@ TYPED_TEST(TestStringKernels, AsciiReverse) {
 
 TYPED_TEST(TestStringKernels, Utf8Reverse) {
   this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
-  this->CheckUnary("utf8_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(),
-                   "[\"dcba\", null, \"\", \"bbb\"]");
+  this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
   this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),
                    "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]");
+  this->CheckUnary("utf8_reverse", "[\"ɑɽⱤæÆ&\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"&ÆæⱤɽɑ\", null, \"\", \"bbb\"]");
 }
 
 TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {

From 5ca75944eb821e0a6208e5fca6b57675c1a851f5 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 14 May 2021 11:13:01 -0400
Subject: [PATCH 06/15] editing tests

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index a10549a49a7..b803be577cc 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -96,7 +96,7 @@ TYPED_TEST(TestStringKernels, AsciiReverse) {
   this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
                    R"(["dcba", null, "", "bbb"])");
 
-  Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
+  Datum input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
   ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));
 }
 
@@ -104,10 +104,10 @@ TYPED_TEST(TestStringKernels, Utf8Reverse) {
   this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
   this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
                    R"(["dcba", null, "", "bbb"])");
-  this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),
-                   "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]");
-  this->CheckUnary("utf8_reverse", "[\"ɑɽⱤæÆ&\", null, \"\", \"bbb\"]", this->type(),
-                   "[\"&ÆæⱤɽɑ\", null, \"\", \"bbb\"]");
+  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb"])", this->type(),
+                   R"(["&ÆæZzaAa", null, "", "bbb"])";
+  this->CheckUnary("utf8_reverse", R"(["ɑɽⱤæÆ&", null, "", "bbb"])", this->type(),
+                   R"(["&ÆæⱤɽɑ", null, "", "bbb"])");
 }
 
 TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {

From b8a4bb3adcb93676e7a74b97fc293bfffd4ffc10 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 14 May 2021 18:12:50 -0400
Subject: [PATCH 07/15] adding review changes

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 18 ++++++++++--------
 .../compute/kernels/scalar_string_test.cc      | 17 +++++++++++------
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index b80f50dd5e0..f7e3e604b08 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -279,19 +279,21 @@ struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
       utf8_char_found |= input[i] & 0x80;
       output[input_string_ncodeunits - i - 1] = input[i];
     }
-    //    todo: finalize if L278 check is required or not. If not required,
-    //    simply use the following
-    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
     *output_written = input_string_ncodeunits;
     return utf8_char_found == 0;
   }
 };
 
 /*
- * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
  */
 const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
-                                                 0, 0, 0, 0, 2, 2, 3, 4};
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
 
 template <typename Type>
 struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
@@ -302,9 +304,9 @@ struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
                  uint8_t* output, offset_type* output_written) {
     offset_type i = 0;
     while (i < input_string_ncodeunits) {
-      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
-      std::copy(input + i, input + (i + offset),
-                output + (input_string_ncodeunits - i - offset));
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
       i += offset;
     }
     *output_written = input_string_ncodeunits;
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index b803be577cc..f11314193e8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -96,18 +96,23 @@ TYPED_TEST(TestStringKernels, AsciiReverse) {
   this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
                    R"(["dcba", null, "", "bbb"])");
 
-  Datum input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
-  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
+                                  CallFunction("ascii_reverse", {invalid_input}));
 }
 
 TYPED_TEST(TestStringKernels, Utf8Reverse) {
   this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
   this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
                    R"(["dcba", null, "", "bbb"])");
-  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb"])", this->type(),
-                   R"(["&ÆæZzaAa", null, "", "bbb"])";
-  this->CheckUnary("utf8_reverse", R"(["ɑɽⱤæÆ&", null, "", "bbb"])", this->type(),
-                   R"(["&ÆæⱤɽɑ", null, "", "bbb"])");
+  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])", this->type(),
+                   R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])");
+
+  // inputs with malformed utf8 chars would produce garbage output, but the end result
+  // would produce arrays with same lengths. Hence checking offset buffer equality
+  auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]");
+  const Result<Datum>& res = CallFunction("utf8_reverse", {malformed_input});
+  ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1]));
 }
 
 TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {

From 55e39acdbb8471d0332a1d3cc0448758c6f63e02 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 17 May 2021 14:02:15 -0400
Subject: [PATCH 08/15] adding review changes

---
 .../arrow/compute/kernels/scalar_string.cc    | 26 ++++++++++++++-----
 .../compute/kernels/scalar_string_test.cc     |  3 ++-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index f7e3e604b08..18be5b4f0af 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -142,6 +142,11 @@ struct StringTransform {
   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     return Derived().Execute(ctx, batch, out);
   }
+
+  static Status InvalidStatus() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+
   Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     if (batch[0].kind() == Datum::ARRAY) {
       const ArrayData& input = *batch[0].array();
@@ -173,7 +178,7 @@ struct StringTransform {
         if (ARROW_PREDICT_FALSE(!static_cast<Derived&>(*this).Transform(
                 input_string, input_string_ncodeunits, output_str + output_ncodeunits,
                 &encoded_nbytes))) {
-          return Status::Invalid("Invalid UTF8 sequence in input");
+          return Derived::InvalidStatus();
         }
         output_ncodeunits += encoded_nbytes;
         output_string_offsets[i + 1] = output_ncodeunits;
@@ -199,7 +204,7 @@ struct StringTransform {
         if (ARROW_PREDICT_FALSE(!static_cast<Derived&>(*this).Transform(
                 input.value->data(), data_nbytes, value_buffer->mutable_data(),
                 &encoded_nbytes))) {
-          return Status::Invalid("Invalid UTF8 sequence in input");
+          return Derived::InvalidStatus();
         }
         RETURN_NOT_OK(value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true));
       }
@@ -282,6 +287,8 @@ struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
     *output_written = input_string_ncodeunits;
     return utf8_char_found == 0;
   }
+
+  static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); }
 };
 
 /*
@@ -2353,7 +2360,7 @@ const FunctionDoc ascii_upper_doc(
 const FunctionDoc ascii_lower_doc(
     "Transform ASCII input to lowercase",
     ("For each string in `strings`, return a lowercase version.\n\n"
-     "This function assumes the input is fully ASCII.  It it may contain\n"
+     "This function assumes the input is fully ASCII.  If it may contain\n"
      "non-ASCII characters, use \"utf8_lower\" instead."),
     {"strings"});
 
@@ -2366,12 +2373,19 @@ const FunctionDoc utf8_lower_doc(
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
 const FunctionDoc ascii_reverse_doc(
-    "Reverse ascii input",
-    ("For each ascii string in `strings`, return a reversed version."), {"strings"});
+    "Reverse ASCII input",
+    ("For each ASCII string in `strings`, return a reversed version.\n\n"
+     "This function assumes the input is fully ASCII.  If it may contain\n"
+     "non-ASCII characters, use \"utf8_reverse\" instead."),
+    {"strings"});
 
 const FunctionDoc utf8_reverse_doc(
     "Reverse utf8 input",
-    ("For each utf8 string in `strings`, return a reversed version."), {"strings"});
+    ("For each utf8 string in `strings`, return a reversed version.\n\n"
+     "This function works on UTF8 codepoint level. As a consequence, if the input \n"
+     "contains combining UTF8 character sequences/ 'graphemes', they would also \n"
+     "be reversed."),
+    {"strings"});
 
 }  // namespace
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index f11314193e8..31352a9c6d5 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -97,7 +97,8 @@ TYPED_TEST(TestStringKernels, AsciiReverse) {
                    R"(["dcba", null, "", "bbb"])");
 
   Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
-  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
+                                  testing::HasSubstr("Non-ascii sequence in input"),
                                   CallFunction("ascii_reverse", {invalid_input}));
 }
 

From 24a60883964bcc4888aa0ebdfb27ad022503c6e2 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 17 May 2021 14:18:27 -0400
Subject: [PATCH 09/15] adding rst docs

---
 docs/source/cpp/compute.rst | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 592dc4ec1b0..889a0b96559 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -435,47 +435,59 @@ String transforms
 +==========================+============+=========================+=====================+=========+=======================================+
 | ascii_lower              | Unary      | String-like             | String-like         | \(1)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| ascii_reverse            | Unary      | String-like             | String-like         | \(2)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
 | ascii_upper              | Unary      | String-like             | String-like         | \(1)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| binary_length            | Unary      | Binary- or String-like  | Int32 or Int64      | \(2)    |                                       |
+| binary_length            | Unary      | Binary- or String-like  | Int32 or Int64      | \(3)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| replace_substring        | Unary      | String-like             | String-like         | \(4)    | :struct:`ReplaceSubstringOptions`     |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| replace_substring        | Unary      | String-like             | String-like         | \(3)    | :struct:`ReplaceSubstringOptions`     |
+| replace_substring_regex  | Unary      | String-like             | String-like         | \(5)    | :struct:`ReplaceSubstringOptions`     |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| replace_substring_regex  | Unary      | String-like             | String-like         | \(4)    | :struct:`ReplaceSubstringOptions`     |
+| utf8_length              | Unary      | String-like             | Int32 or Int64      | \(6)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_length              | Unary      | String-like             | Int32 or Int64      | \(5)    |                                       |
+| utf8_lower               | Unary      | String-like             | String-like         | \(7)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_lower               | Unary      | String-like             | String-like         | \(6)    |                                       |
+| utf8_reverse             | Unary      | String-like             | String-like         | \(8)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_upper               | Unary      | String-like             | String-like         | \(6)    |                                       |
+| utf8_upper               | Unary      | String-like             | String-like         | \(7)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
 
 
 * \(1) Each ASCII character in the input is converted to lowercase or
   uppercase.  Non-ASCII characters are left untouched.
 
-* \(2) Output is the physical length in bytes of each input element.  Output
+* \(2) ASCII input is reversed to the output. If non-ASCII characters 
+  are present, ``Invalid`` :class:`Status` will be returned.
+
+* \(3) Output is the physical length in bytes of each input element.  Output
   type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.
 
-* \(3) Replace non-overlapping substrings that match to
+* \(4) Replace non-overlapping substrings that match to
   :member:`ReplaceSubstringOptions::pattern` by
   :member:`ReplaceSubstringOptions::replacement`. If
   :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
   maximum number of replacements made, counting from the left.
 
-* \(4) Replace non-overlapping substrings that match to the regular expression
+* \(5) Replace non-overlapping substrings that match to the regular expression
   :member:`ReplaceSubstringOptions::pattern` by
   :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If
   :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
   maximum number of replacements made, counting from the left. Note that if the
   pattern contains groups, backreferencing can be used.
 
-* \(5) Output is the number of characters (not bytes) of each input element.
+* \(6) Output is the number of characters (not bytes) of each input element.
   Output type is Int32 for String, Int64 for LargeString. 
 
-* \(6) Each UTF8-encoded character in the input is converted to lowercase or
+* \(7) Each UTF8-encoded character in the input is converted to lowercase or
   uppercase.
 
+* \(8) UTF8-encoded input is reversed to the output. If malformed-UTF8 characters 
+  are present, the Output would be undefined (but the size of Output buffers would
+  be preserved). If combined UTF8 characters are available (ex: graphems, glyphs), 
+  they would also be reversed. 
+
 
 String trimming
 ~~~~~~~~~~~~~~~

From 2fd1cc38f636a2e46a7674df88ab0a62e563d414 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 17 May 2021 15:10:36 -0400
Subject: [PATCH 10/15] adding review comments

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 8 ++++----
 docs/source/cpp/compute.rst                    | 7 +++----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 18be5b4f0af..b4de24bfbda 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -288,7 +288,7 @@ struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
     return utf8_char_found == 0;
   }
 
-  static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); }
+  static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); }
 };
 
 /*
@@ -2382,9 +2382,9 @@ const FunctionDoc ascii_reverse_doc(
 const FunctionDoc utf8_reverse_doc(
     "Reverse utf8 input",
     ("For each utf8 string in `strings`, return a reversed version.\n\n"
-     "This function works on UTF8 codepoint level. As a consequence, if the input \n"
-     "contains combining UTF8 character sequences/ 'graphemes', they would also \n"
-     "be reversed."),
+     "This function operates on codepoints/UTF-8 code units, not grapheme\n"
+     "clusters. Hence, it will not correctly reverse grapheme clusters\n"
+     "composed of multiple codepoints."),
     {"strings"});
 
 }  // namespace
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 889a0b96559..1cee7bcf266 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -483,10 +483,9 @@ String transforms
 * \(7) Each UTF8-encoded character in the input is converted to lowercase or
   uppercase.
 
-* \(8) UTF8-encoded input is reversed to the output. If malformed-UTF8 characters 
-  are present, the Output would be undefined (but the size of Output buffers would
-  be preserved). If combined UTF8 characters are available (ex: graphems, glyphs), 
-  they would also be reversed. 
+* \(8) Each UTF8-encoded code unit is written in reverse order to the output.
+  If the input is not valid UTF8, then the output is undefined (but the size of output
+  buffers will be preserved).
 
 
 String trimming

From 1af4985fe797fe983614152ab97f95299c2a111d Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 17 May 2021 16:07:41 -0400
Subject: [PATCH 11/15] adding python docs rst

---
 docs/source/python/api/compute.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index da16ccdfa29..d74732933e7 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -144,8 +144,10 @@ String Transforms
    :toctree: ../generated/
 
    ascii_lower
+   ascii_reverse
    ascii_upper
    utf8_lower
+   utf8_reverse
    utf8_upper
 
 Containment tests

From 0bbefa8b8f7c942bda5a51c44cba036a1e11701d Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Tue, 18 May 2021 10:57:53 -0400
Subject: [PATCH 12/15] Update
 cpp/src/arrow/compute/kernels/scalar_string_test.cc

Co-authored-by: David Li <li.davidm96@gmail.com>
---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 31352a9c6d5..42ad1298ddb 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -98,7 +98,7 @@ TYPED_TEST(TestStringKernels, AsciiReverse) {
 
   Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
-                                  testing::HasSubstr("Non-ascii sequence in input"),
+                                  testing::HasSubstr("Non-ASCII sequence in input"),
                                   CallFunction("ascii_reverse", {invalid_input}));
 }
 

From 2c1861910518a749a69ab31ebf4b95f3bdd2fce7 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Tue, 18 May 2021 23:57:57 -0400
Subject: [PATCH 13/15] fixing lint problems

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 42ad1298ddb..2deac9bc5c4 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -106,8 +106,8 @@ TYPED_TEST(TestStringKernels, Utf8Reverse) {
   this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
   this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
                    R"(["dcba", null, "", "bbb"])");
-  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])", this->type(),
-                   R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])");
+  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])",
+                   this->type(), R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])");
 
   // inputs with malformed utf8 chars would produce garbage output, but the end result
   // would produce arrays with same lengths. Hence checking offset buffer equality

From 6b27e5422027d4acdb7c0a46405ae1d08ab70128 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 20 May 2021 11:01:18 -0400
Subject: [PATCH 14/15] moving ValidUtf8CodepointByteSize to util/utf8.h

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 12 +-----------
 cpp/src/arrow/util/utf8.cc                     |  2 ++
 cpp/src/arrow/util/utf8.h                      | 14 ++++++++++++++
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index b4de24bfbda..dbbfcdbca04 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -291,16 +291,6 @@ struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
   static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); }
 };
 
-/*
- * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
- * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
- * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
- * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
- * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
- * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
- */
-const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
-                                                 1, 1, 1, 1, 2, 2, 3, 4};
 
 template <typename Type>
 struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
@@ -311,7 +301,7 @@ struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
                  uint8_t* output, offset_type* output_written) {
     offset_type i = 0;
     while (i < input_string_ncodeunits) {
-      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      uint8_t offset = util::ValidUtf8CodepointByteSize(input + i);
       offset_type stride = std::min(i + offset, input_string_ncodeunits);
       std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
       i += offset;
diff --git a/cpp/src/arrow/util/utf8.cc b/cpp/src/arrow/util/utf8.cc
index 478d8ade95f..11394d2e64c 100644
--- a/cpp/src/arrow/util/utf8.cc
+++ b/cpp/src/arrow/util/utf8.cc
@@ -64,6 +64,8 @@ const uint8_t utf8_small_table[] = { // NOLINT
 
 uint16_t utf8_large_table[9 * 256] = {0xffff};
 
+const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+
 static void InitializeLargeTable() {
   for (uint32_t state = 0; state < 9; ++state) {
     for (uint32_t byte = 0; byte < 256; ++byte) {
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 2d94ca4986e..310d6913403 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -65,6 +65,8 @@ static constexpr uint8_t kUTF8DecodeReject = 12;
 // In this table states are multiples of 256.
 ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256];
 
+ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16];
+
 // Success / reject states when looked up in the large table
 static constexpr uint16_t kUTF8ValidateAccept = 0;
 static constexpr uint16_t kUTF8ValidateReject = 256;
@@ -293,6 +295,18 @@ Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size);
 
 static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000;
 
+// size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+// utf8_byte_size_table[0..7] --> pure ascii chars --> 1B length
+// utf8_byte_size_table[8..11] --> internal bytes --> 1B length
+// utf8_byte_size_table[12,13] --> 2B long UTF8 chars
+// utf8_byte_size_table[14] --> 3B long UTF8 chars
+// utf8_byte_size_table[15] --> 4B long UTF8 chars
+// NOTE: Results for invalid/ malformed utf-8 sequences are undefined.
+// ex: \xFF... returns 4B
+static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) {
+  return internal::utf8_byte_size_table[*codeunit >> 4];
+}
+
 static inline bool Utf8IsContinuation(const uint8_t codeunit) {
   return (codeunit & 0xC0) == 0x80;  // upper two bits should be 10
 }

From 7311300db96aa385f3b5ae5766f57549fccf23a6 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 21 May 2021 00:21:32 -0400
Subject: [PATCH 15/15] fixing lint issues

---
 cpp/src/arrow/compute/kernels/scalar_string.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index dbbfcdbca04..00b1fcca9e2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -291,7 +291,6 @@ struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
   static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); }
 };
 
-
 template <typename Type>
 struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
   using Base = StringTransform<Type, Utf8Reverse<Type>>;