From 332747daa3a131f22a75e7639349ea2765401f41 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 15 Jun 2020 10:58:20 +0200 Subject: [PATCH 1/5] ARROW-9131: [C++] Faster ascii_lower and ascii_upper. Following up on #7418 I tried and benchmarked a different way for * ascii_lower * ascii_upper Before (lower is similar): ``` -------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------- AsciiUpper_median 4922843 ns 4918961 ns 10 bytes_per_second=3.1457G/s items_per_second=213.17M/s ``` After: ``` -------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------- AsciiUpper_median 1391272 ns 1390014 ns 10 bytes_per_second=11.132G/s items_per_second=754.363M/s ``` This is a 3.7x speedup (on a AMD machine). Using http://quick-bench.com/JaDErmVCY23Z1tu6YZns_KBt0qU I found 4.6x speedup for clang 9, 6.4x for GCC 9.2. Also, the test is expanded a bit to include a non-ascii codepoint, to make explicit it is fine to upper or lower case a utf8 string. The non-overlap encoding of utf8 make this ok (see section 2.5 of Unicode Standard Core Specification v13.0). --- .../arrow/compute/kernels/scalar_string.cc | 69 +++---------------- .../compute/kernels/scalar_string_test.cc | 8 +-- 2 files changed, 13 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index ea31d9391aa..a04902ce390 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -64,38 +64,14 @@ void StringDataTransform(KernelContext* ctx, const ExecBatch& batch, } } -// Generated with -// -// print("static constexpr uint8_t kAsciiUpperTable[] = {") -// for i in range(256): -// if i > 0: print(', ', end='') -// if i >= ord('a') and i <= ord('z'): -// print(i - 32, end='') -// else: -// print(i, end='') -// print("};") - -static constexpr uint8_t kAsciiUpperTable[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, - 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, - 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}; - void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) { for (int64_t i = 0; i < length; ++i) { - *output++ = kAsciiUpperTable[*input++]; + const uint8_t raw_utf8_byte = *input++; + // Bytes in the range [a-z] can only be an encoding of an ascii character/codepoint, + // not the 2nd, 3rd or 4th bytes of an different codepoint. + // This guaranteed by non-overal design of the unicode standard. + // (see section 2.5 of Unicode Standard Core Specification v13.0) + *output++ = ((raw_utf8_byte >= 'a') && (raw_utf8_byte <= 'z')) ? (raw_utf8_byte - 32) : raw_utf8_byte; } } @@ -103,38 +79,11 @@ void AsciiUpperExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { StringDataTransform(ctx, batch, TransformAsciiUpper, out); } -// Generated with -// -// print("static constexpr uint8_t kAsciiLowerTable[] = {") -// for i in range(256): -// if i > 0: print(', ', end='') -// if i >= ord('A') and i <= ord('Z'): -// print(i + 32, end='') -// else: -// print(i, end='') -// print("};") - -static constexpr uint8_t kAsciiLowerTable[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, - 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, - 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}; - void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) { for (int64_t i = 0; i < length; ++i) { - *output++ = kAsciiLowerTable[*input++]; + // As with TransformAsciiUpper, the same guarantee holds for the range [A-Z] + const uint8_t raw_utf8_byte = *input++; + *output++ = ((raw_utf8_byte >= 'A') && (raw_utf8_byte <= 'Z')) ? (raw_utf8_byte + 32) : raw_utf8_byte; } } diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 2e96b222e83..e4953ef9d3d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -57,13 +57,13 @@ TYPED_TEST(TestStringKernels, AsciiLength) { } TYPED_TEST(TestStringKernels, AsciiUpper) { - this->CheckUnary("ascii_upper", "[\"aAa&\", null, \"\", \"b\"]", this->string_type(), - "[\"AAA&\", null, \"\", \"B\"]"); + this->CheckUnary("ascii_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->string_type(), + "[\"AAAZZæÆ&\", null, \"\", \"B\"]"); } TYPED_TEST(TestStringKernels, AsciiLower) { - this->CheckUnary("ascii_lower", "[\"aAa&\", null, \"\", \"b\"]", this->string_type(), - "[\"aaa&\", null, \"\", \"b\"]"); + this->CheckUnary("ascii_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->string_type(), + "[\"aaazzæÆ&\", null, \"\", \"b\"]"); } TYPED_TEST(TestStringKernels, Strptime) { From b3b4b564adb577e2061d5f7d7673eea59badb968 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 15 Jun 2020 11:43:41 +0200 Subject: [PATCH 2/5] reword using proper nomenclature --- cpp/src/arrow/compute/kernels/scalar_string.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index a04902ce390..8898a9438ef 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -66,12 +66,12 @@ void StringDataTransform(KernelContext* ctx, const ExecBatch& batch, void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) { for (int64_t i = 0; i < length; ++i) { - const uint8_t raw_utf8_byte = *input++; - // Bytes in the range [a-z] can only be an encoding of an ascii character/codepoint, - // not the 2nd, 3rd or 4th bytes of an different codepoint. + const uint8_t utf8_code_unit = *input++; + // Code units in the range [a-z] can only be an encoding of an ascii character/codepoint, + // not the 2nd, 3rd or 4th code unit (byte) of an different codepoint. // This guaranteed by non-overal design of the unicode standard. // (see section 2.5 of Unicode Standard Core Specification v13.0) - *output++ = ((raw_utf8_byte >= 'a') && (raw_utf8_byte <= 'z')) ? (raw_utf8_byte - 32) : raw_utf8_byte; + *output++ = ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32) : utf8_code_unit; } } @@ -82,8 +82,8 @@ void AsciiUpperExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) { for (int64_t i = 0; i < length; ++i) { // As with TransformAsciiUpper, the same guarantee holds for the range [A-Z] - const uint8_t raw_utf8_byte = *input++; - *output++ = ((raw_utf8_byte >= 'A') && (raw_utf8_byte <= 'Z')) ? (raw_utf8_byte + 32) : raw_utf8_byte; + const uint8_t utf8_code_unit = *input++; + *output++ = ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32) : utf8_code_unit; } } From e26e5e0d9ca9c46ca539752313896fd98b97aeab Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 15 Jun 2020 11:56:01 +0200 Subject: [PATCH 3/5] lint/format --- cpp/src/arrow/compute/kernels/scalar_string.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 8898a9438ef..11197975ebe 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -67,11 +67,13 @@ void StringDataTransform(KernelContext* ctx, const ExecBatch& batch, void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) { for (int64_t i = 0; i < length; ++i) { const uint8_t utf8_code_unit = *input++; - // Code units in the range [a-z] can only be an encoding of an ascii character/codepoint, - // not the 2nd, 3rd or 4th code unit (byte) of an different codepoint. - // This guaranteed by non-overal design of the unicode standard. - // (see section 2.5 of Unicode Standard Core Specification v13.0) - *output++ = ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32) : utf8_code_unit; + // Code units in the range [a-z] can only be an encoding of an ascii + // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different + // codepoint. This guaranteed by non-overal design of the unicode standard. (see + // section 2.5 of Unicode Standard Core Specification v13.0) + *output++ = ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) + ? (utf8_code_unit - 32) + : utf8_code_unit; } } @@ -83,7 +85,9 @@ void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) for (int64_t i = 0; i < length; ++i) { // As with TransformAsciiUpper, the same guarantee holds for the range [A-Z] const uint8_t utf8_code_unit = *input++; - *output++ = ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32) : utf8_code_unit; + *output++ = ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) + ? (utf8_code_unit + 32) + : utf8_code_unit; } } From f51e86ff547f3b41e2d23aaeb8b1b73365676df0 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 15 Jun 2020 12:41:35 +0200 Subject: [PATCH 4/5] format/linting test --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index e4953ef9d3d..209ad9408f5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -57,13 +57,13 @@ TYPED_TEST(TestStringKernels, AsciiLength) { } TYPED_TEST(TestStringKernels, AsciiUpper) { - this->CheckUnary("ascii_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->string_type(), - "[\"AAAZZæÆ&\", null, \"\", \"B\"]"); + this->CheckUnary("ascii_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", + this->string_type(), "[\"AAAZZæÆ&\", null, \"\", \"B\"]"); } TYPED_TEST(TestStringKernels, AsciiLower) { - this->CheckUnary("ascii_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->string_type(), - "[\"aaazzæÆ&\", null, \"\", \"b\"]"); + this->CheckUnary("ascii_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", + this->string_type(), "[\"aaazzæÆ&\", null, \"\", \"b\"]"); } TYPED_TEST(TestStringKernels, Strptime) { From 6309da782e2b5f862297fbe0f003caf239560aab Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 15 Jun 2020 13:36:43 +0200 Subject: [PATCH 5/5] fix typo --- cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 11197975ebe..b3feaaea299 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -69,7 +69,7 @@ void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) const uint8_t utf8_code_unit = *input++; // Code units in the range [a-z] can only be an encoding of an ascii // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different - // codepoint. This guaranteed by non-overal design of the unicode standard. (see + // codepoint. This guaranteed by non-overlap design of the unicode standard. (see // section 2.5 of Unicode Standard Core Specification v13.0) *output++ = ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32)