From 597cc9fdfe6930e747ec607a77475d182090fe6c Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 15 Sep 2021 10:51:18 -0500 Subject: [PATCH 1/4] Re-implementing ARROW-12657 which was originally done in PR #11064 --- .../arrow/compute/kernels/scalar_cast_test.cc | 58 +++++++------ cpp/src/arrow/util/value_parsing.h | 46 ++++++++++ cpp/src/arrow/util/value_parsing_benchmark.cc | 48 +++++++++++ cpp/src/arrow/util/value_parsing_test.cc | 86 +++++++++++++++++++ 4 files changed, 214 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index fc7e42aca6f..864ae9c038f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1485,34 +1485,46 @@ TEST(Cast, StringToBoolean) { TEST(Cast, StringToInt) { for (auto string_type : {utf8(), large_utf8()}) { for (auto signed_type : {int8(), int16(), int32(), int64()}) { - CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0"])"), - ArrayFromJSON(signed_type, "[0, null, 127, -1, 0]")); + CheckCast( + ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0", "0x0", "0x7F"])"), + ArrayFromJSON(signed_type, "[0, null, 127, -1, 0, 0, 127]")); } - CheckCast( - ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", "0"])"), - ArrayFromJSON(int32(), "[2147483647, null, -2147483648, 0, 0]")); + CheckCast(ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", + "0X0", "0x7FFFFFFF", "0XFFFFfFfF", "0Xf0000000"])"), + ArrayFromJSON( + int32(), + "[2147483647, null, -2147483648, 0, 0, 2147483647, -1, -268435456]")); + + CheckCast(ArrayFromJSON(string_type, + R"(["9223372036854775807", null, "-9223372036854775808", "0", + "0x0", "0x7FFFFFFFFFFFFFFf", "0XF000000000000001"])"), + ArrayFromJSON(int64(), + "[9223372036854775807, null, -9223372036854775808, 0, 0, " + "9223372036854775807, -1152921504606846975]")); + - CheckCast(ArrayFromJSON( - string_type, - R"(["9223372036854775807", null, "-9223372036854775808", "0", "0"])"), - ArrayFromJSON(int64(), - "[9223372036854775807, null, -9223372036854775808, 0, 0]")); for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) { - CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0"])"), - ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0]")); + CheckCast(ArrayFromJSON(string_type, + R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f"])"), + ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0, 0, 255, 127]")); + } CheckCast( - ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", "0"])"), - ArrayFromJSON(uint32(), "[2147483647, null, 4294967295, 0, 0]")); + ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", + "0x0", "0x7FFFFFFf", "0xFFFFFFFF"])"), + ArrayFromJSON(uint32(), + "[2147483647, null, 4294967295, 0, 0, 2147483647, 4294967295]")); + + CheckCast(ArrayFromJSON(string_type, + R"(["9223372036854775807", null, "18446744073709551615", "0", + "0x0", "0x7FFFFFFFFFFFFFFf", "0xfFFFFFFFFFFFFFFf"])"), + ArrayFromJSON(uint64(), + "[9223372036854775807, null, 18446744073709551615, 0, 0, " + "9223372036854775807, 18446744073709551615]")); - CheckCast(ArrayFromJSON( - string_type, - R"(["9223372036854775807", null, "18446744073709551615", "0", "0"])"), - ArrayFromJSON(uint64(), - "[9223372036854775807, null, 18446744073709551615, 0, 0]")); for (std::string not_int8 : { "z", @@ -1520,16 +1532,14 @@ TEST(Cast, StringToInt) { "128", "-129", "0.5", + "0x", + "0xfff", }) { auto options = CastOptions::Safe(int8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options); } - for (std::string not_uint8 : { - "256", - "-1", - "0.5", - }) { + for (std::string not_uint8 : {"256", "-1", "0.5", "0x", "0x3wa", "0x123"}) { auto options = CastOptions::Safe(uint8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_uint8 + "\"]"), options); } diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index a92db132d0a..bf941c09361 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -273,6 +273,30 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { #undef PARSE_UNSIGNED_ITERATION #undef PARSE_UNSIGNED_ITERATION_LAST +template +bool ParseHex(const char* s, size_t length, T* out) { + // lets make sure that the length of the string is not too big + if (!ARROW_PREDICT_TRUE(sizeof(T) * 2 >= length && length > 0)) { + return false; + } + T result = 0; + for (size_t i = 0; i < length; i++) { + result = static_cast(result << 4); + if (s[i] >= '0' && s[i] <= '9') { + result = static_cast(result | (s[i] - '0')); + } else if (s[i] >= 'A' && s[i] <= 'F') { + result = static_cast(result | (s[i] - 'A' + 10)); + } else if (s[i] >= 'a' && s[i] <= 'f') { + result = static_cast(result | (s[i] - 'a' + 10)); + } else { + /* Non-digit */ + return false; + } + } + *out = result; + return true; +} + template struct StringToUnsignedIntConverterMixin { using value_type = typename ARROW_TYPE::c_type; @@ -281,6 +305,16 @@ struct StringToUnsignedIntConverterMixin { if (ARROW_PREDICT_FALSE(length == 0)) { return false; } + // If it starts with 0x then its hex + if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) { + length -= 2; + s += 2; + + if (!ARROW_PREDICT_TRUE(ParseHex(s, length, out))) { + return false; + } + return true; + } // Skip leading zeros while (length > 0 && *s == '0') { length--; @@ -329,6 +363,18 @@ struct StringToSignedIntConverterMixin { if (ARROW_PREDICT_FALSE(length == 0)) { return false; } + // If it starts with 0x then its hex + if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) { + length -= 2; + s += 2; + + if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) { + return false; + } + *out = static_cast(unsigned_value); + return true; + } + if (*s == '-') { negative = true; s++; diff --git a/cpp/src/arrow/util/value_parsing_benchmark.cc b/cpp/src/arrow/util/value_parsing_benchmark.cc index c113c245fff..40d139316e5 100644 --- a/cpp/src/arrow/util/value_parsing_benchmark.cc +++ b/cpp/src/arrow/util/value_parsing_benchmark.cc @@ -56,6 +56,26 @@ static std::vector MakeIntStrings(int32_t num_items) { return strings; } +template +static std::vector MakeHexStrings(int32_t num_items) { + int32_t num_bytes = sizeof(c_int); + const char* kAsciiTable = "0123456789ABCDEF"; + std::vector large_hex_chars(num_bytes * 2 + 2); + large_hex_chars[0] = '0'; + large_hex_chars[1] = 'x'; + for (int32_t i = 0; i < num_bytes * 2; ++i) { + large_hex_chars[i + 2] = kAsciiTable[i]; + } + std::string large_hex(&large_hex_chars[0], large_hex_chars.size()); + + std::vector base_strings = {"0x0", "0xA5", "0x5E", large_hex}; + std::vector strings; + for (int32_t i = 0; i < num_items; ++i) { + strings.push_back(base_strings[i % base_strings.size()]); + } + return strings; +} + static std::vector MakeFloatStrings(int32_t num_items) { std::vector base_strings = {"0.0", "5", "-12.3", "98765430000", "3456.789", "0.0012345", @@ -123,6 +143,25 @@ static void IntegerParsing(benchmark::State& state) { // NOLINT non-const refer state.SetItemsProcessed(state.iterations() * strings.size()); } +template +static void HexParsing(benchmark::State& state) { // NOLINT non-const reference + auto strings = MakeHexStrings(1000); + + while (state.KeepRunning()) { + C_TYPE total = 0; + for (const auto& s : strings) { + C_TYPE value; + if (!ParseValue(s.data(), s.length(), &value)) { + std::cerr << "Conversion failed for '" << s << "'"; + std::abort(); + } + total = static_cast(total + value); + } + benchmark::DoNotOptimize(total); + } + state.SetItemsProcessed(state.iterations() * strings.size()); +} + template static void FloatParsing(benchmark::State& state) { // NOLINT non-const reference auto strings = MakeFloatStrings(1000); @@ -230,6 +269,15 @@ BENCHMARK_TEMPLATE(IntegerParsing, UInt16Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt32Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt64Type); +BENCHMARK_TEMPLATE(HexParsing, Int8Type); +BENCHMARK_TEMPLATE(HexParsing, Int16Type); +BENCHMARK_TEMPLATE(HexParsing, Int32Type); +BENCHMARK_TEMPLATE(HexParsing, Int64Type); +BENCHMARK_TEMPLATE(HexParsing, UInt8Type); +BENCHMARK_TEMPLATE(HexParsing, UInt16Type); +BENCHMARK_TEMPLATE(HexParsing, UInt32Type); +BENCHMARK_TEMPLATE(HexParsing, UInt64Type); + BENCHMARK_TEMPLATE(FloatParsing, FloatType); BENCHMARK_TEMPLATE(FloatParsing, DoubleType); diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index b5dc5619ded..54f0d0ef321 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -120,6 +120,16 @@ TEST(StringConversion, ToInt8) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0X1A", 26); + AssertConversion("0xb", 11); + AssertConversion("0x7F", 127); + AssertConversion("0xFF", -1); + AssertConversionFails("0x"); + AssertConversionFails("0x100"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToUInt8) { @@ -138,6 +148,16 @@ TEST(StringConversion, ToUInt8) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1A", 26); + AssertConversion("0xb", 11); + AssertConversion("0x7F", 127); + AssertConversion("0xFF", 255); + AssertConversionFails("0x"); + AssertConversionFails("0x100"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToInt16) { @@ -155,6 +175,16 @@ TEST(StringConversion, ToInt16) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0X1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversion("0XfffF", -1); + AssertConversionFails("0x"); + AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToUInt16) { @@ -172,6 +202,16 @@ TEST(StringConversion, ToUInt16) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversion("0xFffF", 65535); + AssertConversionFails("0x"); + AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToInt32) { @@ -189,6 +229,18 @@ TEST(StringConversion, ToInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", -1); + AssertConversionFails("0X"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToUInt32) { @@ -206,6 +258,18 @@ TEST(StringConversion, ToUInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", 4294967295); + AssertConversionFails("0X"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToInt64) { @@ -223,6 +287,17 @@ TEST(StringConversion, ToInt64) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", -1152921504606846975); + AssertConversion("0xfFFFFFFFFFFFFFFf", -1); + AssertConversionFails("0X"); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToUInt64) { @@ -237,6 +312,17 @@ TEST(StringConversion, ToUInt64) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", 17293822569102704641ULL); + AssertConversion("0xfFFFFFFFFFFFFFFf", 18446744073709551615ULL); + AssertConversionFails("0x"); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToDate32) { From 5f6ad6c688ca999c68246837afa411c08a96733b Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 15 Sep 2021 10:53:50 -0500 Subject: [PATCH 2/4] style fixes --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 8 ++------ cpp/src/arrow/util/value_parsing_test.cc | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 864ae9c038f..8948146e6c3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1499,17 +1499,14 @@ TEST(Cast, StringToInt) { CheckCast(ArrayFromJSON(string_type, R"(["9223372036854775807", null, "-9223372036854775808", "0", "0x0", "0x7FFFFFFFFFFFFFFf", "0XF000000000000001"])"), - ArrayFromJSON(int64(), + ArrayFromJSON(int64(), "[9223372036854775807, null, -9223372036854775808, 0, 0, " "9223372036854775807, -1152921504606846975]")); - - for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) { CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f"])"), ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0, 0, 255, 127]")); - } CheckCast( @@ -1521,11 +1518,10 @@ TEST(Cast, StringToInt) { CheckCast(ArrayFromJSON(string_type, R"(["9223372036854775807", null, "18446744073709551615", "0", "0x0", "0x7FFFFFFFFFFFFFFf", "0xfFFFFFFFFFFFFFFf"])"), - ArrayFromJSON(uint64(), + ArrayFromJSON(uint64(), "[9223372036854775807, null, 18446744073709551615, 0, 0, " "9223372036854775807, 18446744073709551615]")); - for (std::string not_int8 : { "z", "12 z", diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index 54f0d0ef321..ebbb733398d 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -175,7 +175,7 @@ TEST(StringConversion, ToInt16) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); - + // Hex AssertConversion("0x0", 0); AssertConversion("0X1aA", 426); From 74b9a0d62701e4368459dc041abe10a92f9c23a9 Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 15 Sep 2021 21:51:15 -0500 Subject: [PATCH 3/4] added small improvement and added one more test case --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 1 + cpp/src/arrow/util/value_parsing.h | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 8948146e6c3..656fa62be2b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1530,6 +1530,7 @@ TEST(Cast, StringToInt) { "0.5", "0x", "0xfff", + "-0xf0", }) { auto options = CastOptions::Safe(int8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options); diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index bf941c09361..acc21eff1c0 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -310,10 +310,8 @@ struct StringToUnsignedIntConverterMixin { length -= 2; s += 2; - if (!ARROW_PREDICT_TRUE(ParseHex(s, length, out))) { - return false; - } - return true; + return ARROW_PREDICT_TRUE(ParseHex(s, length, out)); + } // Skip leading zeros while (length > 0 && *s == '0') { From 18e8e96584c6296421375b912ded64eb097c759c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 16 Sep 2021 10:59:03 +0200 Subject: [PATCH 4/4] Fix lint --- cpp/src/arrow/util/value_parsing.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index acc21eff1c0..d99634e122e 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -311,7 +311,6 @@ struct StringToUnsignedIntConverterMixin { s += 2; return ARROW_PREDICT_TRUE(ParseHex(s, length, out)); - } // Skip leading zeros while (length > 0 && *s == '0') {