From 945632f0746291f0fa05d9d48e5206e84604a241 Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 15 Sep 2021 09:29:14 -0500 Subject: [PATCH] reimplementing ARROW-12657 from a previous PR in a new branch --- .../arrow/compute/kernels/scalar_cast_test.cc | 43 ++++++----- cpp/src/arrow/util/value_parsing_benchmark.cc | 50 ++++++++++++ cpp/src/arrow/util/value_parsing_test.cc | 77 +++++++++++++++++++ 3 files changed, 151 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index bea9a0ef8dc..6367a2e004f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -315,42 +315,43 @@ class TestCast : public TestBase { CastOptions options; auto src_type = TypeTraits::type_singleton(); - std::vector is_valid = {true, false, true, true, true}; + std::vector is_valid = {true, false, true, true, true, true, true, true}; // string to int - std::vector v_int = {"0", "1", "127", "-1", "0"}; - std::vector e_int8 = {0, 1, 127, -1, 0}; - std::vector e_int16 = {0, 1, 127, -1, 0}; - std::vector e_int32 = {0, 1, 127, -1, 0}; - std::vector e_int64 = {0, 1, 127, -1, 0}; + std::vector v_int = {"0", "1", "127", "-1", "0", "0x0", "0x7f", "0XFF"}; + std::vector e_int8 = {0, 1, 127, -1, 0, 0, 127, -1}; /* only the 8-bit target wraps 0XFF to -1; wider targets read 255 */ + std::vector e_int16 = {0, 1, 127, -1, 0, 0, 127, 255}; + std::vector e_int32 = {0, 1, 127, -1, 0, 0, 127, 255}; + std::vector e_int64 = {0, 1, 127, -1, 0, 0, 127, 255}; CheckCase(v_int, is_valid, e_int8, options); CheckCase(v_int, is_valid, e_int16, options); CheckCase(v_int, is_valid, e_int32, options); CheckCase(v_int, is_valid, e_int64, options); - v_int = {"2147483647", "0", "-2147483648", "0", "0"}; - e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; + v_int = {"2147483647", "0", "-2147483648", "0", "0", "0x7FFFFFFF", "0X0", "0x80000000"}; /* hex tail: INT32_MAX, 0, INT32_MIN bit pattern */ + e_int32 = {2147483647, 0, -2147483648LL, 0, 0, 
2147483647, 0, -2147483648LL}; CheckCase(v_int, is_valid, e_int32, options); - v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; - e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; + v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0", "0x7FFFFFFFFFFFFFFF", "0x0", "0X8000000000000000"}; /* hex tail: INT64_MAX, 0, INT64_MIN bit pattern */ + e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0, + 9223372036854775807LL, 0, (-9223372036854775807LL - 1)}; CheckCase(v_int, is_valid, e_int64, options); // string to uint - std::vector v_uint = {"0", "1", "127", "255", "0"}; - std::vector e_uint8 = {0, 1, 127, 255, 0}; - std::vector e_uint16 = {0, 1, 127, 255, 0}; - std::vector e_uint32 = {0, 1, 127, 255, 0}; - std::vector e_uint64 = {0, 1, 127, 255, 0}; + std::vector v_uint = {"0", "1", "127", "255", "0", "0x0", "0x7f", "0XFF"}; + std::vector e_uint8 = {0, 1, 127, 255, 0, 0, 127, 255}; + std::vector e_uint16 = {0, 1, 127, 255, 0, 0, 127, 255}; + std::vector e_uint32 = {0, 1, 127, 255, 0, 0, 127, 255}; + std::vector e_uint64 = {0, 1, 127, 255, 0, 0, 127, 255}; CheckCase(v_uint, is_valid, e_uint8, options); CheckCase(v_uint, is_valid, e_uint16, options); CheckCase(v_uint, is_valid, e_uint32, options); CheckCase(v_uint, is_valid, e_uint64, options); - v_uint = {"4294967295", "0", "0", "0", "0"}; - e_uint32 = {4294967295, 0, 0, 0, 0}; + v_uint = {"4294967295", "0", "0", "0", "0", "0xFFFFFFFF", "0x0", "0x1"}; /* eight F digits == UINT32_MAX */ + e_uint32 = {4294967295, 0, 0, 0, 0, 4294967295, 0, 1}; CheckCase(v_uint, is_valid, e_uint32, options); - v_uint = {"18446744073709551615", "0", "0", "0", "0"}; - e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; + v_uint = {"18446744073709551615", "0", "0", "0", "0", "0xffffFFFFffffFFFF", "0x0", "0x1"}; /* sixteen F digits == UINT64_MAX */ + e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0, 18446744073709551615ULL, 0, 1}; CheckCase(v_uint, is_valid, e_uint64, options); // string to float @@ -1554,6 +1555,10 @@ TEST_F(TestCast, StringToNumberErrors) { CheckFails({"128"}, is_valid, int8(), options); 
CheckFails({"-129"}, is_valid, int8(), options); CheckFails({"0.5"}, is_valid, int8(), options); + CheckFails({"0x"}, is_valid, int8(), options); + CheckFails({"0x12r"}, is_valid, int32(), options); + CheckFails({"0x1ffffffff"}, is_valid, int32(), options); /* nine hex digits cannot fit in 32 bits */ + CheckFails({"-0x123"}, is_valid, int32(), options); CheckFails({"256"}, is_valid, uint8(), options); CheckFails({"-1"}, is_valid, uint8(), options); diff --git a/cpp/src/arrow/util/value_parsing_benchmark.cc b/cpp/src/arrow/util/value_parsing_benchmark.cc index c113c245fff..fb35f7bfff0 100644 --- a/cpp/src/arrow/util/value_parsing_benchmark.cc +++ b/cpp/src/arrow/util/value_parsing_benchmark.cc @@ -56,6 +56,27 @@ static std::vector MakeIntStrings(int32_t num_items) { return strings; } +template +static std::vector MakeHexStrings(int32_t num_items) { + int32_t num_bytes = sizeof(c_int); + const char* kAsciiTable = "0123456789ABCDEF"; + std::vector large_hex_chars(num_bytes * 2 + 2); + large_hex_chars[0] = '0'; + large_hex_chars[1] = 'x'; + for (int32_t i = 0; i < num_bytes * 2; ++i) { + large_hex_chars[i + 2] = kAsciiTable[i]; + } + std::string large_hex(&large_hex_chars[0], large_hex_chars.size()); + + std::vector base_strings = {"0x0", "0xA5", "0x5E", large_hex}; + std::vector strings; + for (int32_t i = 0; i < num_items; ++i) { + strings.push_back(base_strings[i % base_strings.size()]); + } + return strings; +} + + static std::vector MakeFloatStrings(int32_t num_items) { std::vector base_strings = {"0.0", "5", "-12.3", "98765430000", "3456.789", "0.0012345", @@ -123,6 +144,26 @@ static void IntegerParsing(benchmark::State& state) { // NOLINT non-const refer state.SetItemsProcessed(state.iterations() * strings.size()); } +template +static void HexParsing(benchmark::State& state) { // NOLINT non-const reference + auto strings = MakeHexStrings(1000); + + while (state.KeepRunning()) { + C_TYPE total = 0; + for (const auto& s : strings) { + C_TYPE value; + if (!ParseValue(s.data(), s.length(), &value)) { + std::cerr 
<< "Conversion failed for '" << s << "'"; + std::abort(); + } + total = static_cast(total + value); + } + benchmark::DoNotOptimize(total); + } + state.SetItemsProcessed(state.iterations() * strings.size()); +} + + template static void FloatParsing(benchmark::State& state) { // NOLINT non-const reference auto strings = MakeFloatStrings(1000); @@ -230,6 +271,15 @@ BENCHMARK_TEMPLATE(IntegerParsing, UInt16Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt32Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt64Type); +BENCHMARK_TEMPLATE(HexParsing, Int8Type); +BENCHMARK_TEMPLATE(HexParsing, Int16Type); +BENCHMARK_TEMPLATE(HexParsing, Int32Type); +BENCHMARK_TEMPLATE(HexParsing, Int64Type); +BENCHMARK_TEMPLATE(HexParsing, UInt8Type); +BENCHMARK_TEMPLATE(HexParsing, UInt16Type); +BENCHMARK_TEMPLATE(HexParsing, UInt32Type); +BENCHMARK_TEMPLATE(HexParsing, UInt64Type); + BENCHMARK_TEMPLATE(FloatParsing, FloatType); BENCHMARK_TEMPLATE(FloatParsing, DoubleType); diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index 1682e1d7f1d..bdbf053c762 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -120,6 +120,17 @@ TEST(StringConversion, ToInt8) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex: 0x or 0X prefix, digits in either case; 0xFF reads back as -1; bare prefix, a third digit, or a non-hex digit must fail + AssertConversion("0x0", 0); + AssertConversion("0X1A", 26); + AssertConversion("0xb", 11); + AssertConversion("0x7F", 127); + AssertConversion("0xFF", -1); + AssertConversionFails("0x"); + AssertConversionFails("0x100"); + AssertConversionFails("0x1g"); + } TEST(StringConversion, ToUInt8) { @@ -155,6 +166,16 @@ TEST(StringConversion, ToInt16) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex: same rules at 16 bits; 0XfffF reads back as -1 and a fifth digit must fail + AssertConversion("0x0", 0); + AssertConversion("0X1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversion("0XfffF", -1); + AssertConversionFails("0x"); + 
AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToUInt16) { @@ -172,6 +193,16 @@ TEST(StringConversion, ToUInt16) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex: unsigned 16-bit target, so 0xFffF stays 65535; a fifth digit must fail + AssertConversion("0x0", 0); + AssertConversion("0x1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversion("0xFffF", 65535); + AssertConversionFails("0x"); + AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToInt32) { @@ -189,6 +220,18 @@ TEST(StringConversion, ToInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex: upper-, lower- and mixed-case digits give the same value; all-ones reads back as -1 + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", -1); + AssertConversionFails("0X"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToUInt32) { @@ -206,6 +249,18 @@ TEST(StringConversion, ToUInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex: same inputs as the signed 32-bit case, except all-ones stays 4294967295 + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", 4294967295); + AssertConversionFails("0X"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToInt64) { @@ -223,6 +278,17 @@ TEST(StringConversion, ToInt64) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex: signed 64-bit target; inputs with the top bit set read back negative + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + 
AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", -1152921504606846975); + AssertConversion("0xfFFFFFFFFFFFFFFf", -1); + AssertConversionFails("0X"); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToUInt64) { @@ -237,6 +303,17 @@ TEST(StringConversion, ToUInt64) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex: unsigned 64-bit target keeps the full range positive; a 17th digit or a non-hex digit must fail + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", 17293822569102704641ULL); + AssertConversion("0xfFFFFFFFFFFFFFFf", 18446744073709551615ULL); + AssertConversionFails("0x"); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToTimestampDate_ISO8601) {