diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index fc7e42aca6f..656fa62be2b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1485,34 +1485,42 @@ TEST(Cast, StringToBoolean) { TEST(Cast, StringToInt) { for (auto string_type : {utf8(), large_utf8()}) { for (auto signed_type : {int8(), int16(), int32(), int64()}) { - CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0"])"), - ArrayFromJSON(signed_type, "[0, null, 127, -1, 0]")); + CheckCast( + ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0", "0x0", "0x7F"])"), + ArrayFromJSON(signed_type, "[0, null, 127, -1, 0, 0, 127]")); } - CheckCast( - ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", "0"])"), - ArrayFromJSON(int32(), "[2147483647, null, -2147483648, 0, 0]")); + CheckCast(ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", + "0X0", "0x7FFFFFFF", "0XFFFFfFfF", "0Xf0000000"])"), + ArrayFromJSON( + int32(), + "[2147483647, null, -2147483648, 0, 0, 2147483647, -1, -268435456]")); - CheckCast(ArrayFromJSON( - string_type, - R"(["9223372036854775807", null, "-9223372036854775808", "0", "0"])"), + CheckCast(ArrayFromJSON(string_type, + R"(["9223372036854775807", null, "-9223372036854775808", "0", + "0x0", "0x7FFFFFFFFFFFFFFf", "0XF000000000000001"])"), ArrayFromJSON(int64(), - "[9223372036854775807, null, -9223372036854775808, 0, 0]")); + "[9223372036854775807, null, -9223372036854775808, 0, 0, " + "9223372036854775807, -1152921504606846975]")); for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) { - CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0"])"), - ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0]")); + CheckCast(ArrayFromJSON(string_type, + R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f"])"), + ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0, 0, 255, 127]")); } CheckCast( - ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", "0"])"), - ArrayFromJSON(uint32(), "[2147483647, null, 4294967295, 0, 0]")); - - CheckCast(ArrayFromJSON( - string_type, - R"(["9223372036854775807", null, "18446744073709551615", "0", "0"])"), + ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", + "0x0", "0x7FFFFFFf", "0xFFFFFFFF"])"), + ArrayFromJSON(uint32(), + "[2147483647, null, 4294967295, 0, 0, 2147483647, 4294967295]")); + + CheckCast(ArrayFromJSON(string_type, + R"(["9223372036854775807", null, "18446744073709551615", "0", + "0x0", "0x7FFFFFFFFFFFFFFf", "0xfFFFFFFFFFFFFFFf"])"), ArrayFromJSON(uint64(), - "[9223372036854775807, null, 18446744073709551615, 0, 0]")); + "[9223372036854775807, null, 18446744073709551615, 0, 0, " + "9223372036854775807, 18446744073709551615]")); for (std::string not_int8 : { "z", @@ -1520,16 +1528,15 @@ TEST(Cast, StringToInt) { "128", "-129", "0.5", + "0x", + "0xfff", + "-0xf0", }) { auto options = CastOptions::Safe(int8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options); } - for (std::string not_uint8 : { - "256", - "-1", - "0.5", - }) { + for (std::string not_uint8 : {"256", "-1", "0.5", "0x", "0x3wa", "0x123"}) { auto options = CastOptions::Safe(uint8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_uint8 + "\"]"), options); } diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index a92db132d0a..d99634e122e 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -273,6 +273,30 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { #undef PARSE_UNSIGNED_ITERATION #undef PARSE_UNSIGNED_ITERATION_LAST +template +bool ParseHex(const char* s, size_t length, T* out) { + // lets make sure that the length of the string is not too big + if (!ARROW_PREDICT_TRUE(sizeof(T) * 2 >= length && length > 0)) { + return false; + } + T result = 0; + for (size_t i = 0; i < length; i++) { + result = static_cast(result << 4); + if (s[i] >= '0' && s[i] <= '9') { + result = static_cast(result | (s[i] - '0')); + } else if (s[i] >= 'A' && s[i] <= 'F') { + result = static_cast(result | (s[i] - 'A' + 10)); + } else if (s[i] >= 'a' && s[i] <= 'f') { + result = static_cast(result | (s[i] - 'a' + 10)); + } else { + /* Non-digit */ + return false; + } + } + *out = result; + return true; +} + template struct StringToUnsignedIntConverterMixin { using value_type = typename ARROW_TYPE::c_type; @@ -281,6 +305,13 @@ struct StringToUnsignedIntConverterMixin { if (ARROW_PREDICT_FALSE(length == 0)) { return false; } + // If it starts with 0x then its hex + if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) { + length -= 2; + s += 2; + + return ARROW_PREDICT_TRUE(ParseHex(s, length, out)); + } // Skip leading zeros while (length > 0 && *s == '0') { length--; @@ -329,6 +360,18 @@ struct StringToSignedIntConverterMixin { if (ARROW_PREDICT_FALSE(length == 0)) { return false; } + // If it starts with 0x then its hex + if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) { + length -= 2; + s += 2; + + if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) { + return false; + } + *out = static_cast(unsigned_value); + return true; + } + if (*s == '-') { negative = true; s++; diff --git a/cpp/src/arrow/util/value_parsing_benchmark.cc b/cpp/src/arrow/util/value_parsing_benchmark.cc index c113c245fff..40d139316e5 100644 --- a/cpp/src/arrow/util/value_parsing_benchmark.cc +++ b/cpp/src/arrow/util/value_parsing_benchmark.cc @@ -56,6 +56,26 @@ static std::vector MakeIntStrings(int32_t num_items) { return strings; } +template +static std::vector MakeHexStrings(int32_t num_items) { + int32_t num_bytes = sizeof(c_int); + const char* kAsciiTable = "0123456789ABCDEF"; + std::vector large_hex_chars(num_bytes * 2 + 2); + large_hex_chars[0] = '0'; + large_hex_chars[1] = 'x'; + for (int32_t i = 0; i < num_bytes * 2; ++i) { + large_hex_chars[i + 2] = kAsciiTable[i]; + } + std::string large_hex(&large_hex_chars[0], large_hex_chars.size()); + + std::vector base_strings = {"0x0", "0xA5", "0x5E", large_hex}; + std::vector strings; + for (int32_t i = 0; i < num_items; ++i) { + strings.push_back(base_strings[i % base_strings.size()]); + } + return strings; +} + static std::vector MakeFloatStrings(int32_t num_items) { std::vector base_strings = {"0.0", "5", "-12.3", "98765430000", "3456.789", "0.0012345", @@ -123,6 +143,25 @@ static void IntegerParsing(benchmark::State& state) { // NOLINT non-const refer state.SetItemsProcessed(state.iterations() * strings.size()); } +template +static void HexParsing(benchmark::State& state) { // NOLINT non-const reference + auto strings = MakeHexStrings(1000); + + while (state.KeepRunning()) { + C_TYPE total = 0; + for (const auto& s : strings) { + C_TYPE value; + if (!ParseValue(s.data(), s.length(), &value)) { + std::cerr << "Conversion failed for '" << s << "'"; + std::abort(); + } + total = static_cast(total + value); + } + benchmark::DoNotOptimize(total); + } + state.SetItemsProcessed(state.iterations() * strings.size()); +} + template static void FloatParsing(benchmark::State& state) { // NOLINT non-const reference auto strings = MakeFloatStrings(1000); @@ -230,6 +269,15 @@ BENCHMARK_TEMPLATE(IntegerParsing, UInt16Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt32Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt64Type); +BENCHMARK_TEMPLATE(HexParsing, Int8Type); +BENCHMARK_TEMPLATE(HexParsing, Int16Type); +BENCHMARK_TEMPLATE(HexParsing, Int32Type); +BENCHMARK_TEMPLATE(HexParsing, Int64Type); +BENCHMARK_TEMPLATE(HexParsing, UInt8Type); +BENCHMARK_TEMPLATE(HexParsing, UInt16Type); +BENCHMARK_TEMPLATE(HexParsing, UInt32Type); +BENCHMARK_TEMPLATE(HexParsing, UInt64Type); + BENCHMARK_TEMPLATE(FloatParsing, FloatType); BENCHMARK_TEMPLATE(FloatParsing, DoubleType); diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index b5dc5619ded..ebbb733398d 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -120,6 +120,16 @@ TEST(StringConversion, ToInt8) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0X1A", 26); + AssertConversion("0xb", 11); + AssertConversion("0x7F", 127); + AssertConversion("0xFF", -1); + AssertConversionFails("0x"); + AssertConversionFails("0x100"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToUInt8) { @@ -138,6 +148,16 @@ TEST(StringConversion, ToUInt8) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1A", 26); + AssertConversion("0xb", 11); + AssertConversion("0x7F", 127); + AssertConversion("0xFF", 255); + AssertConversionFails("0x"); + AssertConversionFails("0x100"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToInt16) { @@ -155,6 +175,16 @@ TEST(StringConversion, ToInt16) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0X1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversion("0XfffF", -1); + AssertConversionFails("0x"); + AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToUInt16) { @@ -172,6 +202,16 @@ TEST(StringConversion, ToUInt16) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversion("0xFffF", 65535); + AssertConversionFails("0x"); + AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); } TEST(StringConversion, ToInt32) { @@ -189,6 +229,18 @@ TEST(StringConversion, ToInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", -1); + AssertConversionFails("0X"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToUInt32) { @@ -206,6 +258,18 @@ TEST(StringConversion, ToUInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", 4294967295); + AssertConversionFails("0X"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToInt64) { @@ -223,6 +287,17 @@ TEST(StringConversion, ToInt64) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", -1152921504606846975); + AssertConversion("0xfFFFFFFFFFFFFFFf", -1); + AssertConversionFails("0X"); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToUInt64) { @@ -237,6 +312,17 @@ TEST(StringConversion, ToUInt64) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", 17293822569102704641ULL); + AssertConversion("0xfFFFFFFFFFFFFFFf", 18446744073709551615ULL); + AssertConversionFails("0x"); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToDate32) {