Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 30 additions & 23 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1485,51 +1485,58 @@ TEST(Cast, StringToBoolean) {
TEST(Cast, StringToInt) {
for (auto string_type : {utf8(), large_utf8()}) {
for (auto signed_type : {int8(), int16(), int32(), int64()}) {
CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0"])"),
ArrayFromJSON(signed_type, "[0, null, 127, -1, 0]"));
CheckCast(
ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0", "0x0", "0x7F"])"),
ArrayFromJSON(signed_type, "[0, null, 127, -1, 0, 0, 127]"));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a test case for hex with negative sign (eg., -0x34) and another for multiple prefixed zeros (eg., 00000x7f).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those cases are explicitly not allowed. Do you mean that I should add them to CheckCastFails?
A negative before a hex does not make sense. If the value can be negative (signed integer) then you should use the binary representation of that negative number, for example for int8 0xFF is -1 and 0xF0 is -128.
For the case of multiple prefixed zeros, I opted for not allowing that to be allowed, since allowing it would make the logic more complex (probably slower) and it seems like it would be a very odd way for a hex string to be formatted.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ohh, sorry...I meant as failing test cases. It is just a way to ensure more code coverage in tests.

}

CheckCast(
ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", "0"])"),
ArrayFromJSON(int32(), "[2147483647, null, -2147483648, 0, 0]"));
CheckCast(ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0",
"0X0", "0x7FFFFFFF", "0XFFFFfFfF", "0Xf0000000"])"),
ArrayFromJSON(
int32(),
"[2147483647, null, -2147483648, 0, 0, 2147483647, -1, -268435456]"));

CheckCast(ArrayFromJSON(
string_type,
R"(["9223372036854775807", null, "-9223372036854775808", "0", "0"])"),
CheckCast(ArrayFromJSON(string_type,
R"(["9223372036854775807", null, "-9223372036854775808", "0",
"0x0", "0x7FFFFFFFFFFFFFFf", "0XF000000000000001"])"),
ArrayFromJSON(int64(),
"[9223372036854775807, null, -9223372036854775808, 0, 0]"));
"[9223372036854775807, null, -9223372036854775808, 0, 0, "
"9223372036854775807, -1152921504606846975]"));

for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) {
CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0"])"),
ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0]"));
CheckCast(ArrayFromJSON(string_type,
R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f"])"),
ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0, 0, 255, 127]"));
}

CheckCast(
ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", "0"])"),
ArrayFromJSON(uint32(), "[2147483647, null, 4294967295, 0, 0]"));

CheckCast(ArrayFromJSON(
string_type,
R"(["9223372036854775807", null, "18446744073709551615", "0", "0"])"),
ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0",
"0x0", "0x7FFFFFFf", "0xFFFFFFFF"])"),
ArrayFromJSON(uint32(),
"[2147483647, null, 4294967295, 0, 0, 2147483647, 4294967295]"));

CheckCast(ArrayFromJSON(string_type,
R"(["9223372036854775807", null, "18446744073709551615", "0",
"0x0", "0x7FFFFFFFFFFFFFFf", "0xfFFFFFFFFFFFFFFf"])"),
ArrayFromJSON(uint64(),
"[9223372036854775807, null, 18446744073709551615, 0, 0]"));
"[9223372036854775807, null, 18446744073709551615, 0, 0, "
"9223372036854775807, 18446744073709551615]"));

for (std::string not_int8 : {
"z",
"12 z",
"128",
"-129",
"0.5",
"0x",
"0xfff",
"-0xf0",
}) {
auto options = CastOptions::Safe(int8());
CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options);
}

for (std::string not_uint8 : {
"256",
"-1",
"0.5",
}) {
for (std::string not_uint8 : {"256", "-1", "0.5", "0x", "0x3wa", "0x123"}) {
auto options = CastOptions::Safe(uint8());
CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_uint8 + "\"]"), options);
}
Expand Down
43 changes: 43 additions & 0 deletions cpp/src/arrow/util/value_parsing.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,30 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
#undef PARSE_UNSIGNED_ITERATION
#undef PARSE_UNSIGNED_ITERATION_LAST

template <typename T>
bool ParseHex(const char* s, size_t length, T* out) {
// lets make sure that the length of the string is not too big
if (!ARROW_PREDICT_TRUE(sizeof(T) * 2 >= length && length > 0)) {
return false;
}
T result = 0;
for (size_t i = 0; i < length; i++) {
result = static_cast<T>(result << 4);
if (s[i] >= '0' && s[i] <= '9') {
result = static_cast<T>(result | (s[i] - '0'));
} else if (s[i] >= 'A' && s[i] <= 'F') {
result = static_cast<T>(result | (s[i] - 'A' + 10));
} else if (s[i] >= 'a' && s[i] <= 'f') {
result = static_cast<T>(result | (s[i] - 'a' + 10));
} else {
/* Non-digit */
return false;
}
Comment on lines +284 to +294
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lookup table approach may have better performance.
We can leave it as a followup task.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth investigating, but I would request we leave it as followup task.

}
*out = result;
return true;
}

template <class ARROW_TYPE>
struct StringToUnsignedIntConverterMixin {
using value_type = typename ARROW_TYPE::c_type;
Expand All @@ -281,6 +305,13 @@ struct StringToUnsignedIntConverterMixin {
if (ARROW_PREDICT_FALSE(length == 0)) {
return false;
}
// If it starts with 0x then its hex
if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
length -= 2;
s += 2;

return ARROW_PREDICT_TRUE(ParseHex(s, length, out));
}
// Skip leading zeros
while (length > 0 && *s == '0') {
length--;
Expand Down Expand Up @@ -329,6 +360,18 @@ struct StringToSignedIntConverterMixin {
if (ARROW_PREDICT_FALSE(length == 0)) {
return false;
}
// If it starts with 0x then its hex
if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
length -= 2;
s += 2;

if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) {
return false;
}
*out = static_cast<value_type>(unsigned_value);
return true;
}

if (*s == '-') {
negative = true;
s++;
Expand Down
48 changes: 48 additions & 0 deletions cpp/src/arrow/util/value_parsing_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,26 @@ static std::vector<std::string> MakeIntStrings(int32_t num_items) {
return strings;
}

template <typename c_int>
static std::vector<std::string> MakeHexStrings(int32_t num_items) {
int32_t num_bytes = sizeof(c_int);
const char* kAsciiTable = "0123456789ABCDEF";
std::vector<char> large_hex_chars(num_bytes * 2 + 2);
large_hex_chars[0] = '0';
large_hex_chars[1] = 'x';
for (int32_t i = 0; i < num_bytes * 2; ++i) {
large_hex_chars[i + 2] = kAsciiTable[i];
}
std::string large_hex(&large_hex_chars[0], large_hex_chars.size());

std::vector<std::string> base_strings = {"0x0", "0xA5", "0x5E", large_hex};
std::vector<std::string> strings;
for (int32_t i = 0; i < num_items; ++i) {
strings.push_back(base_strings[i % base_strings.size()]);
}
return strings;
}

static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
std::vector<std::string> base_strings = {"0.0", "5", "-12.3",
"98765430000", "3456.789", "0.0012345",
Expand Down Expand Up @@ -123,6 +143,25 @@ static void IntegerParsing(benchmark::State& state) { // NOLINT non-const refer
state.SetItemsProcessed(state.iterations() * strings.size());
}

template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
static void HexParsing(benchmark::State& state) { // NOLINT non-const reference
auto strings = MakeHexStrings<C_TYPE>(1000);

while (state.KeepRunning()) {
C_TYPE total = 0;
for (const auto& s : strings) {
C_TYPE value;
if (!ParseValue<ARROW_TYPE>(s.data(), s.length(), &value)) {
std::cerr << "Conversion failed for '" << s << "'";
std::abort();
}
total = static_cast<C_TYPE>(total + value);
}
benchmark::DoNotOptimize(total);
}
state.SetItemsProcessed(state.iterations() * strings.size());
}

template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
static void FloatParsing(benchmark::State& state) { // NOLINT non-const reference
auto strings = MakeFloatStrings(1000);
Expand Down Expand Up @@ -230,6 +269,15 @@ BENCHMARK_TEMPLATE(IntegerParsing, UInt16Type);
BENCHMARK_TEMPLATE(IntegerParsing, UInt32Type);
BENCHMARK_TEMPLATE(IntegerParsing, UInt64Type);

BENCHMARK_TEMPLATE(HexParsing, Int8Type);
BENCHMARK_TEMPLATE(HexParsing, Int16Type);
BENCHMARK_TEMPLATE(HexParsing, Int32Type);
BENCHMARK_TEMPLATE(HexParsing, Int64Type);
BENCHMARK_TEMPLATE(HexParsing, UInt8Type);
BENCHMARK_TEMPLATE(HexParsing, UInt16Type);
BENCHMARK_TEMPLATE(HexParsing, UInt32Type);
BENCHMARK_TEMPLATE(HexParsing, UInt64Type);

BENCHMARK_TEMPLATE(FloatParsing, FloatType);
BENCHMARK_TEMPLATE(FloatParsing, DoubleType);

Expand Down
86 changes: 86 additions & 0 deletions cpp/src/arrow/util/value_parsing_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,16 @@ TEST(StringConversion, ToInt8) {
AssertConversionFails<Int8Type>("-");
AssertConversionFails<Int8Type>("0.0");
AssertConversionFails<Int8Type>("e");

// Hex
AssertConversion<Int8Type>("0x0", 0);
AssertConversion<Int8Type>("0X1A", 26);
AssertConversion<Int8Type>("0xb", 11);
AssertConversion<Int8Type>("0x7F", 127);
AssertConversion<Int8Type>("0xFF", -1);
AssertConversionFails<Int8Type>("0x");
AssertConversionFails<Int8Type>("0x100");
AssertConversionFails<Int8Type>("0x1g");
}

TEST(StringConversion, ToUInt8) {
Expand All @@ -138,6 +148,16 @@ TEST(StringConversion, ToUInt8) {
AssertConversionFails<UInt8Type>("-");
AssertConversionFails<UInt8Type>("0.0");
AssertConversionFails<UInt8Type>("e");

// Hex
AssertConversion<UInt8Type>("0x0", 0);
AssertConversion<UInt8Type>("0x1A", 26);
AssertConversion<UInt8Type>("0xb", 11);
AssertConversion<UInt8Type>("0x7F", 127);
AssertConversion<UInt8Type>("0xFF", 255);
AssertConversionFails<UInt8Type>("0x");
AssertConversionFails<UInt8Type>("0x100");
AssertConversionFails<UInt8Type>("0x1g");
}

TEST(StringConversion, ToInt16) {
Expand All @@ -155,6 +175,16 @@ TEST(StringConversion, ToInt16) {
AssertConversionFails<Int16Type>("-");
AssertConversionFails<Int16Type>("0.0");
AssertConversionFails<Int16Type>("e");

// Hex
AssertConversion<Int16Type>("0x0", 0);
AssertConversion<Int16Type>("0X1aA", 426);
AssertConversion<Int16Type>("0xb", 11);
AssertConversion<Int16Type>("0x7ffF", 32767);
AssertConversion<Int16Type>("0XfffF", -1);
AssertConversionFails<Int16Type>("0x");
AssertConversionFails<Int16Type>("0x10000");
AssertConversionFails<Int16Type>("0x1g");
}

TEST(StringConversion, ToUInt16) {
Expand All @@ -172,6 +202,16 @@ TEST(StringConversion, ToUInt16) {
AssertConversionFails<UInt16Type>("-");
AssertConversionFails<UInt16Type>("0.0");
AssertConversionFails<UInt16Type>("e");

// Hex
AssertConversion<UInt16Type>("0x0", 0);
AssertConversion<UInt16Type>("0x1aA", 426);
AssertConversion<UInt16Type>("0xb", 11);
AssertConversion<UInt16Type>("0x7ffF", 32767);
AssertConversion<UInt16Type>("0xFffF", 65535);
AssertConversionFails<UInt16Type>("0x");
AssertConversionFails<UInt16Type>("0x10000");
AssertConversionFails<UInt16Type>("0x1g");
}

TEST(StringConversion, ToInt32) {
Expand All @@ -189,6 +229,18 @@ TEST(StringConversion, ToInt32) {
AssertConversionFails<Int32Type>("-");
AssertConversionFails<Int32Type>("0.0");
AssertConversionFails<Int32Type>("e");

// Hex
AssertConversion<Int32Type>("0x0", 0);
AssertConversion<Int32Type>("0x123ABC", 1194684);
AssertConversion<Int32Type>("0xA4B35", 674613);
AssertConversion<Int32Type>("0x7FFFFFFF", 2147483647);
AssertConversion<Int32Type>("0x123abc", 1194684);
AssertConversion<Int32Type>("0xA4b35", 674613);
AssertConversion<Int32Type>("0x7FFFfFfF", 2147483647);
AssertConversion<Int32Type>("0XFFFFfFfF", -1);
AssertConversionFails<Int32Type>("0X");
AssertConversionFails<Int32Type>("0x23512ak");
}

TEST(StringConversion, ToUInt32) {
Expand All @@ -206,6 +258,18 @@ TEST(StringConversion, ToUInt32) {
AssertConversionFails<UInt32Type>("-");
AssertConversionFails<UInt32Type>("0.0");
AssertConversionFails<UInt32Type>("e");

// Hex
AssertConversion<UInt32Type>("0x0", 0);
AssertConversion<UInt32Type>("0x123ABC", 1194684);
AssertConversion<UInt32Type>("0xA4B35", 674613);
AssertConversion<UInt32Type>("0x7FFFFFFF", 2147483647);
AssertConversion<UInt32Type>("0x123abc", 1194684);
AssertConversion<UInt32Type>("0xA4b35", 674613);
AssertConversion<UInt32Type>("0x7FFFfFfF", 2147483647);
AssertConversion<UInt32Type>("0XFFFFfFfF", 4294967295);
AssertConversionFails<UInt32Type>("0X");
AssertConversionFails<UInt32Type>("0x23512ak");
}

TEST(StringConversion, ToInt64) {
Expand All @@ -223,6 +287,17 @@ TEST(StringConversion, ToInt64) {
AssertConversionFails<Int64Type>("-");
AssertConversionFails<Int64Type>("0.0");
AssertConversionFails<Int64Type>("e");

// Hex
AssertConversion<Int64Type>("0x0", 0);
AssertConversion<Int64Type>("0x5415a123ABC123cb", 6058926048274359243);
AssertConversion<Int64Type>("0xA4B35", 674613);
AssertConversion<Int64Type>("0x7FFFFFFFFFFFFFFf", 9223372036854775807);
AssertConversion<Int64Type>("0XF000000000000001", -1152921504606846975);
AssertConversion<Int64Type>("0xfFFFFFFFFFFFFFFf", -1);
AssertConversionFails<Int64Type>("0X");
AssertConversionFails<Int64Type>("0x12345678901234567");
AssertConversionFails<Int64Type>("0x23512ak");
}

TEST(StringConversion, ToUInt64) {
Expand All @@ -237,6 +312,17 @@ TEST(StringConversion, ToUInt64) {
AssertConversionFails<UInt64Type>("-");
AssertConversionFails<UInt64Type>("0.0");
AssertConversionFails<UInt64Type>("e");

// Hex
AssertConversion<UInt64Type>("0x0", 0);
AssertConversion<UInt64Type>("0x5415a123ABC123cb", 6058926048274359243);
AssertConversion<UInt64Type>("0xA4B35", 674613);
AssertConversion<UInt64Type>("0x7FFFFFFFFFFFFFFf", 9223372036854775807);
AssertConversion<UInt64Type>("0XF000000000000001", 17293822569102704641ULL);
AssertConversion<UInt64Type>("0xfFFFFFFFFFFFFFFf", 18446744073709551615ULL);
AssertConversionFails<UInt64Type>("0x");
AssertConversionFails<UInt64Type>("0x12345678901234567");
AssertConversionFails<UInt64Type>("0x23512ak");
}

TEST(StringConversion, ToDate32) {
Expand Down