Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 24 additions & 19 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -315,42 +315,43 @@ class TestCast : public TestBase {
CastOptions options;
auto src_type = TypeTraits<SourceType>::type_singleton();

std::vector<bool> is_valid = {true, false, true, true, true};
std::vector<bool> is_valid = {true, false, true, true, true, true, true, true};

// string to int
std::vector<std::string> v_int = {"0", "1", "127", "-1", "0"};
std::vector<int8_t> e_int8 = {0, 1, 127, -1, 0};
std::vector<int16_t> e_int16 = {0, 1, 127, -1, 0};
std::vector<int32_t> e_int32 = {0, 1, 127, -1, 0};
std::vector<int64_t> e_int64 = {0, 1, 127, -1, 0};
std::vector<std::string> v_int = {"0", "1", "127", "-1", "0", "0x0", "0x7f", "0XFF"};
std::vector<int8_t> e_int8 = {0, 1, 127, -1, 0, 0, 127, -1};
std::vector<int16_t> e_int16 = {0, 1, 127, -1, 0, 0, 127, -1};
std::vector<int32_t> e_int32 = {0, 1, 127, -1, 0, 0, 127, -1};
std::vector<int64_t> e_int64 = {0, 1, 127, -1, 0, 0, 127, -1};
CheckCase<SourceType, Int8Type>(v_int, is_valid, e_int8, options);
CheckCase<SourceType, Int16Type>(v_int, is_valid, e_int16, options);
CheckCase<SourceType, Int32Type>(v_int, is_valid, e_int32, options);
CheckCase<SourceType, Int64Type>(v_int, is_valid, e_int64, options);

v_int = {"2147483647", "0", "-2147483648", "0", "0"};
e_int32 = {2147483647, 0, -2147483648LL, 0, 0};
v_int = {"2147483647", "0", "-2147483648", "0", "0", "0x7fff", "0X0", "0xF000"};
e_int32 = {2147483647, 0, -2147483648LL, 0, 0, 2147483647, 0, -2147483648LL};
CheckCase<SourceType, Int32Type>(v_int, is_valid, e_int32, options);
v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"};
e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0};
v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0", "0x7FFFFFFF", "0x0", "0Xffffffff"};
e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0,
9223372036854775807LL, 0, (-9223372036854775807LL - 1)};
CheckCase<SourceType, Int64Type>(v_int, is_valid, e_int64, options);

// string to uint
std::vector<std::string> v_uint = {"0", "1", "127", "255", "0"};
std::vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0};
std::vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0};
std::vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0};
std::vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0};
std::vector<std::string> v_uint = {"0", "1", "127", "255", "0", "0x0", "0x7f", "0XFF"};
std::vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0, 0, 127, 255};
std::vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0, 0, 127, 255};
std::vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0, 0, 127, 255};
std::vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0, 0, 127, 255};
CheckCase<SourceType, UInt8Type>(v_uint, is_valid, e_uint8, options);
CheckCase<SourceType, UInt16Type>(v_uint, is_valid, e_uint16, options);
CheckCase<SourceType, UInt32Type>(v_uint, is_valid, e_uint32, options);
CheckCase<SourceType, UInt64Type>(v_uint, is_valid, e_uint64, options);

v_uint = {"4294967295", "0", "0", "0", "0"};
e_uint32 = {4294967295, 0, 0, 0, 0};
v_uint = {"4294967295", "0", "0", "0", "0", "0xFFFF", "0x0", "0x1"};
e_uint32 = {4294967295, 0, 0, 0, 0, 4294967295, 0, 1};
CheckCase<SourceType, UInt32Type>(v_uint, is_valid, e_uint32, options);
v_uint = {"18446744073709551615", "0", "0", "0", "0"};
e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0};
v_uint = {"18446744073709551615", "0", "0", "0", "0", "0xffffFFFF", "0x0", "0x1"};
e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0, 18446744073709551615ULL, 0, 1};
CheckCase<SourceType, UInt64Type>(v_uint, is_valid, e_uint64, options);

// string to float
Expand Down Expand Up @@ -1554,6 +1555,10 @@ TEST_F(TestCast, StringToNumberErrors) {
CheckFails<StringType>({"128"}, is_valid, int8(), options);
CheckFails<StringType>({"-129"}, is_valid, int8(), options);
CheckFails<StringType>({"0.5"}, is_valid, int8(), options);
CheckFails<StringType>({"0x"}, is_valid, int8(), options);
CheckFails<StringType>({"0x12r"}, is_valid, int32(), options);
CheckFails<StringType>({"0x1ffff"}, is_valid, int32(), options);
CheckFails<StringType>({"-0x123"}, is_valid, int32(), options);

CheckFails<StringType>({"256"}, is_valid, uint8(), options);
CheckFails<StringType>({"-1"}, is_valid, uint8(), options);
Expand Down
50 changes: 50 additions & 0 deletions cpp/src/arrow/util/value_parsing_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,27 @@ static std::vector<std::string> MakeIntStrings(int32_t num_items) {
return strings;
}

// Builds `num_items` "0x"-prefixed hexadecimal strings for benchmarking.
//
// The result cycles through four base strings: "0x0", "0xA5", "0x5E" and a
// full-width literal made of two hex digits per byte of `c_int`
// (e.g. "0x01234567" when sizeof(c_int) == 4). Returns an empty vector when
// `num_items` is not positive.
template <typename c_int>
static std::vector<std::string> MakeHexStrings(int32_t num_items) {
  static const char kHexDigits[] = "0123456789ABCDEF";
  const size_t kNumHexDigits = sizeof(kHexDigits) - 1;  // exclude the trailing NUL

  // Two hex digits per byte of c_int. The digit index wraps around the table
  // so that a type wider than 8 bytes never reads past the end of kHexDigits.
  const size_t num_digits = sizeof(c_int) * 2;
  std::string large_hex = "0x";
  large_hex.reserve(num_digits + 2);
  for (size_t i = 0; i < num_digits; ++i) {
    large_hex.push_back(kHexDigits[i % kNumHexDigits]);
  }

  const std::vector<std::string> base_strings = {"0x0", "0xA5", "0x5E", large_hex};
  std::vector<std::string> strings;
  if (num_items > 0) {
    strings.reserve(static_cast<size_t>(num_items));
  }
  for (int32_t i = 0; i < num_items; ++i) {
    // `i` is non-negative here, so the cast keeps the modulo purely unsigned.
    strings.push_back(base_strings[static_cast<size_t>(i) % base_strings.size()]);
  }
  return strings;
}


static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
std::vector<std::string> base_strings = {"0.0", "5", "-12.3",
"98765430000", "3456.789", "0.0012345",
Expand Down Expand Up @@ -123,6 +144,26 @@ static void IntegerParsing(benchmark::State& state) { // NOLINT non-const refer
state.SetItemsProcessed(state.iterations() * strings.size());
}

// Benchmarks ParseValue<ARROW_TYPE> over 1000 "0x"-prefixed strings produced by
// MakeHexStrings<C_TYPE>. Writes a message to stderr and aborts the process if
// any string fails to parse. Reports the number of strings parsed as the
// processed-item count.
template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
static void HexParsing(benchmark::State& state) {  // NOLINT non-const reference
  auto strings = MakeHexStrings<C_TYPE>(1000);

  while (state.KeepRunning()) {
    // Accumulate in an unsigned 64-bit integer: adding the parsed values up in
    // a signed C_TYPE can overflow, which is undefined behaviour, whereas
    // unsigned arithmetic simply wraps. The sum only exists to keep the parsed
    // values observable.
    uint64_t total = 0;
    for (const auto& s : strings) {
      C_TYPE value;
      if (!ParseValue<ARROW_TYPE>(s.data(), s.length(), &value)) {
        std::cerr << "Conversion failed for '" << s << "'";
        std::abort();
      }
      total += static_cast<uint64_t>(value);
    }
    benchmark::DoNotOptimize(total);
  }
  state.SetItemsProcessed(state.iterations() * strings.size());
}


template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
static void FloatParsing(benchmark::State& state) { // NOLINT non-const reference
auto strings = MakeFloatStrings(1000);
Expand Down Expand Up @@ -230,6 +271,15 @@ BENCHMARK_TEMPLATE(IntegerParsing, UInt16Type);
BENCHMARK_TEMPLATE(IntegerParsing, UInt32Type);
BENCHMARK_TEMPLATE(IntegerParsing, UInt64Type);

// Instantiate the HexParsing benchmark once per Arrow integer type: the four
// signed widths (8/16/32/64 bits) followed by the four unsigned widths.
BENCHMARK_TEMPLATE(HexParsing, Int8Type);
BENCHMARK_TEMPLATE(HexParsing, Int16Type);
BENCHMARK_TEMPLATE(HexParsing, Int32Type);
BENCHMARK_TEMPLATE(HexParsing, Int64Type);
BENCHMARK_TEMPLATE(HexParsing, UInt8Type);
BENCHMARK_TEMPLATE(HexParsing, UInt16Type);
BENCHMARK_TEMPLATE(HexParsing, UInt32Type);
BENCHMARK_TEMPLATE(HexParsing, UInt64Type);

BENCHMARK_TEMPLATE(FloatParsing, FloatType);
BENCHMARK_TEMPLATE(FloatParsing, DoubleType);

Expand Down
77 changes: 77 additions & 0 deletions cpp/src/arrow/util/value_parsing_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,17 @@ TEST(StringConversion, ToInt8) {
AssertConversionFails<Int8Type>("-");
AssertConversionFails<Int8Type>("0.0");
AssertConversionFails<Int8Type>("e");

// Hex
AssertConversion<Int8Type>("0x0", 0);
AssertConversion<Int8Type>("0X1A", 26);
AssertConversion<Int8Type>("0xb", 11);
AssertConversion<Int8Type>("0x7F", 127);
AssertConversion<Int8Type>("0xFF", -1);
AssertConversionFails<Int8Type>("0x");
AssertConversionFails<Int8Type>("0x100");
AssertConversionFails<Int8Type>("0x1g");

}

TEST(StringConversion, ToUInt8) {
Expand Down Expand Up @@ -155,6 +166,16 @@ TEST(StringConversion, ToInt16) {
AssertConversionFails<Int16Type>("-");
AssertConversionFails<Int16Type>("0.0");
AssertConversionFails<Int16Type>("e");

// Hex
AssertConversion<Int16Type>("0x0", 0);
AssertConversion<Int16Type>("0X1aA", 426);
AssertConversion<Int16Type>("0xb", 11);
AssertConversion<Int16Type>("0x7ffF", 32767);
AssertConversion<Int16Type>("0XfffF", -1);
AssertConversionFails<Int16Type>("0x");
AssertConversionFails<Int16Type>("0x10000");
AssertConversionFails<Int16Type>("0x1g");
}

TEST(StringConversion, ToUInt16) {
Expand All @@ -172,6 +193,16 @@ TEST(StringConversion, ToUInt16) {
AssertConversionFails<UInt16Type>("-");
AssertConversionFails<UInt16Type>("0.0");
AssertConversionFails<UInt16Type>("e");

// Hex
AssertConversion<UInt16Type>("0x0", 0);
AssertConversion<UInt16Type>("0x1aA", 426);
AssertConversion<UInt16Type>("0xb", 11);
AssertConversion<UInt16Type>("0x7ffF", 32767);
AssertConversion<UInt16Type>("0xFffF", 65535);
AssertConversionFails<UInt16Type>("0x");
AssertConversionFails<UInt16Type>("0x10000");
AssertConversionFails<UInt16Type>("0x1g");
}

TEST(StringConversion, ToInt32) {
Expand All @@ -189,6 +220,18 @@ TEST(StringConversion, ToInt32) {
AssertConversionFails<Int32Type>("-");
AssertConversionFails<Int32Type>("0.0");
AssertConversionFails<Int32Type>("e");

// Hex
AssertConversion<Int32Type>("0x0", 0);
AssertConversion<Int32Type>("0x123ABC", 1194684);
AssertConversion<Int32Type>("0xA4B35", 674613);
AssertConversion<Int32Type>("0x7FFFFFFF", 2147483647);
AssertConversion<Int32Type>("0x123abc", 1194684);
AssertConversion<Int32Type>("0xA4b35", 674613);
AssertConversion<Int32Type>("0x7FFFfFfF", 2147483647);
AssertConversion<Int32Type>("0XFFFFfFfF", -1);
AssertConversionFails<Int32Type>("0X");
AssertConversionFails<Int32Type>("0x23512ak");
}

TEST(StringConversion, ToUInt32) {
Expand All @@ -206,6 +249,18 @@ TEST(StringConversion, ToUInt32) {
AssertConversionFails<UInt32Type>("-");
AssertConversionFails<UInt32Type>("0.0");
AssertConversionFails<UInt32Type>("e");

// Hex
AssertConversion<UInt32Type>("0x0", 0);
AssertConversion<UInt32Type>("0x123ABC", 1194684);
AssertConversion<UInt32Type>("0xA4B35", 674613);
AssertConversion<UInt32Type>("0x7FFFFFFF", 2147483647);
AssertConversion<UInt32Type>("0x123abc", 1194684);
AssertConversion<UInt32Type>("0xA4b35", 674613);
AssertConversion<UInt32Type>("0x7FFFfFfF", 2147483647);
AssertConversion<UInt32Type>("0XFFFFfFfF", 4294967295);
AssertConversionFails<UInt32Type>("0X");
AssertConversionFails<UInt32Type>("0x23512ak");
}

TEST(StringConversion, ToInt64) {
Expand All @@ -223,6 +278,17 @@ TEST(StringConversion, ToInt64) {
AssertConversionFails<Int64Type>("-");
AssertConversionFails<Int64Type>("0.0");
AssertConversionFails<Int64Type>("e");

// Hex
AssertConversion<Int64Type>("0x0", 0);
AssertConversion<Int64Type>("0x5415a123ABC123cb", 6058926048274359243);
AssertConversion<Int64Type>("0xA4B35", 674613);
AssertConversion<Int64Type>("0x7FFFFFFFFFFFFFFf", 9223372036854775807);
AssertConversion<Int64Type>("0XF000000000000001", -1152921504606846975);
AssertConversion<Int64Type>("0xfFFFFFFFFFFFFFFf", -1);
AssertConversionFails<Int64Type>("0X");
AssertConversionFails<Int64Type>("0x12345678901234567");
AssertConversionFails<Int64Type>("0x23512ak");
}

TEST(StringConversion, ToUInt64) {
Expand All @@ -237,6 +303,17 @@ TEST(StringConversion, ToUInt64) {
AssertConversionFails<UInt64Type>("-");
AssertConversionFails<UInt64Type>("0.0");
AssertConversionFails<UInt64Type>("e");

// Hex
AssertConversion<UInt64Type>("0x0", 0);
AssertConversion<UInt64Type>("0x5415a123ABC123cb", 6058926048274359243);
AssertConversion<UInt64Type>("0xA4B35", 674613);
AssertConversion<UInt64Type>("0x7FFFFFFFFFFFFFFf", 9223372036854775807);
AssertConversion<UInt64Type>("0XF000000000000001", 17293822569102704641ULL);
AssertConversion<UInt64Type>("0xfFFFFFFFFFFFFFFf", 18446744073709551615ULL);
AssertConversionFails<UInt64Type>("0x");
AssertConversionFails<UInt64Type>("0x12345678901234567");
AssertConversionFails<UInt64Type>("0x23512ak");
}

TEST(StringConversion, ToTimestampDate_ISO8601) {
Expand Down