From d614dd4c1029bbcbb905dbdac933cabff945b2cf Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 4 Dec 2017 21:02:09 +0900 Subject: [PATCH 1/8] ENH: Add casting implementations from strings to numbers or boolean --- cpp/src/arrow/compute/kernels/cast.cc | 113 ++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 465be958cfa..36f2a979283 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -17,6 +17,9 @@ #include "arrow/compute/kernels/cast.h" +#include +#include +#include #include #include #include @@ -660,6 +663,100 @@ struct CastFunctor +struct CastFunctor::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + using out_type = typename O::c_type; + StringArray input_array(input.Copy()); + + if (input_array.null_count() > 0) { + std::stringstream ss; + ss << "Failed to cast NA into " << output->type->ToString(); + ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); + return; + } + + auto out_data = GetMutableValues(output, 1); + + std::function cast_func; + if (output->type->id() == Type::INT8 || output->type->id() == Type::UINT8) { + cast_func = [](const std::string& s) { + return boost::numeric_cast(boost::lexical_cast(s)); + }; + } else { + cast_func = [](const std::string& s) { return boost::lexical_cast(s); }; + } + + for (int64_t i = 0; i < input.length; ++i) { + std::string s = input_array.GetString(i); + + try { + *out_data++ = cast_func(s); + } catch (...) { + std::stringstream ss; + ss << "Failed to cast String '" << s << "' into " << output->type->ToString(); + ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); + return; + } + } + } +}; + +// ---------------------------------------------------------------------- +// String to Boolean + +template +struct CastFunctor::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + StringArray input_array(input.Copy()); + internal::BitmapWriter writer(output->buffers[1]->mutable_data(), output->offset, + input.length); + + if (input_array.null_count() > 0) { + std::stringstream ss; + ss << "Failed to cast NA into " << output->type->ToString(); + ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); + return; + } + + for (int64_t i = 0; i < input.length; ++i) { + auto s = input_array.GetString(i); + auto s_lower = boost::algorithm::to_lower_copy(s); + bool flag; + + if (s_lower == "true") { + flag = true; + } else if (s_lower == "false") { + flag = false; + } else { + try { + flag = boost::lexical_cast(s); + } catch (...) { + std::stringstream ss; + ss << "Failed to cast String '" << s << "' into " << output->type->ToString(); + ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); + return; + } + } + + if (flag) { + writer.Set(); + } else { + writer.Clear(); + } + writer.Next(); + } + writer.Finish(); + } +}; + // ---------------------------------------------------------------------- typedef std::function& CAST_FUNCTION_CASE(Time32Type); CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); + CAST_FUNCTION_CASE(StringType); CAST_FUNCTION_CASE(DictionaryType); default: break; From 8a6470c844586c6f1f4121c6ccdbce28d13ebb73 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 4 Dec 2017 22:45:40 +0900 Subject: [PATCH 2/8] TST: Add tests for casting from strings to numbers or boolean --- cpp/src/arrow/compute/compute-test.cc | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 84af8f7c6b0..dae5ded35bd 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -769,6 +769,65 @@ TEST_F(TestCast, OffsetOutputBuffer) { int16(), e3); } +TEST_F(TestCast, StringToBoolean) { + CastOptions options; + + vector is_valid = {true, true, true, true, true}; + + vector v1 = {"False", "true", "true", "True", "false"}; + vector v2 = {"0", "1", "1", "1", "0"}; + vector e = {false, true, true, true, false}; + CheckCase(utf8(), v1, is_valid, boolean(), + e, options); + CheckCase(utf8(), v2, is_valid, boolean(), + e, options); +} + +TEST_F(TestCast, StringToNumber) { + CastOptions options; + + vector is_valid = {true, true, true, true, true}; + + // string to int + vector v_int = {"0", "1", "127", "-1", "0"}; + vector e_int8 = {0, 1, 127, -1, 0}; + vector e_int16 = {0, 1, 127, -1, 0}; + vector e_int32 = {0, 1, 127, -1, 0}; + vector e_int64 = {0, 1, 127, -1, 0}; + CheckCase(utf8(), v_int, is_valid, int8(), + e_int8, options); + CheckCase(utf8(), v_int, is_valid, int16(), + e_int16, options); + CheckCase(utf8(), v_int, is_valid, int32(), + e_int32, options); + CheckCase(utf8(), v_int, is_valid, int64(), + e_int64, options); + + // string to uint + vector v_uint = {"0", "1", "127", "255", "0"}; + vector e_uint8 = {0, 1, 127, 255, 0}; + vector e_uint16 = {0, 1, 127, 255, 0}; + vector e_uint32 = {0, 1, 127, 255, 0}; + vector e_uint64 = {0, 1, 127, 255, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint8(), e_uint8, options); + CheckCase(utf8(), v_uint, is_valid, + uint16(), e_uint16, options); + CheckCase(utf8(), v_uint, is_valid, + uint32(), e_uint32, options); + CheckCase(utf8(), v_uint, is_valid, + uint64(), e_uint64, options); + + // string to float + vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; + vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; + vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; + CheckCase(utf8(), v_float, is_valid, + float32(), e_float, options); + CheckCase(utf8(), v_float, is_valid, + float64(), e_double, options); +} + template class TestDictionaryCast : public TestCast {}; From 32a91709a6c3e4551d4db27cf02c9c38dcad5ebc Mon Sep 17 00:00:00 2001 From: Licht-T Date: Tue, 5 Dec 2017 08:25:19 +0900 Subject: [PATCH 3/8] Add /bigobj into CMAKE_CXX_FLAGS if the compiler is MSVC --- cpp/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f4b7b29b9d3..5ba3b6cd93d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -268,6 +268,11 @@ include(ThirdpartyToolchain) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMMON_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARROW_CXXFLAGS}") +if (MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") +endif() + + message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") if ("${COMPILER_FAMILY}" STREQUAL "clang") From 5d3871d456aea773d899ba5b2d29f4c821c78a9c Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 30 Dec 2017 13:40:29 +0900 Subject: [PATCH 4/8] ENH: Add String to Number/Boolean cast support with NA --- cpp/src/arrow/compute/kernels/cast.cc | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 1f7b19c071f..9eeb3ddeec5 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -717,13 +717,6 @@ struct CastFunctor 0) { - std::stringstream ss; - ss << "Failed to cast NA into " << output->type->ToString(); - ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); - return; - } - auto out_data = GetMutableValues(output, 1); std::function cast_func; @@ -736,6 +729,11 @@ struct CastFunctorbuffers[1]->mutable_data(), output->offset, input.length); - if (input_array.null_count() > 0) { - std::stringstream ss; - ss << "Failed to cast NA into " << output->type->ToString(); - ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); - return; - } - for (int64_t i = 0; i < input.length; ++i) { + if (input_array.IsNull(i)) { + writer.Next(); + continue; + } + auto s = input_array.GetString(i); auto s_lower = boost::algorithm::to_lower_copy(s); bool flag; From 24dec32bb6ab9fda9bc92f5c5112d2a2fc354d19 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 30 Dec 2017 13:55:49 +0900 Subject: [PATCH 5/8] TST: Add test of the String to Number/Boolean cast with NA --- cpp/src/arrow/compute/compute-test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 23261bae49d..dc88e54b3bf 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -772,7 +772,7 @@ TEST_F(TestCast, OffsetOutputBuffer) { TEST_F(TestCast, StringToBoolean) { CastOptions options; - vector is_valid = {true, true, true, true, true}; + vector is_valid = {true, false, true, true, true}; vector v1 = {"False", "true", "true", "True", "false"}; vector v2 = {"0", "1", "1", "1", "0"}; @@ -786,7 +786,7 @@ TEST_F(TestCast, StringToBoolean) { TEST_F(TestCast, StringToNumber) { CastOptions options; - vector is_valid = {true, true, true, true, true}; + vector is_valid = {true, false, true, true, true}; // string to int vector v_int = {"0", "1", "127", "-1", "0"}; From fbd590769e4dbf2fc8040527007ccecc604ad84b Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 30 Dec 2017 21:43:24 +0900 Subject: [PATCH 6/8] Replace lambda cast-function to template one --- cpp/src/arrow/compute/kernels/cast.cc | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 9eeb3ddeec5..e03c604e343 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -709,6 +709,23 @@ struct CastFunctor +typename std::enable_if::value && !std::is_same::value && + !std::is_same::value, + T>::type +castStringToNumeric(const std::string& s) { + return boost::lexical_cast(s); +} + +template +typename std::enable_if::value || std::is_same::value, + T>::type +castStringToNumeric(const std::string& s) { + // Convert to int before casting to T + // because boost::lexical_cast does not support 8bit int/uint. + return boost::numeric_cast(boost::lexical_cast(s)); +} + template struct CastFunctor::value>::type> { @@ -720,13 +737,6 @@ struct CastFunctor(output, 1); std::function cast_func; - if (output->type->id() == Type::INT8 || output->type->id() == Type::UINT8) { - cast_func = [](const std::string& s) { - return boost::numeric_cast(boost::lexical_cast(s)); - }; - } else { - cast_func = [](const std::string& s) { return boost::lexical_cast(s); }; - } for (int64_t i = 0; i < input.length; ++i) { if (input_array.IsNull(i)) { @@ -737,7 +747,7 @@ struct CastFunctor(s); } catch (...) { std::stringstream ss; ss << "Failed to cast String '" << s << "' into " << output->type->ToString(); From 6020f3df79606b52a835afac57153f61bd9abafb Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 7 May 2018 15:59:10 +0900 Subject: [PATCH 7/8] Capitalize castStringToNumeric method --- cpp/src/arrow/compute/kernels/cast.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 22cf7149fe1..3a6c2d3e0a0 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -745,14 +745,14 @@ template typename std::enable_if::value && !std::is_same::value && !std::is_same::value, T>::type -castStringToNumeric(const std::string& s) { +CastStringToNumeric(const std::string& s) { return boost::lexical_cast(s); } template typename std::enable_if::value || std::is_same::value, T>::type -castStringToNumeric(const std::string& s) { +CastStringToNumeric(const std::string& s) { // Convert to int before casting to T // because boost::lexical_cast does not support 8bit int/uint. return boost::numeric_cast(boost::lexical_cast(s)); @@ -779,7 +779,7 @@ struct CastFunctor(s); + *out_data++ = CastStringToNumeric(s); } catch (...) { std::stringstream ss; ss << "Failed to cast String '" << s << "' into " << output->type->ToString(); From fdbd3e0be3419dc758232494a1bd528e0d86aeda Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 7 May 2018 15:59:55 +0900 Subject: [PATCH 8/8] Remove unused variable --- cpp/src/arrow/compute/kernels/cast.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 3a6c2d3e0a0..138d7e01388 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -768,8 +768,6 @@ struct CastFunctor(output, 1); - std::function cast_func; - for (int64_t i = 0; i < input.length; ++i) { if (input_array.IsNull(i)) { out_data++;