From 1140ddad48a4eb7ce8d932e2f6ddf5e3ef8677c1 Mon Sep 17 00:00:00 2001 From: Rommel Quintanilla Date: Wed, 4 Aug 2021 09:46:07 -0500 Subject: [PATCH 1/4] ARROW-12540: [C++] Added unit tests for issue reproducing and implemented the casting support from date32/date64 to uft8/large_utf8 --- .../compute/kernels/scalar_cast_string.cc | 47 +++++++++++++++++++ .../arrow/compute/kernels/scalar_cast_test.cc | 16 ++++++- cpp/src/arrow/csv/writer_test.cc | 22 +++++---- 3 files changed, 75 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 3ce537b7223..44174681f75 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -72,6 +72,39 @@ struct NumericToStringCastFunctor { } }; +// ---------------------------------------------------------------------- +// Temporal to String + +template +struct TemporalToStringCastFunctor { + using value_type = typename TypeTraits::CType; + using BuilderType = typename TypeTraits::BuilderType; + using FormatterType = StringFormatter; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + DCHECK(out->is_array()); + const ArrayData& input = *batch[0].array(); + ArrayData* output = out->mutable_array(); + return Convert(ctx, input, output); + } + + static Status Convert(KernelContext* ctx, const ArrayData& input, ArrayData* output) { + FormatterType formatter(input.type); + BuilderType builder(input.type, ctx->memory_pool()); + RETURN_NOT_OK(VisitArrayDataInline( + input, + [&](value_type v) { + return formatter(v, [&](util::string_view v) { return builder.Append(v); }); + }, + [&]() { return builder.AppendNull(); })); + + std::shared_ptr output_array; + RETURN_NOT_OK(builder.Finish(&output_array)); + *output = std::move(*output_array->data()); + return Status::OK(); + } +}; + // ---------------------------------------------------------------------- // Binary-like to binary-like // @@ -192,6 +225,18 @@ void AddNumberToStringCasts(CastFunction* func) { } } +template +void AddTemporalToStringCasts(CastFunction* func) { + auto out_ty = TypeTraits::type_singleton(); + for (const std::shared_ptr& in_ty : TemporalTypes()) { + DCHECK_OK(func->AddKernel( + in_ty->id(), {in_ty}, out_ty, + TrivialScalarUnaryAsArraysExec( + GenerateTemporal(*in_ty)), + NullHandling::COMPUTED_NO_PREALLOCATE)); + } +} + template void AddBinaryToBinaryCast(CastFunction* func) { auto in_ty = TypeTraits::type_singleton(); @@ -226,12 +271,14 @@ std::vector> GetBinaryLikeCasts() { auto cast_string = std::make_shared("cast_string", Type::STRING); AddCommonCasts(Type::STRING, utf8(), cast_string.get()); AddNumberToStringCasts(cast_string.get()); + AddTemporalToStringCasts(cast_string.get()); AddBinaryToBinaryCast(cast_string.get()); auto cast_large_string = std::make_shared("cast_large_string", Type::LARGE_STRING); AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get()); AddNumberToStringCasts(cast_large_string.get()); + AddTemporalToStringCasts(cast_large_string.get()); AddBinaryToBinaryCast(cast_large_string.get()); auto cast_fsb = diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 9f537fecf55..dcff406db3a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -197,8 +197,8 @@ TEST(Cast, CanCast) { ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)}); ExpectCanCast(large_utf8(), {timestamp(TimeUnit::NANO)}); - ExpectCannotCast(timestamp(TimeUnit::MICRO), - kBaseBinaryTypes); // no formatting supported + // ExpectCannotCast(timestamp(TimeUnit::MICRO), + // kBaseBinaryTypes); // no formatting supported ExpectCannotCast(fixed_size_binary(3), {fixed_size_binary(3)}); // FIXME missing identity cast @@ -208,6 +208,9 @@ TEST(Cast, CanCast) { ExpectCanCast(smallint(), kNumericTypes); // any cast which is valid for storage is supported ExpectCannotCast(null(), {smallint()}); // FIXME missing common cast from null + + ExpectCanCast(date32(), {utf8(), large_utf8()}); + ExpectCanCast(date64(), {utf8(), large_utf8()}); } TEST(Cast, SameTypeZeroCopy) { @@ -1208,6 +1211,15 @@ TEST(Cast, TimeZeroCopy) { time64(TimeUnit::MICRO)); } +TEST(Cast, DateToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(date32(), "[0, null]"), + ArrayFromJSON(string_type, R"(["1970-01-01", null])")); + CheckCast(ArrayFromJSON(date64(), "[86400000, null]"), + ArrayFromJSON(string_type, R"(["1970-01-02", null])")); + } +} + TEST(Cast, DateToDate) { auto day_32 = ArrayFromJSON(date32(), "[0, null, 100, 1, 10]"); auto day_64 = ArrayFromJSON(date64(), R"([ diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index 0c7e3fdb0c5..57b42c7f5a7 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -57,20 +57,26 @@ std::vector GenerateTestCases() { {field("a", uint64())}, {field("b\"", utf8())}, {field("c ", int32())}, + {field("d", date32())}, + {field("e", date64())}, }); auto populated_batch = R"([{"a": 1, "c ": -1}, { "a": 1, "b\"": "abc\"efg", "c ": 2324}, { "b\"": "abcd", "c ": 5467}, { }, { "a": 546, "b\"": "", "c ": 517 }, - { "a": 124, "b\"": "a\"\"b\"" }])"; - std::string expected_without_header = std::string("1,,-1") + "\n" + // line 1 - +R"(1,"abc""efg",2324)" + "\n" + // line 2 - R"(,"abcd",5467)" + "\n" + // line 3 - R"(,,)" + "\n" + // line 4 - R"(546,"",517)" + "\n" + // line 5 - R"(124,"a""""b""",)" + "\n"; // line 6 - std::string expected_header = std::string(R"("a","b""","c ")") + "\n"; + { "a": 124, "b\"": "a\"\"b\"" }, + { "d": 0 }, + { "e": 86400000 }])"; + std::string expected_without_header = std::string("1,,-1,,") + "\n" + // line 1 + R"(1,"abc""efg",2324,,)" + "\n" + // line 2 + R"(,"abcd",5467,,)" + "\n" + // line 3 + R"(,,,,)" + "\n" + // line 4 + R"(546,"",517,,)" + "\n" + // line 5 + R"(124,"a""""b""",,,)" + "\n" + // line 6 + R"(,,,1970-01-01,)" + "\n" + // line 7 + R"(,,,,1970-01-02)" + "\n"; // line 8 + std::string expected_header = std::string(R"("a","b""","c ","d","e")") + "\n"; return std::vector{ {abc_schema, "[]", DefaultTestOptions(/*header=*/false), ""}, From 5f2de42eb0c86aafee1713207c7eaabc88bc42bb Mon Sep 17 00:00:00 2001 From: Rommel Quintanilla Date: Wed, 4 Aug 2021 17:01:38 -0500 Subject: [PATCH 2/4] ARROW-12540: [C++] Added more tests for casting temporal types to strings --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index dcff406db3a..5d7bc38b745 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -197,8 +197,7 @@ TEST(Cast, CanCast) { ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)}); ExpectCanCast(large_utf8(), {timestamp(TimeUnit::NANO)}); - // ExpectCannotCast(timestamp(TimeUnit::MICRO), - // kBaseBinaryTypes); // no formatting supported + ExpectCannotCast(timestamp(TimeUnit::MICRO), {binary(), large_binary()}); // no formatting supported ExpectCannotCast(fixed_size_binary(3), {fixed_size_binary(3)}); // FIXME missing identity cast @@ -211,6 +210,11 @@ TEST(Cast, CanCast) { ExpectCanCast(date32(), {utf8(), large_utf8()}); ExpectCanCast(date64(), {utf8(), large_utf8()}); + ExpectCanCast(timestamp(TimeUnit::NANO), {utf8(), large_utf8()}); + ExpectCanCast(timestamp(TimeUnit::MICRO), {utf8(), large_utf8()}); + ExpectCanCast(time32(TimeUnit::MILLI), {utf8(), large_utf8()}); + ExpectCanCast(time64(TimeUnit::NANO), {utf8(), large_utf8()}); + //ExpectCanCast(duration(TimeUnit::SECOND), {utf8(), large_utf8()}); //FIXME } TEST(Cast, SameTypeZeroCopy) { From 3c7549e883d45df4efbbbc60792761e1b7728550 Mon Sep 17 00:00:00 2001 From: Rommel Quintanilla Date: Thu, 5 Aug 2021 11:00:22 -0500 Subject: [PATCH 3/4] ARROW-12540: [C++] Added tests for casting time and timestamp types to strings --- .../arrow/compute/kernels/scalar_cast_test.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 5d7bc38b745..1fdeb202fbe 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -214,7 +214,6 @@ TEST(Cast, CanCast) { ExpectCanCast(timestamp(TimeUnit::MICRO), {utf8(), large_utf8()}); ExpectCanCast(time32(TimeUnit::MILLI), {utf8(), large_utf8()}); ExpectCanCast(time64(TimeUnit::NANO), {utf8(), large_utf8()}); - //ExpectCanCast(duration(TimeUnit::SECOND), {utf8(), large_utf8()}); //FIXME } TEST(Cast, SameTypeZeroCopy) { @@ -1224,6 +1223,22 @@ TEST(Cast, DateToString) { } } +TEST(Cast, TimeToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(time32(TimeUnit::SECOND), "[1, 62]"), + ArrayFromJSON(string_type, R"(["00:00:01", "00:01:02"])")); + CheckCast(ArrayFromJSON(time64(TimeUnit::NANO), "[0, 1]"), + ArrayFromJSON(string_type, R"(["00:00:00.000000000", "00:00:00.000000001"])")); + } +} + +TEST(Cast, TimestampToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"), + ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); + } +} + TEST(Cast, DateToDate) { auto day_32 = ArrayFromJSON(date32(), "[0, null, 100, 1, 10]"); auto day_64 = ArrayFromJSON(date64(), R"([ From 626dc1ebaec7530fee85b0af01b298cc7eca6a28 Mon Sep 17 00:00:00 2001 From: Rommel Quintanilla Date: Thu, 5 Aug 2021 11:03:57 -0500 Subject: [PATCH 4/4] ARROW-12540: [C++] Fix formatting --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 1fdeb202fbe..1b6c862648e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -197,7 +197,8 @@ TEST(Cast, CanCast) { ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)}); ExpectCanCast(large_utf8(), {timestamp(TimeUnit::NANO)}); - ExpectCannotCast(timestamp(TimeUnit::MICRO), {binary(), large_binary()}); // no formatting supported + ExpectCannotCast(timestamp(TimeUnit::MICRO), + {binary(), large_binary()}); // no formatting supported ExpectCannotCast(fixed_size_binary(3), {fixed_size_binary(3)}); // FIXME missing identity cast @@ -1227,15 +1228,17 @@ TEST(Cast, TimeToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(time32(TimeUnit::SECOND), "[1, 62]"), ArrayFromJSON(string_type, R"(["00:00:01", "00:01:02"])")); - CheckCast(ArrayFromJSON(time64(TimeUnit::NANO), "[0, 1]"), - ArrayFromJSON(string_type, R"(["00:00:00.000000000", "00:00:00.000000001"])")); + CheckCast( + ArrayFromJSON(time64(TimeUnit::NANO), "[0, 1]"), + ArrayFromJSON(string_type, R"(["00:00:00.000000000", "00:00:00.000000001"])")); } } TEST(Cast, TimestampToString) { for (auto string_type : {utf8(), large_utf8()}) { - CheckCast(ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"), - ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); + CheckCast( + ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"), + ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); } }