From 3b477d819bb8c183e6e586b4b99e39802e6a45b0 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Wed, 24 Aug 2022 16:12:12 +0200 Subject: [PATCH 01/13] Correction for fields included when reading an ORC table --- cpp/src/arrow/adapters/orc/adapter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 5af5ebccc84..18f88bc6dfb 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -411,7 +411,7 @@ class ORCFileReader::Impl { ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index")); include_indices_list.push_back(*it); } - opts->includeTypes(include_indices_list); + opts->include(include_indices_list); return Status::OK(); } From 34b8c4b96a82f56066a1a87cf8fe6d042f85ad99 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Mon, 26 Sep 2022 10:08:50 +0200 Subject: [PATCH 02/13] Change tests to reflect new behaviour --- c_glib/test/test-orc-file-reader.rb | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb index 38900cf12f3..ab7eac8f5dd 100644 --- a/c_glib/test/test-orc-file-reader.rb +++ b/c_glib/test/test-orc-file-reader.rb @@ -185,8 +185,8 @@ def all_columns test("select fields") do require_gi_bindings(3, 2, 6) @reader.field_indices = [1, 3] - assert_equal(build_table("boolean1" => build_boolean_array([false, true]), - "short1" => build_int16_array([1024, 2048])), + assert_equal(build_table("byte1" => build_int8_array([1, 100]), + "int1" => build_int32_array([65536, 65536])), @reader.read_stripes) end end @@ -200,10 +200,8 @@ def all_columns test("select fields") do require_gi_bindings(3, 2, 6) @reader.field_indices = [1, 3] - boolean1 = build_boolean_array([false, true]) - short1 = build_int16_array([1024, 2048]) - assert_equal(build_record_batch("boolean1" => boolean1, - "short1" => short1), + assert_equal(build_record_batch("byte1" => build_int8_array([1, 100]), + "int1" => build_int32_array([65536, 65536])), @reader.read_stripe(0)) end end From bb48ed3afb2df01ae0dbfef7628105cc51b82350 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Mon, 26 Sep 2022 14:22:39 +0200 Subject: [PATCH 03/13] Correct ruby tests --- ruby/red-arrow/test/test-orc.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ruby/red-arrow/test/test-orc.rb b/ruby/red-arrow/test/test-orc.rb index b882da0a1b5..e4a4d181a2b 100644 --- a/ruby/red-arrow/test/test-orc.rb +++ b/ruby/red-arrow/test/test-orc.rb @@ -164,8 +164,8 @@ def pp_values(values) ] end assert_equal([ - ["boolean1: bool", [pp_values([false, true])]], - ["short1: int16", [pp_values([1024, 2048])]], + ["byte1: int8", [pp_values([1, 100])]], + ["int1: int32", [pp_values([65536, 65536])]], ], dump) end From e7329e6808104ef145a9476f0f6ed3e7ce12382b Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 27 Sep 2022 10:54:14 +0200 Subject: [PATCH 04/13] Update c_glib/test/test-orc-file-reader.rb Co-authored-by: Antoine Pitrou --- c_glib/test/test-orc-file-reader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb index ab7eac8f5dd..6626c67c3ab 100644 --- a/c_glib/test/test-orc-file-reader.rb +++ b/c_glib/test/test-orc-file-reader.rb @@ -186,7 +186,7 @@ def all_columns require_gi_bindings(3, 2, 6) @reader.field_indices = [1, 3] assert_equal(build_table("byte1" => build_int8_array([1, 100]), - "int1" => build_int32_array([65536, 65536])), + "int1" => build_int32_array([65536, 65536])), @reader.read_stripes) end end From 82c94730d27011ca7376aa2b09d1a5dd1d208c33 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 27 Sep 2022 11:02:38 +0200 Subject: [PATCH 05/13] Fix indentation --- ruby/red-arrow/test/test-orc.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ruby/red-arrow/test/test-orc.rb b/ruby/red-arrow/test/test-orc.rb index e4a4d181a2b..4670350a09d 100644 --- a/ruby/red-arrow/test/test-orc.rb +++ b/ruby/red-arrow/test/test-orc.rb @@ -164,8 +164,8 @@ def pp_values(values) ] end assert_equal([ - ["byte1: int8", [pp_values([1, 100])]], - ["int1: int32", [pp_values([65536, 65536])]], + ["byte1: int8", [pp_values([1, 100])]], + ["int1: int32", [pp_values([65536, 65536])]], ], dump) end From 313acc9a36a72ad011072c56b8dddd9c6870f0cb Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 27 Sep 2022 11:28:00 +0200 Subject: [PATCH 06/13] Add C++ test for selection of fields in ORC import --- cpp/src/arrow/adapters/orc/adapter_test.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 1efc02bc404..7cf330dffb5 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -225,7 +225,8 @@ std::shared_ptr GenerateRandomTable(const std::shared_ptr& schema void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, const std::shared_ptr
& expected_output_table, - const int64_t max_size = kDefaultSmallMemStreamSize) { + const int64_t max_size = kDefaultSmallMemStreamSize, + std::vector* opt_selected_read_indices = nullptr) { EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, io::BufferOutputStream::Create(max_size)); auto write_options = adapters::orc::WriteOptions(); @@ -249,7 +250,9 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, ASSERT_EQ(reader->GetCompression(), write_options.compression); ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size); ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride); - EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read()); + EXPECT_OK_AND_ASSIGN(auto actual_output_table, + opt_selected_read_indices == nullptr ? + reader->Read() : reader->Read(*opt_selected_read_indices)); AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } @@ -450,6 +453,17 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) { std::shared_ptr
table = TableFromJSON(table_schema, {}); AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16); } +TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { + std::shared_ptr
table = TableFromJSON(table_schema, {R"([])"}); + std::shared_ptr schema_selected = schema( + { + field("int32", int32()), field("int64", int64()), field("float", float32()), + field("decimal128z", decimal128(32, 0)), field("date32", date32()) + }); + std::shared_ptr
table_selected = TableFromJSON(schema_selected, {R"([])"}); + std::vector selected_indices = {1,3}; + AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16, &selected_indices); +} class TestORCWriterTrivialWithConversion : public ::testing::Test { public: TestORCWriterTrivialWithConversion() { From 0c91f8c1b46792666ee406fa958cc93ca61fe15d Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 27 Sep 2022 15:26:07 +0200 Subject: [PATCH 07/13] Fix indentation and test --- cpp/src/arrow/adapters/orc/adapter_test.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 7cf330dffb5..095c154a3c7 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -225,7 +225,7 @@ std::shared_ptr
GenerateRandomTable(const std::shared_ptr& schema void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, const std::shared_ptr
& expected_output_table, - const int64_t max_size = kDefaultSmallMemStreamSize, + const int64_t max_size = kDefaultSmallMemStreamSize, std::vector* opt_selected_read_indices = nullptr) { EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, io::BufferOutputStream::Create(max_size)); @@ -251,8 +251,9 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size); ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride); EXPECT_OK_AND_ASSIGN(auto actual_output_table, - opt_selected_read_indices == nullptr ? - reader->Read() : reader->Read(*opt_selected_read_indices)); + opt_selected_read_indices == nullptr + ? reader->Read() + : reader->Read(*opt_selected_read_indices)); AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } @@ -455,14 +456,12 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) { } TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { std::shared_ptr
table = TableFromJSON(table_schema, {R"([])"}); - std::shared_ptr schema_selected = schema( - { - field("int32", int32()), field("int64", int64()), field("float", float32()), - field("decimal128z", decimal128(32, 0)), field("date32", date32()) - }); + std::shared_ptr schema_selected = + schema({field("int8", int8()), field("int32", int32())}); std::shared_ptr
table_selected = TableFromJSON(schema_selected, {R"([])"}); - std::vector selected_indices = {1,3}; - AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16, &selected_indices); + std::vector selected_indices = {1, 3}; + AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16, + &selected_indices); } class TestORCWriterTrivialWithConversion : public ::testing::Test { public: From b2b6c3af18d7bf64914291168549b92158c4e9cd Mon Sep 17 00:00:00 2001 From: LouisClt Date: Wed, 28 Sep 2022 09:27:18 +0200 Subject: [PATCH 08/13] Fix spaces+ indentation --- cpp/src/arrow/adapters/orc/adapter_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 095c154a3c7..6b8bbffa004 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -251,9 +251,9 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size); ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride); EXPECT_OK_AND_ASSIGN(auto actual_output_table, - opt_selected_read_indices == nullptr - ? reader->Read() - : reader->Read(*opt_selected_read_indices)); + opt_selected_read_indices == nullptr + ? reader->Read() + : reader->Read(*opt_selected_read_indices)); AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } @@ -456,8 +456,8 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) { } TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { std::shared_ptr
table = TableFromJSON(table_schema, {R"([])"}); - std::shared_ptr schema_selected = - schema({field("int8", int8()), field("int32", int32())}); + std::shared_ptr schema_selected = + schema({field("int8", int8()), field("int32", int32())}); std::shared_ptr
table_selected = TableFromJSON(schema_selected, {R"([])"}); std::vector selected_indices = {1, 3}; AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16, From 8677539f6bebdedf602be8bc2373ca579b88f0b7 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Fri, 14 Oct 2022 17:34:14 +0200 Subject: [PATCH 09/13] Add another test with random data and improved field selection --- cpp/src/arrow/adapters/orc/adapter_test.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 6b8bbffa004..358e9540424 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -463,6 +463,16 @@ TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16, &selected_indices); } +TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { + std::vector selected_indices = {1,14}; + random::RandomArrayGenerator rand; + auto batch = rand.BatchOf(table_schema->fields(),100); + std::shared_ptr
table = Table::Make(table_schema,batch->columns()); + ARROW_ASSIGN_OR_RAISE(auto table_selected ,table->SelectColumns(selected_indices)); + AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize, + &selected_indices); +} + class TestORCWriterTrivialWithConversion : public ::testing::Test { public: TestORCWriterTrivialWithConversion() { From fc69b344e16ef5d08d7efa6a268bacf4d8ca5da5 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Fri, 14 Oct 2022 21:28:49 +0200 Subject: [PATCH 10/13] Fix --- cpp/src/arrow/adapters/orc/adapter_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 358e9540424..fb885bd7e95 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -465,10 +465,10 @@ TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { } TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { std::vector selected_indices = {1,14}; - random::RandomArrayGenerator rand; + random::RandomArrayGenerator rand(kRandomSeed); auto batch = rand.BatchOf(table_schema->fields(),100); std::shared_ptr
table = Table::Make(table_schema,batch->columns()); - ARROW_ASSIGN_OR_RAISE(auto table_selected ,table->SelectColumns(selected_indices)); + EXPECT_OK_AND_ASSIGN(auto table_selected ,table->SelectColumns(selected_indices)); AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize, &selected_indices); } From d9c48f6951d3982fab92bca8408636ee11af6d82 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 18 Oct 2022 14:56:33 +0200 Subject: [PATCH 11/13] Fix test --- cpp/src/arrow/adapters/orc/adapter_test.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 6150153f2cb..eaf1dc9dc03 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -465,11 +465,19 @@ TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { &selected_indices); } TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { - std::vector selected_indices = {1,14}; + std::vector selected_indices = {1, 7}; random::RandomArrayGenerator rand(kRandomSeed); - auto batch = rand.BatchOf(table_schema->fields(),100); - std::shared_ptr
table = Table::Make(table_schema,batch->columns()); - EXPECT_OK_AND_ASSIGN(auto table_selected ,table->SelectColumns(selected_indices)); + std::shared_ptr localSchema = schema( + { field("bool", boolean()), field("int32",int32()), + field("int64", int64()), field("float", float32()), + field("struct", struct_({field("a", utf8()), field("b", int64())})), + field("double", float64()), field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()), + field("binary", binary()), + }); + auto batch = rand.BatchOf(localSchema->fields(), 100); + std::shared_ptr
table = Table::Make(localSchema, batch->columns()); + EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices)); AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize, &selected_indices); } From 3217212f2291fc536bcb171012232a1b5e0b2e30 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 18 Oct 2022 15:18:44 +0200 Subject: [PATCH 12/13] Fix linter --- cpp/src/arrow/adapters/orc/adapter_test.cc | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index eaf1dc9dc03..33e05f5a6f6 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -467,14 +467,18 @@ TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { std::vector selected_indices = {1, 7}; random::RandomArrayGenerator rand(kRandomSeed); - std::shared_ptr localSchema = schema( - { field("bool", boolean()), field("int32",int32()), - field("int64", int64()), field("float", float32()), - field("struct", struct_({field("a", utf8()), field("b", int64())})), - field("double", float64()), field("date32", date32()), - field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()), - field("binary", binary()), - }); + std::shared_ptr localSchema = schema({ + field("bool", boolean()), + field("int32", int32()), + field("int64", int64()), + field("float", float32()), + field("struct", struct_({field("a", utf8()), field("b", int64())})), + field("double", float64()), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary()), + }); auto batch = rand.BatchOf(localSchema->fields(), 100); std::shared_ptr
table = Table::Make(localSchema, batch->columns()); EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices)); From ec3b6202bd70fd5b86154d08184e69aecd89adc3 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 18 Oct 2022 17:21:03 +0200 Subject: [PATCH 13/13] Validate output table --- cpp/src/arrow/adapters/orc/adapter_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 33e05f5a6f6..afc4bdb1d3b 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -255,6 +255,7 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, opt_selected_read_indices == nullptr ? reader->Read() : reader->Read(*opt_selected_read_indices)); + ASSERT_OK(actual_output_table->ValidateFull()); AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } @@ -467,7 +468,7 @@ TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { std::vector selected_indices = {1, 7}; random::RandomArrayGenerator rand(kRandomSeed); - std::shared_ptr localSchema = schema({ + std::shared_ptr local_schema = schema({ field("bool", boolean()), field("int32", int32()), field("int64", int64()), @@ -479,8 +480,8 @@ TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { field("string", utf8()), field("binary", binary()), }); - auto batch = rand.BatchOf(localSchema->fields(), 100); - std::shared_ptr
table = Table::Make(localSchema, batch->columns()); + auto batch = rand.BatchOf(local_schema->fields(), 100); + std::shared_ptr
table = Table::Make(local_schema, batch->columns()); EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices)); AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize, &selected_indices);