From a7f2ae5ecdbb93eabd7365737f532614fc130dbf Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Tue, 21 May 2024 14:34:24 -0700 Subject: [PATCH 1/4] Copy key value metadata when store_schema is false --- .../parquet/arrow/arrow_reader_writer_test.cc | 33 +++++++++++++++---- cpp/src/parquet/arrow/test_util.h | 13 +++++--- cpp/src/parquet/arrow/writer.cc | 7 +++- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index dd0b19c2ce0..b7a23d9f92e 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -649,22 +649,24 @@ class ParquetIOTestBase : public ::testing::Test { AssertArraysEqual(values, *out); } - void ReadTableFromFile(std::unique_ptr reader, bool expect_metadata, + void ReadTableFromFile(std::unique_ptr reader, bool expect_schema, std::shared_ptr* out) { ASSERT_OK_NO_THROW(reader->ReadTable(out)); auto key_value_metadata = reader->parquet_reader()->metadata()->key_value_metadata().get(); - if (!expect_metadata) { - ASSERT_EQ(nullptr, key_value_metadata); + if (!expect_schema) { + ASSERT_TRUE(key_value_metadata == nullptr || + !key_value_metadata->Contains("ARROW:schema")); } else { ASSERT_NE(nullptr, key_value_metadata); + ASSERT_TRUE(key_value_metadata->Contains("ARROW:schema")); } ASSERT_NE(nullptr, out->get()); } void ReadTableFromFile(std::unique_ptr reader, std::shared_ptr
* out) { - ReadTableFromFile(std::move(reader), /*expect_metadata=*/false, out); + ReadTableFromFile(std::move(reader), /*expect_schema=*/false, out); } void RoundTripSingleColumn( @@ -680,9 +682,9 @@ class ParquetIOTestBase : public ::testing::Test { std::shared_ptr
out; std::unique_ptr reader; ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader)); - const bool expect_metadata = arrow_properties->store_schema(); + const bool expect_schema = arrow_properties->store_schema(); ASSERT_NO_FATAL_FAILURE( - this->ReadTableFromFile(std::move(reader), expect_metadata, &out)); + this->ReadTableFromFile(std::move(reader), expect_schema, &out)); ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(table->num_rows(), out->num_rows()); @@ -889,6 +891,25 @@ TYPED_TEST(TestParquetIO, ZeroChunksTable) { ASSERT_EQ(1, out->column(0)->num_chunks()); } +TYPED_TEST(TestParquetIO, TableWithMetadata) { + auto values = + std::make_shared(::arrow::ArrayVector{}, ::arrow::int32()); + auto table = MakeSimpleTable( + values, false, ::arrow::KeyValueMetadata::Make({"foo"}, {"bar"})); + + this->ResetSink(); + ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), + this->sink_, SMALL_SIZE)); + + std::shared_ptr
out; + std::unique_ptr reader; + ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader)); + ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out)); + ASSERT_NE(nullptr, out->schema()->metadata()); + ASSERT_TRUE(out->schema()->metadata()->Contains("foo")); + ASSERT_EQ("bar", out->schema()->metadata()->Get("foo")); +} + TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { std::shared_ptr values; ASSERT_OK(NonNullArray(SMALL_SIZE, &values)); diff --git a/cpp/src/parquet/arrow/test_util.h b/cpp/src/parquet/arrow/test_util.h index b2be1b3c535..b2d59c73854 100644 --- a/cpp/src/parquet/arrow/test_util.h +++ b/cpp/src/parquet/arrow/test_util.h @@ -479,15 +479,18 @@ Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { } std::shared_ptr<::arrow::Table> MakeSimpleTable( - const std::shared_ptr& values, bool nullable) { - auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)}); + const std::shared_ptr& values, bool nullable, + std::shared_ptr<::arrow::KeyValueMetadata> metadata = nullptr) { + auto schema = ::arrow::schema( + {::arrow::field("col", values->type(), nullable)}, std::move(metadata)); return ::arrow::Table::Make(schema, {values}); } -std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr& values, - bool nullable) { +std::shared_ptr<::arrow::Table> MakeSimpleTable( + const std::shared_ptr& values, bool nullable, + std::shared_ptr<::arrow::KeyValueMetadata> metadata = nullptr) { auto carr = std::make_shared<::arrow::ChunkedArray>(values); - return MakeSimpleTable(carr, nullable); + return MakeSimpleTable(carr, nullable, std::move(metadata)); } template diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 5238986c428..72437c9e62f 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -529,7 +529,12 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo const ArrowWriterProperties& properties, std::shared_ptr* out) { if (!properties.store_schema()) { - *out = nullptr; + if (schema.metadata()) { + *out = schema.metadata()->Copy(); + } else { + // No metadata to propagate + *out = nullptr; + } return Status::OK(); } From a49079c572a1a0dfd631d6d900cf49eb9356652d Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Wed, 22 May 2024 09:36:30 -0700 Subject: [PATCH 2/4] fixup! Copy key value metadata when store_schema is false --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 14 ++++++-------- cpp/src/parquet/arrow/test_util.h | 4 ++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index b7a23d9f92e..3c9d6aef8e5 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -892,14 +892,13 @@ TYPED_TEST(TestParquetIO, ZeroChunksTable) { } TYPED_TEST(TestParquetIO, TableWithMetadata) { - auto values = - std::make_shared(::arrow::ArrayVector{}, ::arrow::int32()); - auto table = MakeSimpleTable( - values, false, ::arrow::KeyValueMetadata::Make({"foo"}, {"bar"})); + auto values = std::make_shared(::arrow::ArrayVector{}, ::arrow::int32()); + auto table = + MakeSimpleTable(values, false, ::arrow::KeyValueMetadata::Make({"foo"}, {"bar"})); this->ResetSink(); - ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), - this->sink_, SMALL_SIZE)); + ASSERT_OK_NO_THROW( + WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, SMALL_SIZE)); std::shared_ptr
out; std::unique_ptr reader; @@ -4055,8 +4054,7 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) { GTEST_SKIP() << "ARROW_TEST_DATA not set."; } std::stringstream ss; - ss << c_root << "/" - << "parquet/ARROW-17100.parquet"; + ss << c_root << "/" << "parquet/ARROW-17100.parquet"; std::string path = ss.str(); TryReadDataFile(path); } diff --git a/cpp/src/parquet/arrow/test_util.h b/cpp/src/parquet/arrow/test_util.h index b2d59c73854..50336d39871 100644 --- a/cpp/src/parquet/arrow/test_util.h +++ b/cpp/src/parquet/arrow/test_util.h @@ -481,8 +481,8 @@ Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { std::shared_ptr<::arrow::Table> MakeSimpleTable( const std::shared_ptr& values, bool nullable, std::shared_ptr<::arrow::KeyValueMetadata> metadata = nullptr) { - auto schema = ::arrow::schema( - {::arrow::field("col", values->type(), nullable)}, std::move(metadata)); + auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)}, + std::move(metadata)); return ::arrow::Table::Make(schema, {values}); } From 203d4ca8ac29855f063de690fb2e652945adee35 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Wed, 22 May 2024 10:12:12 -0700 Subject: [PATCH 3/4] fixup! Copy key value metadata when store_schema is false --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 3c9d6aef8e5..6f0f7436044 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4054,7 +4054,8 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) { GTEST_SKIP() << "ARROW_TEST_DATA not set."; } std::stringstream ss; - ss << c_root << "/" << "parquet/ARROW-17100.parquet"; + ss << c_root << "/" + << "parquet/ARROW-17100.parquet"; std::string path = ss.str(); TryReadDataFile(path); } From 3365cbf04d8ea2cc0cceb055fe14c2118a972c2c Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Tue, 28 May 2024 13:22:21 -0700 Subject: [PATCH 4/4] fixup! Copy key value metadata when store_schema is false --- .../parquet/arrow/arrow_reader_writer_test.cc | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 6f0f7436044..bccdc2b372a 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -891,24 +891,6 @@ TYPED_TEST(TestParquetIO, ZeroChunksTable) { ASSERT_EQ(1, out->column(0)->num_chunks()); } -TYPED_TEST(TestParquetIO, TableWithMetadata) { - auto values = std::make_shared(::arrow::ArrayVector{}, ::arrow::int32()); - auto table = - MakeSimpleTable(values, false, ::arrow::KeyValueMetadata::Make({"foo"}, {"bar"})); - - this->ResetSink(); - ASSERT_OK_NO_THROW( - WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, SMALL_SIZE)); - - std::shared_ptr
out; - std::unique_ptr reader; - ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader)); - ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out)); - ASSERT_NE(nullptr, out->schema()->metadata()); - ASSERT_TRUE(out->schema()->metadata()->Contains("foo")); - ASSERT_EQ("bar", out->schema()->metadata()->Get("foo")); -} - TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { std::shared_ptr values; ASSERT_OK(NonNullArray(SMALL_SIZE, &values)); @@ -5363,6 +5345,27 @@ TEST(TestArrowReadWrite, OperationsOnClosedWriter) { ASSERT_RAISES(Invalid, writer->WriteTable(*table, 1)); } +TEST(TestArrowReadWrite, TableWithMetadata) { + auto values = std::make_shared(::arrow::ArrayVector{}, ::arrow::int32()); + auto table = + MakeSimpleTable(values, false, ::arrow::KeyValueMetadata::Make({"foo"}, {"bar"})); + + auto sink = CreateOutputStream(); + ASSERT_OK_NO_THROW( + WriteTable(*table, ::arrow::default_memory_pool(), sink, SMALL_SIZE)); + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + + std::shared_ptr
out; + std::unique_ptr reader; + ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_NO_THROW(reader->ReadTable(&out)); + + ASSERT_NE(nullptr, out->schema()->metadata()); + ASSERT_TRUE(out->schema()->metadata()->Contains("foo")); + ASSERT_EQ("bar", out->schema()->metadata()->Get("foo")); +} + namespace { struct ColumnIndexObject {