From eb218b63b944c25a91b6a9f55bc96cf34afe06c3 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Mon, 22 Aug 2022 11:06:51 +0200 Subject: [PATCH 1/2] Prevent Fields with same name in Schema --- cpp/src/arrow/type.cc | 4 +++- python/pyarrow/tests/test_table.py | 14 +++----------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index efff07db667..d35021e9314 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1556,7 +1556,9 @@ Result> Schema::AddField( if (i < 0 || i > this->num_fields()) { return Status::Invalid("Invalid column index to add field."); } - + if (this->GetFieldByName(field->name()) != nullptr) { + return Status::Invalid("Column already exists: ", field->name()); + } return std::make_shared(internal::AddVectorElement(impl_->fields_, i, field), impl_->metadata_); } diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index dbd90ac907b..b4162d55cba 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1000,17 +1000,6 @@ def test_table_select_column(): table.column(4) -def test_table_column_with_duplicates(): - # ARROW-8209 - table = pa.table([pa.array([1, 2, 3]), - pa.array([4, 5, 6]), - pa.array([7, 8, 9])], names=['a', 'b', 'a']) - - with pytest.raises(KeyError, - match='Field "a" exists 2 times in table schema'): - table.column('a') - - def test_table_add_column(): data = [ pa.array(range(5)), @@ -1033,6 +1022,9 @@ def test_table_add_column(): names=('d', 'a', 'b', 'c')) assert t4.equals(expected) + with pytest.raises(pa.ArrowInvalid, match="Column already exists: a"): + table.add_column(0, 'a', pa.array(range(5))) + def test_table_set_column(): data = [ From ee88edcb190d5861fad3708a91062f241ed8ccb5 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Mon, 22 Aug 2022 12:45:52 +0200 Subject: [PATCH 2/2] Error on WriteTable if duplicate field names --- cpp/src/arrow/type.cc | 4 +--- cpp/src/parquet/arrow/writer.cc | 9 +++++++++ python/pyarrow/tests/test_dataset.py | 11 +++++++++++ python/pyarrow/tests/test_table.py | 14 +++++++++++--- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index d35021e9314..efff07db667 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1556,9 +1556,7 @@ Result> Schema::AddField( if (i < 0 || i > this->num_fields()) { return Status::Invalid("Invalid column index to add field."); } - if (this->GetFieldByName(field->name()) != nullptr) { - return Status::Invalid("Column already exists: ", field->name()); - } + return std::make_shared(internal::AddVectorElement(impl_->fields_, i, field), impl_->metadata_); } diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index cf174dc61c8..1c4eb5dbfc6 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -331,6 +331,15 @@ class FileWriterImpl : public FileWriter { chunk_size = this->properties().max_row_group_length(); } + // Cannot write with duplicate field names + std::unordered_set s; + for (auto field : table.fields()) { + if (s.count(field->name()) > 0) { + return Status::Invalid("Cannot write parquet table with duplicate field names: ", field->name()); + } + s.insert(field->name()); + } + auto WriteRowGroup = [&](int64_t offset, int64_t size) { RETURN_NOT_OK(NewRowGroup(size)); for (int i = 0; i < table.num_columns(); i++) { diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3dc9c3beb6e..31af68397da 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4182,6 +4182,17 @@ def file_visitor(written_file): assert pathlib.Path(visited_path) in expected_paths +def test_write_table_duplicate_fields(tempdir): + table = pa.table([ + pa.array(range(5)), + pa.array(range(5)), + ], names=['a', 'a']) + + match = "Cannot write parquet table with duplicate field names: a" + with pytest.raises(pa.ArrowInvalid, match=match): + pq.write_table(table, tempdir / 'file.parquet') + + def test_write_table_multiple_fragments(tempdir): table = pa.table([ pa.array(range(10)), pa.array(np.random.randn(10)), diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b4162d55cba..dbd90ac907b 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1000,6 +1000,17 @@ def test_table_select_column(): table.column(4) +def test_table_column_with_duplicates(): + # ARROW-8209 + table = pa.table([pa.array([1, 2, 3]), + pa.array([4, 5, 6]), + pa.array([7, 8, 9])], names=['a', 'b', 'a']) + + with pytest.raises(KeyError, + match='Field "a" exists 2 times in table schema'): + table.column('a') + + def test_table_add_column(): data = [ pa.array(range(5)), @@ -1022,9 +1033,6 @@ def test_table_add_column(): names=('d', 'a', 'b', 'c')) assert t4.equals(expected) - with pytest.raises(pa.ArrowInvalid, match="Column already exists: a"): - table.add_column(0, 'a', pa.array(range(5))) - def test_table_set_column(): data = [