diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index cf174dc61c8..1c4eb5dbfc6 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -331,6 +331,15 @@ class FileWriterImpl : public FileWriter {
     chunk_size = this->properties().max_row_group_length();
   }
 
+  // Reject tables with duplicate field names up front: the Parquet schema
+  // could not round-trip them unambiguously on read.
+  std::unordered_set<std::string> field_names;
+  for (const auto& field : table.fields()) {
+    // insert() returns {iterator, false} when the name was already present.
+    if (!field_names.insert(field->name()).second) {
+      return Status::Invalid("Cannot write parquet table with duplicate field names: ", field->name());
+    }
+  }
+
   auto WriteRowGroup = [&](int64_t offset, int64_t size) {
     RETURN_NOT_OK(NewRowGroup(size));
     for (int i = 0; i < table.num_columns(); i++) {
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 3dc9c3beb6e..31af68397da 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4182,6 +4182,17 @@ def file_visitor(written_file):
     assert pathlib.Path(visited_path) in expected_paths
 
 
+def test_write_table_duplicate_fields(tempdir):
+    # Writing a table whose schema repeats a field name must fail cleanly.
+    table = pa.table([
+        pa.array(range(5)),
+        pa.array(range(5)),
+    ], names=['a', 'a'])
+
+    match = "Cannot write parquet table with duplicate field names: a"
+    with pytest.raises(pa.ArrowInvalid, match=match):
+        pq.write_table(table, tempdir / 'file.parquet')
+
+
 def test_write_table_multiple_fragments(tempdir):
     table = pa.table([
         pa.array(range(10)), pa.array(np.random.randn(10)),