Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 37 additions & 3 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include "parquet/test_util.h"

using arrow::Array;
using arrow::ArrayData;
using arrow::ArrayVisitor;
using arrow::Buffer;
using arrow::ChunkedArray;
Expand Down Expand Up @@ -686,7 +687,7 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) {
ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out));
ASSERT_EQ(1, out->num_columns());
ASSERT_EQ(100, out->num_rows());
EXPECT_EQ(100, out->num_rows());

std::shared_ptr<ChunkedArray> chunked_array = out->column(0);
ASSERT_EQ(1, chunked_array->num_chunks());
Expand Down Expand Up @@ -1085,8 +1086,8 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
}

std::vector<std::shared_ptr<Buffer>> buffers{values->null_bitmap(), int64_data};
auto arr_data = std::make_shared<::arrow::ArrayData>(::arrow::int64(), values->length(),
buffers, values->null_count());
auto arr_data = std::make_shared<ArrayData>(::arrow::int64(), values->length(), buffers,
values->null_count());
std::shared_ptr<Array> expected_values = MakeArray(arr_data);
ASSERT_NE(expected_values, NULLPTR);

Expand Down Expand Up @@ -2360,6 +2361,39 @@ TEST(ArrowReadWrite, SingleColumnNullableStruct) {
3);
}

TEST(ArrowReadWrite, NestedRequiredField) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test cases look very very similar, just the name and the used values differ. I would have expected that we also would have set nullable=false somewhere in this one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you looking at the latest version?

Right below this comment is:

auto int_field = ::arrow::field("int_array", ::arrow::int32(), /*nullable=*/false);

(note the last parameter?)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, didn't see that when reviewing this code. Now this makes sense!

auto int_field = ::arrow::field("int_array", ::arrow::int32(), /*nullable=*/false);
auto int_array = ::arrow::ArrayFromJSON(int_field->type(), "[0, 1, 2, 3, 4, 5, 7, 8]");
auto struct_field =
::arrow::field("root", ::arrow::struct_({int_field}), /*nullable=*/true);
std::shared_ptr<Buffer> validity_bitmap;
ASSERT_OK_AND_ASSIGN(validity_bitmap, ::arrow::AllocateBitmap(8));
validity_bitmap->mutable_data()[0] = 0xCC;

auto struct_data = ArrayData::Make(struct_field->type(), /*length=*/8,
{validity_bitmap}, {int_array->data()});
CheckSimpleRoundtrip(::arrow::Table::Make(::arrow::schema({struct_field}),
{::arrow::MakeArray(struct_data)}),
/*row_group_size=*/8);
}

TEST(ArrowReadWrite, NestedNullableField) {
auto int_field = ::arrow::field("int_array", ::arrow::int32());
auto int_array =
::arrow::ArrayFromJSON(int_field->type(), "[0, null, 2, null, 4, 5, null, 8]");
auto struct_field =
::arrow::field("root", ::arrow::struct_({int_field}), /*nullable=*/true);
std::shared_ptr<Buffer> validity_bitmap;
ASSERT_OK_AND_ASSIGN(validity_bitmap, ::arrow::AllocateBitmap(8));
validity_bitmap->mutable_data()[0] = 0xCC;

auto struct_data = ArrayData::Make(struct_field->type(), /*length=*/8,
{validity_bitmap}, {int_array->data()});
CheckSimpleRoundtrip(::arrow::Table::Make(::arrow::schema({struct_field}),
{::arrow::MakeArray(struct_data)}),
/*row_group_size=*/8);
}

TEST(TestArrowReadWrite, CanonicalNestedRoundTrip) {
auto doc_id = field("DocId", ::arrow::int64(), /*nullable=*/false);
auto links = field(
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/parquet/arrow/path_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@ struct PathInfo {
int16_t max_def_level = 0;
int16_t max_rep_level = 0;
bool has_dictionary = false;
bool leaf_is_nullable = false;
};

/// Contains logic for writing a single leaf node to parquet.
Expand All @@ -540,6 +541,7 @@ Status WritePath(ElementRange root_range, PathInfo* path_info,
std::vector<ElementRange> stack(path_info->path.size());
MultipathLevelBuilderResult builder_result;
builder_result.leaf_array = path_info->primitive_array;
builder_result.leaf_is_nullable = path_info->leaf_is_nullable;

if (path_info->max_def_level == 0) {
// This case only occurs when there are no nullable or repeated
Expand Down Expand Up @@ -706,6 +708,7 @@ class PathBuilder {
explicit PathBuilder(bool start_nullable) : nullable_in_parent_(start_nullable) {}
template <typename T>
void AddTerminalInfo(const T& array) {
info_.leaf_is_nullable = nullable_in_parent_;
if (nullable_in_parent_) {
info_.max_def_level++;
}
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/parquet/arrow/path_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ struct MultipathLevelBuilderResult {
/// This allows for the parquet writing to determine which values ultimately
/// needs to be written.
std::vector<ElementRange> post_list_visited_elements;

/// Whether the leaf array is nullable.
bool leaf_is_nullable;
};

/// \brief Logic for being able to write out nesting (rep/def level) data that is
Expand Down
7 changes: 0 additions & 7 deletions cpp/src/parquet/arrow/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,6 @@ using ParquetReader = parquet::ParquetFileReader;

using parquet::internal::RecordReader;

#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
#define END_PARQUET_CATCH_EXCEPTIONS \
} \
catch (const ::parquet::ParquetException& e) { \
return ::arrow::Status::IOError(e.what()); \
}

namespace parquet {
namespace arrow {
namespace {
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/parquet/arrow/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,12 @@ class ArrowColumnWriterV2 {

return column_writer->WriteArrow(result.def_levels, result.rep_levels,
result.def_rep_level_count, *values_array,
ctx);
ctx, result.leaf_is_nullable);
}));
}

PARQUET_CATCH_NOT_OK(column_writer->Close());
}

return Status::OK();
}

Expand Down
Loading