Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c871ea9
Refactor WriteBatch/WriteBatchSpaced to utilize helper functions [ski…
wesm Aug 8, 2019
c4d7dc2
Refactor and add Arrow encoder stubs
wesm Aug 9, 2019
245f445
ByteArray statistics specializations [skip ci]
wesm Aug 9, 2019
882e434
TypedComparator/TypedStatistics augmentations for arrow::BinaryArray
wesm Aug 11, 2019
1264e10
More direct encoding implementation stubs
wesm Aug 11, 2019
57a45e0
Direct binary put works
wesm Aug 11, 2019
edc9f84
Fix unit tests [skip ci]
wesm Aug 12, 2019
d21ebd8
Get all direct put unit tests passing [skip ci]
wesm Aug 12, 2019
5aaf281
Temp [skip ci]
wesm Aug 12, 2019
8cc1bcf
Unit test for PutDictionary, PutIndices
wesm Aug 12, 2019
0a293ee
More scaffolding [skip ci]
wesm Aug 12, 2019
1380531
Closer to full dictionary write, NA test failing
wesm Aug 13, 2019
de9d0a5
Fix null dictionary test, unit tests passing again [skip ci]
wesm Aug 13, 2019
28268d6
Add unit test for writing changing dictionaries
wesm Aug 13, 2019
580a0ca
Add failing unit test for Arrow store schema option [skip ci]
wesm Aug 13, 2019
7705fdb
Automatically read dictionary fields by serializing the Arrow schema …
wesm Aug 13, 2019
7d663d5
Store schema when writing from Python, add unit test to exhibit direc…
wesm Aug 13, 2019
f26d7da
Fix up Python unit tests given schema serialization
wesm Aug 13, 2019
3425da4
Revert change causing ASAN failure [skip ci]
wesm Aug 14, 2019
5dc00b1
Fix MSVC compilation warnings [skip ci]
wesm Aug 14, 2019
8f4cd44
Fix DecodeArrow bug which only occurred when there are nulls at the e…
wesm Aug 14, 2019
3f45fef
Check more random seeds, fix warnings
wesm Aug 14, 2019
91555b6
Fix another new MSVC warning
wesm Aug 14, 2019
92cf4e0
Code review feedback
wesm Aug 15, 2019
494b954
Use other KeyValueMetadata factory function to hopefully appease MinGW
wesm Aug 15, 2019
7f3a2a8
Fix another KeyValueMetadata factory
wesm Aug 15, 2019
ad3bad3
Address code review comments
wesm Aug 16, 2019
6b1769c
Restore statistics aliases
wesm Aug 16, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ if((ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION) AND NOT ARROW_JSON)
message(FATAL_ERROR "JSON parsing of arrays is required for Arrow tests")
endif()

if(ARROW_FLIGHT OR ARROW_BUILD_TESTS)
if(ARROW_FLIGHT OR ARROW_PARQUET OR ARROW_BUILD_TESTS)
set(ARROW_IPC ON)
endif()

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/array/builder_dict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl {

template <typename DType, typename ArrayType>
enable_if_memoize<DType, Status> InsertValues(const DType&, const ArrayType& array) {
if (array.null_count() > 0) {
return Status::Invalid("Cannot insert dictionary values containing nulls");
}
for (int64_t i = 0; i < array.length(); ++i) {
ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i)));
}
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/arrow/ipc/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <memory>
#include <vector>

#include "arrow/ipc/dictionary.h" // IWYU pragma: export
#include "arrow/ipc/message.h"
#include "arrow/ipc/options.h"
#include "arrow/result.h"
Expand All @@ -49,8 +50,6 @@ class OutputStream;

namespace ipc {

class DictionaryMemo;

/// \class RecordBatchWriter
/// \brief Abstract interface for writing a stream of record batches
class ARROW_EXPORT RecordBatchWriter {
Expand Down
15 changes: 15 additions & 0 deletions cpp/src/arrow/pretty_print.cc
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,21 @@ Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) {
return Status::OK();
}

// Pretty-print every column of a record batch to `sink`.
//
// Each column is rendered as "<name>: <contents>", with the contents
// indented two spaces beyond the indentation requested in `options`.
// The stream is flushed once after all columns have been written.
Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options,
                   std::ostream* sink) {
  // The per-column options differ from the caller's only by the extra
  // indent, so compute them once outside the loop.
  PrettyPrintOptions column_options = options;
  column_options.indent += 2;

  const int num_columns = batch.num_columns();
  for (int col = 0; col < num_columns; ++col) {
    (*sink) << batch.column_name(col) << ": ";
    RETURN_NOT_OK(PrettyPrint(*batch.column(col), column_options, sink));
    (*sink) << "\n";
  }
  (*sink) << std::flush;
  return Status::OK();
}

Status PrettyPrint(const Table& table, const PrettyPrintOptions& options,
std::ostream* sink) {
RETURN_NOT_OK(PrettyPrint(*table.schema(), options, sink));
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/pretty_print.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ struct PrettyPrintOptions {
ARROW_EXPORT
Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink);

ARROW_EXPORT
Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options,
std::ostream* sink);

/// \brief Print human-readable representation of Table
ARROW_EXPORT
Status PrettyPrint(const Table& table, const PrettyPrintOptions& options,
Expand Down
19 changes: 15 additions & 4 deletions cpp/src/arrow/testing/gtest_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ static void PrintChunkedArray(const ChunkedArray& carr, std::stringstream* ss) {
for (int i = 0; i < carr.num_chunks(); ++i) {
auto c1 = carr.chunk(i);
*ss << "Chunk " << i << std::endl;
ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, 0, ss));
::arrow::PrettyPrintOptions options(/*indent=*/2);
ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, options, ss));
*ss << std::endl;
}
}
Expand All @@ -59,15 +60,25 @@ void AssertTsEqual(const T& expected, const T& actual) {
if (!expected.Equals(actual)) {
std::stringstream pp_expected;
std::stringstream pp_actual;
ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected));
ARROW_EXPECT_OK(PrettyPrint(actual, 0, &pp_actual));
::arrow::PrettyPrintOptions options(/*indent=*/2);
options.window = 50;
ARROW_EXPECT_OK(PrettyPrint(expected, options, &pp_expected));
ARROW_EXPECT_OK(PrettyPrint(actual, options, &pp_actual));
FAIL() << "Got: \n" << pp_actual.str() << "\nExpected: \n" << pp_expected.str();
}
}

void AssertArraysEqual(const Array& expected, const Array& actual) {
// Assert that `expected` and `actual` are equal, failing the current test
// with the element-wise diff produced by Array::Equals if they differ.
// When `verbose` is true, both arrays are additionally pretty-printed in
// full so the failure message shows complete contents, not just the diff.
void AssertArraysEqual(const Array& expected, const Array& actual, bool verbose) {
std::stringstream diff;
if (!expected.Equals(actual, EqualOptions().diff_sink(&diff))) {
if (verbose) {
::arrow::PrettyPrintOptions options(/*indent=*/2);
// Widen the print window beyond the default so mismatches deep inside
// long arrays are visible in the failure output.
options.window = 50;
diff << "Expected:\n";
ARROW_EXPECT_OK(PrettyPrint(expected, options, &diff));
diff << "\nActual:\n";
ARROW_EXPECT_OK(PrettyPrint(actual, options, &diff));
}
FAIL() << diff.str();
}
}
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/testing/gtest_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,9 @@ using ArrayVector = std::vector<std::shared_ptr<Array>>;
#define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs))
#define ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs))

ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual);
// If verbose is true, then the arrays will be pretty printed
ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual,
bool verbose = false);
ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected,
const RecordBatch& actual);
ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& expected,
Expand Down
143 changes: 115 additions & 28 deletions cpp/src/parquet/arrow/arrow-reader-writer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -343,9 +343,11 @@ void WriteTableToBuffer(const std::shared_ptr<Table>& table, int64_t row_group_s
const std::shared_ptr<ArrowWriterProperties>& arrow_properties,
std::shared_ptr<Buffer>* out) {
auto sink = CreateOutputStream();

auto write_props = WriterProperties::Builder().write_batch_size(100)->build();

ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink,
row_group_size, default_writer_properties(),
arrow_properties));
row_group_size, write_props, arrow_properties));
ASSERT_OK_NO_THROW(sink->Finish(out));
}

Expand All @@ -368,37 +370,39 @@ void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual
}
}

void DoConfiguredRoundtrip(
const std::shared_ptr<Table>& table, int64_t row_group_size,
std::shared_ptr<Table>* out,
const std::shared_ptr<::parquet::WriterProperties>& parquet_properties =
::parquet::default_writer_properties(),
const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
default_arrow_writer_properties()) {
void DoRoundtrip(const std::shared_ptr<Table>& table, int64_t row_group_size,
std::shared_ptr<Table>* out,
const std::shared_ptr<::parquet::WriterProperties>& writer_properties =
::parquet::default_writer_properties(),
const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
default_arrow_writer_properties(),
const ArrowReaderProperties& arrow_reader_properties =
default_arrow_reader_properties()) {
std::shared_ptr<Buffer> buffer;

auto sink = CreateOutputStream();
ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink,
row_group_size, parquet_properties, arrow_properties));
row_group_size, writer_properties,
arrow_writer_properties));
ASSERT_OK_NO_THROW(sink->Finish(&buffer));

std::unique_ptr<FileReader> reader;
ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
::arrow::default_memory_pool(), &reader));
FileReaderBuilder builder;
ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
ASSERT_OK(builder.properties(arrow_reader_properties)->Build(&reader));
ASSERT_OK_NO_THROW(reader->ReadTable(out));
}

void CheckConfiguredRoundtrip(
const std::shared_ptr<Table>& input_table,
const std::shared_ptr<Table>& expected_table = nullptr,
const std::shared_ptr<::parquet::WriterProperties>& parquet_properties =
const std::shared_ptr<::parquet::WriterProperties>& writer_properties =
::parquet::default_writer_properties(),
const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
default_arrow_writer_properties()) {
std::shared_ptr<Table> actual_table;
ASSERT_NO_FATAL_FAILURE(DoConfiguredRoundtrip(input_table, input_table->num_rows(),
&actual_table, parquet_properties,
arrow_properties));
ASSERT_NO_FATAL_FAILURE(DoRoundtrip(input_table, input_table->num_rows(), &actual_table,
writer_properties, arrow_writer_properties));
if (expected_table) {
ASSERT_NO_FATAL_FAILURE(
::arrow::AssertSchemaEqual(*actual_table->schema(), *expected_table->schema()));
Expand Down Expand Up @@ -439,9 +443,8 @@ void CheckSimpleRoundtrip(const std::shared_ptr<Table>& table, int64_t row_group
std::shared_ptr<Table> result;
DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result,
arrow_properties);
ASSERT_NO_FATAL_FAILURE(
::arrow::AssertSchemaEqual(*table->schema(), *result->schema()));
ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result, false));
::arrow::AssertSchemaEqual(*table->schema(), *result->schema());
::arrow::AssertTablesEqual(*table, *result, false);
}

static std::shared_ptr<GroupNode> MakeSimpleSchema(const DataType& type,
Expand Down Expand Up @@ -751,8 +754,8 @@ TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) {

TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) {
std::shared_ptr<Table> table;
ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table));
ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
this->PrepareListTable(SMALL_SIZE, true, true, 10, &table);
this->CheckRoundTrip(table);
}

TYPED_TEST(TestParquetIO, SingleRequiredListNullableColumnReadWrite) {
Expand Down Expand Up @@ -1169,8 +1172,12 @@ TEST_F(TestNullParquetIO, NullListColumn) {
}

TEST_F(TestNullParquetIO, NullDictionaryColumn) {
std::shared_ptr<Buffer> null_bitmap;
ASSERT_OK(::arrow::AllocateEmptyBitmap(::arrow::default_memory_pool(), SMALL_SIZE,
&null_bitmap));

std::shared_ptr<Array> indices =
std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, nullptr, SMALL_SIZE);
std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, null_bitmap, SMALL_SIZE);
std::shared_ptr<::arrow::DictionaryType> dict_type =
std::make_shared<::arrow::DictionaryType>(::arrow::int8(), ::arrow::null());

Expand Down Expand Up @@ -2803,7 +2810,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
::arrow::AssertTablesEqual(expected, *actual, /*same_chunk_layout=*/false);
}

static std::vector<double> null_probabilites() { return {0.0, 0.5, 1}; }
static std::vector<double> null_probabilities() { return {0.0, 0.5, 1}; }

protected:
std::shared_ptr<Array> dense_values_;
Expand All @@ -2813,7 +2820,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
ArrowReaderProperties properties_;
};

void AsDictionaryEncoded(const Array& arr, std::shared_ptr<Array>* out) {
void AsDictionary32Encoded(const Array& arr, std::shared_ptr<Array>* out) {
::arrow::StringDictionary32Builder builder(default_memory_pool());
const auto& string_array = static_cast<const ::arrow::StringArray&>(arr);
ASSERT_OK(builder.AppendArray(string_array));
Expand All @@ -2826,7 +2833,7 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDict) {
std::vector<std::shared_ptr<Array>> chunks(kNumRowGroups);
const int64_t chunk_size = expected_dense_->num_rows() / kNumRowGroups;
for (int i = 0; i < kNumRowGroups; ++i) {
AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]);
AsDictionary32Encoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]);
}
auto ex_table = MakeSimpleTable(std::make_shared<ChunkedArray>(chunks),
/*nullable=*/true);
Expand All @@ -2840,8 +2847,88 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDense) {

INSTANTIATE_TEST_CASE_P(
ReadDictionary, TestArrowReadDictionary,
::testing::ValuesIn(TestArrowReadDictionary::null_probabilites()));
::testing::ValuesIn(TestArrowReadDictionary::null_probabilities()));

// Round-trip a dictionary-encoded column through Parquet where each chunk
// carries its own, different dictionary, and verify the data read back
// matches the original dense values.
TEST(TestArrowWriteDictionaries, ChangingDictionaries) {
constexpr int num_unique = 50;
constexpr int repeat = 10000;
constexpr int64_t min_length = 2;
constexpr int64_t max_length = 20;
// Fixed seed so the generated data (and hence the test) is deterministic.
::arrow::random::RandomArrayGenerator rag(0);
auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length,
max_length, /*null_probability=*/0.1);
// The expected result is the dense (non-dictionary) form of the data.
auto expected = MakeSimpleTable(values, /*nullable=*/true);

// Dictionary-encode each slice independently so every chunk ends up with
// a distinct dictionary, exercising dictionary changes mid-write.
const int num_chunks = 10;
std::vector<std::shared_ptr<Array>> chunks(num_chunks);
const int64_t chunk_size = values->length() / num_chunks;
for (int i = 0; i < num_chunks; ++i) {
AsDictionary32Encoded(*values->Slice(chunk_size * i, chunk_size), &chunks[i]);
}

auto dict_table = MakeSimpleTable(std::make_shared<ChunkedArray>(chunks),
/*nullable=*/true);

// Use two row groups; chunk layout may legitimately differ after the
// round trip, so compare contents only.
std::shared_ptr<Table> actual;
DoRoundtrip(dict_table, /*row_group_size=*/values->length() / 2, &actual);
::arrow::AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false);
}

TEST(TestArrowWriteDictionaries, AutoReadAsDictionary) {
constexpr int num_unique = 50;
constexpr int repeat = 100;
constexpr int64_t min_length = 2;
constexpr int64_t max_length = 20;
::arrow::random::RandomArrayGenerator rag(0);
auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length,
max_length, /*null_probability=*/0.1);
std::shared_ptr<Array> dict_values;
AsDictionary32Encoded(*values, &dict_values);

} // namespace arrow
auto expected = MakeSimpleTable(dict_values, /*nullable=*/true);
auto expected_dense = MakeSimpleTable(values, /*nullable=*/true);

auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();
std::shared_ptr<Table> actual, actual_dense;

DoRoundtrip(expected, values->length(), &actual, default_writer_properties(),
props_store_schema);
::arrow::AssertTablesEqual(*expected, *actual);

auto props_no_store_schema = ArrowWriterProperties::Builder().build();
DoRoundtrip(expected, values->length(), &actual_dense, default_writer_properties(),
props_no_store_schema);
::arrow::AssertTablesEqual(*expected_dense, *actual_dense);
}

// Write a list<dictionary<string>> column and read it back with the
// store_schema option enabled. The nested dictionary subfield is NOT
// restored as dictionary on read (left as follow-up work, see ARROW-3246),
// so the result is expected to be the dense list<utf8> form.
TEST(TestArrowWriteDictionaries, NestedSubfield) {
// ARROW-3246: Automatic decoding of dictionary subfields left as followup
// work
auto offsets = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 2, 3]");
auto indices = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 0]");
auto dict = ::arrow::ArrayFromJSON(::arrow::utf8(), "[\"foo\"]");

// Build list<dictionary<int32, utf8>> values: [[], ["foo", "foo"], ["foo"]].
std::shared_ptr<Array> dict_values, values;
auto dict_ty = ::arrow::dictionary(::arrow::int32(), ::arrow::utf8());
ASSERT_OK(::arrow::DictionaryArray::FromArrays(dict_ty, indices, dict, &dict_values));
ASSERT_OK(::arrow::ListArray::FromArrays(*offsets, *dict_values,
::arrow::default_memory_pool(), &values));

// The same data in dense (non-dictionary) list<utf8> form, which is what
// the reader is expected to produce.
auto dense_ty = ::arrow::list(::arrow::utf8());
auto dense_values =
::arrow::ArrayFromJSON(dense_ty, "[[], [\"foo\", \"foo\"], [\"foo\"]]");

auto table = MakeSimpleTable(values, /*nullable=*/true);
auto expected_table = MakeSimpleTable(dense_values, /*nullable=*/true);

auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();
std::shared_ptr<Table> actual;
DoRoundtrip(table, values->length(), &actual, default_writer_properties(),
props_store_schema);

// The nested subfield is not automatically decoded to dictionary
::arrow::AssertTablesEqual(*expected_table, *actual);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you write (or point to an existing) test which is identical but doesn't set store_schema?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thanks for pointing this out, definitely makes the test "stronger"


} // namespace arrow
} // namespace parquet
7 changes: 4 additions & 3 deletions cpp/src/parquet/arrow/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,9 @@ class FileReaderImpl : public FileReader {
: pool_(pool), reader_(std::move(reader)), reader_properties_(properties) {}

Status Init() {
return BuildSchemaManifest(reader_->metadata()->schema(), reader_properties_,
&manifest_);
return BuildSchemaManifest(reader_->metadata()->schema(),
reader_->metadata()->key_value_metadata(),
reader_properties_, &manifest_);
}

std::vector<int> AllRowGroups() {
Expand Down Expand Up @@ -777,7 +778,7 @@ Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
}
}

auto result_schema = ::arrow::schema(fields, reader_->metadata()->key_value_metadata());
auto result_schema = ::arrow::schema(fields, manifest_.schema_metadata);
*out = Table::Make(result_schema, columns);
return (*out)->Validate();
END_PARQUET_CATCH_EXCEPTIONS
Expand Down
Loading