diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0367e3fedc2..0e5365a508b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -267,7 +267,7 @@ if((ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION) AND NOT ARROW_JSON) message(FATAL_ERROR "JSON parsing of arrays is required for Arrow tests") endif() -if(ARROW_FLIGHT OR ARROW_BUILD_TESTS) +if(ARROW_FLIGHT OR ARROW_PARQUET OR ARROW_BUILD_TESTS) set(ARROW_IPC ON) endif() diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 6c0e651efcb..d5f1857516c 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -202,6 +202,9 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { template enable_if_memoize InsertValues(const DType&, const ArrayType& array) { + if (array.null_count() > 0) { + return Status::Invalid("Cannot insert dictionary values containing nulls"); + } for (int64_t i = 0; i < array.length(); ++i) { ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i))); } diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index e70827ed380..3b1fece7127 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -24,6 +24,7 @@ #include #include +#include "arrow/ipc/dictionary.h" // IWYU pragma: export #include "arrow/ipc/message.h" #include "arrow/ipc/options.h" #include "arrow/result.h" @@ -49,8 +50,6 @@ class OutputStream; namespace ipc { -class DictionaryMemo; - /// \class RecordBatchWriter /// \brief Abstract interface for writing a stream of record batches class ARROW_EXPORT RecordBatchWriter { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 88bd5470a80..1daaa91de38 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -536,6 +536,21 @@ Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) { return Status::OK(); } +Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options, + std::ostream* sink) { + for (int i = 0; i < batch.num_columns(); ++i) { + const std::string& name = batch.column_name(i); + PrettyPrintOptions column_options = options; + column_options.indent += 2; + + (*sink) << name << ": "; + RETURN_NOT_OK(PrettyPrint(*batch.column(i), column_options, sink)); + (*sink) << "\n"; + } + (*sink) << std::flush; + return Status::OK(); +} + Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, std::ostream* sink) { RETURN_NOT_OK(PrettyPrint(*table.schema(), options, sink)); diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 5740341a67d..70caa8cfa87 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -61,6 +61,10 @@ struct PrettyPrintOptions { ARROW_EXPORT Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink); +ARROW_EXPORT +Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options, + std::ostream* sink); + /// \brief Print human-readable representation of Table ARROW_EXPORT Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 49447642bec..3928b07aa6f 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,7 +49,8 @@ static void PrintChunkedArray(const ChunkedArray& carr, std::stringstream* ss) { for (int i = 0; i < carr.num_chunks(); ++i) { auto c1 = carr.chunk(i); *ss << "Chunk " << i << std::endl; - 
ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, 0, ss)); + ::arrow::PrettyPrintOptions options(/*indent=*/2); + ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, options, ss)); *ss << std::endl; } } @@ -59,15 +60,25 @@ void AssertTsEqual(const T& expected, const T& actual) { if (!expected.Equals(actual)) { std::stringstream pp_expected; std::stringstream pp_actual; - ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected)); - ARROW_EXPECT_OK(PrettyPrint(actual, 0, &pp_actual)); + ::arrow::PrettyPrintOptions options(/*indent=*/2); + options.window = 50; + ARROW_EXPECT_OK(PrettyPrint(expected, options, &pp_expected)); + ARROW_EXPECT_OK(PrettyPrint(actual, options, &pp_actual)); FAIL() << "Got: \n" << pp_actual.str() << "\nExpected: \n" << pp_expected.str(); } } -void AssertArraysEqual(const Array& expected, const Array& actual) { +void AssertArraysEqual(const Array& expected, const Array& actual, bool verbose) { std::stringstream diff; if (!expected.Equals(actual, EqualOptions().diff_sink(&diff))) { + if (verbose) { + ::arrow::PrettyPrintOptions options(/*indent=*/2); + options.window = 50; + diff << "Expected:\n"; + ARROW_EXPECT_OK(PrettyPrint(expected, options, &diff)); + diff << "\nActual:\n"; + ARROW_EXPECT_OK(PrettyPrint(actual, options, &diff)); + } FAIL() << diff.str(); } } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index f378b808c54..e070dc7d612 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -140,7 +140,9 @@ using ArrayVector = std::vector>; #define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs)) #define ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs)) -ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual); +// If verbose is true, then the arrays will be pretty printed +ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual, + bool verbose = false); ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual); ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& expected, diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 1d50bcb9220..b4b6d5ec6a9 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -343,9 +343,11 @@ void WriteTableToBuffer(const std::shared_ptr& table, int64_t row_group_s const std::shared_ptr& arrow_properties, std::shared_ptr* out) { auto sink = CreateOutputStream(); + + auto write_props = WriterProperties::Builder().write_batch_size(100)->build(); + ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, default_writer_properties(), - arrow_properties)); + row_group_size, write_props, arrow_properties)); ASSERT_OK_NO_THROW(sink->Finish(out)); } @@ -368,37 +370,39 @@ void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual } } -void DoConfiguredRoundtrip( - const std::shared_ptr
<Table>& table, int64_t row_group_size, - std::shared_ptr<Table>
* out, - const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = - ::parquet::default_writer_properties(), - const std::shared_ptr<ArrowWriterProperties>& arrow_properties = - default_arrow_writer_properties()) { +void DoRoundtrip(const std::shared_ptr<Table>
& table, int64_t row_group_size, + std::shared_ptr<Table>
* out, + const std::shared_ptr<::parquet::WriterProperties>& writer_properties = + ::parquet::default_writer_properties(), + const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties = + default_arrow_writer_properties(), + const ArrowReaderProperties& arrow_reader_properties = + default_arrow_reader_properties()) { std::shared_ptr<Buffer> buffer; auto sink = CreateOutputStream(); ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, parquet_properties, arrow_properties)); + row_group_size, writer_properties, + arrow_writer_properties)); ASSERT_OK_NO_THROW(sink->Finish(&buffer)); std::unique_ptr<FileReader> reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer), - ::arrow::default_memory_pool(), &reader)); + FileReaderBuilder builder; + ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer))); + ASSERT_OK(builder.properties(arrow_reader_properties)->Build(&reader)); ASSERT_OK_NO_THROW(reader->ReadTable(out)); } void CheckConfiguredRoundtrip( const std::shared_ptr<Table>
& input_table, const std::shared_ptr<Table>
& expected_table = nullptr, - const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = + const std::shared_ptr<::parquet::WriterProperties>& writer_properties = ::parquet::default_writer_properties(), - const std::shared_ptr<ArrowWriterProperties>& arrow_properties = + const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties = default_arrow_writer_properties()) { std::shared_ptr<Table>
actual_table; - ASSERT_NO_FATAL_FAILURE(DoConfiguredRoundtrip(input_table, input_table->num_rows(), - &actual_table, parquet_properties, - arrow_properties)); + ASSERT_NO_FATAL_FAILURE(DoRoundtrip(input_table, input_table->num_rows(), &actual_table, + writer_properties, arrow_writer_properties)); if (expected_table) { ASSERT_NO_FATAL_FAILURE( ::arrow::AssertSchemaEqual(*actual_table->schema(), *expected_table->schema())); @@ -439,9 +443,8 @@ void CheckSimpleRoundtrip(const std::shared_ptr<Table>
& table, int64_t row_group std::shared_ptr<Table>
result; DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result, arrow_properties); - ASSERT_NO_FATAL_FAILURE( - ::arrow::AssertSchemaEqual(*table->schema(), *result->schema())); - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result, false)); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema()); + ::arrow::AssertTablesEqual(*table, *result, false); } static std::shared_ptr<GroupNode> MakeSimpleSchema(const DataType& type, @@ -751,8 +754,8 @@ TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) { TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) { std::shared_ptr<Table>
table; - ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table)); - ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); + this->PrepareListTable(SMALL_SIZE, true, true, 10, &table); + this->CheckRoundTrip(table); } TYPED_TEST(TestParquetIO, SingleRequiredListNullableColumnReadWrite) { @@ -1169,8 +1172,12 @@ TEST_F(TestNullParquetIO, NullListColumn) { } TEST_F(TestNullParquetIO, NullDictionaryColumn) { + std::shared_ptr null_bitmap; + ASSERT_OK(::arrow::AllocateEmptyBitmap(::arrow::default_memory_pool(), SMALL_SIZE, + &null_bitmap)); + std::shared_ptr indices = - std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, nullptr, SMALL_SIZE); + std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, null_bitmap, SMALL_SIZE); std::shared_ptr<::arrow::DictionaryType> dict_type = std::make_shared<::arrow::DictionaryType>(::arrow::int8(), ::arrow::null()); @@ -2803,7 +2810,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { ::arrow::AssertTablesEqual(expected, *actual, /*same_chunk_layout=*/false); } - static std::vector null_probabilites() { return {0.0, 0.5, 1}; } + static std::vector null_probabilities() { return {0.0, 0.5, 1}; } protected: std::shared_ptr dense_values_; @@ -2813,7 +2820,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { ArrowReaderProperties properties_; }; -void AsDictionaryEncoded(const Array& arr, std::shared_ptr* out) { +void AsDictionary32Encoded(const Array& arr, std::shared_ptr* out) { ::arrow::StringDictionary32Builder builder(default_memory_pool()); const auto& string_array = static_cast(arr); ASSERT_OK(builder.AppendArray(string_array)); @@ -2826,7 +2833,7 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDict) { std::vector> chunks(kNumRowGroups); const int64_t chunk_size = expected_dense_->num_rows() / kNumRowGroups; for (int i = 0; i < kNumRowGroups; ++i) { - AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]); + AsDictionary32Encoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]); } auto ex_table = MakeSimpleTable(std::make_shared(chunks), /*nullable=*/true); @@ -2840,8 +2847,88 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDense) { INSTANTIATE_TEST_CASE_P( ReadDictionary, TestArrowReadDictionary, - ::testing::ValuesIn(TestArrowReadDictionary::null_probabilites())); + ::testing::ValuesIn(TestArrowReadDictionary::null_probabilities())); + +TEST(TestArrowWriteDictionaries, ChangingDictionaries) { + constexpr int num_unique = 50; + constexpr int repeat = 10000; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 20; + ::arrow::random::RandomArrayGenerator rag(0); + auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, /*null_probability=*/0.1); + auto expected = MakeSimpleTable(values, /*nullable=*/true); + + const int num_chunks = 10; + std::vector> chunks(num_chunks); + const int64_t chunk_size = values->length() / num_chunks; + for (int i = 0; i < num_chunks; ++i) { + AsDictionary32Encoded(*values->Slice(chunk_size * i, chunk_size), &chunks[i]); + } + + auto dict_table = MakeSimpleTable(std::make_shared(chunks), + /*nullable=*/true); + + std::shared_ptr
<Table> actual; + DoRoundtrip(dict_table, /*row_group_size=*/values->length() / 2, &actual); + ::arrow::AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false); +} + +TEST(TestArrowWriteDictionaries, AutoReadAsDictionary) { + constexpr int num_unique = 50; + constexpr int repeat = 100; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 20; + ::arrow::random::RandomArrayGenerator rag(0); + auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, /*null_probability=*/0.1); + std::shared_ptr<Array> dict_values; + AsDictionary32Encoded(*values, &dict_values); -} // namespace arrow + auto expected = MakeSimpleTable(dict_values, /*nullable=*/true); + auto expected_dense = MakeSimpleTable(values, /*nullable=*/true); + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + std::shared_ptr<Table>
actual, actual_dense; + + DoRoundtrip(expected, values->length(), &actual, default_writer_properties(), + props_store_schema); + ::arrow::AssertTablesEqual(*expected, *actual); + + auto props_no_store_schema = ArrowWriterProperties::Builder().build(); + DoRoundtrip(expected, values->length(), &actual_dense, default_writer_properties(), + props_no_store_schema); + ::arrow::AssertTablesEqual(*expected_dense, *actual_dense); +} + +TEST(TestArrowWriteDictionaries, NestedSubfield) { + // ARROW-3246: Automatic decoding of dictionary subfields left as followup + // work + auto offsets = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 2, 3]"); + auto indices = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 0]"); + auto dict = ::arrow::ArrayFromJSON(::arrow::utf8(), "[\"foo\"]"); + + std::shared_ptr dict_values, values; + auto dict_ty = ::arrow::dictionary(::arrow::int32(), ::arrow::utf8()); + ASSERT_OK(::arrow::DictionaryArray::FromArrays(dict_ty, indices, dict, &dict_values)); + ASSERT_OK(::arrow::ListArray::FromArrays(*offsets, *dict_values, + ::arrow::default_memory_pool(), &values)); + + auto dense_ty = ::arrow::list(::arrow::utf8()); + auto dense_values = + ::arrow::ArrayFromJSON(dense_ty, "[[], [\"foo\", \"foo\"], [\"foo\"]]"); + + auto table = MakeSimpleTable(values, /*nullable=*/true); + auto expected_table = MakeSimpleTable(dense_values, /*nullable=*/true); + + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + std::shared_ptr
actual; + DoRoundtrip(table, values->length(), &actual, default_writer_properties(), + props_store_schema); + + // The nested subfield is not automatically decoded to dictionary + ::arrow::AssertTablesEqual(*expected_table, *actual); +} + +} // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 3d1ad76c8f0..4451276d6b3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -106,8 +106,9 @@ class FileReaderImpl : public FileReader { : pool_(pool), reader_(std::move(reader)), reader_properties_(properties) {} Status Init() { - return BuildSchemaManifest(reader_->metadata()->schema(), reader_properties_, - &manifest_); + return BuildSchemaManifest(reader_->metadata()->schema(), + reader_->metadata()->key_value_metadata(), + reader_properties_, &manifest_); } std::vector AllRowGroups() { @@ -777,7 +778,7 @@ Status FileReaderImpl::ReadRowGroups(const std::vector& row_groups, } } - auto result_schema = ::arrow::schema(fields, reader_->metadata()->key_value_metadata()); + auto result_schema = ::arrow::schema(fields, manifest_.schema_metadata); *out = Table::Make(result_schema, columns); return (*out)->Validate(); END_PARQUET_CATCH_EXCEPTIONS diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index bfc40940e3b..649f73f76be 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,8 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/compute/kernel.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" @@ -491,11 +494,13 @@ Status GroupToSchemaField(const GroupNode& node, int16_t max_def_level, Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_rep_level, SchemaTreeContext* ctx, const SchemaField* parent, SchemaField* out) { + /// Workhorse function for converting a Parquet schema node to an Arrow + /// type. Handles different conventions for nested data if (node.is_optional()) { ++max_def_level; } else if (node.is_repeated()) { - // Repeated fields add a definition level. This is used to distinguish - // between an empty list and a list with an item in it. + // Repeated fields add both a repetition and definition level. This is used + // to distinguish between an empty list and a list with an item in it. ++max_rep_level; ++max_def_level; } @@ -504,9 +509,19 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re // Now, walk the schema and create a ColumnDescriptor for each leaf node if (node.is_group()) { + // A nested field, but we don't know what kind yet return GroupToSchemaField(static_cast(node), max_def_level, max_rep_level, ctx, parent, out); } else { + // Either a normal flat primitive type, or a list type encoded with 1-level + // list encoding. 
Note that the 3-level encoding is the form recommended by + // the parquet specification, but technically we can have either + // + // required/optional $TYPE $FIELD_NAME + // + // or + // + // repeated $TYPE $FIELD_NAME const auto& primitive_node = static_cast(node); int column_index = ctx->schema->GetColumnIndex(primitive_node); std::shared_ptr type; @@ -526,6 +541,7 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re out->max_repetition_level = max_rep_level; return Status::OK(); } else { + // A normal (required/optional) primitive node return PopulateLeaf(column_index, ::arrow::field(node.name(), type, node.is_optional()), max_def_level, max_rep_level, ctx, parent, out); @@ -533,9 +549,56 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re } } +Status GetOriginSchema(const std::shared_ptr& metadata, + std::shared_ptr* clean_metadata, + std::shared_ptr<::arrow::Schema>* out) { + if (metadata == nullptr) { + *out = nullptr; + *clean_metadata = nullptr; + return Status::OK(); + } + + static const std::string kArrowSchemaKey = "ARROW:schema"; + int schema_index = metadata->FindKey(kArrowSchemaKey); + if (schema_index == -1) { + *out = nullptr; + *clean_metadata = metadata; + return Status::OK(); + } + + // The original Arrow schema was serialized using the store_schema option. We + // deserialize it here and use it to inform read options such as + // dictionary-encoded fields + auto schema_buf = std::make_shared(metadata->value(schema_index)); + + ::arrow::ipc::DictionaryMemo dict_memo; + ::arrow::io::BufferReader input(schema_buf); + RETURN_NOT_OK(::arrow::ipc::ReadSchema(&input, &dict_memo, out)); + + if (metadata->size() > 1) { + // Copy the metadata without the schema key + auto new_metadata = ::arrow::key_value_metadata({}, {}); + new_metadata->reserve(metadata->size() - 1); + for (int64_t i = 0; i < metadata->size(); ++i) { + if (i == schema_index) continue; + new_metadata->Append(metadata->key(i), metadata->value(i)); + } + *clean_metadata = new_metadata; + } else { + // No other keys, let metadata be null + *clean_metadata = nullptr; + } + return Status::OK(); +} + Status BuildSchemaManifest(const SchemaDescriptor* schema, + const std::shared_ptr& metadata, const ArrowReaderProperties& properties, SchemaManifest* manifest) { + std::shared_ptr<::arrow::Schema> origin_schema; + RETURN_NOT_OK( + GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema)); + SchemaTreeContext ctx; ctx.manifest = manifest; ctx.properties = properties; @@ -544,8 +607,26 @@ Status BuildSchemaManifest(const SchemaDescriptor* schema, manifest->descr = schema; manifest->schema_fields.resize(schema_node.field_count()); for (int i = 0; i < static_cast(schema_node.field_count()); ++i) { + SchemaField* out_field = &manifest->schema_fields[i]; RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), 0, 0, &ctx, - /*parent=*/nullptr, &manifest->schema_fields[i])); + /*parent=*/nullptr, out_field)); + + // TODO(wesm): as follow up to ARROW-3246, we should really pass the origin + // schema (if any) through all functions in the schema reconstruction, but + // I'm being lazy and just setting dictionary fields at the top level for + // now + if (manifest->origin_schema == nullptr) { + continue; + } + auto origin_field = manifest->origin_schema->field(i); + auto current_type = out_field->field->type(); + if (origin_field->type()->id() != ::arrow::Type::DICTIONARY) { + continue; + } + if (current_type->id() != ::arrow::Type::DICTIONARY) { 
+ out_field->field = + out_field->field->WithType(::arrow::dictionary(::arrow::int32(), current_type)); + } } return Status::OK(); } @@ -555,7 +636,7 @@ Status FromParquetSchema( const std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out) { SchemaManifest manifest; - RETURN_NOT_OK(BuildSchemaManifest(schema, properties, &manifest)); + RETURN_NOT_OK(BuildSchemaManifest(schema, key_value_metadata, properties, &manifest)); std::vector> fields(manifest.schema_fields.size()); for (int i = 0; i < static_cast(fields.size()); i++) { fields[i] = manifest.schema_fields[i].field; diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index 4568e421474..d8f08524681 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -38,6 +38,8 @@ class Array; class ChunkedArray; class DataType; class Field; +class KeyValueMetadata; +class Schema; } // namespace arrow @@ -138,6 +140,8 @@ struct PARQUET_EXPORT SchemaField { struct SchemaManifest { const SchemaDescriptor* descr; + std::shared_ptr<::arrow::Schema> origin_schema; + std::shared_ptr schema_metadata; std::vector schema_fields; std::unordered_map column_index_to_field; @@ -185,6 +189,7 @@ struct SchemaManifest { PARQUET_EXPORT Status BuildSchemaManifest(const SchemaDescriptor* schema, + const std::shared_ptr& metadata, const ArrowReaderProperties& properties, SchemaManifest* manifest); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index fb437f14320..0d13528d5f9 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -19,13 +19,14 @@ #include #include +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer-builder.h" -#include "arrow/compute/api.h" +#include "arrow/ipc/writer.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/visitor_inline.h" @@ -43,6 +44,7 @@ using arrow::BinaryArray; using arrow::BooleanArray; using arrow::ChunkedArray; using arrow::Decimal128Array; +using arrow::DictionaryArray; using arrow::Field; using arrow::FixedSizeBinaryArray; using Int16BufferBuilder = arrow::TypedBufferBuilder; @@ -55,10 +57,6 @@ using arrow::Status; using arrow::Table; using arrow::TimeUnit; -using arrow::compute::Cast; -using arrow::compute::CastOptions; -using arrow::compute::FunctionContext; - using parquet::ParquetFileWriter; using parquet::ParquetVersion; using parquet::schema::GroupNode; @@ -89,6 +87,21 @@ class LevelBuilder { return Status::OK(); } + Status Visit(const DictionaryArray& array) { + // Only currently handle DictionaryArray where the dictionary is a + // primitive type + if (array.dict_type()->value_type()->num_children() > 0) { + return Status::NotImplemented( + "Writing DictionaryArray with nested dictionary " + "type not yet supported"); + } + array_offsets_.push_back(static_cast(array.offset())); + valid_bitmaps_.push_back(array.null_bitmap_data()); + null_counts_.push_back(array.null_count()); + values_array_ = std::make_shared(array.data()); + return Status::OK(); + } + Status Visit(const ListArray& array) { array_offsets_.push_back(static_cast(array.offset())); valid_bitmaps_.push_back(array.null_bitmap_data()); @@ -113,7 +126,6 @@ class LevelBuilder { NOT_IMPLEMENTED_VISIT(FixedSizeList) NOT_IMPLEMENTED_VISIT(Struct) NOT_IMPLEMENTED_VISIT(Union) - NOT_IMPLEMENTED_VISIT(Dictionary) NOT_IMPLEMENTED_VISIT(Extension) #undef NOT_IMPLEMENTED_VISIT @@ -411,8 +423,8 @@ class FileWriterImpl : public FileWriter { closed_(false) 
{} Status Init() { - return BuildSchemaManifest(writer_->schema(), default_arrow_reader_properties(), - &schema_manifest_); + return BuildSchemaManifest(writer_->schema(), /*schema_metadata=*/nullptr, + default_arrow_reader_properties(), &schema_manifest_); } Status NewRowGroup(int64_t chunk_size) override { @@ -444,28 +456,6 @@ class FileWriterImpl : public FileWriter { Status WriteColumnChunk(const std::shared_ptr& data, int64_t offset, int64_t size) override { - // DictionaryArrays are not yet handled with a fast path. To still support - // writing them as a workaround, we convert them back to their non-dictionary - // representation. - if (data->type()->id() == ::arrow::Type::DICTIONARY) { - const ::arrow::DictionaryType& dict_type = - static_cast(*data->type()); - - // TODO(ARROW-1648): Remove this special handling once we require an Arrow - // version that has this fixed. - if (dict_type.value_type()->id() == ::arrow::Type::NA) { - auto null_array = std::make_shared<::arrow::NullArray>(data->length()); - return WriteColumnChunk(*null_array); - } - - FunctionContext ctx(this->memory_pool()); - ::arrow::compute::Datum cast_input(data); - ::arrow::compute::Datum cast_output; - RETURN_NOT_OK( - Cast(&ctx, cast_input, dict_type.value_type(), CastOptions(), &cast_output)); - return WriteColumnChunk(cast_output.chunked_array(), offset, size); - } - ColumnWriter* column_writer; PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn()); @@ -563,6 +553,30 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool return Open(schema, pool, sink, properties, default_arrow_writer_properties(), writer); } +Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, + const ArrowWriterProperties& properties, + std::shared_ptr* out) { + if (!properties.store_schema()) { + *out = nullptr; + return Status::OK(); + } + + static const std::string kArrowSchemaKey = "ARROW:schema"; + std::shared_ptr result; + if (schema.metadata()) { + result = schema.metadata()->Copy(); + } else { + result = ::arrow::key_value_metadata({}, {}); + } + + ::arrow::ipc::DictionaryMemo dict_memo; + std::shared_ptr serialized; + RETURN_NOT_OK(::arrow::ipc::SerializeSchema(schema, &dict_memo, pool, &serialized)); + result->Append(kArrowSchemaKey, serialized->ToString()); + *out = result; + return Status::OK(); +} + Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, const std::shared_ptr<::arrow::io::OutputStream>& sink, const std::shared_ptr& properties, @@ -574,8 +588,11 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); + std::shared_ptr metadata; + RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata)); + std::unique_ptr base_writer = - ParquetFileWriter::Open(sink, schema_node, properties, schema.metadata()); + ParquetFileWriter::Open(sink, schema_node, properties, metadata); auto schema_ptr = std::make_shared<::arrow::Schema>(schema); return Make(pool, std::move(base_writer), schema_ptr, arrow_properties, writer); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 925a4ff80ab..0fd3a4c28dd 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1234,7 +1234,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, int64_t num_decoded = this->current_decoder_->DecodeArrow( static_cast(values_to_read), static_cast(null_count), 
valid_bits_->mutable_data(), values_written_, builder_.get()); - DCHECK_EQ(num_decoded, values_to_read); + DCHECK_EQ(num_decoded, values_to_read - null_count); ResetValues(); } @@ -1310,7 +1310,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, /// Flush values since they have been copied into the builder ResetValues(); } - DCHECK_EQ(num_decoded, values_to_read); + DCHECK_EQ(num_decoded, values_to_read - null_count); } private: diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index dd0d65aa5cd..fcc8344ac06 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -218,7 +218,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { void ReadAndCompare(Compression::type compression, int64_t num_rows) { this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - auto comparator = TypedComparator::Make(this->descr_); + auto comparator = MakeComparator(this->descr_); for (size_t i = 0; i < this->values_.size(); i++) { if (comparator->Compare(this->values_[i], this->values_out_[i]) || comparator->Compare(this->values_out_[i], this->values_[i])) { @@ -310,7 +310,7 @@ void TestPrimitiveWriter::ReadAndCompare(Compression::type compressio this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::SIGNED); for (size_t i = 0; i < this->values_.size(); i++) { if (comparator->Compare(this->values_[i], this->values_out_[i]) || comparator->Compare(this->values_out_[i], this->values_[i])) { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fa16234e6ec..052ca14967a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/buffer-builder.h" +#include "arrow/compute/api.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-stream-utils.h" @@ -46,11 +47,12 @@ namespace parquet { -using ::arrow::Status; -using ::arrow::internal::checked_cast; +using arrow::Status; +using arrow::compute::Datum; +using arrow::internal::checked_cast; -using BitWriter = ::arrow::BitUtil::BitWriter; -using RleEncoder = ::arrow::util::RleEncoder; +using BitWriter = arrow::BitUtil::BitWriter; +using RleEncoder = arrow::util::RleEncoder; LevelEncoder::LevelEncoder() {} LevelEncoder::~LevelEncoder() {} @@ -135,7 +137,7 @@ class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), pool_(pool), @@ -282,7 +284,7 @@ class SerializedPageWriter : public PageWriter { std::unique_ptr thrift_serializer_; // Compression codec to use. - std::unique_ptr<::arrow::util::Codec> compressor_; + std::unique_ptr compressor_; }; // This implementation of the PageWriter writes to the final sink on Close . 
@@ -290,7 +292,7 @@ class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr( @@ -332,7 +334,7 @@ class BufferedPageWriter : public PageWriter { private: std::shared_ptr final_sink_; ColumnChunkMetaDataBuilder* metadata_; - std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_; + std::shared_ptr in_memory_sink_; std::unique_ptr pager_; }; @@ -479,8 +481,8 @@ class ColumnWriterImpl { // Flag to infer if dictionary encoding has fallen back to PLAIN bool fallback_; - ::arrow::BufferBuilder definition_levels_sink_; - ::arrow::BufferBuilder repetition_levels_sink_; + arrow::BufferBuilder definition_levels_sink_; + arrow::BufferBuilder repetition_levels_sink_; std::shared_ptr definition_levels_rle_; std::shared_ptr repetition_levels_rle_; @@ -630,6 +632,50 @@ void ColumnWriterImpl::FlushBufferedDataPages() { // ---------------------------------------------------------------------- // TypedColumnWriter +template +inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) { + int64_t num_batches = static_cast(total / batch_size); + for (int round = 0; round < num_batches; round++) { + action(round * batch_size, batch_size); + } + // Write the remaining values + if (total % batch_size > 0) { + action(num_batches * batch_size, total % batch_size); + } +} + +bool DictionaryDirectWriteSupported(const arrow::Array& array) { + DCHECK_EQ(array.type_id(), arrow::Type::DICTIONARY); + const arrow::DictionaryType& dict_type = + static_cast(*array.type()); + auto id = dict_type.value_type()->id(); + return id == arrow::Type::BINARY || id == arrow::Type::STRING; +} + +Status ConvertDictionaryToDense(const arrow::Array& array, MemoryPool* pool, + std::shared_ptr* out) { + const arrow::DictionaryType& dict_type = + static_cast(*array.type()); + + // TODO(ARROW-1648): Remove this special handling once we require an Arrow + // version that has this fixed. 
+ if (dict_type.value_type()->id() == arrow::Type::NA) { + *out = std::make_shared(array.length()); + return Status::OK(); + } + + arrow::compute::FunctionContext ctx(pool); + Datum cast_output; + RETURN_NOT_OK(arrow::compute::Cast(&ctx, Datum(array.data()), dict_type.value_type(), + arrow::compute::CastOptions(), &cast_output)); + *out = cast_output.make_array(); + return Status::OK(); +} + +static inline bool IsDictionaryEncoding(Encoding::type encoding) { + return encoding == Encoding::PLAIN_DICTIONARY; +} + template class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter { public: @@ -645,23 +691,70 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< if (properties->statistics_enabled(descr_->path()) && (SortOrder::UNKNOWN != descr_->sort_order())) { - page_statistics_ = TypedStats::Make(descr_, allocator_); - chunk_statistics_ = TypedStats::Make(descr_, allocator_); + page_statistics_ = MakeStatistics(descr_, allocator_); + chunk_statistics_ = MakeStatistics(descr_, allocator_); } } int64_t Close() override { return ColumnWriterImpl::Close(); } void WriteBatch(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, const T* values) override; + const int16_t* rep_levels, const T* values) override { + // We check for DataPage limits only after we have inserted the values. If a user + // writes a large number of values, the DataPage size can be much above the limit. + // The purpose of this chunking is to bound this. Even if a user writes large number + // of values, the chunking will ensure the AddDataPage() is called at a reasonable + // pagesize limit + int64_t value_offset = 0; + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { + int64_t values_to_write = + WriteLevels(batch_size, def_levels + offset, rep_levels + offset); + // PARQUET-780 + if (values_to_write > 0) { + DCHECK_NE(nullptr, values); + } + WriteValues(values + value_offset, values_to_write, batch_size - values_to_write); + CommitWriteAndCheckPageLimit(batch_size, values_to_write); + value_offset += values_to_write; + + // Dictionary size checked separately from data page size since we + // circumvent this check when writing arrow::DictionaryArray directly + CheckDictionarySizeLimit(); + }; + DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); + } void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, - int64_t valid_bits_offset, const T* values) override; + int64_t valid_bits_offset, const T* values) override { + // Like WriteBatch, but for spaced values + int64_t value_offset = 0; + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + WriteValuesSpaced(values + value_offset, batch_num_values, batch_num_spaced_values, + valid_bits, valid_bits_offset + value_offset); + CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values); + value_offset += batch_num_spaced_values; + + // Dictionary size checked separately from data page size since we + // circumvent this check when writing arrow::DictionaryArray directly + CheckDictionarySizeLimit(); + }; + DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); + } Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, - int64_t num_levels, const ::arrow::Array& array, - 
ArrowWriteContext* context) override; + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* ctx) override { + if (array.type()->id() == arrow::Type::DICTIONARY) { + return WriteArrowDictionary(def_levels, rep_levels, num_levels, array, ctx); + } else { + return WriteArrowDense(def_levels, rep_levels, num_levels, array, ctx); + } + } int64_t EstimatedBufferedValueBytes() const override { return current_encoder_->EstimatedDataEncodedSize(); @@ -672,6 +765,17 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< return current_encoder_->FlushValues(); } + // Internal function to handle direct writing of arrow::DictionaryArray, + // since the standard logic concerning dictionary size limits and fallback to + // plain encoding is circumvented + Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* context); + + Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* context); + void WriteDictionaryPage() override { // We have to dynamic cast here because of TypedEncoder as // some compilers don't want to cast through virtual inheritance @@ -686,11 +790,6 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< total_bytes_written_ += pager_->WriteDictionaryPage(page); } - // Checks if the Dictionary Page size limit is reached - // If the limit is reached, the Dictionary and Data Pages are serialized - // The encoding is switched to PLAIN - void CheckDictionarySizeLimit(); - EncodedStatistics GetPageStatistics() override { EncodedStatistics result; if (page_statistics_) result = page_statistics_->Encode(); @@ -729,233 +828,239 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< std::shared_ptr page_statistics_; std::shared_ptr chunk_statistics_; - inline int64_t WriteMiniBatch(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, const T* values); - - inline int64_t WriteMiniBatchSpaced(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, - const uint8_t* valid_bits, - int64_t valid_bits_offset, const T* values, - int64_t* num_spaced_written); + // If writing a sequence of arrow::DictionaryArray to the writer, we keep the + // dictionary passed to DictEncoder::PutDictionary so we can check + // subsequent array chunks to see either if materialization is required (in + // which case we call back to the dense write path) + std::shared_ptr preserved_dictionary_; + + int64_t WriteLevels(int64_t num_values, const int16_t* def_levels, + const int16_t* rep_levels) { + int64_t values_to_write = 0; + // If the field is required and non-repeated, there are no definition levels + if (descr_->max_definition_level() > 0) { + for (int64_t i = 0; i < num_values; ++i) { + if (def_levels[i] == descr_->max_definition_level()) { + ++values_to_write; + } + } - // Write values to a temporary buffer before they are encoded into pages - void WriteValues(int64_t num_values, const T* values) { - dynamic_cast(current_encoder_.get()) - ->Put(values, static_cast(num_values)); - } + WriteDefinitionLevels(num_values, def_levels); + } else { + // Required field, write all values + values_to_write = num_values; + } - void WriteValuesSpaced(int64_t num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset, const T* values) { - dynamic_cast(current_encoder_.get()) - ->PutSpaced(values, 
static_cast(num_values), valid_bits, valid_bits_offset); - } -}; + // Not present for non-repeated fields + if (descr_->max_repetition_level() > 0) { + // A row could include more than one value + // Count the occasions where we start a new row + for (int64_t i = 0; i < num_values; ++i) { + if (rep_levels[i] == 0) { + rows_written_++; + } + } -// Only one Dictionary Page is written. -// Fallback to PLAIN if dictionary page limit is reached. -template -void TypedColumnWriterImpl::CheckDictionarySizeLimit() { - // We have to dynamic cast here because TypedEncoder as some compilers - // don't want to cast through virtual inheritance - auto dict_encoder = dynamic_cast*>(current_encoder_.get()); - if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { - WriteDictionaryPage(); - // Serialize the buffered Dictionary Indicies - FlushBufferedDataPages(); - fallback_ = true; - // Only PLAIN encoding is supported for fallback in V1 - current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, - properties_->memory_pool()); - encoding_ = Encoding::PLAIN; - } -} + WriteRepetitionLevels(num_values, rep_levels); + } else { + // Each value is exactly one row + rows_written_ += static_cast(num_values); + } + return values_to_write; + } + + void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels, + const int16_t* rep_levels, int64_t* out_values_to_write, + int64_t* out_spaced_values_to_write) { + int64_t values_to_write = 0; + int64_t spaced_values_to_write = 0; + // If the field is required and non-repeated, there are no definition levels + if (descr_->max_definition_level() > 0) { + // Minimal definition level for which spaced values are written + int16_t min_spaced_def_level = descr_->max_definition_level(); + if (descr_->schema_node()->is_optional()) { + min_spaced_def_level--; + } + for (int64_t i = 0; i < num_levels; ++i) { + if (def_levels[i] == descr_->max_definition_level()) { + ++values_to_write; + } + if (def_levels[i] >= min_spaced_def_level) { + ++spaced_values_to_write; + } + } -// ---------------------------------------------------------------------- -// Instantiate templated classes + WriteDefinitionLevels(num_levels, def_levels); + } else { + // Required field, write all values + values_to_write = num_levels; + spaced_values_to_write = num_levels; + } -template -int64_t TypedColumnWriterImpl::WriteMiniBatch(int64_t num_values, - const int16_t* def_levels, - const int16_t* rep_levels, - const T* values) { - int64_t values_to_write = 0; - // If the field is required and non-repeated, there are no definition levels - if (descr_->max_definition_level() > 0) { - for (int64_t i = 0; i < num_values; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_write; + // Not present for non-repeated fields + if (descr_->max_repetition_level() > 0) { + // A row could include more than one value + // Count the occasions where we start a new row + for (int64_t i = 0; i < num_levels; ++i) { + if (rep_levels[i] == 0) { + rows_written_++; + } } + + WriteRepetitionLevels(num_levels, rep_levels); + } else { + // Each value is exactly one row + rows_written_ += static_cast(num_levels); } - WriteDefinitionLevels(num_values, def_levels); - } else { - // Required field, write all values - values_to_write = num_values; + *out_values_to_write = values_to_write; + *out_spaced_values_to_write = spaced_values_to_write; } - // Not present for non-repeated fields - if (descr_->max_repetition_level() > 0) { - // A row could include more 
than one value - // Count the occasions where we start a new row - for (int64_t i = 0; i < num_values; ++i) { - if (rep_levels[i] == 0) { - rows_written_++; - } - } + void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) { + num_buffered_values_ += num_levels; + num_buffered_encoded_values_ += num_values; - WriteRepetitionLevels(num_values, rep_levels); - } else { - // Each value is exactly one row - rows_written_ += static_cast(num_values); + if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { + AddDataPage(); + } } - // PARQUET-780 - if (values_to_write > 0) { - DCHECK(nullptr != values) << "Values ptr cannot be NULL"; + void FallbackToPlainEncoding() { + if (IsDictionaryEncoding(current_encoder_->encoding())) { + WriteDictionaryPage(); + // Serialize the buffered Dictionary Indicies + FlushBufferedDataPages(); + fallback_ = true; + // Only PLAIN encoding is supported for fallback in V1 + current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, + properties_->memory_pool()); + encoding_ = Encoding::PLAIN; + } } - WriteValues(values_to_write, values); + // Checks if the Dictionary Page size limit is reached + // If the limit is reached, the Dictionary and Data Pages are serialized + // The encoding is switched to PLAIN + // + // Only one Dictionary Page is written. + // Fallback to PLAIN if dictionary page limit is reached. + void CheckDictionarySizeLimit() { + if (!has_dictionary_ || fallback_) { + // Either not using dictionary encoding, or we have already fallen back + // to PLAIN encoding because the size threshold was reached + return; + } - if (page_statistics_ != nullptr) { - page_statistics_->Update(values, values_to_write, num_values - values_to_write); + // We have to dynamic cast here because TypedEncoder as some compilers + // don't want to cast through virtual inheritance + auto dict_encoder = dynamic_cast*>(current_encoder_.get()); + if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { + FallbackToPlainEncoding(); + } } - num_buffered_values_ += num_values; - num_buffered_encoded_values_ += values_to_write; - - if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { - AddDataPage(); - } - if (has_dictionary_ && !fallback_) { - CheckDictionarySizeLimit(); + void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) { + dynamic_cast(current_encoder_.get()) + ->Put(values, static_cast(num_values)); + if (page_statistics_ != nullptr) { + page_statistics_->Update(values, num_values, num_nulls); + } } - return values_to_write; -} - -template -int64_t TypedColumnWriterImpl::WriteMiniBatchSpaced( - int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, - const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values, - int64_t* num_spaced_written) { - int64_t values_to_write = 0; - int64_t spaced_values_to_write = 0; - // If the field is required and non-repeated, there are no definition levels - if (descr_->max_definition_level() > 0) { - // Minimal definition level for which spaced values are written - int16_t min_spaced_def_level = descr_->max_definition_level(); + void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values, + const uint8_t* valid_bits, int64_t valid_bits_offset) { if (descr_->schema_node()->is_optional()) { - min_spaced_def_level--; - } - for (int64_t i = 0; i < num_levels; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_write; - } - 
if (def_levels[i] >= min_spaced_def_level) { - ++spaced_values_to_write; - } + dynamic_cast(current_encoder_.get()) + ->PutSpaced(values, static_cast(num_spaced_values), valid_bits, + valid_bits_offset); + } else { + dynamic_cast(current_encoder_.get()) + ->Put(values, static_cast(num_values)); } - - WriteDefinitionLevels(num_levels, def_levels); - } else { - // Required field, write all values - values_to_write = num_levels; - spaced_values_to_write = num_levels; - } - - // Not present for non-repeated fields - if (descr_->max_repetition_level() > 0) { - // A row could include more than one value - // Count the occasions where we start a new row - for (int64_t i = 0; i < num_levels; ++i) { - if (rep_levels[i] == 0) { - rows_written_++; - } + if (page_statistics_ != nullptr) { + const int64_t num_nulls = num_spaced_values - num_values; + page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values, + num_nulls); } - - WriteRepetitionLevels(num_levels, rep_levels); - } else { - // Each value is exactly one row - rows_written_ += static_cast(num_levels); } +}; - if (descr_->schema_node()->is_optional()) { - WriteValuesSpaced(spaced_values_to_write, valid_bits, valid_bits_offset, values); - } else { - WriteValues(values_to_write, values); - } - *num_spaced_written = spaced_values_to_write; +template +Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + // If this is the first time writing a DictionaryArray, then there's + // a few possible paths to take: + // + // - If dictionary encoding is not enabled, convert to densely + // encoded and call WriteArrow + // - Dictionary encoding enabled + // - If this is the first time this is called, then we call + // PutDictionary into the encoder and then PutIndices on each + // chunk. We store the dictionary that was written in + // preserved_dictionary_ so that subsequent calls to this method + // can make sure the dictionary has not changed + // - On subsequent calls, we have to check whether the dictionary + // has changed. If it has, then we trigger the varying + // dictionary path and materialize each chunk and then call + // WriteArrow with that + auto WriteDense = [&] { + std::shared_ptr dense_array; + RETURN_NOT_OK( + ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array)); + return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx); + }; - if (page_statistics_ != nullptr) { - page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, values_to_write, - spaced_values_to_write - values_to_write); + if (!IsDictionaryEncoding(current_encoder_->encoding()) || + !DictionaryDirectWriteSupported(array)) { + // No longer dictionary-encoding for whatever reason, maybe we never were + // or we decided to stop. Note that WriteArrow can be invoked multiple + // times with both dense and dictionary-encoded versions of the same data + // without a problem. 
Any dense data will be hashed to indices until the + // dictionary page limit is reached, at which everything (dictionary and + // dense) will fall back to plain encoding + return WriteDense(); } - num_buffered_values_ += num_levels; - num_buffered_encoded_values_ += values_to_write; + auto dict_encoder = dynamic_cast*>(current_encoder_.get()); + const auto& data = checked_cast(array); + std::shared_ptr dictionary = data.dictionary(); + std::shared_ptr indices = data.indices(); - if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { - AddDataPage(); - } - if (has_dictionary_ && !fallback_) { - CheckDictionarySizeLimit(); - } + int64_t value_offset = 0; + auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + dict_encoder->PutIndices(*indices->Slice(value_offset, batch_num_spaced_values)); + CommitWriteAndCheckPageLimit(batch_size, batch_num_values); + value_offset += batch_num_spaced_values; + }; - return values_to_write; -} + // Handle seeing dictionary for the first time + if (!preserved_dictionary_) { + // It's a new dictionary. Call PutDictionary and keep track of it + PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary)); -template -void TypedColumnWriterImpl::WriteBatch(int64_t num_values, - const int16_t* def_levels, - const int16_t* rep_levels, - const T* values) { - // We check for DataPage limits only after we have inserted the values. If a user - // writes a large number of values, the DataPage size can be much above the limit. - // The purpose of this chunking is to bound this. Even if a user writes large number - // of values, the chunking will ensure the AddDataPage() is called at a reasonable - // pagesize limit - int64_t write_batch_size = properties_->write_batch_size(); - int num_batches = static_cast(num_values / write_batch_size); - int64_t num_remaining = num_values % write_batch_size; - int64_t value_offset = 0; - for (int round = 0; round < num_batches; round++) { - int64_t offset = round * write_batch_size; - int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset], - &rep_levels[offset], &values[value_offset]); - value_offset += num_values; + // TODO(wesm): If some dictionary values are unobserved, then the + // statistics will be inaccurate. Do we care enough to fix it? + if (page_statistics_ != nullptr) { + PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary)); + } + preserved_dictionary_ = dictionary; + } else if (!dictionary->Equals(*preserved_dictionary_)) { + // Dictionary has changed + PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding()); + return WriteDense(); } - // Write the remaining values - int64_t offset = num_batches * write_batch_size; - WriteMiniBatch(num_remaining, &def_levels[offset], &rep_levels[offset], - &values[value_offset]); -} -template -void TypedColumnWriterImpl::WriteBatchSpaced( - int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, - const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values) { - // We check for DataPage limits only after we have inserted the values. If a user - // writes a large number of values, the DataPage size can be much above the limit. - // The purpose of this chunking is to bound this. 
Even if a user writes large number - // of values, the chunking will ensure the AddDataPage() is called at a reasonable - // pagesize limit - int64_t write_batch_size = properties_->write_batch_size(); - int num_batches = static_cast(num_values / write_batch_size); - int64_t num_remaining = num_values % write_batch_size; - int64_t num_spaced_written = 0; - int64_t values_offset = 0; - for (int round = 0; round < num_batches; round++) { - int64_t offset = round * write_batch_size; - WriteMiniBatchSpaced(write_batch_size, &def_levels[offset], &rep_levels[offset], - valid_bits, valid_bits_offset + values_offset, - values + values_offset, &num_spaced_written); - values_offset += num_spaced_written; - } - // Write the remaining values - int64_t offset = num_batches * write_batch_size; - WriteMiniBatchSpaced(num_remaining, &def_levels[offset], &rep_levels[offset], - valid_bits, valid_bits_offset + values_offset, - values + values_offset, &num_spaced_written); + PARQUET_CATCH_NOT_OK( + DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk)); + return Status::OK(); } // ---------------------------------------------------------------------- @@ -964,7 +1069,7 @@ void TypedColumnWriterImpl::WriteBatchSpaced( template struct SerializeFunctor { using ArrowCType = typename ArrowType::c_type; - using ArrayType = typename ::arrow::TypeTraits::ArrayType; + using ArrayType = typename arrow::TypeTraits::ArrayType; using ParquetCType = typename ParquetType::c_type; Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) { const ArrowCType* input = array.raw_values(); @@ -980,15 +1085,15 @@ struct SerializeFunctor { }; template -inline Status SerializeData(const ::arrow::Array& array, ArrowWriteContext* ctx, +inline Status SerializeData(const arrow::Array& array, ArrowWriteContext* ctx, typename ParquetType::c_type* out) { - using ArrayType = typename ::arrow::TypeTraits::ArrayType; + using ArrayType = typename arrow::TypeTraits::ArrayType; SerializeFunctor functor; return functor.Serialize(checked_cast(array), ctx, out); } template -Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels, +Status WriteArrowSerialize(const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { @@ -1013,12 +1118,12 @@ Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels, } template -Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, +Status WriteArrowZeroCopy(const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { using T = typename ParquetType::c_type; - const auto& data = static_cast(array); + const auto& data = static_cast(array); const T* values = nullptr; // The values buffer may be null if the array is empty (ARROW-2744) if (data.values() != nullptr) { @@ -1036,13 +1141,13 @@ Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, return Status::OK(); } -#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \ - case ::arrow::Type::ArrowEnum: \ - return WriteArrowSerialize( \ +#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \ + case arrow::Type::ArrowEnum: \ + return WriteArrowSerialize( \ array, num_levels, def_levels, rep_levels, ctx, this); #define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \ - case ::arrow::Type::ArrowEnum: \ + case arrow::Type::ArrowEnum: \ return 
WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, \ ctx, this); @@ -1056,43 +1161,34 @@ Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, // Write Arrow to BooleanType template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::BOOL) { - ARROW_UNSUPPORTED(); - } - bool* buffer = nullptr; - RETURN_NOT_OK(ctx->GetScratchData(array.length(), &buffer)); - - const auto& data = static_cast(array); - const uint8_t* values = nullptr; - // The values buffer may be null if the array is empty (ARROW-2744) - if (data.values() != nullptr) { - values = reinterpret_cast(data.values()->data()); - } else { - DCHECK_EQ(data.length(), 0); +struct SerializeFunctor { + Status Serialize(const arrow::BooleanArray& data, ArrowWriteContext*, bool* out) { + for (int i = 0; i < data.length(); i++) { + *out++ = data.Value(i); + } + return Status::OK(); } +}; - int buffer_idx = 0; - int64_t offset = array.offset(); - for (int i = 0; i < data.length(); i++) { - if (data.IsValid(i)) { - buffer[buffer_idx++] = BitUtil::GetBit(values, offset + i); - } +template <> +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::BOOL) { + ARROW_UNSUPPORTED(); } - PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, buffer)); - return Status::OK(); + return WriteArrowSerialize( + array, num_levels, def_levels, rep_levels, ctx, this); } // ---------------------------------------------------------------------- // Write Arrow types to INT32 template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) { +struct SerializeFunctor { + Status Serialize(const arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) { const int64_t* input = array.raw_values(); for (int i = 0; i < array.length(); i++) { *out++ = static_cast(*input++ / 86400000); @@ -1102,11 +1198,11 @@ struct SerializeFunctor { }; template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) { +struct SerializeFunctor { + Status Serialize(const arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) { const int32_t* input = array.raw_values(); - const auto& type = static_cast(*array.type()); - if (type.unit() == ::arrow::TimeUnit::SECOND) { + const auto& type = static_cast(*array.type()); + if (type.unit() == arrow::TimeUnit::SECOND) { for (int i = 0; i < array.length(); i++) { out[i] = input[i] * 1000; } @@ -1118,13 +1214,13 @@ struct SerializeFunctor { }; template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { - case ::arrow::Type::NA: { + case arrow::Type::NA: { PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr)); } break; WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type) @@ -1149,21 +1245,21 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, for (int64_t i = 0; i < 
array.length(); i++) ConversionFunction(input[i], &out[i]); template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) { +struct SerializeFunctor { + Status Serialize(const arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) { const int64_t* input = array.raw_values(); - const auto& type = static_cast(*array.type()); + const auto& type = static_cast(*array.type()); switch (type.unit()) { - case ::arrow::TimeUnit::NANO: + case arrow::TimeUnit::NANO: INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::MICRO: + case arrow::TimeUnit::MICRO: INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::MILLI: + case arrow::TimeUnit::MILLI: INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::SECOND: + case arrow::TimeUnit::SECOND: INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp); break; } @@ -1198,15 +1294,15 @@ static std::pair kTimestampCoercionFactors[4][4] = { {COERCE_MULTIPLY, 1}}}; template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx, +struct SerializeFunctor { + Status Serialize(const arrow::TimestampArray& array, ArrowWriteContext* ctx, int64_t* out) { - const auto& source_type = static_cast(*array.type()); + const auto& source_type = static_cast(*array.type()); auto source_unit = source_type.unit(); const int64_t* values = array.raw_values(); - ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit(); - auto target_type = ::arrow::timestamp(target_unit); + arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit(); + auto target_type = arrow::timestamp(target_unit); bool truncation_allowed = ctx->properties->truncated_timestamps_allowed(); auto DivideBy = [&](const int64_t factor) { @@ -1242,15 +1338,15 @@ struct SerializeFunctor { #undef COERCE_INVALID #undef COERCE_MULTIPLY -Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, +Status WriteTimestamps(const arrow::Array& values, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { - const auto& source_type = static_cast(*values.type()); + const auto& source_type = static_cast(*values.type()); auto WriteCoerce = [&](const ArrowWriterProperties* properties) { ArrowWriteContext temp_ctx = *ctx; temp_ctx.properties = properties; - return WriteArrowSerialize( + return WriteArrowSerialize( values, num_levels, def_levels, rep_levels, &temp_ctx, writer); }; @@ -1264,21 +1360,21 @@ Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, return WriteCoerce(ctx->properties); } } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 && - source_type.unit() == ::arrow::TimeUnit::NANO) { + source_type.unit() == arrow::TimeUnit::NANO) { // Absent superseding user instructions, when writing Parquet version 1.0 files, // timestamps in nanoseconds are coerced to microseconds std::shared_ptr properties = (ArrowWriterProperties::Builder()) - .coerce_timestamps(::arrow::TimeUnit::MICRO) + .coerce_timestamps(arrow::TimeUnit::MICRO) ->disallow_truncated_timestamps() ->build(); return WriteCoerce(properties.get()); - } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) { + } else if (source_type.unit() == arrow::TimeUnit::SECOND) { // Absent superseding user instructions, timestamps in seconds are coerced to // milliseconds 
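Stripped of the builder plumbing, the factor-table-driven coercion above is just a unit-factor multiply or divide, with an optional refusal to truncate when dividing; an illustrative reduction (names and error type are placeholders, not the patch's DivideBy lambda):

#include <cstdint>
#include <sstream>
#include <stdexcept>

// Illustration only: coarser target units divide (e.g. NANO -> MICRO uses
// factor 1000) and may reject values that would truncate; finer units multiply.
int64_t CoerceTimestampValue(int64_t value, int64_t factor, bool divide,
                             bool truncation_allowed) {
  if (divide) {
    if (!truncation_allowed && value % factor != 0) {
      std::ostringstream msg;
      msg << "Casting would lose data: " << value;
      throw std::runtime_error(msg.str());
    }
    return value / factor;
  }
  return value * factor;
}
// CoerceTimestampValue(1600000000, 1000, true, false) == 1600000, while
// CoerceTimestampValue(1600000001, 1000, true, false) throws.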
std::shared_ptr properties = (ArrowWriterProperties::Builder()) - .coerce_timestamps(::arrow::TimeUnit::MILLI) + .coerce_timestamps(arrow::TimeUnit::MILLI) ->build(); return WriteCoerce(properties.get()); } else { @@ -1289,13 +1385,13 @@ Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { - case ::arrow::Type::TIMESTAMP: + case arrow::Type::TIMESTAMP: return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this); WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type) WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type) @@ -1307,15 +1403,15 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::TIMESTAMP) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::TIMESTAMP) { ARROW_UNSUPPORTED(); } - return WriteArrowSerialize( + return WriteArrowSerialize( array, num_levels, def_levels, rep_levels, ctx, this); } @@ -1323,12 +1419,12 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // Floating point types template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::FLOAT) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::FLOAT) { ARROW_UNSUPPORTED(); } return WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, ctx, @@ -1336,12 +1432,12 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::DOUBLE) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::DOUBLE) { ARROW_UNSUPPORTED(); } return WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, ctx, @@ -1351,51 +1447,37 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // ---------------------------------------------------------------------- // Write Arrow to BYTE_ARRAY -template -struct SerializeFunctor> { - Status Serialize(const ::arrow::BinaryArray& array, ArrowWriteContext*, - ByteArray* out) { - // In the case of an array consisting of only empty strings or all null, - // array.data() points already to a nullptr, thus array.data()->data() will - // segfault. 
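On the BYTE_ARRAY side that this section now turns to, the offset/raw-pointer bookkeeping (and the null-buffer guard described in that comment) can go away because each element is handed to the encoder as a (pointer, length) view; schematically:

#include <cstdint>
#include "arrow/array.h"             // arrow::BinaryArray
#include "arrow/util/string_view.h"  // arrow::util::string_view
#include "parquet/types.h"           // parquet::ByteArray

// Schematic only: how a BinaryArray element maps onto the ByteArray value type
// the encoders consume; GetView() already accounts for the slice offset, so no
// raw value_data()/offsets handling is needed.
parquet::ByteArray ViewAsByteArray(const arrow::BinaryArray& array, int64_t i) {
  arrow::util::string_view v = array.GetView(i);
  return parquet::ByteArray(static_cast<uint32_t>(v.size()),
                            reinterpret_cast<const uint8_t*>(v.data()));
}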
- const uint8_t* values = nullptr; - if (array.value_data()) { - values = reinterpret_cast(array.value_data()->data()); - DCHECK(values != nullptr); - } +template <> +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type()->id() != arrow::Type::BINARY && + array.type()->id() != arrow::Type::STRING) { + ARROW_UNSUPPORTED(); + } - // Slice offset is accounted for in raw_value_offsets - const int32_t* value_offset = array.raw_value_offsets(); - if (array.null_count() == 0) { - // no nulls, just dump the data - for (int64_t i = 0; i < array.length(); i++) { - out[i] = - ByteArray(value_offset[i + 1] - value_offset[i], values + value_offset[i]); - } - } else { - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsValid(i)) { - out[i] = - ByteArray(value_offset[i + 1] - value_offset[i], values + value_offset[i]); - } - } + int64_t value_offset = 0; + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + std::shared_ptr data_slice = + array.Slice(value_offset, batch_num_spaced_values); + current_encoder_->Put(*data_slice); + if (page_statistics_ != nullptr) { + page_statistics_->Update(*data_slice); } - return Status::OK(); - } -}; + CommitWriteAndCheckPageLimit(batch_size, batch_num_values); + CheckDictionarySizeLimit(); + value_offset += batch_num_spaced_values; + }; -template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - switch (array.type()->id()) { - WRITE_SERIALIZE_CASE(BINARY, BinaryType, ByteArrayType) - WRITE_SERIALIZE_CASE(STRING, BinaryType, ByteArrayType) - default: - ARROW_UNSUPPORTED(); - } + PARQUET_CATCH_NOT_OK( + DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk)); + return Status::OK(); } // ---------------------------------------------------------------------- @@ -1403,8 +1485,8 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level template struct SerializeFunctor> { - Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*, + arrow::enable_if_fixed_size_binary> { + Status Serialize(const arrow::FixedSizeBinaryArray& array, ArrowWriteContext*, FLBA* out) { if (array.null_count() == 0) { // no nulls, just dump the data @@ -1424,17 +1506,17 @@ struct SerializeFunctor -Status WriteArrowSerialize( - const ::arrow::Array& array, int64_t num_levels, const int16_t* def_levels, +Status WriteArrowSerialize( + const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { - const auto& data = static_cast(array); + const auto& data = static_cast(array); const int64_t length = data.length(); FLBA* buffer; RETURN_NOT_OK(ctx->GetScratchData(num_levels, &buffer)); - const auto& decimal_type = static_cast(*data.type()); + const auto& decimal_type = static_cast(*data.type()); const int32_t offset = decimal_type.byte_width() - internal::DecimalSize(decimal_type.precision()); @@ -1452,8 +1534,8 @@ Status WriteArrowSerialize( // todo(advancedxy): use a writeBatch to avoid this step for (int64_t i = 0, j = 0; i < length; ++i, j += 2) { auto unsigned_64_bit = 
reinterpret_cast(data.GetValue(i)); - big_endian_values[j] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); - big_endian_values[j + 1] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); + big_endian_values[j] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); + big_endian_values[j + 1] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); buffer[i] = FixedLenByteArray( reinterpret_cast(&big_endian_values[j]) + offset); } @@ -1461,8 +1543,8 @@ Status WriteArrowSerialize( for (int64_t i = 0, buffer_idx = 0, j = 0; i < length; ++i) { if (data.IsValid(i)) { auto unsigned_64_bit = reinterpret_cast(data.GetValue(i)); - big_endian_values[j] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); - big_endian_values[j + 1] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); + big_endian_values[j] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); + big_endian_values[j + 1] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); buffer[buffer_idx++] = FixedLenByteArray( reinterpret_cast(&big_endian_values[j]) + offset); j += 2; @@ -1474,11 +1556,11 @@ Status WriteArrowSerialize( } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType) WRITE_SERIALIZE_CASE(DECIMAL, Decimal128Type, FLBAType) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 9497534c823..ccd456afce2 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -54,7 +54,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) { int nbytes = static_cast(BitUtil::BytesForBits(nvalues)); std::vector draws; - ::arrow::random_is_valid(nvalues, 0.5 /* null prob */, &draws, 0 /* seed */); + arrow::random_is_valid(nvalues, 0.5 /* null prob */, &draws, 0 /* seed */); std::unique_ptr encoder = MakeTypedEncoder(Encoding::PLAIN); @@ -75,7 +75,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) { ASSERT_EQ(nvalues, values_decoded); for (int i = 0; i < nvalues; ++i) { - ASSERT_EQ(draws[i], ::arrow::BitUtil::GetBit(decode_data, i)) << i; + ASSERT_EQ(draws[i], arrow::BitUtil::GetBit(decode_data, i)) << i; } } @@ -260,7 +260,7 @@ class TestDictionaryEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void CheckRoundtrip() { - std::vector valid_bits(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + std::vector valid_bits(arrow::BitUtil::BytesForBits(num_values_) + 1, 255); auto base_encoder = MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get()); auto encoder = @@ -327,8 +327,8 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { class TestArrowBuilderDecoding : public ::testing::Test { public: - using DenseBuilder = ::arrow::internal::ChunkedBinaryBuilder; - using DictBuilder = ::arrow::BinaryDictionary32Builder; + using DenseBuilder = arrow::internal::ChunkedBinaryBuilder; + using DictBuilder = arrow::BinaryDictionary32Builder; void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; } void TearDown() override {} @@ -343,7 +343,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { constexpr int repeat = 100; constexpr int64_t min_length = 2; constexpr int64_t max_length = 10; - ::arrow::random::RandomArrayGenerator rag(0); + 
arrow::random::RandomArrayGenerator rag(0); expected_dense_ = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, max_length, null_probability); @@ -356,7 +356,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { ASSERT_OK(builder->Finish(&expected_dict_)); // Initialize input_data_ for the encoder from the expected_array_ values - const auto& binary_array = static_cast(*expected_dense_); + const auto& binary_array = static_cast(*expected_dense_); input_data_.resize(binary_array.length()); for (int64_t i = 0; i < binary_array.length(); ++i) { @@ -382,8 +382,8 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDense(int actual_num_values, Builder& builder) { - ASSERT_EQ(actual_num_values, num_values_); - ::arrow::ArrayVector actual_vec; + ASSERT_EQ(actual_num_values, num_values_ - null_count_); + arrow::ArrayVector actual_vec; ASSERT_OK(builder.Finish(&actual_vec)); ASSERT_EQ(actual_vec.size(), 1); ASSERT_ARRAYS_EQUAL(*actual_vec[0], *expected_dense_); @@ -391,8 +391,8 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDict(int actual_num_values, Builder& builder) { - ASSERT_EQ(actual_num_values, num_values_); - std::shared_ptr<::arrow::Array> actual; + ASSERT_EQ(actual_num_values, num_values_ - null_count_); + std::shared_ptr actual; ASSERT_OK(builder.Finish(&actual)); ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); } @@ -439,8 +439,8 @@ class TestArrowBuilderDecoding : public ::testing::Test { protected: std::vector null_probabilities_; - std::shared_ptr<::arrow::Array> expected_dict_; - std::shared_ptr<::arrow::Array> expected_dense_; + std::shared_ptr expected_dict_; + std::shared_ptr expected_dense_; int num_values_; int null_count_; std::vector input_data_; @@ -480,6 +480,143 @@ TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } +TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 + + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const double null_probability = 0.25; + + auto CheckSeed = [&](int seed) { + arrow::random::RandomArrayGenerator rag(seed); + auto values = rag.String(size, min_length, max_length, null_probability); + + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + + arrow::StringBuilder builder; + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), + values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + ASSERT_EQ(50, result->length()); + arrow::AssertArraysEqual(*values, *result); + + // Type checked + auto i32_values = rag.Int32(size, 0, 10, null_probability); + ASSERT_THROW(encoder->Put(*i32_values), ParquetException); + }; + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + CheckSeed(seed); + } +} + +void GetBinaryDictDecoder(DictEncoder* encoder, int64_t num_values, + std::shared_ptr* out_values, + std::shared_ptr* out_dict, + std::unique_ptr* out_decoder) { + auto decoder = MakeDictDecoder(); + auto buf = encoder->FlushValues(); + auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); + 
encoder->WriteDict(dict_buf->mutable_data()); + + auto dict_decoder = MakeTypedDecoder(Encoding::PLAIN); + dict_decoder->SetData(encoder->num_entries(), dict_buf->data(), + static_cast(dict_buf->size())); + + decoder->SetData(static_cast(num_values), buf->data(), + static_cast(buf->size())); + decoder->SetDict(dict_decoder.get()); + + *out_values = buf; + *out_dict = dict_buf; + *out_decoder = std::unique_ptr( + dynamic_cast(decoder.release())); +} + +TEST(DictEncodingAdHoc, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 + const int64_t size = 50; + const int64_t min_length = 0; + const int64_t max_length = 10; + const double null_probability = 0.1; + arrow::random::RandomArrayGenerator rag(0); + auto values = rag.String(size, min_length, max_length, null_probability); + + auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, + /*use_dictionary=*/true); + + auto encoder = dynamic_cast*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->Put(*values)); + + std::unique_ptr decoder; + std::shared_ptr buf, dict_buf; + int num_values = static_cast(values->length() - values->null_count()); + GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); + + arrow::StringBuilder builder; + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + arrow::AssertArraysEqual(*values, *result); +} + +TEST(DictEncodingAdHoc, PutDictionaryPutIndices) { + // Part of ARROW-3246 + auto dict_values = arrow::ArrayFromJSON(arrow::binary(), "[\"foo\", \"bar\", \"baz\"]"); + auto indices = arrow::ArrayFromJSON(arrow::int32(), "[0, 1, 2]"); + auto indices_nulls = arrow::ArrayFromJSON(arrow::int32(), "[null, 0, 1, null, 2]"); + + auto expected = arrow::ArrayFromJSON(arrow::binary(), + "[\"foo\", \"bar\", \"baz\", null, " + "\"foo\", \"bar\", null, \"baz\"]"); + + auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, + /*use_dictionary=*/true); + auto owned_decoder = MakeDictDecoder(); + + auto encoder = dynamic_cast*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->PutDictionary(*dict_values)); + + // Trying to call PutDictionary again throws + ASSERT_THROW(encoder->PutDictionary(*dict_values), ParquetException); + + ASSERT_NO_THROW(encoder->PutIndices(*indices)); + ASSERT_NO_THROW(encoder->PutIndices(*indices_nulls)); + + std::unique_ptr decoder; + std::shared_ptr buf, dict_buf; + int num_values = static_cast(expected->length() - expected->null_count()); + GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); + + arrow::BinaryBuilder builder; + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(expected->length()), + static_cast(expected->null_count()), + expected->null_bitmap_data(), + expected->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + arrow::AssertArraysEqual(*expected, *result); +} + class DictEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 57d3dd73869..cd4518ebf3f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -46,8 +46,7 @@ constexpr int64_t kInMemoryDefaultCapacity = 1024; class EncoderImpl : virtual public Encoder { public: - EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, - ::arrow::MemoryPool* pool) + EncoderImpl(const ColumnDescriptor* descr, Encoding::type 
encoding, MemoryPool* pool) : descr_(descr), encoding_(encoding), pool_(pool), @@ -55,13 +54,13 @@ class EncoderImpl : virtual public Encoder { Encoding::type encoding() const override { return encoding_; } - ::arrow::MemoryPool* memory_pool() const override { return pool_; } + MemoryPool* memory_pool() const override { return pool_; } protected: // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY const ColumnDescriptor* descr_; const Encoding::type encoding_; - ::arrow::MemoryPool* pool_; + MemoryPool* pool_; /// Type length from descr int type_length_; @@ -75,38 +74,60 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { public: using T = typename DType::c_type; - explicit PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool) + : EncoderImpl(descr, Encoding::PLAIN, pool) { + values_sink_ = CreateOutputStream(pool); + } - int64_t EstimatedDataEncodedSize() override; - std::shared_ptr FlushValues() override; + int64_t EstimatedDataEncodedSize() override { + int64_t position = -1; + PARQUET_THROW_NOT_OK(values_sink_->Tell(&position)); + return position; + } + + std::shared_ptr FlushValues() override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(values_sink_->Finish(&buffer)); + values_sink_ = CreateOutputStream(this->pool_); + return buffer; + } void Put(const T* buffer, int num_values) override; - protected: - std::shared_ptr<::arrow::io::BufferOutputStream> values_sink_; -}; + void Put(const arrow::Array& values) override; -template -PlainEncoder::PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) - : EncoderImpl(descr, Encoding::PLAIN, pool) { - values_sink_ = CreateOutputStream(pool); -} -template -int64_t PlainEncoder::EstimatedDataEncodedSize() { - int64_t position = -1; - PARQUET_THROW_NOT_OK(values_sink_->Tell(&position)); - return position; -} + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); + } -template -std::shared_ptr PlainEncoder::FlushValues() { - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(values_sink_->Finish(&buffer)); - values_sink_ = CreateOutputStream(this->pool_); - return buffer; -} + void Put(const ByteArray& val) { + // Write the result to the output stream + PARQUET_THROW_NOT_OK(values_sink_->Write(reinterpret_cast(&val.len), + sizeof(uint32_t))); + if (val.len > 0) { + DCHECK(nullptr != val.ptr) << "Value ptr cannot be NULL"; + } + PARQUET_THROW_NOT_OK( + values_sink_->Write(reinterpret_cast(val.ptr), val.len)); + } + + protected: + std::shared_ptr values_sink_; +}; template void PlainEncoder::Put(const T* buffer, int num_values) { @@ -117,17 +138,45 @@ void PlainEncoder::Put(const T* buffer, int num_values) { template <> inline void PlainEncoder::Put(const ByteArray* src, int num_values) { for (int i = 0; i < num_values; ++i) { - // Write the result to the output stream - PARQUET_THROW_NOT_OK(values_sink_->Write( - 
reinterpret_cast(&src[i].len), sizeof(uint32_t))); - if (src[i].len > 0) { - DCHECK(nullptr != src[i].ptr) << "Value ptr cannot be NULL"; + Put(src[i]); + } +} + +template +void PlainEncoder::Put(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} + +void AssertBinary(const arrow::Array& values) { + if (values.type_id() != arrow::Type::BINARY && + values.type_id() != arrow::Type::STRING) { + throw ParquetException("Only BinaryArray and subclasses supported"); + } +} + +template +void PutBinaryArray(const arrow::Array& values, EncoderType* encoder) { + AssertBinary(values); + const auto& data = checked_cast(values); + if (data.null_count() == 0) { + // no nulls, just dump the data + for (int64_t i = 0; i < data.length(); i++) { + encoder->Put(ByteArray(data.GetView(i))); + } + } else { + for (int64_t i = 0; i < data.length(); i++) { + if (data.IsValid(i)) { + encoder->Put(ByteArray(data.GetView(i))); + } } - PARQUET_THROW_NOT_OK( - values_sink_->Write(reinterpret_cast(src[i].ptr), src[i].len)); } } +template <> +void PlainEncoder::Put(const arrow::Array& values) { + PutBinaryArray(values, this); +} + template <> inline void PlainEncoder::Put(const FixedLenByteArray* src, int num_values) { for (int i = 0; i < num_values; ++i) { @@ -140,13 +189,6 @@ inline void PlainEncoder::Put(const FixedLenByteArray* src, int num_va } } -class PlainByteArrayEncoder : public PlainEncoder, - virtual public ByteArrayEncoder { - public: - using BASE = PlainEncoder; - using BASE::PlainEncoder; -}; - class PlainFLBAEncoder : public PlainEncoder, virtual public FLBAEncoder { public: using BASE = PlainEncoder; @@ -157,9 +199,8 @@ class PlainBooleanEncoder : public EncoderImpl, virtual public TypedEncoder, virtual public BooleanEncoder { public: - explicit PlainBooleanEncoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + explicit PlainBooleanEncoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()); int64_t EstimatedDataEncodedSize() override; std::shared_ptr FlushValues() override; @@ -167,11 +208,33 @@ class PlainBooleanEncoder : public EncoderImpl, void Put(const bool* src, int num_values) override; void Put(const std::vector& src, int num_values) override; + void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); + } + + void Put(const arrow::Array& values) override { + ParquetException::NYI("Direct Arrow to Boolean writes not implemented"); + } + private: int bits_available_; - std::unique_ptr<::arrow::BitUtil::BitWriter> bit_writer_; + std::unique_ptr bit_writer_; std::shared_ptr bits_buffer_; - std::shared_ptr<::arrow::io::BufferOutputStream> values_sink_; + std::shared_ptr values_sink_; template void PutImpl(const SequenceType& src, int num_values); @@ -217,8 +280,7 @@ void PlainBooleanEncoder::PutImpl(const SequenceType& src, int num_values) { } } -PlainBooleanEncoder::PlainBooleanEncoder(const ColumnDescriptor* descr, - 
::arrow::MemoryPool* pool) +PlainBooleanEncoder::PlainBooleanEncoder(const ColumnDescriptor* descr, MemoryPool* pool) : EncoderImpl(descr, Encoding::PLAIN, pool), bits_available_(kInMemoryDefaultCapacity * 8), bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)) { @@ -262,24 +324,29 @@ void PlainBooleanEncoder::Put(const std::vector& src, int num_values) { template struct DictEncoderTraits { using c_type = typename DType::c_type; - using MemoTableType = ::arrow::internal::ScalarMemoTable; + using MemoTableType = arrow::internal::ScalarMemoTable; }; template <> struct DictEncoderTraits { - using MemoTableType = ::arrow::internal::BinaryMemoTable; + using MemoTableType = arrow::internal::BinaryMemoTable; }; template <> struct DictEncoderTraits { - using MemoTableType = ::arrow::internal::BinaryMemoTable; + using MemoTableType = arrow::internal::BinaryMemoTable; }; -/// See the dictionary encoding section of https://github.com/Parquet/parquet-format. -/// The encoding supports streaming encoding. Values are encoded as they are added while -/// the dictionary is being constructed. At any time, the buffered values can be -/// written out with the current dictionary size. More values can then be added to -/// the encoder, including new dictionary entries. +// Initially 1024 elements +static constexpr int32_t kInitialHashTableSize = 1 << 10; + +/// See the dictionary encoding section of +/// https://github.com/Parquet/parquet-format. The encoding supports +/// streaming encoding. Values are encoded as they are added while the +/// dictionary is being constructed. At any time, the buffered values +/// can be written out with the current dictionary size. More values +/// can then be added to the encoder, including new dictionary +/// entries. template class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { using MemoTableType = typename DictEncoderTraits::MemoTableType; @@ -287,9 +354,10 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { public: typedef typename DType::c_type T; - explicit DictEncoderImpl( - const ColumnDescriptor* desc, - ::arrow::MemoryPool* allocator = ::arrow::default_memory_pool()); + explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool) + : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), + dict_encoded_size_(0), + memo_table_(pool, kInitialHashTableSize) {} ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); } @@ -301,7 +369,7 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { ++buffer; --buffer_len; - ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width()); + arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width()); for (int index : buffered_indices_) { if (!encoder.Put(index)) return -1; } @@ -315,20 +383,96 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { /// Returns a conservative estimate of the number of bytes needed to encode the buffered /// indices. Used to size the buffer passed to WriteIndices(). - int64_t EstimatedDataEncodedSize() override; + int64_t EstimatedDataEncodedSize() override { + // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to + // reserve + // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used + // but not reserving them would cause the encoder to fail. 
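A worked instance of the sizing described in that comment (using the RleEncoder helpers from arrow/util/rle_encoding.h that the code above calls):

#include <cstdint>
#include "arrow/util/rle_encoding.h"

// Worked example, not part of the patch: a 1000-entry dictionary needs a
// bit_width() of 10, so with 4096 buffered indices the encoder reserves
// 1 byte (bit-width header) + MaxBufferSize(10, 4096) + MinBufferSize(10)
// bytes before WriteIndices(); FlushValues() then trims the buffer to the
// bytes actually written.
int64_t EstimatedIndexPageBytes(int bit_width, int num_buffered_indices) {
  return 1 +
         arrow::util::RleEncoder::MaxBufferSize(bit_width, num_buffered_indices) +
         arrow::util::RleEncoder::MinBufferSize(bit_width);
}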
+ return 1 + + arrow::util::RleEncoder::MaxBufferSize( + bit_width(), static_cast(buffered_indices_.size())) + + arrow::util::RleEncoder::MinBufferSize(bit_width()); + } /// The minimum bit width required to encode the currently buffered indices. - int bit_width() const override; + int bit_width() const override { + if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0; + if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1; + return BitUtil::Log2(num_entries()); + } /// Encode value. Note that this does not actually write any data, just /// buffers the value's index to be written later. inline void Put(const T& value); - void Put(const T* values, int num_values) override; - std::shared_ptr FlushValues() override; + void Put(const T* src, int num_values) override { + for (int32_t i = 0; i < num_values; i++) { + Put(src[i]); + } + } void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset) override; + int64_t valid_bits_offset) override { + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + Put(src[i]); + } + valid_bits_reader.Next(); + } + } + + void Put(const arrow::Array& values) override; + void PutDictionary(const arrow::Array& values) override; + + template + void PutIndicesTyped(const arrow::Array& data) { + using ArrayType = typename arrow::TypeTraits::ArrayType; + const auto& indices = checked_cast(data); + auto values = indices.raw_values(); + buffered_indices_.reserve( + buffered_indices_.size() + + static_cast(indices.length() - indices.null_count())); + if (indices.null_count() > 0) { + arrow::internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), + indices.offset(), indices.length()); + for (int64_t i = 0; i < indices.length(); ++i) { + if (valid_bits_reader.IsSet()) { + buffered_indices_.push_back(static_cast(values[i])); + } + valid_bits_reader.Next(); + } + } else { + for (int64_t i = 0; i < indices.length(); ++i) { + buffered_indices_.push_back(static_cast(values[i])); + } + } + } + + void PutIndices(const arrow::Array& data) override { + switch (data.type()->id()) { + case arrow::Type::INT8: + return PutIndicesTyped(data); + case arrow::Type::INT16: + return PutIndicesTyped(data); + case arrow::Type::INT32: + return PutIndicesTyped(data); + case arrow::Type::INT64: + return PutIndicesTyped(data); + default: + throw ParquetException("Dictionary indices were not signed integer"); + } + } + + std::shared_ptr FlushValues() override { + std::shared_ptr buffer = + AllocateBuffer(this->pool_, EstimatedDataEncodedSize()); + int result_size = WriteIndices(buffer->mutable_data(), + static_cast(EstimatedDataEncodedSize())); + PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false)); + return std::move(buffer); + } /// Writes out the encoded dictionary to buffer. buffer must be preallocated to /// dict_encoded_size() bytes. 
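Putting the pieces of the new experimental dictionary interface together, a condensed caller-side sketch (it mirrors the DictEncodingAdHoc.PutDictionaryPutIndices test earlier in this diff; ArrayFromJSON is the Arrow test helper used there):

#include <memory>
#include "arrow/api.h"
#include "arrow/testing/gtest_util.h"  // arrow::ArrayFromJSON (test helper)
#include "parquet/encoding.h"

void PutDictionaryThenIndices() {
  auto dict_values =
      arrow::ArrayFromJSON(arrow::binary(), R"(["foo", "bar", "baz"])");
  auto indices = arrow::ArrayFromJSON(arrow::int32(), "[0, 2, null, 1]");

  auto owned_encoder = parquet::MakeTypedEncoder<parquet::ByteArrayType>(
      parquet::Encoding::PLAIN, /*use_dictionary=*/true);
  auto encoder = dynamic_cast<parquet::DictEncoder<parquet::ByteArrayType>*>(
      owned_encoder.get());

  encoder->PutDictionary(*dict_values);  // only legal while the memo table is empty
  encoder->PutIndices(*indices);         // null slots skipped; valid indices buffered

  // Dictionary page payload:
  auto dict_buffer = parquet::AllocateBuffer(arrow::default_memory_pool(),
                                             encoder->dict_encoded_size());
  encoder->WriteDict(dict_buffer->mutable_data());
  // RLE/bit-packed index payload for the data page:
  std::shared_ptr<arrow::Buffer> index_buffer = encoder->FlushValues();
}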
@@ -350,66 +494,6 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { MemoTableType memo_table_; }; -// Initially 1024 elements -static constexpr int32_t INITIAL_HASH_TABLE_SIZE = 1 << 10; - -template -DictEncoderImpl::DictEncoderImpl(const ColumnDescriptor* desc, - ::arrow::MemoryPool* pool) - : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), - dict_encoded_size_(0), - memo_table_(pool, INITIAL_HASH_TABLE_SIZE) {} - -template -int64_t DictEncoderImpl::EstimatedDataEncodedSize() { - // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to - // reserve - // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used - // but not reserving them would cause the encoder to fail. - return 1 + - ::arrow::util::RleEncoder::MaxBufferSize( - bit_width(), static_cast(buffered_indices_.size())) + - ::arrow::util::RleEncoder::MinBufferSize(bit_width()); -} - -template -int DictEncoderImpl::bit_width() const { - if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0; - if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1; - return BitUtil::Log2(num_entries()); -} - -template -std::shared_ptr DictEncoderImpl::FlushValues() { - std::shared_ptr buffer = - AllocateBuffer(this->pool_, EstimatedDataEncodedSize()); - int result_size = - WriteIndices(buffer->mutable_data(), static_cast(EstimatedDataEncodedSize())); - PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false)); - return std::move(buffer); -} - -template -void DictEncoderImpl::Put(const T* src, int num_values) { - for (int32_t i = 0; i < num_values; i++) { - Put(src[i]); - } -} - -template -void DictEncoderImpl::PutSpaced(const T* src, int num_values, - const uint8_t* valid_bits, - int64_t valid_bits_offset) { - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); - for (int32_t i = 0; i < num_values; i++) { - if (valid_bits_reader.IsSet()) { - Put(src[i]); - } - valid_bits_reader.Next(); - } -} - template void DictEncoderImpl::WriteDict(uint8_t* buffer) { // For primitive types, only a memcpy @@ -420,7 +504,7 @@ void DictEncoderImpl::WriteDict(uint8_t* buffer) { // ByteArray and FLBA already have the dictionary encoded in their data heaps template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) { - memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) { + memo_table_.VisitValues(0, [&buffer](const arrow::util::string_view& v) { uint32_t len = static_cast(v.length()); memcpy(buffer, &len, sizeof(len)); buffer += sizeof(len); @@ -431,7 +515,7 @@ void DictEncoderImpl::WriteDict(uint8_t* buffer) { template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) { - memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) { + memo_table_.VisitValues(0, [&](const arrow::util::string_view& v) { DCHECK_EQ(v.length(), static_cast(type_length_)); memcpy(buffer, v.data(), type_length_); buffer += type_length_; @@ -479,25 +563,48 @@ inline void DictEncoderImpl::Put(const FixedLenByteArray& v) { buffered_indices_.push_back(memo_index); } -class DictByteArrayEncoder : public DictEncoderImpl, - virtual public ByteArrayEncoder { - public: - using BASE = DictEncoderImpl; - using BASE::DictEncoderImpl; -}; +template +void DictEncoderImpl::Put(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} -class DictFLBAEncoder : public DictEncoderImpl, virtual public FLBAEncoder { - public: - using BASE = DictEncoderImpl; - using BASE::DictEncoderImpl; -}; +template <> +void DictEncoderImpl::Put(const 
arrow::Array& values) { + PutBinaryArray(values, this); +} + +template +void DictEncoderImpl::PutDictionary(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} + +template <> +void DictEncoderImpl::PutDictionary(const arrow::Array& values) { + AssertBinary(values); + if (this->num_entries() > 0) { + throw ParquetException("Can only call PutDictionary on an empty DictEncoder"); + } + + const auto& data = checked_cast(values); + if (data.null_count() > 0) { + throw ParquetException("Inserted binary dictionary cannot contain nulls"); + } + for (int64_t i = 0; i < data.length(); i++) { + auto v = data.GetView(i); + dict_encoded_size_ += static_cast(v.size() + sizeof(uint32_t)); + ARROW_IGNORE_EXPR( + memo_table_.GetOrInsert(v.data(), static_cast(v.size()), + /*on_found=*/[](int32_t memo_index) {}, + /*on_not_found=*/[](int32_t memo_index) {})); + } +} // ---------------------------------------------------------------------- // Encoder and decoder factory functions std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encoding, bool use_dictionary, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + MemoryPool* pool) { if (use_dictionary) { switch (type_num) { case Type::INT32: @@ -511,9 +618,9 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::unique_ptr(new DictEncoderImpl(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new DictByteArrayEncoder(descr, pool)); + return std::unique_ptr(new DictEncoderImpl(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: - return std::unique_ptr(new DictFLBAEncoder(descr, pool)); + return std::unique_ptr(new DictEncoderImpl(descr, pool)); default: DCHECK(false) << "Encoder not implemented"; break; @@ -533,9 +640,9 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::unique_ptr(new PlainEncoder(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new PlainByteArrayEncoder(descr, pool)); + return std::unique_ptr(new PlainEncoder(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: - return std::unique_ptr(new PlainFLBAEncoder(descr, pool)); + return std::unique_ptr(new PlainEncoder(descr, pool)); default: DCHECK(false) << "Encoder not implemented"; break; @@ -665,7 +772,7 @@ class PlainBooleanDecoder : public DecoderImpl, int Decode(bool* buffer, int max_values) override; private: - std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_; + std::unique_ptr bit_reader_; }; PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr) @@ -679,7 +786,7 @@ void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) { max_values = std::min(max_values, num_values_); bool val; - ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); + arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); for (int i = 0; i < max_values; ++i) { if (!bit_reader_->GetValue(1, &val)) { ParquetException::EofException(); @@ -712,7 +819,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -721,7 +828,15 @@ class PlainByteArrayDecoder : public
PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, arrow::BinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -729,14 +844,14 @@ class PlainByteArrayDecoder : public PlainDecoder, } int DecodeArrowNonNull(int num_values, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; } int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; @@ -744,17 +859,17 @@ class PlainByteArrayDecoder : public PlainDecoder, private: template - ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* values_decoded) { - num_values = std::min(num_values, num_values_); + arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_values_decoded) { RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); int increment; int i = 0; const uint8_t* data = data_; int64_t data_size = len_; int bytes_decoded = 0; + int values_decoded = 0; while (i < num_values) { if (bit_reader.IsSet()) { uint32_t len = arrow::util::SafeLoadAs(data); @@ -766,6 +881,7 @@ class PlainByteArrayDecoder : public PlainDecoder, data += increment; data_size -= increment; bytes_decoded += increment; + ++values_decoded; } else { RETURN_NOT_OK(builder->AppendNull()); } @@ -775,14 +891,14 @@ class PlainByteArrayDecoder : public PlainDecoder, data_ += bytes_decoded; len_ -= bytes_decoded; - num_values_ -= num_values; - *values_decoded = num_values; - return ::arrow::Status::OK(); + num_values_ -= values_decoded; + *out_values_decoded = values_decoded; + return arrow::Status::OK(); } template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, - int* values_decoded) { + arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* values_decoded) { num_values = std::min(num_values, num_values_); RETURN_NOT_OK(builder->Reserve(num_values)); int i = 0; @@ -805,7 +921,7 @@ class PlainByteArrayDecoder : public PlainDecoder, len_ -= bytes_decoded; num_values_ -= num_values; *values_decoded = num_values; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } }; @@ -827,7 +943,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // dictionary is not guaranteed to persist in memory after this call so the // dictionary decoder needs to copy the data out if necessary. 
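The net effect of the counting changes in DecodeArrow above is a contract worth stating from the caller's side: num_values and null_count describe the slots to consume from the validity bitmap, while the return value counts only the non-null values appended. A sketch in the spirit of the ArrowBinaryDirectPut tests (the decoder and the array it was fed are assumed from that context):

#include <cassert>
#include <memory>
#include "arrow/array.h"
#include "arrow/builder.h"
#include "parquet/encoding.h"
#include "parquet/exception.h"

void DecodeAllValues(parquet::ByteArrayDecoder* decoder,
                     const std::shared_ptr<arrow::Array>& values) {
  arrow::BinaryBuilder builder;
  int decoded = decoder->DecodeArrow(static_cast<int>(values->length()),
                                     static_cast<int>(values->null_count()),
                                     values->null_bitmap_data(), values->offset(),
                                     &builder);
  // The return value counts only the encoded (non-null) values appended.
  assert(decoded == static_cast<int>(values->length() - values->null_count()));

  std::shared_ptr<arrow::Array> result;
  PARQUET_THROW_NOT_OK(builder.Finish(&result));
  (void)result;
}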
explicit DictDecoderImpl(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::RLE_DICTIONARY), dictionary_(AllocateBuffer(pool, 0)), dictionary_length_(0), @@ -844,7 +960,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { uint8_t bit_width = *data; ++data; --len; - idx_decoder_ = ::arrow::util::RleDecoder(data, len, bit_width); + idx_decoder_ = arrow::util::RleDecoder(data, len, bit_width); } int Decode(T* buffer, int num_values) override { @@ -870,12 +986,11 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { return num_values; } - void InsertDictionary(::arrow::ArrayBuilder* builder) override; + void InsertDictionary(arrow::ArrayBuilder* builder) override; int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::ArrayBuilder* builder) override { - num_values = std::min(num_values, num_values_); + arrow::ArrayBuilder* builder) override { if (num_values > 0) { // TODO(wesm): Refactor to batch reads for improved memory use. It is not // trivial because the null_count is relative to the entire bitmap @@ -893,20 +1008,20 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { /// XXX(wesm): Cannot append "valid bits" directly to the builder std::vector valid_bytes(num_values); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); for (int64_t i = 0; i < num_values; ++i) { valid_bytes[i] = static_cast(bit_reader.IsSet()); bit_reader.Next(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); + auto binary_builder = checked_cast(builder); PARQUET_THROW_NOT_OK( binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); - num_values_ -= num_values; - return num_values; + num_values_ -= num_values - null_count; + return num_values - null_count; } - int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override { + int DecodeIndices(int num_values, arrow::ArrayBuilder* builder) override { num_values = std::min(num_values, num_values_); num_values = std::min(num_values, num_values_); if (num_values > 0) { @@ -921,7 +1036,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) { ParquetException::EofException(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); + auto binary_builder = checked_cast(builder); PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); num_values_ -= num_values; return num_values; @@ -956,7 +1071,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // BinaryDictionary32Builder std::shared_ptr indices_scratch_space_; - ::arrow::util::RleDecoder idx_decoder_; + arrow::util::RleDecoder idx_decoder_; }; template @@ -1019,17 +1134,17 @@ inline void DictDecoderImpl::SetDict(TypedDecoder* dictionar } template -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { +void DictDecoderImpl::InsertDictionary(arrow::ArrayBuilder* builder) { ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types"); } template <> -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { - auto binary_builder = 
checked_cast<::arrow::BinaryDictionary32Builder*>(builder); +void DictDecoderImpl::InsertDictionary(arrow::ArrayBuilder* builder) { + auto binary_builder = checked_cast(builder); // Make an BinaryArray referencing the internal dictionary data - auto arr = std::make_shared<::arrow::BinaryArray>( - dictionary_length_, byte_array_offsets_, byte_array_data_); + auto arr = std::make_shared(dictionary_length_, byte_array_offsets_, + byte_array_data_); PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } @@ -1041,7 +1156,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1050,7 +1165,15 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, arrow::BinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1058,14 +1181,14 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } int DecodeArrowNonNull(int num_values, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; } int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; @@ -1073,24 +1196,26 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, private: template - ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { + arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { constexpr int32_t buffer_size = 1024; int32_t indices_buffer[buffer_size]; + RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; - while (values_decoded < num_values) { + int num_appended = 0; + while (num_appended < num_values) { bool is_valid = bit_reader.IsSet(); bit_reader.Next(); if (is_valid) { int32_t batch_size = - std::min(buffer_size, num_values - values_decoded - null_count); + std::min(buffer_size, num_values - num_appended - null_count); int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); int i = 0; @@ -1100,11 +1225,12 @@ class DictByteArrayDecoderImpl : public 
DictDecoderImpl, const auto& val = dict_values[indices_buffer[i]]; RETURN_NOT_OK(builder->Append(val.ptr, val.len)); ++i; + ++values_decoded; } else { RETURN_NOT_OK(builder->AppendNull()); --null_count; } - ++values_decoded; + ++num_appended; if (i == num_indices) { // Do not advance the bit_reader if we have fulfilled the decode // request @@ -1116,20 +1242,20 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } else { RETURN_NOT_OK(builder->AppendNull()); --null_count; - ++values_decoded; + ++num_appended; } } - if (values_decoded != num_values) { - return ::arrow::Status::IOError("Expected to dictionary-decode ", num_values, - " but only able to decode ", values_decoded); + if (num_values != num_appended) { + return arrow::Status::IOError("Expected to dictionary-decode ", num_values, + " but only able to decode ", num_appended); } *out_num_values = values_decoded; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, - int* out_num_values) { + arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* out_num_values) { constexpr int32_t buffer_size = 2048; int32_t indices_buffer[buffer_size]; int values_decoded = 0; @@ -1151,7 +1277,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, ParquetException::EofException(); } *out_num_values = values_decoded; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } }; @@ -1170,7 +1296,7 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecodernum_values_ = num_values; - decoder_ = ::arrow::BitUtil::BitReader(data, len); + decoder_ = arrow::BitUtil::BitReader(data, len); values_current_block_ = 0; values_current_mini_block_ = 0; } @@ -1242,8 +1368,8 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder { public: - explicit DeltaLengthByteArrayDecoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY), len_decoder_(nullptr, pool) {} @@ -1303,9 +1428,8 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder { public: - explicit DeltaByteArrayDecoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY), prefix_len_decoder_(nullptr, pool), suffix_decoder_(nullptr, pool), @@ -1387,7 +1511,7 @@ namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + MemoryPool* pool) { switch (type_num) { case Type::BOOLEAN: ParquetException::NYI("Dictionary encoding not implemented for boolean type"); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 5aa1fed74b6..618fd1a4c0c 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -28,7 +28,10 @@ namespace arrow { +class Array; class ArrayBuilder; +class BinaryArray; +class BinaryBuilder; class BinaryDictionary32Builder; namespace internal { @@ -51,7 +54,9 @@ class Encoder { virtual std::shared_ptr FlushValues() = 0; virtual Encoding::type encoding() const = 0; - virtual ::arrow::MemoryPool* memory_pool() const = 
0; + virtual void Put(const ::arrow::Array& values) = 0; + + virtual MemoryPool* memory_pool() const = 0; }; // Base class for value encoders. Since encoders may or not have state (e.g., @@ -63,25 +68,12 @@ class TypedEncoder : virtual public Encoder { public: typedef typename DType::c_type T; + using Encoder::Put; + virtual void Put(const T* src, int num_values) = 0; virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset) { - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(::arrow::AllocateResizableBuffer( - this->memory_pool(), num_values * sizeof(T), &buffer)); - int32_t num_valid_values = 0; - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); - T* data = reinterpret_cast(buffer->mutable_data()); - for (int32_t i = 0; i < num_values; i++) { - if (valid_bits_reader.IsSet()) { - data[num_valid_values++] = src[i]; - } - valid_bits_reader.Next(); - } - Put(data, num_valid_values); - } + int64_t valid_bits_offset) = 0; }; // Base class for dictionary encoders @@ -105,6 +97,20 @@ class DictEncoder : virtual public TypedEncoder { virtual void WriteDict(uint8_t* buffer) = 0; virtual int num_entries() const = 0; + + /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is + /// assumed (without any boundschecking) that the indices reference + /// pre-existing dictionary values + /// \param[in] indices the dictionary index values. Only Int32Array currently + /// supported + virtual void PutIndices(const ::arrow::Array& indices) = 0; + + /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices + /// separately. Currently throws exception if the current dictionary memo is + /// non-empty + /// \param[in] values the dictionary values. 
Only valid for certain + /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray + virtual void PutDictionary(const ::arrow::Array& values) = 0; }; // ---------------------------------------------------------------------- @@ -204,8 +210,8 @@ using Int64Encoder = TypedEncoder; using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; -class ByteArrayEncoder : virtual public TypedEncoder {}; -class FLBAEncoder : virtual public TypedEncoder {}; +using ByteArrayEncoder = TypedEncoder; +using FLBAEncoder = TypedEncoder; class BooleanDecoder : virtual public TypedDecoder { public: @@ -223,6 +229,7 @@ class ByteArrayDecoder : virtual public TypedDecoder { public: using TypedDecoder::DecodeSpaced; + /// \brief Returns number of encoded values decoded virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::BinaryDictionary32Builder* builder) = 0; @@ -230,6 +237,11 @@ class ByteArrayDecoder : virtual public TypedDecoder { virtual int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionary32Builder* builder) = 0; + /// \brief Returns number of encoded values decoded + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, ::arrow::BinaryBuilder* builder) = 0; + + /// \brief Returns number of encoded values decoded virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; @@ -331,7 +343,7 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, template std::unique_ptr> MakeDictDecoder( - const ColumnDescriptor* descr, + const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = DictDecoder; auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 5410dc8367c..c8718f07d62 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -81,14 +81,14 @@ static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - return TypedStatistics::Make( + return MakeStatistics( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); } // Default behavior - return TypedStatistics::Make( + return MakeStatistics( descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index b7e55f0cc96..209969a0054 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -482,7 +482,8 @@ class PARQUET_EXPORT ArrowWriterProperties { : write_timestamps_as_int96_(false), coerce_timestamps_enabled_(false), coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), - truncated_timestamps_allowed_(false) {} + truncated_timestamps_allowed_(false), + store_schema_(false) {} virtual ~Builder() {} Builder* disable_deprecated_int96_timestamps() { @@ -511,10 +512,18 @@ class PARQUET_EXPORT 
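A rough usage sketch of the two experimental DictEncoder hooks above. The MakeTypedEncoder construction and the ArrayFromJSON test helper are assumptions made here for illustration only; they are not part of this patch.

    #include "arrow/testing/gtest_util.h"  // ArrayFromJSON (test-only convenience, assumed available)
    #include "parquet/encoding.h"

    void PutDictionaryThenIndices(const parquet::ColumnDescriptor* descr) {
      // Hypothetical construction; in practice a dictionary encoder is created
      // internally by the column writer.
      auto base = parquet::MakeTypedEncoder<parquet::ByteArrayType>(
          parquet::Encoding::PLAIN, /*use_dictionary=*/true, descr);
      auto* encoder =
          dynamic_cast<parquet::DictEncoder<parquet::ByteArrayType>*>(base.get());

      // PutDictionary must be called while the dictionary memo is still empty,
      // and the dictionary values are expected to contain no nulls.
      auto dictionary =
          ::arrow::ArrayFromJSON(::arrow::binary(), R"(["a", "b", "c"])");
      encoder->PutDictionary(*dictionary);

      // PutIndices appends the index values as-is; they are not bounds-checked
      // against the dictionary.
      auto indices = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 2, 2, 1]");
      encoder->PutIndices(*indices);
    }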
ArrowWriterProperties { return this; } + /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file, + /// to enable certain read options (like "read_dictionary") to be set + /// automatically + Builder* store_schema() { + store_schema_ = true; + return this; + } + std::shared_ptr build() { return std::shared_ptr(new ArrowWriterProperties( write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, - truncated_timestamps_allowed_)); + truncated_timestamps_allowed_, store_schema_)); } private: @@ -523,6 +532,8 @@ class PARQUET_EXPORT ArrowWriterProperties { bool coerce_timestamps_enabled_; ::arrow::TimeUnit::type coerce_timestamps_unit_; bool truncated_timestamps_allowed_; + + bool store_schema_; }; bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } @@ -534,20 +545,24 @@ class PARQUET_EXPORT ArrowWriterProperties { bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; } + bool store_schema() const { return store_schema_; } + private: explicit ArrowWriterProperties(bool write_nanos_as_int96, bool coerce_timestamps_enabled, ::arrow::TimeUnit::type coerce_timestamps_unit, - bool truncated_timestamps_allowed) + bool truncated_timestamps_allowed, bool store_schema) : write_timestamps_as_int96_(write_nanos_as_int96), coerce_timestamps_enabled_(coerce_timestamps_enabled), coerce_timestamps_unit_(coerce_timestamps_unit), - truncated_timestamps_allowed_(truncated_timestamps_allowed) {} + truncated_timestamps_allowed_(truncated_timestamps_allowed), + store_schema_(store_schema) {} const bool write_timestamps_as_int96_; const bool coerce_timestamps_enabled_; const ::arrow::TimeUnit::type coerce_timestamps_unit_; const bool truncated_timestamps_allowed_; + const bool store_schema_; }; /// \brief State object used for writing Arrow data directly to a Parquet diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc index fa1caa96d31..84150d1a8bf 100644 --- a/cpp/src/parquet/statistics-test.cc +++ b/cpp/src/parquet/statistics-test.cc @@ -62,8 +62,7 @@ static FLBA FLBAFromString(const std::string& s) { } TEST(Comparison, SignedByteArray) { - auto comparator = - TypedComparator::Make(Type::BYTE_ARRAY, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::BYTE_ARRAY, SortOrder::SIGNED); std::string s1 = "12345"; std::string s2 = "12345678"; @@ -82,8 +81,7 @@ TEST(Comparison, SignedByteArray) { TEST(Comparison, UnsignedByteArray) { // Check if UTF-8 is compared using unsigned correctly - auto comparator = - TypedComparator::Make(Type::BYTE_ARRAY, SortOrder::UNSIGNED); + auto comparator = MakeComparator(Type::BYTE_ARRAY, SortOrder::UNSIGNED); std::string s1 = "arrange"; std::string s2 = "arrangement"; @@ -107,8 +105,8 @@ TEST(Comparison, UnsignedByteArray) { TEST(Comparison, SignedFLBA) { int size = 10; - auto comparator = TypedComparator::Make(Type::FIXED_LEN_BYTE_ARRAY, - SortOrder::SIGNED, size); + auto comparator = + MakeComparator(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED, size); std::string s1 = "Anti123456"; std::string s2 = "Bunkd123456"; @@ -125,8 +123,8 @@ TEST(Comparison, SignedFLBA) { TEST(Comparison, UnsignedFLBA) { int size = 10; - auto comparator = TypedComparator::Make(Type::FIXED_LEN_BYTE_ARRAY, - SortOrder::UNSIGNED, size); + auto comparator = + MakeComparator(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::UNSIGNED, size); std::string s1 = "Anti123456"; std::string s2 = "Bunkd123456"; @@ -146,7 +144,7 @@ TEST(Comparison, SignedInt96) { parquet::Int96 aa{{1, 41, 
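A minimal sketch of opting in to the new store_schema option from C++. Only the builder call below is defined by this patch; the surrounding usage is assumed.

    // Embed the serialized Arrow schema in the Parquet file metadata so that
    // readers can reconstruct Arrow types (e.g. dictionary columns) automatically.
    std::shared_ptr<parquet::ArrowWriterProperties> arrow_props =
        parquet::ArrowWriterProperties::Builder().store_schema()->build();
    // arrow_props would then be passed as the ArrowWriterProperties argument of
    // parquet::arrow::WriteTable(...).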
14}}, bb{{1, 41, 14}}; parquet::Int96 aaa{{1, 41, static_cast(-14)}}, bbb{{1, 41, 42}}; - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::SIGNED); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -158,7 +156,7 @@ TEST(Comparison, UnsignedInt96) { parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, static_cast(-14)}}; parquet::Int96 aaa, bbb; - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::UNSIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::UNSIGNED); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(comparator->Compare(aa, bb)); @@ -197,7 +195,7 @@ TEST(Comparison, SignedInt64) { NodePtr node = PrimitiveNode::Make("SignedInt64", Repetition::REQUIRED, Type::INT64); ColumnDescriptor descr(node, 0, 0); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -214,7 +212,7 @@ TEST(Comparison, UnsignedInt64) { ColumnDescriptor descr(node, 0, 0); ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order()); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -231,7 +229,7 @@ TEST(Comparison, UnsignedInt32) { ColumnDescriptor descr(node, 0, 0); ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order()); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -253,7 +251,6 @@ template class TestStatistics : public PrimitiveTypedTest { public: using T = typename TestType::c_type; - using TypedStats = TypedStatistics; std::vector GetDeepCopy( const std::vector&); // allocates new memory for FLBA/ByteArray @@ -264,15 +261,16 @@ class TestStatistics : public PrimitiveTypedTest { void TestMinMaxEncode() { this->GenerateData(1000); - auto statistics1 = TypedStats::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); statistics1->Update(this->values_ptr_, this->values_.size(), 0); std::string encoded_min = statistics1->EncodeMin(); std::string encoded_max = statistics1->EncodeMax(); - auto statistics2 = TypedStats::Make(this->schema_.Column(0), encoded_min, encoded_max, - this->values_.size(), 0, 0, true); + auto statistics2 = + MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); - auto statistics3 = TypedStats::Make(this->schema_.Column(0)); + auto statistics3 = MakeStatistics(this->schema_.Column(0)); std::vector valid_bits( BitUtil::BytesForBits(static_cast(this->values_.size())) + 1, 255); statistics3->UpdateSpaced(this->values_ptr_, valid_bits.data(), 0, @@ -293,7 +291,7 @@ class TestStatistics : public PrimitiveTypedTest { void TestReset() { this->GenerateData(1000); - auto statistics = TypedStats::Make(this->schema_.Column(0)); + auto statistics = MakeStatistics(this->schema_.Column(0)); statistics->Update(this->values_ptr_, this->values_.size(), 0); ASSERT_EQ(this->values_.size(), statistics->num_values()); @@ -308,17 +306,17 @@ class TestStatistics : public PrimitiveTypedTest { int num_null[2]; random_numbers(2, 42, 0, 100, num_null); - auto statistics1 = 
TypedStats::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); this->GenerateData(1000); statistics1->Update(this->values_ptr_, this->values_.size() - num_null[0], num_null[0]); - auto statistics2 = TypedStats::Make(this->schema_.Column(0)); + auto statistics2 = MakeStatistics(this->schema_.Column(0)); this->GenerateData(1000); statistics2->Update(this->values_ptr_, this->values_.size() - num_null[1], num_null[1]); - auto total = TypedStats::Make(this->schema_.Column(0)); + auto total = MakeStatistics(this->schema_.Column(0)); total->Merge(*statistics1); total->Merge(*statistics2); @@ -332,7 +330,7 @@ class TestStatistics : public PrimitiveTypedTest { this->GenerateData(num_values); // compute statistics for the whole batch - auto expected_stats = TypedStats::Make(this->schema_.Column(0)); + auto expected_stats = MakeStatistics(this->schema_.Column(0)); expected_stats->Update(this->values_ptr_, num_values - null_count, null_count); auto sink = CreateOutputStream(); @@ -456,7 +454,7 @@ template <> void TestStatistics::TestMinMaxEncode() { this->GenerateData(1000); // Test that we encode min max strings correctly - auto statistics1 = TypedStatistics::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); statistics1->Update(this->values_ptr_, this->values_.size(), 0); std::string encoded_min = statistics1->EncodeMin(); std::string encoded_max = statistics1->EncodeMax(); @@ -470,8 +468,8 @@ void TestStatistics::TestMinMaxEncode() { statistics1->max().len)); auto statistics2 = - TypedStatistics::Make(this->schema_.Column(0), encoded_min, - encoded_max, this->values_.size(), 0, 0, true); + MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); @@ -833,6 +831,25 @@ TYPED_TEST(TestStatisticsSortOrder, MinMax) { ASSERT_NO_FATAL_FAILURE(this->VerifyParquetStats()); } +TEST(TestByteArrayStatisticsFromArrow, Basics) { + // Part of ARROW-3246. 
Replicating TestStatisticsSortOrder test but via Arrow + + auto values = ArrayFromJSON(::arrow::utf8(), + u8"[\"c123\", \"b123\", \"a123\", null, " + "null, \"f123\", \"g123\", \"h123\", \"i123\", \"ΓΌ123\"]"); + + const auto& typed_values = static_cast(*values); + + NodePtr node = PrimitiveNode::Make("field", Repetition::REQUIRED, Type::BYTE_ARRAY, + ConvertedType::UTF8); + ColumnDescriptor descr(node, 0, 0); + auto stats = MakeStatistics(&descr); + ASSERT_NO_FATAL_FAILURE(stats->Update(*values)); + + ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min()); + ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max()); +} + // Ensure UNKNOWN sort order is handled properly using TestStatisticsSortOrderFLBA = TestStatisticsSortOrder; @@ -873,7 +890,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValues) { } // Test values - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); nan_stats->Update(&values[0], NUM_VALUES, 0); float min = nan_stats->min(); float max = nan_stats->max(); @@ -881,7 +898,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValues) { ASSERT_EQ(max, 3.0f); // Test all NaNs - auto all_nan_stats = TypedStatistics::Make(&descr); + auto all_nan_stats = MakeStatistics(&descr); all_nan_stats->Update(&nan_values[0], NUM_VALUES, 0); min = all_nan_stats->min(); max = all_nan_stats->max(); @@ -925,7 +942,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValuesSpaced) { std::vector valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255); // Test values - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); nan_stats->UpdateSpaced(&values[0], valid_bits.data(), 0, NUM_VALUES, 0); float min = nan_stats->min(); float max = nan_stats->max(); @@ -933,7 +950,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValuesSpaced) { ASSERT_EQ(max, 3.0f); // Test all NaNs - auto all_nan_stats = TypedStatistics::Make(&descr); + auto all_nan_stats = MakeStatistics(&descr); all_nan_stats->UpdateSpaced(&nan_values[0], valid_bits.data(), 0, NUM_VALUES, 0); min = all_nan_stats->min(); max = all_nan_stats->max(); @@ -968,7 +985,7 @@ TEST(TestStatisticsSortOrderDoubleNaN, NaNValues) { NodePtr node = PrimitiveNode::Make("nan_double", Repetition::OPTIONAL, Type::DOUBLE); ColumnDescriptor descr(node, 1, 1); - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); double values[NUM_VALUES] = {std::nan(""), std::nan(""), -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; double* values_ptr = &values[0]; diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 16abc152cf4..4bada835741 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -20,6 +20,9 @@ #include #include +#include "arrow/array.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "parquet/encoding.h" @@ -30,6 +33,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::internal::checked_cast; namespace parquet { @@ -126,14 +130,14 @@ struct CompareHelper { } }; -template -class TypedComparatorImpl : public TypedComparator { +template +class TypedComparatorImpl : virtual public TypedComparator { public: typedef typename DType::c_type T; explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {} - bool CompareInline(const T& a, const T& b) { + bool CompareInline(const T& a, const T& b) const { return CompareHelper::Compare(type_length_, a, b); } @@ -157,9 +161,18 @@ class TypedComparatorImpl : public 
TypedComparator { int64_t valid_bits_offset, T* out_min, T* out_max) override { ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, length); - T min = values[0]; - T max = values[0]; - for (int64_t i = 0; i < length; i++) { + + // Find the first non-null value + int64_t first_non_null = 0; + while (!valid_bits_reader.IsSet()) { + ++first_non_null; + valid_bits_reader.Next(); + } + + T min = values[first_non_null]; + T max = values[first_non_null]; + valid_bits_reader.Next(); + for (int64_t i = first_non_null + 1; i < length; i++) { if (valid_bits_reader.IsSet()) { if (CompareInline(values[i], min)) { min = values[i]; @@ -173,46 +186,114 @@ class TypedComparatorImpl : public TypedComparator { *out_max = max; } + void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) override; + private: int type_length_; }; +template +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + typename DType::c_type* out_min, + typename DType::c_type* out_max) { + ParquetException::NYI(values.type()->ToString()); +} + +template +void GetMinMaxBinaryHelper( + const TypedComparatorImpl& comparator, + const ::arrow::Array& values, ByteArray* out_min, ByteArray* out_max) { + const auto& data = checked_cast(values); + + ByteArray min, max; + if (data.null_count() > 0) { + ::arrow::internal::BitmapReader valid_bits_reader(data.null_bitmap_data(), + data.offset(), data.length()); + + int64_t first_non_null = 0; + while (!valid_bits_reader.IsSet()) { + ++first_non_null; + valid_bits_reader.Next(); + } + min = data.GetView(first_non_null); + max = data.GetView(first_non_null); + for (int64_t i = first_non_null; i < data.length(); i++) { + ByteArray val = data.GetView(i); + if (valid_bits_reader.IsSet()) { + if (comparator.CompareInline(val, min)) { + min = val; + } else if (comparator.CompareInline(max, val)) { + max = val; + } + } + valid_bits_reader.Next(); + } + } else { + min = data.GetView(0); + max = data.GetView(0); + for (int64_t i = 0; i < data.length(); i++) { + ByteArray val = data.GetView(i); + if (comparator.CompareInline(val, min)) { + min = val; + } else if (comparator.CompareInline(max, val)) { + max = val; + } + } + } + *out_min = min; + *out_max = max; +} + +template <> +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + ByteArray* out_min, + ByteArray* out_max) { + GetMinMaxBinaryHelper(*this, values, out_min, out_max); +} + +template <> +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + ByteArray* out_min, + ByteArray* out_max) { + GetMinMaxBinaryHelper(*this, values, out_min, out_max); +} + std::shared_ptr Comparator::Make(Type::type physical_type, SortOrder::type sort_order, int type_length) { if (SortOrder::SIGNED == sort_order) { switch (physical_type) { case Type::BOOLEAN: - return std::make_shared>(); + return std::make_shared>(); case Type::INT32: - return std::make_shared>(); + return std::make_shared>(); case Type::INT64: - return std::make_shared>(); + return std::make_shared>(); case Type::INT96: - return std::make_shared>(); + return std::make_shared>(); case Type::FLOAT: - return std::make_shared>(); + return std::make_shared>(); case Type::DOUBLE: - return std::make_shared>(); + return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared>(); + return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared>(type_length); + return std::make_shared>(type_length); default: ParquetException::NYI("Signed Compare not implemented"); } } else if (SortOrder::UNSIGNED 
== sort_order) { switch (physical_type) { case Type::INT32: - return std::make_shared>(); + return std::make_shared>(); case Type::INT64: - return std::make_shared>(); + return std::make_shared>(); case Type::INT96: - return std::make_shared>(); + return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared>(); + return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared>(type_length); + return std::make_shared>(type_length); default: ParquetException::NYI("Unsigned Compare not implemented"); } @@ -228,6 +309,59 @@ std::shared_ptr Comparator::Make(const ColumnDescriptor* descr) { // ---------------------------------------------------------------------- +template +struct StatsHelper { + bool CanHaveNaN() { return false; } + + inline int64_t GetValueBeginOffset(const T* values, int64_t count) { return 0; } + + inline int64_t GetValueEndOffset(const T* values, int64_t count) { return count; } + + inline bool IsNaN(const T value) { return false; } +}; + +template +struct StatsHelper::value>::type> { + bool CanHaveNaN() { return true; } + + inline int64_t GetValueBeginOffset(const T* values, int64_t count) { + // Skip NaNs + for (int64_t i = 0; i < count; i++) { + if (!std::isnan(values[i])) { + return i; + } + } + return count; + } + + inline int64_t GetValueEndOffset(const T* values, int64_t count) { + // Skip NaNs + for (int64_t i = (count - 1); i >= 0; i--) { + if (!std::isnan(values[i])) { + return (i + 1); + } + } + return 0; + } + + inline bool IsNaN(const T value) { return std::isnan(value); } +}; + +template +void SetNaN(T* value) { + // no-op +} + +template <> +void SetNaN(float* value) { + *value = std::nanf(""); +} + +template <> +void SetNaN(double* value) { + *value = std::nan(""); +} + template class TypedStatisticsImpl : public TypedStatistics { public: @@ -305,6 +439,25 @@ class TypedStatisticsImpl : public TypedStatistics { void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_spaced, int64_t num_not_null, int64_t num_null) override; + void Update(const ::arrow::Array& values) override { + IncrementNullCount(values.null_count()); + IncrementNumValues(values.length() - values.null_count()); + + // TODO: support distinct count? 
+ if (values.null_count() == values.length()) { + return; + } + + StatsHelper helper; + if (helper.CanHaveNaN()) { + ParquetException::NYI("No NaN handling for Arrow arrays yet"); + } + + T batch_min, batch_max; + comparator_->GetMinMax(values, &batch_min, &batch_max); + SetMinMax(batch_min, batch_max); + } + const T& min() const override { return min_; } const T& max() const override { return max_; } @@ -393,55 +546,6 @@ inline void TypedStatisticsImpl::Copy(const ByteArray& src, ByteA *dst = ByteArray(src.len, buffer->data()); } -template -struct StatsHelper { - inline int64_t GetValueBeginOffset(const T* values, int64_t count) { return 0; } - - inline int64_t GetValueEndOffset(const T* values, int64_t count) { return count; } - - inline bool IsNaN(const T value) { return false; } -}; - -template -struct StatsHelper::value>::type> { - inline int64_t GetValueBeginOffset(const T* values, int64_t count) { - // Skip NaNs - for (int64_t i = 0; i < count; i++) { - if (!std::isnan(values[i])) { - return i; - } - } - return count; - } - - inline int64_t GetValueEndOffset(const T* values, int64_t count) { - // Skip NaNs - for (int64_t i = (count - 1); i >= 0; i--) { - if (!std::isnan(values[i])) { - return (i + 1); - } - } - return 0; - } - - inline bool IsNaN(const T value) { return std::isnan(value); } -}; - -template -void SetNaN(T* value) { - // no-op -} - -template <> -void SetNaN(float* value) { - *value = std::nanf(""); -} - -template <> -void SetNaN(double* value) { - *value = std::nan(""); -} - template void TypedStatisticsImpl::Update(const T* values, int64_t num_not_null, int64_t num_null) { @@ -461,7 +565,7 @@ void TypedStatisticsImpl::Update(const T* values, int64_t num_not_null, int64_t end_offset = helper.GetValueEndOffset(values, num_not_null); // All values are NaN - if (end_offset < begin_offset) { + if (helper.CanHaveNaN() && end_offset < begin_offset) { // Set min/max to NaNs in this case. 
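For reference, a small sketch of driving the new Arrow-array statistics update path directly. It mirrors the new statistics-test case and borrows the ArrayFromJSON test helper as an assumed convenience.

    parquet::schema::NodePtr node = parquet::schema::PrimitiveNode::Make(
        "field", parquet::Repetition::REQUIRED, parquet::Type::BYTE_ARRAY);
    parquet::ColumnDescriptor descr(node, 0, 0);
    auto stats = parquet::MakeStatistics<parquet::ByteArrayType>(&descr);

    auto values = ::arrow::ArrayFromJSON(::arrow::utf8(), R"(["b", "a", null, "c"])");
    stats->Update(*values);  // nulls only increment the null count
    // stats->min() == ByteArray("a"), stats->max() == ByteArray("c");
    // stats->EncodeMin() / stats->EncodeMax() then serialize those values.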
// Don't set has_min_max flag since // these values must be over-written by valid stats later @@ -494,26 +598,28 @@ void TypedStatisticsImpl::UpdateSpaced(const T* values, const uint8_t* va // As (num_not_null != 0) there must be one int64_t length = num_null + num_not_null; int64_t i = 0; - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - length); StatsHelper helper; - for (; i < length; i++) { - // PARQUET-1225: Handle NaNs - if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) { - break; + if (helper.CanHaveNaN()) { + ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + length); + for (; i < length; i++) { + // PARQUET-1225: Handle NaNs + if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) { + break; + } + valid_bits_reader.Next(); } - valid_bits_reader.Next(); - } - // All are NaNs and stats are not set yet - if ((i == length) && helper.IsNaN(values[i - 1])) { - // Don't set has_min_max flag since - // these values must be over-written by valid stats later - if (!has_min_max_) { - SetNaN(&min_); - SetNaN(&max_); + // All are NaNs and stats are not set yet + if ((i == length) && helper.IsNaN(values[i - 1])) { + // Don't set has_min_max flag since + // these values must be over-written by valid stats later + if (!has_min_max_) { + SetNaN(&min_); + SetNaN(&max_); + } + return; } - return; } // Find min and max values from remaining non-NaN values diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 402b3c38923..30d58aafd8d 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -26,6 +26,13 @@ #include "parquet/platform.h" #include "parquet/types.h" +namespace arrow { + +class Array; +class BinaryArray; + +} // namespace arrow + namespace parquet { class ColumnDescriptor; @@ -63,19 +70,6 @@ class TypedComparator : public Comparator { public: using T = typename DType::c_type; - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(Type::type physical_type, - SortOrder::type sort_order, - int type_length = -1) { - return std::static_pointer_cast>( - Comparator::Make(physical_type, sort_order, type_length)); - } - - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(const ColumnDescriptor* descr) { - return std::static_pointer_cast>(Comparator::Make(descr)); - } - /// \brief Scalar comparison of two elements, return true if first /// is strictly less than the second virtual bool Compare(const T& a, const T& b) = 0; @@ -84,6 +78,11 @@ class TypedComparator : public Comparator { /// elements without any nulls virtual void GetMinMax(const T* values, int64_t length, T* out_min, T* out_max) = 0; + /// \brief Compute minimum and maximum elements from an Arrow array. 
Only + /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY + /// / arrow::BinaryArray + virtual void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) = 0; + /// \brief Compute maximum and minimum elements in a batch of /// elements with accompanying bitmap indicating which elements are /// included (bit set) and excluded (bit not set) @@ -100,6 +99,21 @@ class TypedComparator : public Comparator { int64_t valid_bits_offset, T* out_min, T* out_max) = 0; }; +/// \brief Typed version of Comparator::Make +template +std::shared_ptr> MakeComparator(Type::type physical_type, + SortOrder::type sort_order, + int type_length = -1) { + return std::static_pointer_cast>( + Comparator::Make(physical_type, sort_order, type_length)); +} + +/// \brief Typed version of Comparator::Make +template +std::shared_ptr> MakeComparator(const ColumnDescriptor* descr) { + return std::static_pointer_cast>(Comparator::Make(descr)); +} + // ---------------------------------------------------------------------- /// \brief Structure represented encoded statistics to be written to @@ -137,33 +151,33 @@ class PARQUET_EXPORT EncodedStatistics { } } - inline bool is_set() const { + bool is_set() const { return has_min || has_max || has_null_count || has_distinct_count; } - inline bool is_signed() const { return is_signed_; } + bool is_signed() const { return is_signed_; } - inline void set_is_signed(bool is_signed) { is_signed_ = is_signed; } + void set_is_signed(bool is_signed) { is_signed_ = is_signed; } - inline EncodedStatistics& set_max(const std::string& value) { + EncodedStatistics& set_max(const std::string& value) { *max_ = value; has_max = true; return *this; } - inline EncodedStatistics& set_min(const std::string& value) { + EncodedStatistics& set_min(const std::string& value) { *min_ = value; has_min = true; return *this; } - inline EncodedStatistics& set_null_count(int64_t value) { + EncodedStatistics& set_null_count(int64_t value) { null_count = value; has_null_count = true; return *this; } - inline EncodedStatistics& set_distinct_count(int64_t value) { + EncodedStatistics& set_distinct_count(int64_t value) { distinct_count = value; has_distinct_count = true; return *this; @@ -242,39 +256,6 @@ class TypedStatistics : public Statistics { public: using T = typename DType::c_type; - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, pool)); - } - - /// \brief Create Statistics initialized to a particular state - /// \param[in] min the minimum value - /// \param[in] max the minimum value - /// \param[in] num_values number of values - /// \param[in] null_count number of null values - /// \param[in] distinct_count number of distinct values - static std::shared_ptr> Make(const T& min, const T& max, - int64_t num_values, - int64_t null_count, - int64_t distinct_count) { - return std::static_pointer_cast>(Statistics::Make( - DType::type_num, &min, &max, num_values, null_count, distinct_count)); - } - - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, encoded_min, 
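A short sketch of the relocated comparator factory together with the new Arrow-array GetMinMax overload (again leaning on the ArrayFromJSON test helper as an assumed convenience):

    auto comparator = parquet::MakeComparator<parquet::ByteArrayType>(
        parquet::Type::BYTE_ARRAY, parquet::SortOrder::UNSIGNED);

    auto values = ::arrow::ArrayFromJSON(::arrow::utf8(), R"(["b", "a", "c"])");
    parquet::ByteArray min, max;
    comparator->GetMinMax(*values, &min, &max);
    // min == ByteArray("a"), max == ByteArray("c"); for physical types other
    // than BYTE_ARRAY this overload currently raises ParquetException::NYI.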
encoded_max, num_values, null_count, - distinct_count, has_min_max, pool)); - } - /// \brief The current minimum value virtual const T& min() const = 0; @@ -289,18 +270,19 @@ class TypedStatistics : public Statistics { /// \brief Batch statistics update with supplied validity bitmap virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits, - int64_t valid_bits_spaced, int64_t num_not_null, + int64_t valid_bits_offset, int64_t num_not_null, int64_t num_null) = 0; + /// \brief EXPERIMENTAL: Update statistics with an Arrow array without + /// conversion to a primitive Parquet C type. Only implemented for certain + /// Parquet type / Arrow type combinations like BYTE_ARRAY / + /// arrow::BinaryArray + virtual void Update(const ::arrow::Array& values) = 0; + /// \brief Set min and max values to particular values virtual void SetMinMax(const T& min, const T& max) = 0; }; -#ifndef ARROW_NO_DEPRECATED_API -// TODO(wesm): Remove after Arrow 0.14.0 -using RowGroupStatistics = Statistics; -#endif - using BoolStatistics = TypedStatistics; using Int32Statistics = TypedStatistics; using Int64Statistics = TypedStatistics; @@ -309,4 +291,40 @@ using DoubleStatistics = TypedStatistics; using ByteArrayStatistics = TypedStatistics; using FLBAStatistics = TypedStatistics; +/// \brief Typed version of Statistics::Make +template +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>(Statistics::Make(descr, pool)); +} + +/// \brief Create Statistics initialized to a particular state +/// \param[in] min the minimum value +/// \param[in] max the minimum value +/// \param[in] num_values number of values +/// \param[in] null_count number of null values +/// \param[in] distinct_count number of distinct values +template +std::shared_ptr> MakeStatistics(const typename DType::c_type& min, + const typename DType::c_type& max, + int64_t num_values, + int64_t null_count, + int64_t distinct_count) { + return std::static_pointer_cast>(Statistics::Make( + DType::type_num, &min, &max, num_values, null_count, distinct_count)); +} + +/// \brief Typed version of Statistics::Make +template +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>( + Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, has_min_max, pool)); +} + } // namespace parquet diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index bc456ea24a8..30395f37ec4 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -493,6 +493,10 @@ class ColumnOrder { struct ByteArray { ByteArray() : len(0), ptr(NULLPTR) {} ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + + ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion + : ByteArray(static_cast(view.size()), + reinterpret_cast(view.data())) {} uint32_t len; const uint8_t* ptr; }; diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 82ca9fbb33e..97e73cb6468 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -365,6 +365,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* coerce_timestamps(TimeUnit unit) Builder* allow_truncated_timestamps() Builder* 
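The implicit ByteArray constructor added in types.h lets string_view-producing APIs such as BinaryArray::GetView interoperate with statistics and comparators without manual len/ptr bookkeeping; a tiny sketch (binary_array and stats are hypothetical names):

    ::arrow::util::string_view view = "a123";
    parquet::ByteArray ba(view);  // len and ptr are taken from the view
    // e.g. in tests: ASSERT_EQ(ByteArray(binary_array.GetView(i)), stats->min());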
disallow_truncated_timestamps() + Builder* store_schema() shared_ptr[ArrowWriterProperties] build() c_bool support_deprecated_int96_timestamps() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index d51e7fb3c6c..82da6b572d8 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1226,6 +1226,11 @@ cdef class ParquetWriter: properties = properties_builder.build() cdef ArrowWriterProperties.Builder arrow_properties_builder + + # Store the original Arrow schema so things like dictionary types can + # be automatically reconstructed + arrow_properties_builder.store_schema() + self._set_int96_support(&arrow_properties_builder) self._set_coerce_timestamps(&arrow_properties_builder) self._set_allow_truncated_timestamps(&arrow_properties_builder) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b143dd4bada..fc620c1eea7 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -159,10 +159,10 @@ def alltypes_sample(size=10000, seed=0, categorical=False): @pytest.mark.pandas @pytest.mark.parametrize('chunk_size', [None, 1000]) -def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size): +def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): df = alltypes_sample(size=10000, categorical=True) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert arrow_table.schema.pandas_metadata is not None @@ -173,7 +173,7 @@ def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size): assert arrow_table.schema.metadata == table_read.schema.metadata - df_read = table_read.to_pandas(categories=['str_category']) + df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read, check_categorical=False) @@ -297,7 +297,7 @@ def test_datetime_timezone_tzinfo(): def test_pandas_parquet_custom_metadata(tempdir): df = alltypes_sample(size=10000) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata @@ -321,7 +321,7 @@ def test_pandas_parquet_column_multiindex(tempdir): names=['level_1', 'level_2'] ) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert arrow_table.schema.pandas_metadata is not None @@ -333,10 +333,10 @@ def test_pandas_parquet_column_multiindex(tempdir): @pytest.mark.pandas -def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir): +def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): df = alltypes_sample(size=10000) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, preserve_index=False) js = arrow_table.schema.pandas_metadata assert not js['index_columns'] @@ -357,7 +357,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir): @pytest.mark.pandas -def test_pandas_parquet_1_0_rountrip(tempdir): +def test_pandas_parquet_1_0_roundtrip(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -376,7 +376,7 @@ def test_pandas_parquet_1_0_rountrip(tempdir): 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], 'empty_str': [''] * size }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, 
filename, version='1.0') table_read = _read_table(filename) @@ -415,7 +415,7 @@ def test_pandas_column_selection(tempdir): 'uint8': np.arange(size, dtype=np.uint8), 'uint16': np.arange(size, dtype=np.uint16) }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename) table_read = _read_table(filename, columns=['uint8']) @@ -567,7 +567,7 @@ def test_pandas_parquet_configuration_options(tempdir): 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0 }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) for use_dictionary in [True, False]: @@ -883,7 +883,7 @@ def test_validate_schema_write_table(tempdir): def test_column_of_arrays(tempdir): df, schema = dataframe_with_arrays() - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms') table_read = _read_table(filename) @@ -914,7 +914,7 @@ def test_coerce_timestamps(tempdir): df = pd.DataFrame(arrays) schema = pa.schema(fields) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version="2.0", coerce_timestamps='us') @@ -967,7 +967,7 @@ def test_coerce_timestamps_truncated(tempdir): def test_column_of_lists(tempdir): df, schema = dataframe_with_lists(parquet_compatible=True) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version='2.0') table_read = _read_table(filename) @@ -1888,8 +1888,12 @@ def test_read_schema(tempdir): table = pa.Table.from_pandas(df) _write_table(table, data_path) - assert table.schema.equals(pq.read_schema(data_path)) - assert table.schema.equals(pq.read_schema(data_path, memory_map=True)) + read1 = pq.read_schema(data_path) + read2 = pq.read_schema(data_path, memory_map=True) + assert table.schema.equals(read1, check_metadata=False) + assert table.schema.equals(read2, check_metadata=False) + + assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas'] def _filter_partition(df, part_keys): @@ -2981,6 +2985,36 @@ def test_parquet_file_too_small(tempdir): pq.read_table(path) +def test_dictionary_array_automatically_read(): + # ARROW-3246 + + # Make a large dictionary, a little over 4MB of data + dict_length = 4000 + dict_values = pa.array([('x' * 1000 + '_{}'.format(i)) + for i in range(dict_length)]) + + num_chunks = 10 + chunk_size = 100 + chunks = [] + for i in range(num_chunks): + indices = np.random.randint(0, dict_length, + size=chunk_size).astype(np.int32) + chunks.append(pa.DictionaryArray.from_arrays(pa.array(indices), + dict_values)) + + table = pa.table([pa.chunked_array(chunks)], names=['f0']) + + bio = pa.BufferOutputStream() + pq.write_table(table, bio) + contents = bio.getvalue() + result = pq.read_table(pa.BufferReader(contents)) + + assert result.equals(table) + + # The only key in the metadata was the Arrow schema key + assert result.schema.metadata is None + + @pytest.mark.pandas def test_multi_dataset_metadata(tempdir): filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]