From 2a95b6794aabca977064e7b7f1969d9ab0c54dc7 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Sun, 8 May 2016 00:36:37 -0400 Subject: [PATCH 1/3] Implemented leaf_to_base --- src/parquet/column/CMakeLists.txt | 1 + src/parquet/schema/descriptor.cc | 18 +++++++++++++++--- src/parquet/schema/descriptor.h | 9 +++++++-- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/parquet/column/CMakeLists.txt b/src/parquet/column/CMakeLists.txt index 4c50c0a7..d64be6c9 100644 --- a/src/parquet/column/CMakeLists.txt +++ b/src/parquet/column/CMakeLists.txt @@ -19,6 +19,7 @@ install(FILES page.h levels.h + properties.h reader.h scanner.h writer.h diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc index 01f04212..f34e2c5b 100644 --- a/src/parquet/schema/descriptor.cc +++ b/src/parquet/schema/descriptor.cc @@ -42,12 +42,13 @@ void SchemaDescriptor::Init(const NodePtr& schema) { leaves_.clear(); for (int i = 0; i < group_->field_count(); ++i) { - BuildTree(group_->field(i), 0, 0); + BuildTree(group_->field(i), 0, 0, group_->field(i)); } } void SchemaDescriptor::BuildTree( - const NodePtr& node, int16_t max_def_level, int16_t max_rep_level) { + const NodePtr& node, int16_t max_def_level, int16_t max_rep_level, + const NodePtr& base) { if (node->is_optional()) { ++max_def_level; } else if (node->is_repeated()) { @@ -61,11 +62,12 @@ void SchemaDescriptor::BuildTree( if (node->is_group()) { const GroupNode* group = static_cast(node.get()); for (int i = 0; i < group->field_count(); ++i) { - BuildTree(group->field(i), max_def_level, max_rep_level); + BuildTree(group->field(i), max_def_level, max_rep_level, base); } } else { // Primitive node, append to leaves leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this)); + leaf_to_base_.emplace(leaves_.size() - 1, base); } } @@ -81,9 +83,19 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, } const ColumnDescriptor* SchemaDescriptor::Column(int i) const { + if (i < 0 || i >= static_cast(leaves_.size())) { + throw ParquetException("Invalid column id " + i); + } return &leaves_[i]; } +const schema::NodePtr& SchemaDescriptor::base(int i) const { + if (i < 0 || i >= static_cast(leaves_.size())) { + throw ParquetException("Invalid column id " + i); + } + return leaf_to_base_.find(i)->second; +} + int ColumnDescriptor::type_scale() const { return primitive_node_->decimal_metadata().scale; } diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h index 7c04e59a..54bc61b4 100644 --- a/src/parquet/schema/descriptor.h +++ b/src/parquet/schema/descriptor.h @@ -101,6 +101,10 @@ class SchemaDescriptor { const schema::NodePtr& schema() const { return schema_; } + const schema::GroupNode* group() const { return group_; } + + const schema::NodePtr& base(int i) const; + private: friend class ColumnDescriptor; @@ -108,7 +112,8 @@ class SchemaDescriptor { const schema::GroupNode* group_; void BuildTree( - const schema::NodePtr& node, int16_t max_def_level, int16_t max_rep_level); + const schema::NodePtr& node, int16_t max_def_level, int16_t max_rep_level, + const schema::NodePtr& base); // Result of leaf node / tree analysis std::vector leaves_; @@ -122,7 +127,7 @@ class SchemaDescriptor { // -- -- b | // -- -- -- c | // -- -- -- -- d - std::unordered_map leaf_to_base_; + std::unordered_map leaf_to_base_; }; } // namespace parquet From d80352ff705163859601c60ab1f6b3e5aed67207 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Sun, 8 May 2016 10:44:09 -0400 Subject: [PATCH 2/3] added tests --- src/parquet/schema/descriptor.cc | 9 ++++----- src/parquet/schema/descriptor.h | 5 ++--- src/parquet/schema/schema-descriptor-test.cc | 14 +++++++++++++- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc index f34e2c5b..68c0ecbb 100644 --- a/src/parquet/schema/descriptor.cc +++ b/src/parquet/schema/descriptor.cc @@ -46,9 +46,8 @@ void SchemaDescriptor::Init(const NodePtr& schema) { } } -void SchemaDescriptor::BuildTree( - const NodePtr& node, int16_t max_def_level, int16_t max_rep_level, - const NodePtr& base) { +void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, + int16_t max_rep_level, const NodePtr& base) { if (node->is_optional()) { ++max_def_level; } else if (node->is_repeated()) { @@ -84,14 +83,14 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, const ColumnDescriptor* SchemaDescriptor::Column(int i) const { if (i < 0 || i >= static_cast(leaves_.size())) { - throw ParquetException("Invalid column id " + i); + throw ParquetException("Invalid column id " + std::to_string(i)); } return &leaves_[i]; } const schema::NodePtr& SchemaDescriptor::base(int i) const { if (i < 0 || i >= static_cast(leaves_.size())) { - throw ParquetException("Invalid column id " + i); + throw ParquetException("Invalid column id " + std::to_string(i)); } return leaf_to_base_.find(i)->second; } diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h index 54bc61b4..a5609d3a 100644 --- a/src/parquet/schema/descriptor.h +++ b/src/parquet/schema/descriptor.h @@ -111,9 +111,8 @@ class SchemaDescriptor { schema::NodePtr schema_; const schema::GroupNode* group_; - void BuildTree( - const schema::NodePtr& node, int16_t max_def_level, int16_t max_rep_level, - const schema::NodePtr& base); + void BuildTree(const schema::NodePtr& node, int16_t max_def_level, + int16_t max_rep_level, const schema::NodePtr& base); // Result of leaf node / tree analysis std::vector leaves_; diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc index dd552be8..0efd99dd 100644 --- a/src/parquet/schema/schema-descriptor-test.cc +++ b/src/parquet/schema/schema-descriptor-test.cc @@ -75,7 +75,8 @@ TEST_F(TestSchemaDescriptor, BuildTree) { NodeVector fields; NodePtr schema; - fields.push_back(Int32("a", Repetition::REQUIRED)); + NodePtr inta = Int32("a", Repetition::REQUIRED); + fields.push_back(inta); fields.push_back(Int64("b", Repetition::OPTIONAL)); fields.push_back(ByteArray("c", Repetition::REPEATED)); @@ -121,6 +122,17 @@ TEST_F(TestSchemaDescriptor, BuildTree) { ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1"); ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2"); ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3"); + ASSERT_THROW(descr_.Column(-1), ParquetException); + ASSERT_THROW(descr_.Column(6), ParquetException); + + ASSERT_EQ(inta.get(), descr_.base(0).get()); + ASSERT_EQ(bag.get(), descr_.base(3).get()); + ASSERT_EQ(bag.get(), descr_.base(4).get()); + ASSERT_EQ(bag.get(), descr_.base(5).get()); + ASSERT_THROW(descr_.base(-1), ParquetException); + ASSERT_THROW(descr_.base(6), ParquetException); + + ASSERT_EQ(schema.get(), descr_.group()); // Init clears the leaves descr_.Init(schema); From 9ded3683e83cb3a129ea69852b674d741961caa4 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Mon, 9 May 2016 00:03:09 -0400 Subject: [PATCH 3/3] review comments --- src/parquet/column/properties.h | 1 - src/parquet/schema/descriptor.cc | 11 ++++------- src/parquet/schema/descriptor.h | 3 ++- src/parquet/schema/schema-descriptor-test.cc | 14 +++++--------- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/parquet/column/properties.h b/src/parquet/column/properties.h index 40d04c3f..132b1a6e 100644 --- a/src/parquet/column/properties.h +++ b/src/parquet/column/properties.h @@ -23,7 +23,6 @@ #include "parquet/util/input.h" #include "parquet/util/mem-allocator.h" -#include "parquet/types.h" namespace parquet { diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc index 68c0ecbb..de63e5e7 100644 --- a/src/parquet/schema/descriptor.cc +++ b/src/parquet/schema/descriptor.cc @@ -18,6 +18,7 @@ #include "parquet/schema/descriptor.h" #include "parquet/exception.h" +#include "parquet/util/logging.h" namespace parquet { @@ -82,16 +83,12 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, } const ColumnDescriptor* SchemaDescriptor::Column(int i) const { - if (i < 0 || i >= static_cast(leaves_.size())) { - throw ParquetException("Invalid column id " + std::to_string(i)); - } + DCHECK(i >= 0 && i < static_cast(leaves_.size())); return &leaves_[i]; } -const schema::NodePtr& SchemaDescriptor::base(int i) const { - if (i < 0 || i >= static_cast(leaves_.size())) { - throw ParquetException("Invalid column id " + std::to_string(i)); - } +const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const { + DCHECK(i >= 0 && i < static_cast(leaves_.size())); return leaf_to_base_.find(i)->second; } diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h index a5609d3a..eb6eac66 100644 --- a/src/parquet/schema/descriptor.h +++ b/src/parquet/schema/descriptor.h @@ -103,7 +103,8 @@ class SchemaDescriptor { const schema::GroupNode* group() const { return group_; } - const schema::NodePtr& base(int i) const; + // Returns the root (child of the schema root) node of the leaf(column) node + const schema::NodePtr& GetColumnRoot(int i) const; private: friend class ColumnDescriptor; diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc index 0efd99dd..d88cd0d9 100644 --- a/src/parquet/schema/schema-descriptor-test.cc +++ b/src/parquet/schema/schema-descriptor-test.cc @@ -122,15 +122,11 @@ TEST_F(TestSchemaDescriptor, BuildTree) { ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1"); ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2"); ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3"); - ASSERT_THROW(descr_.Column(-1), ParquetException); - ASSERT_THROW(descr_.Column(6), ParquetException); - - ASSERT_EQ(inta.get(), descr_.base(0).get()); - ASSERT_EQ(bag.get(), descr_.base(3).get()); - ASSERT_EQ(bag.get(), descr_.base(4).get()); - ASSERT_EQ(bag.get(), descr_.base(5).get()); - ASSERT_THROW(descr_.base(-1), ParquetException); - ASSERT_THROW(descr_.base(6), ParquetException); + + ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0).get()); + ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3).get()); + ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4).get()); + ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5).get()); ASSERT_EQ(schema.get(), descr_.group());