From 21347cb1b4bd068fd5ecb9a2297c74bbff68abc6 Mon Sep 17 00:00:00 2001 From: Aliaksei Sandryhaila Date: Thu, 24 Sep 2015 06:34:55 -0700 Subject: [PATCH] Fixed ORC-28: logical (top-level) column selection now is correctly matched to physical column selection. --- c++/src/Reader.cc | 50 ++++++++++------------------ tools/test/TestReader.cc | 70 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 33 deletions(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index cb0647dd00..c22996b9e7 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -895,8 +895,7 @@ namespace orc { proto::StripeFooter getStripeFooter(const proto::StripeInformation& info); void startNextStripe(); void checkOrcVersion(); - void selectTypeParent(size_t columnId); - void selectTypeChildren(size_t columnId); + void selectType(const Type& type); void readMetadata() const; std::unique_ptr createRowBatch(const Type& type, uint64_t capacity @@ -1069,9 +1068,22 @@ namespace orc { const std::list& included = options.getInclude(); for(std::list::const_iterator columnId = included.begin(); columnId != included.end(); ++columnId) { - if (*columnId <= static_cast(schema->getSubtypeCount())) { - selectTypeParent(static_cast(*columnId)); - selectTypeChildren(static_cast(*columnId)); + if (*columnId == 0) { + selectType(*(schema.get())); + } else if (*columnId <= static_cast(schema->getSubtypeCount())) { + selectType(schema->getSubtype(*columnId-1)); + } + } + if (included.size() > 0) { + selectedColumns[0] = true; + } + } + + void ReaderImpl::selectType(const Type& type) { + if (!selectedColumns[type.getColumnId()]) { + selectedColumns[type.getColumnId()] = true; + for (uint64_t i=0; i < type.getSubtypeCount(); i++) { + selectType(type.getSubtype(i)); } } } @@ -1184,34 +1196,6 @@ namespace orc { return false; } - void ReaderImpl::selectTypeParent(size_t columnId) { - for(size_t parent=0; parent < columnId; ++parent) { - const proto::Type& parentType = footer->types(static_cast(parent)); - for(int idx=0; idx < parentType.subtypes_size(); ++idx) { - uint64_t child = parentType.subtypes(idx); - if (child == columnId) { - if (!selectedColumns[parent]) { - selectedColumns[parent] = true; - selectTypeParent(parent); - return; - } - } - } - } - } - - void ReaderImpl::selectTypeChildren(size_t columnId) { - if (!selectedColumns[columnId]) { - selectedColumns[columnId] = true; - const proto::Type& parentType = - footer->types(static_cast(columnId)); - for(int idx=0; idx < parentType.subtypes_size(); ++idx) { - uint64_t child = parentType.subtypes(idx); - selectTypeChildren(child); - } - } - } - const std::vector ReaderImpl::getSelectedColumns() const { return selectedColumns; } diff --git a/tools/test/TestReader.cc b/tools/test/TestReader.cc index ebc326fcca..92fa10ae62 100644 --- a/tools/test/TestReader.cc +++ b/tools/test/TestReader.cc @@ -907,6 +907,76 @@ TEST(Reader, futureFormatVersion) { EXPECT_EQ("19.99", reader->getFormatVersion()); } +TEST(Reader, selectColumns) { + orc::ReaderOptions opts; + std::ostringstream filename; + filename << exampleDirectory << "/TestOrcFile.testSeek.orc"; + std::list cols; + + // All columns + cols.push_back(0); + opts.include(cols); + std::unique_ptr reader = + orc::createReader(orc::readLocalFile(filename.str()), opts); + std::vector c = reader->getSelectedColumns(); + EXPECT_EQ(24, c.size()); + for (unsigned int i=0; i < c.size(); i++) { + EXPECT_TRUE(c[i]); + } + + // Int column #2 + cols.clear(); + cols.push_back(2); + opts.include(cols); + reader = orc::createReader(orc::readLocalFile(filename.str()), opts); + c = reader->getSelectedColumns(); + for (unsigned int i=1; i < c.size(); i++) { + if (i==2) + EXPECT_TRUE(c[i]); + else + EXPECT_TRUE(!c[i]); + } + + // Struct column #10 + cols.clear(); + cols.push_back(10); + opts.include(cols); + reader = orc::createReader(orc::readLocalFile(filename.str()), opts); + c = reader->getSelectedColumns(); + for (unsigned int i=1; i < c.size(); i++) { + if (i>=10 && i<=14) + EXPECT_TRUE(c[i]); + else + EXPECT_TRUE(!c[i]); + } + + // Array column #11 + cols.clear(); + cols.push_back(11); + opts.include(cols); + reader = orc::createReader(orc::readLocalFile(filename.str()), opts); + c = reader->getSelectedColumns(); + for (unsigned int i=1; i < c.size(); i++) { + if (i>=15 && i<=18) + EXPECT_TRUE(c[i]); + else + EXPECT_TRUE(!c[i]); + } + + // Map column #12 + cols.clear(); + cols.push_back(12); + opts.include(cols); + reader = orc::createReader(orc::readLocalFile(filename.str()), opts); + c = reader->getSelectedColumns(); + for (unsigned int i=1; i < c.size(); i++) { + if (i>=19 && i<=23) + EXPECT_TRUE(c[i]); + else + EXPECT_TRUE(!c[i]); + } +} + std::map makeMetadata() { std::map result; result["my.meta"] = "\x01\x02\x03\x04\x05\x06\x07\xff\xfe\x7f\x80";