From 0b429faef74703f63530e309234f461aa2444ce7 Mon Sep 17 00:00:00 2001 From: Aliaksei Sandryhaila Date: Mon, 12 Oct 2015 09:33:21 -0400 Subject: [PATCH 1/6] Work in progress. --- c++/include/orc/ColumnPrinter.hh | 2 +- c++/src/ColumnPrinter.cc | 33 +++++++++++++++++++++----------- tools/src/FileContents.cc | 31 +++++++++++++++++++++++++++--- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/c++/include/orc/ColumnPrinter.hh b/c++/include/orc/ColumnPrinter.hh index 17c1901b32..c2d8ce8fdd 100644 --- a/c++/include/orc/ColumnPrinter.hh +++ b/c++/include/orc/ColumnPrinter.hh @@ -47,6 +47,6 @@ namespace orc { }; ORC_UNIQUE_PTR createColumnPrinter(std::string&, - const Type& type); + const Type& type, const std::vector* selectedColumns = nullptr); } #endif diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc index aa90be61d1..0185f9e0e0 100644 --- a/c++/src/ColumnPrinter.cc +++ b/c++/src/ColumnPrinter.cc @@ -173,8 +173,10 @@ namespace orc { class StructColumnPrinter: public ColumnPrinter { private: std::vector fieldPrinter; + std::vector fieldNames; public: - StructColumnPrinter(std::string&, const Type& type); + StructColumnPrinter(std::string&, const Type& type, + const std::vector* selectedColumns); virtual ~StructColumnPrinter(); void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -209,9 +211,11 @@ namespace orc { } } - std::unique_ptr createColumnPrinter(std::string& buffer, - const Type& type) { - ColumnPrinter *result; + std::unique_ptr createColumnPrinter( + std::string& buffer, + const Type& type, + const std::vector* selectedColumns) { + ColumnPrinter *result = nullptr; switch(static_cast(type.getKind())) { case BOOLEAN: result = new BooleanColumnPrinter(buffer, type); @@ -252,7 +256,7 @@ namespace orc { break; case STRUCT: - result = new StructColumnPrinter(buffer, type); + result = new StructColumnPrinter(buffer, type, selectedColumns); break; case DECIMAL: @@ -558,12 +562,19 @@ namespace orc { } } - StructColumnPrinter::StructColumnPrinter(std::string& buffer, - const Type& type - ): ColumnPrinter(buffer, type) { + StructColumnPrinter::StructColumnPrinter( + std::string& buffer, + const Type& type, + const std::vector* selectedColumns + ): ColumnPrinter(buffer, type) { for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)) - .release()); + if (selectedColumns==nullptr || selectedColumns->at(type.getSubtype(i).getColumnId())) { + std::cout << "SELECTED COLUMN " << i << "(" << type.getFieldName(i) + << ") with columnId " << type.getSubtype(i).getColumnId() << std::endl; + fieldNames.push_back(type.getFieldName(i)); + fieldPrinter.push_back(createColumnPrinter(buffer, + type.getSubtype(i)).release()); + } } } @@ -592,7 +603,7 @@ namespace orc { writeString(buffer, ", "); } writeChar(buffer, '"'); - writeString(buffer, type.getFieldName(i).c_str()); + writeString(buffer, fieldNames[i].c_str()); writeString(buffer, "\": "); fieldPrinter[i]->printRow(rowId); } diff --git a/tools/src/FileContents.cc b/tools/src/FileContents.cc index 694fea3ba9..f0a1bf1b3c 100644 --- a/tools/src/FileContents.cc +++ b/tools/src/FileContents.cc @@ -31,8 +31,9 @@ void printContents(const char* filename, const orc::ReaderOptions opts) { std::unique_ptr batch = reader->createRowBatch(1000); std::string line; + const std::vector selectedColumns = reader->getSelectedColumns(); std::unique_ptr printer = - createColumnPrinter(line, reader->getType()); + createColumnPrinter(line, reader->getType(), &selectedColumns); while (reader->next(*batch)) { printer->reset(*batch); @@ -48,11 +49,35 @@ void printContents(const char* filename, const orc::ReaderOptions opts) { int main(int argc, char* argv[]) { if (argc < 2) { - std::cout << "Usage: file-contents \n"; - return 1; + if (argc < 2) { + std::cout << "Usage: file-contents " + << "[--columns=column1,column2,...]\n" ; + return 1; + } + } try { + const std::string COLUMNS_PREFIX = "--columns="; + std::list cols; + + // Read command-line options + char* param ; + char* value ; + for (int i = 2; i < argc; i++) { + if ( (param = std::strstr(argv[i], COLUMNS_PREFIX.c_str())) ) { + value = std::strtok(param+COLUMNS_PREFIX.length(), "," ); + while (value) { + cols.push_back(std::atoi(value)); + value = std::strtok(nullptr, "," ); + } + } else { + std::cout << "Unknown option " << argv[i] << "\n" ; + } + } orc::ReaderOptions opts; + if (cols.size() > 0) { + opts.include(cols); + } printContents(argv[1], opts); } catch (std::exception& ex) { std::cerr << "Caught exception: " << ex.what() << "\n"; From 9e7f280cf64ed1434332504341d88048f6925e98 Mon Sep 17 00:00:00 2001 From: Aliaksei Sandryhaila Date: Mon, 12 Oct 2015 08:04:44 -0700 Subject: [PATCH 2/6] Fixed ORC-29: Enable ColumnPrinter to print only specified columns. --- c++/src/ColumnPrinter.cc | 2 -- tools/test/TestReader.cc | 60 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc index 0185f9e0e0..5d1703235f 100644 --- a/c++/src/ColumnPrinter.cc +++ b/c++/src/ColumnPrinter.cc @@ -569,8 +569,6 @@ namespace orc { ): ColumnPrinter(buffer, type) { for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { if (selectedColumns==nullptr || selectedColumns->at(type.getSubtype(i).getColumnId())) { - std::cout << "SELECTED COLUMN " << i << "(" << type.getFieldName(i) - << ") with columnId " << type.getSubtype(i).getColumnId() << std::endl; fieldNames.push_back(type.getFieldName(i)); fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)).release()); diff --git a/tools/test/TestReader.cc b/tools/test/TestReader.cc index 92fa10ae62..773e6dfcd7 100644 --- a/tools/test/TestReader.cc +++ b/tools/test/TestReader.cc @@ -923,6 +923,26 @@ TEST(Reader, selectColumns) { for (unsigned int i=0; i < c.size(); i++) { EXPECT_TRUE(c[i]); } + std::unique_ptr batch = reader->createRowBatch(1); + std::string line; + std::unique_ptr printer = + createColumnPrinter(line, reader->getType(), &c); + reader->next(*batch); + printer->reset(*batch); + printer->printRow(0); + std::ostringstream expected; + expected << "{\"boolean1\": true, \"byte1\": -76, " + << "\"short1\": 21684, \"int1\": -941468492, " + << "\"long1\": -6863419716327549772, \"float1\": 0.7762409, " + << "\"double1\": 0.77624090391187, \"bytes1\": [123, 108, 207, 27, 93, " + << "157, 139, 233, 181, 90, 14, 60, 34, 120, 26, 119, 231, 50, 155, 121], " + << "\"string1\": \"887336a7\", \"middle\": {\"list\": [{\"int1\": " + << "-941468492, \"string1\": \"887336a7\"}, {\"int1\": -1598014431, " + << "\"string1\": \"ba419d35-x\"}]}, \"list\": [], \"map\": [{\"key\": " + << "\"ba419d35-x\", \"value\": {\"int1\": -1598014431, \"string1\": " + << "\"ba419d35-x\"}}, {\"key\": \"887336a7\", \"value\": {\"int1\": " + << "-941468492, \"string1\": \"887336a7\"}}]}"; + EXPECT_EQ(expected.str(), line); // Int column #2 cols.clear(); @@ -936,6 +956,15 @@ TEST(Reader, selectColumns) { else EXPECT_TRUE(!c[i]); } + batch = reader->createRowBatch(1); + line.clear(); + printer = createColumnPrinter(line, reader->getType(), &c); + reader->next(*batch); + printer->reset(*batch); + printer->printRow(0); + std::string expectedInt("{\"byte1\": -76}"); + EXPECT_EQ(expectedInt, line); + // Struct column #10 cols.clear(); @@ -949,6 +978,17 @@ TEST(Reader, selectColumns) { else EXPECT_TRUE(!c[i]); } + batch = reader->createRowBatch(1); + line.clear(); + printer = createColumnPrinter(line, reader->getType(), &c); + reader->next(*batch); + printer->reset(*batch); + printer->printRow(0); + std::ostringstream expectedStruct; + expectedStruct << "{\"middle\": {\"list\": " + << "[{\"int1\": -941468492, \"string1\": \"887336a7\"}, " + << "{\"int1\": -1598014431, \"string1\": \"ba419d35-x\"}]}}"; + EXPECT_EQ(expectedStruct.str(), line); // Array column #11 cols.clear(); @@ -962,6 +1002,14 @@ TEST(Reader, selectColumns) { else EXPECT_TRUE(!c[i]); } + batch = reader->createRowBatch(1); + line.clear(); + printer = createColumnPrinter(line, reader->getType(), &c); + reader->next(*batch); + printer->reset(*batch); + printer->printRow(0); + std::string expectedArray("{\"list\": []}"); + EXPECT_EQ(expectedArray, line); // Map column #12 cols.clear(); @@ -975,6 +1023,18 @@ TEST(Reader, selectColumns) { else EXPECT_TRUE(!c[i]); } + batch = reader->createRowBatch(1); + line.clear(); + printer = createColumnPrinter(line, reader->getType(), &c); + reader->next(*batch); + printer->reset(*batch); + printer->printRow(0); + std::ostringstream expectedMap; + expectedMap << "{\"map\": [{\"key\": \"ba419d35-x\", \"value\": {\"int1\":" + << " -1598014431, \"string1\": \"ba419d35-x\"}}, {\"key\": " + << "\"887336a7\", \"value\": {\"int1\": -941468492, \"string1\": " + << "\"887336a7\"}}]}"; + EXPECT_EQ(expectedMap.str(), line); } std::map makeMetadata() { From 2773e1585e7a46331b1d0a80b43e816b1584e971 Mon Sep 17 00:00:00 2001 From: Aliaksei Sandryhaila Date: Mon, 12 Oct 2015 08:10:10 -0700 Subject: [PATCH 3/6] Minor code polishing. --- tools/src/FileContents.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/src/FileContents.cc b/tools/src/FileContents.cc index f0a1bf1b3c..ee643ca32d 100644 --- a/tools/src/FileContents.cc +++ b/tools/src/FileContents.cc @@ -49,20 +49,16 @@ void printContents(const char* filename, const orc::ReaderOptions opts) { int main(int argc, char* argv[]) { if (argc < 2) { - if (argc < 2) { - std::cout << "Usage: file-contents " - << "[--columns=column1,column2,...]\n" ; - return 1; - } - + std::cout << "Usage: file-contents " + << "[--columns=column1,column2,...]\n" ; + return 1; } try { const std::string COLUMNS_PREFIX = "--columns="; std::list cols; // Read command-line options - char* param ; - char* value ; + char *param, *value; for (int i = 2; i < argc; i++) { if ( (param = std::strstr(argv[i], COLUMNS_PREFIX.c_str())) ) { value = std::strtok(param+COLUMNS_PREFIX.length(), "," ); From eba3b241fa4981672fdbdf7ad8c8c9ef6ea11f76 Mon Sep 17 00:00:00 2001 From: Aliaksei Sandryhaila Date: Fri, 23 Oct 2015 11:12:31 -0700 Subject: [PATCH 4/6] Minor corrections following the code review. --- c++/include/orc/ColumnPrinter.hh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c++/include/orc/ColumnPrinter.hh b/c++/include/orc/ColumnPrinter.hh index c2d8ce8fdd..95e965fafc 100644 --- a/c++/include/orc/ColumnPrinter.hh +++ b/c++/include/orc/ColumnPrinter.hh @@ -47,6 +47,7 @@ namespace orc { }; ORC_UNIQUE_PTR createColumnPrinter(std::string&, - const Type& type, const std::vector* selectedColumns = nullptr); + const Type& type, + const std::vector* selectedColumns = ORC_NULLPTR); } #endif From fb0acce56c6533b8f2ad22daf8f7e69eb6e870dd Mon Sep 17 00:00:00 2001 From: Aliaksei Sandryhaila Date: Fri, 23 Oct 2015 11:43:00 -0700 Subject: [PATCH 5/6] Modified the file-contents utility to accept command-line options in any order. --- tools/src/FileContents.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/src/FileContents.cc b/tools/src/FileContents.cc index ee643ca32d..d13dcb54b9 100644 --- a/tools/src/FileContents.cc +++ b/tools/src/FileContents.cc @@ -49,17 +49,17 @@ void printContents(const char* filename, const orc::ReaderOptions opts) { int main(int argc, char* argv[]) { if (argc < 2) { - std::cout << "Usage: file-contents " - << "[--columns=column1,column2,...]\n" ; + std::cout << "Usage: file-contents [--columns=1,2,...] \n" ; return 1; } try { const std::string COLUMNS_PREFIX = "--columns="; std::list cols; + char* filename = ORC_NULLPTR; // Read command-line options char *param, *value; - for (int i = 2; i < argc; i++) { + for (int i = 1; i < argc; i++) { if ( (param = std::strstr(argv[i], COLUMNS_PREFIX.c_str())) ) { value = std::strtok(param+COLUMNS_PREFIX.length(), "," ); while (value) { @@ -67,14 +67,16 @@ int main(int argc, char* argv[]) { value = std::strtok(nullptr, "," ); } } else { - std::cout << "Unknown option " << argv[i] << "\n" ; + filename = argv[i]; } } orc::ReaderOptions opts; if (cols.size() > 0) { opts.include(cols); } - printContents(argv[1], opts); + if (filename != ORC_NULLPTR) { + printContents(filename, opts); + } } catch (std::exception& ex) { std::cerr << "Caught exception: " << ex.what() << "\n"; return 1; From 4f78e54a7c800e0ec4324650b8574023d2673015 Mon Sep 17 00:00:00 2001 From: Aliaksei Sandryhaila Date: Wed, 4 Nov 2015 14:55:52 -0800 Subject: [PATCH 6/6] Added usage info to file-contents. --- tools/src/FileContents.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/src/FileContents.cc b/tools/src/FileContents.cc index d13dcb54b9..c2e2c358f0 100644 --- a/tools/src/FileContents.cc +++ b/tools/src/FileContents.cc @@ -49,7 +49,9 @@ void printContents(const char* filename, const orc::ReaderOptions opts) { int main(int argc, char* argv[]) { if (argc < 2) { - std::cout << "Usage: file-contents [--columns=1,2,...] \n" ; + std::cout << "Usage: file-contents [--columns=1,2,...]\n" + << "Print contents of .\n" + << "If columns are specified, only these top-level (logical) columns are printed.\n" ; return 1; } try {