Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion c++/include/orc/ColumnPrinter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ namespace orc {
};

ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&,
const Type& type);
const Type& type,
const std::vector<bool>* selectedColumns = ORC_NULLPTR);
}
#endif
31 changes: 20 additions & 11 deletions c++/src/ColumnPrinter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,10 @@ namespace orc {
class StructColumnPrinter: public ColumnPrinter {
private:
std::vector<ColumnPrinter*> fieldPrinter;
std::vector<std::string> fieldNames;
public:
StructColumnPrinter(std::string&, const Type& type);
StructColumnPrinter(std::string&, const Type& type,
const std::vector<bool>* selectedColumns);
virtual ~StructColumnPrinter();
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
Expand Down Expand Up @@ -209,9 +211,11 @@ namespace orc {
}
}

std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer,
const Type& type) {
ColumnPrinter *result;
std::unique_ptr<ColumnPrinter> createColumnPrinter(
std::string& buffer,
const Type& type,
const std::vector<bool>* selectedColumns) {
ColumnPrinter *result = nullptr;
switch(static_cast<int64_t>(type.getKind())) {
case BOOLEAN:
result = new BooleanColumnPrinter(buffer, type);
Expand Down Expand Up @@ -252,7 +256,7 @@ namespace orc {
break;

case STRUCT:
result = new StructColumnPrinter(buffer, type);
result = new StructColumnPrinter(buffer, type, selectedColumns);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You also need to push the selected columns down through the list, map, and union types. Otherwise, you won't be able to select columns below them.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, this is the intended implementation: only specify top-level (logical) columns. Otherwise, there is no way to distinguish between logical and physical columns.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, if an ORC file contains columns INT, STRUCT<STRING, BOOLEAN>, running
./file-contents --columns=2 file.orc
will select the STRUCT column. If we allowed selection of subcolumns, it would be unclear which column the above command should select: STRUCT or STRING.

break;

case DECIMAL:
Expand Down Expand Up @@ -558,12 +562,17 @@ namespace orc {
}
}

StructColumnPrinter::StructColumnPrinter(std::string& buffer,
const Type& type
): ColumnPrinter(buffer, type) {
StructColumnPrinter::StructColumnPrinter(
std::string& buffer,
const Type& type,
const std::vector<bool>* selectedColumns
): ColumnPrinter(buffer, type) {
for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i))
.release());
if (selectedColumns==nullptr || selectedColumns->at(type.getSubtype(i).getColumnId())) {
fieldNames.push_back(type.getFieldName(i));
fieldPrinter.push_back(createColumnPrinter(buffer,
type.getSubtype(i)).release());
}
}
}

Expand Down Expand Up @@ -592,7 +601,7 @@ namespace orc {
writeString(buffer, ", ");
}
writeChar(buffer, '"');
writeString(buffer, type.getFieldName(i).c_str());
writeString(buffer, fieldNames[i].c_str());
writeString(buffer, "\": ");
fieldPrinter[i]->printRow(rowId);
}
Expand Down
31 changes: 28 additions & 3 deletions tools/src/FileContents.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ void printContents(const char* filename, const orc::ReaderOptions opts) {

std::unique_ptr<orc::ColumnVectorBatch> batch = reader->createRowBatch(1000);
std::string line;
const std::vector<bool> selectedColumns = reader->getSelectedColumns();
std::unique_ptr<orc::ColumnPrinter> printer =
createColumnPrinter(line, reader->getType());
createColumnPrinter(line, reader->getType(), &selectedColumns);

while (reader->next(*batch)) {
printer->reset(*batch);
Expand All @@ -48,12 +49,36 @@ void printContents(const char* filename, const orc::ReaderOptions opts) {

int main(int argc, char* argv[]) {
if (argc < 2) {
std::cout << "Usage: file-contents <filename>\n";
std::cout << "Usage: file-contents <filename> [--columns=1,2,...]\n"
<< "Print contents of <filename>.\n"
<< "If columns are specified, only these top-level (logical) columns are printed.\n" ;
return 1;
}
try {
const std::string COLUMNS_PREFIX = "--columns=";
std::list<int64_t> cols;
char* filename = ORC_NULLPTR;

// Read command-line options
char *param, *value;
for (int i = 1; i < argc; i++) {
if ( (param = std::strstr(argv[i], COLUMNS_PREFIX.c_str())) ) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are the semantics? Are the fields above the selected ones automatically included? What about the types below the selected one?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be much more user-friendly to select by column name rather than column id, which, given complex types, is hard to know. At that point, you might start with only selecting top-level columns with something like "--columns=field1,field12", which would mean all of the types under those columns.

Eventually, it would be nice to support virtual column names like "length" and "value" for lists, and "length", "key", and "value" for maps. Nested structures would look like "outer12.inner3" or "outer12.key".

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, originally I wanted to use column names, too. Unfortunately, they are optional. If column names are missing, how would a user select specific columns?

value = std::strtok(param+COLUMNS_PREFIX.length(), "," );
while (value) {
cols.push_back(std::atoi(value));
value = std::strtok(nullptr, "," );
}
} else {
filename = argv[i];
}
}
orc::ReaderOptions opts;
printContents(argv[1], opts);
if (cols.size() > 0) {
opts.include(cols);
}
if (filename != ORC_NULLPTR) {
printContents(filename, opts);
}
} catch (std::exception& ex) {
std::cerr << "Caught exception: " << ex.what() << "\n";
return 1;
Expand Down
60 changes: 60 additions & 0 deletions tools/test/TestReader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,26 @@ TEST(Reader, selectColumns) {
for (unsigned int i=0; i < c.size(); i++) {
EXPECT_TRUE(c[i]);
}
std::unique_ptr<orc::ColumnVectorBatch> batch = reader->createRowBatch(1);
std::string line;
std::unique_ptr<orc::ColumnPrinter> printer =
createColumnPrinter(line, reader->getType(), &c);
reader->next(*batch);
printer->reset(*batch);
printer->printRow(0);
std::ostringstream expected;
expected << "{\"boolean1\": true, \"byte1\": -76, "
<< "\"short1\": 21684, \"int1\": -941468492, "
<< "\"long1\": -6863419716327549772, \"float1\": 0.7762409, "
<< "\"double1\": 0.77624090391187, \"bytes1\": [123, 108, 207, 27, 93, "
<< "157, 139, 233, 181, 90, 14, 60, 34, 120, 26, 119, 231, 50, 155, 121], "
<< "\"string1\": \"887336a7\", \"middle\": {\"list\": [{\"int1\": "
<< "-941468492, \"string1\": \"887336a7\"}, {\"int1\": -1598014431, "
<< "\"string1\": \"ba419d35-x\"}]}, \"list\": [], \"map\": [{\"key\": "
<< "\"ba419d35-x\", \"value\": {\"int1\": -1598014431, \"string1\": "
<< "\"ba419d35-x\"}}, {\"key\": \"887336a7\", \"value\": {\"int1\": "
<< "-941468492, \"string1\": \"887336a7\"}}]}";
EXPECT_EQ(expected.str(), line);

// Int column #2
cols.clear();
Expand All @@ -936,6 +956,15 @@ TEST(Reader, selectColumns) {
else
EXPECT_TRUE(!c[i]);
}
batch = reader->createRowBatch(1);
line.clear();
printer = createColumnPrinter(line, reader->getType(), &c);
reader->next(*batch);
printer->reset(*batch);
printer->printRow(0);
std::string expectedInt("{\"byte1\": -76}");
EXPECT_EQ(expectedInt, line);


// Struct column #10
cols.clear();
Expand All @@ -949,6 +978,17 @@ TEST(Reader, selectColumns) {
else
EXPECT_TRUE(!c[i]);
}
batch = reader->createRowBatch(1);
line.clear();
printer = createColumnPrinter(line, reader->getType(), &c);
reader->next(*batch);
printer->reset(*batch);
printer->printRow(0);
std::ostringstream expectedStruct;
expectedStruct << "{\"middle\": {\"list\": "
<< "[{\"int1\": -941468492, \"string1\": \"887336a7\"}, "
<< "{\"int1\": -1598014431, \"string1\": \"ba419d35-x\"}]}}";
EXPECT_EQ(expectedStruct.str(), line);

// Array column #11
cols.clear();
Expand All @@ -962,6 +1002,14 @@ TEST(Reader, selectColumns) {
else
EXPECT_TRUE(!c[i]);
}
batch = reader->createRowBatch(1);
line.clear();
printer = createColumnPrinter(line, reader->getType(), &c);
reader->next(*batch);
printer->reset(*batch);
printer->printRow(0);
std::string expectedArray("{\"list\": []}");
EXPECT_EQ(expectedArray, line);

// Map column #12
cols.clear();
Expand All @@ -975,6 +1023,18 @@ TEST(Reader, selectColumns) {
else
EXPECT_TRUE(!c[i]);
}
batch = reader->createRowBatch(1);
line.clear();
printer = createColumnPrinter(line, reader->getType(), &c);
reader->next(*batch);
printer->reset(*batch);
printer->printRow(0);
std::ostringstream expectedMap;
expectedMap << "{\"map\": [{\"key\": \"ba419d35-x\", \"value\": {\"int1\":"
<< " -1598014431, \"string1\": \"ba419d35-x\"}}, {\"key\": "
<< "\"887336a7\", \"value\": {\"int1\": -941468492, \"string1\": "
<< "\"887336a7\"}}]}";
EXPECT_EQ(expectedMap.str(), line);
}

std::map<std::string, std::string> makeMetadata() {
Expand Down