Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion c++/include/orc/Reader.hh
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ namespace orc {
/**
* Selects which type ids to read. The root type is always 0 and the
* rest of the types are labeled in a preorder traversal of the tree.
* The parent types are automatically selected, but the children are not.
*
* This option clears any previous setting of the selected columns or
* types.
Expand Down Expand Up @@ -206,6 +205,17 @@ namespace orc {
*/
RowReaderOptions& filter(const std::list<std::string>& filterColNames);

/**
* Selects which type ids to filter. The root type is always 0 and the
* rest of the types are labeled in a preorder traversal of the tree.
*
* This option clears any previous setting of the filter columns or
* types: the filter type ids are replaced by the given list and any
* filter column names set earlier are discarded.
* @param types a list of the type ids to filter
* @return this
*/
RowReaderOptions& filterTypes(const std::list<uint64_t>& types);

/**
* A map type of <typeId, ReadIntent>.
*/
Expand Down Expand Up @@ -308,6 +318,16 @@ namespace orc {
*/
const std::list<std::string>& getFilterColNames() const;

/**
* Were the filter type ids set?
* @return true if the filter columns are currently specified by type id
*         (see filterTypes), false otherwise
*/
bool getFilterTypeIdsSet() const;

/**
* Get the list of filter type ids.
* @return the stored filter type ids, as populated by filterTypes
*/
const std::list<uint64_t>& getFilterTypeIds() const;

/**
* Get the start of the range for the data being processed.
* @return if not set, return 0
Expand Down
5 changes: 3 additions & 2 deletions c++/src/ColumnReader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1158,9 +1158,10 @@ namespace orc {
}

uint64_t StructColumnReader::skip(uint64_t numValues, const ReadPhase& readPhase) {
if (readPhase.contains(this->type.getReaderCategory())) {
numValues = ColumnReader::skip(numValues, readPhase);
if (!readPhase.contains(this->type.getReaderCategory())) {
return 0;
}
numValues = ColumnReader::skip(numValues, readPhase);
for (auto& ptr : children) {
if (shouldProcessChild(ptr->getType().getReaderCategory(), readPhase)) {
ptr->skip(numValues, readPhase);
Expand Down
15 changes: 15 additions & 0 deletions c++/src/Options.hh
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,13 @@ namespace orc {
return *this;
}

/**
 * Configure the filter by type id.
 *
 * Marks the filter mode as ColumnFilter_TYPE_IDS, replaces the stored
 * filter type ids with the contents of `types`, and drops any filter
 * column names that were stored before.
 * @param types the type ids to filter on
 * @return this
 */
RowReaderOptions& RowReaderOptions::filterTypes(const std::list<uint64_t>& types) {
  auto& bits = *privateBits;
  bits.filter = ColumnFilter_TYPE_IDS;
  bits.filterColumnIndexes.assign(types.begin(), types.end());
  // A filter given by type id supersedes one given by column name.
  bits.filterColumnNames.clear();
  return *this;
}

RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) {
privateBits->dataStart = offset;
privateBits->dataLength = length;
Expand Down Expand Up @@ -268,6 +275,14 @@ namespace orc {
return privateBits->filterColumnNames;
}

/**
 * Report whether the filter mode is ColumnFilter_TYPE_IDS, i.e. the
 * filter columns were specified by type id.
 */
bool RowReaderOptions::getFilterTypeIdsSet() const {
  const auto& bits = *privateBits;
  return ColumnFilter_TYPE_IDS == bits.filter;
}

/**
 * Access the stored filter type ids.
 * @return a reference to the list held by this options object
 */
const std::list<uint64_t>& RowReaderOptions::getFilterTypeIds() const {
  const auto& bits = *privateBits;
  return bits.filterColumnIndexes;
}

/**
 * Get the stored start offset of the data range (the `offset` value
 * recorded by range()).
 */
uint64_t RowReaderOptions::getOffset() const {
  const auto& bits = *privateBits;
  return bits.dataStart;
}
Expand Down
77 changes: 75 additions & 2 deletions c++/src/Reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -176,22 +176,36 @@ namespace orc {
field != options.getInclude().end(); ++field) {
updateSelectedByFieldId(selectedColumns, *field);
}
selectParents(selectedColumns, *contents->schema.get());
} else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) {
for (std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
field != options.getIncludeNames().end(); ++field) {
updateSelectedByName(selectedColumns, *field);
}
selectParents(selectedColumns, *contents->schema.get());
} else if (options.getTypeIdsSet()) {
const RowReaderOptions::IdReadIntentMap idReadIntentMap = options.getIdReadIntentMap();
for (std::list<uint64_t>::const_iterator typeId = options.getInclude().begin();
typeId != options.getInclude().end(); ++typeId) {
updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap);
if (!idReadIntentMap.empty()) {
updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap);
selectParents(selectedColumns, *contents->schema.get());
} else {
if (*typeId < selectedColumns.size()) {
// Only select the specified type ID, do not automatically select children or parents
selectedColumns[*typeId] = true;
} else {
std::stringstream buffer;
buffer << "Invalid type id selected " << *typeId << " out of " << selectedColumns.size();
throw ParseError(buffer.str());
}
}
}
} else {
// default is to select all columns
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
selectParents(selectedColumns, *contents->schema.get());
}
selectParents(selectedColumns, *contents->schema.get());
selectedColumns[0] = true; // column 0 is selected by default
}

Expand Down Expand Up @@ -374,6 +388,65 @@ namespace orc {
processChildren(type);
}

startReadPhase = ReadPhase::LEADERS;
readerContext = std::unique_ptr<ReaderContext>(new ReaderContext());
readerContext->setFilterCallback(std::move(filterColIds), filter);
} else if (opts.getFilterTypeIdsSet()) {
// Handle filter by type IDs
const std::list<uint64_t>& filterTypeIds = opts.getFilterTypeIds();

for (const auto& typeId : filterTypeIds) {
if (typeId >= idTypeMap.size()) {
std::stringstream buffer;
buffer << "Invalid type id for filter " << typeId << " out of " << idTypeMap.size();
throw ParseError(buffer.str());
}

Type* type = idTypeMap[typeId];

// Process current node and all its parent nodes
// Set FILTER_CHILD for leaf nodes and FILTER_PARENT for non-leaf nodes
Type* current = type;
while (current != nullptr) {
if (current->getSubtypeCount() == 0) {
current->setReaderCategory(ReaderCategory::FILTER_CHILD);
} else if (current->getKind() == TypeKind::LIST
|| current->getKind() == TypeKind::MAP) {
current->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT);
} else {
current->setReaderCategory(ReaderCategory::FILTER_PARENT);
}
filterColIds.emplace(current->getColumnId());
current = current->getParent();
}

// Categorize every descendant of the selected node:
//   - leaf (no subtypes)       -> FILTER_CHILD
//   - LIST or MAP              -> FILTER_COMPOUND_ELEMENT
//   - any other nested type    -> FILTER_PARENT
// Every non-leaf descendant is then visited recursively.
std::function<void(Type*)> processChildren = [&processChildren](Type* node) {
  if (node == nullptr) return;

  // Index with uint64_t to match ORC's Type::getSubtypeCount()/getSubtype(),
  // avoiding the signed/unsigned comparison an `int` index would introduce.
  const uint64_t childCount = node->getSubtypeCount();
  for (uint64_t i = 0; i < childCount; ++i) {
    Type* child = node->getSubtype(i);
    if (child->getSubtypeCount() == 0) {
      // Leaf node: nothing below it to visit.
      child->setReaderCategory(ReaderCategory::FILTER_CHILD);
      continue;
    }

    const bool isCompound =
        child->getKind() == TypeKind::LIST || child->getKind() == TypeKind::MAP;
    child->setReaderCategory(isCompound ? ReaderCategory::FILTER_COMPOUND_ELEMENT
                                        : ReaderCategory::FILTER_PARENT);
    // Both kinds of non-leaf node have children that need a category too.
    processChildren(child);
  }
};
processChildren(type);
}

startReadPhase = ReadPhase::LEADERS;
readerContext = std::unique_ptr<ReaderContext>(new ReaderContext());
readerContext->setFilterCallback(std::move(filterColIds), filter);
Expand Down
2 changes: 1 addition & 1 deletion c++/src/Reader.hh
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ namespace orc {
ReadPhase startReadPhase;
bool needsFollowColumnsRead;

std::map<uint64_t, const Type*> idTypeMap;
std::map<uint64_t, Type*> idTypeMap;
std::map<std::string, Type*> nameTypeMap;
std::vector<std::string> columns;

Expand Down
1 change: 1 addition & 0 deletions c++/src/TypeImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,7 @@ namespace orc {
throw NotImplementedYet("Unknown type kind");
}
result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
result->setReaderCategory(fileType->getReaderCategory());
for (auto& key : fileType->getAttributeKeys()) {
const auto& value = fileType->getAttributeValue(key);
result->setAttribute(key, value);
Expand Down