From c74460f147a27f5490c13ab123836bf42bdd8748 Mon Sep 17 00:00:00 2001 From: Christian Bush Date: Fri, 20 Feb 2026 14:22:00 -0800 Subject: [PATCH] Task #3: Implement GetOrcColumnIndex function - Implemented GetOrcColumnIndex helper function that: - Resolves FieldRef to ORC column index using manifest - Uses FieldRef.FindOne() to locate field in schema - Traverses manifest tree following field path indices - Handles both top-level and nested fields - Returns column_index for leaf nodes (primitives with statistics) - Returns std::nullopt for containers or not found - Added necessary includes: - <optional> for std::optional return type - arrow/compute/api_scalar.h for FieldRef and FieldPath Implementation details: - Top-level fields accessed via manifest.schema_fields[index] - Nested fields traversed via current_field->children[index] - Validates indices at each level to prevent out-of-bounds - Only returns column_index if field is leaf (has statistics) - Containers (struct/list/map) return nullopt Verified: Manual code review - follows FieldRef resolution pattern Co-Authored-By: Claude Sonnet 4.5 --- cpp/src/arrow/dataset/file_orc.cc | 47 +++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/cpp/src/arrow/dataset/file_orc.cc b/cpp/src/arrow/dataset/file_orc.cc index 72ca7033e806..3c751657a691 100644 --- a/cpp/src/arrow/dataset/file_orc.cc +++ b/cpp/src/arrow/dataset/file_orc.cc @@ -18,8 +18,10 @@ #include "arrow/dataset/file_orc.h" #include +#include <optional> #include "arrow/adapters/orc/adapter.h" +#include "arrow/compute/api_scalar.h" #include "arrow/dataset/dataset_internal.h" #include "arrow/dataset/file_base.h" #include "arrow/dataset/scanner.h" @@ -150,6 +152,51 @@ Status OrcSchemaManifest::Make(const std::shared_ptr& schema, return Status::OK(); } +// Helper function to resolve FieldRef to ORC column index using the manifest +// Returns std::nullopt if the field is not found or is not a leaf node +std::optional GetOrcColumnIndex(const compute::FieldRef& 
field_ref, + const OrcSchemaManifest& manifest) { + // Try to resolve the FieldRef to a field in the schema + auto maybe_match = field_ref.FindOne(*manifest.origin_schema); + if (!maybe_match.ok()) { + // Field not found in schema + return std::nullopt; + } + + const compute::FieldPath& field_path = *maybe_match; + + // Traverse the manifest to find the corresponding OrcSchemaField + const OrcSchemaField* current_field = nullptr; + + // Start with top-level fields + for (size_t i = 0; i < field_path.indices().size(); ++i) { + int field_index = field_path.indices()[i]; + + if (i == 0) { + // Top-level field + if (field_index < 0 || static_cast(field_index) >= manifest.schema_fields.size()) { + return std::nullopt; + } + current_field = &manifest.schema_fields[field_index]; + } else { + // Nested field + if (!current_field || field_index < 0 || + static_cast(field_index) >= current_field->children.size()) { + return std::nullopt; + } + current_field = &current_field->children[field_index]; + } + } + + // Check if we found a field and if it's a leaf node + if (current_field && current_field->is_leaf()) { + return current_field->column_index; + } + + // Not a leaf node or not found + return std::nullopt; +} + namespace { Result> OpenORCReader(