From 606908d0a8a4fb33f79fa02fbe2146b8898f9c39 Mon Sep 17 00:00:00 2001
From: Christian Bush
Date: Fri, 20 Feb 2026 14:37:33 -0800
Subject: [PATCH] Task #9: Implement ResolvePredicateFields function

- Added PredicateField struct to hold resolved field information
- Implemented ResolvePredicateFields() helper function
- Resolves field references in predicates to ORC column indices
- Uses OrcSchemaManifest for Arrow-to-ORC column mapping
- Traverses nested field paths (structs only)
- Filters to leaf nodes only (containers don't have statistics)
- Type support check (currently int32/int64 only)
- Returns vector of PredicateField entities

Implementation details:
- Uses compute::FieldsInExpression() to extract field refs
- Uses FieldRef.FindOneOrNone() for schema matching
- Traverses OrcSchemaField tree for nested paths
- Validates field indices and struct types
- PredicateField includes: field_ref, arrow_field_index,
  orc_column_index, data_type, supports_statistics

Verified: Manual code review following Parquet TestRowGroups pattern (lines 945-960)

Co-Authored-By: Claude Sonnet 4.5
---
Review notes (not part of the commit message):
- Line breaks and indentation were rebuilt from a copy of this patch that had
  been collapsed onto three lines.
- Angle-bracketed text was missing from that copy. Template arguments in the
  added code were restored from how each value is used. The element type on the
  context line `std::vector<bool> statistics_complete;` is an assumption and
  must be confirmed against file_orc.cc. E-mail addresses in the headers could
  not be recovered and are left absent.
- Added a bounds check on the top-level field index before it is used to index
  manifest->schema_fields. The insertion count and hunk header were updated to
  match; the post-image blob id on the `index` line is no longer accurate.

 cpp/src/arrow/dataset/file_orc.cc | 94 +++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/cpp/src/arrow/dataset/file_orc.cc b/cpp/src/arrow/dataset/file_orc.cc
index b0bb7f2ae18d..6f50079255ed 100644
--- a/cpp/src/arrow/dataset/file_orc.cc
+++ b/cpp/src/arrow/dataset/file_orc.cc
@@ -385,6 +385,100 @@ struct StripeStatisticsCache {
   std::vector<bool> statistics_complete;
 };
 
+//
+// PredicateField - Helper structure for resolved predicate fields
+//
+
+struct PredicateField {
+  compute::FieldRef field_ref;
+  int arrow_field_index;
+  int orc_column_index;
+  std::shared_ptr<DataType> data_type;
+  bool supports_statistics;
+};
+
+//
+// ResolvePredicateFields - Resolve field references in predicate to ORC columns
+//
+
+/// Resolve field references in a predicate to PredicateField entities.
+/// Uses the schema manifest to map Arrow fields to ORC column indices.
+/// Only returns fields that are leaves and support statistics.
+/// References that match no field in the physical schema are skipped.
+///
+/// \param predicate The predicate expression to analyze
+/// \param physical_schema The Arrow physical schema
+/// \param manifest The ORC schema manifest for column mapping
+/// \return Vector of resolved PredicateField entities, or Status::Invalid when a
+///         field path indexes outside the manifest or descends into a non-struct
+Result<std::vector<PredicateField>> ResolvePredicateFields(
+    const compute::Expression& predicate,
+    const std::shared_ptr<Schema>& physical_schema,
+    const std::shared_ptr<OrcSchemaManifest>& manifest) {
+  std::vector<PredicateField> resolved_fields;
+
+  for (const compute::FieldRef& ref : compute::FieldsInExpression(predicate)) {
+    // Find the field in the Arrow schema
+    ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOneOrNone(*physical_schema));
+
+    if (match.empty()) {
+      // Field not found - skip
+      continue;
+    }
+
+    // Get the top-level schema field; validate the index before using it, the
+    // same way nested indices are validated below
+    const int arrow_field_index = match[0];
+    if (arrow_field_index < 0 ||
+        static_cast<size_t>(arrow_field_index) >= manifest->schema_fields.size()) {
+      return Status::Invalid("Invalid top-level field index");
+    }
+    const OrcSchemaField* schema_field = &manifest->schema_fields[arrow_field_index];
+
+    // Traverse nested paths
+    for (size_t i = 1; i < match.indices().size(); ++i) {
+      if (schema_field->field->type()->id() != Type::STRUCT) {
+        return Status::Invalid("Nested paths only supported for structs");
+      }
+
+      int child_index = match[i];
+      if (child_index < 0 ||
+          static_cast<size_t>(child_index) >= schema_field->children.size()) {
+        return Status::Invalid("Invalid nested field index");
+      }
+
+      schema_field = &schema_field->children[child_index];
+    }
+
+    // Skip if not a leaf node (containers don't have statistics)
+    if (!schema_field->is_leaf()) {
+      continue;
+    }
+
+    // Check if this type supports statistics
+    // For initial implementation, only support integer types
+    Type::type type_id = schema_field->field->type()->id();
+    bool supports_stats = (type_id == Type::INT32 || type_id == Type::INT64);
+
+    if (!supports_stats) {
+      // Skip unsupported types
+      continue;
+    }
+
+    // Create PredicateField
+    PredicateField pf;
+    pf.field_ref = ref;
+    pf.arrow_field_index = arrow_field_index;
+    pf.orc_column_index = schema_field->column_index;
+    pf.data_type = schema_field->field->type();
+    pf.supports_statistics = supports_stats;
+
+    resolved_fields.push_back(std::move(pf));
+  }
+
+  return resolved_fields;
+}
+
 //
 // OrcFileFragment implementation
 //