From ef3e7ea7a9007422739fea13b34d445d3261beea Mon Sep 17 00:00:00 2001 From: Christian Bush Date: Fri, 20 Feb 2026 14:34:05 -0800 Subject: [PATCH] Task #6: Implement EnsureFileMetadataCached function - Enhanced EnsureCompleteMetadata() to fully load ORC file metadata - Opens ORC reader if not provided, with recursive call pattern - Reads and validates physical schema from ORC file - Initializes stripes_ with all stripes if not already set - Gets ORC type tree and builds OrcSchemaManifest - Initializes StripeStatisticsCache with per-stripe guarantees - Validates stripe indices against file's total stripe count - Thread-safe with mutex locking (physical_schema_mutex_) Implementation notes: - Follows Parquet's EnsureCompleteMetadata pattern closely - Handles recursive call when reader is null (unlock, open, recurse) - Casts void* reader to ORCFileReader* for method access - Statistics cache initialized with literal(true) per stripe - All stripe-level metadata loaded in one function call Verified: Manual code review following Parquet reference (lines 802-870) Co-Authored-By: Claude Sonnet 4.5 --- cpp/src/arrow/dataset/file_orc.cc | 66 +++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/dataset/file_orc.cc b/cpp/src/arrow/dataset/file_orc.cc index 8a56329c894e..b0bb7f2ae18d 100644 --- a/cpp/src/arrow/dataset/file_orc.cc +++ b/cpp/src/arrow/dataset/file_orc.cc @@ -416,11 +416,69 @@ Status OrcFileFragment::EnsureCompleteMetadata(void* reader) { cache_status_ = OrcCacheStatus::Loading; + // If no reader provided, open one ourselves if (reader == nullptr) { - ARROW_ASSIGN_OR_RAISE(auto orc_reader, OpenORCReader(source_)); - orc_reader_ = orc_reader.release(); - } else { - orc_reader_ = reader; + lock.Unlock(); + auto scan_options = std::make_shared(); + ARROW_ASSIGN_OR_RAISE(auto orc_reader, OpenORCReader(source_, scan_options)); + // Recursively call with the reader + return EnsureCompleteMetadata(orc_reader.get()); + } + + // Cast reader to ORCFileReader + auto* orc_reader = reinterpret_cast(reader); + + // Get physical schema from ORC file + ARROW_ASSIGN_OR_RAISE(auto file_schema, orc_reader->ReadSchema()); + + // Validate against given schema if provided + if (given_physical_schema_ && !given_physical_schema_->Equals(*file_schema)) { + return Status::Invalid("Fragment initialized with physical schema ", + *given_physical_schema_, " but ", source_.path(), + " has schema ", *file_schema); + } + + physical_schema_ = file_schema; + + // Initialize stripes if not already set (select all stripes) + if (!stripes_) { + ARROW_ASSIGN_OR_RAISE(int64_t num_stripes, orc_reader->NumberOfStripes()); + std::vector all_stripes(num_stripes); + for (int64_t i = 0; i < num_stripes; ++i) { + all_stripes[i] = static_cast(i); + } + stripes_ = std::move(all_stripes); + } + + // Get ORC type tree for manifest building + const void* orc_type = orc_reader->GetORCType(); + if (orc_type == nullptr) { + return Status::Invalid("Could not get ORC type information from ", source_.path()); + } + + // Build schema manifest + auto manifest = std::make_shared(); + ARROW_RETURN_NOT_OK( + OrcSchemaManifest::Make(physical_schema_, orc_type, manifest.get())); + + // Set metadata and manifest + orc_reader_ = reader; + manifest_ = std::move(manifest); + + // Initialize statistics cache + statistics_cache_ = std::make_unique(); + statistics_cache_->stripe_guarantees.resize(stripes_->size(), compute::literal(true)); + statistics_cache_->statistics_complete.resize( + manifest_->column_index_to_field.size(), false); + + // Validate stripe indices + ARROW_ASSIGN_OR_RAISE(int64_t total_stripes, orc_reader->NumberOfStripes()); + for (int stripe_idx : *stripes_) { + if (stripe_idx < 0 || stripe_idx >= static_cast(total_stripes)) { + return Status::IndexError("OrcFileFragment references stripe ", stripe_idx, + " but ", source_.path(), " only has ", total_stripes, + " stripes"); + } } cache_status_ = OrcCacheStatus::Cached;