From 0c640ac4f974877f7a08c0de4bb0b52ff1c74965 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 21 Mar 2024 09:47:00 +0800 Subject: [PATCH 1/6] GH-36026: [C++][ORC] Check TZDB availability for ORC --- cpp/src/arrow/adapters/orc/adapter.cc | 26 ++++++++++++++++------ cpp/src/arrow/adapters/orc/adapter_test.cc | 26 ++++++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 2100e701f33..60809bbc059 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -18,17 +18,15 @@ #include "arrow/adapters/orc/adapter.h" #include -#include -#include +#include +#include #include #include #include #include -#include #include #include "arrow/adapters/orc/util.h" -#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/io/interfaces.h" #include "arrow/memory_pool.h" @@ -37,14 +35,11 @@ #include "arrow/table.h" #include "arrow/table_builder.h" #include "arrow/type.h" -#include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" -#include "arrow/util/range.h" -#include "arrow/util/visibility.h" #include "orc/Exceptions.hh" // alias to not interfere with nested orc namespace @@ -183,6 +178,21 @@ liborc::RowReaderOptions default_row_reader_options() { return options; } +// Remove this check once https://issues.apache.org/jira/browse/ORC-1661 is fixed. +Status check_timezone_database_availability() { + auto tz_dir = std::getenv("TZDIR"); + bool is_tzdb_avaiable = tz_dir != nullptr + ? std::filesystem::exists(tz_dir) + : std::filesystem::exists("/usr/share/zoneinfo"); + if (!is_tzdb_avaiable) { + return Status::Invalid( + "IANA timezone database is unavailable but required by ORC." + " Please install it to /usr/share/zoneinfo or set TZDIR env to the installed" + " directory"); + } + return Status::OK(); +} + } // namespace class ORCFileReader::Impl { @@ -541,6 +551,7 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { + RETURN_NOT_OK(check_timezone_database_availability()); auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -807,6 +818,7 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { + RETURN_NOT_OK(check_timezone_database_availability()); std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 73ecde6b9b5..c4c3660ff5b 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -33,8 +33,10 @@ #include "arrow/status.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/type.h" +#include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" namespace liborc = orc; @@ -636,6 +638,30 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) { AssertSchemaEqual(schema, read_schema, /*check_metadata=*/true); } +TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) { + // Backup the original TZDIR env and set a wrong value by purpose to trigger the check. + const char* tzdir_env_key = "TZDIR"; + const char* expect_str = "IANA timezone database is unavailable but required by ORC"; + auto tzdir_env_backup = std::getenv(tzdir_env_key); + ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/a/b/c/d/e")); + + EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + EXPECT_THAT( + adapters::orc::ORCFileWriter::Open(out_stream.get(), adapters::orc::WriteOptions()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); + + EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); + EXPECT_THAT(adapters::orc::ORCFileReader::Open( + std::make_shared(buffer), default_memory_pool()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); + + // Restore TZDIR env. + ARROW_EXPECT_OK(arrow::internal::DelEnvVar(tzdir_env_key)); + if (tzdir_env_backup) { + ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, tzdir_env_backup)); + } +} + // Trivial class TestORCWriterTrivialNoWrite : public ::testing::Test {}; From 589b252a22ba3bc000ec222eec6d0ee79ae63205 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 22 Mar 2024 09:32:54 +0800 Subject: [PATCH 2/6] catch everything --- cpp/src/arrow/adapters/orc/adapter.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 60809bbc059..55c1224186b 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -18,7 +18,6 @@ #include "arrow/adapters/orc/adapter.h" #include -#include #include #include #include @@ -75,6 +74,12 @@ namespace liborc = orc; } \ catch (const liborc::NotImplementedYet& e) { \ return Status::NotImplemented(e.what()); \ + } \ + catch (const std::exception& e) { \ + return Status::Invalid(e.what()); \ + } \ + catch (...) { \ + return Status::UnknownError("ORC error"); \ } #define ORC_CATCH_NOT_OK(_s) \ @@ -178,7 +183,8 @@ liborc::RowReaderOptions default_row_reader_options() { return options; } -// Remove this check once https://issues.apache.org/jira/browse/ORC-1661 is fixed. +// Proactively check the availability of timezone database. +// Remove it once https://issues.apache.org/jira/browse/ORC-1661 has been fixed. Status check_timezone_database_availability() { auto tz_dir = std::getenv("TZDIR"); bool is_tzdb_avaiable = tz_dir != nullptr From a2f7937065f3496f482111a866a70e776db4295d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 25 Mar 2024 13:29:36 +0800 Subject: [PATCH 3/6] remove unnecessary check after catch everything --- cpp/src/arrow/adapters/orc/adapter.cc | 22 ++------------ cpp/src/arrow/adapters/orc/adapter_test.cc | 35 ++++++++++++++-------- 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 55c1224186b..a7e90c6106f 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -76,7 +76,7 @@ namespace liborc = orc; return Status::NotImplemented(e.what()); \ } \ catch (const std::exception& e) { \ - return Status::Invalid(e.what()); \ + return Status::UnknownError(e.what()); \ } \ catch (...) { \ return Status::UnknownError("ORC error"); \ @@ -183,22 +183,6 @@ liborc::RowReaderOptions default_row_reader_options() { return options; } -// Proactively check the availability of timezone database. -// Remove it once https://issues.apache.org/jira/browse/ORC-1661 has been fixed. -Status check_timezone_database_availability() { - auto tz_dir = std::getenv("TZDIR"); - bool is_tzdb_avaiable = tz_dir != nullptr - ? std::filesystem::exists(tz_dir) - : std::filesystem::exists("/usr/share/zoneinfo"); - if (!is_tzdb_avaiable) { - return Status::Invalid( - "IANA timezone database is unavailable but required by ORC." - " Please install it to /usr/share/zoneinfo or set TZDIR env to the installed" - " directory"); - } - return Status::OK(); -} - } // namespace class ORCFileReader::Impl { @@ -557,7 +541,6 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { - RETURN_NOT_OK(check_timezone_database_availability()); auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -796,7 +779,7 @@ class ORCFileWriter::Impl { &(arrow_index_offset[i]), (root->fields)[i])); } root->numElements = (root->fields)[0]->numElements; - writer_->add(*batch); + ORC_CATCH_NOT_OK(writer_->add(*batch)); batch->clear(); num_rows -= batch_size; } @@ -824,7 +807,6 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { - RETURN_NOT_OK(check_timezone_database_availability()); std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index c4c3660ff5b..8aaa303d1b1 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -638,28 +638,37 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) { AssertSchemaEqual(schema, read_schema, /*check_metadata=*/true); } -TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) { - // Backup the original TZDIR env and set a wrong value by purpose to trigger the check. +TEST(TestAdapterReadWrite, catchTimezoneError) { + // Write a simple ORC file with timestamp type. + auto table_schema = schema({field("a", timestamp(TimeUnit::NANO))}); + const auto table = GenerateRandomTable(table_schema, /*size=1*/ 1, /*min_num_chunks=*/1, + /*max_num_chunks=*/1, /*null_probability=*/0); + EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(4096)); + EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open(out_stream.get())); + ARROW_EXPECT_OK(writer->Write(*table)); + ARROW_EXPECT_OK(writer->Close()); + EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); + + // Backup the original TZDIR env and set a wrong value by purpose. const char* tzdir_env_key = "TZDIR"; - const char* expect_str = "IANA timezone database is unavailable but required by ORC"; auto tzdir_env_backup = std::getenv(tzdir_env_key); - ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/a/b/c/d/e")); + ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/invalid_path")); - EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); - EXPECT_THAT( - adapters::orc::ORCFileWriter::Open(out_stream.get(), adapters::orc::WriteOptions()), - Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); + EXPECT_OK_AND_ASSIGN(auto reader, adapters::orc::ORCFileReader::Open( + std::make_shared(buffer), + default_memory_pool())); - EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); - EXPECT_THAT(adapters::orc::ORCFileReader::Open( - std::make_shared(buffer), default_memory_pool()), - Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); + // Verify that we have caught the orc::TimezoneError exception. + EXPECT_THAT(reader->Read(), + Raises(StatusCode::UnknownError, testing::HasSubstr("/invalid_path"))); - // Restore TZDIR env. + // Restore TZDIR env to verify timestamp values can be read successfully. ARROW_EXPECT_OK(arrow::internal::DelEnvVar(tzdir_env_key)); if (tzdir_env_backup) { ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, tzdir_env_backup)); } + EXPECT_OK_AND_ASSIGN(auto read_table, reader->Read()); + EXPECT_TRUE(read_table->Equals(*table)); } // Trivial From a4d880eb5f86014c381c364e1879efd60729f3b0 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 25 Mar 2024 22:26:48 +0800 Subject: [PATCH 4/6] add back the check --- cpp/src/arrow/adapters/orc/adapter.cc | 35 ++++++++++++++++++++ cpp/src/arrow/adapters/orc/adapter.h | 11 +++++++ cpp/src/arrow/adapters/orc/adapter_test.cc | 38 ++++++++++------------ 3 files changed, 63 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index a7e90c6106f..b83c99c3e7d 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -40,6 +40,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" #include "orc/Exceptions.hh" +#include "orc/orc-config.hh" // alias to not interfere with nested orc namespace namespace liborc = orc; @@ -183,8 +184,40 @@ liborc::RowReaderOptions default_row_reader_options() { return options; } +// Proactively check timezone database availability for ORC versions older than 2.0.0 +Status check_timezone_database_availability() { + if (OrcVersion::Get().value_or(OrcVersion{0, 0, 0}).major >= 2) { + return Status::OK(); + } + auto tz_dir = std::getenv("TZDIR"); + bool is_tzdb_avaiable = tz_dir != nullptr + ? std::filesystem::exists(tz_dir) + : std::filesystem::exists("/usr/share/zoneinfo"); + if (!is_tzdb_avaiable) { + return Status::Invalid( + "IANA time zone database is unavailable but required by ORC." + " Please install it to /usr/share/zoneinfo or set TZDIR env to the installed" + " directory"); + } + return Status::OK(); +} + } // namespace +std::optional OrcVersion::Get() { + std::vector versions; + std::stringstream ss{ORC_VERSION}; + while (ss.good()) { + std::string str; + std::getline(ss, str, '.'); + versions.emplace_back(std::stoi(str)); + } + if (versions.size() != 3) { + return std::nullopt; + } + return OrcVersion{versions[0], versions[1], versions[2]}; +} + class ORCFileReader::Impl { public: Impl() {} @@ -541,6 +574,7 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { + RETURN_NOT_OK(check_timezone_database_availability()); auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -807,6 +841,7 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { + RETURN_NOT_OK(check_timezone_database_availability()); std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 4ffff81f355..1beac1ed061 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/adapters/orc/options.h" @@ -47,6 +48,16 @@ struct StripeInformation { int64_t first_row_id; }; +/// \brief Version provided by the official ORC C++ library. +struct ARROW_EXPORT OrcVersion { + int major; + int minor; + int patch; + + /// \brief Return the current version of ORC C++ library. + static std::optional Get(); +}; + /// \class ORCFileReader /// \brief Read an Arrow Table or RecordBatch from an ORC file. class ARROW_EXPORT ORCFileReader { diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 8aaa303d1b1..76fab3d3410 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -638,37 +638,33 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) { AssertSchemaEqual(schema, read_schema, /*check_metadata=*/true); } -TEST(TestAdapterReadWrite, catchTimezoneError) { - // Write a simple ORC file with timestamp type. - auto table_schema = schema({field("a", timestamp(TimeUnit::NANO))}); - const auto table = GenerateRandomTable(table_schema, /*size=1*/ 1, /*min_num_chunks=*/1, - /*max_num_chunks=*/1, /*null_probability=*/0); - EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(4096)); - EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open(out_stream.get())); - ARROW_EXPECT_OK(writer->Write(*table)); - ARROW_EXPECT_OK(writer->Close()); - EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); +TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) { + auto orc_version = adapters::orc::OrcVersion::Get(); + if (orc_version.has_value() && orc_version->major >= 2) { + GTEST_SKIP() << "Only ORC pre-2.0.0 versions have the time zone database check"; + } - // Backup the original TZDIR env and set a wrong value by purpose. + // Backup the original TZDIR env and set a wrong value by purpose to trigger the check. const char* tzdir_env_key = "TZDIR"; + const char* expect_str = "IANA time zone database is unavailable but required by ORC"; auto tzdir_env_backup = std::getenv(tzdir_env_key); - ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/invalid_path")); + ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/a/b/c/d/e")); - EXPECT_OK_AND_ASSIGN(auto reader, adapters::orc::ORCFileReader::Open( - std::make_shared(buffer), - default_memory_pool())); + EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + EXPECT_THAT( + adapters::orc::ORCFileWriter::Open(out_stream.get(), adapters::orc::WriteOptions()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); - // Verify that we have caught the orc::TimezoneError exception. - EXPECT_THAT(reader->Read(), - Raises(StatusCode::UnknownError, testing::HasSubstr("/invalid_path"))); + EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); + EXPECT_THAT(adapters::orc::ORCFileReader::Open( + std::make_shared(buffer), default_memory_pool()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); - // Restore TZDIR env to verify timestamp values can be read successfully. + // Restore TZDIR env. ARROW_EXPECT_OK(arrow::internal::DelEnvVar(tzdir_env_key)); if (tzdir_env_backup) { ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, tzdir_env_backup)); } - EXPECT_OK_AND_ASSIGN(auto read_table, reader->Read()); - EXPECT_TRUE(read_table->Equals(*table)); } // Trivial From 6320e7ad7b9af8d432e41d9ef50ee8cdb4ef2690 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 26 Mar 2024 10:11:40 +0800 Subject: [PATCH 5/6] Use EnvVarGuard and change function name --- cpp/src/arrow/adapters/orc/adapter.cc | 45 ++++++++-------------- cpp/src/arrow/adapters/orc/adapter.h | 10 ----- cpp/src/arrow/adapters/orc/adapter_test.cc | 16 +------- cpp/src/arrow/adapters/orc/util.cc | 8 ++++ cpp/src/arrow/adapters/orc/util.h | 3 ++ 5 files changed, 28 insertions(+), 54 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index b83c99c3e7d..127ec49ba99 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -40,7 +40,6 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" #include "orc/Exceptions.hh" -#include "orc/orc-config.hh" // alias to not interfere with nested orc namespace namespace liborc = orc; @@ -174,7 +173,7 @@ class OrcStripeReader : public RecordBatchReader { int64_t batch_size_; }; -liborc::RowReaderOptions default_row_reader_options() { +liborc::RowReaderOptions DefaultRowReaderOptions() { liborc::RowReaderOptions options; // Orc timestamp type is error-prone since it serializes values in the writer timezone // and reads them back in the reader timezone. To avoid this, both the Apache Orc C++ @@ -185,8 +184,8 @@ liborc::RowReaderOptions default_row_reader_options() { } // Proactively check timezone database availability for ORC versions older than 2.0.0 -Status check_timezone_database_availability() { - if (OrcVersion::Get().value_or(OrcVersion{0, 0, 0}).major >= 2) { +Status CheckTimeZoneDatabaseAvailability() { + if (GetOrcMajorVersion() >= 2) { return Status::OK(); } auto tz_dir = std::getenv("TZDIR"); @@ -204,20 +203,6 @@ Status check_timezone_database_availability() { } // namespace -std::optional OrcVersion::Get() { - std::vector versions; - std::stringstream ss{ORC_VERSION}; - while (ss.good()) { - std::string str; - std::getline(ss, str, '.'); - versions.emplace_back(std::stoi(str)); - } - if (versions.size() != 3) { - return std::nullopt; - } - return OrcVersion{versions[0], versions[1], versions[2]}; -} - class ORCFileReader::Impl { public: Impl() {} @@ -365,25 +350,25 @@ class ORCFileReader::Impl { } Result> Read() { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema()); return ReadTable(opts, schema); } Result> Read(const std::shared_ptr& schema) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); return ReadTable(opts, schema); } Result> Read(const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); } Result> Read(const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectNames(&opts, include_names)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); @@ -391,13 +376,13 @@ class ORCFileReader::Impl { Result> Read(const std::shared_ptr& schema, const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); return ReadTable(opts, schema); } Result> ReadStripe(int64_t stripe) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadBatch(opts, schema, stripes_[static_cast(stripe)].num_rows); @@ -405,7 +390,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -414,7 +399,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectNames(&opts, include_names)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -520,7 +505,7 @@ class ORCFileReader::Impl { return nullptr; } - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); if (!include_indices.empty()) { RETURN_NOT_OK(SelectIndices(&opts, include_indices)); } @@ -541,7 +526,7 @@ class ORCFileReader::Impl { Result> GetRecordBatchReader( int64_t batch_size, const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); if (!include_names.empty()) { RETURN_NOT_OK(SelectNames(&opts, include_names)); } @@ -574,7 +559,7 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { - RETURN_NOT_OK(check_timezone_database_availability()); + RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -841,7 +826,7 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { - RETURN_NOT_OK(check_timezone_database_availability()); + RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 1beac1ed061..95f69f96163 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -48,16 +48,6 @@ struct StripeInformation { int64_t first_row_id; }; -/// \brief Version provided by the official ORC C++ library. -struct ARROW_EXPORT OrcVersion { - int major; - int minor; - int patch; - - /// \brief Return the current version of ORC C++ library. - static std::optional Get(); -}; - /// \class ORCFileReader /// \brief Read an Arrow Table or RecordBatch from an ORC file. class ARROW_EXPORT ORCFileReader { diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 76fab3d3410..b9d6c53215b 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -639,32 +639,20 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) { } TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) { - auto orc_version = adapters::orc::OrcVersion::Get(); - if (orc_version.has_value() && orc_version->major >= 2) { + if (adapters::orc::GetOrcMajorVersion() >= 2) { GTEST_SKIP() << "Only ORC pre-2.0.0 versions have the time zone database check"; } - // Backup the original TZDIR env and set a wrong value by purpose to trigger the check. - const char* tzdir_env_key = "TZDIR"; + EnvVarGuard tzdir_guard("TZDIR", "/wrong/path"); const char* expect_str = "IANA time zone database is unavailable but required by ORC"; - auto tzdir_env_backup = std::getenv(tzdir_env_key); - ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/a/b/c/d/e")); - EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); EXPECT_THAT( adapters::orc::ORCFileWriter::Open(out_stream.get(), adapters::orc::WriteOptions()), Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); - EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); EXPECT_THAT(adapters::orc::ORCFileReader::Open( std::make_shared(buffer), default_memory_pool()), Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); - - // Restore TZDIR env. - ARROW_EXPECT_OK(arrow::internal::DelEnvVar(tzdir_env_key)); - if (tzdir_env_backup) { - ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, tzdir_env_backup)); - } } // Trivial diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index f4bdbae6a7b..2a74bec1aa6 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -37,6 +37,7 @@ #include "orc/MemoryPool.hh" #include "orc/OrcFile.hh" +#include "orc/orc-config.hh" // alias to not interfere with nested orc namespace namespace liborc = orc; @@ -1220,6 +1221,13 @@ Result> GetArrowField(const std::string& name, return field(name, std::move(arrow_type), nullable, std::move(metadata)); } +int GetOrcMajorVersion() { + std::stringstream orc_version(ORC_VERSION); + std::string major_version; + std::getline(orc_version, major_version, '.'); + return std::stoi(major_version); +} + } // namespace orc } // namespace adapters } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/util.h b/cpp/src/arrow/adapters/orc/util.h index 00af9f4b76e..a18b11dda01 100644 --- a/cpp/src/arrow/adapters/orc/util.h +++ b/cpp/src/arrow/adapters/orc/util.h @@ -60,6 +60,9 @@ ARROW_EXPORT Status WriteBatch(const ChunkedArray& chunked_array, int64_t length int* arrow_chunk_offset, int64_t* arrow_index_offset, liborc::ColumnVectorBatch* column_vector_batch); +/// \brief Get the major version provided by the official ORC C++ library. +ARROW_EXPORT int GetOrcMajorVersion(); + } // namespace orc } // namespace adapters } // namespace arrow From 182e1d3931e56d961aadd060e0a843eba9a5e0ca Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 26 Mar 2024 22:28:53 +0800 Subject: [PATCH 6/6] Update cpp/src/arrow/adapters/orc/adapter.h --- cpp/src/arrow/adapters/orc/adapter.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 95f69f96163..4ffff81f355 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -19,7 +19,6 @@ #include #include -#include #include #include "arrow/adapters/orc/options.h"