-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-36026: [C++][ORC] Catch all ORC exceptions to avoid crash #40697
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0c640ac
589b252
a2f7937
a4d880e
6320e7a
182e1d3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,17 +18,14 @@ | |
| #include "arrow/adapters/orc/adapter.h" | ||
|
|
||
| #include <algorithm> | ||
| #include <cstdint> | ||
| #include <functional> | ||
| #include <filesystem> | ||
| #include <list> | ||
| #include <memory> | ||
| #include <sstream> | ||
| #include <string> | ||
| #include <utility> | ||
| #include <vector> | ||
|
|
||
| #include "arrow/adapters/orc/util.h" | ||
| #include "arrow/buffer.h" | ||
| #include "arrow/builder.h" | ||
| #include "arrow/io/interfaces.h" | ||
| #include "arrow/memory_pool.h" | ||
|
|
@@ -37,14 +34,11 @@ | |
| #include "arrow/table.h" | ||
| #include "arrow/table_builder.h" | ||
| #include "arrow/type.h" | ||
| #include "arrow/type_traits.h" | ||
| #include "arrow/util/bit_util.h" | ||
| #include "arrow/util/checked_cast.h" | ||
| #include "arrow/util/decimal.h" | ||
| #include "arrow/util/key_value_metadata.h" | ||
| #include "arrow/util/macros.h" | ||
| #include "arrow/util/range.h" | ||
| #include "arrow/util/visibility.h" | ||
| #include "orc/Exceptions.hh" | ||
|
|
||
| // alias to not interfere with nested orc namespace | ||
|
|
@@ -80,6 +74,12 @@ namespace liborc = orc; | |
| } \ | ||
| catch (const liborc::NotImplementedYet& e) { \ | ||
| return Status::NotImplemented(e.what()); \ | ||
| } \ | ||
| catch (const std::exception& e) { \ | ||
| return Status::UnknownError(e.what()); \ | ||
| } \ | ||
| catch (...) { \ | ||
| return Status::UnknownError("ORC error"); \ | ||
|
Comment on lines
+81
to
+82
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this? (Is
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Usually it is enough. But just in case? |
||
| } | ||
|
|
||
| #define ORC_CATCH_NOT_OK(_s) \ | ||
|
|
@@ -173,7 +173,7 @@ class OrcStripeReader : public RecordBatchReader { | |
| int64_t batch_size_; | ||
| }; | ||
|
|
||
| liborc::RowReaderOptions default_row_reader_options() { | ||
| liborc::RowReaderOptions DefaultRowReaderOptions() { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function was added by me in the past and now I have changed its name to follow the same style. |
||
| liborc::RowReaderOptions options; | ||
| // Orc timestamp type is error-prone since it serializes values in the writer timezone | ||
| // and reads them back in the reader timezone. To avoid this, both the Apache Orc C++ | ||
|
|
@@ -183,6 +183,24 @@ liborc::RowReaderOptions default_row_reader_options() { | |
| return options; | ||
| } | ||
|
|
||
| // Proactively check timezone database availability for ORC versions older than 2.0.0 | ||
| Status CheckTimeZoneDatabaseAvailability() { | ||
| if (GetOrcMajorVersion() >= 2) { | ||
| return Status::OK(); | ||
| } | ||
| auto tz_dir = std::getenv("TZDIR"); | ||
| bool is_tzdb_avaiable = tz_dir != nullptr | ||
| ? std::filesystem::exists(tz_dir) | ||
| : std::filesystem::exists("/usr/share/zoneinfo"); | ||
| if (!is_tzdb_avaiable) { | ||
| return Status::Invalid( | ||
| "IANA time zone database is unavailable but required by ORC." | ||
| " Please install it to /usr/share/zoneinfo or set TZDIR env to the installed" | ||
| " directory"); | ||
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| class ORCFileReader::Impl { | ||
|
|
@@ -332,47 +350,47 @@ class ORCFileReader::Impl { | |
| } | ||
|
|
||
| Result<std::shared_ptr<Table>> Read() { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema()); | ||
| return ReadTable(opts, schema); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| return ReadTable(opts, schema); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| RETURN_NOT_OK(SelectIndices(&opts, include_indices)); | ||
| ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); | ||
| return ReadTable(opts, schema); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| RETURN_NOT_OK(SelectNames(&opts, include_names)); | ||
| ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); | ||
| return ReadTable(opts, schema); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema, | ||
| const std::vector<int>& include_indices) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| RETURN_NOT_OK(SelectIndices(&opts, include_indices)); | ||
| return ReadTable(opts, schema); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| RETURN_NOT_OK(SelectStripe(&opts, stripe)); | ||
| ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); | ||
| return ReadBatch(opts, schema, stripes_[static_cast<size_t>(stripe)].num_rows); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<RecordBatch>> ReadStripe( | ||
| int64_t stripe, const std::vector<int>& include_indices) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| RETURN_NOT_OK(SelectIndices(&opts, include_indices)); | ||
| RETURN_NOT_OK(SelectStripe(&opts, stripe)); | ||
| ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); | ||
|
|
@@ -381,7 +399,7 @@ class ORCFileReader::Impl { | |
|
|
||
| Result<std::shared_ptr<RecordBatch>> ReadStripe( | ||
| int64_t stripe, const std::vector<std::string>& include_names) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| RETURN_NOT_OK(SelectNames(&opts, include_names)); | ||
| RETURN_NOT_OK(SelectStripe(&opts, stripe)); | ||
| ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); | ||
|
|
@@ -487,7 +505,7 @@ class ORCFileReader::Impl { | |
| return nullptr; | ||
| } | ||
|
|
||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| if (!include_indices.empty()) { | ||
| RETURN_NOT_OK(SelectIndices(&opts, include_indices)); | ||
| } | ||
|
|
@@ -508,7 +526,7 @@ class ORCFileReader::Impl { | |
|
|
||
| Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader( | ||
| int64_t batch_size, const std::vector<std::string>& include_names) { | ||
| liborc::RowReaderOptions opts = default_row_reader_options(); | ||
| liborc::RowReaderOptions opts = DefaultRowReaderOptions(); | ||
| if (!include_names.empty()) { | ||
| RETURN_NOT_OK(SelectNames(&opts, include_names)); | ||
| } | ||
|
|
@@ -541,6 +559,7 @@ ORCFileReader::~ORCFileReader() {} | |
|
|
||
| Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open( | ||
| const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) { | ||
| RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); | ||
| auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader()); | ||
| RETURN_NOT_OK(result->impl_->Open(file, pool)); | ||
| return std::move(result); | ||
|
|
@@ -779,7 +798,7 @@ class ORCFileWriter::Impl { | |
| &(arrow_index_offset[i]), (root->fields)[i])); | ||
| } | ||
| root->numElements = (root->fields)[0]->numElements; | ||
| writer_->add(*batch); | ||
| ORC_CATCH_NOT_OK(writer_->add(*batch)); | ||
| batch->clear(); | ||
| num_rows -= batch_size; | ||
| } | ||
|
|
@@ -807,6 +826,7 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } | |
|
|
||
| Result<std::unique_ptr<ORCFileWriter>> ORCFileWriter::Open( | ||
| io::OutputStream* output_stream, const WriteOptions& writer_options) { | ||
| RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); | ||
| std::unique_ptr<ORCFileWriter> result = | ||
| std::unique_ptr<ORCFileWriter>(new ORCFileWriter()); | ||
| Status status = result->impl_->Open(output_stream, writer_options); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.