From 06fef48b139ad3dd0d6267273e2737e01ba4818b Mon Sep 17 00:00:00 2001 From: stiga-huang Date: Sat, 15 Sep 2018 18:59:18 -0700 Subject: [PATCH 1/2] ORC-403: [C++] Add checks to avoid invalid offsets in InputStream --- c++/src/Reader.cc | 38 ++++++++++++++++++++++++++++++++++++-- c++/src/StripeStream.cc | 11 +++++++++-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 0a3fdcd46a..3078c9cb65 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -498,6 +498,12 @@ namespace orc { const proto::Stream& stream = currentStripeFooter.streams(i); uint64_t length = static_cast(stream.length()); if (static_cast(stream.kind()) == StreamKind::StreamKind_ROW_INDEX) { + if (offset + length > fileLength) { + std::stringstream msg; + msg << "Malformed RowIndex stream meta: streamOffset=" << offset + << ", streamLength=" << length << ", fileLength=" << fileLength; + throw ParseError(msg.str()); + } std::unique_ptr pbStream = createDecompressor(contents->compression, std::unique_ptr @@ -586,8 +592,15 @@ namespace orc { void ReaderImpl::readMetadata() const { uint64_t metadataSize = contents->postscript->metadatalength(); - uint64_t metadataStart = fileLength - metadataSize - - contents->postscript->footerlength() - postscriptLength - 1; + uint64_t footerLength = contents->postscript->footerlength(); + if (fileLength < metadataSize + footerLength + postscriptLength + 1) { + std::stringstream msg; + msg << "Invalid Metadata length: fileLength=" << fileLength + << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength + << ", postscriptLength=" << postscriptLength; + throw ParseError(msg.str()); + } + uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1; if (metadataSize != 0) { std::unique_ptr pbStream = createDecompressor(contents->compression, @@ -798,6 +811,16 @@ namespace orc { void RowReaderImpl::startNextStripe() { reader.reset(); // ColumnReaders use lots of memory; free old memory first currentStripeInfo = footer->stripes(static_cast(currentStripe)); + uint64_t fileLength = contents->stream->getLength(); + if (currentStripeInfo.offset() + currentStripeInfo.indexlength() + + currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) { + std::stringstream msg; + msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength=" + << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength=" + << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength() + << ", footerLength=" << currentStripeInfo.footerlength() << ")"; + throw ParseError(msg.str()); + } currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); rowsInCurrentStripe = currentStripeInfo.numberofrows(); const Timezone& writerTimezone = @@ -889,6 +912,12 @@ namespace orc { std::unique_ptr postscript = std::unique_ptr(new proto::PostScript()); + if (readSize < 1 + postscriptSize) { + std::stringstream msg; + msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = " + << stream->getLength(); + throw ParseError(msg.str()); + } if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize, static_cast(postscriptSize))) { throw ParseError("Failed to parse the postscript from " + @@ -997,6 +1026,11 @@ namespace orc { buffer.get(), postscriptLength)); uint64_t footerSize = contents->postscript->footerlength(); uint64_t tailSize = 1 + postscriptLength + footerSize; + if (tailSize >= fileLength) { + std::stringstream msg; + msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength; + throw ParseError(msg.str()); + } uint64_t footerOffset; if (tailSize > readSize) { diff --git a/c++/src/StripeStream.cc b/c++/src/StripeStream.cc index 75affa46b0..1726feacd0 100644 --- a/c++/src/StripeStream.cc +++ b/c++/src/StripeStream.cc @@ -83,8 +83,15 @@ namespace orc { if (stream.has_kind() && stream.kind() == kind && stream.column() == static_cast(columnId)) { - uint64_t myBlock = shouldStream ? input.getNaturalReadSize(): - stream.length(); + uint64_t streamLength = stream.length(); + uint64_t myBlock = shouldStream ? input.getNaturalReadSize(): streamLength; + if (offset + streamLength > input.getLength()) { + std::stringstream msg; + msg << "Malformed stream meta at stream index " << i + << ": streamOffset=" << offset << ", streamLength=" << streamLength + << ", fileLength=" << input.getLength(); + throw ParseError(msg.str()); + } return createDecompressor(reader.getCompression(), std::unique_ptr (new SeekableFileInputStream From ae43ae2f157f132ccf73f378c0c08582af86e185 Mon Sep 17 00:00:00 2001 From: stiga-huang Date: Tue, 18 Sep 2018 07:02:43 -0700 Subject: [PATCH 2/2] Stream ends should not exceed stripe ends --- c++/src/Reader.cc | 22 +++++++++++++--------- c++/src/Reader.hh | 2 +- c++/src/StripeStream.cc | 13 +++++++++---- c++/src/StripeStream.hh | 5 ++++- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 3078c9cb65..4b7977a9e3 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -489,19 +489,22 @@ namespace orc { throw std::range_error("key not found"); } - void ReaderImpl::getRowIndexStatistics( - uint64_t stripeOffset, const proto::StripeFooter& currentStripeFooter, - std::vector >* indexStats) const { + void ReaderImpl::getRowIndexStatistics(const proto::StripeInformation& stripeInfo, + uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter, + std::vector >* indexStats) const { int num_streams = currentStripeFooter.streams_size(); - uint64_t offset = stripeOffset; + uint64_t offset = stripeInfo.offset(); + uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength(); for (int i = 0; i < num_streams; i++) { const proto::Stream& stream = currentStripeFooter.streams(i); uint64_t length = static_cast(stream.length()); if (static_cast(stream.kind()) == StreamKind::StreamKind_ROW_INDEX) { - if (offset + length > fileLength) { + if (offset + length > indexEnd) { std::stringstream msg; - msg << "Malformed RowIndex stream meta: streamOffset=" << offset - << ", streamLength=" << length << ", fileLength=" << fileLength; + msg << "Malformed RowIndex stream meta in stripe " << stripeIndex + << ": streamOffset=" << offset << ", streamLength=" << length + << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" + << stripeInfo.indexlength(); throw ParseError(msg.str()); } std::unique_ptr pbStream = @@ -560,7 +563,7 @@ namespace orc { proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); - getRowIndexStatistics(currentStripeInfo.offset(), currentStripeFooter, &indexStats); + getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); const Timezone& writerTZ = currentStripeFooter.has_writertimezone() ? @@ -827,7 +830,8 @@ namespace orc { currentStripeFooter.has_writertimezone() ? getTimezoneByName(currentStripeFooter.writertimezone()) : localTimezone; - StripeStreamsImpl stripeStreams(*this, currentStripeFooter, + StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, + currentStripeFooter, currentStripeInfo.offset(), *(contents->stream.get()), writerTimezone); diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index b952fb9b4c..03ef9dd94c 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -190,7 +190,7 @@ namespace orc { // internal methods void readMetadata() const; void checkOrcVersion(); - void getRowIndexStatistics(uint64_t stripeOffset, + void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter, std::vector >* indexStats) const; diff --git a/c++/src/StripeStream.cc b/c++/src/StripeStream.cc index 1726feacd0..b63f19d28e 100644 --- a/c++/src/StripeStream.cc +++ b/c++/src/StripeStream.cc @@ -25,13 +25,16 @@ namespace orc { - StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, + StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index, + const proto::StripeInformation& _stripeInfo, const proto::StripeFooter& _footer, uint64_t _stripeStart, InputStream& _input, const Timezone& _writerTimezone ): reader(_reader), + stripeInfo(_stripeInfo), footer(_footer), + stripeIndex(_index), stripeStart(_stripeStart), input(_input), writerTimezone(_writerTimezone) { @@ -77,6 +80,7 @@ namespace orc { proto::Stream_Kind kind, bool shouldStream) const { uint64_t offset = stripeStart; + uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength(); MemoryPool *pool = reader.getFileContents().pool; for(int i = 0; i < footer.streams_size(); ++i) { const proto::Stream& stream = footer.streams(i); @@ -85,11 +89,12 @@ namespace orc { stream.column() == static_cast(columnId)) { uint64_t streamLength = stream.length(); uint64_t myBlock = shouldStream ? input.getNaturalReadSize(): streamLength; - if (offset + streamLength > input.getLength()) { + if (offset + streamLength > dataEnd) { std::stringstream msg; - msg << "Malformed stream meta at stream index " << i + msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex << ": streamOffset=" << offset << ", streamLength=" << streamLength - << ", fileLength=" << input.getLength(); + << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" + << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength(); throw ParseError(msg.str()); } return createDecompressor(reader.getCompression(), diff --git a/c++/src/StripeStream.hh b/c++/src/StripeStream.hh index 777056426f..5cbaf60a69 100644 --- a/c++/src/StripeStream.hh +++ b/c++/src/StripeStream.hh @@ -37,13 +37,16 @@ namespace orc { class StripeStreamsImpl: public StripeStreams { private: const RowReaderImpl& reader; + const proto::StripeInformation& stripeInfo; const proto::StripeFooter& footer; + const uint64_t stripeIndex; const uint64_t stripeStart; InputStream& input; const Timezone& writerTimezone; public: - StripeStreamsImpl(const RowReaderImpl& reader, + StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, + const proto::StripeInformation& stripeInfo, const proto::StripeFooter& footer, uint64_t stripeStart, InputStream& input,