From f31ebb9b7efd55f326d02589d4d1d4447b698e81 Mon Sep 17 00:00:00 2001 From: lan Date: Tue, 25 Feb 2025 00:07:00 +0800 Subject: [PATCH 1/7] add StripeStatistic api and test --- c++/include/orc/Reader.hh | 9 ++++++++- c++/src/Reader.cc | 14 ++++++++++++++ c++/src/Reader.hh | 2 ++ c++/test/TestStripeIndexStatistics.cc | 25 +++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index b015b64910..25488ee8da 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -495,10 +495,17 @@ namespace orc { */ virtual uint64_t getNumberOfStripeStatistics() const = 0; + /** + * Get the statistics about a stripe . + * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about + * @return the statistics about that stripe without reading row group index statistics + */ + virtual std::unique_ptr getStripeStatisticsOnly(uint64_t stripeIndex) const = 0; + /** * Get the statistics about a stripe. * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about - * @return the statistics about that stripe + * @return the statistics about that stripe and row group index statistics */ virtual std::unique_ptr getStripeStatistics(uint64_t stripeIndex) const = 0; diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index c93c62f6c4..958b9f2ee7 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -751,6 +751,20 @@ namespace orc { return *(contents_->schema.get()); } + std::unique_ptr + ReaderImpl::getStripeStatisticsOnly(uint64_t stripeIndex) const { + if (!isMetadataLoaded_) { + readMetadata(); + } + if (contents_->metadata == nullptr) { + throw std::logic_error("No stripe statistics in file"); + } + StatContext statContext(hasCorrectStatistics()); + return std::unique_ptr(new StatisticsImpl( + contents_->metadata->stripestats(static_cast(stripeIndex)), + statContext)); + } + std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { if (!isMetadataLoaded_) { readMetadata(); diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 39ca739675..5fdc686a20 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -346,6 +346,8 @@ namespace orc { std::unique_ptr getColumnStatistics(uint32_t columnId) const override; + std::unique_ptr getStripeStatisticsOnly(uint64_t stripeIndex) const override; + std::string getSerializedFileTail() const override; const Type& getType() const override; diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc index 85fdb80e4f..e8a6305adc 100644 --- a/c++/test/TestStripeIndexStatistics.cc +++ b/c++/test/TestStripeIndexStatistics.cc @@ -83,6 +83,31 @@ namespace orc { "length: " "8000\n", stringColStats->toString()); + + std::unique_ptr stripeLevelStats = reader->getStripeStatisticsOnly(0); + const orc::IntegerColumnStatistics* stripeLevelIntColStats; + stripeLevelIntColStats = reinterpret_cast( + stripeLevelStats->getColumnStatistics(1)); + EXPECT_EQ( + "Data type: Integer\nValues: 6000\nHas null: no\nMinimum: 1\nMaximum: 6000\nSum: " + "18003000\n", + stripeLevelIntColStats->toString()); + + const orc::StringColumnStatistics* stripeLevelStringColStats; + stripeLevelStringColStats = reinterpret_cast( + stripeLevelStats->getColumnStatistics(2)); + EXPECT_EQ( + "Data type: String\nValues: 6000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "23892\n", + stripeLevelStringColStats->toString()); + + intColStats = + reinterpret_cast(stripeStats->getColumnStatistics(1)); + stringColStats = + reinterpret_cast(stripeStats->getColumnStatistics(2)); + + EXPECT_EQ(intColStats->toString(), stripeLevelStringColStats->toString()); + EXPECT_EQ(stringColStats->toString(), stripeLevelStringColStats->toString()); } } // namespace orc From 5cac6713b3bf4b245ed330aa7921d5f4e6235fda Mon Sep 17 00:00:00 2001 From: lan Date: Wed, 26 Feb 2025 21:29:54 +0800 Subject: [PATCH 2/7] fix var name --- c++/src/Reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 958b9f2ee7..ea95de020c 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -761,7 +761,7 @@ namespace orc { } StatContext statContext(hasCorrectStatistics()); return std::unique_ptr(new StatisticsImpl( - contents_->metadata->stripestats(static_cast(stripeIndex)), + contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext)); } From 06c85a21b53d8ad207041d9c9679f5840f72da8d Mon Sep 17 00:00:00 2001 From: lan Date: Wed, 26 Feb 2025 21:51:19 +0800 Subject: [PATCH 3/7] fix ut --- c++/test/TestStripeIndexStatistics.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc index e8a6305adc..5370aa8fca 100644 --- a/c++/test/TestStripeIndexStatistics.cc +++ b/c++/test/TestStripeIndexStatistics.cc @@ -89,7 +89,7 @@ namespace orc { stripeLevelIntColStats = reinterpret_cast( stripeLevelStats->getColumnStatistics(1)); EXPECT_EQ( - "Data type: Integer\nValues: 6000\nHas null: no\nMinimum: 1\nMaximum: 6000\nSum: " + "Data type: Integer\nValues: 6000\nHas null: yes\nMinimum: 1\nMaximum: 6000\nSum: " "18003000\n", stripeLevelIntColStats->toString()); @@ -97,7 +97,7 @@ namespace orc { stripeLevelStringColStats = reinterpret_cast( stripeLevelStats->getColumnStatistics(2)); EXPECT_EQ( - "Data type: String\nValues: 6000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "Data type: String\nValues: 6000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: " "23892\n", stripeLevelStringColStats->toString()); @@ -106,7 +106,7 @@ namespace orc { stringColStats = reinterpret_cast(stripeStats->getColumnStatistics(2)); - EXPECT_EQ(intColStats->toString(), stripeLevelStringColStats->toString()); + EXPECT_EQ(intColStats->toString(), stripeLevelIntColStats->toString()); EXPECT_EQ(stringColStats->toString(), stripeLevelStringColStats->toString()); } From 4196f1cea65a3339da7cb0492a7fa1bd785fdb86 Mon Sep 17 00:00:00 2001 From: lan Date: Wed, 26 Feb 2025 22:05:26 +0800 Subject: [PATCH 4/7] fix return value --- c++/src/Reader.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index ea95de020c..a0073665f0 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -760,9 +760,8 @@ namespace orc { throw std::logic_error("No stripe statistics in file"); } StatContext statContext(hasCorrectStatistics()); - return std::unique_ptr(new StatisticsImpl( - contents_->metadata->stripe_stats(static_cast(stripeIndex)), - statContext)); + return std::make_unique( + contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext); } std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { From 4b14194423d97beb379826f2fe94c2b8d8816896 Mon Sep 17 00:00:00 2001 From: lan Date: Wed, 26 Feb 2025 22:21:57 +0800 Subject: [PATCH 5/7] fix type --- c++/src/Reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index a0073665f0..1b9f23ac14 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -760,7 +760,7 @@ namespace orc { throw std::logic_error("No stripe statistics in file"); } StatContext statContext(hasCorrectStatistics()); - return std::make_unique( + return std::make_unique( contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext); } From d4b8f2a157ac9455409df37ff27ded25e17d9577 Mon Sep 17 00:00:00 2001 From: lan Date: Wed, 26 Feb 2025 23:31:07 +0800 Subject: [PATCH 6/7] fix format --- c++/build-support/README.md | 4 ++-- c++/src/Reader.cc | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/c++/build-support/README.md b/c++/build-support/README.md index 0ffad788e8..80966104bb 100644 --- a/c++/build-support/README.md +++ b/c++/build-support/README.md @@ -11,7 +11,7 @@ To use `run_clang_format.py` you could act like below: ```shell mkdir build cd build -cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 +cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1 make check-format # Do checks only make format # This would apply suggested changes, take care! ``` @@ -23,7 +23,7 @@ To use `run_clang_tidy.py` you could act like below: ```shell mkdir build cd build -cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 +cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1 make -j`nproc` # Important make check-clang-tidy # Do checks only make fix-clang-tidy # This would apply suggested changes, take care! diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 1b9f23ac14..a3e0c44879 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -751,8 +751,7 @@ namespace orc { return *(contents_->schema.get()); } - std::unique_ptr - ReaderImpl::getStripeStatisticsOnly(uint64_t stripeIndex) const { + std::unique_ptr ReaderImpl::getStripeStatisticsOnly(uint64_t stripeIndex) const { if (!isMetadataLoaded_) { readMetadata(); } From 5777328d2cb6bb3ebd3767c58062ba48f8064d31 Mon Sep 17 00:00:00 2001 From: lan Date: Sat, 1 Mar 2025 01:52:03 +0800 Subject: [PATCH 7/7] add an extra parameter to the original interface --- c++/include/orc/Reader.hh | 11 +++------ c++/src/Reader.cc | 34 ++++++++++++--------------- c++/src/Reader.hh | 5 ++-- c++/src/Statistics.cc | 15 +++++++++--- c++/src/Statistics.hh | 29 ++++++++++++++++++++--- c++/test/TestStripeIndexStatistics.cc | 2 +- c++/test/TestTimestampStatistics.cc | 13 ++++++++++ 7 files changed, 72 insertions(+), 37 deletions(-) diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index 25488ee8da..e9f420f113 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -495,19 +495,14 @@ namespace orc { */ virtual uint64_t getNumberOfStripeStatistics() const = 0; - /** - * Get the statistics about a stripe . - * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about - * @return the statistics about that stripe without reading row group index statistics - */ - virtual std::unique_ptr getStripeStatisticsOnly(uint64_t stripeIndex) const = 0; - /** * Get the statistics about a stripe. * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about + * @param includeRowIndex whether the row index of the stripe is included * @return the statistics about that stripe and row group index statistics */ - virtual std::unique_ptr getStripeStatistics(uint64_t stripeIndex) const = 0; + virtual std::unique_ptr getStripeStatistics( + uint64_t stripeIndex, bool includeRowIndex = true) const = 0; /** * Get the length of the data stripes in the file. diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index a3e0c44879..17bf835203 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -751,39 +751,35 @@ namespace orc { return *(contents_->schema.get()); } - std::unique_ptr ReaderImpl::getStripeStatisticsOnly(uint64_t stripeIndex) const { + std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex, + bool includeRowIndex) const { if (!isMetadataLoaded_) { readMetadata(); } if (contents_->metadata == nullptr) { throw std::logic_error("No stripe statistics in file"); } - StatContext statContext(hasCorrectStatistics()); - return std::make_unique( - contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext); - } - - std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { - if (!isMetadataLoaded_) { - readMetadata(); - } - if (contents_->metadata == nullptr) { - throw std::logic_error("No stripe statistics in file"); - } - size_t num_cols = static_cast( - contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); - std::vector> indexStats(num_cols); proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast(stripeIndex)); proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get()); - getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); - const Timezone& writerTZ = currentStripeFooter.has_writer_timezone() ? getTimezoneByName(currentStripeFooter.writer_timezone()) : getLocalTimezone(); StatContext statContext(hasCorrectStatistics(), &writerTZ); - return std::make_unique( + + if (!includeRowIndex) { + return std::make_unique( + contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext); + } + + size_t num_cols = static_cast( + contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); + std::vector> indexStats(num_cols); + + getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); + + return std::make_unique( contents_->metadata->stripe_stats(static_cast(stripeIndex)), indexStats, statContext); } diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 5fdc686a20..3d81d26920 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -330,7 +330,8 @@ namespace orc { const std::string& getStreamName() const override; - std::unique_ptr getStripeStatistics(uint64_t stripeIndex) const override; + std::unique_ptr getStripeStatistics( + uint64_t stripeIndex, bool includeRowIndex = true) const override; std::unique_ptr createRowReader() const override; @@ -346,8 +347,6 @@ namespace orc { std::unique_ptr getColumnStatistics(uint32_t columnId) const override; - std::unique_ptr getStripeStatisticsOnly(uint64_t stripeIndex) const override; - std::string getSerializedFileTail() const override; const Type& getType() const override; diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index 76fd736b27..c1a23cad16 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -81,11 +81,20 @@ namespace orc { // PASS } - StripeStatisticsImpl::StripeStatisticsImpl( + StripeStatisticsImpl::StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext) { + columnStats_ = std::make_unique(stripeStats, statContext); + } + + StripeStatisticsWithRowGroupIndexImpl::~StripeStatisticsWithRowGroupIndexImpl() { + // PASS + } + + StripeStatisticsWithRowGroupIndexImpl::StripeStatisticsWithRowGroupIndexImpl( const proto::StripeStatistics& stripeStats, std::vector >& indexStats, - const StatContext& statContext) { - columnStats_ = std::make_unique(stripeStats, statContext); + const StatContext& statContext) + : StripeStatisticsImpl(stripeStats, statContext) { rowIndexStats_.resize(indexStats.size()); for (size_t i = 0; i < rowIndexStats_.size(); i++) { for (size_t j = 0; j < indexStats[i].size(); j++) { diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index 6f212c15cc..b7ed5d1e56 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -1713,7 +1713,6 @@ namespace orc { class StripeStatisticsImpl : public StripeStatistics { private: std::unique_ptr columnStats_; - std::vector > > rowIndexStats_; // DELIBERATELY NOT IMPLEMENTED StripeStatisticsImpl(const StripeStatisticsImpl&); @@ -1721,7 +1720,6 @@ namespace orc { public: StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, - std::vector >& indexStats, const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { @@ -1732,13 +1730,38 @@ namespace orc { return columnStats_->getNumberOfColumns(); } + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t, uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + + virtual ~StripeStatisticsImpl() override; + + virtual uint32_t getNumberOfRowIndexStats(uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + }; + + class StripeStatisticsWithRowGroupIndexImpl : public StripeStatisticsImpl { + private: + std::vector > > rowIndexStats_; + + // DELIBERATELY NOT IMPLEMENTED + StripeStatisticsWithRowGroupIndexImpl(const StripeStatisticsWithRowGroupIndexImpl&); + StripeStatisticsWithRowGroupIndexImpl& operator=(const StripeStatisticsWithRowGroupIndexImpl&); + + public: + StripeStatisticsWithRowGroupIndexImpl( + const proto::StripeStatistics& stripeStats, + std::vector >& indexStats, + const StatContext& statContext); + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, uint32_t rowIndex) const override { // check id indices are valid return rowIndexStats_[columnId][rowIndex].get(); } - virtual ~StripeStatisticsImpl() override; + virtual ~StripeStatisticsWithRowGroupIndexImpl() override; uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { return static_cast(rowIndexStats_[columnId].size()); diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc index 5370aa8fca..a529792c17 100644 --- a/c++/test/TestStripeIndexStatistics.cc +++ b/c++/test/TestStripeIndexStatistics.cc @@ -84,7 +84,7 @@ namespace orc { "8000\n", stringColStats->toString()); - std::unique_ptr stripeLevelStats = reader->getStripeStatisticsOnly(0); + std::unique_ptr stripeLevelStats = reader->getStripeStatistics(0, false); const orc::IntegerColumnStatistics* stripeLevelIntColStats; stripeLevelIntColStats = reinterpret_cast( stripeLevelStats->getColumnStatistics(1)); diff --git a/c++/test/TestTimestampStatistics.cc b/c++/test/TestTimestampStatistics.cc index d20a049557..e005fa6cf6 100644 --- a/c++/test/TestTimestampStatistics.cc +++ b/c++/test/TestTimestampStatistics.cc @@ -68,6 +68,19 @@ namespace orc { "00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 " "00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString()); + + std::unique_ptr stripeStatsWithOutRowIndex = + reader->getStripeStatistics(0, false); + const orc::TimestampColumnStatistics* stripeColStatsOnly = + reinterpret_cast( + stripeStatsWithOutRowIndex->getColumnStatistics(0)); + + EXPECT_TRUE(stripeColStatsOnly->hasMinimum()); + EXPECT_TRUE(stripeColStatsOnly->hasMaximum()); + EXPECT_EQ(stripeColStats->toString(), stripeColStatsOnly->toString()); + EXPECT_EQ(stripeStats->getNumberOfColumns(), stripeStatsWithOutRowIndex->getNumberOfColumns()); + EXPECT_THROW(stripeStatsWithOutRowIndex->getRowIndexStatistics(1, 1), NotImplementedYet); + EXPECT_THROW(stripeStatsWithOutRowIndex->getNumberOfRowIndexStats(1), NotImplementedYet); } TEST(TestTimestampStatistics, testTimezoneUTC) {