From 360ffade3b7b657c87aa5d1f0a143b5f176331cb Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Tue, 21 May 2024 10:57:58 +0800 Subject: [PATCH] [Fix](parquet-reader) Fix incorrect Timestamp Int96 min-max statistics written by some old parquet writers by disabling them. (#35041) Parquet INT96 timestamp values were compared incorrectly for the purposes of producing statistics by older parquet writers, so PARQUET-1065 deprecated them. The result is that any writer that produced stats was producing unusable incorrect values, except the special case where min == max and an incorrect ordering would not be material to the result. PARQUET-1026 made binary stats available and valid in that special case. --- be/src/vec/exec/format/parquet/parquet_pred_cmp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h index b993370a159895..e53b76afb8b19b 100644 --- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h +++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h @@ -256,6 +256,17 @@ class ParquetPredicate { ParquetInt96 datetime96_max = *reinterpret_cast(encoded_max.data()); int64_t micros_max = datetime96_max.to_timestamp_micros(); + + // From Trino: Parquet INT96 timestamp values were compared incorrectly + // for the purposes of producing statistics by older parquet writers, + // so PARQUET-1065 deprecated them. The result is that any writer that produced stats + // was producing unusable incorrect values, except the special case where min == max + // and an incorrect ordering would not be material to the result. + // PARQUET-1026 made binary stats available and valid in that special case. + if (micros_min != micros_max) { + return false; + } + if constexpr (std::is_same_v || std::is_same_v>) { min_value.from_unixtime(micros_min / 1000000, ctz);