From 721a8516505e2b3ef97c8fb68895043f701196d1 Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 23 Feb 2023 12:42:39 -0800 Subject: [PATCH 1/6] Optimize count_distinct.size --- .../src/aggregate/count_distinct.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/count_distinct.rs b/datafusion/physical-expr/src/aggregate/count_distinct.rs index 0f3c4c5b4d005..3ee81134079c8 100644 --- a/datafusion/physical-expr/src/aggregate/count_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/count_distinct.rs @@ -216,23 +216,19 @@ impl Accumulator for DistinctCountAccumulator { } fn size(&self) -> usize { + // temporarily calculating the size approximately, taking first batch size * number of batches + // such approach has some inaccuracy for variable length values, like strings. std::mem::size_of_val(self) + (std::mem::size_of::() * self.values.capacity()) + self .values .iter() + .next() .map(|vals| { - ScalarValue::size_of_vec(&vals.0) - std::mem::size_of_val(&vals.0) + (ScalarValue::size_of_vec(&vals.0) - std::mem::size_of_val(&vals.0)) + * self.values.capacity() }) - .sum::() - + (std::mem::size_of::() * self.state_data_types.capacity()) - + self - .state_data_types - .iter() - .map(|dt| dt.size() - std::mem::size_of_val(dt)) - .sum::() - + self.count_data_type.size() - - std::mem::size_of_val(&self.count_data_type) + .unwrap_or(0) } } From 7e937ea619fe27d9823ea8bbf92fb7d4d1e53a44 Mon Sep 17 00:00:00 2001 From: comphead Date: Sun, 26 Feb 2023 14:48:10 -0800 Subject: [PATCH 2/6] Optimize count_distinct.size for primitives only --- .../src/aggregate/count_distinct.rs | 70 +++++++++++++++---- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/count_distinct.rs b/datafusion/physical-expr/src/aggregate/count_distinct.rs index 3ee81134079c8..769e40decb9e6 100644 --- a/datafusion/physical-expr/src/aggregate/count_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/count_distinct.rs @@ -149,6 +149,42 @@ impl DistinctCountAccumulator { self.update(&row_values) }) } + + // calculating the size approximately, taking first batch size * number of batches + // such approach has some inaccuracy for variable length values, like strings. + fn approx_size(&self) -> usize { + std::mem::size_of_val(self) + + (std::mem::size_of::() * self.values.capacity()) + + self + .values + .iter() + .next() + .map(|vals| { + (ScalarValue::size_of_vec(&vals.0) - std::mem::size_of_val(&vals.0)) + * self.values.capacity() + }) + .unwrap_or(0) + } + + fn full_size(&self) -> usize { + std::mem::size_of_val(self) + + (std::mem::size_of::() * self.values.capacity()) + + self + .values + .iter() + .map(|vals| { + ScalarValue::size_of_vec(&vals.0) - std::mem::size_of_val(&vals.0) + }) + .sum::() + + (std::mem::size_of::() * self.state_data_types.capacity()) + + self + .state_data_types + .iter() + .map(|dt| dt.size() - std::mem::size_of_val(dt)) + .sum::() + + self.count_data_type.size() + - std::mem::size_of_val(&self.count_data_type) + } } impl Accumulator for DistinctCountAccumulator { @@ -216,19 +252,27 @@ impl Accumulator for DistinctCountAccumulator { } fn size(&self) -> usize { - // temporarily calculating the size approximately, taking first batch size * number of batches - // such approach has some inaccuracy for variable length values, like strings. - std::mem::size_of_val(self) - + (std::mem::size_of::() * self.values.capacity()) - + self - .values - .iter() - .next() - .map(|vals| { - (ScalarValue::size_of_vec(&vals.0) - std::mem::size_of_val(&vals.0)) - * self.values.capacity() - }) - .unwrap_or(0) + match &self.count_data_type { + DataType::Boolean + | DataType::Date32 + | DataType::Date64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Int8 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Null + | DataType::Timestamp(_, _) + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::UInt8 => self.approx_size(), + _ => self.full_size(), + } } } From 6da6e28bf8e40f6087fe6cc6eb799e9199d9f0e6 Mon Sep 17 00:00:00 2001 From: comphead Date: Sun, 26 Feb 2023 14:48:57 -0800 Subject: [PATCH 3/6] Optimize count_distinct.size for primitives only --- datafusion/physical-expr/src/aggregate/count_distinct.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/aggregate/count_distinct.rs b/datafusion/physical-expr/src/aggregate/count_distinct.rs index 769e40decb9e6..c20dfe7b009d3 100644 --- a/datafusion/physical-expr/src/aggregate/count_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/count_distinct.rs @@ -151,7 +151,7 @@ impl DistinctCountAccumulator { } // calculating the size approximately, taking first batch size * number of batches - // such approach has some inaccuracy for variable length values, like strings. + // approx_size has some inaccuracy for variable length values, like strings. fn approx_size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.values.capacity()) @@ -166,6 +166,7 @@ impl DistinctCountAccumulator { .unwrap_or(0) } + // calculates the size as accurate as possible, call to this method is expensive fn full_size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.values.capacity()) From 3e8203973ee61b1f77a6c2ce4eb69721dfe2b459 Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 27 Feb 2023 09:37:57 -0800 Subject: [PATCH 4/6] Optimize count_distinct.size for primitives only --- datafusion/physical-expr/src/aggregate/count_distinct.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/count_distinct.rs b/datafusion/physical-expr/src/aggregate/count_distinct.rs index c20dfe7b009d3..7739d738a791f 100644 --- a/datafusion/physical-expr/src/aggregate/count_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/count_distinct.rs @@ -150,9 +150,9 @@ impl DistinctCountAccumulator { }) } - // calculating the size approximately, taking first batch size * number of batches - // approx_size has some inaccuracy for variable length values, like strings. - fn approx_size(&self) -> usize { + // calculating the size for fixed length values, taking first batch size * number of batches + // This method is faster than .size, however it is not suitable for variable length values like strings or complex types + fn fixed_size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.values.capacity()) + self @@ -271,7 +271,7 @@ impl Accumulator for DistinctCountAccumulator { | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 - | DataType::UInt8 => self.approx_size(), + | DataType::UInt8 => self.fixed_size(), _ => self.full_size(), } } From 64dea1e70730dce0deebf9e19272f0325d84d03a Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 27 Feb 2023 09:42:43 -0800 Subject: [PATCH 5/6] Optimize count_distinct.size for primitives only --- datafusion/physical-expr/src/aggregate/count_distinct.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/aggregate/count_distinct.rs b/datafusion/physical-expr/src/aggregate/count_distinct.rs index 7739d738a791f..613816799f4aa 100644 --- a/datafusion/physical-expr/src/aggregate/count_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/count_distinct.rs @@ -151,7 +151,7 @@ impl DistinctCountAccumulator { } // calculating the size for fixed length values, taking first batch size * number of batches - // This method is faster than .size, however it is not suitable for variable length values like strings or complex types + // This method is faster than .full_size(), however it is not suitable for variable length values like strings or complex types fn fixed_size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.values.capacity()) From 669fb851cc04495b65354ee2e925e89981742e26 Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 27 Feb 2023 15:40:37 -0800 Subject: [PATCH 6/6] Optimize count_distinct.size. refactor --- .../src/aggregate/count_distinct.rs | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/count_distinct.rs b/datafusion/physical-expr/src/aggregate/count_distinct.rs index 613816799f4aa..cdfc4f46aa844 100644 --- a/datafusion/physical-expr/src/aggregate/count_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/count_distinct.rs @@ -253,26 +253,10 @@ impl Accumulator for DistinctCountAccumulator { } fn size(&self) -> usize { - match &self.count_data_type { - DataType::Boolean - | DataType::Date32 - | DataType::Date64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Int8 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Null - | DataType::Timestamp(_, _) - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::UInt8 => self.fixed_size(), - _ => self.full_size(), + if self.count_data_type.is_primitive() { + self.fixed_size() + } else { + self.full_size() } } }