From ea06312b9356c049094aa16aa0e4cac368512158 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 7 Jan 2023 15:54:41 -0500 Subject: [PATCH 1/4] Update variance/stddev to work with single values Following Postgres: - var/stddev of single element is NULL - var_pop/stddev_pop of single element is 0 --- .../sqllogictests/test_files/aggregate.slt | 12 ++++++++++ .../physical-expr/src/aggregate/variance.rs | 22 +++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt index 7a1b012b84108..611fd75eff9ee 100644 --- a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt +++ b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt @@ -1091,3 +1091,15 @@ query U SELECT ARRAY_AGG([1]); ---- [[1]] + +# variance_single_value +query RRRR +select var(sq.column1), var_pop(sq.column1), stddev(sq.column1), stddev_pop(sq.column1) from (values (1.0)) as sq; +---- +NULL 0 NULL 0 + +# variance_two_values +query RRRR +select var(sq.column1), var_pop(sq.column1), stddev(sq.column1), stddev_pop(sq.column1) from (values (1.0), (3.0)) as sq; +---- +2 1 1.4142135623730951 1 diff --git a/datafusion/physical-expr/src/aggregate/variance.rs b/datafusion/physical-expr/src/aggregate/variance.rs index d1ccea7e1d7a5..2bc11dd5eb698 100644 --- a/datafusion/physical-expr/src/aggregate/variance.rs +++ b/datafusion/physical-expr/src/aggregate/variance.rs @@ -286,17 +286,17 @@ impl Accumulator for VarianceAccumulator { } }; - if count <= 1 { - return Err(DataFusionError::Internal( - "At least two values are needed to calculate variance".to_string(), - )); - } - - if self.count == 0 { - Ok(ScalarValue::Float64(None)) - } else { - Ok(ScalarValue::Float64(Some(self.m2 / count as f64))) - } + Ok(ScalarValue::Float64(match self.count { + 0 => None, + 1 => { + if matches!(self.stats_type, StatsType::Population) { + Some(0.0) + } else { + None + } + } + _ => Some(self.m2 / count as f64), + })) } fn size(&self) -> usize { From 319a11512a2a188c73994422c45439e7ad186d69 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 7 Jan 2023 18:15:12 -0500 Subject: [PATCH 2/4] Fix tests --- datafusion/physical-expr/src/aggregate/stddev.rs | 5 ++--- datafusion/physical-expr/src/aggregate/variance.rs | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/stddev.rs b/datafusion/physical-expr/src/aggregate/stddev.rs index 05dc56cff2c6f..cf4440995d3cf 100644 --- a/datafusion/physical-expr/src/aggregate/stddev.rs +++ b/datafusion/physical-expr/src/aggregate/stddev.rs @@ -341,9 +341,8 @@ mod tests { "bla".to_string(), DataType::Float64, )); - let actual = aggregate(&batch, agg); - assert!(actual.is_err()); - + let actual = aggregate(&batch, agg).unwrap(); + assert_eq!(actual, ScalarValue::Float64(None)); Ok(()) } diff --git a/datafusion/physical-expr/src/aggregate/variance.rs b/datafusion/physical-expr/src/aggregate/variance.rs index 2bc11dd5eb698..c938d737be8d1 100644 --- a/datafusion/physical-expr/src/aggregate/variance.rs +++ b/datafusion/physical-expr/src/aggregate/variance.rs @@ -382,8 +382,8 @@ mod tests { "bla".to_string(), DataType::Float64, )); - let actual = aggregate(&batch, agg); - assert!(actual.is_err()); + let actual = aggregate(&batch, agg).unwrap(); + assert_eq!(actual, ScalarValue::Float64(None)); Ok(()) } @@ -416,8 +416,8 @@ mod tests { "bla".to_string(), DataType::Float64, )); - let actual = aggregate(&batch, agg); - assert!(actual.is_err()); + let actual = aggregate(&batch, agg).unwrap(); + assert_eq!(actual, ScalarValue::Float64(None)); Ok(()) } From bc65eeb0300b6e330b0a9963d961ecea5bd760c2 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 7 Jan 2023 18:17:04 -0500 Subject: [PATCH 3/4] matches! to if let --- datafusion/physical-expr/src/aggregate/variance.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/aggregate/variance.rs b/datafusion/physical-expr/src/aggregate/variance.rs index c938d737be8d1..2895137447114 100644 --- a/datafusion/physical-expr/src/aggregate/variance.rs +++ b/datafusion/physical-expr/src/aggregate/variance.rs @@ -289,7 +289,7 @@ impl Accumulator for VarianceAccumulator { Ok(ScalarValue::Float64(match self.count { 0 => None, 1 => { - if matches!(self.stats_type, StatsType::Population) { + if let StatsType::Population = self.stats_type { Some(0.0) } else { None From 6f7d67e7dcd1e0f958149d6ab15e2c552297844c Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 7 Jan 2023 18:45:49 -0500 Subject: [PATCH 4/4] fix test_stddev_1_input test --- datafusion/physical-expr/src/aggregate/stddev.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/stddev.rs b/datafusion/physical-expr/src/aggregate/stddev.rs index cf4440995d3cf..4c9e46644a746 100644 --- a/datafusion/physical-expr/src/aggregate/stddev.rs +++ b/datafusion/physical-expr/src/aggregate/stddev.rs @@ -307,8 +307,8 @@ mod tests { "bla".to_string(), DataType::Float64, )); - let actual = aggregate(&batch, agg); - assert!(actual.is_err()); + let actual = aggregate(&batch, agg).unwrap(); + assert_eq!(actual, ScalarValue::Float64(None)); Ok(()) }