From af7f8eef2c3add0323f0c0a4a530302301b6c641 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Aug 2024 18:50:35 -0600 Subject: [PATCH 1/9] Workaround for COUNT performance --- .../core/src/execution/datafusion/planner.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index b4d723eb1f..3da0c3f262 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -19,7 +19,6 @@ use arrow_schema::{DataType, Field, Schema, TimeUnit, DECIMAL128_MAX_PRECISION}; use datafusion::functions_aggregate::bit_and_or_xor::{bit_and_udaf, bit_or_udaf, bit_xor_udaf}; -use datafusion::functions_aggregate::count::count_udaf; use datafusion::functions_aggregate::sum::sum_udaf; use datafusion::physical_plan::windows::BoundedWindowAggExec; use datafusion::physical_plan::InputOrderMode; @@ -1234,15 +1233,18 @@ impl PhysicalPlanner { ) -> Result, ExecutionError> { match spark_expr.expr_struct.as_ref().unwrap() { AggExprStruct::Count(expr) => { - let children = expr - .children - .iter() - .map(|child| self.create_expr(child, schema.clone())) - .collect::, _>>()?; + assert_eq!(1, expr.children.len()); + + let the_expr = &expr.children[0]; + let child = Arc::new(IfExpr::new( + Arc::new(IsNullExpr::new(self.create_expr(the_expr, schema.clone())?)), + Arc::new(Literal::new(ScalarValue::Int64(Some(0)))), + Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), + )); create_aggregate_expr( - &count_udaf(), - &children, + &sum_udaf(), + &[child], &[], &[], &[], From 8cfb826642aa16215b905349e11204e877c0405f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Aug 2024 19:07:51 -0600 Subject: [PATCH 2/9] add comments --- native/Cargo.lock | 28 ++-- native/Cargo.toml | 14 +- .../core/src/execution/datafusion/planner.rs | 5 + .../CometAggregateBenchmark-jdk11-results.txt | 147 ++++++++++++++++-- 4 files changed, 162 insertions(+), 32 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 3f6b1d1c71..7ff83977a4 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -804,7 +804,7 @@ dependencies = [ [[package]] name = "datafusion" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "ahash", "arrow", @@ -851,7 +851,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "arrow-schema", "async-trait", @@ -948,7 +948,7 @@ dependencies = [ [[package]] name = "datafusion-common" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "ahash", "arrow", @@ -968,7 +968,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "tokio", ] @@ -976,7 +976,7 @@ dependencies = [ [[package]] name = "datafusion-execution" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "arrow", "chrono", @@ -996,7 +996,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "ahash", "arrow", @@ -1014,7 +1014,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "arrow", "arrow-buffer", @@ -1040,7 +1040,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "ahash", "arrow", @@ -1057,7 +1057,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "arrow", "async-trait", @@ -1076,7 +1076,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "ahash", "arrow", @@ -1105,7 +1105,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "ahash", "arrow", @@ -1118,7 +1118,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "datafusion-common", "datafusion-execution", @@ -1129,7 +1129,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "ahash", "arrow", @@ -1162,7 +1162,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" +source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" dependencies = [ "arrow", "arrow-array", diff --git a/native/Cargo.toml b/native/Cargo.toml index c6cf571a76..b971a5f9e7 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -39,13 +39,13 @@ arrow-buffer = { version = "52.2.0" } arrow-data = { version = "52.2.0" } arrow-schema = { version = "52.2.0" } parquet = { version = "52.2.0", default-features = false, features = ["experimental"] } -datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e" } -datafusion = { default-features = false, git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", features = ["unicode_expressions", "crypto_expressions"] } -datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", features = ["crypto_expressions"] } -datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } -datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } -datafusion-physical-expr-common = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } -datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } +datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17" } +datafusion = { default-features = false, git = "https://github.com/apache/datafusion.git", rev = "9e90e17", features = ["unicode_expressions", "crypto_expressions"] } +datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", features = ["crypto_expressions"] } +datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } +datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } +datafusion-physical-expr-common = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } +datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } datafusion-comet-spark-expr = { path = "spark-expr", version = "0.2.0" } datafusion-comet-proto = { path = "proto", version = "0.2.0" } chrono = { version = "0.4", default-features = false, features = ["clock"] } diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index 3da0c3f262..3c910c04af 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -1235,7 +1235,12 @@ impl PhysicalPlanner { AggExprStruct::Count(expr) => { assert_eq!(1, expr.children.len()); + // Using `count_udaf` from Comet is exceptionally slow for some reason, so + // as a workaround we translate it to `SUM(IF(expr IS NULL, 0, 1))` + // https://github.com/apache/datafusion-comet/issues/744 let the_expr = &expr.children[0]; + + //TODO this only handles COUNT(col) and not COUNT(1) so far let child = Arc::new(IfExpr::new( Arc::new(IsNullExpr::new(self.create_expr(the_expr, schema.clone())?)), Arc::new(Literal::new(ScalarValue::Int64(Some(0)))), diff --git a/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt b/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt index 9e3e15bc67..c3deb72b47 100644 --- a/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt +++ b/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt @@ -2,23 +2,148 @@ Grouped Aggregate (single group key + single aggregate SUM) ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.24+8-post-Ubuntu-1ubuntu322.04 on Linux 6.5.0-41-generic -AMD Ryzen 9 7950X3D 16-Core Processor +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 177 199 12 59.2 16.9 1.0X +SQL Parquet - Comet (Scan) (SUM) 179 201 17 58.7 17.0 1.0X +SQL Parquet - Comet (Scan, Exec) (SUM) 136 161 15 76.8 13.0 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 176 197 12 59.6 16.8 1.0X +SQL Parquet - Comet (Scan) (SUM) 190 202 14 55.1 18.2 0.9X +SQL Parquet - Comet (Scan, Exec) (SUM) 142 173 20 73.8 13.5 1.2X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------ -SQL Parquet - Spark (SUM) 2663 2744 115 3.9 254.0 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 1067 1084 24 9.8 101.8 2.5X +SQL Parquet - Spark (SUM) 1659 1678 26 6.3 158.2 1.0X +SQL Parquet - Comet (Scan) (SUM) 1842 1874 45 5.7 175.7 0.9X +SQL Parquet - Comet (Scan, Exec) (SUM) 702 719 20 14.9 67.0 2.4X + + +================================================================================================ +Grouped Aggregate (multiple group keys + single aggregate SUM) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 100), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 522 555 38 20.1 49.8 1.0X +SQL Parquet - Comet (Scan) (SUM) 535 576 32 19.6 51.0 1.0X +SQL Parquet - Comet (Scan, Exec) (SUM) 302 336 35 34.8 28.8 1.7X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1024), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (SUM) 2390 2417 37 4.4 228.0 1.0X +SQL Parquet - Comet (Scan) (SUM) 2234 2263 41 4.7 213.1 1.1X +SQL Parquet - Comet (Scan, Exec) (SUM) 1303 1378 106 8.0 124.3 1.8X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1048576), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 5711 5727 23 1.8 544.6 1.0X +SQL Parquet - Comet (Scan) (SUM) 5787 5804 23 1.8 551.9 1.0X +SQL Parquet - Comet (Scan, Exec) (SUM) 4816 4878 88 2.2 459.3 1.2X ================================================================================================ -Grouped Aggregate (single group key + single aggregate COUNT) +Grouped Aggregate (single group key + multiple aggregates SUM) ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.24+8-post-Ubuntu-1ubuntu322.04 on Linux 6.5.0-41-generic -AMD Ryzen 9 7950X3D 16-Core Processor -Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (COUNT) 2532 2552 28 4.1 241.5 1.0X -SQL Parquet - Comet (Scan, Exec) (COUNT) 4590 4592 4 2.3 437.7 0.6X +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), multiple aggregates SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 237 259 31 44.3 22.6 1.0X +SQL Parquet - Comet (Scan) (SUM) 244 266 26 42.9 23.3 1.0X +SQL Parquet - Comet (Scan, Exec) (SUM) 189 218 37 55.4 18.1 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), multiple aggregates SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (SUM) 244 309 42 42.9 23.3 1.0X +SQL Parquet - Comet (Scan) (SUM) 249 277 26 42.1 23.8 1.0X +SQL Parquet - Comet (Scan, Exec) (SUM) 197 231 44 53.2 18.8 1.2X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), multiple aggregates SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 2592 2809 306 4.0 247.2 1.0X +SQL Parquet - Comet (Scan) (SUM) 2285 2295 14 4.6 217.9 1.1X +SQL Parquet - Comet (Scan, Exec) (SUM) 1264 1328 91 8.3 120.5 2.1X + +================================================================================================ +Grouped Aggregate (single group key + single aggregate SUM on decimal) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate SUM on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 1094 1129 49 9.6 104.3 1.0X +SQL Parquet - Comet (Scan) (SUM) 1101 1149 69 9.5 105.0 1.0X +SQL Parquet - Comet (Scan, Exec) (SUM) 248 274 27 42.2 23.7 4.4X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate SUM on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 1070 1111 57 9.8 102.1 1.0X +SQL Parquet - Comet (Scan) (SUM) 1184 1202 25 8.9 112.9 0.9X +SQL Parquet - Comet (Scan, Exec) (SUM) 258 300 42 40.6 24.6 4.1X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate SUM on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (SUM) 3787 3841 77 2.8 361.1 1.0X +SQL Parquet - Comet (Scan) (SUM) 3781 3834 75 2.8 360.6 1.0X +SQL Parquet - Comet (Scan, Exec) (SUM) 1068 1074 8 9.8 101.9 3.5X + + +================================================================================================ +Grouped Aggregate (single group key + single aggregate MIN) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 169 216 39 61.9 16.2 1.0X +SQL Parquet - Comet (Scan) (MIN) 178 213 35 59.0 16.9 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 138 170 31 76.1 13.1 1.2X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 184 219 37 57.1 17.5 1.0X +SQL Parquet - Comet (Scan) (MIN) 180 210 45 58.3 17.1 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 138 189 44 76.0 13.2 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (MIN) 1769 1800 43 5.9 168.7 1.0X +SQL Parquet - Comet (Scan) (MIN) 1825 1843 25 5.7 174.0 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 705 749 39 14.9 67.2 2.5X + + +================================================================================================ +Grouped Aggregate (multiple group keys + single aggregate MIN) +================================================================================================ From bdc1331204dfdae7b9dc53c43331e03ace80833a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Aug 2024 19:23:49 -0600 Subject: [PATCH 3/9] remove benchmark results --- .../CometAggregateBenchmark-jdk11-results.txt | 149 ------------------ 1 file changed, 149 deletions(-) delete mode 100644 spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt diff --git a/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt b/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt deleted file mode 100644 index c3deb72b47..0000000000 --- a/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt +++ /dev/null @@ -1,149 +0,0 @@ -================================================================================================ -Grouped Aggregate (single group key + single aggregate SUM) -================================================================================================ - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 100), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 177 199 12 59.2 16.9 1.0X -SQL Parquet - Comet (Scan) (SUM) 179 201 17 58.7 17.0 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 136 161 15 76.8 13.0 1.3X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 176 197 12 59.6 16.8 1.0X -SQL Parquet - Comet (Scan) (SUM) 190 202 14 55.1 18.2 0.9X -SQL Parquet - Comet (Scan, Exec) (SUM) 142 173 20 73.8 13.5 1.2X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 1659 1678 26 6.3 158.2 1.0X -SQL Parquet - Comet (Scan) (SUM) 1842 1874 45 5.7 175.7 0.9X -SQL Parquet - Comet (Scan, Exec) (SUM) 702 719 20 14.9 67.0 2.4X - - -================================================================================================ -Grouped Aggregate (multiple group keys + single aggregate SUM) -================================================================================================ - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: multiple group keys (cardinality 100), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------ -SQL Parquet - Spark (SUM) 522 555 38 20.1 49.8 1.0X -SQL Parquet - Comet (Scan) (SUM) 535 576 32 19.6 51.0 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 302 336 35 34.8 28.8 1.7X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: multiple group keys (cardinality 1024), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 2390 2417 37 4.4 228.0 1.0X -SQL Parquet - Comet (Scan) (SUM) 2234 2263 41 4.7 213.1 1.1X -SQL Parquet - Comet (Scan, Exec) (SUM) 1303 1378 106 8.0 124.3 1.8X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: multiple group keys (cardinality 1048576), single aggregate SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 5711 5727 23 1.8 544.6 1.0X -SQL Parquet - Comet (Scan) (SUM) 5787 5804 23 1.8 551.9 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 4816 4878 88 2.2 459.3 1.2X - - -================================================================================================ -Grouped Aggregate (single group key + multiple aggregates SUM) -================================================================================================ - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 100), multiple aggregates SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------ -SQL Parquet - Spark (SUM) 237 259 31 44.3 22.6 1.0X -SQL Parquet - Comet (Scan) (SUM) 244 266 26 42.9 23.3 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 189 218 37 55.4 18.1 1.3X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1024), multiple aggregates SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 244 309 42 42.9 23.3 1.0X -SQL Parquet - Comet (Scan) (SUM) 249 277 26 42.1 23.8 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 197 231 44 53.2 18.8 1.2X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1048576), multiple aggregates SUM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 2592 2809 306 4.0 247.2 1.0X -SQL Parquet - Comet (Scan) (SUM) 2285 2295 14 4.6 217.9 1.1X -SQL Parquet - Comet (Scan, Exec) (SUM) 1264 1328 91 8.3 120.5 2.1X - - -================================================================================================ -Grouped Aggregate (single group key + single aggregate SUM on decimal) -================================================================================================ - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 100), single aggregate SUM on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 1094 1129 49 9.6 104.3 1.0X -SQL Parquet - Comet (Scan) (SUM) 1101 1149 69 9.5 105.0 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 248 274 27 42.2 23.7 4.4X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate SUM on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (SUM) 1070 1111 57 9.8 102.1 1.0X -SQL Parquet - Comet (Scan) (SUM) 1184 1202 25 8.9 112.9 0.9X -SQL Parquet - Comet (Scan, Exec) (SUM) 258 300 42 40.6 24.6 4.1X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate SUM on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ -SQL Parquet - Spark (SUM) 3787 3841 77 2.8 361.1 1.0X -SQL Parquet - Comet (Scan) (SUM) 3781 3834 75 2.8 360.6 1.0X -SQL Parquet - Comet (Scan, Exec) (SUM) 1068 1074 8 9.8 101.9 3.5X - - -================================================================================================ -Grouped Aggregate (single group key + single aggregate MIN) -================================================================================================ - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 100), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (MIN) 169 216 39 61.9 16.2 1.0X -SQL Parquet - Comet (Scan) (MIN) 178 213 35 59.0 16.9 1.0X -SQL Parquet - Comet (Scan, Exec) (MIN) 138 170 31 76.1 13.1 1.2X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (MIN) 184 219 37 57.1 17.5 1.0X -SQL Parquet - Comet (Scan) (MIN) 180 210 45 58.3 17.1 1.0X -SQL Parquet - Comet (Scan, Exec) (MIN) 138 189 44 76.0 13.2 1.3X - -OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 -Apple M3 Max -Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -SQL Parquet - Spark (MIN) 1769 1800 43 5.9 168.7 1.0X -SQL Parquet - Comet (Scan) (MIN) 1825 1843 25 5.7 174.0 1.0X -SQL Parquet - Comet (Scan, Exec) (MIN) 705 749 39 14.9 67.2 2.5X - - -================================================================================================ -Grouped Aggregate (multiple group keys + single aggregate MIN) -================================================================================================ - From c5842989ed3e6193ef6af582ba9975e2c871a628 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Aug 2024 19:39:51 -0600 Subject: [PATCH 4/9] fix regression --- .../core/src/execution/datafusion/planner.rs | 64 ++-- .../CometAggregateBenchmark-jdk11-results.txt | 315 ++++++++++++++++++ 2 files changed, 354 insertions(+), 25 deletions(-) diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index 3c910c04af..6252d2580a 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -1233,32 +1233,46 @@ impl PhysicalPlanner { ) -> Result, ExecutionError> { match spark_expr.expr_struct.as_ref().unwrap() { AggExprStruct::Count(expr) => { - assert_eq!(1, expr.children.len()); - - // Using `count_udaf` from Comet is exceptionally slow for some reason, so - // as a workaround we translate it to `SUM(IF(expr IS NULL, 0, 1))` - // https://github.com/apache/datafusion-comet/issues/744 - let the_expr = &expr.children[0]; - - //TODO this only handles COUNT(col) and not COUNT(1) so far - let child = Arc::new(IfExpr::new( - Arc::new(IsNullExpr::new(self.create_expr(the_expr, schema.clone())?)), - Arc::new(Literal::new(ScalarValue::Int64(Some(0)))), - Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), - )); + if expr.children.iter().len() == 1 { + // Using `count_udaf` from Comet is exceptionally slow for some reason, so + // as a workaround we translate it to `SUM(IF(expr IS NULL, 0, 1))` + // https://github.com/apache/datafusion-comet/issues/744 + let the_expr = &expr.children[0]; + + // TODO this could be optimized more for the `COUNT(1)` case + let child = Arc::new(IfExpr::new( + Arc::new(IsNullExpr::new(self.create_expr(the_expr, schema.clone())?)), + Arc::new(Literal::new(ScalarValue::Int64(Some(0)))), + Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), + )); - create_aggregate_expr( - &sum_udaf(), - &[child], - &[], - &[], - &[], - schema.as_ref(), - "count", - false, - false, - ) - .map_err(|e| ExecutionError::DataFusionError(e.to_string())) + create_aggregate_expr( + &sum_udaf(), + &[child], + &[], + &[], + &[], + schema.as_ref(), + "count", + false, + false, + ) + .map_err(|e| ExecutionError::DataFusionError(e.to_string())) + } else { + // use count_udaf, which has poor performance + create_aggregate_expr( + &sum_udaf(), + &expr.children, + &[], + &[], + &[], + schema.as_ref(), + "count", + false, + false, + ) + .map_err(|e| ExecutionError::DataFusionError(e.to_string())) + } } AggExprStruct::Min(expr) => { let child = self.create_expr(expr.child.as_ref().unwrap(), schema)?; diff --git a/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt b/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt index c3deb72b47..f05c89c006 100644 --- a/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt +++ b/spark/benchmarks/CometAggregateBenchmark-jdk11-results.txt @@ -147,3 +147,318 @@ SQL Parquet - Comet (Scan, Exec) (MIN) Grouped Aggregate (multiple group keys + single aggregate MIN) ================================================================================================ +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 100), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 480 543 45 21.9 45.7 1.0X +SQL Parquet - Comet (Scan) (MIN) 494 528 23 21.2 47.1 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 309 368 49 34.0 29.5 1.6X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1024), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (MIN) 2351 2440 126 4.5 224.2 1.0X +SQL Parquet - Comet (Scan) (MIN) 2386 2425 56 4.4 227.5 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 1390 1400 15 7.5 132.5 1.7X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1048576), single aggregate MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 5771 5842 101 1.8 550.3 1.0X +SQL Parquet - Comet (Scan) (MIN) 5961 5976 22 1.8 568.5 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 4893 4903 14 2.1 466.6 1.2X + + +================================================================================================ +Grouped Aggregate (single group key + multiple aggregates MIN) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), multiple aggregates MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 235 275 38 44.6 22.4 1.0X +SQL Parquet - Comet (Scan) (MIN) 246 309 37 42.6 23.5 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 187 210 38 55.9 17.9 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), multiple aggregates MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (MIN) 255 273 15 41.0 24.4 1.0X +SQL Parquet - Comet (Scan) (MIN) 255 296 38 41.2 24.3 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 197 240 40 53.2 18.8 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), multiple aggregates MIN: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 2584 2588 5 4.1 246.4 1.0X +SQL Parquet - Comet (Scan) (MIN) 2484 2520 51 4.2 236.9 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 1199 1206 9 8.7 114.4 2.2X + + +================================================================================================ +Grouped Aggregate (single group key + single aggregate MIN on decimal) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate MIN on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 274 355 75 38.2 26.1 1.0X +SQL Parquet - Comet (Scan) (MIN) 339 398 54 30.9 32.3 0.8X +SQL Parquet - Comet (Scan, Exec) (MIN) 210 231 31 49.8 20.1 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate MIN on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 320 365 56 32.8 30.5 1.0X +SQL Parquet - Comet (Scan) (MIN) 314 370 55 33.4 30.0 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 214 245 39 48.9 20.4 1.5X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate MIN on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MIN) 2295 2309 21 4.6 218.8 1.0X +SQL Parquet - Comet (Scan) (MIN) 2217 2235 26 4.7 211.4 1.0X +SQL Parquet - Comet (Scan, Exec) (MIN) 708 763 58 14.8 67.5 3.2X + + +================================================================================================ +Grouped Aggregate (single group key + single aggregate MAX) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 172 206 39 61.1 16.4 1.0X +SQL Parquet - Comet (Scan) (MAX) 172 224 48 61.0 16.4 1.0X +SQL Parquet - Comet (Scan, Exec) (MAX) 132 150 23 79.5 12.6 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 174 202 37 60.3 16.6 1.0X +SQL Parquet - Comet (Scan) (MAX) 190 266 34 55.3 18.1 0.9X +SQL Parquet - Comet (Scan, Exec) (MAX) 136 171 40 77.1 13.0 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (MAX) 1800 1807 9 5.8 171.7 1.0X +SQL Parquet - Comet (Scan) (MAX) 1707 1734 38 6.1 162.8 1.1X +SQL Parquet - Comet (Scan, Exec) (MAX) 679 725 48 15.4 64.8 2.6X + + +================================================================================================ +Grouped Aggregate (multiple group keys + single aggregate MAX) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 100), single aggregate MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 550 571 36 19.1 52.4 1.0X +SQL Parquet - Comet (Scan) (MAX) 477 482 4 22.0 45.5 1.2X +SQL Parquet - Comet (Scan, Exec) (MAX) 308 364 56 34.0 29.4 1.8X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1024), single aggregate MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (MAX) 2619 2657 54 4.0 249.8 1.0X +SQL Parquet - Comet (Scan) (MAX) 2359 2368 12 4.4 225.0 1.1X +SQL Parquet - Comet (Scan, Exec) (MAX) 1341 1403 87 7.8 127.9 2.0X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1048576), single aggregate MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 5880 5969 126 1.8 560.8 1.0X +SQL Parquet - Comet (Scan) (MAX) 5798 5804 8 1.8 553.0 1.0X +SQL Parquet - Comet (Scan, Exec) (MAX) 4772 4793 29 2.2 455.1 1.2X + + +================================================================================================ +Grouped Aggregate (single group key + multiple aggregates MAX) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), multiple aggregates MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 285 323 49 36.8 27.2 1.0X +SQL Parquet - Comet (Scan) (MAX) 299 372 41 35.0 28.6 1.0X +SQL Parquet - Comet (Scan, Exec) (MAX) 175 200 25 60.0 16.7 1.6X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), multiple aggregates MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------ +SQL Parquet - Spark (MAX) 292 346 48 35.9 27.9 1.0X +SQL Parquet - Comet (Scan) (MAX) 301 340 49 34.9 28.7 1.0X +SQL Parquet - Comet (Scan, Exec) (MAX) 182 198 9 57.6 17.4 1.6X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), multiple aggregates MAX: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 2640 2659 28 4.0 251.7 1.0X +SQL Parquet - Comet (Scan) (MAX) 2743 2745 3 3.8 261.6 1.0X +SQL Parquet - Comet (Scan, Exec) (MAX) 1166 1215 70 9.0 111.2 2.3X + + +================================================================================================ +Grouped Aggregate (single group key + single aggregate MAX on decimal) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate MAX on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 274 343 62 38.3 26.1 1.0X +SQL Parquet - Comet (Scan) (MAX) 302 360 54 34.8 28.8 0.9X +SQL Parquet - Comet (Scan, Exec) (MAX) 208 215 8 50.4 19.8 1.3X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate MAX on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 293 325 58 35.8 27.9 1.0X +SQL Parquet - Comet (Scan) (MAX) 381 446 53 27.5 36.3 0.8X +SQL Parquet - Comet (Scan, Exec) (MAX) 214 269 54 49.0 20.4 1.4X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate MAX on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (MAX) 2181 2205 33 4.8 208.0 1.0X +SQL Parquet - Comet (Scan) (MAX) 2112 2172 84 5.0 201.4 1.0X +SQL Parquet - Comet (Scan, Exec) (MAX) 705 746 65 14.9 67.2 3.1X + + +================================================================================================ +Grouped Aggregate (single group key + single aggregate COUNT) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +---------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 150 188 47 70.1 14.3 1.0X +SQL Parquet - Comet (Scan) (COUNT) 173 234 47 60.6 16.5 0.9X +SQL Parquet - Comet (Scan, Exec) (COUNT) 193 219 35 54.2 18.4 0.8X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 161 184 22 65.1 15.4 1.0X +SQL Parquet - Comet (Scan) (COUNT) 167 201 41 62.8 15.9 1.0X +SQL Parquet - Comet (Scan, Exec) (COUNT) 195 208 10 53.8 18.6 0.8X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 1716 1728 17 6.1 163.7 1.0X +SQL Parquet - Comet (Scan) (COUNT) 1677 1680 4 6.3 159.9 1.0X +SQL Parquet - Comet (Scan, Exec) (COUNT) 782 800 27 13.4 74.6 2.2X + + +================================================================================================ +Grouped Aggregate (multiple group keys + single aggregate COUNT) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 100), single aggregate COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 545 562 21 19.3 51.9 1.0X +SQL Parquet - Comet (Scan) (COUNT) 528 532 3 19.9 50.3 1.0X +SQL Parquet - Comet (Scan, Exec) (COUNT) 359 446 53 29.2 34.2 1.5X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1024), single aggregate COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 2462 2529 96 4.3 234.8 1.0X +SQL Parquet - Comet (Scan) (COUNT) 2492 2561 97 4.2 237.7 1.0X +SQL Parquet - Comet (Scan, Exec) (COUNT) 1429 1440 15 7.3 136.3 1.7X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: multiple group keys (cardinality 1048576), single aggregate COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 5634 5641 10 1.9 537.3 1.0X +SQL Parquet - Comet (Scan) (COUNT) 5621 5626 7 1.9 536.1 1.0X +SQL Parquet - Comet (Scan, Exec) (COUNT) 4679 4686 11 2.2 446.2 1.2X + + +================================================================================================ +Grouped Aggregate (single group key + multiple aggregates COUNT) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), multiple aggregates COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 219 297 49 47.8 20.9 1.0X +SQL Parquet - Comet (Scan) (COUNT) 282 303 20 37.2 26.9 0.8X +SQL Parquet - Comet (Scan, Exec) (COUNT) 286 302 9 36.7 27.2 0.8X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), multiple aggregates COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 216 234 40 48.5 20.6 1.0X +SQL Parquet - Comet (Scan) (COUNT) 311 342 34 33.7 29.7 0.7X +SQL Parquet - Comet (Scan, Exec) (COUNT) 306 329 37 34.2 29.2 0.7X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), multiple aggregates COUNT: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 2181 2197 23 4.8 208.0 1.0X +SQL Parquet - Comet (Scan) (COUNT) 2577 2580 4 4.1 245.8 0.8X +SQL Parquet - Comet (Scan, Exec) (COUNT) 1222 1267 64 8.6 116.5 1.8X + + +================================================================================================ +Grouped Aggregate (single group key + single aggregate COUNT on decimal) +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 100), single aggregate COUNT on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 204 253 48 51.3 19.5 1.0X +SQL Parquet - Comet (Scan) (COUNT) 281 314 45 37.3 26.8 0.7X +SQL Parquet - Comet (Scan, Exec) (COUNT) 258 295 43 40.7 24.6 0.8X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1024), single aggregate COUNT on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 212 255 43 49.4 20.2 1.0X +SQL Parquet - Comet (Scan) (COUNT) 296 312 16 35.4 28.2 0.7X +SQL Parquet - Comet (Scan, Exec) (COUNT) 257 288 42 40.8 24.5 0.8X + +OpenJDK 64-Bit Server VM 11.0.22+7-LTS on Mac OS X 14.3 +Apple M3 Max +Grouped HashAgg Exec: single group key (cardinality 1048576), single aggregate COUNT on decimal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +SQL Parquet - Spark (COUNT) 1836 1845 13 5.7 175.1 1.0X +SQL Parquet - Comet (Scan) (COUNT) 1916 1934 25 5.5 182.7 1.0X +SQL Parquet - Comet (Scan, Exec) (COUNT) 778 824 42 13.5 74.2 2.4X + + From 76e5615abe47b40a884fdc276563f7d27f635bc7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Aug 2024 19:42:35 -0600 Subject: [PATCH 5/9] revert change to datafusion version --- native/Cargo.lock | 142 +++++++++++++++++++++++++--------------------- native/Cargo.toml | 14 ++--- 2 files changed, 85 insertions(+), 71 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 7ff83977a4..659c66ced6 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -84,9 +84,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" [[package]] name = "anyhow" @@ -457,9 +457,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.16.1" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" +checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" [[package]] name = "byteorder" @@ -469,9 +469,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.1" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12916984aab3fa6e39d655a33e09c0071eb36d6ab3aea5c2d78551f1df6d952" +checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" [[package]] name = "cast" @@ -481,9 +481,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.6" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aba8f4e9906c7ce3c73463f62a7f0c65183ada1a2d47e397cc8810827f9694f" +checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc" dependencies = [ "jobserver", "libc", @@ -586,18 +586,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.9" +version = "4.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64acc1846d54c1fe936a78dc189c34e28d3f5afc348403f28ecf53660b9b8462" +checksum = "0fbb260a053428790f3de475e304ff84cdbc4face759ea7a3e64c1edd938a7fc" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.9" +version = "4.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8393d67ba2e7bfaf28a23458e4e2b543cc73a99595511eb207fdb8aede942" +checksum = "64b17d7ea74e9f833c7dbf2cbe4fb12ff26783eda4782a8975b72f895c9b4d99" dependencies = [ "anstyle", "clap_lex", @@ -605,9 +605,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" [[package]] name = "combine" @@ -804,7 +804,7 @@ dependencies = [ [[package]] name = "datafusion" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "ahash", "arrow", @@ -851,7 +851,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "arrow-schema", "async-trait", @@ -948,7 +948,7 @@ dependencies = [ [[package]] name = "datafusion-common" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "ahash", "arrow", @@ -968,7 +968,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "tokio", ] @@ -976,7 +976,7 @@ dependencies = [ [[package]] name = "datafusion-execution" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "arrow", "chrono", @@ -996,7 +996,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "ahash", "arrow", @@ -1014,7 +1014,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "arrow", "arrow-buffer", @@ -1040,7 +1040,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "ahash", "arrow", @@ -1057,7 +1057,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "arrow", "async-trait", @@ -1076,7 +1076,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "ahash", "arrow", @@ -1105,7 +1105,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "ahash", "arrow", @@ -1118,7 +1118,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "datafusion-common", "datafusion-execution", @@ -1129,7 +1129,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "ahash", "arrow", @@ -1162,7 +1162,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "40.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=9e90e17#9e90e17a61444b77d7d0de416a33857d1a5105b5" +source = "git+https://github.com/apache/datafusion.git?rev=35c2e7e#35c2e7e7eb04e80877bbbc1fa4a5b06f31a4e4bc" dependencies = [ "arrow", "arrow-array", @@ -1276,9 +1276,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" dependencies = [ "crc32fast", "miniz_oxide", @@ -1525,9 +1525,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "de3fc2e30ba82dd1b3911c8de1ffc143c74a914a14e99514d7637e3099df5ea0" dependencies = [ "equivalent", "hashbrown", @@ -1535,9 +1535,9 @@ dependencies = [ [[package]] name = "inferno" -version = "0.11.20" +version = "0.11.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c77a3ae7d4761b9c64d2c030f70746ceb8cfba32dce0325a56792e0a4816c31" +checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" dependencies = [ "ahash", "indexmap", @@ -1658,9 +1658,9 @@ checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] name = "jobserver" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -2020,9 +2020,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.1" +version = "0.36.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce" +checksum = "3f203fa8daa7bb185f760ae12bd8e097f63d17041dcdcaf675ac54cdf863170e" dependencies = [ "memchr", ] @@ -2248,9 +2248,12 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro2" @@ -2416,9 +2419,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.5" +version = "1.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", @@ -2445,9 +2448,9 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "rgb" -version = "0.8.45" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade4539f42266ded9e755c605bdddf546242b2c961b03b06a7375260788a0523" +checksum = "e12bc8d2f72df26a5d3178022df33720fbede0d31d82c7291662eff89836994d" dependencies = [ "bytemuck", ] @@ -2551,11 +2554,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.120" +version = "1.0.122" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -2708,9 +2712,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "symbolic-common" -version = "12.9.2" +version = "12.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71297dc3e250f7dbdf8adb99e235da783d690f5819fdeb4cce39d9cfb0aca9f1" +checksum = "16629323a4ec5268ad23a575110a724ad4544aae623451de600c747bf87b36cf" dependencies = [ "debugid", "memmap2", @@ -2720,9 +2724,9 @@ dependencies = [ [[package]] name = "symbolic-demangle" -version = "12.9.2" +version = "12.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "424fa2c9bf2c862891b9cfd354a752751a6730fd838a4691e7f6c2c7957b9daf" +checksum = "48c043a45f08f41187414592b3ceb53fb0687da57209cc77401767fb69d5b596" dependencies = [ "cpp_demangle", "rustc-demangle", @@ -2753,12 +2757,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.10.1" +version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "b8fcd239983515c23a32fb82099f97d0b11b8c72f654ed659363a95c3dad7a53" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -2840,22 +2845,21 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.1" +version = "1.39.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" dependencies = [ "backtrace", "bytes", - "num_cpus", "pin-project-lite", "tokio-macros", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", @@ -2989,9 +2993,9 @@ dependencies = [ [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "walkdir" @@ -3103,11 +3107,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3143,6 +3147,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -3270,6 +3283,7 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] @@ -3305,9 +3319,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/native/Cargo.toml b/native/Cargo.toml index b971a5f9e7..c6cf571a76 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -39,13 +39,13 @@ arrow-buffer = { version = "52.2.0" } arrow-data = { version = "52.2.0" } arrow-schema = { version = "52.2.0" } parquet = { version = "52.2.0", default-features = false, features = ["experimental"] } -datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17" } -datafusion = { default-features = false, git = "https://github.com/apache/datafusion.git", rev = "9e90e17", features = ["unicode_expressions", "crypto_expressions"] } -datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", features = ["crypto_expressions"] } -datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } -datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } -datafusion-physical-expr-common = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } -datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "9e90e17", default-features = false } +datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e" } +datafusion = { default-features = false, git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", features = ["unicode_expressions", "crypto_expressions"] } +datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", features = ["crypto_expressions"] } +datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } +datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } +datafusion-physical-expr-common = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } +datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "35c2e7e", default-features = false } datafusion-comet-spark-expr = { path = "spark-expr", version = "0.2.0" } datafusion-comet-proto = { path = "proto", version = "0.2.0" } chrono = { version = "0.4", default-features = false, features = ["clock"] } From 0dd0196550f3148e92ceb434ff2ce2bebb2bf500 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Aug 2024 19:44:18 -0600 Subject: [PATCH 6/9] Revert change to Cargo.lock --- native/Cargo.lock | 114 ++++++++---------- .../core/src/execution/datafusion/planner.rs | 4 +- 2 files changed, 52 insertions(+), 66 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 659c66ced6..3f6b1d1c71 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -84,9 +84,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.8" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "anyhow" @@ -457,9 +457,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.16.3" +version = "1.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" +checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" [[package]] name = "byteorder" @@ -469,9 +469,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.7.1" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" +checksum = "a12916984aab3fa6e39d655a33e09c0071eb36d6ab3aea5c2d78551f1df6d952" [[package]] name = "cast" @@ -481,9 +481,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.7" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc" +checksum = "2aba8f4e9906c7ce3c73463f62a7f0c65183ada1a2d47e397cc8810827f9694f" dependencies = [ "jobserver", "libc", @@ -586,18 +586,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.13" +version = "4.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fbb260a053428790f3de475e304ff84cdbc4face759ea7a3e64c1edd938a7fc" +checksum = "64acc1846d54c1fe936a78dc189c34e28d3f5afc348403f28ecf53660b9b8462" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.13" +version = "4.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64b17d7ea74e9f833c7dbf2cbe4fb12ff26783eda4782a8975b72f895c9b4d99" +checksum = "6fb8393d67ba2e7bfaf28a23458e4e2b543cc73a99595511eb207fdb8aede942" dependencies = [ "anstyle", "clap_lex", @@ -605,9 +605,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.2" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" [[package]] name = "combine" @@ -1276,9 +1276,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" dependencies = [ "crc32fast", "miniz_oxide", @@ -1525,9 +1525,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.3.0" +version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3fc2e30ba82dd1b3911c8de1ffc143c74a914a14e99514d7637e3099df5ea0" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", "hashbrown", @@ -1535,9 +1535,9 @@ dependencies = [ [[package]] name = "inferno" -version = "0.11.21" +version = "0.11.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" +checksum = "7c77a3ae7d4761b9c64d2c030f70746ceb8cfba32dce0325a56792e0a4816c31" dependencies = [ "ahash", "indexmap", @@ -1658,9 +1658,9 @@ checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] name = "jobserver" -version = "0.1.32" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" dependencies = [ "libc", ] @@ -2020,9 +2020,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.2" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f203fa8daa7bb185f760ae12bd8e097f63d17041dcdcaf675ac54cdf863170e" +checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce" dependencies = [ "memchr", ] @@ -2248,12 +2248,9 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.20" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" -dependencies = [ - "zerocopy", -] +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" @@ -2419,9 +2416,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" dependencies = [ "aho-corasick", "memchr", @@ -2448,9 +2445,9 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "rgb" -version = "0.8.47" +version = "0.8.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e12bc8d2f72df26a5d3178022df33720fbede0d31d82c7291662eff89836994d" +checksum = "ade4539f42266ded9e755c605bdddf546242b2c961b03b06a7375260788a0523" dependencies = [ "bytemuck", ] @@ -2554,12 +2551,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.122" +version = "1.0.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da" +checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" dependencies = [ "itoa", - "memchr", "ryu", "serde", ] @@ -2712,9 +2708,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "symbolic-common" -version = "12.10.0" +version = "12.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16629323a4ec5268ad23a575110a724ad4544aae623451de600c747bf87b36cf" +checksum = "71297dc3e250f7dbdf8adb99e235da783d690f5819fdeb4cce39d9cfb0aca9f1" dependencies = [ "debugid", "memmap2", @@ -2724,9 +2720,9 @@ dependencies = [ [[package]] name = "symbolic-demangle" -version = "12.10.0" +version = "12.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c043a45f08f41187414592b3ceb53fb0687da57209cc77401767fb69d5b596" +checksum = "424fa2c9bf2c862891b9cfd354a752751a6730fd838a4691e7f6c2c7957b9daf" dependencies = [ "cpp_demangle", "rustc-demangle", @@ -2757,13 +2753,12 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.11.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fcd239983515c23a32fb82099f97d0b11b8c72f654ed659363a95c3dad7a53" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -2845,21 +2840,22 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" dependencies = [ "backtrace", "bytes", + "num_cpus", "pin-project-lite", "tokio-macros", ] [[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -2993,9 +2989,9 @@ dependencies = [ [[package]] name = "version_check" -version = "0.9.5" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "walkdir" @@ -3107,11 +3103,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.9" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -3147,15 +3143,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-sys" -version = "0.59.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" -dependencies = [ - "windows-targets 0.52.6", -] - [[package]] name = "windows-targets" version = "0.42.2" @@ -3283,7 +3270,6 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ - "byteorder", "zerocopy-derive", ] @@ -3319,9 +3305,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.13+zstd.1.5.6" +version = "2.0.12+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" dependencies = [ "cc", "pkg-config", diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index 6252d2580a..0dfa1e3f70 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -62,7 +62,7 @@ use jni::objects::GlobalRef; use num::{BigInt, ToPrimitive}; use std::cmp::max; use std::{collections::HashMap, sync::Arc}; - +use datafusion::functions_aggregate::count::count_udaf; use crate::{ errors::ExpressionError, execution::{ @@ -1261,7 +1261,7 @@ impl PhysicalPlanner { } else { // use count_udaf, which has poor performance create_aggregate_expr( - &sum_udaf(), + &count_udaf(), &expr.children, &[], &[], From db105f3a76a6a80552b100241f0c9ad5905d5dc1 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Aug 2024 19:48:43 -0600 Subject: [PATCH 7/9] fix --- .../core/src/execution/datafusion/planner.rs | 67 ++++++++++--------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index 0dfa1e3f70..4e343c9bcf 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -17,8 +17,38 @@ //! Converts Spark physical plan to DataFusion physical plan +use crate::{ + errors::ExpressionError, + execution::{ + datafusion::{ + expressions::{ + avg::Avg, + avg_decimal::AvgDecimal, + bitwise_not::BitwiseNotExpr, + bloom_filter_might_contain::BloomFilterMightContain, + checkoverflow::CheckOverflow, + correlation::Correlation, + covariance::Covariance, + negative, + stats::StatsType, + stddev::Stddev, + strings::{Contains, EndsWith, Like, StartsWith, StringSpaceExpr, SubstringExpr}, + subquery::Subquery, + sum_decimal::SumDecimal, + unbound::UnboundColumn, + variance::Variance, + NormalizeNaNAndZero, + }, + operators::expand::CometExpandExec, + shuffle_writer::ShuffleWriterExec, + }, + operators::{CopyExec, ExecutionError, ScanExec}, + serde::to_arrow_datatype, + }, +}; use arrow_schema::{DataType, Field, Schema, TimeUnit, DECIMAL128_MAX_PRECISION}; use datafusion::functions_aggregate::bit_and_or_xor::{bit_and_udaf, bit_or_udaf, bit_xor_udaf}; +use datafusion::functions_aggregate::count::count_udaf; use datafusion::functions_aggregate::sum::sum_udaf; use datafusion::physical_plan::windows::BoundedWindowAggExec; use datafusion::physical_plan::InputOrderMode; @@ -62,36 +92,6 @@ use jni::objects::GlobalRef; use num::{BigInt, ToPrimitive}; use std::cmp::max; use std::{collections::HashMap, sync::Arc}; -use datafusion::functions_aggregate::count::count_udaf; -use crate::{ - errors::ExpressionError, - execution::{ - datafusion::{ - expressions::{ - avg::Avg, - avg_decimal::AvgDecimal, - bitwise_not::BitwiseNotExpr, - bloom_filter_might_contain::BloomFilterMightContain, - checkoverflow::CheckOverflow, - correlation::Correlation, - covariance::Covariance, - negative, - stats::StatsType, - stddev::Stddev, - strings::{Contains, EndsWith, Like, StartsWith, StringSpaceExpr, SubstringExpr}, - subquery::Subquery, - sum_decimal::SumDecimal, - unbound::UnboundColumn, - variance::Variance, - NormalizeNaNAndZero, - }, - operators::expand::CometExpandExec, - shuffle_writer::ShuffleWriterExec, - }, - operators::{CopyExec, ExecutionError, ScanExec}, - serde::to_arrow_datatype, - }, -}; use super::expressions::EvalMode; use crate::execution::datafusion::expressions::comet_scalar_funcs::create_comet_physical_fun; @@ -1260,9 +1260,14 @@ impl PhysicalPlanner { .map_err(|e| ExecutionError::DataFusionError(e.to_string())) } else { // use count_udaf, which has poor performance + let children = expr + .children + .iter() + .map(|child| self.create_expr(child, schema.clone())) + .collect::, _>>()?; create_aggregate_expr( &count_udaf(), - &expr.children, + &children, &[], &[], &[], From be73a0fd1f231f37fcbdb350164a89247f9b8b5d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 6 Aug 2024 06:19:58 -0600 Subject: [PATCH 8/9] unify code for single and multiple arguments --- .../core/src/execution/datafusion/planner.rs | 118 +++++++++--------- 1 file changed, 56 insertions(+), 62 deletions(-) diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index 4e343c9bcf..16ef070f59 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -17,6 +17,8 @@ //! Converts Spark physical plan to DataFusion physical plan +use super::expressions::EvalMode; +use crate::execution::datafusion::expressions::comet_scalar_funcs::create_comet_physical_fun; use crate::{ errors::ExpressionError, execution::{ @@ -48,7 +50,6 @@ use crate::{ }; use arrow_schema::{DataType, Field, Schema, TimeUnit, DECIMAL128_MAX_PRECISION}; use datafusion::functions_aggregate::bit_and_or_xor::{bit_and_udaf, bit_or_udaf, bit_xor_udaf}; -use datafusion::functions_aggregate::count::count_udaf; use datafusion::functions_aggregate::sum::sum_udaf; use datafusion::physical_plan::windows::BoundedWindowAggExec; use datafusion::physical_plan::InputOrderMode; @@ -78,23 +79,6 @@ use datafusion::{ }, prelude::SessionContext, }; -use datafusion_common::{ - tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter}, - JoinType as DFJoinType, ScalarValue, -}; -use datafusion_expr::expr::find_df_window_func; -use datafusion_expr::{WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition}; -use datafusion_physical_expr::window::WindowExpr; -use datafusion_physical_expr_common::aggregate::create_aggregate_expr; -use datafusion_physical_expr_common::expressions::Literal; -use itertools::Itertools; -use jni::objects::GlobalRef; -use num::{BigInt, ToPrimitive}; -use std::cmp::max; -use std::{collections::HashMap, sync::Arc}; - -use super::expressions::EvalMode; -use crate::execution::datafusion::expressions::comet_scalar_funcs::create_comet_physical_fun; use datafusion_comet_proto::{ spark_expression::{ self, agg_expr::ExprStruct as AggExprStruct, expr::ExprStruct, literal::Value, AggExpr, @@ -111,6 +95,20 @@ use datafusion_comet_spark_expr::{ Cast, CreateNamedStruct, DateTruncExpr, GetStructField, HourExpr, IfExpr, MinuteExpr, RLike, SecondExpr, TimestampTruncExpr, }; +use datafusion_common::{ + tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter}, + JoinType as DFJoinType, ScalarValue, +}; +use datafusion_expr::expr::find_df_window_func; +use datafusion_expr::{WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition}; +use datafusion_physical_expr::window::WindowExpr; +use datafusion_physical_expr_common::aggregate::create_aggregate_expr; +use datafusion_physical_expr_common::expressions::Literal; +use itertools::Itertools; +use jni::objects::GlobalRef; +use num::{BigInt, ToPrimitive}; +use std::cmp::max; +use std::{collections::HashMap, sync::Arc}; // For clippy error on type_complexity. type ExecResult = Result; @@ -1233,51 +1231,47 @@ impl PhysicalPlanner { ) -> Result, ExecutionError> { match spark_expr.expr_struct.as_ref().unwrap() { AggExprStruct::Count(expr) => { - if expr.children.iter().len() == 1 { - // Using `count_udaf` from Comet is exceptionally slow for some reason, so - // as a workaround we translate it to `SUM(IF(expr IS NULL, 0, 1))` - // https://github.com/apache/datafusion-comet/issues/744 - let the_expr = &expr.children[0]; - - // TODO this could be optimized more for the `COUNT(1)` case - let child = Arc::new(IfExpr::new( - Arc::new(IsNullExpr::new(self.create_expr(the_expr, schema.clone())?)), - Arc::new(Literal::new(ScalarValue::Int64(Some(0)))), - Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), - )); + assert!(expr.children.len() > 0); + // Using `count_udaf` from Comet is exceptionally slow for some reason, so + // as a workaround we translate it to `SUM(IF(expr IS NOT NULL, 1, 0))` + // https://github.com/apache/datafusion-comet/issues/744 - create_aggregate_expr( - &sum_udaf(), - &[child], - &[], - &[], - &[], - schema.as_ref(), - "count", - false, - false, - ) - .map_err(|e| ExecutionError::DataFusionError(e.to_string())) - } else { - // use count_udaf, which has poor performance - let children = expr - .children - .iter() - .map(|child| self.create_expr(child, schema.clone())) - .collect::, _>>()?; - create_aggregate_expr( - &count_udaf(), - &children, - &[], - &[], - &[], - schema.as_ref(), - "count", - false, - false, - ) - .map_err(|e| ExecutionError::DataFusionError(e.to_string())) - } + let children = expr + .children + .iter() + .map(|child| self.create_expr(child, schema.clone())) + .collect::, _>>()?; + + // create `IS NOT NULL expr` and join them with `AND` if there are multiple + let not_null_expr: Arc = children.iter().skip(1).fold( + Arc::new(IsNotNullExpr::new(children[0].clone())) as Arc, + |acc, child| { + Arc::new(BinaryExpr::new( + acc, + DataFusionOperator::And, + Arc::new(IsNotNullExpr::new(child.clone())), + )) + }, + ); + + let child = Arc::new(IfExpr::new( + not_null_expr, + Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), + Arc::new(Literal::new(ScalarValue::Int64(Some(0)))), + )); + + create_aggregate_expr( + &sum_udaf(), + &[child], + &[], + &[], + &[], + schema.as_ref(), + "count", + false, + false, + ) + .map_err(|e| ExecutionError::DataFusionError(e.to_string())) } AggExprStruct::Min(expr) => { let child = self.create_expr(expr.child.as_ref().unwrap(), schema)?; From 87e0d596354e9e599b5a254201abc51bc6e81b0c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 6 Aug 2024 06:24:32 -0600 Subject: [PATCH 9/9] clippy --- native/core/src/execution/datafusion/planner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index 16ef070f59..3cdf799c28 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -1231,7 +1231,7 @@ impl PhysicalPlanner { ) -> Result, ExecutionError> { match spark_expr.expr_struct.as_ref().unwrap() { AggExprStruct::Count(expr) => { - assert!(expr.children.len() > 0); + assert!(!expr.children.is_empty()); // Using `count_udaf` from Comet is exceptionally slow for some reason, so // as a workaround we translate it to `SUM(IF(expr IS NOT NULL, 1, 0))` // https://github.com/apache/datafusion-comet/issues/744