From c32eb2fd628b8a155e74e64a0d611a1fed8c7821 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Mon, 30 Jun 2025 15:02:25 +0200 Subject: [PATCH 01/12] Validate states shape in merge_batch Because of the `..` in the pattern, `OrderSensitiveArrayAggAccumulator::merge_batch` did not verify that it was not receiving additional state columns, which it silently ignored. Update the code to check the number of inputs. --- datafusion/functions-aggregate/src/array_agg.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 4ec73e306e0f7..e5abfb34ac521 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -31,7 +31,9 @@ use arrow::datatypes::{DataType, Field, FieldRef, Fields}; use datafusion_common::cast::as_list_array; use datafusion_common::scalar::copy_array_data; -use datafusion_common::utils::{get_row_at_idx, SingleRowListArrayBuilder}; +use datafusion_common::utils::{ + get_row_at_idx, take_function_args, SingleRowListArrayBuilder, +}; use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; @@ -610,9 +612,8 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { // inside `ARRAY_AGG` list, we will receive an `Array` that stores values // received from its ordering requirement expression. (This information // is necessary for during merging). - let [array_agg_values, agg_orderings, ..] 
= &states else { - return exec_err!("State should have two elements"); - }; + let [array_agg_values, agg_orderings] = + take_function_args("OrderSensitiveArrayAggAccumulator::merge_batch", states)?; let Some(agg_orderings) = agg_orderings.as_list_opt::() else { return exec_err!("Expects to receive a list array"); }; From cf4d8ae6d091e5c109d1d836dada5b945f8607fb Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Mon, 30 Jun 2025 12:27:45 +0200 Subject: [PATCH 02/12] Support multiple ordered array_agg Before the change, `array_agg` with ordering would depend on input being ordered. As a result, it was impossible to do two or more `array_agg(x ORDER BY ...)` with incompatible ordering. This change moves ordering responsibility into `OrderSensitiveArrayAggAccumulator`. When input is pre-ordered (beneficial ordering), no additional work is done. However, when it's not, `array_agg` accumulator will order the data on its own. --- .../functions-aggregate/src/array_agg.rs | 63 ++++++++++++++++-- .../physical-plan/src/aggregates/mod.rs | 19 +++--- .../sqllogictest/test_files/aggregate.slt | 55 +++++++++++++++- .../sqllogictest/test_files/group_by.slt | 66 ++++++++----------- 4 files changed, 151 insertions(+), 52 deletions(-) diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index e5abfb34ac521..70ef420b34c14 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -19,7 +19,7 @@ use std::cmp::Ordering; use std::collections::{HashSet, VecDeque}; -use std::mem::{size_of, size_of_val}; +use std::mem::{size_of, size_of_val, take}; use std::sync::Arc; use arrow::array::{ @@ -32,7 +32,7 @@ use arrow::datatypes::{DataType, Field, FieldRef, Fields}; use datafusion_common::cast::as_list_array; use datafusion_common::scalar::copy_array_data; use datafusion_common::utils::{ - get_row_at_idx, take_function_args, SingleRowListArrayBuilder, + compare_rows, 
get_row_at_idx, take_function_args, SingleRowListArrayBuilder, }; use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; @@ -41,6 +41,7 @@ use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; +use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity; use datafusion_functions_aggregate_common::utils::ordering_fields; use datafusion_macros::user_doc; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; @@ -80,12 +81,14 @@ This aggregation function can only mix DISTINCT and ORDER BY if the ordering exp /// ARRAY_AGG aggregate expression pub struct ArrayAgg { signature: Signature, + is_input_pre_ordered: bool, } impl Default for ArrayAgg { fn default() -> Self { Self { signature: Signature::any(1, Volatility::Immutable), + is_input_pre_ordered: false, } } } @@ -146,6 +149,20 @@ impl AggregateUDFImpl for ArrayAgg { Ok(fields) } + fn order_sensitivity(&self) -> AggregateOrderSensitivity { + AggregateOrderSensitivity::Beneficial + } + + fn with_beneficial_ordering( + self: Arc, + beneficial_ordering: bool, + ) -> Result>> { + Ok(Some(Arc::new(Self { + signature: self.signature.clone(), + is_input_pre_ordered: beneficial_ordering, + }))) + } + fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { let data_type = acc_args.exprs[0].data_type(acc_args.schema)?; let ignore_nulls = @@ -198,6 +215,7 @@ impl AggregateUDFImpl for ArrayAgg { &data_type, &ordering_dtypes, ordering, + self.is_input_pre_ordered, acc_args.is_reversed, ignore_nulls, ) @@ -514,6 +532,8 @@ pub(crate) struct OrderSensitiveArrayAggAccumulator { datatypes: Vec, /// Stores the ordering requirement of the `Accumulator`. 
ordering_req: LexOrdering, + /// Whether the input is known to be pre-ordered + is_input_pre_ordered: bool, /// Whether the aggregation is running in reverse. reverse: bool, /// Whether the aggregation should ignore null values. @@ -527,6 +547,7 @@ impl OrderSensitiveArrayAggAccumulator { datatype: &DataType, ordering_dtypes: &[DataType], ordering_req: LexOrdering, + is_input_pre_ordered: bool, reverse: bool, ignore_nulls: bool, ) -> Result { @@ -537,11 +558,34 @@ impl OrderSensitiveArrayAggAccumulator { ordering_values: vec![], datatypes, ordering_req, + is_input_pre_ordered, reverse, ignore_nulls, }) } + fn sort(&mut self) { + let sort_options = self + .ordering_req + .iter() + .map(|sort_expr| sort_expr.options) + .collect::>(); + let mut values = take(&mut self.values) + .into_iter() + .zip(take(&mut self.ordering_values)) + .collect::>(); + let mut delayed_cmp_err = Ok(()); + values.sort_by(|(_, left_ordering), (_, right_ordering)| { + compare_rows(left_ordering, right_ordering, &sort_options).unwrap_or_else( + |err| { + delayed_cmp_err = Err(err); + Ordering::Equal + }, + ) + }); + (self.values, self.ordering_values) = values.into_iter().unzip(); + } + fn evaluate_orderings(&self) -> Result { let fields = ordering_fields(&self.ordering_req, &self.datatypes[1..]); @@ -624,8 +668,11 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { let mut partition_ordering_values = vec![]; // Existing values should be merged also. - partition_values.push(self.values.clone().into()); - partition_ordering_values.push(self.ordering_values.clone().into()); + if !self.is_input_pre_ordered { + self.sort(); + } + partition_values.push(take(&mut self.values).into()); + partition_ordering_values.push(take(&mut self.ordering_values).into()); // Convert array to Scalars to sort them easily. Convert back to array at evaluation. 
let array_agg_res = ScalarValue::convert_array_to_scalar_vec(array_agg_values)?; @@ -674,6 +721,10 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { } fn state(&mut self) -> Result> { + if !self.is_input_pre_ordered { + self.sort(); + } + let mut result = vec![self.evaluate()?]; result.push(self.evaluate_orderings()?); @@ -681,6 +732,10 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { } fn evaluate(&mut self) -> Result { + if !self.is_input_pre_ordered { + self.sort(); + } + if self.values.is_empty() { return Ok(ScalarValue::new_null_list( self.datatypes[0].clone(), diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 14b2d0a932c2a..0ee759082e073 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1442,7 +1442,7 @@ mod tests { use datafusion_execution::config::SessionConfig; use datafusion_execution::memory_pool::FairSpillPool; use datafusion_execution::runtime_env::RuntimeEnvBuilder; - use datafusion_functions_aggregate::array_agg::array_agg_udaf; + use datafusion_expr::test::function_stub::max_udaf; use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::first_last::{first_value_udaf, last_value_udaf}; @@ -2428,13 +2428,16 @@ mod tests { let mut aggr_exprs = order_by_exprs .into_iter() .map(|order_by_expr| { - AggregateExprBuilder::new(array_agg_udaf(), vec![Arc::clone(col_a)]) - .alias("a") - .order_by(order_by_expr) - .schema(Arc::clone(&test_schema)) - .build() - .map(Arc::new) - .unwrap() + AggregateExprBuilder::new( + max_udaf(), // any UDAF not using Beneficial order sensitivity + vec![Arc::clone(col_a)], + ) + .alias("a") + .order_by(order_by_expr) + .schema(Arc::clone(&test_schema)) + .build() + .map(Arc::new) + .unwrap() }) .collect::>(); let group_by = PhysicalGroupBy::new_single(vec![]); diff --git 
a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 3f064485e51aa..90ba16cccbb1f 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -206,6 +206,56 @@ query error Execution error: In an aggregate with DISTINCT, ORDER BY expressions SELECT array_agg(DISTINCT c13 ORDER BY c13, c12) FROM aggregate_test_100 +query ?? rowsort +with tbl as (SELECT * FROM (VALUES ('xxx', 'yyy'), ('xxx', 'yyy'), ('xxx2', 'yyy2')) AS t(x, y)) +select + array_agg(x order by x) as x_agg, + array_agg(y order by y) as y_agg +from tbl +group by all +---- +[xxx, xxx, xxx2] [yyy, yyy, yyy2] + +query ?? +SELECT + (SELECT array_agg(c12 ORDER BY c12) FROM aggregate_test_100), + (SELECT array_agg(c13 ORDER BY c13) FROM aggregate_test_100) +---- +[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 
0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 0.9294097332465232, 0.9463098243875633, 0.946325164889271, 0.9491397432856566, 0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, 
Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, 
xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8] + +query ?? +SELECT + array_agg(c12 ORDER BY c12), + array_agg(c13 ORDER BY c13) +FROM aggregate_test_100 +---- +[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 
0.9294097332465232, 0.9463098243875633, 0.946325164889271, 0.9491397432856566, 0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, 
Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8] + +query ?? rowsort +with tbl as (SELECT * FROM (VALUES ('xxx', 'yyy'), ('xxx', 'yyy'), ('xxx2', 'yyy2')) AS t(x, y)) +select + array_agg(distinct x order by x) as x_agg, + array_agg(distinct y order by y) as y_agg +from tbl +group by all +---- +[xxx, xxx2] [yyy, yyy2] + +query ?? 
+SELECT + (SELECT array_agg(DISTINCT c12 ORDER BY c12) FROM aggregate_test_100), + (SELECT array_agg(DISTINCT c13 ORDER BY c13) FROM aggregate_test_100) +---- +[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 0.9294097332465232, 0.9463098243875633, 
0.946325164889271, 0.9491397432856566, 0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, 
TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8] + +query ?? 
+SELECT + array_agg(DISTINCT c12 ORDER BY c12), + array_agg(DISTINCT c13 ORDER BY c13) +FROM aggregate_test_100 +---- +[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 0.9294097332465232, 0.9463098243875633, 0.946325164889271, 0.9491397432856566, 
0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, 
TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8] + statement ok CREATE EXTERNAL TABLE agg_order ( c1 INT NOT NULL, @@ -232,9 +282,8 @@ physical_plan 01)AggregateExec: mode=Final, gby=[], aggr=[array_agg(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]] 02)--CoalescePartitionsExec 03)----AggregateExec: mode=Partial, gby=[], aggr=[array_agg(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]] -04)------SortExec: expr=[c2@1 DESC, c3@2 ASC NULLS LAST], preserve_partitioning=[true] -05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -06)----------DataSourceExec: 
file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_agg_multi_order.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true +04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_agg_multi_order.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true # test array_agg_order with list data type statement ok diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 9e67018ecd0b9..1e3ac507e46c0 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2288,8 +2288,7 @@ logical_plan 02)--TableScan: annotated_data_infinite2 projection=[a, b, d] physical_plan 01)AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b], aggr=[array_agg(annotated_data_infinite2.d) ORDER BY [annotated_data_infinite2.d ASC NULLS LAST]], ordering_mode=Sorted -02)--PartialSortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, d@2 ASC NULLS LAST], common_prefix_length=[2] -03)----StreamingTableExec: partition_sizes=1, projection=[a, b, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST] +02)--StreamingTableExec: partition_sizes=1, projection=[a, b, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST] # as can be seen in the result below d is indeed ordered. query II? 
@@ -2460,8 +2459,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as amounts] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] query T? @@ -2490,8 +2488,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST]@1 as amounts, sum(s.amount)@2 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST], sum(s.amount)] -03)----SortExec: expr=[amount@1 DESC], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] query T?R rowsort SELECT s.country, ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, @@ -2506,12 +2503,16 @@ TUR [100.0, 75.0] 175 # test_ordering_sensitive_aggregation3 # When different aggregators have conflicting requirements, we cannot satisfy all of them in current implementation. # test below should raise Plan Error. -statement error DataFusion error: This feature is not implemented: Conflicting ordering requirements in aggregate functions is not supported +query ??? 
SELECT ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, ARRAY_AGG(s.amount ORDER BY s.amount ASC) AS amounts2, ARRAY_AGG(s.amount ORDER BY s.sn ASC) AS amounts3 FROM sales_global AS s GROUP BY s.country +---- +[80.0, 30.0] [30.0, 80.0] [30.0, 80.0] +[200.0, 50.0] [50.0, 200.0] [50.0, 200.0] +[100.0, 75.0] [75.0, 100.0] [75.0, 100.0] # test_ordering_sensitive_aggregation4 # If aggregators can work with bounded memory (Sorted or PartiallySorted mode), we should append requirement to @@ -2535,7 +2536,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST]@1 as amounts, sum(s.amount)@2 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST], sum(s.amount)], ordering_mode=Sorted -03)----SortExec: TopK(fetch=10), expr=[country@0 ASC NULLS LAST, amount@1 DESC], preserve_partitioning=[false] +03)----SortExec: TopK(fetch=10), expr=[country@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -2573,7 +2574,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, zip_code@1 as zip_code, array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST]@2 as amounts, sum(s.amount)@3 as sum1] 02)--AggregateExec: mode=Single, gby=[country@1 as country, zip_code@0 as zip_code], aggr=[array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST], sum(s.amount)], ordering_mode=PartiallySorted([0]) -03)----SortExec: TopK(fetch=10), expr=[country@1 ASC NULLS LAST, amount@2 DESC], preserve_partitioning=[false] +03)----SortExec: TopK(fetch=10), expr=[country@1 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] query TI?R rowsort @@ -2646,7 +2647,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(s.amount) ORDER BY [s.country DESC NULLS FIRST, s.amount DESC NULLS FIRST]@1 
as amounts, sum(s.amount)@2 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(s.amount) ORDER BY [s.country DESC NULLS FIRST, s.amount DESC NULLS FIRST], sum(s.amount)], ordering_mode=Sorted -03)----SortExec: TopK(fetch=10), expr=[country@0 ASC NULLS LAST, amount@1 DESC], preserve_partitioning=[false] +03)----SortExec: TopK(fetch=10), expr=[country@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -2742,9 +2743,8 @@ logical_plan 03)----TableScan: sales_global projection=[country, amount] physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -03)----SortExec: expr=[amount@1 DESC], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +03)----DataSourceExec: partitions=1, partition_sizes=[1] query T?RR rowsort SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, @@ -2773,9 +2773,8 @@ logical_plan 03)----TableScan: sales_global projection=[country, amount] physical_plan 01)ProjectionExec: expr=[country@0 as country, 
array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +03)----DataSourceExec: partitions=1, partition_sizes=[1] query T?RR SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts, @@ -2805,9 +2804,8 @@ logical_plan 03)----TableScan: sales_global projection=[country, amount] physical_plan 01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@2 as fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@3 as amounts] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] -04)------DataSourceExec: 
partitions=1, partition_sizes=[1] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] +03)----DataSourceExec: partitions=1, partition_sizes=[1] query TRR? SELECT country, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, @@ -2836,8 +2834,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as sum1, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as amounts] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----SortExec: expr=[amount@2 ASC NULLS LAST], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] query TR? SELECT country, SUM(amount ORDER BY ts DESC) AS sum1, @@ -3109,9 +3106,8 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] -05)--------SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] -06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -07)------------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +06)----------DataSourceExec: partitions=1, partition_sizes=[1] query ? 
SELECT ARRAY_AGG(amount ORDER BY ts ASC) AS array_agg1 @@ -3133,9 +3129,8 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -05)--------SortExec: expr=[ts@0 DESC], preserve_partitioning=[true] -06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -07)------------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +06)----------DataSourceExec: partitions=1, partition_sizes=[1] query ? SELECT ARRAY_AGG(amount ORDER BY ts DESC) AS array_agg1 @@ -3157,9 +3152,8 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -05)--------SortExec: expr=[amount@0 ASC NULLS LAST], preserve_partitioning=[true] -06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -07)------------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +06)----------DataSourceExec: partitions=1, partition_sizes=[1] query ? 
SELECT ARRAY_AGG(amount ORDER BY amount ASC) AS array_agg1 @@ -3187,9 +3181,8 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=4 06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -08)--------------SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[true] -09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -10)------------------DataSourceExec: partitions=1, partition_sizes=[1] +08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +09)----------------DataSourceExec: partitions=1, partition_sizes=[1] query T? SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS array_agg1 @@ -3222,10 +3215,9 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] 05)--------CoalesceBatchesExec: target_batch_size=4 06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 -07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -08)--------------SortExec: expr=[amount@1 DESC], preserve_partitioning=[true] -09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -10)------------------DataSourceExec: partitions=1, partition_sizes=[1] +07)------------AggregateExec: mode=Partial, gby=[country@0 as country], 
aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +09)----------------DataSourceExec: partitions=1, partition_sizes=[1] query T?RR SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, From 4cd992ca799ac9f3076e818e64fe5236cfe98f85 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 1 Jul 2025 10:27:08 +0200 Subject: [PATCH 03/12] Generate sorts based on aggregations soft requirements The sorting consideration before aggregations did respect only ordered aggregation functions with `AggregateOrderSensitivity::HardRequirement`. This change includes sorting expectations from `AggregateOrderSensitivity::Beneficial` functions. When beneficial ordered function requirements are not satisfied, no error is raised, they are considered in the second pass only. 
--- .../physical-plan/src/aggregates/mod.rs | 130 +++++++----- .../sqllogictest/test_files/aggregate.slt | 9 +- .../sqllogictest/test_files/distinct_on.slt | 5 +- .../sqllogictest/test_files/group_by.slt | 193 +++++++++--------- datafusion/sqllogictest/test_files/joins.slt | 4 +- .../sqllogictest/test_files/subquery_sort.slt | 5 +- 6 files changed, 184 insertions(+), 162 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 0ee759082e073..4bdc3a37b5033 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -52,6 +52,7 @@ use datafusion_physical_expr_common::sort_expr::{ LexOrdering, LexRequirement, OrderingRequirements, PhysicalSortRequirement, }; +use datafusion_expr::utils::AggregateOrderSensitivity; use itertools::Itertools; pub(crate) mod group_values; @@ -1071,13 +1072,25 @@ fn get_aggregate_expr_req( aggr_expr: &AggregateFunctionExpr, group_by: &PhysicalGroupBy, agg_mode: &AggregateMode, + include_soft_requirement: bool, ) -> Option { - // If the aggregation function is ordering requirement is not absolutely - // necessary, or the aggregation is performing a "second stage" calculation, - // then ignore the ordering requirement. - if !aggr_expr.order_sensitivity().hard_requires() || !agg_mode.is_first_stage() { + // If the aggregation is performing a "second stage" calculation, + // then ignore the ordering requirement. Ordering requirement applies + // only to the aggregation input data. 
+ if !agg_mode.is_first_stage() { return None; } + + match aggr_expr.order_sensitivity() { + AggregateOrderSensitivity::Insensitive => return None, + AggregateOrderSensitivity::HardRequirement => {} + AggregateOrderSensitivity::Beneficial => { + if !include_soft_requirement { + return None; + } + } + } + let mut sort_exprs = aggr_expr.order_bys().to_vec(); // In non-first stage modes, we accumulate data (using `merge_batch`) from // different partitions (i.e. merge partial results). During this merge, we @@ -1142,60 +1155,73 @@ pub fn get_finer_aggregate_exprs_requirement( agg_mode: &AggregateMode, ) -> Result> { let mut requirement = None; - for aggr_expr in aggr_exprs.iter_mut() { - let Some(aggr_req) = get_aggregate_expr_req(aggr_expr, group_by, agg_mode) - .and_then(|o| eq_properties.normalize_sort_exprs(o)) - else { - // There is no aggregate ordering requirement, or it is trivially - // satisfied -- we can skip this expression. - continue; - }; - // If the common requirement is finer than the current expression's, - // we can skip this expression. If the latter is finer than the former, - // adopt it if it is satisfied by the equivalence properties. Otherwise, - // defer the analysis to the reverse expression. - let forward_finer = determine_finer(&requirement, &aggr_req); - if let Some(finer) = forward_finer { - if !finer { - continue; - } else if eq_properties.ordering_satisfy(aggr_req.clone())? 
{ - requirement = Some(aggr_req); - continue; - } - } - if let Some(reverse_aggr_expr) = aggr_expr.reverse_expr() { - let Some(rev_aggr_req) = - get_aggregate_expr_req(&reverse_aggr_expr, group_by, agg_mode) - .and_then(|o| eq_properties.normalize_sort_exprs(o)) - else { - // The reverse requirement is trivially satisfied -- just reverse - // the expression and continue with the next one: - *aggr_expr = Arc::new(reverse_aggr_expr); + + for include_soft_requirement in [false, true] { + for aggr_expr in aggr_exprs.iter_mut() { + let Some(aggr_req) = get_aggregate_expr_req( + aggr_expr, + group_by, + agg_mode, + include_soft_requirement, + ) + .and_then(|o| eq_properties.normalize_sort_exprs(o)) else { + // There is no aggregate ordering requirement, or it is trivially + // satisfied -- we can skip this expression. continue; }; - // If the common requirement is finer than the reverse expression's, - // just reverse it and continue the loop with the next aggregate - // expression. If the latter is finer than the former, adopt it if - // it is satisfied by the equivalence properties. Otherwise, adopt - // the forward expression. - if let Some(finer) = determine_finer(&requirement, &rev_aggr_req) { + // If the common requirement is finer than the current expression's, + // we can skip this expression. If the latter is finer than the former, + // adopt it if it is satisfied by the equivalence properties. Otherwise, + // defer the analysis to the reverse expression. + let forward_finer = determine_finer(&requirement, &aggr_req); + if let Some(finer) = forward_finer { if !finer { + continue; + } else if eq_properties.ordering_satisfy(aggr_req.clone())? 
{ + requirement = Some(aggr_req); + continue; + } + } + if let Some(reverse_aggr_expr) = aggr_expr.reverse_expr() { + let Some(rev_aggr_req) = get_aggregate_expr_req( + &reverse_aggr_expr, + group_by, + agg_mode, + include_soft_requirement, + ) + .and_then(|o| eq_properties.normalize_sort_exprs(o)) else { + // The reverse requirement is trivially satisfied -- just reverse + // the expression and continue with the next one: *aggr_expr = Arc::new(reverse_aggr_expr); - } else if eq_properties.ordering_satisfy(rev_aggr_req.clone())? { - *aggr_expr = Arc::new(reverse_aggr_expr); - requirement = Some(rev_aggr_req); - } else { + continue; + }; + // If the common requirement is finer than the reverse expression's, + // just reverse it and continue the loop with the next aggregate + // expression. If the latter is finer than the former, adopt it if + // it is satisfied by the equivalence properties. Otherwise, adopt + // the forward expression. + if let Some(finer) = determine_finer(&requirement, &rev_aggr_req) { + if !finer { + *aggr_expr = Arc::new(reverse_aggr_expr); + } else if eq_properties.ordering_satisfy(rev_aggr_req.clone())? { + *aggr_expr = Arc::new(reverse_aggr_expr); + requirement = Some(rev_aggr_req); + } else { + requirement = Some(aggr_req); + } + } else if forward_finer.is_some() { requirement = Some(aggr_req); + } else { + // Neither the existing requirement nor the current aggregate + // requirement satisfy the other (forward or reverse), this + // means they are conflicting. This is a problem only for hard + // requirements. Unsatisfied soft requirements can be ignored. + if !include_soft_requirement { + return not_impl_err!( + "Conflicting ordering requirements in aggregate functions is not supported" + ); + } } - } else if forward_finer.is_some() { - requirement = Some(aggr_req); - } else { - // Neither the existing requirement nor the current aggregate - // requirement satisfy the other (forward or reverse), this - // means they are conflicting. 
- return not_impl_err!( - "Conflicting ordering requirements in aggregate functions is not supported" - ); } } } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 90ba16cccbb1f..17c15c1a65a7e 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -282,8 +282,9 @@ physical_plan 01)AggregateExec: mode=Final, gby=[], aggr=[array_agg(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]] 02)--CoalescePartitionsExec 03)----AggregateExec: mode=Partial, gby=[], aggr=[array_agg(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]] -04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_agg_multi_order.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true +04)------SortExec: expr=[c2@1 DESC, c3@2 ASC NULLS LAST], preserve_partitioning=[true] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_agg_multi_order.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true # test array_agg_order with list data type statement ok @@ -6353,7 +6354,7 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]] 02)--TableScan: convert_first_last_table projection=[c1, c3] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]] +01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] 02)--CoalescePartitionsExec 03)----AggregateExec: mode=Partial, gby=[], 
aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 @@ -6367,7 +6368,7 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]] 02)--TableScan: convert_first_last_table projection=[c1, c2] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]] +01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] 02)--CoalescePartitionsExec 03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index b4a491619e893..371007eac7f85 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -101,8 +101,9 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]] -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true +08)--------------SortExec: 
expr=[c3@2 ASC NULLS LAST], preserve_partitioning=[true] +09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true # ON expressions are not a sub-set of the ORDER BY expressions query error SELECT DISTINCT ON expressions must match initial ORDER BY expressions diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 1e3ac507e46c0..9a271f1ed87c2 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2019,13 +2019,14 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([col0@0, col1@1, col2@2], 4), input_partitions=4 -07)------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1] -10)------------------CoalesceBatchesExec: target_batch_size=8192 -11)--------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(col0@0, col0@0)] -12)----------------------DataSourceExec: partitions=1, partition_sizes=[3] -13)----------------------DataSourceExec: partitions=1, partition_sizes=[3] +07)------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]], ordering_mode=PartiallySorted([0]) +08)--------------SortExec: expr=[col0@3 ASC 
NULLS LAST], preserve_partitioning=[true] +09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +10)------------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1] +11)--------------------CoalesceBatchesExec: target_batch_size=8192 +12)----------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(col0@0, col0@0)] +13)------------------------DataSourceExec: partitions=1, partition_sizes=[3] +14)------------------------DataSourceExec: partitions=1, partition_sizes=[3] # Columns in the table are a,b,c,d. Source is DataSourceExec which is ordered by # a,b,c column. Column a has cardinality 2, column b has cardinality 4. @@ -2288,7 +2289,8 @@ logical_plan 02)--TableScan: annotated_data_infinite2 projection=[a, b, d] physical_plan 01)AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b], aggr=[array_agg(annotated_data_infinite2.d) ORDER BY [annotated_data_infinite2.d ASC NULLS LAST]], ordering_mode=Sorted -02)--StreamingTableExec: partition_sizes=1, projection=[a, b, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST] +02)--PartialSortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, d@2 ASC NULLS LAST], common_prefix_length=[2] +03)----StreamingTableExec: partition_sizes=1, projection=[a, b, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST] # as can be seen in the result below d is indeed ordered. query II? 
@@ -2459,7 +2461,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as amounts] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] query T? @@ -2488,7 +2491,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST]@1 as amounts, sum(s.amount)@2 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST], sum(s.amount)] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +03)----SortExec: expr=[amount@1 DESC], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] query T?R rowsort SELECT s.country, ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, @@ -2510,9 +2514,9 @@ SELECT ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, FROM sales_global AS s GROUP BY s.country ---- -[80.0, 30.0] [30.0, 80.0] [30.0, 80.0] [200.0, 50.0] [50.0, 200.0] [50.0, 200.0] [100.0, 75.0] [75.0, 100.0] [75.0, 100.0] +[80.0, 30.0] [30.0, 80.0] [30.0, 80.0] # test_ordering_sensitive_aggregation4 # If aggregators can work with bounded memory (Sorted or PartiallySorted mode), we should append requirement to @@ -2536,7 +2540,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST]@1 as amounts, sum(s.amount)@2 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST], sum(s.amount)], ordering_mode=Sorted -03)----SortExec: TopK(fetch=10), expr=[country@0 
ASC NULLS LAST], preserve_partitioning=[false] +03)----SortExec: TopK(fetch=10), expr=[country@0 ASC NULLS LAST, amount@1 DESC], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -2574,7 +2578,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, zip_code@1 as zip_code, array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST]@2 as amounts, sum(s.amount)@3 as sum1] 02)--AggregateExec: mode=Single, gby=[country@1 as country, zip_code@0 as zip_code], aggr=[array_agg(s.amount) ORDER BY [s.amount DESC NULLS FIRST], sum(s.amount)], ordering_mode=PartiallySorted([0]) -03)----SortExec: TopK(fetch=10), expr=[country@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----SortExec: TopK(fetch=10), expr=[country@1 ASC NULLS LAST, amount@2 DESC], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] query TI?R rowsort @@ -2647,7 +2651,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, array_agg(s.amount) ORDER BY [s.country DESC NULLS FIRST, s.amount DESC NULLS FIRST]@1 as amounts, sum(s.amount)@2 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(s.amount) ORDER BY [s.country DESC NULLS FIRST, s.amount DESC NULLS FIRST], sum(s.amount)], ordering_mode=Sorted -03)----SortExec: TopK(fetch=10), expr=[country@0 ASC NULLS LAST], preserve_partitioning=[false] +03)----SortExec: TopK(fetch=10), expr=[country@0 ASC NULLS LAST, amount@1 DESC], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -2741,21 +2745,20 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], 
aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan -01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query T?RR rowsort +query error SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -FRA [200.0, 50.0] 50 50 -GRC [80.0, 30.0] 30 30 -TUR [100.0, 75.0] 75 75 +DataFusion error: Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # test_reverse_aggregate_expr2 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2771,21 +2774,20 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan -01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
+02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query T?RR +query error SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -GRC [30.0, 80.0] 30 30 -FRA [50.0, 200.0] 50 50 -TUR [75.0, 100.0] 75 75 +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. +This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # test_reverse_aggregate_expr3 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2802,21 +2804,20 @@ logical_plan 01)Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan -01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@2 as fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@3 as amounts] -02)--AggregateExec: 
mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query TRR? +query error SELECT country, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts FROM sales_global GROUP BY country ---- -GRC 30 30 [30.0, 80.0] -FRA 50 50 [50.0, 200.0] -TUR 75 75 [75.0, 100.0] +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # test_reverse_aggregate_expr4 # Ordering requirement by the ordering insensitive aggregators shouldn't have effect on @@ -2834,7 +2835,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as sum1, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as amounts] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +03)----SortExec: expr=[amount@2 ASC NULLS LAST], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] query TR? SELECT country, SUM(amount ORDER BY ts DESC) AS sum1, @@ -2867,7 +2869,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as lv1, sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@3 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +03)----SortExec: expr=[ts@1 DESC], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] query TRRR rowsort SELECT country, FIRST_VALUE(amount ORDER BY ts DESC) as fv1, @@ -2900,7 +2903,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, 
first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as lv1, sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@3 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -03)----DataSourceExec: partitions=1, partition_sizes=[1] +03)----SortExec: expr=[ts@1 DESC], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] query TRRR rowsort SELECT country, FIRST_VALUE(amount ORDER BY ts DESC) as fv1, @@ -2936,11 +2940,12 @@ physical_plan 01)SortExec: expr=[sn@2 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[zip_code@1 as zip_code, country@2 as country, sn@0 as sn, ts@3 as ts, currency@4 as currency, last_value(e.amount) ORDER BY [e.sn ASC NULLS LAST]@5 as last_rate] 03)----AggregateExec: mode=Single, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency], aggr=[last_value(e.amount) ORDER BY [e.sn ASC NULLS LAST]] -04)------ProjectionExec: expr=[zip_code@2 as zip_code, country@3 as country, sn@4 as sn, ts@5 as ts, currency@6 as currency, sn@0 as sn, amount@1 as amount] -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1, projection=[sn@0, amount@3, zip_code@4, country@5, sn@6, ts@7, currency@8] -07)------------DataSourceExec: partitions=1, partition_sizes=[1] -08)------------DataSourceExec: partitions=1, partition_sizes=[1] +04)------SortExec: expr=[sn@5 ASC NULLS LAST], preserve_partitioning=[false] +05)--------ProjectionExec: expr=[zip_code@2 as zip_code, country@3 as country, sn@4 
as sn, ts@5 as ts, currency@6 as currency, sn@0 as sn, amount@1 as amount] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1, projection=[sn@0, amount@3, zip_code@4, country@5, sn@6, ts@7, currency@8] +08)--------------DataSourceExec: partitions=1, partition_sizes=[1] +09)--------------DataSourceExec: partitions=1, partition_sizes=[1] query ITIPTR rowsort SELECT s.zip_code, s.country, s.sn, s.ts, s.currency, LAST_VALUE(e.amount ORDER BY e.sn) AS last_rate @@ -2985,7 +2990,8 @@ physical_plan 06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] -09)----------------DataSourceExec: partitions=1, partition_sizes=[1] +09)----------------SortExec: expr=[ts@1 ASC NULLS LAST], preserve_partitioning=[false] +10)------------------DataSourceExec: partitions=1, partition_sizes=[1] query TRR SELECT country, FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, @@ -3012,28 +3018,21 @@ logical_plan 02)--Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, ts, amount] -physical_plan -01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] -02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] 
-03)----ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as fv2] -04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 -07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -09)----------------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query TRR +query error SELECT country, FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global GROUP BY country ORDER BY country ---- -FRA 50 50 -GRC 30 30 -TUR 75 75 +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. 
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # make sure that batch size is small. So that query below runs in multi partitions # row number of the sales_global is 5. Hence we choose batch size 4 to make is smaller. @@ -3056,8 +3055,9 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] -05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] +06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +07)------------DataSourceExec: partitions=1, partition_sizes=[1] query RR SELECT FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, @@ -3077,20 +3077,18 @@ logical_plan 01)Projection: first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[ts, amount] -physical_plan -01)ProjectionExec: expr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@0 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv2] -02)--AggregateExec: mode=Final, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], 
last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -03)----CoalescePartitionsExec -04)------AggregateExec: mode=Partial, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query RR +query error SELECT FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global ---- -30 30 +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. 
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # ARRAY_AGG should work in multiple partitions query TT @@ -3106,8 +3104,9 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] -05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] +06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +07)------------DataSourceExec: partitions=1, partition_sizes=[1] query ? SELECT ARRAY_AGG(amount ORDER BY ts ASC) AS array_agg1 @@ -3129,8 +3128,9 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------SortExec: expr=[ts@0 DESC], preserve_partitioning=[true] +06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +07)------------DataSourceExec: partitions=1, partition_sizes=[1] query ? 
SELECT ARRAY_AGG(amount ORDER BY ts DESC) AS array_agg1 @@ -3152,8 +3152,9 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------SortExec: expr=[amount@0 ASC NULLS LAST], preserve_partitioning=[true] +06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +07)------------DataSourceExec: partitions=1, partition_sizes=[1] query ? SELECT ARRAY_AGG(amount ORDER BY amount ASC) AS array_agg1 @@ -3181,8 +3182,9 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=4 06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -09)----------------DataSourceExec: partitions=1, partition_sizes=[1] +08)--------------SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[true] +09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +10)------------------DataSourceExec: partitions=1, partition_sizes=[1] query T? 
SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS array_agg1 @@ -3208,18 +3210,11 @@ logical_plan 02)--Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, amount] -physical_plan -01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] -02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -05)--------CoalesceBatchesExec: target_batch_size=4 -06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 -07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY 
[sales_global.amount DESC NULLS FIRST]] -08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -09)----------------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query T?RR +query error SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 @@ -3227,9 +3222,9 @@ SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, GROUP BY country ORDER BY country ---- -FRA [200.0, 50.0] 50 50 -GRC [80.0, 30.0] 30 30 -TUR [100.0, 75.0] 75 75 +DataFusion error: Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # make sure that query below runs in multi partitions statement ok @@ -3875,17 +3870,15 @@ GROUP BY d; 0 4 0 9 -query III rowsort +query error SELECT d, FIRST_VALUE(c ORDER BY a DESC, c DESC) as first_a, LAST_VALUE(c ORDER BY c DESC) as last_c FROM multiple_ordered_table GROUP BY d; ---- -0 95 0 -1 90 4 -2 97 1 -3 99 15 -4 98 9 +DataFusion error: Internal error: Input field name last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.a DESC NULLS FIRST, multiple_ordered_table.c ASC NULLS LAST] does not match with the projection expression first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.a DESC NULLS FIRST, multiple_ordered_table.c DESC NULLS FIRST]. +This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + query TT EXPLAIN SELECT c diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 3be5c1b1c370e..17934f706ca57 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3451,7 +3451,7 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]] 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 2), input_partitions=2 -07)------------AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]] +07)------------AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]], ordering_mode=PartiallySorted([0]) 08)--------------CoalesceBatchesExec: target_batch_size=2 09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0)] 
10)------------------CoalesceBatchesExec: target_batch_size=2 @@ -3459,7 +3459,7 @@ physical_plan 12)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true 14)------------------CoalesceBatchesExec: target_batch_size=2 -15)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 +15)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC NULLS LAST 16)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 17)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt index d993515f4de99..c9c330130eee0 100644 --- a/datafusion/sqllogictest/test_files/subquery_sort.slt +++ b/datafusion/sqllogictest/test_files/subquery_sort.slt @@ -153,8 +153,9 @@ physical_plan 06)----------CoalesceBatchesExec: target_batch_size=8192 07)------------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4 08)--------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(sink_table.c1) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c2) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c3) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c9) 
ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]] -09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c9], file_type=csv, has_header=true +09)----------------SortExec: expr=[c3@2 DESC, c9@3 ASC NULLS LAST], preserve_partitioning=[true] +10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c9], file_type=csv, has_header=true query TI From 9b7e94d9d2f3a2003c34b03a30dddaddfc84f015 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 1 Jul 2025 10:42:43 +0200 Subject: [PATCH 04/12] Fix reversing first_value, last_value Upon reversing, a schema and field mismatch would happen. 
--- datafusion/physical-expr/src/aggregate.rs | 40 +----- .../sqllogictest/test_files/aggregate.slt | 8 +- .../sqllogictest/test_files/group_by.slt | 121 +++++++++++------- 3 files changed, 80 insertions(+), 89 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs index 9175c01274cba..ef113c0193a75 100644 --- a/datafusion/physical-expr/src/aggregate.rs +++ b/datafusion/physical-expr/src/aggregate.rs @@ -223,6 +223,7 @@ impl AggregateExprBuilder { let return_field = fun.return_field(&input_exprs_fields)?; let is_nullable = fun.is_nullable(); + // TODO rename AggregateExprBuilder::alias to name let name = match alias { None => { return internal_err!( @@ -575,18 +576,10 @@ impl AggregateFunctionExpr { ReversedUDAF::NotSupported => None, ReversedUDAF::Identical => Some(self.clone()), ReversedUDAF::Reversed(reverse_udf) => { - let mut name = self.name().to_string(); - // If the function is changed, we need to reverse order_by clause as well - // i.e. 
First(a order by b asc null first) -> Last(a order by b desc null last) - if self.fun().name() != reverse_udf.name() { - replace_order_by_clause(&mut name); - } - replace_fn_name_clause(&mut name, self.fun.name(), reverse_udf.name()); - AggregateExprBuilder::new(reverse_udf, self.args.to_vec()) .order_by(self.order_bys.iter().map(|e| e.reverse()).collect()) .schema(Arc::new(self.schema.clone())) - .alias(name) + .alias(self.name()) .with_ignore_nulls(self.ignore_nulls) .with_distinct(self.is_distinct) .with_reversed(!self.is_reversed) @@ -684,32 +677,3 @@ impl PartialEq for AggregateFunctionExpr { .all(|(this_arg, other_arg)| this_arg.eq(other_arg)) } } - -fn replace_order_by_clause(order_by: &mut String) { - let suffixes = [ - (" DESC NULLS FIRST]", " ASC NULLS LAST]"), - (" ASC NULLS FIRST]", " DESC NULLS LAST]"), - (" DESC NULLS LAST]", " ASC NULLS FIRST]"), - (" ASC NULLS LAST]", " DESC NULLS FIRST]"), - ]; - - if let Some(start) = order_by.find("ORDER BY [") { - if let Some(end) = order_by[start..].find(']') { - let order_by_start = start + 9; - let order_by_end = start + end; - - let column_order = &order_by[order_by_start..=order_by_end]; - for (suffix, replacement) in suffixes { - if column_order.ends_with(suffix) { - let new_order = column_order.replace(suffix, replacement); - order_by.replace_range(order_by_start..=order_by_end, &new_order); - break; - } - } - } - } -} - -fn replace_fn_name_clause(aggr_name: &mut String, fn_name_old: &str, fn_name_new: &str) { - *aggr_name = aggr_name.replace(fn_name_old, fn_name_new); -} diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 17c15c1a65a7e..abb1c035ec7d1 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -6354,9 +6354,9 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]] 
02)--TableScan: convert_first_last_table projection=[c1, c3] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] +01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]] 02)--CoalescePartitionsExec -03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] +03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c3], output_orderings=[[c1@0 ASC NULLS LAST], [c3@1 ASC NULLS LAST]], file_type=csv, has_header=true @@ -6368,9 +6368,9 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]] 02)--TableScan: convert_first_last_table projection=[c1, c2] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] +01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]] 02)--CoalescePartitionsExec -03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] +03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 05)--------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c2], output_orderings=[[c1@0 ASC NULLS LAST], [c2@1 DESC]], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 9a271f1ed87c2..3d2c358ef5405 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2745,20 +2745,22 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +03)----SortExec: expr=[amount@1 DESC], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query T?RR rowsort SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -DataFusion error: Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +FRA [200.0, 50.0] 50 50 +GRC [80.0, 30.0] 30 30 +TUR [100.0, 75.0] 75 75 # test_reverse_aggregate_expr2 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2774,20 +2776,22 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query T?RR SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +GRC [30.0, 80.0] 30 30 +FRA [50.0, 200.0] 50 50 +TUR [75.0, 100.0] 75 75 # test_reverse_aggregate_expr3 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2804,20 +2808,22 @@ logical_plan 01)Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@2 as fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@3 as amounts] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] +03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query TRR? SELECT country, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts FROM sales_global GROUP BY country ---- -DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +GRC 30 30 [30.0, 80.0] +FRA 50 50 [50.0, 200.0] +TUR 75 75 [75.0, 100.0] # test_reverse_aggregate_expr4 # Ordering requirement by the ordering insensitive aggregators shouldn't have effect on @@ -3018,21 +3024,29 @@ logical_plan 02)--Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, ts, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] +02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as fv2] +04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 +07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +09)----------------SortExec: expr=[ts@1 ASC NULLS LAST], preserve_partitioning=[false] +10)------------------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query TRR SELECT country, FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global GROUP BY country ORDER BY country ---- -DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. -This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +FRA 50 50 +GRC 30 30 +TUR 75 75 # make sure that batch size is small. 
So that query below runs in multi partitions # row number of the sales_global is 5. Hence we choose batch size 4 to make is smaller. @@ -3077,18 +3091,21 @@ logical_plan 01)Projection: first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[ts, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. -02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@0 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv2] +02)--AggregateExec: mode=Final, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +05)--------SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] +06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +07)------------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query RR SELECT FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global ---- 
-DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. -This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +30 30 # ARRAY_AGG should work in multiple partitions query TT @@ -3210,11 +3227,19 @@ logical_plan 02)--Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] +02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] +04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +05)--------CoalesceBatchesExec: target_batch_size=4 +06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 +07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +08)--------------SortExec: expr=[amount@1 DESC], preserve_partitioning=[true] +09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +10)------------------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query T?RR SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 @@ -3222,9 +3247,9 @@ SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, GROUP BY country ORDER BY country ---- -DataFusion error: Internal error: Input field name 
last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. -This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +FRA [200.0, 50.0] 50 50 +GRC [80.0, 30.0] 30 30 +TUR [100.0, 75.0] 75 75 # make sure that query below runs in multi partitions statement ok @@ -3854,7 +3879,7 @@ physical_plan 02)--AggregateExec: mode=FinalPartitioned, gby=[d@0 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]] 03)----CoalesceBatchesExec: target_batch_size=2 04)------RepartitionExec: partitioning=Hash([d@0], 8), input_partitions=8 -05)--------AggregateExec: mode=Partial, gby=[d@2 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]] +05)--------AggregateExec: mode=Partial, gby=[d@2 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]] 06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true @@ -3870,15 +3895,17 @@ GROUP BY d; 0 4 0 9 -query error +query III rowsort SELECT d, FIRST_VALUE(c ORDER BY a DESC, c DESC) as first_a, LAST_VALUE(c ORDER BY c DESC) as last_c FROM multiple_ordered_table GROUP BY d; ---- -DataFusion error: Internal error: Input field name last_value(multiple_ordered_table.c) ORDER BY 
[multiple_ordered_table.a DESC NULLS FIRST, multiple_ordered_table.c ASC NULLS LAST] does not match with the projection expression first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.a DESC NULLS FIRST, multiple_ordered_table.c DESC NULLS FIRST]. -This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +0 95 0 +1 90 4 +2 97 1 +3 99 15 +4 98 9 query TT EXPLAIN SELECT c From 3bacde9fa582950ce9dd1276415e26aa34f3f3a7 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 2 Jul 2025 22:02:27 +0200 Subject: [PATCH 05/12] Revert "Fix reversing first_value, last_value" This reverts commit 9b7e94d9d2f3a2003c34b03a30dddaddfc84f015. --- datafusion/physical-expr/src/aggregate.rs | 40 +++++- .../sqllogictest/test_files/aggregate.slt | 8 +- .../sqllogictest/test_files/group_by.slt | 121 +++++++----------- 3 files changed, 89 insertions(+), 80 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs index ef113c0193a75..9175c01274cba 100644 --- a/datafusion/physical-expr/src/aggregate.rs +++ b/datafusion/physical-expr/src/aggregate.rs @@ -223,7 +223,6 @@ impl AggregateExprBuilder { let return_field = fun.return_field(&input_exprs_fields)?; let is_nullable = fun.is_nullable(); - // TODO rename AggregateExprBuilder::alias to name let name = match alias { None => { return internal_err!( @@ -576,10 +575,18 @@ impl AggregateFunctionExpr { ReversedUDAF::NotSupported => None, ReversedUDAF::Identical => Some(self.clone()), ReversedUDAF::Reversed(reverse_udf) => { + let mut name = self.name().to_string(); + // If the function is changed, we need to reverse order_by clause as well + // i.e. 
First(a order by b asc null first) -> Last(a order by b desc null last) + if self.fun().name() != reverse_udf.name() { + replace_order_by_clause(&mut name); + } + replace_fn_name_clause(&mut name, self.fun.name(), reverse_udf.name()); + AggregateExprBuilder::new(reverse_udf, self.args.to_vec()) .order_by(self.order_bys.iter().map(|e| e.reverse()).collect()) .schema(Arc::new(self.schema.clone())) - .alias(self.name()) + .alias(name) .with_ignore_nulls(self.ignore_nulls) .with_distinct(self.is_distinct) .with_reversed(!self.is_reversed) @@ -677,3 +684,32 @@ impl PartialEq for AggregateFunctionExpr { .all(|(this_arg, other_arg)| this_arg.eq(other_arg)) } } + +fn replace_order_by_clause(order_by: &mut String) { + let suffixes = [ + (" DESC NULLS FIRST]", " ASC NULLS LAST]"), + (" ASC NULLS FIRST]", " DESC NULLS LAST]"), + (" DESC NULLS LAST]", " ASC NULLS FIRST]"), + (" ASC NULLS LAST]", " DESC NULLS FIRST]"), + ]; + + if let Some(start) = order_by.find("ORDER BY [") { + if let Some(end) = order_by[start..].find(']') { + let order_by_start = start + 9; + let order_by_end = start + end; + + let column_order = &order_by[order_by_start..=order_by_end]; + for (suffix, replacement) in suffixes { + if column_order.ends_with(suffix) { + let new_order = column_order.replace(suffix, replacement); + order_by.replace_range(order_by_start..=order_by_end, &new_order); + break; + } + } + } + } +} + +fn replace_fn_name_clause(aggr_name: &mut String, fn_name_old: &str, fn_name_new: &str) { + *aggr_name = aggr_name.replace(fn_name_old, fn_name_new); +} diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index abb1c035ec7d1..17c15c1a65a7e 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -6354,9 +6354,9 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]] 
02)--TableScan: convert_first_last_table projection=[c1, c3] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]] +01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] 02)--CoalescePartitionsExec -03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]] +03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c3], output_orderings=[[c1@0 ASC NULLS LAST], [c3@1 ASC NULLS LAST]], file_type=csv, has_header=true @@ -6368,9 +6368,9 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]] 02)--TableScan: convert_first_last_table projection=[c1, c2] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]] +01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] 02)--CoalescePartitionsExec -03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]] +03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 05)--------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c2], output_orderings=[[c1@0 ASC NULLS LAST], [c2@1 DESC]], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 3d2c358ef5405..9a271f1ed87c2 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2745,22 +2745,20 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan -01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -03)----SortExec: expr=[amount@1 DESC], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name 
last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query T?RR rowsort +query error SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -FRA [200.0, 50.0] 50 50 -GRC [80.0, 30.0] 30 30 -TUR [100.0, 75.0] 75 75 +DataFusion error: Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. +This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # test_reverse_aggregate_expr2 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2776,22 +2774,20 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan -01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as 
amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query T?RR +query error SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -GRC [30.0, 80.0] 30 30 -FRA [50.0, 200.0] 50 50 -TUR [75.0, 100.0] 75 75 +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # test_reverse_aggregate_expr3 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2808,22 +2804,20 @@ logical_plan 01)Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan -01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@2 as fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@3 as amounts] -02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] -03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
+02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query TRR? +query error SELECT country, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts FROM sales_global GROUP BY country ---- -GRC 30 30 [30.0, 80.0] -FRA 50 50 [50.0, 200.0] -TUR 75 75 [75.0, 100.0] +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. +This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # test_reverse_aggregate_expr4 # Ordering requirement by the ordering insensitive aggregators shouldn't have effect on @@ -3024,29 +3018,21 @@ logical_plan 02)--Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, ts, amount] -physical_plan -01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] -02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as fv2] -04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], 
last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 -07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -09)----------------SortExec: expr=[ts@1 ASC NULLS LAST], preserve_partitioning=[false] -10)------------------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query TRR +query error SELECT country, FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global GROUP BY country ORDER BY country ---- -FRA 50 50 -GRC 30 30 -TUR 75 75 +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. +This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # make sure that batch size is small. So that query below runs in multi partitions # row number of the sales_global is 5. Hence we choose batch size 4 to make is smaller. 
@@ -3091,21 +3077,18 @@ logical_plan 01)Projection: first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[ts, amount] -physical_plan -01)ProjectionExec: expr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@0 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv2] -02)--AggregateExec: mode=Final, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -03)----CoalescePartitionsExec -04)------AggregateExec: mode=Partial, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -05)--------SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] -06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -07)------------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. 
+02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query RR +query error SELECT FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global ---- -30 30 +DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. +This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # ARRAY_AGG should work in multiple partitions query TT @@ -3227,19 +3210,11 @@ logical_plan 02)--Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, amount] -physical_plan -01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] -02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] -04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) 
ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -05)--------CoalesceBatchesExec: target_batch_size=4 -06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 -07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] -08)--------------SortExec: expr=[amount@1 DESC], preserve_partitioning=[true] -09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -10)------------------DataSourceExec: partitions=1, partition_sizes=[1] +physical_plan_error +01)Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. +02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -query T?RR +query error SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 @@ -3247,9 +3222,9 @@ SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, GROUP BY country ORDER BY country ---- -FRA [200.0, 50.0] 50 50 -GRC [80.0, 30.0] 30 30 -TUR [100.0, 75.0] 75 75 +DataFusion error: Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + # make sure that query below runs in multi partitions statement ok @@ -3879,7 +3854,7 @@ physical_plan 02)--AggregateExec: mode=FinalPartitioned, gby=[d@0 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]] 03)----CoalesceBatchesExec: target_batch_size=2 04)------RepartitionExec: partitioning=Hash([d@0], 8), input_partitions=8 -05)--------AggregateExec: mode=Partial, gby=[d@2 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]] +05)--------AggregateExec: mode=Partial, gby=[d@2 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]] 06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true @@ -3895,17 +3870,15 @@ GROUP BY d; 0 4 0 9 -query III rowsort +query error SELECT d, FIRST_VALUE(c ORDER BY a DESC, c DESC) as first_a, LAST_VALUE(c ORDER BY c DESC) as last_c FROM multiple_ordered_table GROUP BY d; ---- -0 95 0 -1 90 4 -2 97 1 -3 99 15 -4 98 9 +DataFusion error: Internal error: Input field name last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.a DESC NULLS FIRST, multiple_ordered_table.c ASC NULLS LAST] does not match with the projection expression first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.a DESC NULLS FIRST, 
multiple_ordered_table.c DESC NULLS FIRST]. +This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker + query TT EXPLAIN SELECT c From 90db3d2ecf04cd448c4dbc9ecea162b908598f79 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 2 Jul 2025 22:05:29 +0200 Subject: [PATCH 06/12] sort array_agg input the old way whenever possible --- datafusion/ffi/src/udaf/mod.rs | 4 + .../functions-aggregate-common/src/order.rs | 6 +- .../functions-aggregate/src/array_agg.rs | 2 +- .../physical-plan/src/aggregates/mod.rs | 3 +- .../sqllogictest/test_files/aggregate.slt | 4 +- .../sqllogictest/test_files/distinct_on.slt | 5 +- .../sqllogictest/test_files/group_by.slt | 161 ++++++++++-------- datafusion/sqllogictest/test_files/joins.slt | 4 +- .../sqllogictest/test_files/subquery_sort.slt | 5 +- 9 files changed, 110 insertions(+), 84 deletions(-) diff --git a/datafusion/ffi/src/udaf/mod.rs b/datafusion/ffi/src/udaf/mod.rs index eb7a408ab1788..0218cb10226c9 100644 --- a/datafusion/ffi/src/udaf/mod.rs +++ b/datafusion/ffi/src/udaf/mod.rs @@ -561,6 +561,7 @@ impl AggregateUDFImpl for ForeignAggregateUDF { pub enum FFI_AggregateOrderSensitivity { Insensitive, HardRequirement, + SoftRequirement, Beneficial, } @@ -569,6 +570,7 @@ impl From for AggregateOrderSensitivity { match value { FFI_AggregateOrderSensitivity::Insensitive => Self::Insensitive, FFI_AggregateOrderSensitivity::HardRequirement => Self::HardRequirement, + FFI_AggregateOrderSensitivity::SoftRequirement => Self::SoftRequirement, FFI_AggregateOrderSensitivity::Beneficial => Self::Beneficial, } } @@ -579,6 +581,7 @@ impl From for FFI_AggregateOrderSensitivity { match value { AggregateOrderSensitivity::Insensitive => Self::Insensitive, AggregateOrderSensitivity::HardRequirement => Self::HardRequirement, + AggregateOrderSensitivity::SoftRequirement => Self::SoftRequirement, AggregateOrderSensitivity::Beneficial => Self::Beneficial, } } @@ -720,6 +723,7 @@ mod 
tests { fn test_round_trip_all_order_sensitivities() { test_round_trip_order_sensitivity(AggregateOrderSensitivity::Insensitive); test_round_trip_order_sensitivity(AggregateOrderSensitivity::HardRequirement); + test_round_trip_order_sensitivity(AggregateOrderSensitivity::SoftRequirement); test_round_trip_order_sensitivity(AggregateOrderSensitivity::Beneficial); } } diff --git a/datafusion/functions-aggregate-common/src/order.rs b/datafusion/functions-aggregate-common/src/order.rs index bfa6e39138f9e..5b96ac0b0a798 100644 --- a/datafusion/functions-aggregate-common/src/order.rs +++ b/datafusion/functions-aggregate-common/src/order.rs @@ -25,6 +25,10 @@ pub enum AggregateOrderSensitivity { /// The aggregator can not produce a correct result unless its ordering /// requirement is satisfied. HardRequirement, + /// Indicates that the aggregate expression strongly prefers the input to be ordered. + /// The aggregator can produce its result correctly regardless of the input ordering, + /// This is a similar to, but stronger than, `Beneficial`. + SoftRequirement, /// Indicates that ordering is beneficial for the aggregate expression in terms /// of evaluation efficiency. 
The aggregator can produce its result efficiently /// when its required ordering is satisfied; however, it can still produce the @@ -38,7 +42,7 @@ impl AggregateOrderSensitivity { } pub fn is_beneficial(&self) -> bool { - self.eq(&AggregateOrderSensitivity::Beneficial) + matches!(self, Self::SoftRequirement | Self::Beneficial) } pub fn hard_requires(&self) -> bool { diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 49dee064d6d6a..5f5738d153123 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -150,7 +150,7 @@ impl AggregateUDFImpl for ArrayAgg { } fn order_sensitivity(&self) -> AggregateOrderSensitivity { - AggregateOrderSensitivity::Beneficial + AggregateOrderSensitivity::SoftRequirement } fn with_beneficial_ordering( diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 4bdc3a37b5033..f3ae819b5ee79 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1084,11 +1084,12 @@ fn get_aggregate_expr_req( match aggr_expr.order_sensitivity() { AggregateOrderSensitivity::Insensitive => return None, AggregateOrderSensitivity::HardRequirement => {} - AggregateOrderSensitivity::Beneficial => { + AggregateOrderSensitivity::SoftRequirement => { if !include_soft_requirement { return None; } } + AggregateOrderSensitivity::Beneficial => return None, } let mut sort_exprs = aggr_expr.order_bys().to_vec(); diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 17c15c1a65a7e..63a19338811a4 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -6354,7 +6354,7 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS 
FIRST]]] 02)--TableScan: convert_first_last_table projection=[c1, c3] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] +01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]] 02)--CoalescePartitionsExec 03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 @@ -6368,7 +6368,7 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]] 02)--TableScan: convert_first_last_table projection=[c1, c2] physical_plan -01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] +01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]] 02)--CoalescePartitionsExec 03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index 371007eac7f85..b4a491619e893 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -101,9 +101,8 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, 
aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]] -08)--------------SortExec: expr=[c3@2 ASC NULLS LAST], preserve_partitioning=[true] -09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true # ON expressions are not a sub-set of the ORDER BY expressions query error SELECT DISTINCT ON expressions must match initial ORDER BY expressions diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 9a271f1ed87c2..ebe78f4f875df 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2019,14 +2019,13 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([col0@0, col1@1, col2@2], 4), input_partitions=4 -07)------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]], ordering_mode=PartiallySorted([0]) -08)--------------SortExec: expr=[col0@3 ASC NULLS LAST], preserve_partitioning=[true] -09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -10)------------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 
as col0, col1@1 as col1] -11)--------------------CoalesceBatchesExec: target_batch_size=8192 -12)----------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(col0@0, col0@0)] -13)------------------------DataSourceExec: partitions=1, partition_sizes=[3] -14)------------------------DataSourceExec: partitions=1, partition_sizes=[3] +07)------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1] +10)------------------CoalesceBatchesExec: target_batch_size=8192 +11)--------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(col0@0, col0@0)] +12)----------------------DataSourceExec: partitions=1, partition_sizes=[3] +13)----------------------DataSourceExec: partitions=1, partition_sizes=[3] # Columns in the table are a,b,c,d. Source is DataSourceExec which is ordered by # a,b,c column. Column a has cardinality 2, column b has cardinality 4. @@ -2507,15 +2506,15 @@ TUR [100.0, 75.0] 175 # test_ordering_sensitive_aggregation3 # When different aggregators have conflicting requirements, we cannot satisfy all of them in current implementation. # test below should raise Plan Error. -query ??? +query ??? 
rowsort SELECT ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, ARRAY_AGG(s.amount ORDER BY s.amount ASC) AS amounts2, ARRAY_AGG(s.amount ORDER BY s.sn ASC) AS amounts3 FROM sales_global AS s GROUP BY s.country ---- -[200.0, 50.0] [50.0, 200.0] [50.0, 200.0] [100.0, 75.0] [75.0, 100.0] [75.0, 100.0] +[200.0, 50.0] [50.0, 200.0] [50.0, 200.0] [80.0, 30.0] [30.0, 80.0] [30.0, 80.0] # test_ordering_sensitive_aggregation4 @@ -2745,20 +2744,22 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +03)----SortExec: expr=[amount@1 DESC], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query T?RR rowsort SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -DataFusion error: Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +FRA [200.0, 50.0] 50 50 +GRC [80.0, 30.0] 30 30 +TUR [100.0, 75.0] 75 75 # test_reverse_aggregate_expr2 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2774,20 +2775,22 @@ logical_plan 01)Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] +03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query T?RR SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 FROM sales_global GROUP BY country ---- -DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +GRC [30.0, 80.0] 30 30 +FRA [50.0, 200.0] 50 50 +TUR [75.0, 100.0] 75 75 # test_reverse_aggregate_expr3 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2804,20 +2807,22 @@ logical_plan 01)Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS amounts 02)--Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]]] 03)----TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@2 as fv2, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@3 as amounts] +02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]] +03)----SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[false] +04)------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query TRR? SELECT country, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2, ARRAY_AGG(amount ORDER BY amount ASC) AS amounts FROM sales_global GROUP BY country ---- -DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +GRC 30 30 [30.0, 80.0] +FRA 50 50 [50.0, 200.0] +TUR 75 75 [75.0, 100.0] # test_reverse_aggregate_expr4 # Ordering requirement by the ordering insensitive aggregators shouldn't have effect on @@ -2869,8 +2874,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as lv1, sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@3 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -03)----SortExec: expr=[ts@1 DESC], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] query TRRR rowsort SELECT country, FIRST_VALUE(amount ORDER BY ts DESC) as fv1, @@ -2903,8 +2907,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as lv1, sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@3 as sum1] 02)--AggregateExec: mode=Single, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST], sum(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] -03)----SortExec: expr=[ts@1 DESC], preserve_partitioning=[false] -04)------DataSourceExec: partitions=1, partition_sizes=[1] 
+03)----DataSourceExec: partitions=1, partition_sizes=[1] query TRRR rowsort SELECT country, FIRST_VALUE(amount ORDER BY ts DESC) as fv1, @@ -2940,12 +2943,11 @@ physical_plan 01)SortExec: expr=[sn@2 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[zip_code@1 as zip_code, country@2 as country, sn@0 as sn, ts@3 as ts, currency@4 as currency, last_value(e.amount) ORDER BY [e.sn ASC NULLS LAST]@5 as last_rate] 03)----AggregateExec: mode=Single, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency], aggr=[last_value(e.amount) ORDER BY [e.sn ASC NULLS LAST]] -04)------SortExec: expr=[sn@5 ASC NULLS LAST], preserve_partitioning=[false] -05)--------ProjectionExec: expr=[zip_code@2 as zip_code, country@3 as country, sn@4 as sn, ts@5 as ts, currency@6 as currency, sn@0 as sn, amount@1 as amount] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1, projection=[sn@0, amount@3, zip_code@4, country@5, sn@6, ts@7, currency@8] -08)--------------DataSourceExec: partitions=1, partition_sizes=[1] -09)--------------DataSourceExec: partitions=1, partition_sizes=[1] +04)------ProjectionExec: expr=[zip_code@2 as zip_code, country@3 as country, sn@4 as sn, ts@5 as ts, currency@6 as currency, sn@0 as sn, amount@1 as amount] +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1, projection=[sn@0, amount@3, zip_code@4, country@5, sn@6, ts@7, currency@8] +07)------------DataSourceExec: partitions=1, partition_sizes=[1] +08)------------DataSourceExec: partitions=1, partition_sizes=[1] query ITIPTR rowsort SELECT s.zip_code, s.country, s.sn, s.ts, s.currency, LAST_VALUE(e.amount ORDER BY e.sn) AS last_rate @@ -2990,8 +2992,7 @@ physical_plan 06)----------RepartitionExec: 
partitioning=Hash([country@0], 8), input_partitions=8 07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] -09)----------------SortExec: expr=[ts@1 ASC NULLS LAST], preserve_partitioning=[false] -10)------------------DataSourceExec: partitions=1, partition_sizes=[1] +09)----------------DataSourceExec: partitions=1, partition_sizes=[1] query TRR SELECT country, FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, @@ -3018,21 +3019,28 @@ logical_plan 02)--Projection: sales_global.country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, ts, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] +02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as fv2] +04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 +07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +09)----------------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query TRR SELECT country, FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global GROUP BY country ORDER BY country ---- -DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. -This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +FRA 50 50 +GRC 30 30 +TUR 75 75 # make sure that batch size is small. So that query below runs in multi partitions # row number of the sales_global is 5. 
Hence we choose batch size 4 to make is smaller. @@ -3055,9 +3063,8 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]] -05)--------SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] -06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 -07)------------DataSourceExec: partitions=1, partition_sizes=[1] +05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +06)----------DataSourceExec: partitions=1, partition_sizes=[1] query RR SELECT FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, @@ -3077,18 +3084,20 @@ logical_plan 01)Projection: first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST] AS fv2 02)--Aggregate: groupBy=[[]], aggr=[[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]] 03)----TableScan: sales_global projection=[ts, amount] -physical_plan_error -01)Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)ProjectionExec: expr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@0 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@1 as fv2] +02)--AggregateExec: mode=Final, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]] +05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +06)----------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query RR SELECT FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, LAST_VALUE(amount ORDER BY ts DESC) AS fv2 FROM sales_global ---- -DataFusion error: Internal error: Input field name first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST] does not match with the projection expression last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +30 30 # ARRAY_AGG should work in multiple partitions query TT @@ -3210,11 +3219,19 @@ logical_plan 02)--Projection: sales_global.country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST] AS fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] AS fv2 03)----Aggregate: groupBy=[[sales_global.country]], aggr=[[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]] 04)------TableScan: sales_global projection=[country, amount] -physical_plan_error -01)Internal error: Input field name last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. 
-02)This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker +physical_plan +01)SortPreservingMergeExec: [country@0 ASC NULLS LAST] +02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2] +04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +05)--------CoalesceBatchesExec: target_batch_size=4 +06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8 +07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]] +08)--------------SortExec: expr=[amount@1 DESC], preserve_partitioning=[true] +09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +10)------------------DataSourceExec: partitions=1, partition_sizes=[1] -query error +query T?RR SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 @@ -3222,9 +3239,9 @@ SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, GROUP BY country ORDER BY country ---- -DataFusion error: Internal error: Input field name 
last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST] does not match with the projection expression first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]. -This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +FRA [200.0, 50.0] 50 50 +GRC [80.0, 30.0] 30 30 +TUR [100.0, 75.0] 75 75 # make sure that query below runs in multi partitions statement ok @@ -3870,15 +3887,17 @@ GROUP BY d; 0 4 0 9 -query error +query III rowsort SELECT d, FIRST_VALUE(c ORDER BY a DESC, c DESC) as first_a, LAST_VALUE(c ORDER BY c DESC) as last_c FROM multiple_ordered_table GROUP BY d; ---- -DataFusion error: Internal error: Input field name last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.a DESC NULLS FIRST, multiple_ordered_table.c ASC NULLS LAST] does not match with the projection expression first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.a DESC NULLS FIRST, multiple_ordered_table.c DESC NULLS FIRST]. 
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker - +0 95 0 +1 90 4 +2 97 1 +3 99 15 +4 98 9 query TT EXPLAIN SELECT c diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 17934f706ca57..3be5c1b1c370e 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3451,7 +3451,7 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]] 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 2), input_partitions=2 -07)------------AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]], ordering_mode=PartiallySorted([0]) +07)------------AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]] 08)--------------CoalesceBatchesExec: target_batch_size=2 09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0)] 10)------------------CoalesceBatchesExec: target_batch_size=2 @@ -3459,7 +3459,7 @@ physical_plan 12)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true 14)------------------CoalesceBatchesExec: target_batch_size=2 -15)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC NULLS LAST +15)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 16)----------------------RepartitionExec: 
partitioning=RoundRobinBatch(2), input_partitions=1 17)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt index c9c330130eee0..d993515f4de99 100644 --- a/datafusion/sqllogictest/test_files/subquery_sort.slt +++ b/datafusion/sqllogictest/test_files/subquery_sort.slt @@ -153,9 +153,8 @@ physical_plan 06)----------CoalesceBatchesExec: target_batch_size=8192 07)------------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4 08)--------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(sink_table.c1) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c2) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c3) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c9) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]] -09)----------------SortExec: expr=[c3@2 DESC, c9@3 ASC NULLS LAST], preserve_partitioning=[true] -10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c9], file_type=csv, has_header=true +09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c9], file_type=csv, has_header=true query TI From 
5f00ec451aa79c9fb35666963ef6a353ece86730 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 2 Jul 2025 22:09:07 +0200 Subject: [PATCH 07/12] revert some now unnecessary change --- .../physical-plan/src/aggregates/mod.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index f3ae819b5ee79..9ce005218f2b8 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1469,7 +1469,7 @@ mod tests { use datafusion_execution::config::SessionConfig; use datafusion_execution::memory_pool::FairSpillPool; use datafusion_execution::runtime_env::RuntimeEnvBuilder; - use datafusion_expr::test::function_stub::max_udaf; + use datafusion_functions_aggregate::array_agg::array_agg_udaf; use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::first_last::{first_value_udaf, last_value_udaf}; @@ -2455,16 +2455,13 @@ mod tests { let mut aggr_exprs = order_by_exprs .into_iter() .map(|order_by_expr| { - AggregateExprBuilder::new( - max_udaf(), // any UDAF not using Beneficial order sensitivity - vec![Arc::clone(col_a)], - ) - .alias("a") - .order_by(order_by_expr) - .schema(Arc::clone(&test_schema)) - .build() - .map(Arc::new) - .unwrap() + AggregateExprBuilder::new(array_agg_udaf(), vec![Arc::clone(col_a)]) + .alias("a") + .order_by(order_by_expr) + .schema(Arc::clone(&test_schema)) + .build() + .map(Arc::new) + .unwrap() }) .collect::>(); let group_by = PhysicalGroupBy::new_single(vec![]); From 134da5aa0799f7bb6c17aa3c2021d43b5c5346de Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 24 Jul 2025 20:01:42 +0200 Subject: [PATCH 08/12] Improve doc for SoftRequiement Co-authored-by: Andrew Lamb --- datafusion/functions-aggregate-common/src/order.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 
deletions(-) diff --git a/datafusion/functions-aggregate-common/src/order.rs b/datafusion/functions-aggregate-common/src/order.rs index 5b96ac0b0a798..bd7021b7bb365 100644 --- a/datafusion/functions-aggregate-common/src/order.rs +++ b/datafusion/functions-aggregate-common/src/order.rs @@ -25,9 +25,16 @@ pub enum AggregateOrderSensitivity { /// The aggregator can not produce a correct result unless its ordering /// requirement is satisfied. HardRequirement, - /// Indicates that the aggregate expression strongly prefers the input to be ordered. - /// The aggregator can produce its result correctly regardless of the input ordering, - /// This is a similar to, but stronger than, `Beneficial`. + /// Indicates that the aggregator is more efficient when the input is ordered + /// but can still produce its result correctly regardless of the input ordering. + /// This is a similar to, but stronger than, [`Self::Beneficial`]. + /// + /// Similarly to [`Self::HardRequirement`], when possible DataFusion will insert + /// a `SortExec`, to reorder the input to match the SoftRequirement. However, + /// when such a `SortExec` cannot be inserted, (for example, due to conflicting + /// [`Self::HardRequirements`] with other ordered aggregates in the query), + /// the aggregate function will still execute, without the preferred order, unlike with + /// with [`Self::HardRequirement`] SoftRequirement, /// Indicates that ordering is beneficial for the aggregate expression in terms /// of evaluation efficiency. 
The aggregator can produce its result efficiently From 1420f8df329faf448934be321e0f356b8e59eb82 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 24 Jul 2025 20:02:13 +0200 Subject: [PATCH 09/12] Add comment for include_soft_requirement Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/aggregates/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 9ce005218f2b8..f9dbbf6eec4ad 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1157,6 +1157,9 @@ pub fn get_finer_aggregate_exprs_requirement( ) -> Result> { let mut requirement = None; + // First try and find a match for all hard and soft requirements. + // If a match can't be found, try a second time just matching hard + // requirements. for include_soft_requirement in [false, true] { for aggr_expr in aggr_exprs.iter_mut() { let Some(aggr_req) = get_aggregate_expr_req( From 5c1bce9da7b04a0ec62dbc80ae438720ffd3956c Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 24 Jul 2025 20:04:42 +0200 Subject: [PATCH 10/12] Document include_soft_requirement param --- datafusion/physical-plan/src/aggregates/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index f9dbbf6eec4ad..3d92e1044500c 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1063,6 +1063,11 @@ fn create_schema( /// physical GROUP BY expression. /// - `agg_mode`: A reference to an `AggregateMode` instance representing the /// mode of aggregation. +/// - `include_soft_requirement`: When `false`, only hard requirements are +/// considered, as indicated by [`AggregateFunctionExpr::order_sensitivity`] +/// returning [`AggregateOrderSensitivity::HardRequirement`]. 
+/// Otherwise, also soft requirements ([`AggregateOrderSensitivity::SoftRequirement`]) +/// are considered. /// /// # Returns /// From 8a1abe885ba6b6a176f68e0cf7e4e6e1d4a97bec Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 24 Jul 2025 20:06:18 +0200 Subject: [PATCH 11/12] fmt --- datafusion/functions-aggregate-common/src/order.rs | 12 ++++++------ datafusion/physical-plan/src/aggregates/mod.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/functions-aggregate-common/src/order.rs b/datafusion/functions-aggregate-common/src/order.rs index bd7021b7bb365..e4566dd6a08cb 100644 --- a/datafusion/functions-aggregate-common/src/order.rs +++ b/datafusion/functions-aggregate-common/src/order.rs @@ -25,15 +25,15 @@ pub enum AggregateOrderSensitivity { /// The aggregator can not produce a correct result unless its ordering /// requirement is satisfied. HardRequirement, - /// Indicates that the aggregator is more efficient when the input is ordered + /// Indicates that the aggregator is more efficient when the input is ordered /// but can still produce its result correctly regardless of the input ordering. /// This is a similar to, but stronger than, [`Self::Beneficial`]. /// - /// Similarly to [`Self::HardRequirement`], when possible DataFusion will insert - /// a `SortExec`, to reorder the input to match the SoftRequirement. However, - /// when such a `SortExec` cannot be inserted, (for example, due to conflicting - /// [`Self::HardRequirements`] with other ordered aggregates in the query), - /// the aggregate function will still execute, without the preferred order, unlike with + /// Similarly to [`Self::HardRequirement`], when possible DataFusion will insert + /// a `SortExec`, to reorder the input to match the SoftRequirement. 
However, + /// when such a `SortExec` cannot be inserted, (for example, due to conflicting + /// [`Self::HardRequirements`] with other ordered aggregates in the query), + /// the aggregate function will still execute, without the preferred order, unlike with /// with [`Self::HardRequirement`] SoftRequirement, /// Indicates that ordering is beneficial for the aggregate expression in terms diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 3d92e1044500c..8e404ca2fe73f 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1162,8 +1162,8 @@ pub fn get_finer_aggregate_exprs_requirement( ) -> Result> { let mut requirement = None; - // First try and find a match for all hard and soft requirements. - // If a match can't be found, try a second time just matching hard + // First try and find a match for all hard and soft requirements. + // If a match can't be found, try a second time just matching hard // requirements. for include_soft_requirement in [false, true] { for aggr_expr in aggr_exprs.iter_mut() { From a1031e040a54311484393cec3908e6a627974e70 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 24 Jul 2025 20:08:27 +0200 Subject: [PATCH 12/12] doc fix --- datafusion/functions-aggregate-common/src/order.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-aggregate-common/src/order.rs b/datafusion/functions-aggregate-common/src/order.rs index e4566dd6a08cb..0908396d78341 100644 --- a/datafusion/functions-aggregate-common/src/order.rs +++ b/datafusion/functions-aggregate-common/src/order.rs @@ -22,18 +22,18 @@ pub enum AggregateOrderSensitivity { /// Ordering at the input is not important for the result of the aggregator. Insensitive, /// Indicates that the aggregate expression has a hard requirement on ordering. 
- /// The aggregator can not produce a correct result unless its ordering + /// The aggregator cannot produce a correct result unless its ordering /// requirement is satisfied. HardRequirement, /// Indicates that the aggregator is more efficient when the input is ordered /// but can still produce its result correctly regardless of the input ordering. - /// This is a similar to, but stronger than, [`Self::Beneficial`]. + /// This is similar to, but stronger than, [`Self::Beneficial`]. /// /// Similarly to [`Self::HardRequirement`], when possible DataFusion will insert /// a `SortExec`, to reorder the input to match the SoftRequirement. However, /// when such a `SortExec` cannot be inserted, (for example, due to conflicting - /// [`Self::HardRequirements`] with other ordered aggregates in the query), - /// the aggregate function will still execute, without the preferred order, unlike with + /// [`Self::HardRequirement`] with other ordered aggregates in the query), + /// the aggregate function will still execute, without the preferred order, unlike /// with [`Self::HardRequirement`] SoftRequirement, /// Indicates that ordering is beneficial for the aggregate expression in terms