From ad67858ae93915cb653a710eb40910e3dc4b0e8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 11:04:26 +0000 Subject: [PATCH 01/12] first pass at `semester()` + unit tests --- r/R/dplyr-funcs-datetime.R | 9 +++++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 13 +++++++++++++ 2 files changed, 22 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index d9bfcbd7e3e..331cc95423a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -153,5 +153,14 @@ register_bindings_datetime <- function() { } x$type()$timezone() + register_binding("semester", function(x, with_year = FALSE) { + month <- call_binding("month", x) + semester <- call_binding("if_else", month <= 6, 1L, 2L) + if (with_year) { + year <- call_binding("year", x) + return(year + semester / 10) + } else { + return(semester) + } }) } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e3092f3455c..a3ddc616181 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -747,5 +747,18 @@ test_that("extract tz", { expect_error( call_binding("tz", Expression$scalar("2020-10-01")), "timezone extraction for objects of class `string` not supported in Arrow" +test_that("semester", { + test_df <- tibble( + month = c(1:12, NA), + month_char_pad = ifelse(month < 10, paste0("0", month), month), + dates = as.Date(paste0("2021-", month_char_pad, "-15")) + ) + + compare_dplyr_binding( + .input %>% + mutate(sem_wo_year = semester(dates), + sem_w_year = semester(dates, with_year = TRUE)) %>% + collect(), + test_df ) }) From 25460cc1342a19beb0b122789f5e0b5792242611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 12:39:31 +0000 Subject: [PATCH 02/12] added NEWS bullet point --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index e69d46275fe..6586938a5bc 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -21,6 +21,7 @@ * `lubridate`: * `tz()` to extract/get timezone +* `lubridate` features: `semester()` # arrow 7.0.0 From 583d515288d458edd0d3af1e13de906cc8cff4b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 15 Feb 2022 13:42:38 +0000 Subject: [PATCH 03/12] typo --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 6586938a5bc..0fa990ecf72 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,7 @@ * `lubridate`: * `tz()` to extract/get timezone * `lubridate` features: `semester()` +* Additional `lubridate` features: `semester()`. # arrow 7.0.0 From 77c3298ee68010289bd677c37b1980febb9f55b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Feb 2022 13:27:28 +0000 Subject: [PATCH 04/12] reorganise unit tests and add reference to ARROW-15701 --- r/tests/testthat/test-dplyr-funcs-datetime.R | 36 ++++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a3ddc616181..1d80f001252 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -748,12 +748,14 @@ test_that("extract tz", { call_binding("tz", Expression$scalar("2020-10-01")), "timezone extraction for objects of class `string` not supported in Arrow" test_that("semester", { +test_that("semester works with temporal types", { test_df <- tibble( - month = c(1:12, NA), - month_char_pad = ifelse(month < 10, paste0("0", month), month), - dates = as.Date(paste0("2021-", month_char_pad, "-15")) + month_as_int = c(1:12, NA), + month_as_char_pad = ifelse(month_as_int < 10, paste0("0", month_as_int), month_as_int), + dates = as.Date(paste0("2021-", month_as_char_pad, "-15")) ) + # test extraction from dates compare_dplyr_binding( .input %>% mutate(sem_wo_year = semester(dates), @@ -762,3 +764,31 @@ test_that("semester", { test_df ) }) + +test_that("semester errors with integers and characters", { + test_df <- tibble( + month_as_int = c(1:12, NA), + month_as_char_pad = ifelse(month_as_int < 10, paste0("0", month_as_int), month_as_int), + dates = as.Date(paste0("2021-", month_as_char_pad, "-15")) + ) + + # extraction from integers should error as we currently do not support setting + # month components with month, but this is supported by `lubridate::month()` + # this should no longer fail once https://issues.apache.org/jira/browse/ARROW-15701 + # is addressed + expect_error( + test_df %>% + arrow_table() %>% + mutate(sem_month_as_int = semester(month_as_int)) %>% + collect(), + regexp = "no kernel matching input types" + ) + + expect_error( + test_df %>% + arrow_table() %>% + mutate(sem_month_as_char_pad = semester(month_as_char_pad)) %>% + collect(), + regexp = "no kernel matching input types" + ) +}) From 522e2b24761c41bf39ee064f0b7550487f49b3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Feb 2022 15:40:25 +0000 Subject: [PATCH 05/12] a bit more specific about the error message --- r/tests/testthat/test-dplyr-funcs-datetime.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 1d80f001252..0af5a2a9b52 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -781,7 +781,8 @@ test_that("semester errors with integers and characters", { arrow_table() %>% mutate(sem_month_as_int = semester(month_as_int)) %>% collect(), - regexp = "no kernel matching input types" + regexp = "NotImplemented: Function 'month' has no kernel matching input types (array[int32])", + fixed = TRUE ) expect_error( @@ -789,6 +790,7 @@ test_that("semester errors with integers and characters", { arrow_table() %>% mutate(sem_month_as_char_pad = semester(month_as_char_pad)) %>% collect(), - regexp = "no kernel matching input types" + regexp = "NotImplemented: Function 'month' has no kernel matching input types (array[string])", + fixed = TRUE ) }) From 2aab652dad838f473a06c8e37233ca02f1d8d20e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Feb 2022 16:51:11 +0000 Subject: [PATCH 06/12] use `sprintf()` and add TODO --- r/tests/testthat/test-dplyr-funcs-datetime.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 0af5a2a9b52..4b30565a8f7 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -751,7 +751,7 @@ test_that("semester", { test_that("semester works with temporal types", { test_df <- tibble( month_as_int = c(1:12, NA), - month_as_char_pad = ifelse(month_as_int < 10, paste0("0", month_as_int), month_as_int), + month_as_char_pad = sprintf("%02i", month_as_int), dates = as.Date(paste0("2021-", month_as_char_pad, "-15")) ) @@ -768,13 +768,13 @@ test_that("semester works with temporal types", { test_that("semester errors with integers and characters", { test_df <- tibble( month_as_int = c(1:12, NA), - month_as_char_pad = ifelse(month_as_int < 10, paste0("0", month_as_int), month_as_int), + month_as_char_pad = sprintf("%02i", month_as_int), dates = as.Date(paste0("2021-", month_as_char_pad, "-15")) ) # extraction from integers should error as we currently do not support setting # month components with month, but this is supported by `lubridate::month()` - # this should no longer fail once https://issues.apache.org/jira/browse/ARROW-15701 + # TODO this should no longer fail once https://issues.apache.org/jira/browse/ARROW-15701 # is addressed expect_error( test_df %>% From eff83bdf3f59f58c976e15aac23a04fa1d8d102f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 16 Feb 2022 16:51:56 +0000 Subject: [PATCH 07/12] early pass at supporting integer inputs for `month()` * the logic is correct, but the implementation fails --- r/R/dplyr-funcs-datetime.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 331cc95423a..57d1f2113b5 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -115,6 +115,14 @@ register_bindings_datetime <- function() { return(Expression$create("strftime", x, options = list(format = format, locale = locale))) } + if (call_binding("is.integer", x)) { + if (call_binding_agg("all", call_binding("between", x, 1, 12))) { + return(x) + } else { + abort("Values are not in 1:12") + } + } + Expression$create("month", x) }) From 7a196c8512516007ad8d132ae21117e9b98d838a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 17 Feb 2022 13:10:18 +0000 Subject: [PATCH 08/12] simplified the unit testing blocks --- r/tests/testthat/test-dplyr-funcs-datetime.R | 27 +++++++------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 4b30565a8f7..818c7aa5eb5 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -749,13 +749,14 @@ test_that("extract tz", { "timezone extraction for objects of class `string` not supported in Arrow" test_that("semester", { test_that("semester works with temporal types", { +test_that("semester works with temporal types and integers", { test_df <- tibble( month_as_int = c(1:12, NA), month_as_char_pad = sprintf("%02i", month_as_int), dates = as.Date(paste0("2021-", month_as_char_pad, "-15")) ) - # test extraction from dates + # semester extraction from dates compare_dplyr_binding( .input %>% mutate(sem_wo_year = semester(dates), @@ -763,26 +764,16 @@ test_that("semester works with temporal types", { collect(), test_df ) -}) - -test_that("semester errors with integers and characters", { - test_df <- tibble( - month_as_int = c(1:12, NA), - month_as_char_pad = sprintf("%02i", month_as_int), - dates = as.Date(paste0("2021-", month_as_char_pad, "-15")) + # semester extraction from months as integers + compare_dplyr_binding( + .input %>% + mutate(sem_month_as_int = semester(month_as_int)) %>% + collect(), + test_df ) - # extraction from integers should error as we currently do not support setting - # month components with month, but this is supported by `lubridate::month()` - # TODO this should no longer fail once https://issues.apache.org/jira/browse/ARROW-15701 - # is addressed expect_error( - test_df %>% - arrow_table() %>% - mutate(sem_month_as_int = semester(month_as_int)) %>% - collect(), - regexp = "NotImplemented: Function 'month' has no kernel matching input types (array[int32])", - fixed = TRUE + call_binding("semester", Expression$scalar(1:13)) ) expect_error( From 01e0b3cde417600f1fc30d0a01f08e0d01981ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 17 Feb 2022 13:11:26 +0000 Subject: [PATCH 09/12] supporting R integers & arrow expressions --- r/R/dplyr-funcs-datetime.R | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 57d1f2113b5..821890500e1 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -116,10 +116,19 @@ register_bindings_datetime <- function() { } if (call_binding("is.integer", x)) { - if (call_binding_agg("all", call_binding("between", x, 1, 12))) { - return(x) + if (inherits(x, "Expression")) { + call_binding( + "if_else", + call_binding_agg("all", call_binding("between", x, 1, 12))$data, + x, + abort("bla: Values are not in 1:12") + ) } else { - abort("Values are not in 1:12") + if (all(1 <= x & x <= 12)) { + return(x) + } else { + abort("bla2: Values are not in 1:12") + } } } From 4e642686822f991cd4017989671e8fd9f43d0ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 21 Feb 2022 15:25:46 +0000 Subject: [PATCH 10/12] remove support for integer months and delegate that to ARROW-15701 --- r/R/dplyr-funcs-datetime.R | 17 ----------------- r/tests/testthat/test-dplyr-funcs-datetime.R | 16 ++++++++-------- 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 821890500e1..331cc95423a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -115,23 +115,6 @@ register_bindings_datetime <- function() { return(Expression$create("strftime", x, options = list(format = format, locale = locale))) } - if (call_binding("is.integer", x)) { - if (inherits(x, "Expression")) { - call_binding( - "if_else", - call_binding_agg("all", call_binding("between", x, 1, 12))$data, - x, - abort("bla: Values are not in 1:12") - ) - } else { - if (all(1 <= x & x <= 12)) { - return(x) - } else { - abort("bla2: Values are not in 1:12") - } - } - } - Expression$create("month", x) }) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 818c7aa5eb5..58eb8564340 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -764,16 +764,16 @@ test_that("semester works with temporal types and integers", { collect(), test_df ) - # semester extraction from months as integers - compare_dplyr_binding( - .input %>% + # semester extraction from months as integers is not supported yet + # it will be once https://issues.apache.org/jira/browse/ARROW-15701 is done + # TODO change from expect_error to compare_dplyr_bindings + expect_error( + test_df %>% + arrow_table() %>% mutate(sem_month_as_int = semester(month_as_int)) %>% collect(), - test_df - ) - - expect_error( - call_binding("semester", Expression$scalar(1:13)) + regexp = "NotImplemented: Function 'month' has no kernel matching input types (array[int32])", + fixed = TRUE ) expect_error( From 0e334d15274eccf74e0ed1c333c684dd67c9f80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Feb 2022 14:20:26 +0000 Subject: [PATCH 11/12] solve merge conflict weirdness --- r/R/dplyr-funcs-datetime.R | 1 + r/tests/testthat/test-dplyr-funcs-datetime.R | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 331cc95423a..a72df4c777a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -153,6 +153,7 @@ register_bindings_datetime <- function() { } x$type()$timezone() + }) register_binding("semester", function(x, with_year = FALSE) { month <- call_binding("month", x) semester <- call_binding("if_else", month <= 6, 1L, 2L) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 58eb8564340..c5de2684c02 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -744,11 +744,12 @@ test_that("extract tz", { ) # Test one expression - expect_error( - call_binding("tz", Expression$scalar("2020-10-01")), - "timezone extraction for objects of class `string` not supported in Arrow" -test_that("semester", { -test_that("semester works with temporal types", { + expect_error( + call_binding("tz", Expression$scalar("2020-10-01")), + "timezone extraction for objects of class `string` not supported in Arrow" + ) +}) + test_that("semester works with temporal types and integers", { test_df <- tibble( month_as_int = c(1:12, NA), From c1d96f0c0d7d2bd20f1958699bb3566767b62827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 22 Feb 2022 14:25:45 +0000 Subject: [PATCH 12/12] polished NEWS.md --- r/NEWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 0fa990ecf72..20cae164445 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -21,8 +21,7 @@ * `lubridate`: * `tz()` to extract/get timezone -* `lubridate` features: `semester()` -* Additional `lubridate` features: `semester()`. + * `semester()` to extract/get semester # arrow 7.0.0