From aaa43a5dc1fa1edaf3d79ddb571466c1ab132624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 18 May 2022 16:50:44 +0100 Subject: [PATCH 01/12] adding `tz` support to `strptime` + updated unit tests --- r/R/dplyr-funcs-datetime.R | 41 +++++++++++++++----- r/tests/testthat/test-dplyr-funcs-datetime.R | 22 ++++++----- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 02ec35bda26..28f4602437b 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -28,25 +28,48 @@ register_bindings_datetime <- function() { } register_bindings_datetime_utility <- function() { - register_binding("strptime", function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, + register_binding("strptime", function(x, + format = "%Y-%m-%d %H:%M:%S", + tz = NULL, unit = "ms") { # Arrow uses unit for time parsing, strptime() does not. # Arrow has no default option for strptime (format, unit), # we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms", # (ARROW-12809) - # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820). - # Stop if tz is provided. - if (is.character(tz)) { - arrow_not_supported("Time zone argument") - } + unit <- make_valid_time_unit( + unit, + c(valid_time64_units, valid_time32_units) + ) - unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) + output <- build_expr( + "strptime", + x, + options = + list( + format = format, + unit = unit, + error_is_null = TRUE + ) + ) - build_expr("strptime", x, options = list(format = format, unit = unit, error_is_null = TRUE)) + if (!is.null(tz)) { + output <- build_expr( + "assume_timezone", + output, + options = + list( + timezone = tz + ) + ) + } + output }) - register_binding("strftime", function(x, format = "", tz = "", usetz = FALSE) { + register_binding("strftime", function(x, + format = "", + tz = "", + usetz = FALSE) { if (usetz) { format <- paste(format, "%Z") } diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index b1223630153..f12c706c70e 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -50,6 +50,19 @@ test_df <- tibble::tibble( test_that("strptime", { t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) + t_stamp_with_tz <- tibble( + x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA) + ) + + expect_equal( + t_string %>% + arrow_table() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "Pacific/Marquesas") + ) %>% + collect(), + t_stamp_with_tz + ) expect_equal( t_string %>% @@ -111,15 +124,6 @@ test_that("strptime", { ) }) -test_that("errors in strptime", { - # Error when tz is passed - x <- Expression$field_ref("x") - expect_error( - call_binding("strptime", x, tz = "PDT"), - "Time zone argument not supported in Arrow" - ) -}) - test_that("strptime returns NA when format doesn't match the data", { df <- tibble( str_date = c("2022-02-07", "2012/02-07", "1975/01-02", "1981/01-07", NA) From 2619723e7e641b362db27d4f696cecd1b7bd108e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 13:40:26 +0100 Subject: [PATCH 02/12] set default value for `tz` to match `base::strptime()` --- r/R/dplyr-funcs-datetime.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 28f4602437b..d8687f7cd30 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -30,7 +30,7 @@ register_bindings_datetime <- function() { register_bindings_datetime_utility <- function() { register_binding("strptime", function(x, format = "%Y-%m-%d %H:%M:%S", - tz = NULL, + tz = "", unit = "ms") { # Arrow uses unit for time parsing, strptime() does not. # Arrow has no default option for strptime (format, unit), @@ -53,6 +53,10 @@ register_bindings_datetime_utility <- function() { ) ) + if (tz == "") { + tz <- Sys.timezone() + } + if (!is.null(tz)) { output <- build_expr( "assume_timezone", From f9c29a12b775412363139824085dc8bae44d3ea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 13:40:54 +0100 Subject: [PATCH 03/12] update unit tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 36 ++++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index f12c706c70e..934942f46d2 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -50,10 +50,22 @@ test_df <- tibble::tibble( test_that("strptime", { t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) - t_stamp_with_tz <- tibble( + t_stamp_with_pm_tz <- tibble( x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA) ) + withr::with_timezone("Pacific/Marquesas", { + expect_equal( + t_string %>% + arrow_table() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S") + ) %>% + collect(), + t_stamp_with_pm_tz + ) + }) + expect_equal( t_string %>% arrow_table() %>% @@ -61,51 +73,47 @@ test_that("strptime", { x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "Pacific/Marquesas") ) %>% collect(), - t_stamp_with_tz + t_stamp_with_pm_tz ) expect_equal( t_string %>% Table$create() %>% mutate( - x = strptime(x) + x = strptime(x, tz = "UTC") ) %>% collect(), - t_stamp, - ignore_attr = "tzone" + t_stamp ) expect_equal( t_string %>% Table$create() %>% mutate( - x = strptime(x, format = "%Y-%m-%d %H:%M:%S") + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "UTC") ) %>% collect(), - t_stamp, - ignore_attr = "tzone" + t_stamp ) expect_equal( t_string %>% Table$create() %>% mutate( - x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns") + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns", tz = "UTC") ) %>% collect(), - t_stamp, - ignore_attr = "tzone" + t_stamp ) expect_equal( t_string %>% Table$create() %>% mutate( - x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s") + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s", tz = "UTC") ) %>% collect(), - t_stamp, - ignore_attr = "tzone" + t_stamp ) tstring <- tibble(x = c("08-05-2008", NA)) From b42453e14dd1c7b7a60fd8d3b961a72e7689aa46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 14:37:10 +0100 Subject: [PATCH 04/12] bump ci From c27dbf968ef6b4c4f2aaabf772c2b737edfef352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 15:45:39 +0100 Subject: [PATCH 05/12] bump ci From 5525accee642b24a3db653973c4fba92c35042d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 25 May 2022 20:40:40 +0100 Subject: [PATCH 06/12] bump ci From e34f552427f19903a163fa63326bd3adfd81b648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 26 May 2022 12:13:58 +0100 Subject: [PATCH 07/12] comment + `record_batch()` in one of the tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 934942f46d2..d3140a83a70 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -54,10 +54,12 @@ test_that("strptime", { x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA) ) + # base::strptime returns a POSIXlt (a list) => we cannot use compare_dplyr_binding + # => we use expect_equal withr::with_timezone("Pacific/Marquesas", { expect_equal( t_string %>% - arrow_table() %>% + record_batch() %>% mutate( x = strptime(x, format = "%Y-%m-%d %H:%M:%S") ) %>% From 8e216aa07b18744a4c7a53e247970150c29c1554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 6 Jun 2022 10:42:17 +0100 Subject: [PATCH 08/12] added comment on the use of `assume_timezone` --- r/R/dplyr-funcs-datetime.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index d8687f7cd30..b04a473d921 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -57,6 +57,7 @@ register_bindings_datetime_utility <- function() { tz <- Sys.timezone() } + # convert a "timezone-naive" into a "timezone-aware" timestamp if (!is.null(tz)) { output <- build_expr( "assume_timezone", From ee737953e98c1407556347b6950549f5665cabbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 7 Jun 2022 09:24:00 +0100 Subject: [PATCH 09/12] expanded on why `assume_timezone` is needed --- r/R/dplyr-funcs-datetime.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index b04a473d921..501d6b33335 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -57,7 +57,11 @@ register_bindings_datetime_utility <- function() { tz <- Sys.timezone() } - # convert a "timezone-naive" into a "timezone-aware" timestamp + # if a timestamp does not contain timezone information (i.e. it is + # "timezone-naive") we can attach timezone information (i.e. convert it into + # a "timezone-aware" timestamp) with `assume_timezone` + # if we want to cast to a different timezone, we can only do it for + # timezone-aware timestamps, not for timezone-naive ones if (!is.null(tz)) { output <- build_expr( "assume_timezone", From f0c332384e55b51fe5aaf1e430b8881a91e452ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 7 Jun 2022 09:59:27 +0100 Subject: [PATCH 10/12] improved comments and renamed one of the test tibbles for more clarity --- r/tests/testthat/test-dplyr-funcs-datetime.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index d3140a83a70..0010f1994e3 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -49,14 +49,19 @@ test_df <- tibble::tibble( test_that("strptime", { t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) - t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) + # lubridate defaults to "UTC" as timezone => t_stamp is in "UTC" + t_stamp_with_utc_tz <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) t_stamp_with_pm_tz <- tibble( x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA) ) # base::strptime returns a POSIXlt (a list) => we cannot use compare_dplyr_binding - # => we use expect_equal + # => we use expect_equal for the tests below + withr::with_timezone("Pacific/Marquesas", { + # the default value for strptime's `tz` argument is "", which is interpreted + # as the current timezone. we test here if the strptime binding picks up + # correctly the current timezone (similarly to the base R version) expect_equal( t_string %>% record_batch() %>% @@ -68,6 +73,7 @@ test_that("strptime", { ) }) + # adding a timezone to a timezone-naive timestamp works expect_equal( t_string %>% arrow_table() %>% @@ -85,7 +91,7 @@ test_that("strptime", { x = strptime(x, tz = "UTC") ) %>% collect(), - t_stamp + t_stamp_with_utc_tz ) expect_equal( @@ -95,7 +101,7 @@ test_that("strptime", { x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "UTC") ) %>% collect(), - t_stamp + t_stamp_with_utc_tz ) expect_equal( @@ -115,7 +121,7 @@ test_that("strptime", { x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s", tz = "UTC") ) %>% collect(), - t_stamp + t_stamp_with_utc_tz ) tstring <- tibble(x = c("08-05-2008", NA)) From ca4e902947da77bdb4d5ceb323d1f26f32f016ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 7 Jun 2022 10:26:44 +0100 Subject: [PATCH 11/12] forgot about one `t_stamp` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 0010f1994e3..8445226d7e3 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -111,7 +111,7 @@ test_that("strptime", { x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns", tz = "UTC") ) %>% collect(), - t_stamp + t_stamp_with_utc_tz ) expect_equal( From 90ce31b47cbfbffe0aac72564611359a614be0f6 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Tue, 7 Jun 2022 07:20:33 -0500 Subject: [PATCH 12/12] Update r/tests/testthat/test-dplyr-funcs-datetime.R --- r/tests/testthat/test-dplyr-funcs-datetime.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 8445226d7e3..c644a66db55 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -74,6 +74,9 @@ test_that("strptime", { }) # adding a timezone to a timezone-naive timestamp works + # and since our TZ when running the test is (typically) Pacific/Marquesas + # this also tests that assigning a TZ different from the current session one + # works as expected expect_equal( t_string %>% arrow_table() %>%