From db29c42e02f876a7d7302e0f3f2b06b313a957f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 25 Jan 2022 14:38:58 +0000 Subject: [PATCH 01/26] first pass at implementing `format()` for date-time --- r/R/dplyr-funcs-datetime.R | 4 ++ r/tests/testthat/test-dplyr-funcs-datetime.R | 64 ++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5a22c970965..038d8b1bfa3 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -51,6 +51,10 @@ register_bindings_datetime <- function() { Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) }) + register_binding("format", function(x, format = "", tz = "", usetz = FALSE) { + call_binding("strftime", x = x, format = format, tz = tz, usetz = usetz) + }) + register_binding("format_ISO8601", function(x, usetz = FALSE, precision = NULL, ...) { ISO8601_precision_map <- list( diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 073041a4b87..95bbaa3df75 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -206,6 +206,70 @@ test_that("strftime", { ) }) +test_that("format POSIXct", { + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + + times <- tibble( + datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), + date = c(as.Date("2021-01-01"), NA) + ) + formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" + formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(date, format = formats_date)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats, tz = "Pacific/Marquesas")) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% + collect(), + times + ) + + withr::with_timezone( + "Pacific/Marquesas", + { + compare_dplyr_binding( + .input %>% + mutate( + x = format(datetime, format = formats, tz = "EST"), + x_date = format(date, format = formats_date, tz = "EST") + ) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate( + x = format(datetime, format = formats), + x_date = format(date, format = formats_date) + ) %>% + collect(), + times + ) + } + ) +}) + test_that("format_ISO8601", { # https://issues.apache.org/jira/projects/ARROW/issues/ARROW-15266 skip_if_not_available("re2") From cb162d9c3620c7d1d79516eaabae2b5bd1a37a2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 11:35:22 +0000 Subject: [PATCH 02/26] `arrow::format()` uses object timezone if available --- r/R/dplyr-funcs-datetime.R | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 038d8b1bfa3..9e0c3f067da 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -52,7 +52,21 @@ register_bindings_datetime <- function() { }) register_binding("format", function(x, format = "", tz = "", usetz = FALSE) { - call_binding("strftime", x = x, format = format, tz = tz, usetz = usetz) + if (usetz) { + format <- paste(format, "%Z") + } + + if (call_binding("is.POSIXct", x)) { + if(tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() + } + ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + } else { + ts <- x + } + Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) }) register_binding("format_ISO8601", function(x, usetz = FALSE, precision = NULL, ...) { From fb67975ce3c6fc44f156c04e9200209fd91d87fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 12:24:54 +0000 Subject: [PATCH 03/26] lint --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9e0c3f067da..9a9393e7214 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -57,7 +57,7 @@ register_bindings_datetime <- function() { } if (call_binding("is.POSIXct", x)) { - if(tz == "" && x$type()$timezone() != "") { + if (tz == "" && x$type()$timezone() != "") { tz <- x$type()$timezone() } else if (tz == "") { tz <- Sys.timezone() From a7707c205c6067345b3356727da921c5a203f21d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 13:25:11 +0000 Subject: [PATCH 04/26] `format()` dispatch for `Timestamp` and `Date32` types --- r/R/dplyr-funcs-datetime.R | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9a9393e7214..d6e4d3c4a20 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -52,21 +52,27 @@ register_bindings_datetime <- function() { }) register_binding("format", function(x, format = "", tz = "", usetz = FALSE) { - if (usetz) { - format <- paste(format, "%Z") - } + if (inherits(x, "Expression") && + any(inherits(x$type(), "Timestamp"), inherits(x$type(), "Date32"))) { + if (usetz) { + format <- paste(format, "%Z") + } - if (call_binding("is.POSIXct", x)) { - if (tz == "" && x$type()$timezone() != "") { - tz <- x$type()$timezone() - } else if (tz == "") { - tz <- Sys.timezone() + if (call_binding("is.POSIXct", x)) { + if (tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() + } + ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + } else { + ts <- x } - ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } else { - ts <- x + "WIP" } - Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + }) register_binding("format_ISO8601", function(x, usetz = FALSE, precision = NULL, ...) { From f5f199ad23a72fd15df65e2ef69ea0942fc13a3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 13:34:36 +0000 Subject: [PATCH 05/26] create helper function for date/time format --- r/R/dplyr-funcs-datetime.R | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index d6e4d3c4a20..9d3980c96d5 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -51,28 +51,32 @@ register_bindings_datetime <- function() { Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) }) - register_binding("format", function(x, format = "", tz = "", usetz = FALSE) { - if (inherits(x, "Expression") && - any(inherits(x$type(), "Timestamp"), inherits(x$type(), "Date32"))) { - if (usetz) { - format <- paste(format, "%Z") - } + binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { + if (usetz) { + format <- paste(format, "%Z") + } - if (call_binding("is.POSIXct", x)) { - if (tz == "" && x$type()$timezone() != "") { - tz <- x$type()$timezone() - } else if (tz == "") { - tz <- Sys.timezone() - } - ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) - } else { - ts <- x + if (call_binding("is.POSIXct", x)) { + if (tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() } - Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) } else { - "WIP" + ts <- x } + Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + } + register_binding("format", function(x, ...) { + if (inherits(x, "Expression") && + any(inherits(x$type(), "Timestamp"), inherits(x$type(), "Date32"))) { + binding_format_datetime(x, ...) + } else { + # other types + "WIP" + } }) register_binding("format_ISO8601", function(x, usetz = FALSE, precision = NULL, ...) { From 3e09b5ca0aaaa2861162af575374087578a7d734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 14:10:49 +0000 Subject: [PATCH 06/26] moved format date/time and helper (+tests) to dplyr-funcs-type --- r/R/dplyr-funcs-datetime.R | 28 --------- r/R/dplyr-funcs-type.R | 33 ++++++++++ r/tests/testthat/test-dplyr-funcs-datetime.R | 64 -------------------- r/tests/testthat/test-dplyr-funcs-type.R | 64 ++++++++++++++++++++ 4 files changed, 97 insertions(+), 92 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9d3980c96d5..5a22c970965 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -51,34 +51,6 @@ register_bindings_datetime <- function() { Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) }) - binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { - if (usetz) { - format <- paste(format, "%Z") - } - - if (call_binding("is.POSIXct", x)) { - if (tz == "" && x$type()$timezone() != "") { - tz <- x$type()$timezone() - } else if (tz == "") { - tz <- Sys.timezone() - } - ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) - } else { - ts <- x - } - Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) - } - - register_binding("format", function(x, ...) { - if (inherits(x, "Expression") && - any(inherits(x$type(), "Timestamp"), inherits(x$type(), "Date32"))) { - binding_format_datetime(x, ...) - } else { - # other types - "WIP" - } - }) - register_binding("format_ISO8601", function(x, usetz = FALSE, precision = NULL, ...) { ISO8601_precision_map <- list( diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index fa839269abe..f8c0ea25110 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -20,6 +20,7 @@ register_bindings_type <- function() { register_bindings_type_cast() register_bindings_type_inspect() register_bindings_type_elementwise() + register_bindings_type_format() } register_bindings_type_cast <- function() { @@ -292,3 +293,35 @@ register_bindings_type_elementwise <- function() { is_inf & !call_binding("is.na", is_inf) }) } + +register_bindings_type_format <- function() { + register_binding("format", function(x, ...) { + if (inherits(x, "Expression") && + x$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")]) { + binding_format_datetime(x, ...) + } else { + # other types + "WIP" + } + }) +} + +binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { + base_format_args <- formals(format.POSIXct)[-5] + + if (usetz) { + format <- paste(format, "%Z") + } + + if (call_binding("is.POSIXct", x)) { + if (tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() + } + ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + } else { + ts <- x + } + Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) +} diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 95bbaa3df75..073041a4b87 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -206,70 +206,6 @@ test_that("strftime", { ) }) -test_that("format POSIXct", { - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - - times <- tibble( - datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), - date = c(as.Date("2021-01-01"), NA) - ) - formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" - formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" - - compare_dplyr_binding( - .input %>% - mutate(x = format(datetime, format = formats)) %>% - collect(), - times - ) - - compare_dplyr_binding( - .input %>% - mutate(x = format(date, format = formats_date)) %>% - collect(), - times - ) - - compare_dplyr_binding( - .input %>% - mutate(x = format(datetime, format = formats, tz = "Pacific/Marquesas")) %>% - collect(), - times - ) - - compare_dplyr_binding( - .input %>% - mutate(x = format(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% - collect(), - times - ) - - withr::with_timezone( - "Pacific/Marquesas", - { - compare_dplyr_binding( - .input %>% - mutate( - x = format(datetime, format = formats, tz = "EST"), - x_date = format(date, format = formats_date, tz = "EST") - ) %>% - collect(), - times - ) - - compare_dplyr_binding( - .input %>% - mutate( - x = format(datetime, format = formats), - x_date = format(date, format = formats_date) - ) %>% - collect(), - times - ) - } - ) -}) - test_that("format_ISO8601", { # https://issues.apache.org/jira/projects/ARROW/issues/ARROW-15266 skip_if_not_available("re2") diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 043a36a951a..1ab37e11529 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -843,3 +843,67 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a test_df ) }) + +test_that("format POSIXct", { + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + + times <- tibble( + datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), + date = c(as.Date("2021-01-01"), NA) + ) + formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" + formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(date, format = formats_date)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats, tz = "Pacific/Marquesas")) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = format(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% + collect(), + times + ) + + withr::with_timezone( + "Pacific/Marquesas", + { + compare_dplyr_binding( + .input %>% + mutate( + x = format(datetime, format = formats, tz = "EST"), + x_date = format(date, format = formats_date, tz = "EST") + ) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate( + x = format(datetime, format = formats), + x_date = format(date, format = formats_date) + ) %>% + collect(), + times + ) + } + ) +}) From def79dc9d86f9d14dcda2f9cc235e65310df5e9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 14:14:03 +0000 Subject: [PATCH 07/26] remove arg check --- r/R/dplyr-funcs-type.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index f8c0ea25110..8061b7c2f53 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -307,7 +307,6 @@ register_bindings_type_format <- function() { } binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { - base_format_args <- formals(format.POSIXct)[-5] if (usetz) { format <- paste(format, "%Z") From 46ee1801a86bf050fa9f4690a39b05d796cabd80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 14:54:45 +0000 Subject: [PATCH 08/26] testing abandon ship for unsupported types --- r/R/dplyr-funcs-type.R | 2 +- r/tests/testthat/test-dplyr-funcs-type.R | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 8061b7c2f53..aa8bde9ee41 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -301,7 +301,7 @@ register_bindings_type_format <- function() { binding_format_datetime(x, ...) } else { # other types - "WIP" + abort(paste("`format()` not yet supported for ", x$type())) } }) } diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 1ab37e11529..7ef14789581 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -844,7 +844,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a ) }) -test_that("format POSIXct", { +test_that("format date/time", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 times <- tibble( @@ -907,3 +907,23 @@ test_that("format POSIXct", { } ) }) + +test_that("format for other types errors", { + + expect_warning( + example_data %>% + record_batch() %>% + mutate(x = format(int, trim = TRUE)) %>% + collect(), + regexp = "not supported in Arrow; pulling data into R" + ) + + expect_warning( + example_data %>% + record_batch() %>% + mutate(y = format(dbl, nsmall = 3)) %>% + collect(), + regexp = "not supported in Arrow; pulling data into R" + ) + +}) From 976e172c137d5bbcd8cc45d03b6081819b1b3db4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 15:19:43 +0000 Subject: [PATCH 09/26] improve `format()` abort message --- r/R/dplyr-funcs-type.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index aa8bde9ee41..20b1af915b2 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -300,8 +300,7 @@ register_bindings_type_format <- function() { x$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")]) { binding_format_datetime(x, ...) } else { - # other types - abort(paste("`format()` not yet supported for ", x$type())) + abort(paste0("`format()` not yet supported for `", class(x$type())[[1]], "`")) } }) } From 1bbcdac2591ea8658fd3dbaba9157f69b2834d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Feb 2022 15:56:18 +0000 Subject: [PATCH 10/26] improve test description --- r/tests/testthat/test-dplyr-funcs-type.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 7ef14789581..00bcc87a86a 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -908,8 +908,7 @@ test_that("format date/time", { ) }) -test_that("format for other types errors", { - +test_that("format() for unsupported types errors and pulls back to R", { expect_warning( example_data %>% record_batch() %>% @@ -917,7 +916,6 @@ test_that("format for other types errors", { collect(), regexp = "not supported in Arrow; pulling data into R" ) - expect_warning( example_data %>% record_batch() %>% @@ -925,5 +923,4 @@ test_that("format for other types errors", { collect(), regexp = "not supported in Arrow; pulling data into R" ) - }) From cf514ebeeb5b6223645ea9546686979ac619246c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Feb 2022 18:32:42 +0000 Subject: [PATCH 11/26] use `"Etc/GMT+6"` as unlikely unit test --- r/tests/testthat/test-dplyr-funcs-type.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 00bcc87a86a..dc52d81b85d 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -848,7 +848,7 @@ test_that("format date/time", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 times <- tibble( - datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), + datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA), date = c(as.Date("2021-01-01"), NA) ) formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" @@ -870,7 +870,7 @@ test_that("format date/time", { compare_dplyr_binding( .input %>% - mutate(x = format(datetime, format = formats, tz = "Pacific/Marquesas")) %>% + mutate(x = format(datetime, format = formats, tz = "Europe/Bucharest")) %>% collect(), times ) From f8c2ed2d31dc5649fc67839a703eb03d1a4aa36f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Feb 2022 18:34:28 +0000 Subject: [PATCH 12/26] test on windows too --- r/tests/testthat/test-dplyr-funcs-type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index dc52d81b85d..a52f351c5bc 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -845,7 +845,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a }) test_that("format date/time", { - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + # skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 times <- tibble( datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA), From 2ae62493f2c2fd15bd5c3457e1fd17812b809d41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Feb 2022 20:11:28 +0000 Subject: [PATCH 13/26] skip on win --- r/tests/testthat/test-dplyr-funcs-type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index a52f351c5bc..dc52d81b85d 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -845,7 +845,7 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a }) test_that("format date/time", { - # skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 times <- tibble( datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA), From 40e1914a1794e972693be6708cac8f6699138469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 17 Feb 2022 13:14:44 +0000 Subject: [PATCH 14/26] moved `binding_format_datetime()` to dplyr-funcs-datetime.R --- r/R/dplyr-funcs-datetime.R | 19 +++++++++++++++++++ r/R/dplyr-funcs-type.R | 19 ------------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5a22c970965..e2ab2029d76 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -168,3 +168,22 @@ register_bindings_datetime <- function() { build_expr("cast", x, options = list(to_type = date32())) }) } + +binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { + + if (usetz) { + format <- paste(format, "%Z") + } + + if (call_binding("is.POSIXct", x)) { + if (tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() + } + ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + } else { + ts <- x + } + Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) +} diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 20b1af915b2..dfc75e2a455 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -304,22 +304,3 @@ register_bindings_type_format <- function() { } }) } - -binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { - - if (usetz) { - format <- paste(format, "%Z") - } - - if (call_binding("is.POSIXct", x)) { - if (tz == "" && x$type()$timezone() != "") { - tz <- x$type()$timezone() - } else if (tz == "") { - tz <- Sys.timezone() - } - ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) - } else { - ts <- x - } - Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) -} From f77168204689629ab3a974bfdf4d8fa0415c69da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 17 Feb 2022 14:07:17 +0000 Subject: [PATCH 15/26] cast as string for unsupported formats + update unit tests --- r/R/dplyr-funcs-type.R | 6 +++++- r/tests/testthat/test-dplyr-funcs-type.R | 16 +++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index dfc75e2a455..2883cc9561a 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -296,11 +296,15 @@ register_bindings_type_elementwise <- function() { register_bindings_type_format <- function() { register_binding("format", function(x, ...) { + if (!inherits(x, "Expression")) { + return(format(x, ...)) + } + if (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")]) { binding_format_datetime(x, ...) } else { - abort(paste0("`format()` not yet supported for `", class(x$type())[[1]], "`")) + x$cast(string()) } }) } diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index dc52d81b85d..95dbe207282 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -908,19 +908,25 @@ test_that("format date/time", { ) }) -test_that("format() for unsupported types errors and pulls back to R", { - expect_warning( +test_that("format() for unsupported types returns the input as string", { + expect_equal( example_data %>% record_batch() %>% mutate(x = format(int, trim = TRUE)) %>% collect(), - regexp = "not supported in Arrow; pulling data into R" + example_data %>% + record_batch() %>% + mutate(x = as.character(int)) %>% + collect() ) - expect_warning( + expect_equal( example_data %>% record_batch() %>% mutate(y = format(dbl, nsmall = 3)) %>% collect(), - regexp = "not supported in Arrow; pulling data into R" + example_data %>% + record_batch() %>% + mutate(y = as.character(dbl)) %>% + collect() ) }) From 5411458f4ae57640137ce1cbcf080d92ba38b8cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 17 Feb 2022 14:29:38 +0000 Subject: [PATCH 16/26] without casting --- r/R/dplyr-funcs-datetime.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index e2ab2029d76..c1801880d31 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -170,20 +170,20 @@ register_bindings_datetime <- function() { } binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { - if (usetz) { format <- paste(format, "%Z") } - if (call_binding("is.POSIXct", x)) { - if (tz == "" && x$type()$timezone() != "") { - tz <- x$type()$timezone() - } else if (tz == "") { - tz <- Sys.timezone() - } - ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) - } else { - ts <- x - } + # if (call_binding("is.POSIXct", x)) { + # if (tz == "" && x$type()$timezone() != "") { + # tz <- x$type()$timezone() + # } else if (tz == "") { + # tz <- Sys.timezone() + # } + # ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + # } else { + # ts <- x + # } + ts <- x Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } From 8859ebb60a5fb3e9fa51d0cd0e7a00eaac847947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 17 Feb 2022 15:01:59 +0000 Subject: [PATCH 17/26] with casting --- r/R/dplyr-funcs-datetime.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index c1801880d31..2585edcfa8e 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -174,16 +174,16 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { format <- paste(format, "%Z") } - # if (call_binding("is.POSIXct", x)) { - # if (tz == "" && x$type()$timezone() != "") { - # tz <- x$type()$timezone() - # } else if (tz == "") { - # tz <- Sys.timezone() - # } - # ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) - # } else { - # ts <- x - # } - ts <- x + if (call_binding("is.POSIXct", x)) { + if (tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() + } + ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + } else { + ts <- x + } + Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } From 03eb111f44a3bb2bf9d30d78eb8fb16ee8a8d3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 23 Feb 2022 09:52:41 +0000 Subject: [PATCH 18/26] added TODO to revisit the casting step once #12240 is merged --- r/R/dplyr-funcs-datetime.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 2585edcfa8e..29b6f29fb47 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -175,6 +175,9 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { } if (call_binding("is.POSIXct", x)) { + # the casting part might not be required once + # https://issues.apache.org/jira/browse/ARROW-14442 is solved + # TODO revisit the steps below once the PR for that issue is merged if (tz == "" && x$type()$timezone() != "") { tz <- x$type()$timezone() } else if (tz == "") { From 09d99b4a054dfb324982df9c68e2665435e0db4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 21:19:00 +0000 Subject: [PATCH 19/26] used `build_expr()` and simplified the implementation --- r/R/dplyr-funcs-datetime.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 29b6f29fb47..83c06c99230 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -183,10 +183,8 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { } else if (tz == "") { tz <- Sys.timezone() } - ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) - } else { - ts <- x + ts <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) } - Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + build_expr("strptime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } From 8ecf37a05fd16e72e6afd4ead080f73da9c96083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 22:03:10 +0000 Subject: [PATCH 20/26] typo --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 83c06c99230..60265cc4d57 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -186,5 +186,5 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { ts <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) } - build_expr("strptime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + build_expr("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } From c0688e10d192d0610b20b409949ce6f05eac44a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 3 Mar 2022 22:09:17 +0000 Subject: [PATCH 21/26] :) --- r/R/dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 60265cc4d57..2904abb30fe 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -183,8 +183,8 @@ binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { } else if (tz == "") { tz <- Sys.timezone() } - ts <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) } - build_expr("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) + build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) } From c118700d40682e3376534f8a89e3f347aa3fcae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Sat, 5 Mar 2022 12:40:41 +0000 Subject: [PATCH 22/26] changed one test to use `arrow_table()` --- r/tests/testthat/test-dplyr-funcs-type.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 95dbe207282..80d2a325bbf 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -921,11 +921,11 @@ test_that("format() for unsupported types returns the input as string", { ) expect_equal( example_data %>% - record_batch() %>% + arrow_table() %>% mutate(y = format(dbl, nsmall = 3)) %>% collect(), example_data %>% - record_batch() %>% + arrow_table() %>% mutate(y = as.character(dbl)) %>% collect() ) From a6c0b19e209b129cbe578412790c2d932f696751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Sat, 5 Mar 2022 12:41:10 +0000 Subject: [PATCH 23/26] updated the `format()` binding to use `build_expr()` --- r/R/dplyr-funcs-type.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 2883cc9561a..df94fd797dc 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -296,15 +296,11 @@ register_bindings_type_elementwise <- function() { register_bindings_type_format <- function() { register_binding("format", function(x, ...) { - if (!inherits(x, "Expression")) { - return(format(x, ...)) - } - if (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")]) { binding_format_datetime(x, ...) } else { - x$cast(string()) + build_expr("cast", x, options = cast_options(to_type = string())) } }) } From 7878b373a5dea2aea1a3e834f69e2666443ab6fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Sat, 5 Mar 2022 12:59:02 +0000 Subject: [PATCH 24/26] keep dispatch for regular R objects in and add a unit test for `format(r_object)` --- r/R/dplyr-funcs-type.R | 4 ++++ r/tests/testthat/test-dplyr-funcs-type.R | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index df94fd797dc..9c08d9c0237 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -296,6 +296,10 @@ register_bindings_type_elementwise <- function() { register_bindings_type_format <- function() { register_binding("format", function(x, ...) { + if (!inherits(x, "Expression")) { + return(format(x, ...)) + } + if (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")]) { binding_format_datetime(x, ...) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 80d2a325bbf..c075b67cbc2 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -882,6 +882,14 @@ test_that("format date/time", { times ) + compare_dplyr_binding( + .input %>% + mutate(x = format(1), + y = format(13.7, nsmall = 3)) %>% + collect(), + times + ) + withr::with_timezone( "Pacific/Marquesas", { From ce5a0c17e1567e53e4c1ee76737130f457bbf69c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 8 Mar 2022 16:18:34 +0000 Subject: [PATCH 25/26] additional unit test with regular R object --- r/tests/testthat/test-dplyr-funcs-type.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index c075b67cbc2..a31d928c6b3 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -890,6 +890,13 @@ test_that("format date/time", { times ) + compare_dplyr_binding( + .input %>% + mutate(start_date = format(as.POSIXct("2022-01-01 01:01:00"))) %>% + collect(), + times + ) + withr::with_timezone( "Pacific/Marquesas", { From 9668f01cc16ab23c926f7811063ceb2cac246d44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Wed, 9 Mar 2022 21:42:50 +0000 Subject: [PATCH 26/26] removed unevaluated args and added comment on why we use `base::format()` for regular R objects --- r/R/dplyr-funcs-type.R | 2 ++ r/tests/testthat/test-dplyr-funcs-type.R | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 9c08d9c0237..52a8eeabaf3 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -296,6 +296,8 @@ register_bindings_type_elementwise <- function() { register_bindings_type_format <- function() { register_binding("format", function(x, ...) { + # We use R's format if we get a single R object here since we don't (yet) + # support all of the possible options for casting to string if (!inherits(x, "Expression")) { return(format(x, ...)) } diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index a31d928c6b3..5cbe77763af 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -927,7 +927,7 @@ test_that("format() for unsupported types returns the input as string", { expect_equal( example_data %>% record_batch() %>% - mutate(x = format(int, trim = TRUE)) %>% + mutate(x = format(int)) %>% collect(), example_data %>% record_batch() %>% @@ -937,7 +937,7 @@ test_that("format() for unsupported types returns the input as string", { expect_equal( example_data %>% arrow_table() %>% - mutate(y = format(dbl, nsmall = 3)) %>% + mutate(y = format(dbl)) %>% collect(), example_data %>% arrow_table() %>%