From a1112de92b2ee5a2fcb438dc8d603ca533b86f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Feb 2022 13:24:45 +0000 Subject: [PATCH 1/7] date/time parsing unit test --- r/tests/testthat/test-dplyr-funcs-datetime.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a7a705678c1..61e65b4c4e2 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -711,3 +711,16 @@ test_that("am/pm mirror lubridate", { ) }) + +test_that("date/time parsing / ymd() and `-` separator", { + df <- tibble::tibble( + date_hyphen_full_year = "2022-02-05", + date_hyphen_short_year = "22-02-05") + + compare_dplyr_binding( + .input %>% + mutate(date = ymd(date_hyphen)) %>% + collect(), + df + ) +}) From b79780c58c5d7ae474865cf3d245bd3403106098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Feb 2022 14:09:31 +0000 Subject: [PATCH 2/7] better data frame for parsing testing --- r/tests/testthat/test-dplyr-funcs-datetime.R | 37 +++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 61e65b4c4e2..1a9ccd5adf0 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -713,14 +713,33 @@ test_that("am/pm mirror lubridate", { }) test_that("date/time parsing / ymd() and `-` separator", { - df <- tibble::tibble( - date_hyphen_full_year = "2022-02-05", - date_hyphen_short_year = "22-02-05") - - compare_dplyr_binding( - .input %>% - mutate(date = ymd(date_hyphen)) %>% - collect(), - df + test_dates <- tibble::tibble( + string_ymd = c( + "2021-09-10", "2021/09/10", "2021.09.10", "2021,09,10", "2021:09:10", + "20210910", "2021 Sep 10", "2021 September 10", "21-09-10", "21/09/10", + "21.09.10", "21,09,10", "21:09:10", "210910", "21 Sep 10", + "21 September 10", NA + ), + string_dmy = c( + "10-09-2021", "10/09/2021", "10.09.2021", "10,09,2021", "10:09:2021", + "10092021", "10 Sep 2021", "10 September 2021", "10-09-21", "10/09/21", + "10.09.21", "10,09,21", "10:09:21", "100921", "10 Sep 21", + "10 September 21", NA + ), + string_mdy = c( + "09-10-2021", "09/10/2021", "09.10.2021", "09,10,2021", "09:10:2021", + "09102021", "Sep 10 2021", "September 10 2021", "09-10-21", "09/10/21", + "09.10.21", "09,10,21", "09:10:21", "091021", "Sep 10 21", + "September 10 21", NA + ), + date = c(rep(as.Date("2021-09-10"), 16), NA), + date_midnight = c(rep(as.POSIXct("2021-09-10 00:00:00", tz = "UTC"), 16), NA) + ) + + compare_dplyr_binding( + .input %>% + mutate(x = ymd(string_ymd)) %>% + collect(), + test_dates ) }) From bee92f2267fe4020f3750fd31ae4077026f703bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 4 Feb 2022 14:54:38 +0000 Subject: [PATCH 3/7] interim list of supported formats for ymd --- r/R/dplyr-funcs-datetime.R | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 04c0214fdfb..db58a137163 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -148,4 +148,53 @@ register_bindings_datetime <- function() { !call_binding("am", x) }) + register_binding("ymd", function(x) { + format_map <- + list( + ymd_hyphen1 = "%Y-%m-%d", + ymd_hyphen2 = "%y-%m-%d", + ymd_hyphen3 = "%Y-%B-%d", + ymd_hyphen4 = "%y-%B-%d", + ymd_hyphen5 = "%Y-%b-%d", + ymd_hyphen6 = "%y-%b-%d", + ymd_fslash1 = "%Y/%m/%d", + ymd_fslash2 = "%y/%m/%d", + ymd_fslash3 = "%Y/%B/%d", + ymd_fslash4 = "%y/%B/%d", + ymd_fslash5 = "%Y/%b/%d", + ymd_fslash6 = "%y/%b/%d", + ymd_dot1 = "%Y.%m.%d", + ymd_dot2 = "%y.%m.%d", + ymd_dot3 = "%Y.%B.%d", + ymd_dot4 = "%y.%B.%d", + ymd_dot5 = "%Y.%b.%d", + ymd_dot6 = "%y.%b.%d", + ymd_comma1 = "%Y,%m,%d", + ymd_comma2 = "%y,%m,%d", + ymd_comma3 = "%Y,%B,%d", + ymd_comma4 = "%y,%B,%d", + ymd_comma5 = "%Y,%b,%d", + ymd_comma6 = "%y,%b,%d", + ymd_colon1 = "%Y:%m:%d", + ymd_colon2 = "%y:%m:%d", + ymd_colon3 = "%Y:%B:%d", + ymd_colon4 = "%y:%B:%d", + ymd_colon5 = "%Y:%b:%d", + ymd_colon6 = "%y:%b:%d", + ymd_cont1 = "%Y%m%d", + ymd_cont2 = "%y%m%d", + ymd_cont3 = "%Y%B%d", + ymd_cont4 = "%y%B%d", + ymd_cont5 = "%Y%b%d", + ymd_cont6 = "%y%b%d", + ymd_space1 = "%Y %m %d", + ymd_space2 = "%y %m %d", + ymd_space3 = "%Y %B %d", + ymd_space4 = "%y %B %d", + ymd_space5 = "%Y %b %d", + ymd_space6 = "%y %b %d" + ) + + call_binding("strptime", x, format = format) + }) } From 9fdbd3c9d840bccd47cba0b119cfb2723fc4444f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 7 Feb 2022 13:53:20 +0000 Subject: [PATCH 4/7] naive calling of the `srtptime` binding with the first matching format for `ymd()` --- r/R/dplyr-funcs-datetime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index db58a137163..7fcdbb049fd 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -195,6 +195,6 @@ register_bindings_datetime <- function() { ymd_space6 = "%y %b %d" ) - call_binding("strptime", x, format = format) + call_binding("strptime", x, format = format_map[[1]], unit = "s") }) } From cfba9e1dfbedd5dfdf652c805e93692808dd092e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Mon, 7 Feb 2022 14:06:39 +0000 Subject: [PATCH 5/7] use `coalesce()` --- r/R/dplyr-funcs-datetime.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 7fcdbb049fd..03f00283911 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -194,7 +194,10 @@ register_bindings_datetime <- function() { ymd_space5 = "%Y %b %d", ymd_space6 = "%y %b %d" ) - - call_binding("strptime", x, format = format_map[[1]], unit = "s") + call_binding( + "coalesce", + call_binding("strptime", x, format = format_map[[1]], unit = "s"), + call_binding("strptime", x, format = format_map[[2]], unit = "s") + ) }) } From 8d1a252b500685e6c1e4661a136b08a68abc2037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 10 Feb 2022 09:34:53 +0000 Subject: [PATCH 6/7] reduce the list of possible formats --- r/R/dplyr-funcs-datetime.R | 56 ++++++++++---------------------------- 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 03f00283911..234f0bda872 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -156,48 +156,20 @@ register_bindings_datetime <- function() { ymd_hyphen3 = "%Y-%B-%d", ymd_hyphen4 = "%y-%B-%d", ymd_hyphen5 = "%Y-%b-%d", - ymd_hyphen6 = "%y-%b-%d", - ymd_fslash1 = "%Y/%m/%d", - ymd_fslash2 = "%y/%m/%d", - ymd_fslash3 = "%Y/%B/%d", - ymd_fslash4 = "%y/%B/%d", - ymd_fslash5 = "%Y/%b/%d", - ymd_fslash6 = "%y/%b/%d", - ymd_dot1 = "%Y.%m.%d", - ymd_dot2 = "%y.%m.%d", - ymd_dot3 = "%Y.%B.%d", - ymd_dot4 = "%y.%B.%d", - ymd_dot5 = "%Y.%b.%d", - ymd_dot6 = "%y.%b.%d", - ymd_comma1 = "%Y,%m,%d", - ymd_comma2 = "%y,%m,%d", - ymd_comma3 = "%Y,%B,%d", - ymd_comma4 = "%y,%B,%d", - ymd_comma5 = "%Y,%b,%d", - ymd_comma6 = "%y,%b,%d", - ymd_colon1 = "%Y:%m:%d", - ymd_colon2 = "%y:%m:%d", - ymd_colon3 = "%Y:%B:%d", - ymd_colon4 = "%y:%B:%d", - ymd_colon5 = "%Y:%b:%d", - ymd_colon6 = "%y:%b:%d", - ymd_cont1 = "%Y%m%d", - ymd_cont2 = "%y%m%d", - ymd_cont3 = "%Y%B%d", - ymd_cont4 = "%y%B%d", - ymd_cont5 = "%Y%b%d", - ymd_cont6 = "%y%b%d", - ymd_space1 = "%Y %m %d", - ymd_space2 = "%y %m %d", - ymd_space3 = "%Y %B %d", - ymd_space4 = "%y %B %d", - ymd_space5 = "%Y %b %d", - ymd_space6 = "%y %b %d" + ymd_hyphen6 = "%y-%b-%d" ) - call_binding( - "coalesce", - call_binding("strptime", x, format = format_map[[1]], unit = "s"), - call_binding("strptime", x, format = format_map[[2]], unit = "s") - ) + + x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x) + + # call_binding( + # "coalesce", + call_binding("strptime", x, format = format_map[[1]], unit = "s")#, + # call_binding("strptime", x, format = format_map[[2]], unit = "s"), + # call_binding("strptime", x, format = format_map[[3]], unit = "s"), + # call_binding("strptime", x, format = format_map[[4]], unit = "s"), + # call_binding("strptime", x, format = format_map[[5]], unit = "s"), + # call_binding("strptime", x, format = format_map[[6]], unit = "s") + # ) + }) } From 421e72aa0c855b63eb06749e8a7cf6d979cd0637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Thu, 10 Feb 2022 09:35:12 +0000 Subject: [PATCH 7/7] play around with the `ymd()` tests --- r/tests/testthat/test-dplyr-funcs-datetime.R | 49 ++++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 1a9ccd5adf0..549c51dc357 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -715,25 +715,46 @@ test_that("am/pm mirror lubridate", { test_that("date/time parsing / ymd() and `-` separator", { test_dates <- tibble::tibble( string_ymd = c( - "2021-09-10", "2021/09/10", "2021.09.10", "2021,09,10", "2021:09:10", - "20210910", "2021 Sep 10", "2021 September 10", "21-09-10", "21/09/10", - "21.09.10", "21,09,10", "21:09:10", "210910", "21 Sep 10", - "21 September 10", NA + "2021-09-10", "2021/09/10", "2021.09.10", "2021,09,10", "2021:09:10", "2021 09 10", + # "21-09-10", + # "21/09/10", + # "21.09.10", + # "21,09,10", + # "21:09:10", + # "20210910", + # "210910", + # "2021 Sep 10", + # "2021 September 10", + # "21 Sep 10", + # "21 September 10", + NA ), string_dmy = c( - "10-09-2021", "10/09/2021", "10.09.2021", "10,09,2021", "10:09:2021", - "10092021", "10 Sep 2021", "10 September 2021", "10-09-21", "10/09/21", - "10.09.21", "10,09,21", "10:09:21", "100921", "10 Sep 21", - "10 September 21", NA + "10-09-2021", "10/09/2021", "10.09.2021", "10,09,2021", "10:09:2021", "10 09 2021", + # "10-09-21", "10/09/21", + # "10.09.21", "10,09,21", "10:09:21", + # "10092021", + # "100921", + # "10 Sep 2021", + # "10 September 2021", + # "10 Sep 21", + # "10 September 21", + NA ), string_mdy = c( - "09-10-2021", "09/10/2021", "09.10.2021", "09,10,2021", "09:10:2021", - "09102021", "Sep 10 2021", "September 10 2021", "09-10-21", "09/10/21", - "09.10.21", "09,10,21", "09:10:21", "091021", "Sep 10 21", - "September 10 21", NA + "09-10-2021", "09/10/2021", "09.10.2021", "09,10,2021", "09:10:2021", "09 10 2021", + # "09-10-21", "09/10/21", + # "09.10.21", "09,10,21", "09:10:21", + # "09102021", + # "091021", + # "Sep 10 2021", + # "September 10 2021", + # "Sep 10 21", + # "September 10 21", + NA ), - date = c(rep(as.Date("2021-09-10"), 16), NA), - date_midnight = c(rep(as.POSIXct("2021-09-10 00:00:00", tz = "UTC"), 16), NA) + date = c(rep(as.Date("2021-09-10"), 6), NA), + date_midnight = c(rep(as.POSIXct("2021-09-10 00:00:00", tz = "UTC"), 6), NA) ) compare_dplyr_binding(