From d04fbf2a9b7aa187209fb59948c6c1cb2a3543c0 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Sat, 15 May 2021 12:05:07 +0000 Subject: [PATCH 1/6] ARROW-12198: [R] bindings for strptime --- r/R/compute.R | 5 +++++ r/src/compute.cpp | 6 ++++++ r/tests/testthat/test-Array.R | 11 +++++++++++ 3 files changed, 22 insertions(+) diff --git a/r/R/compute.R b/r/R/compute.R index 43c3285481c..91625400314 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -297,3 +297,8 @@ cast_options <- function(safe = TRUE, ...) { ) modifyList(opts, list(...)) } + +strptime_arrow <- function(..., format, unit){ + a <- collect_arrays_from_dots(list(...)) + call_function("strptime", a, options = list(format=format, unit=unit)) +} diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 90c7b4129c7..5c9227957d1 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -234,6 +234,12 @@ std::shared_ptr make_compute_options( max_replacements); } + if (func_name == "strptime") { + using Options = arrow::compute::StrptimeOptions; + return std::make_shared(cpp11::as_cpp(options["format"]), + cpp11::as_cpp(options["unit"])); + } + if (func_name == "split_pattern" || func_name == "split_pattern_regex") { using Options = arrow::compute::SplitPatternOptions; int64_t max_splits = -1; diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 26d0a3005e4..d19373acc35 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -291,6 +291,17 @@ test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", { expect_identical(read_feather(feather_file), df) }) +test_that("strptime", { + # array of strings + time_strings <- Array$create(c("2018-10-07 19:04:05", NA)) + # array of timestamps (doesn't work if tz="" is added!) + timestamps <- Array$create(c(as.POSIXct("2018-10-07 19:04:05"), NA)) + # array of parsed timestamps + parsed_timestamps <- strptime_arrow(time_strings, format = "%Y-%m-%d %H:%M:%S", unit = TimeUnit$MICRO) + + expect_equal(timestamps, parsed_timestamps) +}) + test_that("array supports integer64", { x <- bit64::as.integer64(1:10) + MAX_INT expect_array_roundtrip(x, int64()) From 4aa1bc995fed400f2adaa756a052b578b70571c0 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Sun, 23 May 2021 18:46:07 +0200 Subject: [PATCH 2/6] Moving strptime to nse_funcs and tests to dplyr_string_functions. --- r/R/compute.R | 5 -- r/R/dplyr-functions.R | 21 ++++++ r/src/compute.cpp | 5 +- r/tests/testthat/test-Array.R | 11 --- .../testthat/test-dplyr-string-functions.R | 71 +++++++++++++++++++ 5 files changed, 95 insertions(+), 18 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 91625400314..43c3285481c 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -297,8 +297,3 @@ cast_options <- function(safe = TRUE, ...) { ) modifyList(opts, list(...)) } - -strptime_arrow <- function(..., format, unit){ - a <- collect_arrays_from_dots(list(...)) - call_function("strptime", a, options = list(format=format, unit=unit)) -} diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index e3ff5cecebd..d205c0a2255 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -338,3 +338,24 @@ get_stringr_pattern_options <- function(pattern) { contains_regex <- function(string) { grepl("[.\\|()[{^$*+?]", string) } + +nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = 1L) { + # Arrow uses unit for time parsing, strptime() does not. + # Arrow has no default option for strptime (format, unit), + # we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms", + # (ARROW-12809) + + # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820). + # Stop if tz is provided. + if (is.character(tz)) { + arrow_not_supported("Time zone argument") + } + + t_unit <- make_valid_time_unit(unit,c("s" = TimeUnit$SECOND, "ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO, "ns" = TimeUnit$NANO)) + + Expression$create("strptime", + x, + options = list( + format = format, + unit = t_unit)) +} diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 5c9227957d1..5d594964294 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -236,8 +236,9 @@ std::shared_ptr make_compute_options( if (func_name == "strptime") { using Options = arrow::compute::StrptimeOptions; - return std::make_shared(cpp11::as_cpp(options["format"]), - cpp11::as_cpp(options["unit"])); + return std::make_shared( + cpp11::as_cpp(options["format"]), + cpp11::as_cpp(options["unit"])); } if (func_name == "split_pattern" || func_name == "split_pattern_regex") { diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index d19373acc35..26d0a3005e4 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -291,17 +291,6 @@ test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", { expect_identical(read_feather(feather_file), df) }) -test_that("strptime", { - # array of strings - time_strings <- Array$create(c("2018-10-07 19:04:05", NA)) - # array of timestamps (doesn't work if tz="" is added!) - timestamps <- Array$create(c(as.POSIXct("2018-10-07 19:04:05"), NA)) - # array of parsed timestamps - parsed_timestamps <- strptime_arrow(time_strings, format = "%Y-%m-%d %H:%M:%S", unit = TimeUnit$MICRO) - - expect_equal(timestamps, parsed_timestamps) -}) - test_that("array supports integer64", { x <- bit64::as.integer64(1:10) + MAX_INT expect_array_roundtrip(x, int64()) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index bb4794ef4c5..b8b947ba925 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -493,3 +493,74 @@ test_that("edge cases in string detection and replacement", { tibble(x = c("ABC")) ) }) + +test_that("strptime", { + + t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) + t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) + t_stampPDT <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "PDT"), NA)) + + expect_equivalent( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x) + ) %>% + collect(), + t_stamp + ) + + expect_equivalent( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S") + ) %>% + collect(), + t_stamp + ) + + expect_equivalent( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = TimeUnit$NANO) + ) %>% + collect(), + t_stamp + ) + + expect_equivalent( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s") + ) %>% + collect(), + t_stamp + ) + + expect_equivalent( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz="PDT") + ) %>% + collect(), + t_stamp + ) + + tstring <- tibble(x = c("08-05-2008", NA)) + tstamp <- tibble(x = c(lubridate::mdy("08/05/2008"), NA)) + + expect_equivalent( + tstring %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%m-%d-%Y") + ) %>% + collect(), + tstamp + ) + +}) From ff582ee72766399363212f5321a9e0a60636882c Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 26 May 2021 18:14:15 +0200 Subject: [PATCH 3/6] Adding check.tzone = FALSE --- .../testthat/test-dplyr-string-functions.R | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index b8b947ba925..42ec93aa773 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -507,7 +507,8 @@ test_that("strptime", { x = strptime(x) ) %>% collect(), - t_stamp + t_stamp, + check.tzone = FALSE ) expect_equivalent( @@ -517,7 +518,8 @@ test_that("strptime", { x = strptime(x, format = "%Y-%m-%d %H:%M:%S") ) %>% collect(), - t_stamp + t_stamp, + check.tzone = FALSE ) expect_equivalent( @@ -527,7 +529,8 @@ test_that("strptime", { x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = TimeUnit$NANO) ) %>% collect(), - t_stamp + t_stamp, + check.tzone = FALSE ) expect_equivalent( @@ -537,21 +540,24 @@ test_that("strptime", { x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s") ) %>% collect(), - t_stamp + t_stamp, + check.tzone = FALSE ) expect_equivalent( t_string %>% Table$create() %>% - mutate( + mutate( x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz="PDT") ) %>% - collect(), - t_stamp + collect(), + t_stamp, + check.tzone = FALSE ) tstring <- tibble(x = c("08-05-2008", NA)) tstamp <- tibble(x = c(lubridate::mdy("08/05/2008"), NA)) + tstamp[[1]] <- as.POSIXct(tstamp[[1]]) expect_equivalent( tstring %>% @@ -560,7 +566,8 @@ test_that("strptime", { x = strptime(x, format = "%m-%d-%Y") ) %>% collect(), - tstamp + tstamp, + check.tzone = FALSE ) }) From 84765c314de034cdc165e728110fb3ae591fd8b5 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 28 May 2021 20:00:43 +0200 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Jonathan Keane --- r/R/dplyr-functions.R | 2 +- r/tests/testthat/test-dplyr-string-functions.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index d205c0a2255..76a404ed91b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -351,7 +351,7 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit arrow_not_supported("Time zone argument") } - t_unit <- make_valid_time_unit(unit,c("s" = TimeUnit$SECOND, "ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO, "ns" = TimeUnit$NANO)) + unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) Expression$create("strptime", x, diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 42ec93aa773..a884e71d91a 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -526,7 +526,7 @@ test_that("strptime", { t_string %>% Table$create() %>% mutate( - x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = TimeUnit$NANO) + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns") ) %>% collect(), t_stamp, From a271e6fb69b152d3e2c3aa7725b8e95a60902b69 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 1 Jun 2021 13:11:21 +0200 Subject: [PATCH 5/6] Apply suggestions from code review no. 2 --- r/R/dplyr-functions.R | 8 ++-- .../testthat/test-dplyr-string-functions.R | 37 +++++++++---------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index d205c0a2255..b91b3ed6c10 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -339,7 +339,7 @@ contains_regex <- function(string) { grepl("[.\\|()[{^$*+?]", string) } -nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = 1L) { +nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") { # Arrow uses unit for time parsing, strptime() does not. # Arrow has no default option for strptime (format, unit), # we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms", @@ -351,11 +351,9 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit arrow_not_supported("Time zone argument") } - t_unit <- make_valid_time_unit(unit,c("s" = TimeUnit$SECOND, "ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO, "ns" = TimeUnit$NANO)) + unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) Expression$create("strptime", x, - options = list( - format = format, - unit = t_unit)) + options = list(format = format, unit = unit)) } diff --git a/r/tests/testthat/test-dplyr-string-functions.R b/r/tests/testthat/test-dplyr-string-functions.R index 42ec93aa773..ea27aa14777 100644 --- a/r/tests/testthat/test-dplyr-string-functions.R +++ b/r/tests/testthat/test-dplyr-string-functions.R @@ -498,9 +498,8 @@ test_that("strptime", { t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) - t_stampPDT <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "PDT"), NA)) - expect_equivalent( + expect_equal( t_string %>% Table$create() %>% mutate( @@ -511,7 +510,7 @@ test_that("strptime", { check.tzone = FALSE ) - expect_equivalent( + expect_equal( t_string %>% Table$create() %>% mutate( @@ -522,18 +521,18 @@ test_that("strptime", { check.tzone = FALSE ) - expect_equivalent( + expect_equal( t_string %>% Table$create() %>% mutate( - x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = TimeUnit$NANO) + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns") ) %>% collect(), t_stamp, check.tzone = FALSE ) - expect_equivalent( + expect_equal( t_string %>% Table$create() %>% mutate( @@ -544,22 +543,10 @@ test_that("strptime", { check.tzone = FALSE ) - expect_equivalent( - t_string %>% - Table$create() %>% - mutate( - x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz="PDT") - ) %>% - collect(), - t_stamp, - check.tzone = FALSE - ) - tstring <- tibble(x = c("08-05-2008", NA)) - tstamp <- tibble(x = c(lubridate::mdy("08/05/2008"), NA)) - tstamp[[1]] <- as.POSIXct(tstamp[[1]]) + tstamp <- tibble(x = c(strptime("08-05-2008", format = "%m-%d-%Y"), NA)) - expect_equivalent( + expect_equal( tstring %>% Table$create() %>% mutate( @@ -571,3 +558,13 @@ test_that("strptime", { ) }) + +test_that("errors in strptime", { + # Error when tz is passed + + x <- Expression$field_ref("x") + expect_error( + nse_funcs$strptime(x, tz = "PDT"), + 'Time zone argument not supported by Arrow' + ) +}) From be6294f0d10421b0a60b1feefe27648ba76dc784 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 4 Jun 2021 14:38:40 -0500 Subject: [PATCH 6/6] tiny formatting change --- r/R/dplyr-functions.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index b91b3ed6c10..ec161d8361c 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -353,7 +353,5 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) - Expression$create("strptime", - x, - options = list(format = format, unit = unit)) + Expression$create("strptime", x, options = list(format = format, unit = unit)) }