Skip to content
Merged
50 changes: 41 additions & 9 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,25 +28,57 @@ register_bindings_datetime <- function() {
}

register_bindings_datetime_utility <- function() {
register_binding("strptime", function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL,
register_binding("strptime", function(x,
format = "%Y-%m-%d %H:%M:%S",
tz = "",
unit = "ms") {
# Arrow uses unit for time parsing, strptime() does not.
# Arrow has no default option for strptime (format, unit),
# we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms",
# (ARROW-12809)

# ParseTimestampStrptime currently ignores the timezone information (ARROW-12820).
# Stop if tz is provided.
if (is.character(tz)) {
arrow_not_supported("Time zone argument")
}
unit <- make_valid_time_unit(
unit,
c(valid_time64_units, valid_time32_units)
)

output <- build_expr(
"strptime",
x,
options =
list(
format = format,
unit = unit,
error_is_null = TRUE
)
)

unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units))
if (tz == "") {
tz <- Sys.timezone()
}

build_expr("strptime", x, options = list(format = format, unit = unit, error_is_null = TRUE))
# if a timestamp does not contain timezone information (i.e. it is
# "timezone-naive") we can attach timezone information (i.e. convert it into
# a "timezone-aware" timestamp) with `assume_timezone`
# if we want to cast to a different timezone, we can only do it for
# timezone-aware timestamps, not for timezone-naive ones
if (!is.null(tz)) {
output <- build_expr(
"assume_timezone",
output,
options =
list(
timezone = tz
)
)
}
output
})

register_binding("strftime", function(x, format = "", tz = "", usetz = FALSE) {
register_binding("strftime", function(x,
format = "",
tz = "",
usetz = FALSE) {
if (usetz) {
format <- paste(format, "%Z")
}
Expand Down
67 changes: 45 additions & 22 deletions r/tests/testthat/test-dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,50 +49,82 @@ test_df <- tibble::tibble(

test_that("strptime", {
t_string <- tibble(x = c("2018-10-07 19:04:05", NA))
t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
# lubridate defaults to "UTC" as timezone => t_stamp is in "UTC"
t_stamp_with_utc_tz <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
t_stamp_with_pm_tz <- tibble(
x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA)
)

# base::strptime returns a POSIXlt (a list) => we cannot use compare_dplyr_binding
# => we use expect_equal for the tests below

withr::with_timezone("Pacific/Marquesas", {
# the default value for strptime's `tz` argument is "", which is interpreted
# as the current timezone. we test here if the strptime binding picks up
# correctly the current timezone (similarly to the base R version)
expect_equal(
t_string %>%
record_batch() %>%
mutate(
x = strptime(x, format = "%Y-%m-%d %H:%M:%S")
) %>%
collect(),
t_stamp_with_pm_tz
)
})

# adding a timezone to a timezone-naive timestamp works
# and since our TZ when running the test is (typically) Pacific/Marquesas
# this also tests that assigning a TZ different from the current session one
# works as expected
expect_equal(
t_string %>%
arrow_table() %>%
mutate(
x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "Pacific/Marquesas")
) %>%
collect(),
t_stamp_with_pm_tz
)

expect_equal(
t_string %>%
Table$create() %>%
mutate(
x = strptime(x)
x = strptime(x, tz = "UTC")
) %>%
collect(),
t_stamp,
ignore_attr = "tzone"
t_stamp_with_utc_tz
)

expect_equal(
t_string %>%
Table$create() %>%
mutate(
x = strptime(x, format = "%Y-%m-%d %H:%M:%S")
x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
) %>%
collect(),
t_stamp,
ignore_attr = "tzone"
t_stamp_with_utc_tz
)

expect_equal(
t_string %>%
Table$create() %>%
mutate(
x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns")
x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns", tz = "UTC")
) %>%
collect(),
t_stamp,
ignore_attr = "tzone"
t_stamp_with_utc_tz
)

expect_equal(
t_string %>%
Table$create() %>%
mutate(
x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s")
x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s", tz = "UTC")
) %>%
collect(),
t_stamp,
ignore_attr = "tzone"
t_stamp_with_utc_tz
)

tstring <- tibble(x = c("08-05-2008", NA))
Expand All @@ -111,15 +143,6 @@ test_that("strptime", {
)
})

test_that("errors in strptime", {
# Error when tz is passed
x <- Expression$field_ref("x")
expect_error(
call_binding("strptime", x, tz = "PDT"),
"Time zone argument not supported in Arrow"
)
})

test_that("strptime returns NA when format doesn't match the data", {
df <- tibble(
str_date = c("2022-02-07", "2012/02-07", "1975/01-02", "1981/01-07", NA)
Expand Down