Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -618,4 +618,40 @@ register_bindings_datetime_parsers <- function() {
# Register one binding per name in `ymd_parser_vec`; the implementation for
# each name is produced by `ymd_parser_map_factory()`.
for (ymd_order in ymd_parser_vec) {
register_binding(ymd_order, ymd_parser_map_factory(ymd_order))
}

# Binding for lubridate's `fast_strptime()`.
#
# Builds one "strptime" expression per element of `format`, combines them with
# "coalesce", and finally applies "assume_timezone" with `tz`.
#
# Arguments:
#   x            the input to parse; passed unchanged to every "strptime" call
#   format       one or more format strings; one parse attempt is built for
#                each, in the order given
#   tz           time zone handed to "assume_timezone" (default "UTC")
#   lt           `TRUE` is rejected via `arrow_not_supported()`
#   cutoff_2000  any value other than 68L is rejected via
#                `arrow_not_supported()`
register_binding("fast_strptime", function(x,
                                           format,
                                           tz = "UTC",
                                           lt = FALSE,
                                           cutoff_2000 = 68L) {
  # `lt` controls the output type: `lt = TRUE` returns a POSIXlt (which doesn't
  # play well with mutate, for example)
  if (lt) {
    arrow_not_supported("`lt = TRUE` argument")
  }

  # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16596 is done
  if (cutoff_2000 != 68L) {
    arrow_not_supported("`cutoff_2000` != 68L argument")
  }

  # One parse attempt per supplied format. `error_is_null = TRUE` is set so
  # that the attempts can be combined by "coalesce" below.
  # NOTE(review): `unit = 0L` is presumably the seconds time unit - confirm
  # against Arrow's TimeUnit values.
  parse_attempt_expressions <- map(
    format,
    ~ build_expr(
      "strptime",
      x,
      options = list(
        format = .x,
        unit = 0L,
        error_is_null = TRUE
      )
    )
  )

  coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)

  build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
})
}
130 changes: 130 additions & 0 deletions r/tests/testthat/test-dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -1812,3 +1812,133 @@ test_that("ym, my & yq parsers", {
test_df
)
})

test_that("lubridate's fast_strptime", {

# a single format applied to a string column that also contains an NA
compare_dplyr_binding(
  .input %>%
    mutate(
      y =
        fast_strptime(
          x,
          format = "%Y-%m-%d %H:%M:%S",
          lt = FALSE
        )
    ) %>%
    collect(),
  tibble(
    x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA)
  )
)

# R object: the value being parsed is a literal string, not a column (the `x`
# column of the input is not referenced by the expression)
compare_dplyr_binding(
  .input %>%
    mutate(
      y =
        fast_strptime(
          "68-10-07 19:04:05",
          format = "%y-%m-%d %H:%M:%S",
          lt = FALSE
        )
    ) %>%
    collect(),
  tibble(
    x = c("2018-10-07 19:04:05", NA)
  )
)

# several candidate formats are supplied; each input string matches a
# different one of them
multi_format_strings <- tibble(
  x = c("2018-10-07 19:04:05", "10-07-1968 19:04:05")
)
compare_dplyr_binding(
  .input %>%
    mutate(
      date_multi_formats = fast_strptime(
        x,
        format = c("%Y-%m-%d %H:%M:%S", "%m-%d-%Y %H:%M:%S"),
        lt = FALSE
      )
    ) %>%
    collect(),
  multi_format_strings
)

# a non-default time zone is supplied through `tz`
tz_strings <- tibble(
  dttm_as_string = c("2018-10-07 19:04:05", "1969-10-07 19:04:05", NA)
)
compare_dplyr_binding(
  .input %>%
    mutate(
      dttm_with_tz = fast_strptime(
        dttm_as_string,
        format = "%Y-%m-%d %H:%M:%S",
        tz = "Pacific/Marquesas",
        lt = FALSE
      )
    ) %>%
    collect(),
  tz_strings
)

# fast_strptime()'s `cutoff_2000` argument is not supported, but its value is
# implicitly set to 68L both in lubridate and in Arrow. The inputs use the
# two-digit years 68 and 69, i.e. the cutoff value itself and the value just
# above it.
compare_dplyr_binding(
.input %>%
mutate(
date_short_year =
fast_strptime(
x,
format = "%y-%m-%d %H:%M:%S",
lt = FALSE
)
) %>%
collect(),
tibble(
x =
c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
)
Comment on lines +1903 to +1905
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should remove these commented lines, yeah?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, sorry. I forgot about those. Should I open a minor PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or should I just do it in one of the other PRs I have going?

Copy link
Contributor Author

@dragosmg dragosmg May 19, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jonkeane I removed the tests in dragosmg@bde85ce

)

# a `cutoff_2000` other than 68L is rejected by the arrow binding, so this
# comparison is run with `warning = TRUE`
short_year_strings <- tibble(
  x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
)
compare_dplyr_binding(
  .input %>%
    mutate(
      date_short_year = fast_strptime(
        x,
        format = "%y-%m-%d %H:%M:%S",
        lt = FALSE,
        cutoff_2000 = 69L
      )
    ) %>%
    collect(),
  short_year_strings,
  warning = TRUE
)

# `lt = TRUE` cannot go through compare_dplyr_binding(): it returns a list and
# it also errors in regular dplyr pipelines, so only the arrow pipeline is run
lt_true_strings <- tibble(
  x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
)
expect_warning(
  lt_true_strings %>%
    arrow_table() %>%
    mutate(
      date_short_year = fast_strptime(
        x,
        format = "%y-%m-%d %H:%M:%S",
        lt = TRUE
      )
    ) %>%
    collect()
)
})