Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
da4158a
added binding for `date()` + unit test + NEWS bullet point
dragosmg Feb 15, 2022
1bce2a8
namespace the call to `date()`
dragosmg Feb 15, 2022
3ea6beb
remove namespacing for `date()` inside the `mutate()` context
dragosmg Feb 15, 2022
96c6b3f
trying to mask `base::date()`
dragosmg Feb 15, 2022
27cf385
test 2
dragosmg Feb 15, 2022
4ef422d
test 2 for the `date()` masking issue
dragosmg Feb 15, 2022
35ddfec
added comment on namespacing
dragosmg Feb 15, 2022
53b91ac
skip on windows
dragosmg Feb 15, 2022
87d809c
point to relevant Jira
dragosmg Feb 17, 2022
3ea3048
test some unsupported inputs
dragosmg Feb 21, 2022
eea11c2
create an Array from vector + unit test
dragosmg Feb 21, 2022
0ffe555
comment + skip integer test
dragosmg Feb 22, 2022
5951160
oversight
dragosmg Feb 23, 2022
cefad69
use `build_expr()` when defining the `"date"` binding
dragosmg Feb 24, 2022
1d96572
typo
dragosmg Feb 24, 2022
6210956
using `cast_options()` to generate the `options` list
dragosmg Feb 24, 2022
9d487c0
extended `as.Date()` to support multiple input types + tests
dragosmg Feb 25, 2022
4255041
cleaned-up the testing for `date()`
dragosmg Feb 25, 2022
a3b0f11
clean-up
dragosmg Feb 25, 2022
ad49d2c
added `tryFormats` arg and handling for situations in which `x` is a …
dragosmg Feb 28, 2022
2c26ef7
figure out which one Windows doesn't like
dragosmg Feb 28, 2022
9b6a9e2
use interim step + skip test involving tzdb on Windows
dragosmg Feb 28, 2022
4105b82
update NEWS
dragosmg Feb 28, 2022
6e1be4e
moved unit test to the success block
dragosmg Mar 1, 2022
47f8119
remove the `interim_x` object and the additional step it introduced
dragosmg Mar 1, 2022
0b8cfe3
float -> integer with `safe = FALSE` & 2 additional unit tests (with …
dragosmg Mar 1, 2022
2be4e31
improved comments
dragosmg Mar 1, 2022
a280f45
switch back to `floor`
dragosmg Mar 1, 2022
81e9add
let `build_expr()` handle conversion to `Expression`
dragosmg Mar 1, 2022
af3d5cf
simplify building the strptime expression (unit = O)
dragosmg Mar 1, 2022
d09bee8
unit tests updates
dragosmg Mar 2, 2022
5da34fb
simplified `format` vs `tryFormats` implementation and moved the chec…
dragosmg Mar 2, 2022
4f9d0c1
moved arg checking at the top and reorganised the body of the functio…
dragosmg Mar 2, 2022
47587a9
removed redundant unit tests
dragosmg Mar 2, 2022
41e604f
Merge branch 'master' into lubridate_date
dragosmg Mar 2, 2022
fc7667c
support other timezones than `"UTC"` and update tests to reflect that
dragosmg Mar 3, 2022
036e349
update NEWS
dragosmg Mar 3, 2022
5788fd5
remove support for double and corresponding unit tests
dragosmg Mar 3, 2022
a549f4c
use `compare_dplyr_binding()` to test pull back to R
dragosmg Mar 3, 2022
1a9bbfd
added a couple of comments to improve quality
dragosmg Mar 3, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@
* `tz()` to extract/get timezone
* `semester()` to extract/get semester
* `dst()` to get daylight savings time indicator.
* `date()` to extract date
* `epiyear()` to get epiyear
* date-time functionality:
* `as.Date()` to convert to date

# arrow 7.0.0

Expand Down
3 changes: 3 additions & 0 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,7 @@ register_bindings_datetime <- function() {
return(semester)
}
})
register_binding("date", function(x) {
  # `date()` is expressed as a plain cast of the input to Arrow's date32 type
  date_cast_options <- list(to_type = date32())
  build_expr("cast", x, options = date_cast_options)
})
}
44 changes: 44 additions & 0 deletions r/R/dplyr-funcs-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,50 @@ register_bindings_type_cast <- function() {
register_binding("as.numeric", function(x) {
  # `as.numeric()` is expressed as a cast of the input to Arrow's float64 type
  to_float64 <- cast_options(to_type = float64())
  Expression$create("cast", x, options = to_float64)
})
# Binding for `as.Date()`: builds an expression that casts `x` to date32.
#
# How `x` is handled depends on its type:
#   * Date      - returned unchanged
#   * POSIXct   - cast to a timestamp in `tz` first, then to date32
#   * character - parsed with "strptime" using `format` (or the single
#                 `tryFormats` entry), then cast to date32
#   * double    - not supported, aborts
#   * anything else (e.g. integer) - cast directly to date32
#
# Arguments:
#   x          the object to convert
#   format     strptime format used for character input; when NULL the first
#              element of `tryFormats` is used instead
#   tryFormats candidate formats; more than one is only accepted when `format`
#              is supplied
#   origin     only "1970-01-01" is accepted for numeric input
#   tz         time zone a POSIXct input is converted to before the date is
#              extracted
register_binding("as.Date", function(x,
                                     format = NULL,
                                     tryFormats = "%Y-%m-%d",
                                     origin = "1970-01-01",
                                     tz = "UTC") {

  # the origin argument will be better supported once we implement temporal
  # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
  # TODO revisit once the above has been sorted
  # `&&` is the scalar, short-circuiting operator meant for `if` conditions:
  # `origin` is only inspected when `x` is numeric
  if (call_binding("is.numeric", x) && origin != "1970-01-01") {
    abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow")
  }

  # this could be improved with tryFormats once strptime returns NA and we
  # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659
  # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done
  if (is.null(format) && length(tryFormats) > 1) {
    abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
  }

  if (call_binding("is.Date", x)) {
    # already a date, nothing to convert
    return(x)
  } else if (call_binding("is.POSIXct", x)) {
    # cast from POSIXct
    # base::as.Date() first converts to the desired timezone and then extracts
    # the date, which is why we need to go through timestamp() first
    x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz)))
  } else if (call_binding("is.character", x)) {
    # cast from character
    format <- format %||% tryFormats[[1]]
    # unit = 0L is the identifier for seconds in valid_time32_units
    x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
  } else if (call_binding("is.numeric", x) && !call_binding("is.integer", x)) {
    # cast from numeric
    # Arrow does not support direct casting from double to date32()
    # https://issues.apache.org/jira/browse/ARROW-15798
    # TODO revisit if arrow decides to support double -> date casting
    abort("`as.Date()` with double/float is not supported in Arrow")
  }

  # every remaining path ends with a cast of (the possibly pre-processed) x
  build_expr("cast", x, options = cast_options(to_type = date32()))
})

register_binding("is", function(object, class2) {
if (is.string(class2)) {
Expand Down
85 changes: 85 additions & 0 deletions r/tests/testthat/test-dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -819,3 +819,88 @@ test_that("dst extracts daylight savings time correctly", {
test_df
)
})

test_that("date works in arrow", {
# Exercises the `date()` and `as.Date()` bindings on a POSIXct column, on a
# plain R date-time object and on an integer column.
# Skipped on Windows, see https://issues.apache.org/jira/browse/ARROW-13168
skip_on_os("windows")
# The timestamp is chosen on purpose: 23:12 in America/New_York is already the
# next calendar day in UTC, so lubridate::date() and base::as.Date() disagree
# on it (as.Date() returns the UTC date, date() does not).
test_df <- tibble(
posixct_date = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York"),
integer_var = c(32L, NA))

# an R object (not a column of test_df) that is passed to the bindings below
r_date_object <- lubridate::ymd_hms("2012-03-26 23:12:13")

# We can't (for now) use namespacing, so we need to make sure lubridate::date()
# and not base::date() is being used. This is due to the way testthat runs;
# normal use of arrow would not have to do this explicitly.
# TODO remove once https://issues.apache.org/jira/browse/ARROW-14575 is done
date <- lubridate::date

# date() on a POSIXct column
compare_dplyr_binding(
.input %>%
mutate(a_date = date(posixct_date)) %>%
collect(),
test_df
)

# as.Date() on a POSIXct column
compare_dplyr_binding(
.input %>%
mutate(a_date_base = as.Date(posixct_date)) %>%
collect(),
test_df
)

# date() on an R object defined outside the data
compare_dplyr_binding(
.input %>%
mutate(date_from_r_object = date(r_date_object)) %>%
collect(),
test_df
)

# as.Date() on an R object defined outside the data
compare_dplyr_binding(
.input %>%
mutate(as_date_from_r_object = as.Date(r_date_object)) %>%
collect(),
test_df
)

# date() from integer is supported in Arrow (similar to base::as.Date()), with
# a fixed origin of "1970-01-01". lubridate::date(integer_var) errors without
# an `origin`, so there is no dplyr result to compare against; the Arrow
# result is checked against a hand-built tibble instead (32 days after
# 1970-01-01 is 1970-02-02).
expect_equal(
test_df %>%
arrow_table() %>%
select(integer_var) %>%
mutate(date_int = date(integer_var)) %>%
collect(),
tibble(integer_var = c(32L, NA),
date_int = as.Date(c("1970-02-02", NA)))
)
})

test_that("date() errors with unsupported inputs", {
# Each pipeline passes `date()` an input type that cannot be cast to date32
# and is expected to fail with the corresponding "Unsupported cast" message.

# character input
expect_error(
example_data %>%
arrow_table() %>%
mutate(date_char = date("2022-02-25 00:00:01")) %>%
collect(),
regexp = "Unsupported cast from string to date32 using function cast_date32"
)

# logical input
expect_error(
example_data %>%
arrow_table() %>%
mutate(date_bool = date(TRUE)) %>%
collect(),
regexp = "Unsupported cast from bool to date32 using function cast_date32"
)

# double input
expect_error(
example_data %>%
arrow_table() %>%
mutate(date_double = date(34.56)) %>%
collect(),
regexp = "Unsupported cast from double to date32 using function cast_date32"
)
})
75 changes: 75 additions & 0 deletions r/tests/testthat/test-dplyr-funcs-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ skip_if_not_available("dataset")

library(dplyr, warn.conflicts = FALSE)
suppressPackageStartupMessages(library(bit64))
suppressPackageStartupMessages(library(lubridate))


tbl <- example_data
Expand Down Expand Up @@ -767,4 +768,78 @@ test_that("nested structs can be created from scalars and existing data frames",
collect(),
tibble(a = 1:2)
)

})

# Covers the `as.Date()` binding: the supported input types first, then the
# argument combinations and input types that are not supported in Arrow.
test_that("as.Date() converts from date, timestamp, integer and char; double is unsupported", {
  test_df <- tibble::tibble(
    posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"),
    date_var = as.Date("2022-02-25"),
    character_ymd_var = "2022-02-25 00:00:01",
    character_ydm_var = "2022/25/02 00:00:01",
    integer_var = 32L,
    double_var = 34.56
  )

  # supported conversions: Date, character (with an explicit format), integer
  # Casting from POSIXct is tested on its own at the end of this test so that
  # only that part is skipped on Windows.
  # TODO merge the POSIXct test into this call once ARROW-13168 is done
  compare_dplyr_binding(
    .input %>%
      mutate(
        date_dv = as.Date(date_var),
        date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"),
        date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"),
        date_int = as.Date(integer_var, origin = "1970-01-01")
      ) %>%
      collect(),
    test_df
  )

  # currently we do not support an origin different to "1970-01-01"
  # NOTE(review): `warning = TRUE` presumably asserts that the Arrow path
  # warns before the results are compared - confirm against the helper
  compare_dplyr_binding(
    .input %>%
      mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>%
      collect(),
    test_df,
    warning = TRUE
  )

  # we do not support multiple tryFormats
  compare_dplyr_binding(
    .input %>%
      mutate(date_char_ymd = as.Date(character_ymd_var,
        tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>%
      collect(),
    test_df,
    warning = TRUE
  )

  # the default format ("%Y-%m-%d") does not match a string that also carries
  # a time component, so parsing fails
  expect_error(
    test_df %>%
      arrow_table() %>%
      mutate(date_char_ymd = as.Date(character_ymd_var)) %>%
      collect(),
    regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]",
    fixed = TRUE
  )

  # we do not support as.Date() with double/ float
  compare_dplyr_binding(
    .input %>%
      mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>%
      collect(),
    test_df,
    warning = TRUE
  )

  # casting from POSIXct, with the default and with an explicit time zone
  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
  compare_dplyr_binding(
    .input %>%
      mutate(
        date_pv = as.Date(posixct_var),
        date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas")
      ) %>%
      collect(),
    test_df
  )
})