From 596bae7ee9d401940be8cd0ab85b98656bee2875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 10:13:43 +0100 Subject: [PATCH 1/9] moved datetime helper functions from `R/dplyr-funcs-datetime.R` to `R/dplyr-datetime-helpers.R` --- r/DESCRIPTION | 1 + r/R/dplyr-datetime-helpers.R | 130 +++++++++++++++++++++++++++++++++++ r/R/dplyr-funcs-datetime.R | 129 ---------------------------------- 3 files changed, 131 insertions(+), 129 deletions(-) create mode 100644 r/R/dplyr-datetime-helpers.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 8c751520a78..29f5ee29321 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -91,6 +91,7 @@ Collate: 'dplyr-arrange.R' 'dplyr-collect.R' 'dplyr-count.R' + 'dplyr-datetime-helpers.R' 'dplyr-distinct.R' 'dplyr-eval.R' 'dplyr-filter.R' diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R new file mode 100644 index 00000000000..03c06a6ac80 --- /dev/null +++ b/r/R/dplyr-datetime-helpers.R @@ -0,0 +1,130 @@ +.helpers_function_map <- list( + "dminutes" = list(60, "s"), + "dhours" = list(3600, "s"), + "ddays" = list(86400, "s"), + "dweeks" = list(604800, "s"), + "dmonths" = list(2629800, "s"), + "dyears" = list(31557600, "s"), + "dseconds" = list(1, "s"), + "dmilliseconds" = list(1, "ms"), + "dmicroseconds" = list(1, "us"), + "dnanoseconds" = list(1, "ns") +) +make_duration <- function(x, unit) { + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + x$cast(duration(unit)) +} + +binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { + if (usetz) { + format <- paste(format, "%Z") + } + + if (call_binding("is.POSIXct", x)) { + # the casting part might not be required once + # https://issues.apache.org/jira/browse/ARROW-14442 is solved + # TODO revisit the steps below once the PR for that issue is merged + if (tz == "" && x$type()$timezone() != "") { + tz <- x$type()$timezone() + } else if (tz == "") { + tz <- Sys.timezone() + } + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) + } + + build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) +} + +# this is a helper function used for creating a difftime / duration objects from +# several of the accepted pieces (second, minute, hour, day, week) +duration_from_chunks <- function(chunks) { + accepted_chunks <- c("second", "minute", "hour", "day", "week") + matched_chunks <- accepted_chunks[pmatch(names(chunks), accepted_chunks, duplicates.ok = TRUE)] + + if (any(is.na(matched_chunks))) { + abort( + paste0( + "named `difftime` units other than: ", + oxford_paste(accepted_chunks, quote_symbol = "`"), + " not supported in Arrow. \nInvalid `difftime` parts: ", + oxford_paste(names(chunks[is.na(matched_chunks)]), quote_symbol = "`") + ) + ) + } + + matched_chunks <- matched_chunks[!is.na(matched_chunks)] + + chunks <- chunks[matched_chunks] + chunk_duration <- c( + "second" = 1L, + "minute" = 60L, + "hour" = 3600L, + "day" = 86400L, + "week" = 604800L + ) + + # transform the duration of each chunk in seconds and add everything together + duration <- 0 + for (chunk in names(chunks)) { + duration <- duration + chunks[[chunk]] * chunk_duration[[chunk]] + } + duration +} + + +binding_as_date <- function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01") { + + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") + } + + if (call_binding("is.Date", x)) { + return(x) + + # cast from character + } else if (call_binding("is.character", x)) { + x <- binding_as_date_character(x, format, tryFormats) + + # cast from numeric + } else if (call_binding("is.numeric", x)) { + x <- binding_as_date_numeric(x, origin) + } + + build_expr("cast", x, options = cast_options(to_type = date32())) +} + +binding_as_date_character <- function(x, + format = NULL, + tryFormats = "%Y-%m-%d") { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + build_expr("strptime", x, options = list(format = format, unit = 0L)) +} + +binding_as_date_numeric <- function(x, origin = "1970-01-01") { + + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + if (!call_binding("is.integer", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + + if (origin != "1970-01-01") { + delta_in_sec <- call_binding("difftime", origin, "1970-01-01") + # TODO: revisit once either of these issues is addressed: + # https://issues.apache.org/jira/browse/ARROW-16253 (helper function for + # casting from double to duration) or + # https://issues.apache.org/jira/browse/ARROW-15862 (casting from int32 + # -> duration or double -> duration) + delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64())) + delta_in_days <- (delta_in_sec / 86400L)$cast(int32()) + x <- build_expr("+", x, delta_in_days) + } + + x +} diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 4f2517e8efc..9476e817aac 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -359,22 +359,6 @@ register_bindings_duration <- function() { }) } -.helpers_function_map <- list( - "dminutes" = list(60, "s"), - "dhours" = list(3600, "s"), - "ddays" = list(86400, "s"), - "dweeks" = list(604800, "s"), - "dmonths" = list(2629800, "s"), - "dyears" = list(31557600, "s"), - "dseconds" = list(1, "s"), - "dmilliseconds" = list(1, "ms"), - "dmicroseconds" = list(1, "us"), - "dnanoseconds" = list(1, "ns") -) -make_duration <- function(x, unit) { - x <- build_expr("cast", x, options = cast_options(to_type = int64())) - x$cast(duration(unit)) -} register_bindings_duration_helpers <- function() { duration_helpers_map_factory <- function(value, unit) { force(value) @@ -426,116 +410,3 @@ register_bindings_difftime_constructors <- function() { duration$cast(duration("s")) }) } - -binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) { - if (usetz) { - format <- paste(format, "%Z") - } - - if (call_binding("is.POSIXct", x)) { - # the casting part might not be required once - # https://issues.apache.org/jira/browse/ARROW-14442 is solved - # TODO revisit the steps below once the PR for that issue is merged - if (tz == "" && x$type()$timezone() != "") { - tz <- x$type()$timezone() - } else if (tz == "") { - tz <- Sys.timezone() - } - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(x$type()$unit(), tz))) - } - - build_expr("strftime", x, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) -} - -# this is a helper function used for creating a difftime / duration objects from -# several of the accepted pieces (second, minute, hour, day, week) -duration_from_chunks <- function(chunks) { - accepted_chunks <- c("second", "minute", "hour", "day", "week") - matched_chunks <- accepted_chunks[pmatch(names(chunks), accepted_chunks, duplicates.ok = TRUE)] - - if (any(is.na(matched_chunks))) { - abort( - paste0( - "named `difftime` units other than: ", - oxford_paste(accepted_chunks, quote_symbol = "`"), - " not supported in Arrow. \nInvalid `difftime` parts: ", - oxford_paste(names(chunks[is.na(matched_chunks)]), quote_symbol = "`") - ) - ) - } - - matched_chunks <- matched_chunks[!is.na(matched_chunks)] - - chunks <- chunks[matched_chunks] - chunk_duration <- c( - "second" = 1L, - "minute" = 60L, - "hour" = 3600L, - "day" = 86400L, - "week" = 604800L - ) - - # transform the duration of each chunk in seconds and add everything together - duration <- 0 - for (chunk in names(chunks)) { - duration <- duration + chunks[[chunk]] * chunk_duration[[chunk]] - } - duration -} - -binding_as_date <- function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01") { - - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from character - } else if (call_binding("is.character", x)) { - x <- binding_as_date_character(x, format, tryFormats) - - # cast from numeric - } else if (call_binding("is.numeric", x)) { - x <- binding_as_date_numeric(x, origin) - } - - build_expr("cast", x, options = cast_options(to_type = date32())) -} - -binding_as_date_character <- function(x, - format = NULL, - tryFormats = "%Y-%m-%d") { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - build_expr("strptime", x, options = list(format = format, unit = 0L)) -} - -binding_as_date_numeric <- function(x, origin = "1970-01-01") { - - # Arrow does not support direct casting from double to date32(), but for - # integer-like values we can go via int32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - if (!call_binding("is.integer", x)) { - x <- build_expr("cast", x, options = cast_options(to_type = int32())) - } - - if (origin != "1970-01-01") { - delta_in_sec <- call_binding("difftime", origin, "1970-01-01") - # TODO: revisit once either of these issues is addressed: - # https://issues.apache.org/jira/browse/ARROW-16253 (helper function for - # casting from double to duration) or - # https://issues.apache.org/jira/browse/ARROW-15862 (casting from int32 - # -> duration or double -> duration) - delta_in_sec <- build_expr("cast", delta_in_sec, options = cast_options(to_type = int64())) - delta_in_days <- (delta_in_sec / 86400L)$cast(int32()) - x <- build_expr("+", x, delta_in_days) - } - - x -} From c379fd477e862634755cbbe4ffeb6d1a278082d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 10:22:07 +0100 Subject: [PATCH 2/9] added `checked_time_locale()` to `dplyr-datetime-helpers.R` --- r/R/dplyr-datetime-helpers.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 03c06a6ac80..48f37a7bb81 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -1,3 +1,14 @@ + + +check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) { + if (tolower(Sys.info()[["sysname"]]) == "windows" & locale != "C") { + # MingW C++ std::locale only supports "C" and "POSIX" + stop(paste0("On Windows, time locales other than 'C' are not supported in Arrow. ", + "Consider setting `Sys.setlocale('LC_TIME', 'C')`")) + } + locale +} + .helpers_function_map <- list( "dminutes" = list(60, "s"), "dhours" = list(3600, "s"), From 854a8259bfec5f07be021167e61d9bc2cebc1d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 10:22:25 +0100 Subject: [PATCH 3/9] added file header --- r/R/dplyr-datetime-helpers.R | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 48f37a7bb81..9acf8b18435 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -1,4 +1,19 @@ - +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) { if (tolower(Sys.info()[["sysname"]]) == "windows" & locale != "C") { From 04090aa272e29d604744502eab025df1523ef81d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 10:24:17 +0100 Subject: [PATCH 4/9] removed `check_time_locale()` from `dplyr-funcs-datetime.R` --- r/R/dplyr-funcs-datetime.R | 9 --------- 1 file changed, 9 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 9476e817aac..f1bb8ba3175 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -15,15 +15,6 @@ # specific language governing permissions and limitations # under the License. -check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) { - if (tolower(Sys.info()[["sysname"]]) == "windows" & locale != "C") { - # MingW C++ std::locale only supports "C" and "POSIX" - stop(paste0("On Windows, time locales other than 'C' are not supported in Arrow. ", - "Consider setting `Sys.setlocale('LC_TIME', 'C')`")) - } - locale -} - register_bindings_datetime <- function() { register_binding("strptime", function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") { From 3748e9286e206750c79e3f99a00b9d08eb9c872b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 11:02:16 +0100 Subject: [PATCH 5/9] move `leap_year` to `.unary_function_map()` --- r/R/dplyr-funcs-datetime.R | 4 ---- r/R/expression.R | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index f1bb8ba3175..82343df524b 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -157,10 +157,6 @@ register_bindings_datetime <- function() { (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP")]) }) - register_binding("leap_year", function(date) { - Expression$create("is_leap_year", date) - }) - register_binding("am", function(x) { hour <- Expression$create("hour", x) hour < 12 diff --git a/r/R/expression.R b/r/R/expression.R index eb37950c34a..be43de01e1c 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -75,6 +75,7 @@ "mday" = "day", "yday" = "day_of_year", "year" = "year", + "leap_year" = "is_leap_year", # type conversion functions "as.factor" = "dictionary_encode" From c96cf24e2193e7026caf6017f58b27053c17f689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 13:40:44 +0100 Subject: [PATCH 6/9] moved unit tests for `as.Date()`, `as_date()`, and `as_datetime()` to `test-dplyr-funcs-datetime.R` --- r/tests/testthat/test-dplyr-funcs-datetime.R | 155 +++++++++++++++++++ r/tests/testthat/test-dplyr-funcs-type.R | 155 ------------------- 2 files changed, 155 insertions(+), 155 deletions(-) diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index a4c5ee3c224..e2ca36f8fae 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1471,3 +1471,158 @@ test_that("make_difftime()", { ) ) }) + +test_that("`as.Date()` and `as_date()`", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), + dt_europe = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"), + dt_utc = ymd_hms("2010-08-03 00:50:50"), + date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + compare_dplyr_binding( + .input %>% + mutate( + date_dv1 = as.Date(date_var), + date_pv1 = as.Date(posixct_var), + date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"), + date_utc1 = as.Date(dt_utc), + date_europe1 = as.Date(dt_europe), + date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int1 = as.Date(integer_var, origin = "1970-01-01"), + date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"), + date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"), + date_dv2 = as_date(date_var), + date_pv2 = as_date(posixct_var), + date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"), + date_utc2 = as_date(dt_utc), + date_europe2 = as_date(dt_europe), + date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int2 = as_date(integer_var, origin = "1970-01-01"), + date_int_origin2 = as_date(integer_var, origin = "1970-01-03"), + date_integerish2 = as_date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # we do not support multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% + collect(), + test_df, + warning = TRUE + ) + + # strptime does not support a partial format - testing an error surfaced from + # C++ (hence not testing the content of the error message) + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + collect(), + regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", + fixed = TRUE + ) + + + # we do not support as.Date() with double/ float (error surfaced from C++) + # TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + # we do not support as_date with double/ float (error surfaced from C++) + # TODO: revisit after https://issues.apache.org/jira/browse/ARROW-15798 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) +}) + +test_that("`as_datetime()`", { + test_df <- tibble( + date = as.Date(c("2022-03-22", "2021-07-30", NA)), + char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), + char_date_non_iso = c("2022-22-03 12:34:56", "2021-30-07 14:32:47", NA), + int_date = c(10L, 25L, NA), + integerish_date = c(10, 25, NA), + double_date = c(10.1, 25.2, NA) + ) + + test_df %>% + arrow_table() %>% + mutate( + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dchar_date_non_iso = as_datetime(char_date_non_iso, format = "%Y-%d-%m %H:%M:%S"), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") + ) %>% + collect() + + compare_dplyr_binding( + .input %>% + mutate( + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # Arrow does not support conversion of double to date + # the below should error with an error message originating in the C++ code + expect_error( + test_df %>% + arrow_table() %>% + mutate( + ddouble_date = as_datetime(double_date) + ) %>% + collect(), + regexp = "Float value 10.1 was truncated converting to int64" + ) +}) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 6a07d36e818..83bfe2873a2 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -802,161 +802,6 @@ test_that("nested structs can be created from scalars and existing data frames", }) -test_that("`as.Date()` and `as_date()`", { - test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), - dt_europe = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"), - dt_utc = ymd_hms("2010-08-03 00:50:50"), - date_var = as.Date("2022-02-25"), - difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", - integer_var = 32L, - integerish_var = 32, - double_var = 34.56 - ) - - compare_dplyr_binding( - .input %>% - mutate( - date_dv1 = as.Date(date_var), - date_pv1 = as.Date(posixct_var), - date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"), - date_utc1 = as.Date(dt_utc), - date_europe1 = as.Date(dt_europe), - date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int1 = as.Date(integer_var, origin = "1970-01-01"), - date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"), - date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"), - date_dv2 = as_date(date_var), - date_pv2 = as_date(posixct_var), - date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"), - date_utc2 = as_date(dt_utc), - date_europe2 = as_date(dt_europe), - date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int2 = as_date(integer_var, origin = "1970-01-01"), - date_int_origin2 = as_date(integer_var, origin = "1970-01-03"), - date_integerish2 = as_date(integerish_var, origin = "1970-01-01") - ) %>% - collect(), - test_df - ) - - # we do not support multiple tryFormats - compare_dplyr_binding( - .input %>% - mutate(date_char_ymd = as.Date(character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% - collect(), - test_df, - warning = TRUE - ) - - # strptime does not support a partial format - testing an error surfaced from - # C++ (hence not testing the content of the error message) - # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd = as_date(character_ymd_var)) %>% - collect() - ) - - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd = as.Date(character_ymd_var)) %>% - collect(), - regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", - fixed = TRUE - ) - - - # we do not support as.Date() with double/ float (error surfaced from C++) - # TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798 - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect() - ) - - # we do not support as_date with double/ float (error surfaced from C++) - # TODO: revisit after https://issues.apache.org/jira/browse/ARROW-15798 - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% - collect() - ) - - # difference between as.Date() and as_date(): - #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg - # to `as.Date()` - # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object - # passsed if`tz` is NULL - compare_dplyr_binding( - .input %>% - transmute( - date_diff_lubridate = as_date(difference_date), - date_diff_base = as.Date(difference_date) - ) %>% - collect(), - test_df - ) -}) - -test_that("`as_datetime()`", { - test_df <- tibble( - date = as.Date(c("2022-03-22", "2021-07-30", NA)), - char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), - char_date_non_iso = c("2022-22-03 12:34:56", "2021-30-07 14:32:47", NA), - int_date = c(10L, 25L, NA), - integerish_date = c(10, 25, NA), - double_date = c(10.1, 25.2, NA) - ) - - test_df %>% - arrow_table() %>% - mutate( - ddate = as_datetime(date), - dchar_date_no_tz = as_datetime(char_date), - dchar_date_non_iso = as_datetime(char_date_non_iso, format = "%Y-%d-%m %H:%M:%S"), - dint_date = as_datetime(int_date, origin = "1970-01-02"), - dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), - dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") - ) %>% - collect() - - compare_dplyr_binding( - .input %>% - mutate( - ddate = as_datetime(date), - dchar_date_no_tz = as_datetime(char_date), - dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), - dint_date = as_datetime(int_date, origin = "1970-01-02"), - dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), - dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") - ) %>% - collect(), - test_df - ) - - # Arrow does not support conversion of double to date - # the below should error with an error message originating in the C++ code - expect_error( - test_df %>% - arrow_table() %>% - mutate( - ddouble_date = as_datetime(double_date) - ) %>% - collect(), - regexp = "Float value 10.1 was truncated converting to int64" - ) -}) - test_that("format date/time", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 # In 3.4 the lack of tzone attribute causes spurious failures From eab713c5473745c570b33382f44bf942f8176b11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 13:56:47 +0100 Subject: [PATCH 7/9] moved `as.Date()`, `as_date()`, and `as_datetime()` to `dplyr-funcs-datetime.R` and re-structured the registering functions --- r/R/dplyr-funcs-datetime.R | 232 +++++++++++++++++++++++++------------ r/R/dplyr-funcs-type.R | 65 ----------- 2 files changed, 160 insertions(+), 137 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 82343df524b..64ce1e0bb25 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -15,7 +15,18 @@ # specific language governing permissions and limitations # under the License. +# Split up into several register functions by category to reduce cyclomatic +# complexity (linter) register_bindings_datetime <- function() { + register_bindings_datetime_utility() + register_bindings_datetime_components() + register_bindings_datetime_conversion() + register_bindings_duration() + register_bindings_duration_constructor() + register_bindings_duration_helpers() +} + +register_bindings_datetime_utility <- function() { register_binding("strptime", function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") { # Arrow uses unit for time parsing, strptime() does not. @@ -82,6 +93,29 @@ register_bindings_datetime <- function() { Expression$create("strftime", x, options = list(format = format, locale = "C")) }) + register_binding("is.Date", function(x) { + inherits(x, "Date") || + (inherits(x, "Expression") && x$type_id() %in% Type[c("DATE32", "DATE64")]) + }) + + is_instant_binding <- function(x) { + inherits(x, c("POSIXt", "POSIXct", "POSIXlt", "Date")) || + (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")]) + } + register_binding("is.instant", is_instant_binding) + register_binding("is.timepoint", is_instant_binding) + + register_binding("is.POSIXct", function(x) { + inherits(x, "POSIXct") || + (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP")]) + }) + + register_binding("date", function(x) { + build_expr("cast", x, options = list(to_type = date32())) + }) +} + +register_bindings_datetime_components <- function() { register_binding("second", function(x) { Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x)) }) @@ -140,23 +174,6 @@ register_bindings_datetime <- function() { build_expr("month", x) }) - register_binding("is.Date", function(x) { - inherits(x, "Date") || - (inherits(x, "Expression") && x$type_id() %in% Type[c("DATE32", "DATE64")]) - }) - - is_instant_binding <- function(x) { - inherits(x, c("POSIXt", "POSIXct", "POSIXlt", "Date")) || - (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP", "DATE32", "DATE64")]) - } - register_binding("is.instant", is_instant_binding) - register_binding("is.timepoint", is_instant_binding) - - register_binding("is.POSIXct", function(x) { - inherits(x, "POSIXct") || - (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP")]) - }) - register_binding("am", function(x) { hour <- Expression$create("hour", x) hour < 12 @@ -187,12 +204,9 @@ register_bindings_datetime <- function() { return(semester) } }) - register_binding("date", function(x) { - build_expr("cast", x, options = list(to_type = date32())) - }) } -register_bindings_duration <- function() { +register_bindings_datetime_conversion <- function() { register_binding("make_datetime", function(year = 1970L, month = 1L, day = 1L, @@ -210,10 +224,12 @@ register_bindings_duration <- function() { x <- call_binding("str_c", year, month, day, hour, min, sec, sep = "-") build_expr("strptime", x, options = list(format = "%Y-%m-%d-%H-%M-%S", unit = 0L)) }) + register_binding("make_date", function(year = 1970L, month = 1L, day = 1L) { x <- call_binding("make_datetime", year, month, day) build_expr("cast", x, options = cast_options(to_type = date32())) }) + register_binding("ISOdatetime", function(year, month, day, @@ -232,6 +248,7 @@ register_bindings_duration <- function() { call_binding("make_datetime", year, month, day, hour, min, sec, tz) }) + register_binding("ISOdate", function(year, month, day, @@ -241,6 +258,105 @@ register_bindings_duration <- function() { tz = "UTC") { call_binding("make_datetime", year, month, day, hour, min, sec, tz) }) + + register_binding("as.Date", function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC") { + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument. Both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `tz` is set (i.e. not NULL) + if (call_binding("is.POSIXct", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + + binding_as_date( + x = x, + format = format, + tryFormats = tryFormats, + origin = origin + ) + }) + + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = NULL) { + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument. Both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `tz` is set (i.e. not NULL) + if (call_binding("is.POSIXct", x) && !is.null(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + binding_as_date( + x = x, + format = format, + origin = origin + ) + }) + + register_binding("as_datetime", function(x, + origin = "1970-01-01", + tz = "UTC", + format = NULL) { + if (call_binding("is.numeric", x)) { + delta <- call_binding("difftime", origin, "1970-01-01") + delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + x <- build_expr("+", x, delta) + } + + if (call_binding("is.character", x) && !is.null(format)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr( + "strptime", + x, + options = list(format = format, unit = 0L, error_is_null = TRUE) + ) + } + output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) + build_expr("assume_timezone", output, options = list(timezone = tz)) + }) + + register_binding("decimal_date", function(date) { + y <- build_expr("year", date) + start <- call_binding("make_datetime", year = y, tz = "UTC") + sofar <- call_binding("difftime", date, start, units = "secs") + total <- call_binding( + "if_else", + build_expr("is_leap_year", date), + Expression$scalar(31622400L), # number of seconds in a leap year (366 days) + Expression$scalar(31536000L) # number of seconds in a regular year (365 days) + ) + y + sofar$cast(int64()) / total + }) + + register_binding("date_decimal", function(decimal, tz = "UTC") { + y <- build_expr("floor", decimal) + + start <- call_binding("make_datetime", year = y, tz = tz) + seconds <- call_binding( + "if_else", + build_expr("is_leap_year", start), + Expression$scalar(31622400L), # number of seconds in a leap year (366 days) + Expression$scalar(31536000L) # number of seconds in a regular year (365 days) + ) + + fraction <- decimal - y + delta <- build_expr("floor", seconds * fraction) + delta <- delta$cast(int64()) + start + delta$cast(duration("s")) + }) +} + +register_bindings_duration <- function() { register_binding("difftime", function(time1, time2, tz, @@ -316,59 +432,9 @@ register_bindings_duration <- function() { build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) }) - register_binding("decimal_date", function(date) { - y <- build_expr("year", date) - start <- call_binding("make_datetime", year = y, tz = "UTC") - sofar <- call_binding("difftime", date, start, units = "secs") - total <- call_binding( - "if_else", - build_expr("is_leap_year", date), - Expression$scalar(31622400L), # number of seconds in a leap year (366 days) - Expression$scalar(31536000L) # number of seconds in a regular year (365 days) - ) - y + sofar$cast(int64()) / total - }) - register_binding("date_decimal", function(decimal, tz = "UTC") { - y <- build_expr("floor", decimal) - - start <- call_binding("make_datetime", year = y, tz = tz) - seconds <- call_binding( - "if_else", - build_expr("is_leap_year", start), - Expression$scalar(31622400L), # number of seconds in a leap year (366 days) - Expression$scalar(31536000L) # number of seconds in a regular year (365 days) - ) - - fraction <- decimal - y - delta <- build_expr("floor", seconds * fraction) - delta <- delta$cast(int64()) - start + delta$cast(duration("s")) - }) } -register_bindings_duration_helpers <- function() { - duration_helpers_map_factory <- function(value, unit) { - force(value) - force(unit) - function(x = 1) make_duration(x * value, unit) - } - - for (name in names(.helpers_function_map)) { - register_binding( - name, - duration_helpers_map_factory( - .helpers_function_map[[name]][[1]], - .helpers_function_map[[name]][[2]] - ) - ) - } - - register_binding("dpicoseconds", function(x = 1) { - abort("Duration in picoseconds not supported in Arrow.") - }) -} - -register_bindings_difftime_constructors <- function() { +register_bindings_duration_constructor <- function(){ register_binding("make_difftime", function(num = NULL, units = "secs", ...) { @@ -397,3 +463,25 @@ register_bindings_difftime_constructors <- function() { duration$cast(duration("s")) }) } + +register_bindings_duration_helpers <- function(){ + duration_helpers_map_factory <- function(value, unit) { + force(value) + force(unit) + function(x = 1) make_duration(x * value, unit) + } + + for (name in names(.helpers_function_map)) { + register_binding( + name, + duration_helpers_map_factory( + .helpers_function_map[[name]][[1]], + .helpers_function_map[[name]][[2]] + ) + ) + } + + register_binding("dpicoseconds", function(x = 1) { + abort("Duration in picoseconds not supported in Arrow.") + }) +} diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index e3700cf35b7..653719fa2cc 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -77,71 +77,6 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { build_expr("cast", x, options = cast_options(to_type = float64())) }) - register_binding("as.Date", function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01", - tz = "UTC") { - # base::as.Date() and lubridate::as_date() differ in the way they use the - # `tz` argument. Both cast to the desired timezone, if present. The - # difference appears when the `tz` argument is not set: `as.Date()` uses the - # default value ("UTC"), while `as_date()` keeps the original attribute - # => we only cast when we want the behaviour of the base version or when - # `tz` is set (i.e. not NULL) - if (call_binding("is.POSIXct", x)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } - - binding_as_date( - x = x, - format = format, - tryFormats = tryFormats, - origin = origin - ) - }) - - register_binding("as_date", function(x, - format = NULL, - origin = "1970-01-01", - tz = NULL) { - # base::as.Date() and lubridate::as_date() differ in the way they use the - # `tz` argument. Both cast to the desired timezone, if present. The - # difference appears when the `tz` argument is not set: `as.Date()` uses the - # default value ("UTC"), while `as_date()` keeps the original attribute - # => we only cast when we want the behaviour of the base version or when - # `tz` is set (i.e. not NULL) - if (call_binding("is.POSIXct", x) && !is.null(tz)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } - binding_as_date( - x = x, - format = format, - origin = origin - ) - }) - - register_binding("as_datetime", function(x, - origin = "1970-01-01", - tz = "UTC", - format = NULL) { - if (call_binding("is.numeric", x)) { - delta <- call_binding("difftime", origin, "1970-01-01") - delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) - x <- build_expr("cast", x, options = cast_options(to_type = int64())) - x <- build_expr("+", x, delta) - } - - if (call_binding("is.character", x) && !is.null(format)) { - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr( - "strptime", - x, - options = list(format = format, unit = 0L, error_is_null = TRUE) - ) - } - output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) - build_expr("assume_timezone", output, options = list(timezone = tz)) - }) register_binding("is", function(object, class2) { if (is.string(class2)) { From 43902cdb3f344324e37032a60cd1a98bc21e1868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 13:57:28 +0100 Subject: [PATCH 8/9] all datetime bindings are now registered with `register_bindings_datetime()` --- r/R/dplyr-funcs.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index c66ed04893d..4d7cb3bc63d 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -106,9 +106,6 @@ create_binding_cache <- function() { register_bindings_aggregate() register_bindings_conditional() register_bindings_datetime() - register_bindings_difftime_constructors() - register_bindings_duration() - register_bindings_duration_helpers() register_bindings_math() register_bindings_string() register_bindings_type() From 1a407c534be6ad145e05ea3faa64513ea3c5a45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Fri, 29 Apr 2022 14:15:12 +0100 Subject: [PATCH 9/9] lint --- r/R/dplyr-funcs-datetime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 64ce1e0bb25..4d7ea050a0a 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -434,7 +434,7 @@ register_bindings_duration <- function() { }) } -register_bindings_duration_constructor <- function(){ +register_bindings_duration_constructor <- function() { register_binding("make_difftime", function(num = NULL, units = "secs", ...) { @@ -464,7 +464,7 @@ register_bindings_duration_constructor <- function(){ }) } -register_bindings_duration_helpers <- function(){ +register_bindings_duration_helpers <- function() { duration_helpers_map_factory <- function(value, unit) { force(value) force(unit)