From 475962ac8e3cda9b686aec88ac2313c169148baa Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 10 Jun 2021 19:29:06 +0100 Subject: [PATCH 01/22] Add functions for extracting time/date components --- r/R/dplyr-functions.R | 34 +++++++++ r/R/expression.R | 11 ++- r/tests/testthat/test-dplyr-lubridate.R | 95 +++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 r/tests/testthat/test-dplyr-lubridate.R diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 91d1b21ad88..f761f25d1c5 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -442,3 +442,37 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit Expression$create("strptime", x, options = list(format = format, unit = unit)) } + +nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7), locale = Sys.getlocale("LC_TIME")){ + if(label){ + arrow_not_supported("Label argument") + } + offset <- get_date_offset(week_start) + Expression$create("add", Expression$create("day_of_week", x), Expression$scalar(offset)) +} + + +#' Get date offset +#' +#' Arrow's `day_of_week` kernel counts from 0 (Monday) to 6 (Sunday), whereas +#' `lubridate::wday` counts from 1 to 7, and allows users to specify which day +#' of the week is first (Sunday by default). This function converts the returned +#' day of the week back to the value that would be returned by lubridate by +#' providing offset values based on the specified week_start day, and adding 1 +#' so the returned value is 1-indexed instead of 0-indexed. +#' +#' @param week_start day on which week starts following ISO conventions - 1 means Monday, 7 means Sunday. +#' +#' @keywords internal +get_date_offset <- function(week_start){ + if(week_start < 1 || week_start > 7){ + abort(c( + "The value of `week_start` must be between 1 and 7", + x = paste("`week_start` =", week_start) + ) + ) + } + offset_vals <- c(0:-4, 2, 1) + + offset_vals[week_start] + 1 +} diff --git a/r/R/expression.R b/r/R/expression.R index ba542339ff8..94678ffd868 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -29,8 +29,17 @@ # stringr spellings of those "str_length" = "utf8_length", "str_to_lower" = "utf8_lower", - "str_to_upper" = "utf8_upper" + "str_to_upper" = "utf8_upper", # str_trim is defined in dplyr.R + "year" = "year", + "isoyear" = "iso_year", + "quarter" = "quarter", + "month" = "month", + "day" = "day", + "yday" = "day_of_year", + "isoweek" = "iso_week", + "minute" = "minute", + "second" = "second" ) .binary_function_map <- list( diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R new file mode 100644 index 00000000000..e2c4f18f4c4 --- /dev/null +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +library(lubridate) +library(dplyr) + +test_date <- ymd_hms("1987-10-09 23:00:00", tz = NULL) +test_df <- tibble::tibble(date = test_date) + +test_that("extract date components", { + expect_dplyr_equal( + input %>% + mutate(x = year(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = isoyear(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = quarter(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = month(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = wday(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = day(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = yday(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = isoweek(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = minute(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( + input %>% + mutate(x = second(date)) %>% + collect(), + test_df + ) + +}) From 51f3aca825d2ef3c9ea1d8ef999d5268bf502842 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 10 Jun 2021 21:13:59 +0100 Subject: [PATCH 02/22] Implement day_of_week so it matches lubridate::wday --- r/R/dplyr-functions.R | 7 +++---- r/tests/testthat/test-dplyr-lubridate.R | 27 ++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index f761f25d1c5..6aa20621d1e 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -443,7 +443,7 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit Expression$create("strptime", x, options = list(format = format, unit = unit)) } -nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7), locale = Sys.getlocale("LC_TIME")){ +nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)){ if(label){ arrow_not_supported("Label argument") } @@ -451,7 +451,6 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption Expression$create("add", Expression$create("day_of_week", x), Expression$scalar(offset)) } - #' Get date offset #' #' Arrow's `day_of_week` kernel counts from 0 (Monday) to 6 (Sunday), whereas @@ -461,8 +460,8 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption #' providing offset values based on the specified week_start day, and adding 1 #' so the returned value is 1-indexed instead of 0-indexed. #' -#' @param week_start day on which week starts following ISO conventions - 1 means Monday, 7 means Sunday. -#' +#' @param week_start Day on which week starts following ISO conventions - 1 means Monday, 7 means Sunday. +#' @return Offset value, integer #' @keywords internal get_date_offset <- function(week_start){ if(week_start < 1 || week_start > 7){ diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index e2c4f18f4c4..72a2b2d2c76 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -21,7 +21,8 @@ library(dplyr) test_date <- ymd_hms("1987-10-09 23:00:00", tz = NULL) test_df <- tibble::tibble(date = test_date) -test_that("extract date components", { + +test_that("extract datetime components from date", { expect_dplyr_equal( input %>% mutate(x = year(date)) %>% @@ -57,6 +58,30 @@ test_that("extract date components", { test_df ) + expect_dplyr_equal( + input %>% + mutate(x = wday(date, week_start = 3)) %>% + collect(), + test_df + ) + + expect_warning( + test_df %>% + Table$create() %>% + mutate(x = wday(date, label = TRUE)) %>% + collect(), + regexp = "Label argument not supported by Arrow; pulling data into R" + ) + + expect_warning( + test_df %>% + Table$create() %>% + mutate(x = wday(date, locale = Sys.getlocale("LC_TIME"))) %>% + collect(), + regexp = 'Expression wday(date, locale = Sys.getlocale("LC_TIME")) not supported in Arrow; pulling data into R', + fixed = TRUE + ) + expect_dplyr_equal( input %>% mutate(x = day(date)) %>% From a4bd4b8e213963b5b6309a2b86d591ca868ff3bf Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 10 Jun 2021 21:17:53 +0100 Subject: [PATCH 03/22] Sort out spacing --- r/R/dplyr-functions.R | 9 +++++---- r/tests/testthat/test-dplyr-lubridate.R | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 6aa20621d1e..468d46aa0d6 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -443,8 +443,8 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit Expression$create("strptime", x, options = list(format = format, unit = unit)) } -nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)){ - if(label){ +nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)) { + if (label) { arrow_not_supported("Label argument") } offset <- get_date_offset(week_start) @@ -463,8 +463,8 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption #' @param week_start Day on which week starts following ISO conventions - 1 means Monday, 7 means Sunday. #' @return Offset value, integer #' @keywords internal -get_date_offset <- function(week_start){ - if(week_start < 1 || week_start > 7){ +get_date_offset <- function(week_start) { + if (week_start < 1 || week_start > 7) { abort(c( "The value of `week_start` must be between 1 and 7", x = paste("`week_start` =", week_start) @@ -475,3 +475,4 @@ get_date_offset <- function(week_start){ offset_vals[week_start] + 1 } + diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 72a2b2d2c76..dd9fb1f787a 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -21,7 +21,6 @@ library(dplyr) test_date <- ymd_hms("1987-10-09 23:00:00", tz = NULL) test_df <- tibble::tibble(date = test_date) - test_that("extract datetime components from date", { expect_dplyr_equal( input %>% From 5621cd6d010a8bc5336ada1de6df8bdbcdf23dab Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 14 Jun 2021 09:02:47 +0100 Subject: [PATCH 04/22] Add bindings for hour --- r/R/dplyr-functions.R | 1 + r/R/expression.R | 1 + r/tests/testthat/test-dplyr-lubridate.R | 28 ++++++++++++++++--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 468d46aa0d6..5afa6718f8d 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -447,6 +447,7 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption if (label) { arrow_not_supported("Label argument") } + # After ARROW-13054 is completed, we can change the line below and delete the function offset <- get_date_offset(week_start) Expression$create("add", Expression$create("day_of_week", x), Expression$scalar(offset)) } diff --git a/r/R/expression.R b/r/R/expression.R index 94678ffd868..aadada80137 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -38,6 +38,7 @@ "day" = "day", "yday" = "day_of_year", "isoweek" = "iso_week", + "hour" = "hour", "minute" = "minute", "second" = "second" ) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index dd9fb1f787a..d90fadf7efd 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -18,7 +18,7 @@ library(lubridate) library(dplyr) -test_date <- ymd_hms("1987-10-09 23:00:00", tz = NULL) +test_date <- ymd_hms("2021-06-11 10:01:38.85") test_df <- tibble::tibble(date = test_date) test_that("extract datetime components from date", { @@ -26,7 +26,8 @@ test_that("extract datetime components from date", { input %>% mutate(x = year(date)) %>% collect(), - test_df + test_df, + check.tzone = FALSE ) expect_dplyr_equal( @@ -51,6 +52,19 @@ test_that("extract datetime components from date", { ) expect_dplyr_equal( + input %>% + mutate(x = isoweek(date)) %>% + collect(), + test_df + ) + expect_dplyr_equal( + input %>% + mutate(x = day(date)) %>% + collect(), + test_df + ) + + expect_dplyr_equal( input %>% mutate(x = wday(date)) %>% collect(), @@ -80,13 +94,7 @@ test_that("extract datetime components from date", { regexp = 'Expression wday(date, locale = Sys.getlocale("LC_TIME")) not supported in Arrow; pulling data into R', fixed = TRUE ) - - expect_dplyr_equal( - input %>% - mutate(x = day(date)) %>% - collect(), - test_df - ) + expect_dplyr_equal( input %>% @@ -97,7 +105,7 @@ test_that("extract datetime components from date", { expect_dplyr_equal( input %>% - mutate(x = isoweek(date)) %>% + mutate(x = hour(date)) %>% collect(), test_df ) From ab8abfd9cef3aaa5e3966d8e33ed979371aae8d7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 21 Jun 2021 09:31:33 +0100 Subject: [PATCH 05/22] Add in implementation of second, update offset func, separate tests --- r/R/dplyr-functions.R | 16 ++++++++++++++-- r/R/expression.R | 5 ++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 5afa6718f8d..8680e618474 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -443,11 +443,15 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit Expression$create("strptime", x, options = list(format = format, unit = unit)) } +nse_funcs$second <- function(x) { + Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x)) +} + nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)) { if (label) { arrow_not_supported("Label argument") } - # After ARROW-13054 is completed, we can change the line below and delete the function + # After ARROW-13054 is completed, we can change the 2 lines below and delete the function get_date_offset offset <- get_date_offset(week_start) Expression$create("add", Expression$create("day_of_week", x), Expression$scalar(offset)) } @@ -474,6 +478,14 @@ get_date_offset <- function(week_start) { } offset_vals <- c(0:-4, 2, 1) - offset_vals[week_start] + 1 + # offset by 1 due to R being 1-indexed + off_val <- offset_vals[week_start] + 1 + + # so that day numbers go from 1 to 7 then repeat + if(off_val > 7){ + off_val <- off_val - floor(off_val/7) * 7 + } + + off_val } diff --git a/r/R/expression.R b/r/R/expression.R index aadada80137..4a13a463e92 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -35,12 +35,11 @@ "isoyear" = "iso_year", "quarter" = "quarter", "month" = "month", + "isoweek" = "iso_week", "day" = "day", "yday" = "day_of_year", - "isoweek" = "iso_week", "hour" = "hour", - "minute" = "minute", - "second" = "second" + "minute" = "minute" ) .binary_function_map <- list( From 4803b8be2abd79dde9051148cf63599601ae9982 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 03:33:41 +0100 Subject: [PATCH 06/22] Entirely refactor wday formulation so it is achievable via Expressions --- r/R/dplyr-functions.R | 98 +++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 8680e618474..2d931c28bbd 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -447,45 +447,73 @@ nse_funcs$second <- function(x) { Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x)) } + nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)) { if (label) { arrow_not_supported("Label argument") } - # After ARROW-13054 is completed, we can change the 2 lines below and delete the function get_date_offset - offset <- get_date_offset(week_start) - Expression$create("add", Expression$create("day_of_week", x), Expression$scalar(offset)) -} - -#' Get date offset -#' -#' Arrow's `day_of_week` kernel counts from 0 (Monday) to 6 (Sunday), whereas -#' `lubridate::wday` counts from 1 to 7, and allows users to specify which day -#' of the week is first (Sunday by default). This function converts the returned -#' day of the week back to the value that would be returned by lubridate by -#' providing offset values based on the specified week_start day, and adding 1 -#' so the returned value is 1-indexed instead of 0-indexed. -#' -#' @param week_start Day on which week starts following ISO conventions - 1 means Monday, 7 means Sunday. -#' @return Offset value, integer -#' @keywords internal -get_date_offset <- function(week_start) { - if (week_start < 1 || week_start > 7) { - abort(c( - "The value of `week_start` must be between 1 and 7", - x = paste("`week_start` =", week_start) - ) - ) - } - offset_vals <- c(0:-4, 2, 1) - # offset by 1 due to R being 1-indexed - off_val <- offset_vals[week_start] + 1 + # After ARROW-13054 is completed, we can refactor this for simplicity + # + # Arrow's `day_of_week` kernel counts from 0 (Monday) to 6 (Sunday), whereas + # `lubridate::wday` counts from 1 to 7, and allows users to specify which day + # of the week is first (Sunday by default). This Expression converts the returned + # day of the week back to the value that would be returned by lubridate by + # providing offset values based on the specified week_start day, and adding 1 + # so the returned value is 1-indexed instead of 0-indexed. + # + # This looks complex but if you look at the lubridate implementation of wday, + # you can see that this is the same maths, but with 1 added at the end to + # account for the fact that R is 1-based and the Arrow implementation is 0-based + # + # overall formula to convert from arrow::wday to lubridate::wday is: + # (3 +wday(day) + (5 - start)) %% 7) + 1 + # + # To calculate e1%%e2, we can rearrange it as: + # {e1 - e2 * ( e1 %/% e2 )} + # + # use for testing: x <- Expression$field_ref("day") + # week_start = 1 - # so that day numbers go from 1 to 7 then repeat - if(off_val > 7){ - off_val <- off_val - floor(off_val/7) * 7 - } + # e1 = 3 +wday(day) + (5 - start) + e1 = Expression$create( + "add_checked", + # 3 + wday(date) + Expression$create("add_checked", + Expression$scalar(3), + Expression$create("day_of_week", x) + ), + # (5 - start) + Expression$create("subtract_checked", + Expression$scalar(5), + Expression$scalar(week_start) + ) + ) - off_val -} - + e2 = Expression$scalar(7) + + # (e1 - e2 * ( e1 %/% e2 )) + 1 + Expression$create( + "add_checked", + Expression$scalar(1), + # e1 - e2 * ( e1 %/% e2 ) + Expression$create( + "subtract_checked", + # e1 + e1, + # e2 * ( e1 %/% e2 ) + Expression$create( + "multiply_checked", + e2, + # e1 %/% e2; because as.integer(x/y) == x%/%y + Expression$create( + "cast", + Expression$create("divide_checked", e1, e2), + options = cast_options(to_type = int32(), allow_float_truncate = TRUE, + allow_decimal_truncate = TRUE + ) + ) + ) + ) + ) +} \ No newline at end of file From 7539b761f0964f51dc66168d9b151c1231e246bd Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 03:34:20 +0100 Subject: [PATCH 07/22] Separate out tests and add an extra week_start one --- r/tests/testthat/test-dplyr-lubridate.R | 47 ++++++++++++++++++++----- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index d90fadf7efd..8388940dec5 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -18,10 +18,10 @@ library(lubridate) library(dplyr) -test_date <- ymd_hms("2021-06-11 10:01:38.85") +test_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "") test_df <- tibble::tibble(date = test_date) -test_that("extract datetime components from date", { +test_that("extract year from date", { expect_dplyr_equal( input %>% mutate(x = year(date)) %>% @@ -29,41 +29,55 @@ test_that("extract datetime components from date", { test_df, check.tzone = FALSE ) - +}) + +test_that("extract isoyear from date", { expect_dplyr_equal( input %>% mutate(x = isoyear(date)) %>% collect(), test_df ) +}) +test_that("extract quarter from date", { expect_dplyr_equal( input %>% mutate(x = quarter(date)) %>% collect(), test_df ) - +}) + +test_that("extract month from date", { expect_dplyr_equal( input %>% mutate(x = month(date)) %>% collect(), test_df ) - +}) + +test_that("extract isoweek from date", { expect_dplyr_equal( input %>% mutate(x = isoweek(date)) %>% collect(), test_df ) +}) + +test_that("extract day from date", { expect_dplyr_equal( input %>% mutate(x = day(date)) %>% collect(), test_df ) +}) + +test_that("extract wday from date", { expect_dplyr_equal( input %>% mutate(x = wday(date)) %>% @@ -78,11 +92,19 @@ test_that("extract datetime components from date", { test_df ) + expect_dplyr_equal( + input %>% + mutate(x = wday(date, week_start = 1)) %>% + collect(), + test_df + ) + expect_warning( test_df %>% Table$create() %>% mutate(x = wday(date, label = TRUE)) %>% collect(), + # Update this test after ARROW-13133 is resolved regexp = "Label argument not supported by Arrow; pulling data into R" ) @@ -94,34 +116,43 @@ test_that("extract datetime components from date", { regexp = 'Expression wday(date, locale = Sys.getlocale("LC_TIME")) not supported in Arrow; pulling data into R', fixed = TRUE ) - +}) +test_that("extract yday from date", { expect_dplyr_equal( input %>% mutate(x = yday(date)) %>% collect(), test_df ) +}) +test_that("extract hour from date", { expect_dplyr_equal( input %>% mutate(x = hour(date)) %>% collect(), test_df ) +}) +test_that("extract minute from date", { expect_dplyr_equal( input %>% mutate(x = minute(date)) %>% collect(), test_df ) +}) +test_that("extract second from date", { expect_dplyr_equal( input %>% mutate(x = second(date)) %>% collect(), - test_df + test_df, + # arrow supports nanosecond resolution but lubridate does not + tolerance = 1e-6 ) - }) + From c6e20b29ad0c160bc453bc38079cb53b13b30428 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 03:47:54 +0100 Subject: [PATCH 08/22] Call nse_func directly when expecting an error --- r/tests/testthat/test-dplyr-lubridate.R | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 8388940dec5..ea6f000d704 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -99,23 +99,12 @@ test_that("extract wday from date", { test_df ) - expect_warning( - test_df %>% - Table$create() %>% - mutate(x = wday(date, label = TRUE)) %>% - collect(), - # Update this test after ARROW-13133 is resolved - regexp = "Label argument not supported by Arrow; pulling data into R" + x <- Expression$field_ref("x") + expect_error( + nse_funcs$wday(x, label = TRUE), + "Label argument not supported by Arrow" ) - expect_warning( - test_df %>% - Table$create() %>% - mutate(x = wday(date, locale = Sys.getlocale("LC_TIME"))) %>% - collect(), - regexp = 'Expression wday(date, locale = Sys.getlocale("LC_TIME")) not supported in Arrow; pulling data into R', - fixed = TRUE - ) }) test_that("extract yday from date", { From e66ac3d2f03c6c2f89afb5a86610a2f7ec999680 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 03:54:02 +0100 Subject: [PATCH 09/22] Add test for if there is a timezone aware timestamp --- r/tests/testthat/test-dplyr-lubridate.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index ea6f000d704..7d0b8e72474 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -21,6 +21,19 @@ library(dplyr) test_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "") test_df <- tibble::tibble(date = test_date) +tz_aware_date <- ymd_hms("2017-01-01 00:00:12.3456789") +tz_aware_df <- tibble::tibble(date = tz_aware_date) + +test_that("timezone aware timestamps are not supported",{ + x <- Expression$field_ref("x") + expect_error( + Table$create(tz_aware_df) %>% + mutate(x = wday(date)) %>% + collect(), + "Timezone aware timestamps not supported" + ) +}) + test_that("extract year from date", { expect_dplyr_equal( input %>% From 08d44baabb423cf2131063bb1a071c7a8c00a066 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 04:12:10 +0100 Subject: [PATCH 10/22] Can't extract date from date32 --- r/tests/testthat/test-dplyr-lubridate.R | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 7d0b8e72474..8e035d3d977 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -21,10 +21,11 @@ library(dplyr) test_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "") test_df <- tibble::tibble(date = test_date) -tz_aware_date <- ymd_hms("2017-01-01 00:00:12.3456789") -tz_aware_df <- tibble::tibble(date = tz_aware_date) - test_that("timezone aware timestamps are not supported",{ + + tz_aware_date <- ymd_hms("2017-01-01") + tz_aware_df <- tibble::tibble(date = tz_aware_date) + x <- Expression$field_ref("x") expect_error( Table$create(tz_aware_df) %>% @@ -34,6 +35,20 @@ test_that("timezone aware timestamps are not supported",{ ) }) +test_that("date32 objects are not supported",{ + + date <- ymd("2017-01-01") + df <- tibble::tibble(date = date) + + expect_error( + Table$create(df) %>% + mutate(x = year(date)) %>% + collect(), + "Function year has no kernel matching input types" + ) +}) + + test_that("extract year from date", { expect_dplyr_equal( input %>% From a98623ff728cb977378f4be6ce3c12df77c314a2 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 04:26:42 +0100 Subject: [PATCH 11/22] Fix test --- r/tests/testthat/test-dplyr-lubridate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 8e035d3d977..2743f0cad49 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -23,7 +23,7 @@ test_df <- tibble::tibble(date = test_date) test_that("timezone aware timestamps are not supported",{ - tz_aware_date <- ymd_hms("2017-01-01") + tz_aware_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "BST") tz_aware_df <- tibble::tibble(date = tz_aware_date) x <- Expression$field_ref("x") From 942e92f597514a8c5bb11305e9e2949a2e618964 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 08:53:54 +0100 Subject: [PATCH 12/22] Update error message --- r/tests/testthat/test-dplyr-lubridate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 2743f0cad49..ea9ce32374c 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -31,7 +31,7 @@ test_that("timezone aware timestamps are not supported",{ Table$create(tz_aware_df) %>% mutate(x = wday(date)) %>% collect(), - "Timezone aware timestamps not supported" + "Cannot extract components from timestamp with specific timezone" ) }) From 86af140ae459acacc0b097968e5d1757e3e0bf68 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 12:23:30 +0100 Subject: [PATCH 13/22] Rearrange and tidy comments --- r/R/dplyr-functions.R | 46 ++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 2d931c28bbd..90edacc63af 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -447,34 +447,31 @@ nse_funcs$second <- function(x) { Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x)) } - +# After ARROW-13054 is completed, we can refactor this for simplicity +# +# Arrow's `day_of_week` kernel counts from 0 (Monday) to 6 (Sunday), whereas +# `lubridate::wday` counts from 1 to 7, and allows users to specify which day +# of the week is first (Sunday by default). This Expression converts the returned +# day of the week back to the value that would be returned by lubridate by +# providing offset values based on the specified week_start day, and adding 1 +# so the returned value is 1-indexed instead of 0-indexed. +# +# This looks complex but if you look at the lubridate implementation of wday, +# you can see that this is the same maths, but with 1 added at the end to +# account for the fact that R is 1-based and the Arrow implementation is 0-based +# +# overall formula to convert from arrow::wday to lubridate::wday is: +# (3 +wday(day) + (5 - start)) %% 7) + 1 +# +# To calculate e1 %% e2, we can rearrange it as: +# {e1 - e2 * ( e1 %/% e2 )} +# +# And e1 %/% e2 can itself be done via casting e1/e2 to an integer nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)) { if (label) { arrow_not_supported("Label argument") } - # After ARROW-13054 is completed, we can refactor this for simplicity - # - # Arrow's `day_of_week` kernel counts from 0 (Monday) to 6 (Sunday), whereas - # `lubridate::wday` counts from 1 to 7, and allows users to specify which day - # of the week is first (Sunday by default). This Expression converts the returned - # day of the week back to the value that would be returned by lubridate by - # providing offset values based on the specified week_start day, and adding 1 - # so the returned value is 1-indexed instead of 0-indexed. - # - # This looks complex but if you look at the lubridate implementation of wday, - # you can see that this is the same maths, but with 1 added at the end to - # account for the fact that R is 1-based and the Arrow implementation is 0-based - # - # overall formula to convert from arrow::wday to lubridate::wday is: - # (3 +wday(day) + (5 - start)) %% 7) + 1 - # - # To calculate e1%%e2, we can rearrange it as: - # {e1 - e2 * ( e1 %/% e2 )} - # - # use for testing: x <- Expression$field_ref("day") - # week_start = 1 - # e1 = 3 +wday(day) + (5 - start) e1 = Expression$create( "add_checked", @@ -502,8 +499,7 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption # e1 e1, # e2 * ( e1 %/% e2 ) - Expression$create( - "multiply_checked", + Expression$create("multiply_checked", e2, # e1 %/% e2; because as.integer(x/y) == x%/%y Expression$create( From 583a9960e36fae42d7596ca165d3b1368d5f1cde Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 22 Jun 2021 17:10:17 +0100 Subject: [PATCH 14/22] Remove unnecessary field ref creation --- r/tests/testthat/test-dplyr-lubridate.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index ea9ce32374c..3aa2ef93c88 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -26,7 +26,6 @@ test_that("timezone aware timestamps are not supported",{ tz_aware_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "BST") tz_aware_df <- tibble::tibble(date = tz_aware_date) - x <- Expression$field_ref("x") expect_error( Table$create(tz_aware_df) %>% mutate(x = wday(date)) %>% From e8d7a5f2881ddea357567a2625b7e038f2c80326 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 23 Jun 2021 14:17:00 +0100 Subject: [PATCH 15/22] Add comment that certain functions defined in dplyr-functions.R --- r/R/expression.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/R/expression.R b/r/R/expression.R index 4a13a463e92..3f256693685 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -37,8 +37,10 @@ "month" = "month", "isoweek" = "iso_week", "day" = "day", + # wday is defined in dplyr-functions.R "yday" = "day_of_year", "hour" = "hour", + # second is defined in dplyr-functions.R "minute" = "minute" ) From bc0f851e29fbb80b072e36293547dbb2f8d18f6a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 23 Jun 2021 14:17:20 +0100 Subject: [PATCH 16/22] Massively simplify arrow::wday -> lubridate::wday code --- r/R/dplyr-functions.R | 58 ++++++------------------------------------- 1 file changed, 7 insertions(+), 51 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 90edacc63af..2b09b37b282 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -455,61 +455,17 @@ nse_funcs$second <- function(x) { # day of the week back to the value that would be returned by lubridate by # providing offset values based on the specified week_start day, and adding 1 # so the returned value is 1-indexed instead of 0-indexed. -# -# This looks complex but if you look at the lubridate implementation of wday, -# you can see that this is the same maths, but with 1 added at the end to -# account for the fact that R is 1-based and the Arrow implementation is 0-based -# -# overall formula to convert from arrow::wday to lubridate::wday is: -# (3 +wday(day) + (5 - start)) %% 7) + 1 -# -# To calculate e1 %% e2, we can rearrange it as: -# {e1 - e2 * ( e1 %/% e2 )} -# -# And e1 %/% e2 can itself be done via casting e1/e2 to an integer nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)) { if (label) { arrow_not_supported("Label argument") } - # e1 = 3 +wday(day) + (5 - start) - e1 = Expression$create( - "add_checked", - # 3 + wday(date) - Expression$create("add_checked", - Expression$scalar(3), - Expression$create("day_of_week", x) - ), - # (5 - start) - Expression$create("subtract_checked", - Expression$scalar(5), - Expression$scalar(week_start) - ) - ) - - e2 = Expression$scalar(7) + # overall formula to convert from arrow::wday to lubridate::wday is: + # (3 +wday(day) + (5 - start)) %% 7) + 1 + ((Expression$scalar(3) + + Expression$create("day_of_week", x) + + Expression$scalar(5) - + Expression$scalar(week_start) + )%% 7) + 1 - # (e1 - e2 * ( e1 %/% e2 )) + 1 - Expression$create( - "add_checked", - Expression$scalar(1), - # e1 - e2 * ( e1 %/% e2 ) - Expression$create( - "subtract_checked", - # e1 - e1, - # e2 * ( e1 %/% e2 ) - Expression$create("multiply_checked", - e2, - # e1 %/% e2; because as.integer(x/y) == x%/%y - Expression$create( - "cast", - Expression$create("divide_checked", e1, e2), - options = cast_options(to_type = int32(), allow_float_truncate = TRUE, - allow_decimal_truncate = TRUE - ) - ) - ) - ) - ) } \ No newline at end of file From 7390eaaba8dbf4591b29ccee4dcfe779c5223444 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 23 Jun 2021 14:30:09 +0100 Subject: [PATCH 17/22] Add link to ticket that affects supporting label arg --- r/R/dplyr-functions.R | 4 ++++ r/tests/testthat/test-dplyr-lubridate.R | 2 ++ 2 files changed, 6 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 2b09b37b282..d9bbf82ccaa 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -456,6 +456,10 @@ nse_funcs$second <- function(x) { # providing offset values based on the specified week_start day, and adding 1 # so the returned value is 1-indexed instead of 0-indexed. nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)) { + + # The "day_of_week" compute function returns numeric days of week and not locale-aware strftime + # When the ticket below is resolved, we should be able to support the label argument + # https://issues.apache.org/jira/browse/ARROW-13133 if (label) { arrow_not_supported("Label argument") } diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 3aa2ef93c88..36cb9ea80f9 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -126,6 +126,8 @@ test_that("extract wday from date", { test_df ) + # We should be able to support the label argument after this ticket is resolved: + # https://issues.apache.org/jira/browse/ARROW-13133 x <- Expression$field_ref("x") expect_error( nse_funcs$wday(x, label = TRUE), From 9452071b98303fe902d13ed95d39ccd15ad5642f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 23 Jun 2021 14:33:26 +0100 Subject: [PATCH 18/22] Simplify expression further --- r/R/dplyr-functions.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index d9bbf82ccaa..56ebd197747 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -465,10 +465,9 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption } # overall formula to convert from arrow::wday to lubridate::wday is: - # (3 +wday(day) + (5 - start)) %% 7) + 1 - ((Expression$scalar(3) + - Expression$create("day_of_week", x) + - Expression$scalar(5) - + # (8 + wday(day) - start)) %% 7) + 1 + ((Expression$scalar(8) + + Expression$create("day_of_week", x) - Expression$scalar(week_start) )%% 7) + 1 From fb4bc026ac83328d0086f7345eacf1d01c156991 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 23 Jun 2021 14:35:26 +0100 Subject: [PATCH 19/22] Simplyify further --- r/R/dplyr-functions.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 56ebd197747..ebea7f002db 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -465,10 +465,7 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption } # overall formula to convert from arrow::wday to lubridate::wday is: - # (8 + wday(day) - start)) %% 7) + 1 - ((Expression$scalar(8) + - Expression$create("day_of_week", x) - - Expression$scalar(week_start) - )%% 7) + 1 + # ((wday(day) - start + 8) %% 7) + 1 + ((Expression$create("day_of_week", x) - Expression$scalar(week_start) + 8) %% 7) + 1 } \ No newline at end of file From f08cfdc289441dd63c2754d3e1bb354439df2384 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 23 Jun 2021 14:37:00 +0100 Subject: [PATCH 20/22] Reference correct file --- r/R/expression.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/expression.R b/r/R/expression.R index 3f256693685..b06490d678c 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -30,7 +30,7 @@ "str_length" = "utf8_length", "str_to_lower" = "utf8_lower", "str_to_upper" = "utf8_upper", - # str_trim is defined in dplyr.R + # str_trim is defined in dplyr-functions.R "year" = "year", "isoyear" = "iso_year", "quarter" = "quarter", From 5719c030958615deb0a6d4387ca028dabbb17f29 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 23 Jun 2021 14:40:50 +0100 Subject: [PATCH 21/22] Add ticket numbers to unsupported features --- r/tests/testthat/test-dplyr-lubridate.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/tests/testthat/test-dplyr-lubridate.R b/r/tests/testthat/test-dplyr-lubridate.R index 36cb9ea80f9..2ebb6f3b93e 100644 --- a/r/tests/testthat/test-dplyr-lubridate.R +++ b/r/tests/testthat/test-dplyr-lubridate.R @@ -21,6 +21,7 @@ library(dplyr) test_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "") test_df <- tibble::tibble(date = test_date) +# We can support this feature after ARROW-12980 is merged test_that("timezone aware timestamps are not supported",{ tz_aware_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "BST") @@ -34,6 +35,7 @@ test_that("timezone aware timestamps are not supported",{ ) }) +# We can support this feature when ARROW-13138 is resolved test_that("date32 objects are not supported",{ date <- ymd("2017-01-01") From 0ce4c5af0d574c56492966a2d81f8fc0f7f991d7 Mon Sep 17 00:00:00 2001 From: Nic Date: Wed, 23 Jun 2021 17:20:24 +0100 Subject: [PATCH 22/22] Update r/R/dplyr-functions.R Co-authored-by: Neal Richardson --- r/R/dplyr-functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index ebea7f002db..a068cf09c61 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -466,6 +466,6 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption # overall formula to convert from arrow::wday to lubridate::wday is: # ((wday(day) - start + 8) %% 7) + 1 - ((Expression$create("day_of_week", x) - Expression$scalar(week_start) + 8) %% 7) + 1 + ((Expression$create("day_of_week", x) - Expression$scalar(week_start) + 8) %% 7) + 1 -} \ No newline at end of file +}