Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
475962a
Add functions for extracting time/date components
thisisnic Jun 10, 2021
51f3aca
Implement day_of_week so it matches lubridate::wday
thisisnic Jun 10, 2021
a4bd4b8
Sort out spacing
thisisnic Jun 10, 2021
5621cd6
Add bindings for hour
thisisnic Jun 14, 2021
ab8abfd
Add in implementation of second, update offset func, separate tests
thisisnic Jun 21, 2021
4803b8b
Entirely refactor wday formulation so it is achievable via Expressions
thisisnic Jun 22, 2021
7539b76
Separate out tests and add an extra week_start one
thisisnic Jun 22, 2021
c6e20b2
Call nse_func directly when expecting an error
thisisnic Jun 22, 2021
e66ac3d
Add test for if there is a timezone aware timestamp
thisisnic Jun 22, 2021
08d44ba
Can't extract date from date32
thisisnic Jun 22, 2021
a98623f
Fix test
thisisnic Jun 22, 2021
942e92f
Update error message
thisisnic Jun 22, 2021
86af140
Rearrange and tidy comments
thisisnic Jun 22, 2021
583a996
Remove unnecessary field ref creation
thisisnic Jun 22, 2021
e8d7a5f
Add comment that certain functions defined in dplyr-functions.R
thisisnic Jun 23, 2021
bc0f851
Massively simplify arrow::wday -> lubridate::wday code
thisisnic Jun 23, 2021
7390eaa
Add link to ticket that affects supporting label arg
thisisnic Jun 23, 2021
9452071
Simplify expression further
thisisnic Jun 23, 2021
fb4bc02
Simplyify further
thisisnic Jun 23, 2021
f08cfdc
Reference correct file
thisisnic Jun 23, 2021
5719c03
Add ticket numbers to unsupported features
thisisnic Jun 23, 2021
0ce4c5a
Update r/R/dplyr-functions.R
thisisnic Jun 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions r/R/dplyr-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -442,3 +442,30 @@ nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit

Expression$create("strptime", x, options = list(format = format, unit = unit))
}

nse_funcs$second <- function(x) {
Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x))
}

# After ARROW-13054 is completed, we can refactor this for simplicity
#
# Arrow's `day_of_week` kernel counts from 0 (Monday) to 6 (Sunday), whereas
# `lubridate::wday` counts from 1 to 7, and allows users to specify which day
# of the week is first (Sunday by default). This Expression converts the returned
# day of the week back to the value that would be returned by lubridate by
# providing offset values based on the specified week_start day, and adding 1
# so the returned value is 1-indexed instead of 0-indexed.
nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption("lubridate.week.start", 7)) {

# The "day_of_week" compute function returns numeric days of week and not locale-aware strftime
# When the ticket below is resolved, we should be able to support the label argument
# https://issues.apache.org/jira/browse/ARROW-13133
if (label) {
arrow_not_supported("Label argument")
}

# overall formula to convert from arrow::wday to lubridate::wday is:
# ((wday(day) - start + 8) %% 7) + 1
((Expression$create("day_of_week", x) - Expression$scalar(week_start) + 8) %% 7) + 1

}
15 changes: 13 additions & 2 deletions r/R/expression.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,19 @@
# stringr spellings of those
"str_length" = "utf8_length",
"str_to_lower" = "utf8_lower",
"str_to_upper" = "utf8_upper"
# str_trim is defined in dplyr.R
"str_to_upper" = "utf8_upper",
# str_trim is defined in dplyr-functions.R
"year" = "year",
"isoyear" = "iso_year",
"quarter" = "quarter",
"month" = "month",
"isoweek" = "iso_week",
"day" = "day",
# wday is defined in dplyr-functions.R
"yday" = "day_of_year",
"hour" = "hour",
# second is defined in dplyr-functions.R
"minute" = "minute"
)

.binary_function_map <- list(
Expand Down
178 changes: 178 additions & 0 deletions r/tests/testthat/test-dplyr-lubridate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

library(lubridate)
library(dplyr)

test_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "")
test_df <- tibble::tibble(date = test_date)

# We can support this feature after ARROW-12980 is merged
test_that("timezone aware timestamps are not supported",{

tz_aware_date <- as.POSIXct("2017-01-01 00:00:12.3456789", tz = "BST")
tz_aware_df <- tibble::tibble(date = tz_aware_date)

expect_error(
Table$create(tz_aware_df) %>%
mutate(x = wday(date)) %>%
collect(),
"Cannot extract components from timestamp with specific timezone"
)
})

# We can support this feature when ARROW-13138 is resolved
test_that("date32 objects are not supported",{

date <- ymd("2017-01-01")
df <- tibble::tibble(date = date)

expect_error(
Table$create(df) %>%
mutate(x = year(date)) %>%
collect(),
"Function year has no kernel matching input types"
)
})


test_that("extract year from date", {
expect_dplyr_equal(
input %>%
mutate(x = year(date)) %>%
collect(),
test_df,
check.tzone = FALSE
)
})

test_that("extract isoyear from date", {
expect_dplyr_equal(
input %>%
mutate(x = isoyear(date)) %>%
collect(),
test_df
)
})

test_that("extract quarter from date", {
expect_dplyr_equal(
input %>%
mutate(x = quarter(date)) %>%
collect(),
test_df
)
})

test_that("extract month from date", {
expect_dplyr_equal(
input %>%
mutate(x = month(date)) %>%
collect(),
test_df
)
})

test_that("extract isoweek from date", {
expect_dplyr_equal(
input %>%
mutate(x = isoweek(date)) %>%
collect(),
test_df
)
})

test_that("extract day from date", {
expect_dplyr_equal(
input %>%
mutate(x = day(date)) %>%
collect(),
test_df
)
})


test_that("extract wday from date", {
expect_dplyr_equal(
input %>%
mutate(x = wday(date)) %>%
collect(),
test_df
)

expect_dplyr_equal(
input %>%
mutate(x = wday(date, week_start = 3)) %>%
collect(),
test_df
)

expect_dplyr_equal(
input %>%
mutate(x = wday(date, week_start = 1)) %>%
collect(),
test_df
)

# We should be able to support the label argument after this ticket is resolved:
# https://issues.apache.org/jira/browse/ARROW-13133
x <- Expression$field_ref("x")
expect_error(
nse_funcs$wday(x, label = TRUE),
"Label argument not supported by Arrow"
)

})

test_that("extract yday from date", {
expect_dplyr_equal(
input %>%
mutate(x = yday(date)) %>%
collect(),
test_df
)
})

test_that("extract hour from date", {
expect_dplyr_equal(
input %>%
mutate(x = hour(date)) %>%
collect(),
test_df
)
})

test_that("extract minute from date", {
expect_dplyr_equal(
input %>%
mutate(x = minute(date)) %>%
collect(),
test_df
)
})

test_that("extract second from date", {
expect_dplyr_equal(
input %>%
mutate(x = second(date)) %>%
collect(),
test_df,
# arrow supports nanosecond resolution but lubridate does not
tolerance = 1e-6
)
})