From 7afd09ffabaf632db1790e2121d2e4bde17a1007 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 6 Jul 2022 10:46:47 -0700 Subject: [PATCH 01/26] tests: Move S3 tests into shared filesystems tests --- r/R/filesystem.R | 9 +- r/tests/testthat/helper-filesystems.R | 177 ++++++++++++++++++++++++++ r/tests/testthat/test-gcs.R | 32 +++++ r/tests/testthat/test-s3-minio.R | 170 +------------------------ 4 files changed, 222 insertions(+), 166 deletions(-) create mode 100644 r/tests/testthat/helper-filesystems.R diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 2f0b1cfd585..4ad6aa83e3d 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -497,7 +497,9 @@ gs_bucket <- function(bucket, ...) { GcsFileSystem <- R6Class("GcsFileSystem", inherit = FileSystem ) -GcsFileSystem$create <- function(anonymous = FALSE, ...) { +GcsFileSystem$create <- function(anonymous = FALSE, retry_limit_seconds = 15, ...) { + # The default retry limit in C++ is 15 minutes, but that is experienced as + # hanging in an interactive context, so default is set here to 15 seconds. options <- list(...) # Validate options @@ -525,8 +527,7 @@ GcsFileSystem$create <- function(anonymous = FALSE, ...) { valid_opts <- c( "access_token", "expiration", "json_credentials", "endpoint_override", - "scheme", "default_bucket_location", "retry_limit_seconds", - "default_metadata" + "scheme", "default_bucket_location", "default_metadata" ) invalid_opts <- setdiff(names(options), valid_opts) @@ -538,6 +539,8 @@ GcsFileSystem$create <- function(anonymous = FALSE, ...) { ) } + options$retry_limit_seconds <- retry_limit_seconds + fs___GcsFileSystem__Make(anonymous, options) } diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R new file mode 100644 index 00000000000..f7a2b88f3ae --- /dev/null +++ b/r/tests/testthat/helper-filesystems.R @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_filesystem <- function(name, fs, path_formatter, uri_formatter) { + test_that(sprintf("read/write Feather on %s", name), { + write_feather(example_data, uri_formatter("test.feather")) + expect_identical(read_feather(uri_formatter("test.feather")), example_data) + }) + + test_that("read/write Feather by filesystem, not URI", { + write_feather(example_data, fs$path(path_formatter("test2.feather"))) + expect_identical( + read_feather(fs$path(path_formatter("test2.feather"))), + example_data + ) + }) + + library(dplyr) + + test_that("read/write compressed csv by filesystem", { + skip_if_not_available("gzip") + dat <- tibble(x = seq(1, 10, by = 0.2)) + write_csv_arrow(dat, fs$path(path_formatter("test.csv.gz"))) + expect_identical( + read_csv_arrow(fs$path(path_formatter("test.csv.gz"))), + dat + ) + }) + + test_that("read/write csv by filesystem", { + skip_if_not_available("gzip") + dat <- tibble(x = seq(1, 10, by = 0.2)) + write_csv_arrow(dat, fs$path(path_formatter("test.csv"))) + expect_identical( + read_csv_arrow(fs$path(path_formatter("test.csv"))), + dat + ) + }) + + test_that("read/write stream", { + write_ipc_stream(example_data, fs$path(path_formatter("test3.ipc"))) + expect_identical( + read_ipc_stream(fs$path(path_formatter("test3.ipc"))), + example_data + ) + }) + + test_that(sprintf("read/write Parquet on %s", name), { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(path_formatter("test.parquet"))) + expect_identical(read_parquet(uri_formatter("test.parquet")), example_data) + }) + + if (arrow_with_dataset()) { + make_temp_dir <- function() { + path <- tempfile() + dir.create(path) + normalizePath(path, winslash = "/") + } + + test_that(sprintf("open_dataset with an %s file (not directory) URI", name), { + skip_if_not_available("parquet") + expect_identical( + open_dataset(uri_formatter("test.parquet")) %>% collect() %>% arrange(int), + example_data %>% arrange(int) + ) + }) + + test_that(sprintf("open_dataset with vector of %s file URIs", name), { + expect_identical( + open_dataset( + c(uri_formatter("test.feather"), uri_formatter("test2.feather")), + format = "feather" + ) %>% + arrange(int) %>% + collect(), + rbind(example_data, example_data) %>% arrange(int) + ) + }) + + test_that("open_dataset errors on URIs for different file systems", { + td <- make_temp_dir() + expect_error( + open_dataset( + c( + uri_formatter("test.feather"), + paste0("file://", file.path(td, "fake.feather")) + ), + format = "feather" + ), + "Vectors of URIs for different file systems are not supported" + ) + }) + + # Dataset test setup, cf. test-dataset.R + first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") + df1 <- tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[1:10], + fct = factor(LETTERS[1:10]), + ts = first_date + lubridate::days(1:10) + ) + + second_date <- lubridate::ymd_hms("2017-03-09 07:01:02") + df2 <- tibble( + int = 101:110, + dbl = as.numeric(51:60), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[10:1], + fct = factor(LETTERS[10:1]), + ts = second_date + lubridate::days(10:1) + ) + + # This is also to set up the dataset tests + test_that("write_parquet with filesystem arg", { + skip_if_not_available("parquet") + fs$CreateDir(path_formatter("hive_dir", "group=1", "other=xxx")) + fs$CreateDir(path_formatter("hive_dir", "group=2", "other=yyy")) + expect_length(fs$ls(path_formatter("hive_dir")), 2) + write_parquet(df1, fs$path(path_formatter("hive_dir", "group=1", "other=xxx", "file1.parquet"))) + write_parquet(df2, fs$path(path_formatter("hive_dir", "group=2", "other=yyy", "file2.parquet"))) + expect_identical( + read_parquet(fs$path(path_formatter("hive_dir", "group=1", "other=xxx", "file1.parquet"))), + df1 + ) + }) + + test_that("open_dataset with fs", { + ds <- open_dataset(fs$path(path_formatter("hive_dir"))) + expect_identical( + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) + ) + }) + + test_that("write_dataset with fs", { + ds <- open_dataset(fs$path(path_formatter("hive_dir"))) + write_dataset(ds, fs$path(path_formatter("new_dataset_dir"))) + expect_length(fs$ls(path_formatter("new_dataset_dir")), 1) + }) + + test_that("Let's test copy_files too", { + td <- make_temp_dir() + copy_files(uri_formatter("hive_dir"), td) + expect_length(dir(td), 2) + ds <- open_dataset(td) + expect_identical( + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) + ) + + # Let's copy the other way and use a SubTreeFileSystem rather than URI + copy_files(td, fs$path(path_formatter("hive_dir2"))) + ds2 <- open_dataset(fs$path(path_formatter("hive_dir2"))) + expect_identical( + ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) + ) + }) + } # if(arrow_with_dataset()) +} diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index a823442f30b..44fba1ae144 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -17,6 +17,8 @@ skip_if_not_available("gcs") +source_file("helper-filesystems.R") + test_that("FileSystem$from_uri with gs://", { fs_and_path <- FileSystem$from_uri("gs://my/test/bucket/") expect_r6_class(fs_and_path$fs, "GcsFileSystem") @@ -58,3 +60,33 @@ test_that("GcsFileSystem$create() input validation", { 'Invalid options for GcsFileSystem: "role_arn"' ) }) + +if (process_is_running("testbench")) { + testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001") + + fs <- GcsFileSystem$create( + endpoint_override = sprintf("localhost:%s", testbench_port), + retry_limit_seconds = 1, + scheme = "http", + anonymous = TRUE # Will fail to resolve host name if anonymous isn't TRUE + ) + + now <- as.character(as.numeric(Sys.time())) + fs$CreateDir(now) + # Clean up when we're all done + on.exit(fs$DeleteDir(now)) + + gcs_path <- function(...) { + paste(now, ..., sep = "/") + } + gcs_uri <- function(...) { + template <- "gs://anonymous@%s?scheme=http&endpoint_override=localhost%s%s&retry_limit_seconds=1" + sprintf(template, gcs_path(...), "%3A", testbench_port) + } + + test_filesystem("gcs", fs, gcs_path, gcs_uri) +} else { + test_that("GCSFileSystem tests with testbench", { + skip("testbench is not running") + }) +} diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index ad11d04d5e9..d07559f0ad8 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +source_file("helper-filesystems.R") if (arrow_with_s3() && process_is_running("minio server")) { # Get minio config, with expected defaults @@ -51,173 +52,16 @@ if (arrow_with_s3() && process_is_running("minio server")) { # Clean up when we're all done on.exit(fs$DeleteDir(now)) - test_that("read/write Feather on minio", { - write_feather(example_data, minio_uri("test.feather")) - expect_identical(read_feather(minio_uri("test.feather")), example_data) - }) - - test_that("read/write Feather by filesystem, not URI", { - write_feather(example_data, fs$path(minio_path("test2.feather"))) - expect_identical( - read_feather(fs$path(minio_path("test2.feather"))), - example_data - ) - }) - - test_that("read/write compressed csv by filesystem", { - skip_if_not_available("gzip") - dat <- tibble(x = seq(1, 10, by = 0.2)) - write_csv_arrow(dat, fs$path(minio_path("test.csv.gz"))) - expect_identical( - read_csv_arrow(fs$path(minio_path("test.csv.gz"))), - dat - ) - }) - - test_that("read/write csv by filesystem", { - skip_if_not_available("gzip") - dat <- tibble(x = seq(1, 10, by = 0.2)) - write_csv_arrow(dat, fs$path(minio_path("test.csv"))) - expect_identical( - read_csv_arrow(fs$path(minio_path("test.csv"))), - dat - ) - }) + test_filesystem("s3", fs, minio_path, minio_uri) - test_that("read/write stream", { - write_ipc_stream(example_data, fs$path(minio_path("test3.ipc"))) - expect_identical( - read_ipc_stream(fs$path(minio_path("test3.ipc"))), - example_data - ) - }) + test_that("CreateDir fails on bucket if allow_bucket_creation=False", { + now_tmp <- paste0(now, "-test-fail-delete") + fs$CreateDir(now_tmp) - test_that("read/write Parquet on minio", { - skip_if_not_available("parquet") - write_parquet(example_data, fs$path(minio_uri("test.parquet"))) - expect_identical(read_parquet(minio_uri("test.parquet")), example_data) + expect_error(limited_fs$CreateDir("should-fail")) + expect_error(limited_fs$DeleteDir(now_tmp)) }) - if (arrow_with_dataset()) { - library(dplyr) - - make_temp_dir <- function() { - path <- tempfile() - dir.create(path) - normalizePath(path, winslash = "/") - } - - test_that("open_dataset with an S3 file (not directory) URI", { - skip_if_not_available("parquet") - expect_identical( - open_dataset(minio_uri("test.parquet")) %>% collect() %>% arrange(int), - example_data %>% arrange(int) - ) - }) - - test_that("open_dataset with vector of S3 file URIs", { - expect_identical( - open_dataset( - c(minio_uri("test.feather"), minio_uri("test2.feather")), - format = "feather" - ) %>% - arrange(int) %>% - collect(), - rbind(example_data, example_data) %>% arrange(int) - ) - }) - - test_that("open_dataset errors on URIs for different file systems", { - td <- make_temp_dir() - expect_error( - open_dataset( - c( - minio_uri("test.feather"), - paste0("file://", file.path(td, "fake.feather")) - ), - format = "feather" - ), - "Vectors of URIs for different file systems are not supported" - ) - }) - - # Dataset test setup, cf. test-dataset.R - first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") - df1 <- tibble( - int = 1:10, - dbl = as.numeric(1:10), - lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), - chr = letters[1:10], - fct = factor(LETTERS[1:10]), - ts = first_date + lubridate::days(1:10) - ) - - second_date <- lubridate::ymd_hms("2017-03-09 07:01:02") - df2 <- tibble( - int = 101:110, - dbl = as.numeric(51:60), - lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), - chr = letters[10:1], - fct = factor(LETTERS[10:1]), - ts = second_date + lubridate::days(10:1) - ) - - # This is also to set up the dataset tests - test_that("write_parquet with filesystem arg", { - skip_if_not_available("parquet") - fs$CreateDir(minio_path("hive_dir", "group=1", "other=xxx")) - fs$CreateDir(minio_path("hive_dir", "group=2", "other=yyy")) - expect_length(fs$ls(minio_path("hive_dir")), 2) - write_parquet(df1, fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"))) - write_parquet(df2, fs$path(minio_path("hive_dir", "group=2", "other=yyy", "file2.parquet"))) - expect_identical( - read_parquet(fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"))), - df1 - ) - }) - - test_that("open_dataset with fs", { - ds <- open_dataset(fs$path(minio_path("hive_dir"))) - expect_identical( - ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) - ) - }) - - test_that("write_dataset with fs", { - ds <- open_dataset(fs$path(minio_path("hive_dir"))) - write_dataset(ds, fs$path(minio_path("new_dataset_dir"))) - expect_length(fs$ls(minio_path("new_dataset_dir")), 1) - }) - - test_that("CreateDir fails on bucket if allow_bucket_creation=False", { - now_tmp <- paste0(now, "-test-fail-delete") - fs$CreateDir(now_tmp) - - expect_error(limited_fs$CreateDir("should-fail")) - expect_error(limited_fs$DeleteDir(now_tmp)) - }) - - test_that("Let's test copy_files too", { - td <- make_temp_dir() - copy_files(minio_uri("hive_dir"), td) - expect_length(dir(td), 2) - ds <- open_dataset(td) - expect_identical( - ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) - ) - - # Let's copy the other way and use a SubTreeFileSystem rather than URI - copy_files(td, fs$path(minio_path("hive_dir2"))) - ds2 <- open_dataset(fs$path(minio_path("hive_dir2"))) - expect_identical( - ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) - ) - }) - } - test_that("S3FileSystem input validation", { expect_error( S3FileSystem$create(access_key = "foo"), From 729246f84fbbde24d0138bf5ddb55c037e68bb3c Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 7 Jul 2022 10:29:11 -0700 Subject: [PATCH 02/26] test: try to get GCS testbench working in R CI --- ci/scripts/r_test.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 519144ab4c5..d1bcad51abb 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -108,6 +108,12 @@ SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') pid_minio <- sys::exec_background('minio', c('server', minio_dir)) on.exit(tools::pskill(pid_minio), add = TRUE) } + + if (requireNamespace('reticulate', quietly = TRUE) && reticulate::py_module_available('testbench')) { + message('Running testbench for GCS tests (if build supports them)') + pid_minio <- sys::exec_background('python', c('-m', 'testbench', '--port', '9001')) + on.exit(tools::pskill(pid_minio), add = TRUE) + } } if (requireNamespace('reticulate', quietly = TRUE) && reticulate::py_module_available('pyarrow')) { From 86103fa71a0d876240eb42616b9a18d4ed8dd549 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 7 Jul 2022 11:16:07 -0700 Subject: [PATCH 03/26] fix: enable reticulate to enable tests --- ci/scripts/r_docker_configure.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 79bf55550e1..9bcbec960fd 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -88,6 +88,8 @@ if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "T fi if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh" ] && [ "`which pip`" ]; then + # Will need reticulate to check whether testbench is installed + Rscript -e 'install.packages("reticulate")' ${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh default fi fi From 78d7ff004cee839d54be547e827ff1c0fb1bd52f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 11 Jul 2022 13:40:24 -0700 Subject: [PATCH 04/26] feat: Add nice message for gcs tests --- r/tests/testthat/test-gcs.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index 44fba1ae144..3e345871dc6 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -72,7 +72,16 @@ if (process_is_running("testbench")) { ) now <- as.character(as.numeric(Sys.time())) - fs$CreateDir(now) + tryCatch(fs$CreateDir(now), error=function(cond) { + if (grepl("Couldn't connect to server", cond, fixed = TRUE)) { + abort( + c(sprintf("Unable to connect to testbench on port %s.", testbench_port), + i = "You can set a custom port with TESTBENCH_PORT environment variable."), + parent=cond) + } else { + stop(cond) + } + }) # Clean up when we're all done on.exit(fs$DeleteDir(now)) From e1a9e43ca0b10761622ccf09bb895b5545b3ad74 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 11 Jul 2022 14:46:06 -0700 Subject: [PATCH 05/26] fix: drop slash only for openoutputstream --- cpp/src/arrow/filesystem/gcsfs.cc | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index ce11c0aa223..7ef35d42066 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -71,22 +71,28 @@ struct GcsPath { std::string bucket; std::string object; - static Result FromString(const std::string& s) { - if (internal::IsLikelyUri(s)) { + static Result FromString(const std::string& s, + bool is_definitely_file = false) { + // If we know it's definitely a file, we should remove the trailing slash + const auto src = + is_definitely_file ? internal::RemoveTrailingSlash(s) : util::string_view(s); + + if (internal::IsLikelyUri(src)) { return Status::Invalid( - "Expected a GCS object path of the form 'bucket/key...', got a URI: '", s, "'"); + "Expected a GCS object path of the form 'bucket/key...', got a URI: '", src, + "'"); } - auto const first_sep = s.find_first_of(internal::kSep); + auto const first_sep = src.find_first_of(internal::kSep); if (first_sep == 0) { - return Status::Invalid("Path cannot start with a separator ('", s, "')"); + return Status::Invalid("Path cannot start with a separator ('", src, "')"); } if (first_sep == std::string::npos) { return GcsPath{s, std::string(internal::RemoveTrailingSlash(s)), ""}; } GcsPath path; - path.full_path = s; - path.bucket = s.substr(0, first_sep); - path.object = s.substr(first_sep + 1); + path.full_path = src.to_string(); + path.bucket = src.substr(0, first_sep).to_string(); + path.object = src.substr(first_sep + 1).to_string(); return path; } From d9da96911bfee04539a334ba14838d3d57d8624f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 11 Jul 2022 20:54:15 -0700 Subject: [PATCH 06/26] chore: lint R --- r/tests/testthat/test-gcs.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index 3e345871dc6..e7f998e261b 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -72,12 +72,12 @@ if (process_is_running("testbench")) { ) now <- as.character(as.numeric(Sys.time())) - tryCatch(fs$CreateDir(now), error=function(cond) { + tryCatch(fs$CreateDir(now), error = function(cond) { if (grepl("Couldn't connect to server", cond, fixed = TRUE)) { abort( c(sprintf("Unable to connect to testbench on port %s.", testbench_port), i = "You can set a custom port with TESTBENCH_PORT environment variable."), - parent=cond) + parent = cond) } else { stop(cond) } From 352b3e7d2757c5131ea7cdc337672e127e35dd88 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 13 Jul 2022 13:00:59 -0700 Subject: [PATCH 07/26] feat: add help message for starting testbench --- r/tests/testthat/test-gcs.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index e7f998e261b..8bebff3d216 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -96,6 +96,8 @@ if (process_is_running("testbench")) { test_filesystem("gcs", fs, gcs_path, gcs_uri) } else { test_that("GCSFileSystem tests with testbench", { - skip("testbench is not running") + suggested_command <- paste('gunicorn --bind "localhost:9001" --worker-class sync', + '--threads 10 --reload --access-logfile - "testbench:run()"') + skip(sprintf("testbench is not running. You can start it with:\n %s", suggested_command)) }) } From da12db9bda2e8d64f281656db2a8b3d7265d6c7b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 13 Jul 2022 13:01:27 -0700 Subject: [PATCH 08/26] test: differentiate test names for filesystems --- r/tests/testthat/helper-filesystems.R | 31 ++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R index f7a2b88f3ae..bb78588533e 100644 --- a/r/tests/testthat/helper-filesystems.R +++ b/r/tests/testthat/helper-filesystems.R @@ -15,13 +15,24 @@ # specific language governing permissions and limitations # under the License. +#' Run standard suite of integration tests for a filesystem +#' +#' @param name Name of filesystem to be printed in test name +#' @param fs A `FileSystem` instance to test with +#' @param path_formatter A function that takes a sequence of path segments and +#' returns a absolute path. +#' @param uri_formatter A function that takes a sequence of path segments and +#' returns a URI containing the filesystem scheme (e.g. 's3://', 'gs://'), the +#' absolute path, and any necessary connection options as URL query parameters. test_filesystem <- function(name, fs, path_formatter, uri_formatter) { - test_that(sprintf("read/write Feather on %s", name), { + # NOTE: it's important that we label these tests with name of filesystem so + # that we can differentiate the different calls to these test in the output. + test_that(sprintf("read/write Feather on %s using URIs", name), { write_feather(example_data, uri_formatter("test.feather")) expect_identical(read_feather(uri_formatter("test.feather")), example_data) }) - test_that("read/write Feather by filesystem, not URI", { + test_that(sprintf("read/write Feather on %s using Filesystem", name), { write_feather(example_data, fs$path(path_formatter("test2.feather"))) expect_identical( read_feather(fs$path(path_formatter("test2.feather"))), @@ -31,7 +42,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { library(dplyr) - test_that("read/write compressed csv by filesystem", { + test_that(sprintf("read/write compressed csv on %s using FileSystem", name), { skip_if_not_available("gzip") dat <- tibble(x = seq(1, 10, by = 0.2)) write_csv_arrow(dat, fs$path(path_formatter("test.csv.gz"))) @@ -41,7 +52,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - test_that("read/write csv by filesystem", { + test_that(sprintf("read/write csv on %s using FileSystem", name), { skip_if_not_available("gzip") dat <- tibble(x = seq(1, 10, by = 0.2)) write_csv_arrow(dat, fs$path(path_formatter("test.csv"))) @@ -51,7 +62,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - test_that("read/write stream", { + test_that(sprintf("read/write IPC stream on %s", name), { write_ipc_stream(example_data, fs$path(path_formatter("test3.ipc"))) expect_identical( read_ipc_stream(fs$path(path_formatter("test3.ipc"))), @@ -92,7 +103,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - test_that("open_dataset errors on URIs for different file systems", { + test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), { td <- make_temp_dir() expect_error( open_dataset( @@ -128,7 +139,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) # This is also to set up the dataset tests - test_that("write_parquet with filesystem arg", { + test_that(sprintf("write_parquet with %s filesystem arg", name), { skip_if_not_available("parquet") fs$CreateDir(path_formatter("hive_dir", "group=1", "other=xxx")) fs$CreateDir(path_formatter("hive_dir", "group=2", "other=yyy")) @@ -141,7 +152,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - test_that("open_dataset with fs", { + test_that(sprintf("open_dataset with %s", name), { ds <- open_dataset(fs$path(path_formatter("hive_dir"))) expect_identical( ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), @@ -149,13 +160,13 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - test_that("write_dataset with fs", { + test_that(sprintf("write_dataset with %s", name), { ds <- open_dataset(fs$path(path_formatter("hive_dir"))) write_dataset(ds, fs$path(path_formatter("new_dataset_dir"))) expect_length(fs$ls(path_formatter("new_dataset_dir")), 1) }) - test_that("Let's test copy_files too", { + test_that(sprintf("copy files with %s", name), { td <- make_temp_dir() copy_files(uri_formatter("hive_dir"), td) expect_length(dir(td), 2) From fa99d5f8060e5eeafe689e4dfb8ec33870342af2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 13 Jul 2022 14:24:13 -0700 Subject: [PATCH 09/26] feat: attempt to test GCS and MinIO on Windows CI --- .github/workflows/r.yml | 16 ++++++++++++++++ r/tests/testthat/helper-skip.R | 15 +++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index c5fc6d819bf..b915b383c02 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -273,6 +273,17 @@ jobs: cd r/windows ls *.zip | xargs -n 1 unzip -uo rm -rf *.zip + - name: Install MinIO + shell: bash + run: | + mkdir -p /usr/local/bin + wget \ + --output-document /usr/local/bin/minio.exe \ + https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z + chmod +x /usr/local/bin/minio.exe + - name: Install Google Cloud Storage Testbench + shell: bash + run: ci/scripts/install_gcs_testbench.sh default - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.rversion }} @@ -288,6 +299,11 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck + - name: Start MinIO and GCS Testbench + shell: bash + run: | + minio server ~/minio-data --port 9000 + python -m testbench --port 9001 - name: Check shell: Rscript {0} run: | diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 7a6c2687ed8..7279e245f23 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -109,6 +109,17 @@ process_is_running <- function(x) { return(TRUE) } - cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x) - tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE) + if (tolower(Sys.info()[["sysname"]]) == "windows") { + # Batch scripts (CMD.exe) doesn't provide a command that shows the original + # call arguments, which we need for testbench since it's launched from Python. + inner_cmd <- paste("WMIC path win32_process get Commandline", + sprintf("| Select-String %s", x), + "| Select-String powershell.exe -NotMatch") + cmd <- sprintf("powershell -command \"%s\"", inner_cmd) + tryCatch(length(system(cmd, intern = TRUE, show.output.on.console = FALSE)) > 0, + error = function(e) FALSE) + } else { + cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x) + tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE) + } } From ae01bf5d6d96835173059053c0891303656fca83 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 14 Jul 2022 18:11:26 -0700 Subject: [PATCH 10/26] fix: use curl instead of wget --- .github/workflows/r.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index b915b383c02..fef79caaec0 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -277,8 +277,8 @@ jobs: shell: bash run: | mkdir -p /usr/local/bin - wget \ - --output-document /usr/local/bin/minio.exe \ + curl \ + --output /usr/local/bin/minio.exe \ https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Install Google Cloud Storage Testbench From 2317adab47229c28927d57adf423337aa00ae6b9 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 14 Jul 2022 18:32:13 -0700 Subject: [PATCH 11/26] fix: reference absolute path to minio --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index fef79caaec0..e10d23fc325 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -302,7 +302,7 @@ jobs: - name: Start MinIO and GCS Testbench shell: bash run: | - minio server ~/minio-data --port 9000 + /usr/local/bin/minio.exe server ~/minio-data --port 9000 python -m testbench --port 9001 - name: Check shell: Rscript {0} From 0ea8710eec5311308d7d96dcdeae2f079c7a5b40 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 14 Jul 2022 19:03:59 -0700 Subject: [PATCH 12/26] fix: move install down after new bash installed --- .github/workflows/r.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index e10d23fc325..e9f32df2cfc 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -273,17 +273,6 @@ jobs: cd r/windows ls *.zip | xargs -n 1 unzip -uo rm -rf *.zip - - name: Install MinIO - shell: bash - run: | - mkdir -p /usr/local/bin - curl \ - --output /usr/local/bin/minio.exe \ - https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z - chmod +x /usr/local/bin/minio.exe - - name: Install Google Cloud Storage Testbench - shell: bash - run: ci/scripts/install_gcs_testbench.sh default - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.rversion }} @@ -299,6 +288,17 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck + - name: Install MinIO + shell: bash + run: | + mkdir -p /usr/local/bin + curl \ + --output /usr/local/bin/minio.exe \ + https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z + chmod +x /usr/local/bin/minio.exe + - name: Install Google Cloud Storage Testbench + shell: bash + run: ci/scripts/install_gcs_testbench.sh default - name: Start MinIO and GCS Testbench shell: bash run: | From ecbd2d3e890122591eca18db1ec40653ae6f9db0 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 14 Jul 2022 19:34:21 -0700 Subject: [PATCH 13/26] fix: drop unnecessary port --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index e9f32df2cfc..7cfeebccf3b 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -302,7 +302,7 @@ jobs: - name: Start MinIO and GCS Testbench shell: bash run: | - /usr/local/bin/minio.exe server ~/minio-data --port 9000 + /usr/local/bin/minio.exe server ~/minio-data python -m testbench --port 9001 - name: Check shell: Rscript {0} From e64ba0360dd98dc1a1caca29b332f1d2423dc7c0 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 14 Jul 2022 20:07:32 -0700 Subject: [PATCH 14/26] fix: run gcs testbench in the background --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 7cfeebccf3b..78e89f1c7d5 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -303,7 +303,7 @@ jobs: shell: bash run: | /usr/local/bin/minio.exe server ~/minio-data - python -m testbench --port 9001 + python -m testbench --port 9001 &>/dev/null & - name: Check shell: Rscript {0} run: | From ac68b5e3a2cacbbbc48959f299ca8ed61f141525 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 15 Jul 2022 07:54:08 -0700 Subject: [PATCH 15/26] fix: use R to run services in background --- .github/workflows/r.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 78e89f1c7d5..55c6b9f9576 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -299,11 +299,6 @@ jobs: - name: Install Google Cloud Storage Testbench shell: bash run: ci/scripts/install_gcs_testbench.sh default - - name: Start MinIO and GCS Testbench - shell: bash - run: | - /usr/local/bin/minio.exe server ~/minio-data - python -m testbench --port 9001 &>/dev/null & - name: Check shell: Rscript {0} run: | @@ -313,6 +308,14 @@ jobs: zip("libarrow.zip", ".") setwd("..") + minio_dir <- tempfile() + dir.create(minio_dir) + pid_minio <- sys::exec_background('/usr/local/bin/minio.exe', c('server', minio_dir)) + on.exit(tools::pskill(pid_minio), add = TRUE) + + pid_minio <- sys::exec_background('python', c('-m', 'testbench', '--port', '9001')) + on.exit(tools::pskill(pid_minio), add = TRUE) + Sys.setenv( RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"), MAKEFLAGS = paste0("-j", parallel::detectCores()), From 09ec151e0021d26faefa8ac111c04c8a670002a4 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 15 Jul 2022 08:03:05 -0700 Subject: [PATCH 16/26] chore: undo accidental changes to C++ --- cpp/src/arrow/filesystem/gcsfs.cc | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 7ef35d42066..ce11c0aa223 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -71,28 +71,22 @@ struct GcsPath { std::string bucket; std::string object; - static Result FromString(const std::string& s, - bool is_definitely_file = false) { - // If we know it's definitely a file, we should remove the trailing slash - const auto src = - is_definitely_file ? internal::RemoveTrailingSlash(s) : util::string_view(s); - - if (internal::IsLikelyUri(src)) { + static Result FromString(const std::string& s) { + if (internal::IsLikelyUri(s)) { return Status::Invalid( - "Expected a GCS object path of the form 'bucket/key...', got a URI: '", src, - "'"); + "Expected a GCS object path of the form 'bucket/key...', got a URI: '", s, "'"); } - auto const first_sep = src.find_first_of(internal::kSep); + auto const first_sep = s.find_first_of(internal::kSep); if (first_sep == 0) { - return Status::Invalid("Path cannot start with a separator ('", src, "')"); + return Status::Invalid("Path cannot start with a separator ('", s, "')"); } if (first_sep == std::string::npos) { return GcsPath{s, std::string(internal::RemoveTrailingSlash(s)), ""}; } GcsPath path; - path.full_path = src.to_string(); - path.bucket = src.substr(0, first_sep).to_string(); - path.object = src.substr(first_sep + 1).to_string(); + path.full_path = s; + path.bucket = s.substr(0, first_sep); + path.object = s.substr(first_sep + 1); return path; } From 2b675522836574c1d1c00dda0a3b56438ec7066b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 15 Jul 2022 13:02:35 -0700 Subject: [PATCH 17/26] feat: move server init within tests --- .github/workflows/r.yml | 8 -------- ci/scripts/r_docker_configure.sh | 2 -- ci/scripts/r_test.sh | 14 -------------- r/tests/testthat/test-gcs.R | 25 +++++++++++++++++++------ r/tests/testthat/test-s3-minio.R | 20 +++++++++++++++----- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 55c6b9f9576..4b356704d8b 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -308,14 +308,6 @@ jobs: zip("libarrow.zip", ".") setwd("..") - minio_dir <- tempfile() - dir.create(minio_dir) - pid_minio <- sys::exec_background('/usr/local/bin/minio.exe', c('server', minio_dir)) - on.exit(tools::pskill(pid_minio), add = TRUE) - - pid_minio <- sys::exec_background('python', c('-m', 'testbench', '--port', '9001')) - on.exit(tools::pskill(pid_minio), add = TRUE) - Sys.setenv( RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"), MAKEFLAGS = paste0("-j", parallel::detectCores()), diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 9bcbec960fd..79bf55550e1 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -88,8 +88,6 @@ if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "T fi if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh" ] && [ "`which pip`" ]; then - # Will need reticulate to check whether testbench is installed - Rscript -e 'install.packages("reticulate")' ${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh default fi fi diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index d1bcad51abb..f532bc7cf0a 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -100,20 +100,6 @@ SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') } else { args <- c('--no-manual', '--ignore-vignettes') build_args <- '--no-build-vignettes' - - if (nzchar(Sys.which('minio'))) { - message('Running minio for S3 tests (if build supports them)') - minio_dir <- tempfile() - dir.create(minio_dir) - pid_minio <- sys::exec_background('minio', c('server', minio_dir)) - on.exit(tools::pskill(pid_minio), add = TRUE) - } - - if (requireNamespace('reticulate', quietly = TRUE) && reticulate::py_module_available('testbench')) { - message('Running testbench for GCS tests (if build supports them)') - pid_minio <- sys::exec_background('python', c('-m', 'testbench', '--port', '9001')) - on.exit(tools::pskill(pid_minio), add = TRUE) - } } if (requireNamespace('reticulate', quietly = TRUE) && reticulate::py_module_available('pyarrow')) { diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index 8bebff3d216..2f6ed54c1ca 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -61,9 +61,16 @@ test_that("GcsFileSystem$create() input validation", { ) }) -if (process_is_running("testbench")) { +if (system('python -c "import testbench"') == 0) { testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001") + pid_minio <- sys::exec_background("python", c("-m", "testbench", "--port", testbench_port), + std_out = FALSE, + std_err = FALSE # TODO: is there a good place to send output? + ) + withr::defer(tools::pskill(pid_minio)) + Sys.sleep(1) # Wait for startup + fs <- GcsFileSystem$create( endpoint_override = sprintf("localhost:%s", testbench_port), retry_limit_seconds = 1, @@ -76,14 +83,16 @@ if (process_is_running("testbench")) { if (grepl("Couldn't connect to server", cond, fixed = TRUE)) { abort( c(sprintf("Unable to connect to testbench on port %s.", testbench_port), - i = "You can set a custom port with TESTBENCH_PORT environment variable."), - parent = cond) + i = "You can set a custom port with TESTBENCH_PORT environment variable." + ), + parent = cond + ) } else { stop(cond) } }) # Clean up when we're all done - on.exit(fs$DeleteDir(now)) + withr::defer(fs$DeleteDir(now)) gcs_path <- function(...) { paste(now, ..., sep = "/") @@ -94,10 +103,14 @@ if (process_is_running("testbench")) { } test_filesystem("gcs", fs, gcs_path, gcs_uri) + + withr::deferred_run() } else { test_that("GCSFileSystem tests with testbench", { - suggested_command <- paste('gunicorn --bind "localhost:9001" --worker-class sync', - '--threads 10 --reload --access-logfile - "testbench:run()"') + suggested_command <- paste( + 'gunicorn --bind "localhost:9001" --worker-class sync', + '--threads 10 --reload --access-logfile - "testbench:run()"' + ) skip(sprintf("testbench is not running. You can start it with:\n %s", suggested_command)) }) } diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index d07559f0ad8..46abceb02e5 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -17,12 +17,19 @@ source_file("helper-filesystems.R") -if (arrow_with_s3() && process_is_running("minio server")) { - # Get minio config, with expected defaults - minio_key <- Sys.getenv("MINIO_ACCESS_KEY", "minioadmin") - minio_secret <- Sys.getenv("MINIO_SECRET_KEY", "minioadmin") +if (arrow_with_s3() && nzchar(Sys.which("minio"))) { + minio_dir <- Sys.getenv("MINIO_DATA_DIR", tempfile()) + minio_key <- "minioadmin" + minio_secret <- "minioadmin" minio_port <- Sys.getenv("MINIO_PORT", "9000") + # Start minio server + dir.create(minio_dir, showWarnings = FALSE) + pid_minio <- sys::exec_background("minio", c("server", minio_dir, "--address", sprintf(":%s", minio_port)), + std_out = FALSE + ) + withr::defer(tools::pskill(pid_minio)) + # Helper function for minio URIs minio_uri <- function(...) { template <- "s3://%s:%s@%s?scheme=http&endpoint_override=localhost%s%s" @@ -50,7 +57,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { now <- as.character(as.numeric(Sys.time())) fs$CreateDir(now) # Clean up when we're all done - on.exit(fs$DeleteDir(now)) + withr::defer(fs$DeleteDir(now)) test_filesystem("s3", fs, minio_path, minio_uri) @@ -96,6 +103,9 @@ if (arrow_with_s3() && process_is_running("minio server")) { 'Cannot specify "external_id" without providing a role_arn string' ) }) + + # Cleanup + withr::deferred_run() } else { # Kinda hacky, let's put a skipped test here, just so we note that the tests # didn't run From a5bb5dcbec37214670ea4c7efd7c42cb1c64206f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 15 Jul 2022 14:09:27 -0700 Subject: [PATCH 18/26] fix: alternative messages should say not installed --- r/tests/testthat/test-gcs.R | 6 +----- r/tests/testthat/test-s3-minio.R | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index 2f6ed54c1ca..f9ce039a844 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -107,10 +107,6 @@ if (system('python -c "import testbench"') == 0) { withr::deferred_run() } else { test_that("GCSFileSystem tests with testbench", { - suggested_command <- paste( - 'gunicorn --bind "localhost:9001" --worker-class sync', - '--threads 10 --reload --access-logfile - "testbench:run()"' - ) - skip(sprintf("testbench is not running. You can start it with:\n %s", suggested_command)) + skip("googleapis-storage-testbench is not installed.") }) } diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 46abceb02e5..d6caadb529d 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -110,6 +110,6 @@ if (arrow_with_s3() && nzchar(Sys.which("minio"))) { # Kinda hacky, let's put a skipped test here, just so we note that the tests # didn't run test_that("S3FileSystem tests with Minio", { - skip("Minio is not running") + skip("Minio is not installed") }) } From b6ac039a5946b1c429047fc6acf80ef6207ebf98 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 18 Jul 2022 08:48:09 -0700 Subject: [PATCH 19/26] fix: install sys in Windows CI --- .github/workflows/r.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 4b356704d8b..e0b60c91a6c 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -288,6 +288,7 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck + sys - name: Install MinIO shell: bash run: | From a6bec839af6399629c82bdf704d2365aa1f4e380 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 18 Jul 2022 09:45:06 -0700 Subject: [PATCH 20/26] fix: as a workaround, use Rscript to install sys --- .github/workflows/r.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index e0b60c91a6c..c11d5016798 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -288,7 +288,11 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck - sys + - name: Install sys package + shell: Rscript {0} + # Adding this to extra-packages above causes pak to attempt to install + # the unix package for unknown reasons. + run: install.packages("sys") - name: Install MinIO shell: bash run: | From 1fb253d1d08d556d32bfc9521f4052df56253d29 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 19 Jul 2022 15:24:42 -0700 Subject: [PATCH 21/26] fix: add minio to path and sys to Suggests --- .github/workflows/r.yml | 10 +++------- r/DESCRIPTION | 1 + 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index c11d5016798..a5cd9f9ccf4 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -288,19 +288,15 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck - - name: Install sys package - shell: Rscript {0} - # Adding this to extra-packages above causes pak to attempt to install - # the unix package for unknown reasons. - run: install.packages("sys") - name: Install MinIO shell: bash run: | - mkdir -p /usr/local/bin + mkdir -p "$HOME/.local/bin" curl \ --output /usr/local/bin/minio.exe \ https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z - chmod +x /usr/local/bin/minio.exe + chmod +x "$HOME/.local/bin/minio.exe" + echo "$HOME/.local/bin" >> $GITHUB_PATH - name: Install Google Cloud Storage Testbench shell: bash run: ci/scripts/install_gcs_testbench.sh default diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 90e84d34bc2..cf83f563902 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -61,6 +61,7 @@ Suggests: rmarkdown, stringi, stringr, + sys, testthat (>= 3.1.0), tibble, tzdb, From 6b97163475e72157d0c2ef42ad0c642ad2e5bc27 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 19 Jul 2022 15:41:15 -0700 Subject: [PATCH 22/26] fix: fix output path for minio --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index a5cd9f9ccf4..a3c0c0dbee0 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -293,7 +293,7 @@ jobs: run: | mkdir -p "$HOME/.local/bin" curl \ - --output /usr/local/bin/minio.exe \ + --output "$HOME/.local/bin/minio.exe" \ https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x "$HOME/.local/bin/minio.exe" echo "$HOME/.local/bin" >> $GITHUB_PATH From baba443b4cd4dd906651e22d0991e7694ae2462d Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 20 Jul 2022 12:06:31 -0700 Subject: [PATCH 23/26] fix: try without GCS on Windows for now --- .github/workflows/r.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index a3c0c0dbee0..29b6ec600c5 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -297,9 +297,10 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x "$HOME/.local/bin/minio.exe" echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Install Google Cloud Storage Testbench - shell: bash - run: ci/scripts/install_gcs_testbench.sh default + # TODO: figure out why the GCS tests are hanging + # - name: Install Google Cloud Storage Testbench + # shell: bash + # run: ci/scripts/install_gcs_testbench.sh default - name: Check shell: Rscript {0} run: | From c5d5e550de13c655cf53768e9e997a92b12bff3d Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 20 Jul 2022 13:20:23 -0700 Subject: [PATCH 24/26] fix: remove unnecessary source --- r/tests/testthat/test-gcs.R | 2 -- r/tests/testthat/test-s3-minio.R | 2 -- 2 files changed, 4 deletions(-) diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index f9ce039a844..1cc990eeb3f 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -17,8 +17,6 @@ skip_if_not_available("gcs") -source_file("helper-filesystems.R") - test_that("FileSystem$from_uri with gs://", { fs_and_path <- FileSystem$from_uri("gs://my/test/bucket/") expect_r6_class(fs_and_path$fs, "GcsFileSystem") diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index d6caadb529d..1c03b524946 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -source_file("helper-filesystems.R") - if (arrow_with_s3() && nzchar(Sys.which("minio"))) { minio_dir <- Sys.getenv("MINIO_DATA_DIR", tempfile()) minio_key <- "minioadmin" From e12f508e73ffcd190ec132b04092f3d876378369 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 21 Jul 2022 10:48:10 -0700 Subject: [PATCH 25/26] fix: move dplyr import and use skip instead of if statement --- r/tests/testthat/helper-filesystems.R | 4 +- r/tests/testthat/test-gcs.R | 82 ++++++------ r/tests/testthat/test-s3-minio.R | 171 +++++++++++++------------- 3 files changed, 127 insertions(+), 130 deletions(-) diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R index bb78588533e..2ad6d23b010 100644 --- a/r/tests/testthat/helper-filesystems.R +++ b/r/tests/testthat/helper-filesystems.R @@ -40,7 +40,9 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - library(dplyr) + if (!("package:dplyr" %in% search())) { + abort("library(dplyr) required for test_filesystem()") + } test_that(sprintf("read/write compressed csv on %s using FileSystem", name), { skip_if_not_available("gzip") diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index 1cc990eeb3f..c0a02193c55 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -59,52 +59,50 @@ test_that("GcsFileSystem$create() input validation", { ) }) -if (system('python -c "import testbench"') == 0) { - testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001") +skip_on_cran() +skip_if_not(system('python -c "import testbench"') == 0, message = "googleapis-storage-testbench is not installed.") +library(dplyr) - pid_minio <- sys::exec_background("python", c("-m", "testbench", "--port", testbench_port), - std_out = FALSE, - std_err = FALSE # TODO: is there a good place to send output? - ) - withr::defer(tools::pskill(pid_minio)) - Sys.sleep(1) # Wait for startup +testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001") - fs <- GcsFileSystem$create( - endpoint_override = sprintf("localhost:%s", testbench_port), - retry_limit_seconds = 1, - scheme = "http", - anonymous = TRUE # Will fail to resolve host name if anonymous isn't TRUE - ) +pid_minio <- sys::exec_background("python", c("-m", "testbench", "--port", testbench_port), + std_out = FALSE, + std_err = FALSE # TODO: is there a good place to send output? +) +withr::defer(tools::pskill(pid_minio)) +Sys.sleep(1) # Wait for startup - now <- as.character(as.numeric(Sys.time())) - tryCatch(fs$CreateDir(now), error = function(cond) { - if (grepl("Couldn't connect to server", cond, fixed = TRUE)) { - abort( - c(sprintf("Unable to connect to testbench on port %s.", testbench_port), - i = "You can set a custom port with TESTBENCH_PORT environment variable." - ), - parent = cond - ) - } else { - stop(cond) - } - }) - # Clean up when we're all done - withr::defer(fs$DeleteDir(now)) +fs <- GcsFileSystem$create( + endpoint_override = sprintf("localhost:%s", testbench_port), + retry_limit_seconds = 1, + scheme = "http", + anonymous = TRUE # Will fail to resolve host name if anonymous isn't TRUE +) - gcs_path <- function(...) { - paste(now, ..., sep = "/") +now <- as.character(as.numeric(Sys.time())) +tryCatch(fs$CreateDir(now), error = function(cond) { + if (grepl("Couldn't connect to server", cond, fixed = TRUE)) { + abort( + c(sprintf("Unable to connect to testbench on port %s.", testbench_port), + i = "You can set a custom port with TESTBENCH_PORT environment variable." + ), + parent = cond + ) + } else { + stop(cond) } - gcs_uri <- function(...) { - template <- "gs://anonymous@%s?scheme=http&endpoint_override=localhost%s%s&retry_limit_seconds=1" - sprintf(template, gcs_path(...), "%3A", testbench_port) - } - - test_filesystem("gcs", fs, gcs_path, gcs_uri) +}) +# Clean up when we're all done +withr::defer(fs$DeleteDir(now)) - withr::deferred_run() -} else { - test_that("GCSFileSystem tests with testbench", { - skip("googleapis-storage-testbench is not installed.") - }) +gcs_path <- function(...) { + paste(now, ..., sep = "/") +} +gcs_uri <- function(...) { + template <- "gs://anonymous@%s?scheme=http&endpoint_override=localhost%s%s&retry_limit_seconds=1" + sprintf(template, gcs_path(...), "%3A", testbench_port) } + +test_filesystem("gcs", fs, gcs_path, gcs_uri) + +withr::deferred_run() diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 1c03b524946..4cd11333922 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -15,99 +15,96 @@ # specific language governing permissions and limitations # under the License. -if (arrow_with_s3() && nzchar(Sys.which("minio"))) { - minio_dir <- Sys.getenv("MINIO_DATA_DIR", tempfile()) - minio_key <- "minioadmin" - minio_secret <- "minioadmin" - minio_port <- Sys.getenv("MINIO_PORT", "9000") +skip_if_not(arrow_with_s3(), message = "arrow not build with S3 support.") +skip_if_not(nzchar(Sys.which("minio")), message = "minio is not installed.") - # Start minio server - dir.create(minio_dir, showWarnings = FALSE) - pid_minio <- sys::exec_background("minio", c("server", minio_dir, "--address", sprintf(":%s", minio_port)), - std_out = FALSE - ) - withr::defer(tools::pskill(pid_minio)) +library(dplyr) - # Helper function for minio URIs - minio_uri <- function(...) { - template <- "s3://%s:%s@%s?scheme=http&endpoint_override=localhost%s%s" - sprintf(template, minio_key, minio_secret, minio_path(...), "%3A", minio_port) - } - minio_path <- function(...) paste(now, ..., sep = "/") +minio_dir <- Sys.getenv("MINIO_DATA_DIR", tempfile()) +minio_key <- "minioadmin" +minio_secret <- "minioadmin" +minio_port <- Sys.getenv("MINIO_PORT", "9000") - # Create a "bucket" on minio for this test run, which we'll delete when done. - fs <- S3FileSystem$create( - access_key = minio_key, - secret_key = minio_secret, - scheme = "http", - endpoint_override = paste0("localhost:", minio_port), - allow_bucket_creation = TRUE, - allow_bucket_deletion = TRUE - ) - limited_fs <- S3FileSystem$create( - access_key = minio_key, - secret_key = minio_secret, - scheme = "http", - endpoint_override = paste0("localhost:", minio_port), - allow_bucket_creation = FALSE, - allow_bucket_deletion = FALSE - ) - now <- as.character(as.numeric(Sys.time())) - fs$CreateDir(now) - # Clean up when we're all done - withr::defer(fs$DeleteDir(now)) +# Start minio server +dir.create(minio_dir, showWarnings = FALSE) +pid_minio <- sys::exec_background("minio", c("server", minio_dir, "--address", sprintf(":%s", minio_port)), + std_out = FALSE +) +withr::defer(tools::pskill(pid_minio)) - test_filesystem("s3", fs, minio_path, minio_uri) +# Helper function for minio URIs +minio_uri <- function(...) { + template <- "s3://%s:%s@%s?scheme=http&endpoint_override=localhost%s%s" + sprintf(template, minio_key, minio_secret, minio_path(...), "%3A", minio_port) +} +minio_path <- function(...) paste(now, ..., sep = "/") - test_that("CreateDir fails on bucket if allow_bucket_creation=False", { - now_tmp <- paste0(now, "-test-fail-delete") - fs$CreateDir(now_tmp) +# Create a "bucket" on minio for this test run, which we'll delete when done. +fs <- S3FileSystem$create( + access_key = minio_key, + secret_key = minio_secret, + scheme = "http", + endpoint_override = paste0("localhost:", minio_port), + allow_bucket_creation = TRUE, + allow_bucket_deletion = TRUE +) +limited_fs <- S3FileSystem$create( + access_key = minio_key, + secret_key = minio_secret, + scheme = "http", + endpoint_override = paste0("localhost:", minio_port), + allow_bucket_creation = FALSE, + allow_bucket_deletion = FALSE +) +now <- as.character(as.numeric(Sys.time())) +fs$CreateDir(now) +# Clean up when we're all done +withr::defer(fs$DeleteDir(now)) - expect_error(limited_fs$CreateDir("should-fail")) - expect_error(limited_fs$DeleteDir(now_tmp)) - }) +test_filesystem("s3", fs, minio_path, minio_uri) - test_that("S3FileSystem input validation", { - expect_error( - S3FileSystem$create(access_key = "foo"), - "Key authentication requires both access_key and secret_key" - ) - expect_error( - S3FileSystem$create(secret_key = "foo"), - "Key authentication requires both access_key and secret_key" - ) - expect_error( - S3FileSystem$create(session_token = "foo"), - paste0( - "In order to initialize a session with temporary credentials, ", - "both secret_key and access_key must be provided ", - "in addition to session_token." - ) - ) - expect_error( - S3FileSystem$create(access_key = "foo", secret_key = "asdf", anonymous = TRUE), - 'Cannot specify "access_key" and "secret_key" when anonymous = TRUE' - ) - expect_error( - S3FileSystem$create(access_key = "foo", secret_key = "asdf", role_arn = "qwer"), - "Cannot provide both key authentication and role_arn" - ) - expect_error( - S3FileSystem$create(access_key = "foo", secret_key = "asdf", external_id = "qwer"), - 'Cannot specify "external_id" without providing a role_arn string' - ) - expect_error( - S3FileSystem$create(external_id = "foo"), - 'Cannot specify "external_id" without providing a role_arn string' +test_that("CreateDir fails on bucket if allow_bucket_creation=False", { + now_tmp <- paste0(now, "-test-fail-delete") + fs$CreateDir(now_tmp) + + expect_error(limited_fs$CreateDir("should-fail")) + expect_error(limited_fs$DeleteDir(now_tmp)) +}) + +test_that("S3FileSystem input validation", { + expect_error( + S3FileSystem$create(access_key = "foo"), + "Key authentication requires both access_key and secret_key" + ) + expect_error( + S3FileSystem$create(secret_key = "foo"), + "Key authentication requires both access_key and secret_key" + ) + expect_error( + S3FileSystem$create(session_token = "foo"), + paste0( + "In order to initialize a session with temporary credentials, ", + "both secret_key and access_key must be provided ", + "in addition to session_token." ) - }) + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", anonymous = TRUE), + 'Cannot specify "access_key" and "secret_key" when anonymous = TRUE' + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", role_arn = "qwer"), + "Cannot provide both key authentication and role_arn" + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", external_id = "qwer"), + 'Cannot specify "external_id" without providing a role_arn string' + ) + expect_error( + S3FileSystem$create(external_id = "foo"), + 'Cannot specify "external_id" without providing a role_arn string' + ) +}) - # Cleanup - withr::deferred_run() -} else { - # Kinda hacky, let's put a skipped test here, just so we note that the tests - # didn't run - test_that("S3FileSystem tests with Minio", { - skip("Minio is not installed") - }) -} +# Cleanup +withr::deferred_run() From b42042aa7d1a32bf0f66be1a54406c06c5f9b86c Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 20 Sep 2022 07:51:11 -0700 Subject: [PATCH 26/26] Update .github/workflows/r.yml --- .github/workflows/r.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 29b6ec600c5..02454a6ee3a 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -297,7 +297,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x "$HOME/.local/bin/minio.exe" echo "$HOME/.local/bin" >> $GITHUB_PATH - # TODO: figure out why the GCS tests are hanging + # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows # - name: Install Google Cloud Storage Testbench # shell: bash # run: ci/scripts/install_gcs_testbench.sh default