From bba466c7c6be25fc9023e04a9c773ae83f11ba41 Mon Sep 17 00:00:00 2001 From: karldw Date: Mon, 23 Aug 2021 22:24:45 -0700 Subject: [PATCH 01/27] Start on offline build --- r/.gitignore | 1 + r/Makefile | 1 + r/R/util.R | 63 +++++++++++++++++++++++++ r/man/download_optional_dependencies.Rd | 32 +++++++++++++ r/tools/nixlibs.R | 53 ++++++++++++++------- 5 files changed, 132 insertions(+), 18 deletions(-) create mode 100644 r/man/download_optional_dependencies.Rd diff --git a/r/.gitignore b/r/.gitignore index 76e8a8dd0bd..937f055fcf3 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -18,3 +18,4 @@ vignettes/nyc-taxi/ arrow_*.tar.gz arrow_*.tgz extra-tests/files +/tools/cpp/ diff --git a/r/Makefile b/r/Makefile index 7a51cbd5188..882bbf06015 100644 --- a/r/Makefile +++ b/r/Makefile @@ -38,6 +38,7 @@ deps: build: doc cp ../NOTICE.txt inst/NOTICE.txt + rsync --archive --delete ../cpp tools/ R CMD build . check: build diff --git a/r/R/util.R b/r/R/util.R index 5958b0b3111..b6264479368 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -183,3 +183,66 @@ repeat_value_as_array <- function(object, n) { } return(Scalar$create(object)$as_array(n)) } + + +#' Download all optional Arrow dependencies +#' +#' @param deps_dir Directory to save files into. Will be created if necessary. +#' +#' @return TRUE/FALSE for whether the downloads were successful +#' +#' This function is used for setting up an offline build. If it's possible to +#' download at build time, don't use this function. Instead, let `cmake` +#' download them for you. +#' These saved files are only used in the build if `ARROW_DEPENDENCY_SOURCE` +#' is `BUNDLED` or `AUTO`. +#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +#' +#' @examples +#' \dontrun{ +#' download_optional_dependencies("arrow-thirdparty") +#' } +#' # Now define the environment variables (see arrow-thirdparty/DEFINE_ENV_VARS.sh) +#' # and run your offline build. 
+download_optional_dependencies <- function(deps_dir) { + #' This script is copied over from arrow/cpp/... to arrow/r/tools/cpp/... + download_dependencies_sh <- system.file( + "tools/cpp/thirdparty/download_dependencies.sh", + package = "arrow", + mustWork = TRUE + ) + + # Make sure the directory is sort of reasonable before creating it + deps_dir <- trimws(deps_dir) + stopifnot(nchar(deps_dir) >= 1) + dir.create(deps_dir, showWarnings = FALSE, recursive = TRUE) + + # Run download_dependencies.sh + stdout_file <- tempfile() + stderr_file <- tempfile() + file.create(stdout_file, stderr_file) + cat("***Downloading optional dependencies to ", deps_dir) + return_status <- system2(download_dependencies_sh, args = deps_dir, + stdout = stdout_file, stderr = stderr_file + ) + if (return_status == 0) { + # File contents are something like: + # # Environment variables for offline Arrow build + # export ARROW_ABSL_URL=/path/to/file/absl-12345.tar.gz + # export ... + env_var_file <- file.path(deps_dir, "DEFINE_ENV_VARS.sh") + # Also save a copy in the directory for ease of use. 
+ file.copy(stdout_file, env_var_file) + msg <- c( + "*** Offline build environment variables", + paste(" (These are also saved in ", env_var_file, ")\n"), + readLines(stdout_file) + ) + cat(paste(msg, collapse = "\n")) + } else { + msg <- c("Failed to download some optional dependencies", readLines(stderr_file), "") + warning(paste(msg, collapse = "\n")) + } + # Return success status + return_status == 0 +} diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd new file mode 100644 index 00000000000..cf81f6af8ff --- /dev/null +++ b/r/man/download_optional_dependencies.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{download_optional_dependencies} +\alias{download_optional_dependencies} +\title{Download all optional Arrow dependencies} +\usage{ +download_optional_dependencies(deps_dir) +} +\arguments{ +\item{deps_dir}{Directory to save files into. Will be created if necessary.} +} +\value{ +TRUE/FALSE for whether the downloads were successful + +This function is used for setting up an offline build. If it's possible to +download at build time, don't use this function. Instead, let \code{cmake} +download them for you. +These saved files are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} +is \code{BUNDLED} or \code{AUTO}. +https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +} +\description{ +Download all optional Arrow dependencies +} +\examples{ +\dontrun{ +download_optional_dependencies("arrow-thirdparty") +} +# Now define the environment variables (see arrow-thirdparty/DEFINE_ENV_VARS.sh) +# and run your offline build. +This script is copied over from arrow/cpp/... to arrow/r/tools/cpp/... 
+} diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index cd0838395bf..acc6a9c774e 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -29,17 +29,8 @@ if (getRversion() < 3.4 && is.null(getOption("download.file.method"))) { options(.arrow.cleanup = character()) # To collect dirs to rm on exit on.exit(unlink(getOption(".arrow.cleanup"))) + env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value) -# * no download, build_ok: Only build with local git checkout -# * download_ok, no build: Only use prebuilt binary, if found -# * neither: Get the arrow-without-arrow package -# Download and build are OK unless you say not to -download_ok <- !env_is("LIBARROW_DOWNLOAD", "false") -build_ok <- !env_is("LIBARROW_BUILD", "false") -# But binary defaults to not OK -binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false") -# For local debugging, set ARROW_R_DEV=TRUE to make this script print more -quietly <- !env_is("ARROW_R_DEV", "true") try_download <- function(from_url, to_file) { status <- try( @@ -52,6 +43,21 @@ try_download <- function(from_url, to_file) { !inherits(status, "try-error") && status == 0 } +quietly <- !env_is("ARROW_R_DEV", "true") # try_download uses quietly global +# * download_ok, build_ok: Use prebuilt binary, if found, otherwise try to build +# * no download, build_ok: Build with local git checkout, if available, or +# sources included in r/tools/cpp/. Optional dependencies are not included, +# and will not be automatically downloaded. 
+# https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +# * download_ok, no build: Only use prebuilt binary, if found +# * neither: Get the arrow-without-arrow package +# Download and build are OK unless you say not to (or can't access github) +download_ok <- (!env_is("LIBARROW_DOWNLOAD", "false")) && try_download("https://github.com", tempfile()) +build_ok <- !env_is("LIBARROW_BUILD", "false") +# But binary defaults to not OK +binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false") +# For local debugging, set ARROW_R_DEV=TRUE to make this script print more + download_binary <- function(os = identify_os()) { libfile <- tempfile() if (!is.null(os)) { @@ -271,13 +277,18 @@ apache_download <- function(version, destfile, n_mirrors = 3) { } find_local_source <- function(arrow_home = Sys.getenv("ARROW_SOURCE_HOME", "..")) { + cpp_dir <- NULL if (file.exists(paste0(arrow_home, "/cpp/src/arrow/api.h"))) { # We're in a git checkout of arrow, so we can build it - cat("*** Found local C++ source\n") - return(paste0(arrow_home, "/cpp")) - } else { - return(NULL) + cpp_dir <- paste0(arrow_home, "/cpp") + } else if (file.exists("tools/cpp/src/arrow/api.h")) { + # Use the version bundled in tools/cpp/ + cpp_dir <- "tools/cpp" + } + if (!is.null(cpp_dir)) { + cat("*** Found local C++ source:\n '", cpp_dir, "'\n") } + cpp_dir } build_libarrow <- function(src_dir, dst_dir) { @@ -373,6 +384,14 @@ ensure_cmake <- function() { "https://github.com/Kitware/CMake/releases/download/v", CMAKE_VERSION, "/cmake-", CMAKE_VERSION, postfix ) + if (!download_ok) { + stop( + "cmake was not found and downloads are not permitted.\n", + "Make sure cmake is installed and available on your PATH\n", + "(or download '", cmake_binary_url, + "' and define the CMAKE environment variable)." 
+ ) + } cmake_tar <- tempfile() cmake_dir <- tempfile() try_download(cmake_binary_url, cmake_tar) @@ -503,12 +522,10 @@ if (!file.exists(paste0(dst_dir, "/include/arrow/api.h"))) { unlink(bin_file) } else if (build_ok) { # (2) Find source and build it - if (download_ok) { + src_dir <- find_local_source() + if (is.null(src_dir) && download_ok) { src_dir <- download_source() } - if (is.null(src_dir)) { - src_dir <- find_local_source() - } if (!is.null(src_dir)) { cat("*** Building C++ libraries\n") build_libarrow(src_dir, dst_dir) From 0cac87405500dc4b88bff4431adb09a3355ab076 Mon Sep 17 00:00:00 2001 From: karldw Date: Tue, 24 Aug 2021 16:10:00 -0700 Subject: [PATCH 02/27] Add checks for features --- r/Makefile | 1 + r/R/util.R | 6 +- r/inst/build_arrow_static.sh | 2 +- r/tools/nixlibs.R | 307 ++++++++++++++++++++++++++++++++--- 4 files changed, 288 insertions(+), 28 deletions(-) diff --git a/r/Makefile b/r/Makefile index 882bbf06015..9e4a51733e3 100644 --- a/r/Makefile +++ b/r/Makefile @@ -57,4 +57,5 @@ clean: -rm src/Makevars.win -rm -rf arrow.Rcheck/ -rm -rf libarrow/ + -rm -rf tools/cpp/ -find . -name "*.orig" -delete diff --git a/r/R/util.R b/r/R/util.R index b6264479368..163c348ec32 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -205,13 +205,12 @@ repeat_value_as_array <- function(object, n) { #' # Now define the environment variables (see arrow-thirdparty/DEFINE_ENV_VARS.sh) #' # and run your offline build. download_optional_dependencies <- function(deps_dir) { - #' This script is copied over from arrow/cpp/... to arrow/r/tools/cpp/... + # This script is copied over from arrow/cpp/... to arrow/r/tools/cpp/... 
download_dependencies_sh <- system.file( "tools/cpp/thirdparty/download_dependencies.sh", package = "arrow", mustWork = TRUE ) - # Make sure the directory is sort of reasonable before creating it deps_dir <- trimws(deps_dir) stopifnot(nchar(deps_dir) >= 1) @@ -222,7 +221,8 @@ download_optional_dependencies <- function(deps_dir) { stderr_file <- tempfile() file.create(stdout_file, stderr_file) cat("***Downloading optional dependencies to ", deps_dir) - return_status <- system2(download_dependencies_sh, args = deps_dir, + return_status <- system2(download_dependencies_sh, + args = deps_dir, stdout = stdout_file, stderr = stderr_file ) if (return_status == 0) { diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 84a9f971246..578d8b6e5b2 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -59,7 +59,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_FILESYSTEM=ON \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ - -DARROW_JSON=ON \ + -DARROW_JSON=${ARROW_JSON:-ON} \ -DARROW_PARQUET=${ARROW_PARQUET:-ON} \ -DARROW_S3=${ARROW_S3:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-$ARROW_DEFAULT_PARAM} \ diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index acc6a9c774e..1412d748ef1 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -43,7 +43,7 @@ try_download <- function(from_url, to_file) { !inherits(status, "try-error") && status == 0 } -quietly <- !env_is("ARROW_R_DEV", "true") # try_download uses quietly global +quietly <- !env_is("ARROW_R_DEV", "true") # try_download uses quietly global # * download_ok, build_ok: Use prebuilt binary, if found, otherwise try to build # * no download, build_ok: Build with local git checkout, if available, or # sources included in r/tools/cpp/. 
Optional dependencies are not included, @@ -199,6 +199,10 @@ system_release <- function() { read_system_release <- function() utils::head(readLines("/etc/system-release"), 1) +is_solaris <- function() { + tolower(Sys.info()[["sysname"]]) %in% "sunos" +} + #### end distro #### find_available_binary <- function(os) { @@ -340,18 +344,22 @@ build_libarrow <- function(src_dir, dst_dir) { LDFLAGS = R_CMD_config("LDFLAGS") ) env_vars <- paste0(names(env_var_list), '="', env_var_list, '"', collapse = " ") + # Add env variables like ARROW_S3=ON. Order doesn't matter. Depends on `download_ok` env_vars <- with_s3_support(env_vars) env_vars <- with_mimalloc(env_vars) - if (tolower(Sys.info()[["sysname"]]) %in% "sunos") { - # jemalloc doesn't seem to build on Solaris - # nor does thrift, so turn off parquet, - # and arrowExports.cpp requires parquet for dataset (ARROW-11994), so turn that off - # xsimd doesn't compile, so set SIMD level to NONE to skip it - # re2 and utf8proc do compile, - # but `ar` fails to build libarrow_bundled_dependencies, so turn them off - # so that there are no bundled deps - env_vars <- paste(env_vars, "ARROW_JEMALLOC=OFF ARROW_PARQUET=OFF ARROW_DATASET=OFF ARROW_WITH_RE2=OFF ARROW_WITH_UTF8PROC=OFF EXTRA_CMAKE_FLAGS=-DARROW_SIMD_LEVEL=NONE") - } + env_vars <- with_jemalloc(env_vars) + env_vars <- with_parquet(env_vars) + env_vars <- with_dataset(env_vars) + env_vars <- with_brotli(env_vars) + env_vars <- with_bz2(env_vars) + env_vars <- with_lz4(env_vars) + env_vars <- with_re2(env_vars) + env_vars <- with_snappy(env_vars) + env_vars <- with_utf8proc(env_vars) + env_vars <- with_zlib(env_vars) + env_vars <- with_zstd(env_vars) + env_vars <- with_xsimd(env_vars) + cat("**** arrow", ifelse(quietly, "", paste("with", env_vars)), "\n") status <- suppressWarnings(system( paste(env_vars, "inst/build_arrow_static.sh"), @@ -434,12 +442,108 @@ cmake_version <- function(cmd = "cmake") { ) } -with_s3_support <- function(env_vars) { - arrow_s3 <- 
toupper(Sys.getenv("ARROW_S3")) == "ON" || tolower(Sys.getenv("LIBARROW_MINIMAL")) == "false" - # but if ARROW_S3=OFF explicitly, we are definitely off, so override - if (toupper(Sys.getenv("ARROW_S3")) == "OFF") { - arrow_s3 <- FALSE +is_feature_requested <- function(arrow_feature) { + # Cases: + # * nothing set: OFF + # * explicitly enabled: ON + # * LIBARROW_MINIMAL=false: ON + # Note that if LIBARROW_MINIMAL is unset, `configure` sets it to "false" when + # NOT_CRAN or LIBARROW_DOWNLOAD are "true". + explicitly_set_val <- toupper(Sys.getenv(arrow_feature)) + if (explicitly_set_val == "OFF") { + feature_on <- FALSE + } else { + feature_on <- explicitly_set_val == "ON" || env_is("LIBARROW_MINIMAL", "false") + } + feature_on +} + +remote_download_unavailable <- function(url_env_vars) { + # Check the env vars + # e.g. ARROW_MIMALLOC_URL should point to an existing file if !download_ok + # Some dependencies require multiple downloads - check that all are available. + # https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds + missing_local <- FALSE + for (v in url_env_vars) { + local_url <- Sys.getenv(v) + missing_local <- missing_local || (local_url == "") || (!file.exists(local_url)) + } + # This check is only relevant when Cmake would try to download things + # (This check would change if we were using individual dependency resolution.) 
+ # https://arrow.apache.org/docs/developers/cpp/building.html#individual-dependency-resolution + download_required <- missing_local && + (toupper(Sys.getenv("ARROW_DEPENDENCY_SOURCE")) %in% c("", "BUNDLED", "AUTO")) + download_unavailable <- download_required && (!download_ok) + download_unavailable +} + +# Memory alloc features: mimalloc, jemalloc +with_mimalloc <- function(env_vars) { + # Note that the logic here is different than in build_arrow_static.sh, which + # includes mimalloc by default even when LIBARROW_MINIMAL=true + arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC") + + if (arrow_mimalloc) { + # User wants mimalloc. If they're using gcc, let's make sure the version is >= 4.9 + if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { + cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") + arrow_mimalloc <- FALSE + } + download_unavailable <- remote_download_unavailable("ARROW_MIMALLOC_URL") + if (download_unavailable) { + cat(paste( + "**** mimalloc needs to be downloaded, but can't be.", + "See ?arrow::download_optional_dependencies.", + "Building with ARROW_MIMALLOC=OFF\n" + )) + arrow_mimalloc <- FALSE + } + } + paste(env_vars, ifelse(arrow_mimalloc, "ARROW_MIMALLOC=ON", "ARROW_MIMALLOC=OFF")) +} + +with_jemalloc <- function(env_vars) { + arrow_jemalloc <- is_feature_requested("ARROW_JEMALLOC") && !is_solaris() + # jemalloc doesn't seem to build on Solaris + if (arrow_jemalloc) { + download_unavailable <- remote_download_unavailable("ARROW_JEMALLOC_URL") + if (download_unavailable) { + cat("**** jemalloc requested but cannot be downloaded. Setting ARROW_JEMALLOC=OFF\n") + arrow_jemalloc <- FALSE + } } + paste(env_vars, ifelse(arrow_jemalloc, "ARROW_JEMALLOC=ON", "ARROW_JEMALLOC=OFF")) +} + +# File access features: parquet, dataset, S3 +with_parquet <- function(env_vars) { + # We try to build parquet unless it's explicitly turned off, even if + # LIBARROW_MINIMAL=true. 
+ # Parquet is built-in, but depends on Thrift, which is thirdparty + arrow_parquet <- !env_is("ARROW_PARQUET", "off") && !is_solaris() + # Thrift doesn't compile on solaris, so turn off parquet there. + if (arrow_parquet) { + download_unavailable <- remote_download_unavailable("ARROW_THRIFT_URL") + if (download_unavailable) { + cat("**** parquet requested but dependencies cannot be downloaded. Setting ARROW_PARQUET=OFF\n") + arrow_parquet <- FALSE + } + } + paste(env_vars, ifelse(arrow_parquet, "ARROW_PARQUET=ON", "ARROW_PARQUET=OFF")) +} + +with_dataset <- function(env_vars) { + # Note: we try to build dataset unless it's explicitly turned off, even if + # LIBARROW_MINIMAL=true. + arrow_dataset <- (!env_is("ARROW_DATASET", "off")) && + grepl("ARROW_PARQUET=ON", with_parquet("")) + # arrowExports.cpp requires parquet for dataset (ARROW-11994), so turn dataset + # off if parquet is off. + paste(env_vars, ifelse(arrow_dataset, "ARROW_DATASET=ON", "ARROW_DATASET=OFF")) +} + +with_s3_support <- function(env_vars) { + arrow_s3 <- is_feature_requested("ARROW_S3") if (arrow_s3) { # User wants S3 support. 
If they're using gcc, let's make sure the version is >= 4.9 # and make sure that we have curl and openssl system libs @@ -454,22 +558,177 @@ with_s3_support <- function(env_vars) { cat("**** S3 support requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE } + download_unavailable <- remote_download_unavailable(c( + "ARROW_AWSSDK_URL", + "ARROW_AWS_C_COMMON_URL", + "ARROW_AWS_CHECKSUMS_URL", + "ARROW_AWS_C_EVENT_STREAM_URL" + )) + if (download_unavailable) { + cat(paste( + "**** S3 dependencies need to be downloaded, but can't be.", + "See ?arrow::download_optional_dependencies.", + "Building with ARROW_S3=OFF\n" + )) + arrow_s3 <- FALSE + } } paste(env_vars, ifelse(arrow_s3, "ARROW_S3=ON", "ARROW_S3=OFF")) } -with_mimalloc <- function(env_vars) { - arrow_mimalloc <- toupper(Sys.getenv("ARROW_MIMALLOC")) == "ON" || tolower(Sys.getenv("LIBARROW_MINIMAL")) == "false" - if (arrow_mimalloc) { - # User wants mimalloc. If they're using gcc, let's make sure the version is >= 4.9 - if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { - cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") - arrow_mimalloc <- FALSE +# Compression features: brotli, bz2, lz4, snappy, zlib, zstd +with_brotli <- function(env_vars) { + arrow_brotli <- is_feature_requested("ARROW_WITH_BROTLI") + if (arrow_brotli) { + download_unavailable <- remote_download_unavailable("ARROW_BROTLI_URL") + if (download_unavailable) { + cat("**** brotli requested but cannot be downloaded. 
Setting ARROW_WITH_BROTLI=OFF\n") + arrow_brotli <- FALSE } } - paste(env_vars, ifelse(arrow_mimalloc, "ARROW_MIMALLOC=ON", "ARROW_MIMALLOC=OFF")) + paste(env_vars, ifelse(arrow_brotli, "ARROW_WITH_BROTLI=ON", "ARROW_WITH_BROTLI=OFF")) +} + +with_bz2 <- function(env_vars) { + arrow_brotli <- is_feature_requested("ARROW_WITH_BZ2") + if (arrow_brotli) { + download_unavailable <- remote_download_unavailable("ARROW_BZIP2_URL") + if (download_unavailable) { + cat("**** bz2 requested but cannot be downloaded. Setting ARROW_WITH_BZ2=OFF\n") + arrow_brotli <- FALSE + } + } + paste(env_vars, ifelse(arrow_brotli, "ARROW_WITH_BZ2=ON", "ARROW_WITH_BZ2=OFF")) +} + +with_lz4 <- function(env_vars) { + arrow_lz4 <- is_feature_requested("ARROW_WITH_LZ4") + if (arrow_lz4) { + download_unavailable <- remote_download_unavailable("ARROW_LZ4_URL") + if (download_unavailable) { + cat("**** lz4 requested but cannot be downloaded. Setting ARROW_WITH_LZ4=OFF\n") + arrow_lz4 <- FALSE + } + } + paste(env_vars, ifelse(arrow_lz4, "ARROW_WITH_LZ4=ON", "ARROW_WITH_LZ4=OFF")) +} + +with_snappy <- function(env_vars) { + arrow_snappy <- is_feature_requested("ARROW_WITH_SNAPPY") + if (arrow_snappy) { + download_unavailable <- remote_download_unavailable("ARROW_SNAPPY_URL") + if (download_unavailable) { + cat("**** snappy requested but cannot be downloaded. Setting ARROW_WITH_SNAPPY=OFF\n") + arrow_snappy <- FALSE + } + } + paste(env_vars, ifelse(arrow_snappy, "ARROW_WITH_SNAPPY=ON", "ARROW_WITH_SNAPPY=OFF")) +} + +with_zlib <- function(env_vars) { + arrow_zlib <- is_feature_requested("ARROW_WITH_ZLIB") + if (arrow_zlib) { + download_unavailable <- remote_download_unavailable("ARROW_ZLIB_URL") + if (download_unavailable) { + cat("**** zlib requested but cannot be downloaded. 
Setting ARROW_WITH_ZLIB=OFF\n") + arrow_zlib <- FALSE + } + } + paste(env_vars, ifelse(arrow_zlib, "ARROW_WITH_ZLIB=ON", "ARROW_WITH_ZLIB=OFF")) +} + +with_zstd <- function(env_vars) { + arrow_zstd <- is_feature_requested("ARROW_WITH_ZSTD") + if (arrow_zstd) { + download_unavailable <- remote_download_unavailable("ARROW_ZSTD_URL") + if (download_unavailable) { + cat("**** zstd requested but cannot be downloaded. Setting ARROW_WITH_ZSTD=OFF\n") + arrow_zstd <- FALSE + } + } + paste(env_vars, ifelse(arrow_zstd, "ARROW_WITH_ZSTD=ON", "ARROW_WITH_ZSTD=OFF")) +} + +# Specific computations: json, re2, utf8proc, xsimd +with_json <- function(env_vars) { + # Note: we try to build json unless it's explicitly turned off, even if + # LIBARROW_MINIMAL=true. + arrow_json <- (!env_is("ARROW_JSON", "off")) || (!env_is("ARROW_WITH_RAPIDJSON", "off")) + if (arrow_json) { + download_unavailable <- remote_download_unavailable("ARROW_RAPIDJSON_URL") + if (download_unavailable) { + cat("**** json requested but cannot be downloaded. Setting ARROW_JSON=OFF\n") + arrow_json <- FALSE + } + } + paste(env_vars, ifelse(arrow_json, "ARROW_WITH_JSON=ON", "ARROW_WITH_JSON=OFF")) } +with_re2 <- function(env_vars) { + # Note: we try to build re2 unless it's explicitly turned off, even if + # LIBARROW_MINIMAL=true. + arrow_re2 <- !env_is("ARROW_WITH_RE2", "off") && !is_solaris() + # re2 and utf8proc do compile on Solaris + # but `ar` fails to build libarrow_bundled_dependencies, so turn them off + # so that there are no bundled deps + if (arrow_re2) { + download_unavailable <- remote_download_unavailable("ARROW_RE2_URL") + if (download_unavailable) { + cat("**** re2 requested but cannot be downloaded. Setting ARROW_WITH_RE2=OFF\n") + arrow_re2 <- FALSE + } + } + paste(env_vars, ifelse(arrow_re2, "ARROW_WITH_RE2=ON", "ARROW_WITH_RE2=OFF")) +} + +with_utf8proc <- function(env_vars) { + # Note: we try to build utf8proc unless it's explicitly turned off, even if + # LIBARROW_MINIMAL=true. 
+ arrow_utf8proc <- !env_is("ARROW_WITH_UTF8PROC", "off") && !is_solaris() + # re2 and utf8proc do compile on Solaris + # but `ar` fails to build libarrow_bundled_dependencies, so turn them off + # so that there are no bundled deps + if (arrow_utf8proc) { + download_unavailable <- remote_download_unavailable("ARROW_UTF8PROC_URL") + if (download_unavailable) { + cat("**** utf8proc requested but cannot be downloaded. Setting ARROW_WITH_UTF8PROC=OFF\n") + arrow_utf8proc <- FALSE + } + } + paste(env_vars, ifelse(arrow_utf8proc, "ARROW_WITH_UTF8PROC=ON", "ARROW_WITH_UTF8PROC=OFF")) +} + +with_xsimd <- function(env_vars) { + # xsimd doesn't compile on solaris, so set SIMD level to NONE to skip it. + # Use it everywhere else (as long as xsimd is available) + use_simd <- !is_solaris() + if (use_simd) { + download_unavailable <- remote_download_unavailable("ARROW_XSIMD_URL") + if (download_unavailable) { + cat("**** xsimd requested but cannot be downloaded. Setting EXTRA_CMAKE_FLAGS=-DARROW_SIMD_LEVEL=NONE\n") + use_simd <- FALSE + } + } + paste(env_vars, ifelse(use_simd, "", "EXTRA_CMAKE_FLAGS=-DARROW_SIMD_LEVEL=NONE")) +} + +# Notes on other downloaded dependencies: +# Boost is required in some cases (Flight, Gandiva, S3, and tests, at least), +# but there's no such thing as ARROW_BOOST=OFF. +# It may be necessary to set BOOST_ROOT or ARROW_BOOST_URL for offline installs. +# +# Other URLs get downloaded, but afaik, are not used in the build. 
+# - ARROW_ABSL_URL - seems to be a dependency of gRPC +# - ARROW_CARES_URL - "a dependency of gRPC" +# - ARROW_GBENCHMARK_URL - "Google benchmark, for testing" +# - ARROW_GFLAGS_URL - "for command line utilities (formerly Googleflags)" +# - ARROW_GLOG_URL - "for logging" +# - ARROW_GRPC_URL - "for remote procedure calls" +# - ARROW_GTEST_URL - "Googletest, for testing" +# - ARROW_ORC_URL - "for Apache ORC format support" +# - ARROW_PROTOBUF_URL - "Google Protocol Buffers, for data serialization" + + cmake_gcc_version <- function(env_vars) { # This function returns NA if using a non-gcc compiler # Always enclose calls to it in isTRUE() or isFALSE() From 318b166463574ee8cc12202ad6c9c6645e714685 Mon Sep 17 00:00:00 2001 From: karldw Date: Wed, 25 Aug 2021 11:05:23 -0700 Subject: [PATCH 03/27] Fixes to offline build --- r/.gitignore | 8 ++++++++ r/Makefile | 11 ++++++++++- r/NAMESPACE | 1 + r/R/util.R | 26 ++++++++++++++++++++----- r/man/download_optional_dependencies.Rd | 24 ++++++++++++++++++++--- r/tests/testthat/test-install-arrow.R | 17 ++++++++++++++++ r/tools/nixlibs.R | 2 +- 7 files changed, 79 insertions(+), 10 deletions(-) diff --git a/r/.gitignore b/r/.gitignore index 937f055fcf3..4837920768a 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -18,4 +18,12 @@ vignettes/nyc-taxi/ arrow_*.tar.gz arrow_*.tgz extra-tests/files + +# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here. 
/tools/cpp/ +# cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up +# from cpp/, but again, they're just copies +/tools/.env +/tools/LICENSE.txt +/tools/NOTICE.txt +/inst/thirdparty/ diff --git a/r/Makefile b/r/Makefile index 9e4a51733e3..525858c0fdb 100644 --- a/r/Makefile +++ b/r/Makefile @@ -36,9 +36,18 @@ test: deps: R -s -e 'lib <- Sys.getenv("R_LIB", .libPaths()[1]); install.packages("devtools", repo="https://cloud.r-project.org", lib=lib); devtools::install_dev_deps(lib=lib)' +# Note: files in tools are available at build time, but not at run time. The thirdparty +# scripts need to be in inst/ so they're available to download_optional_dependencies() +# cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up from cpp/ build: doc cp ../NOTICE.txt inst/NOTICE.txt rsync --archive --delete ../cpp tools/ + cp -p ../.env tools/ + cp -p ../NOTICE.txt tools/ + cp -p ../LICENSE.txt tools/ + mkdir -p inst/thirdparty + cp -p ../cpp/thirdparty/download_dependencies.sh inst/thirdparty/ + cp -p ../cpp/thirdparty/versions.txt inst/thirdparty/ R CMD build . check: build @@ -57,5 +66,5 @@ clean: -rm src/Makevars.win -rm -rf arrow.Rcheck/ -rm -rf libarrow/ - -rm -rf tools/cpp/ + -rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt -find . -name "*.orig" -delete diff --git a/r/NAMESPACE b/r/NAMESPACE index 8bcc58653fb..217990bae1a 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -206,6 +206,7 @@ export(date64) export(decimal) export(default_memory_pool) export(dictionary) +export(download_optional_dependencies) export(ends_with) export(everything) export(field) diff --git a/r/R/util.R b/r/R/util.R index 163c348ec32..01579c77f5d 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -198,16 +198,32 @@ repeat_value_as_array <- function(object, n) { #' is `BUNDLED` or `AUTO`. 
#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds #' +#' Steps for an offline install with optional dependencies: +#' - Install the `arrow` package on a computer with internet access +#' - Run this function +#' - Copy the saved dependency files to a computer without internet access +#' - Export the environment variables printed by this function on the computer +#' without internet access. For example, this function will print +#' `export ARROW_THRIFT_URL=/path/to/deps_dir/file.tar.gz` +#' - These export commands are also saved in `DEFINE_ENV_VARS.sh`, in the same +#' directory +#' - You may have to edit the paths if the copied folder is not accessible at +#' the same location as it was when `download_optional_dependencies()` was +#' run on the internet-connected computer +#' - Install the `arrow` package on the computer without internet access +#' - Run [arrow_info()] to check installed capabilities +#' #' @examples #' \dontrun{ #' download_optional_dependencies("arrow-thirdparty") +#' file.exists("arrow-thirdparty/DEFINE_ENV_VARS.sh") # TRUE +#' list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar #' } -#' # Now define the environment variables (see arrow-thirdparty/DEFINE_ENV_VARS.sh) -#' # and run your offline build. +#' @export download_optional_dependencies <- function(deps_dir) { # This script is copied over from arrow/cpp/... to arrow/r/tools/cpp/... 
download_dependencies_sh <- system.file( - "tools/cpp/thirdparty/download_dependencies.sh", + "thirdparty/download_dependencies.sh", package = "arrow", mustWork = TRUE ) @@ -220,7 +236,7 @@ download_optional_dependencies <- function(deps_dir) { stdout_file <- tempfile() stderr_file <- tempfile() file.create(stdout_file, stderr_file) - cat("***Downloading optional dependencies to ", deps_dir) + cat(paste0("*** Downloading optional dependencies to ", deps_dir, "\n")) return_status <- system2(download_dependencies_sh, args = deps_dir, stdout = stdout_file, stderr = stderr_file @@ -244,5 +260,5 @@ download_optional_dependencies <- function(deps_dir) { warning(paste(msg, collapse = "\n")) } # Return success status - return_status == 0 + invisible(return_status == 0) } diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd index cf81f6af8ff..c713b6956b9 100644 --- a/r/man/download_optional_dependencies.Rd +++ b/r/man/download_optional_dependencies.Rd @@ -18,6 +18,25 @@ download them for you. These saved files are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} is \code{BUNDLED} or \code{AUTO}. https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds + +Steps for an offline install with optional dependencies: +\itemize{ +\item Install the \code{arrow} package on a computer with internet access +\item Run this function +\item Copy the saved dependency files to a computer without internet access +\item Export the environment variables printed by this function on the computer +without internet access. 
For example, this function will print +\verb{export ARROW_THRIFT_URL=/path/to/deps_dir/file.tar.gz} +\itemize{ +\item These export commands are also saved in \code{DEFINE_ENV_VARS.sh}, in the same +directory +\item You may have to edit the paths if the copied folder is not accessible at +the same location as it was when \code{download_optional_dependencies()} was +run on the internet-connected computer +} +\item Install the \code{arrow} package on the computer without internet access +\item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities +} } \description{ Download all optional Arrow dependencies @@ -25,8 +44,7 @@ Download all optional Arrow dependencies \examples{ \dontrun{ download_optional_dependencies("arrow-thirdparty") +file.exists("arrow-thirdparty/DEFINE_ENV_VARS.sh") # TRUE +list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar } -# Now define the environment variables (see arrow-thirdparty/DEFINE_ENV_VARS.sh) -# and run your offline build. -This script is copied over from arrow/cpp/... to arrow/r/tools/cpp/... 
} diff --git a/r/tests/testthat/test-install-arrow.R b/r/tests/testthat/test-install-arrow.R index c53ee829829..9681d41d108 100644 --- a/r/tests/testthat/test-install-arrow.R +++ b/r/tests/testthat/test-install-arrow.R @@ -37,3 +37,20 @@ r_only({ }) }) }) + + +r_only({ + test_that("download_optional_dependencies", { + skip_if_offline() + deps_dir <- tempfile() + download_successful <- expect_output( + download_optional_dependencies(deps_dir), + "export ARROW_THRIFT_URL" + ) + expect_true(download_successful) + env_var_file <- file.path(deps_dir, "DEFINE_ENV_VARS.sh") + expect_true(file.exists(env_var_file)) + env_var_lines <- readLines(env_var_file) + expect_true(any(grepl("export ARROW_THRIFT_URL", env_var_lines))) + }) +}) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 1412d748ef1..374d2fa8ad5 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -290,7 +290,7 @@ find_local_source <- function(arrow_home = Sys.getenv("ARROW_SOURCE_HOME", "..") cpp_dir <- "tools/cpp" } if (!is.null(cpp_dir)) { - cat("*** Found local C++ source:\n '", cpp_dir, "'\n") + cat(paste0("*** Found local C++ source:\n '", cpp_dir, "'\n")) } cpp_dir } From adef21df5ffd52749bee1621ea0e94d1cb2b3cb2 Mon Sep 17 00:00:00 2001 From: karldw Date: Wed, 25 Aug 2021 12:20:14 -0700 Subject: [PATCH 04/27] Re-enable cmake download, add to pkgdown --- r/_pkgdown.yml | 1 + r/tools/nixlibs.R | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 90d900ddf28..0bbbc827779 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -175,6 +175,7 @@ reference: - arrow_available - install_arrow - install_pyarrow + - download_optional_dependencies repo: jira_projects: [ARROW] diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 374d2fa8ad5..4ad59e4a228 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -392,17 +392,17 @@ ensure_cmake <- function() { "https://github.com/Kitware/CMake/releases/download/v", CMAKE_VERSION, "/cmake-", 
CMAKE_VERSION, postfix ) - if (!download_ok) { + cmake_tar <- tempfile() + cmake_dir <- tempfile() + download_successful <- try_download(cmake_binary_url, cmake_tar) + if (!download_successful) { stop( - "cmake was not found and downloads are not permitted.\n", + "cmake was not found locally and download failed.\n", "Make sure cmake is installed and available on your PATH\n", "(or download '", cmake_binary_url, "' and define the CMAKE environment variable)." ) } - cmake_tar <- tempfile() - cmake_dir <- tempfile() - try_download(cmake_binary_url, cmake_tar) untar(cmake_tar, exdir = cmake_dir) unlink(cmake_tar) options(.arrow.cleanup = c(getOption(".arrow.cleanup"), cmake_dir)) From 5626cb7c48231e6e1351964b930375cfee4b949b Mon Sep 17 00:00:00 2001 From: karldw Date: Wed, 25 Aug 2021 14:25:40 -0700 Subject: [PATCH 05/27] Remove LIBARROW_DOWNLOAD, add TEST_OFFLINE_BUILD + minor fixes --- r/R/install-arrow.R | 1 - r/configure | 12 ++--- r/tools/nixlibs.R | 112 ++++++++++------------------------------ r/vignettes/install.Rmd | 20 ++++--- 4 files changed, 44 insertions(+), 101 deletions(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 63db8ede910..b07ac60f50a 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -70,7 +70,6 @@ install_arrow <- function(nightly = FALSE, } } else { Sys.setenv( - LIBARROW_DOWNLOAD = "true", LIBARROW_BINARY = binary, LIBARROW_MINIMAL = minimal, ARROW_R_DEV = verbose, diff --git a/r/configure b/r/configure index 696c38a00ad..0bf82285a7a 100755 --- a/r/configure +++ b/r/configure @@ -39,7 +39,7 @@ FORCE_AUTOBREW=`echo $FORCE_AUTOBREW | tr '[:upper:]' '[:lower:]'` FORCE_BUNDLED_BUILD=`echo $FORCE_BUNDLED_BUILD | tr '[:upper:]' '[:lower:]'` ARROW_USE_PKG_CONFIG=`echo $ARROW_USE_PKG_CONFIG | tr '[:upper:]' '[:lower:]'` LIBARROW_MINIMAL=`echo $LIBARROW_MINIMAL | tr '[:upper:]' '[:lower:]'` -LIBARROW_DOWNLOAD=`echo $LIBARROW_DOWNLOAD | tr '[:upper:]' '[:lower:]'` +TEST_OFFLINE_BUILD=`echo $TEST_OFFLINE_BUILD | tr '[:upper:]' 
'[:lower:]'` NOT_CRAN=`echo $NOT_CRAN | tr '[:upper:]' '[:lower:]'` VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //` @@ -130,14 +130,8 @@ else fi else # Set some default values/backwards compatibility - if [ "${LIBARROW_DOWNLOAD}" = "" ] && [ "${NOT_CRAN}" != "" ]; then - LIBARROW_DOWNLOAD=$NOT_CRAN; export LIBARROW_DOWNLOAD - fi - if [ "${LIBARROW_BINARY}" = "" ] && [ "${LIBARROW_DOWNLOAD}" != "" ]; then - LIBARROW_BINARY=$LIBARROW_DOWNLOAD; export LIBARROW_BINARY - fi - if [ "${LIBARROW_MINIMAL}" = "" ] && [ "${LIBARROW_DOWNLOAD}" = "true" ]; then - LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL + if [ "${LIBARROW_BINARY}" = "" ] && [ "${NOT_CRAN}" = "true" ]; then + LIBARROW_BINARY=true; export LIBARROW_BINARY fi if [ "${LIBARROW_MINIMAL}" = "" ] && [ "${NOT_CRAN}" = "true" ]; then LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 4ad59e4a228..6ac78755583 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -43,20 +43,23 @@ try_download <- function(from_url, to_file) { !inherits(status, "try-error") && status == 0 } +build_ok <- !env_is("LIBARROW_BUILD", "false") +# But binary defaults to not OK +binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false") +# For local debugging, set ARROW_R_DEV=TRUE to make this script print more + quietly <- !env_is("ARROW_R_DEV", "true") # try_download uses quietly global # * download_ok, build_ok: Use prebuilt binary, if found, otherwise try to build -# * no download, build_ok: Build with local git checkout, if available, or +# * !download_ok, build_ok: Build with local git checkout, if available, or # sources included in r/tools/cpp/. Optional dependencies are not included, # and will not be automatically downloaded. 
+# cmake will still be downloaded if necessary # https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds -# * download_ok, no build: Only use prebuilt binary, if found +# * download_ok, !build_ok: Only use prebuilt binary, if found # * neither: Get the arrow-without-arrow package # Download and build are OK unless you say not to (or can't access github) -download_ok <- (!env_is("LIBARROW_DOWNLOAD", "false")) && try_download("https://github.com", tempfile()) -build_ok <- !env_is("LIBARROW_BUILD", "false") -# But binary defaults to not OK -binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false") -# For local debugging, set ARROW_R_DEV=TRUE to make this script print more +download_ok <- !env_is("TEST_OFFLINE_BUILD", "true") && try_download("https://github.com", tempfile()) + download_binary <- function(os = identify_os()) { libfile <- tempfile() @@ -88,7 +91,7 @@ download_binary <- function(os = identify_os()) { # * `TRUE` (not case-sensitive), to try to discover your current OS, or # * some other string, presumably a related "distro-version" that has binaries # built that work for your OS -identify_os <- function(os = Sys.getenv("LIBARROW_BINARY", Sys.getenv("LIBARROW_DOWNLOAD"))) { +identify_os <- function(os = Sys.getenv("LIBARROW_BINARY", Sys.getenv("TEST_OFFLINE_BUILD"))) { if (tolower(os) %in% c("", "false")) { # Env var says not to download a binary return(NULL) @@ -219,79 +222,20 @@ find_available_binary <- function(os) { os } -download_source <- function() { - tf1 <- tempfile() - src_dir <- tempfile() - - # Given VERSION as x.y.z.p - p <- package_version(VERSION)[1, 4] - if (is.na(p) || p < 1000) { - # This is either just x.y.z or it has a small (R-only) patch version - # Download from the official Apache release, dropping the p - VERSION <- as.character(package_version(VERSION)[1, -4]) - if (apache_download(VERSION, tf1)) { - untar(tf1, exdir = src_dir) - unlink(tf1) - src_dir <- paste0(src_dir, "/apache-arrow-", 
VERSION, "/cpp") - } - } else if (p != 9000) { - # This is a custom dev version (x.y.z.9999) or a nightly (x.y.z.20210505) - # (Don't try to download on the default dev .9000 version) - if (nightly_download(VERSION, tf1)) { - unzip(tf1, exdir = src_dir) - unlink(tf1) - src_dir <- paste0(src_dir, "/cpp") - } - } - - if (dir.exists(src_dir)) { - cat("*** Successfully retrieved C++ source\n") - options(.arrow.cleanup = c(getOption(".arrow.cleanup"), src_dir)) - # These scripts need to be executable - system( - sprintf("chmod 755 %s/build-support/*.sh", src_dir), - ignore.stdout = quietly, ignore.stderr = quietly - ) - return(src_dir) - } else { - return(NULL) - } -} - -nightly_download <- function(version, destfile) { - source_url <- paste0(arrow_repo, "src/arrow-", version, ".zip") - try_download(source_url, destfile) -} - -apache_download <- function(version, destfile, n_mirrors = 3) { - apache_path <- paste0("arrow/arrow-", version, "/apache-arrow-", version, ".tar.gz") - apache_urls <- c( - # This returns a different mirror each time - rep("https://www.apache.org/dyn/closer.lua?action=download&filename=", n_mirrors), - "https://downloads.apache.org/" # The backup +find_local_source <- function() { + # We'll take the first of these that exists + # The first case probably occurs if we're in the arrow git repo + # The second probably occurs if we're installing the arrow R package + cpp_dir_options <- c( + Sys.getenv("ARROW_SOURCE_HOME", ".."), + "tools/cpp" ) - downloaded <- FALSE - for (u in apache_urls) { - downloaded <- try_download(paste0(u, apache_path), destfile) - if (downloaded) { - break - } - } - downloaded -} - -find_local_source <- function(arrow_home = Sys.getenv("ARROW_SOURCE_HOME", "..")) { - cpp_dir <- NULL - if (file.exists(paste0(arrow_home, "/cpp/src/arrow/api.h"))) { - # We're in a git checkout of arrow, so we can build it - cpp_dir <- paste0(arrow_home, "/cpp") - } else if (file.exists("tools/cpp/src/arrow/api.h")) { - # Use the version bundled 
in tools/cpp/ - cpp_dir <- "tools/cpp" - } - if (!is.null(cpp_dir)) { - cat(paste0("*** Found local C++ source:\n '", cpp_dir, "'\n")) + valid_cpp_dir <- file.exists(file.path(cpp_dir_options, "src/arrow/api.h")) + if (!any(valid_cpp_dir)) { + return(NULL) } + cpp_dir <- cpp_dir_options[valid_cpp_dir][1] + cat(paste0("*** Found local C++ source:\n '", cpp_dir, "'\n")) cpp_dir } @@ -448,7 +392,7 @@ is_feature_requested <- function(arrow_feature) { # * explicitly enabled: ON # * LIBARROW_MINIMAL=false: ON # Note that if LIBARROW_MINIMAL is unset, `configure` sets it to "false" when - # NOT_CRAN or LIBARROW_DOWNLOAD are "true". + # NOT_CRAN or TEST_OFFLINE_BUILD are "true". explicitly_set_val <- toupper(Sys.getenv(arrow_feature)) if (explicitly_set_val == "OFF") { feature_on <- FALSE @@ -590,15 +534,15 @@ with_brotli <- function(env_vars) { } with_bz2 <- function(env_vars) { - arrow_brotli <- is_feature_requested("ARROW_WITH_BZ2") - if (arrow_brotli) { + arrow_bz2 <- is_feature_requested("ARROW_WITH_BZ2") + if (arrow_bz2) { download_unavailable <- remote_download_unavailable("ARROW_BZIP2_URL") if (download_unavailable) { cat("**** bz2 requested but cannot be downloaded. Setting ARROW_WITH_BZ2=OFF\n") - arrow_brotli <- FALSE + arrow_bz2 <- FALSE } } - paste(env_vars, ifelse(arrow_brotli, "ARROW_WITH_BZ2=ON", "ARROW_WITH_BZ2=OFF")) + paste(env_vars, ifelse(arrow_bz2, "ARROW_WITH_BZ2=ON", "ARROW_WITH_BZ2=OFF")) } with_lz4 <- function(env_vars) { diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 47ae8944b71..c758e6f53a2 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -288,13 +288,17 @@ See discussion [here](https://issues.apache.org/jira/browse/ARROW-8556). Some features are optional when you build Arrow from source. With the exception of `ARROW_S3`, these are all `ON` by default in the bundled C++ build, but you can set them to `OFF` to disable them. 
-* `ARROW_S3`: If set to `ON` S3 support will be built as long as the - dependencies are met; if they are not met, the build script will turn this `OFF` +* `ARROW_S3`: If set to `ON` S3 support will be built as long as the + dependencies are met; if they are not met, the build script will turn this `OFF` * `ARROW_JEMALLOC` for the `jemalloc` memory allocator +* `ARROW_MIMALLOC` for the `mimalloc` memmory allocator * `ARROW_PARQUET` * `ARROW_DATASET` * `ARROW_WITH_RE2` for the RE2 regular expression library, used in some string compute functions * `ARROW_WITH_UTF8PROC` for the UTF8Proc string library, used in many other string compute functions +* `ARROW_JSON` for JSON parsing +* `ARROW_WITH_BROTLI`, `ARROW_WITH_BZ2`, `ARROW_WITH_LZ4`, `ARROW_WITH_SNAPPY`, `ARROW_WITH_ZLIB`, and `ARROW_WITH_ZSTD` for various compression algorithms + There are a number of other variables that affect the `configure` script and the bundled build script. By default, these are all unset. All boolean variables are case-insensitive. @@ -303,10 +307,12 @@ By default, these are all unset. All boolean variables are case-insensitive. won't look for Arrow libraries on your system and instead will look to download/build them. Use this if you have a version mismatch between installed system libraries and the version of the R package you're installing. -* `LIBARROW_DOWNLOAD`: Unless set to `false`, the build script - will attempt to download C++ binary or source bundles. +* `TEST_OFFLINE_BUILD`: Unless set to `true`, the build script + will download prebuilt C++ binary or third-party source bundles as necessary. If you're in a checkout of the `apache/arrow` git repository - and want to build the C++ library from the local source, make this `false`. + and want to build the C++ library from the local source, make this `false` or + not set. If building the C++ library from source with cmake unavailable, cmake + will still be downloaded, regardless of this flag's value. 
* `LIBARROW_BINARY`: If set to `true`, the script will try to download a binary C++ library built for your operating system. You may also set it to some other string, @@ -341,8 +347,8 @@ By default, these are all unset. All boolean variables are case-insensitive. The directory will be created if it does not exist. * `CMAKE`: When building the C++ library from source, you can specify a `/path/to/cmake` to use a different version than whatever is found on the `$PATH` - - + + # Contributing As mentioned above, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) From 562176f54de70dfa394bfac3a3d4aa5af17b6e00 Mon Sep 17 00:00:00 2001 From: karldw Date: Thu, 26 Aug 2021 15:41:56 -0700 Subject: [PATCH 06/27] Simplify turning features off, downloading thirdparty --- r/R/util.R | 45 +-- r/configure | 13 +- r/inst/build_arrow_static.sh | 2 +- r/man/download_optional_dependencies.Rd | 15 +- r/tools/nixlibs.R | 380 ++++++++---------------- 5 files changed, 148 insertions(+), 307 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index 01579c77f5d..39912a5dfa8 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -194,6 +194,8 @@ repeat_value_as_array <- function(object, n) { #' This function is used for setting up an offline build. If it's possible to #' download at build time, don't use this function. Instead, let `cmake` #' download them for you. +#' If the files already exist in `deps_dir`, they will be re-downloaded and +#' overwritten. Other files are not changed. #' These saved files are only used in the build if `ARROW_DEPENDENCY_SOURCE` #' is `BUNDLED` or `AUTO`. 
#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds @@ -202,21 +204,14 @@ repeat_value_as_array <- function(object, n) { #' - Install the `arrow` package on a computer with internet access #' - Run this function #' - Copy the saved dependency files to a computer without internet access -#' - Export the environment variables printed by this function on the computer -#' without internet access. For example, this function will print -#' `export ARROW_THRIFT_URL=/path/to/deps_dir/file.tar.gz` -#' - These export commands are also saved in `DEFINE_ENV_VARS.sh`, in the same -#' directory -#' - You may have to edit the paths if the copied folder is not accessible at -#' the same location as it was when `download_optional_dependencies()` was -#' run on the internet-connected computer +#' - Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that +#' points to the folder. #' - Install the `arrow` package on the computer without internet access #' - Run [arrow_info()] to check installed capabilities #' #' @examples #' \dontrun{ #' download_optional_dependencies("arrow-thirdparty") -#' file.exists("arrow-thirdparty/DEFINE_ENV_VARS.sh") # TRUE #' list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar #' } #' @export @@ -233,32 +228,18 @@ download_optional_dependencies <- function(deps_dir) { dir.create(deps_dir, showWarnings = FALSE, recursive = TRUE) # Run download_dependencies.sh - stdout_file <- tempfile() - stderr_file <- tempfile() - file.create(stdout_file, stderr_file) cat(paste0("*** Downloading optional dependencies to ", deps_dir, "\n")) return_status <- system2(download_dependencies_sh, - args = deps_dir, - stdout = stdout_file, stderr = stderr_file + args = deps_dir, stdout = FALSE, stderr = FALSE ) - if (return_status == 0) { - # File contents are something like: - # # Environment variables for offline Arrow build - # export ARROW_ABSL_URL=/path/to/file/absl-12345.tar.gz - # export ... 
- env_var_file <- file.path(deps_dir, "DEFINE_ENV_VARS.sh") - # Also save a copy in the directory for ease of use. - file.copy(stdout_file, env_var_file) - msg <- c( - "*** Offline build environment variables", - paste(" (These are also saved in ", env_var_file, ")\n"), - readLines(stdout_file) - ) - cat(paste(msg, collapse = "\n")) + download_successful <- isTRUE(return_status == 0) + if (download_successful) { + cat(paste0( + "**** Set environment variable on offline machine and re-build arrow:\n", + "export ARROW_THIRDPARTY_DEPENDENCY_DIR=\n" + )) } else { - msg <- c("Failed to download some optional dependencies", readLines(stderr_file), "") - warning(paste(msg, collapse = "\n")) + warning("Failed to download optional dependencies") } - # Return sucess status - invisible(return_status == 0) + invisible(download_successful) } diff --git a/r/configure b/r/configure index 0bf82285a7a..1d51a5a9f17 100755 --- a/r/configure +++ b/r/configure @@ -129,12 +129,15 @@ else # autobrew sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` fi else + # Set some default values/backwards compatibility - if [ "${LIBARROW_BINARY}" = "" ] && [ "${NOT_CRAN}" = "true" ]; then - LIBARROW_BINARY=true; export LIBARROW_BINARY - fi - if [ "${LIBARROW_MINIMAL}" = "" ] && [ "${NOT_CRAN}" = "true" ]; then - LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL + if [ "${NOT_CRAN}" = "true" ]; then + if [ "${LIBARROW_BINARY}" = "" ]; then + LIBARROW_BINARY=true; export LIBARROW_BINARY + fi + if [ "${LIBARROW_MINIMAL}" = "" ]; then + LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL + fi fi # find openssl on macos. macOS ships with libressl. 
openssl is installable diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 578d8b6e5b2..84a9f971246 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -59,7 +59,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_FILESYSTEM=ON \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ - -DARROW_JSON=${ARROW_JSON:-ON} \ + -DARROW_JSON=ON \ -DARROW_PARQUET=${ARROW_PARQUET:-ON} \ -DARROW_S3=${ARROW_S3:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-$ARROW_DEFAULT_PARAM} \ diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd index c713b6956b9..99862c06abc 100644 --- a/r/man/download_optional_dependencies.Rd +++ b/r/man/download_optional_dependencies.Rd @@ -15,6 +15,8 @@ TRUE/FALSE for whether the downloads were successful This function is used for setting up an offline build. If it's possible to download at build time, don't use this function. Instead, let \code{cmake} download them for you. +If the files already exist in \code{deps_dir}, they will be re-downloaded and +overwritten. Other files are not changed. These saved files are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} is \code{BUNDLED} or \code{AUTO}. https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds @@ -24,16 +26,8 @@ Steps for an offline install with optional dependencies: \item Install the \code{arrow} package on a computer with internet access \item Run this function \item Copy the saved dependency files to a computer without internet access -\item Export the environment variables printed by this function on the computer -without internet access. 
For example, this function will print -\verb{export ARROW_THRIFT_URL=/path/to/deps_dir/file.tar.gz} -\itemize{ -\item These export commands are also saved in \code{DEFINE_ENV_VARS.sh}, in the same -directory -\item You may have to edit the paths if the copied folder is not accessible at -the same location as it was when \code{download_optional_dependencies()} was -run on the internet-connected computer -} +\item Create a environment variable called \code{ARROW_THIRDPARTY_DEPENDENCY_DIR} that +points to the folder. \item Install the \code{arrow} package on the computer without internet access \item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities } @@ -44,7 +38,6 @@ Download all optional Arrow dependencies \examples{ \dontrun{ download_optional_dependencies("arrow-thirdparty") -file.exists("arrow-thirdparty/DEFINE_ENV_VARS.sh") # TRUE list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar } } diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 6ac78755583..d32ecaf5b30 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -227,16 +227,16 @@ find_local_source <- function() { # The first case probably occurs if we're in the arrow git repo # The second probably occurs if we're installing the arrow R package cpp_dir_options <- c( - Sys.getenv("ARROW_SOURCE_HOME", ".."), + file.path(Sys.getenv("ARROW_SOURCE_HOME", ".."), "cpp"), "tools/cpp" ) - valid_cpp_dir <- file.exists(file.path(cpp_dir_options, "src/arrow/api.h")) - if (!any(valid_cpp_dir)) { - return(NULL) + for (cpp_dir in cpp_dir_options) { + if (file.exists(file.path(cpp_dir, "src/arrow/api.h"))) { + cat(paste0("*** Found local C++ source: '", cpp_dir, "'\n")) + return(cpp_dir) + } } - cpp_dir <- cpp_dir_options[valid_cpp_dir][1] - cat(paste0("*** Found local C++ source:\n '", cpp_dir, "'\n")) - cpp_dir + NULL } build_libarrow <- function(src_dir, dst_dir) { @@ -288,21 +288,20 @@ build_libarrow <- function(src_dir, dst_dir) { LDFLAGS = 
R_CMD_config("LDFLAGS") ) env_vars <- paste0(names(env_var_list), '="', env_var_list, '"', collapse = " ") - # Add env variables like ARROW_S3=ON. Order doesn't matter. Depends on `download_ok` - env_vars <- with_s3_support(env_vars) - env_vars <- with_mimalloc(env_vars) - env_vars <- with_jemalloc(env_vars) - env_vars <- with_parquet(env_vars) - env_vars <- with_dataset(env_vars) - env_vars <- with_brotli(env_vars) - env_vars <- with_bz2(env_vars) - env_vars <- with_lz4(env_vars) - env_vars <- with_re2(env_vars) - env_vars <- with_snappy(env_vars) - env_vars <- with_utf8proc(env_vars) - env_vars <- with_zlib(env_vars) - env_vars <- with_zstd(env_vars) - env_vars <- with_xsimd(env_vars) + thirdparty_deps_unavailable <- !download_ok && + !dir.exists(Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR")) && + !env_is("ARROW_DEPENDENCY_SOURCE", "system") + + if (thirdparty_deps_unavailable || is_solaris()) { + # Note that JSON support does work on Solaris, but will be turned off with + # the rest of the thirdparty dependencies (when ARROW-13768 is resolved and + # JSON can be turned off at all). All other dependencies don't compile + # (e.g thrift, jemalloc, and xsimd) or do compile but `ar` fails to build + # libarrow_bundled_dependencies (e.g. re2 and utf8proc). + env_vars <- turn_off_thirdparty_features(env_vars) + } + # If $ARROW_THIRDPARTY_DEPENDENCY_DIR has files, add their *_SOURCE_URL env vars + env_vars <- set_thirdparty_urls(env_vars) cat("**** arrow", ifelse(quietly, "", paste("with", env_vars)), "\n") status <- suppressWarnings(system( @@ -311,7 +310,11 @@ build_libarrow <- function(src_dir, dst_dir) { )) if (status != 0) { # It failed :( - cat("**** Error building Arrow C++. 
Re-run with ARROW_R_DEV=true for debug information.\n") + cat( + "**** Error building Arrow C++.", + ifelse(env_is("ARROW_R_DEV", "true"), "", "Re-run with ARROW_R_DEV=true for debug information."), + "\n" + ) } invisible(status) } @@ -386,20 +389,97 @@ cmake_version <- function(cmd = "cmake") { ) } -is_feature_requested <- function(arrow_feature) { - # Cases: - # * nothing set: OFF - # * explicitly enabled: ON - # * LIBARROW_MINIMAL=false: ON - # Note that if LIBARROW_MINIMAL is unset, `configure` sets it to "false" when - # NOT_CRAN or TEST_OFFLINE_BUILD are "true". - explicitly_set_val <- toupper(Sys.getenv(arrow_feature)) - if (explicitly_set_val == "OFF") { - feature_on <- FALSE - } else { - feature_on <- explicitly_set_val == "ON" || env_is("LIBARROW_MINIMAL", "false") +turn_off_thirdparty_features <- function(env_vars) { + + # Because these are done as environment variables (as opposed to build flags), + # setting these to "OFF" overrides any previous setting. We don't need to + # check the existing value. + turn_off <- c( + "ARROW_MIMALLOC=OFF", + "ARROW_JEMALLOC=OFF", + "ARROW_PARQUET=OFF", # depends on thrift + "ARROW_DATASET=OFF", # depends on parquet + "ARROW_S3=OFF", + "ARROW_WITH_BROTLI=OFF", + "ARROW_WITH_BZ2=OFF", + "ARROW_WITH_LZ4=OFF", + "ARROW_WITH_SNAPPY=OFF", + "ARROW_WITH_ZLIB=OFF", + "ARROW_WITH_ZSTD=OFF", + "ARROW_WITH_RE2=OFF", + "ARROW_WITH_UTF8PROC=OFF", + # NOTE: this code sets the environment variable ARROW_JSON to "OFF", but + # that setting is will *not* be honored by build_arrow_static.sh until + # ARROW-13768 is resolved. + "ARROW_JSON=OFF", + # The syntax to turn off XSIMD is different. 
+ 'EXTRA_CMAKE_FLAGS="-DARROW_SIMD_LEVEL=NONE"' + ) + if (Sys.getenv("EXTRA_CMAKE_FLAGS") != "") { + # Error rather than overwriting EXTRA_CMAKE_FLAGS + # (Correctly inserting the flag into an existing quoted string is tricky) + stop("Sorry, setting EXTRA_CMAKE_FLAGS is not supported at this time.") + } + paste(env_vars, paste(turn_off, collapse = " ")) +} + +set_thirdparty_urls <- function(env_vars) { + deps_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") + files <- list.files(deps_dir, full.names = FALSE) + if (length(files) == 0) { + # This will be true if the variable is unset, if it's set but the directory + # doesn't exist, or if it exists but is empty. + return(env_vars) + } + dep_names <- c( + "absl", # not used; seems to be a dependency of gRPC + "aws-sdk-cpp", + "aws-checksums", + "aws-c-common", + "aws-c-event-stream", + "boost", + "brotli", + "bzip2", + "cares", # not used; "a dependency of gRPC" + "gbenchmark", # not used; "Google benchmark, for testing" + "gflags", # not used; "for command line utilities (formerly Googleflags)" + "glog", # not used; "for logging" + "grpc", # not used; "for remote procedure calls" + "gtest", # not used; "Googletest, for testing" + "jemalloc", + "lz4", + "mimalloc", + "orc", # not used; "for Apache ORC format support" + "protobuf", # not used; "Google Protocol Buffers, for data serialization" + "rapidjson", + "re2", + "snappy", + "thrift", + "utf8proc", + "xsimd", + "zlib", + "zstd" + ) + dep_regex <- paste0("^(", paste(dep_names, collapse = "|"), ").*") + # If there were extra files in the folder (not matching our regex) drop them. + files <- files[grepl(dep_regex, files, perl = TRUE)] + # Convert e.g. "thrift-0.13.0.tar.gz" to ARROW_THRIFT_URL + # Note that if there's no file called thrift*, we won't add + # ARROW_THRIFT_URL to env_vars. 
+ url_env_varname <- sub(dep_regex, "ARROW_\\1_URL", files, perl = TRUE) + url_env_varname <- toupper(gsub("-", "_", url_env_varname, fixed = TRUE)) + # Special case: ARROW_AWSSDK_URL for aws-sdk-cpp-.tar.gz + url_env_varname <- sub("ARROW_AWS_SDK_CPP_URL", "ARROW_AWSSDK_URL", url_env_varname, fixed = TRUE) + if (anyDuplicated(url_env_varname)) { + warning("Unexpected files in ", deps_dir, + "\nDo you have multiple copies of a dependency?", + .call = FALSE + ) + return(env_vars) } - feature_on + full_filenames <- file.path(normalizePath(deps_dir), files) + url_env_vars <- paste(url_env_varname, full_filenames, sep = "=", collapse = " ") + paste(env_vars, url_env_vars) } remote_download_unavailable <- function(url_env_vars) { @@ -421,73 +501,24 @@ remote_download_unavailable <- function(url_env_vars) { download_unavailable } -# Memory alloc features: mimalloc, jemalloc with_mimalloc <- function(env_vars) { - # Note that the logic here is different than in build_arrow_static.sh, which - # default includes mimalloc even when LIBARROW_MINIMAL=true - arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC") - + arrow_mimalloc <- env_is("ARROW_MIMALLOC", "on") || env_is("LIBARROW_MINIMAL", "false") if (arrow_mimalloc) { # User wants mimalloc. 
If they're using gcc, let's make sure the version is >= 4.9 if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") arrow_mimalloc <- FALSE } - download_unavailable <- remote_download_unavailable("ARROW_MIMALLOC_URL") - if (download_unavailable) { - cat(paste( - "**** mimalloc needs to be downloaded, but can't be.", - "See ?arrow::download_optional_dependencies.", - "Building with ARROW_MIMALLOC=OFF\n" - )) - arrow_mimalloc <- FALSE - } } paste(env_vars, ifelse(arrow_mimalloc, "ARROW_MIMALLOC=ON", "ARROW_MIMALLOC=OFF")) } -with_jemalloc <- function(env_vars) { - arrow_jemalloc <- is_feature_requested("ARROW_JEMALLOC") && !is_solaris() - # jemalloc doesn't seem to build on Solaris - if (arrow_jemalloc) { - download_unavailable <- remote_download_unavailable("ARROW_JEMALLOC_URL") - if (download_unavailable) { - cat("**** jemalloc requested but cannot be downloaded. Setting ARROW_JEMALLOC=OFF\n") - arrow_jemalloc <- FALSE - } - } - paste(env_vars, ifelse(arrow_jemalloc, "ARROW_JEMALLOC=ON", "ARROW_JEMALLOC=OFF")) -} - -# File access features: parquet, dataset, S3 -with_parquet <- function(env_vars) { - # We try to build parquet unless it's explicitly turned off, even if - # LIBARROW_MINIMAL=true. - # Parquet is built-in, but depends on Thrift, which is thirdparty - arrow_parquet <- !env_is("ARROW_PARQUET", "off") && !is_solaris() - # Thrift doesn't compile on solaris, so turn off parquet there. - if (arrow_parquet) { - download_unavailable <- remote_download_unavailable("ARROW_THRIFT_URL") - if (download_unavailable) { - cat("**** parquet requested but dependencies cannot be downloaded. Setting ARROW_PARQUET=OFF\n") - arrow_parquet <- FALSE - } - } - paste(env_vars, ifelse(arrow_parquet, "ARROW_PARQUET=ON", "ARROW_PARQUET=OFF")) -} - -with_dataset <- function(env_vars) { - # Note: we try to build dataset unless it's explicitly turned off, even if - # LIBARROW_MINIMAL=true. 
- arrow_dataset <- (!env_is("ARROW_DATASET", "off")) && - grepl("ARROW_PARQUET=ON", with_parquet("")) - # arrowExports.cpp requires parquet for dataset (ARROW-11994), so turn dataset - # off if parquet is off. - paste(env_vars, ifelse(arrow_dataset, "ARROW_DATASET=ON", "ARROW_DATASET=OFF")) -} - with_s3_support <- function(env_vars) { - arrow_s3 <- is_feature_requested("ARROW_S3") + arrow_s3 <- env_is("ARROW_S3", "on") || env_is("LIBARROW_MINIMAL", "false") + # but if ARROW_S3=OFF explicitly, we are definitely off, so override + if (env_is("ARROW_S3", "off")) { + arrow_s3 <- FALSE + } if (arrow_s3) { # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9 # and make sure that we have curl and openssl system libs @@ -502,177 +533,10 @@ with_s3_support <- function(env_vars) { cat("**** S3 support requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE } - download_unavailable <- remote_download_unavailable(c( - "ARROW_AWSSDK_URL", - "ARROW_AWS_C_COMMON_URL", - "ARROW_AWS_CHECKSUMS_URL", - "ARROW_AWS_C_EVENT_STREAM_URL" - )) - if (download_unavailable) { - cat(paste( - "**** S3 dependencies need to be downloaded, but can't be.", - "See ?arrow::download_optional_dependencies.", - "Building with ARROW_S3=OFF\n" - )) - arrow_s3 <- FALSE - } } paste(env_vars, ifelse(arrow_s3, "ARROW_S3=ON", "ARROW_S3=OFF")) } -# Compression features: brotli, bz2, lz4, snappy, zlib, zstd -with_brotli <- function(env_vars) { - arrow_brotli <- is_feature_requested("ARROW_WITH_BROTLI") - if (arrow_brotli) { - download_unavailable <- remote_download_unavailable("ARROW_BROTLI_URL") - if (download_unavailable) { - cat("**** brotli requested but cannot be downloaded. 
Setting ARROW_WITH_BROTLI=OFF\n") - arrow_brotli <- FALSE - } - } - paste(env_vars, ifelse(arrow_brotli, "ARROW_WITH_BROTLI=ON", "ARROW_WITH_BROTLI=OFF")) -} - -with_bz2 <- function(env_vars) { - arrow_bz2 <- is_feature_requested("ARROW_WITH_BZ2") - if (arrow_bz2) { - download_unavailable <- remote_download_unavailable("ARROW_BZIP2_URL") - if (download_unavailable) { - cat("**** bz2 requested but cannot be downloaded. Setting ARROW_WITH_BZ2=OFF\n") - arrow_bz2 <- FALSE - } - } - paste(env_vars, ifelse(arrow_bz2, "ARROW_WITH_BZ2=ON", "ARROW_WITH_BZ2=OFF")) -} - -with_lz4 <- function(env_vars) { - arrow_lz4 <- is_feature_requested("ARROW_WITH_LZ4") - if (arrow_lz4) { - download_unavailable <- remote_download_unavailable("ARROW_LZ4_URL") - if (download_unavailable) { - cat("**** lz4 requested but cannot be downloaded. Setting ARROW_WITH_LZ4=OFF\n") - arrow_lz4 <- FALSE - } - } - paste(env_vars, ifelse(arrow_lz4, "ARROW_WITH_LZ4=ON", "ARROW_WITH_LZ4=OFF")) -} - -with_snappy <- function(env_vars) { - arrow_snappy <- is_feature_requested("ARROW_WITH_SNAPPY") - if (arrow_snappy) { - download_unavailable <- remote_download_unavailable("ARROW_SNAPPY_URL") - if (download_unavailable) { - cat("**** snappy requested but cannot be downloaded. Setting ARROW_WITH_SNAPPY=OFF\n") - arrow_snappy <- FALSE - } - } - paste(env_vars, ifelse(arrow_snappy, "ARROW_WITH_SNAPPY=ON", "ARROW_WITH_SNAPPY=OFF")) -} - -with_zlib <- function(env_vars) { - arrow_zlib <- is_feature_requested("ARROW_WITH_ZLIB") - if (arrow_zlib) { - download_unavailable <- remote_download_unavailable("ARROW_ZLIB_URL") - if (download_unavailable) { - cat("**** zlib requested but cannot be downloaded. 
Setting ARROW_WITH_ZLIB=OFF\n") - arrow_zlib <- FALSE - } - } - paste(env_vars, ifelse(arrow_zlib, "ARROW_WITH_ZLIB=ON", "ARROW_WITH_ZLIB=OFF")) -} - -with_zstd <- function(env_vars) { - arrow_zstd <- is_feature_requested("ARROW_WITH_ZSTD") - if (arrow_zstd) { - download_unavailable <- remote_download_unavailable("ARROW_ZSTD_URL") - if (download_unavailable) { - cat("**** zstd requested but cannot be downloaded. Setting ARROW_WITH_ZSTD=OFF\n") - arrow_zstd <- FALSE - } - } - paste(env_vars, ifelse(arrow_zstd, "ARROW_WITH_ZSTD=ON", "ARROW_WITH_ZSTD=OFF")) -} - -# Specific computations: json, re2, utf8proc, xsimd -with_json <- function(env_vars) { - # Note: we try to build json unless it's explicitly turned off, even if - # LIBARROW_MINIMAL=true. - arrow_json <- (!env_is("ARROW_JSON", "off")) || (!env_is("ARROW_WITH_RAPIDJSON", "off")) - if (arrow_json) { - download_unavailable <- remote_download_unavailable("ARROW_RAPIDJSON_URL") - if (download_unavailable) { - cat("**** json requested but cannot be downloaded. Setting ARROW_JSON=OFF\n") - arrow_json <- FALSE - } - } - paste(env_vars, ifelse(arrow_json, "ARROW_WITH_JSON=ON", "ARROW_WITH_JSON=OFF")) -} - -with_re2 <- function(env_vars) { - # Note: we try to build re2 unless it's explicitly turned off, even if - # LIBARROW_MINIMAL=true. - arrow_re2 <- !env_is("ARROW_WITH_RE2", "off") && !is_solaris() - # re2 and utf8proc do compile on Solaris - # but `ar` fails to build libarrow_bundled_dependencies, so turn them off - # so that there are no bundled deps - if (arrow_re2) { - download_unavailable <- remote_download_unavailable("ARROW_RE2_URL") - if (download_unavailable) { - cat("**** re2 requested but cannot be downloaded. Setting ARROW_WITH_RE2=OFF\n") - arrow_re2 <- FALSE - } - } - paste(env_vars, ifelse(arrow_re2, "ARROW_WITH_RE2=ON", "ARROW_WITH_RE2=OFF")) -} - -with_utf8proc <- function(env_vars) { - # Note: we try to build utf8proc unless it's explicitly turned off, even if - # LIBARROW_MINIMAL=true. 
- arrow_utf8proc <- !env_is("ARROW_WITH_UTF8PROC", "off") && !is_solaris() - # re2 and utf8proc do compile on Solaris - # but `ar` fails to build libarrow_bundled_dependencies, so turn them off - # so that there are no bundled deps - if (arrow_utf8proc) { - download_unavailable <- remote_download_unavailable("ARROW_UTF8PROC_URL") - if (download_unavailable) { - cat("**** utf8proc requested but cannot be downloaded. Setting ARROW_WITH_UTF8PROC=OFF\n") - arrow_utf8proc <- FALSE - } - } - paste(env_vars, ifelse(arrow_utf8proc, "ARROW_WITH_UTF8PROC=ON", "ARROW_WITH_UTF8PROC=OFF")) -} - -with_xsimd <- function(env_vars) { - # xsimd doesn't compile on solaris, so set SIMD level to NONE to skip it. - # Use it everywhere else (as long as xsimd is available) - use_simd <- !is_solaris() - if (use_simd) { - download_unavailable <- remote_download_unavailable("ARROW_XSIMD_URL") - if (download_unavailable) { - cat("**** xsimd requested but cannot be downloaded. Setting EXTRA_CMAKE_FLAGS=-DARROW_SIMD_LEVEL=NONE\n") - use_simd <- FALSE - } - } - paste(env_vars, ifelse(use_simd, "", "EXTRA_CMAKE_FLAGS=-DARROW_SIMD_LEVEL=NONE")) -} - -# Notes on other downloaded dependencies: -# Boost is required in some cases (Flight, Gandiva, S3, and tests, at least), -# but there's no such thing as ARROW_BOOST=OFF. -# It may be necessary to set BOOST_ROOT or ARROW_BOOST_URL for offline installs. -# -# Other URLs get downloaded, but afaik, are not used in the build. 
-# - ARROW_ABSL_URL - seems to be a dependency of gRPC -# - ARROW_CARES_URL - "a dependency of gRPC" -# - ARROW_GBENCHMARK_URL - "Google benchmark, for testing" -# - ARROW_GFLAGS_URL - "for command line utilities (formerly Googleflags)" -# - ARROW_GLOG_URL - "for logging" -# - ARROW_GRPC_URL - "for remote procedure calls" -# - ARROW_GTEST_URL - "Googletest, for testing" -# - ARROW_ORC_URL - "for Apache ORC format support" -# - ARROW_PROTOBUF_URL - "Google Protocol Buffers, for data serialization" - - cmake_gcc_version <- function(env_vars) { # This function returns NA if using a non-gcc compiler # Always enclose calls to it in isTRUE() or isFALSE() From c03d7dc039756d3c3965c3da6551dc7a46d190d9 Mon Sep 17 00:00:00 2001 From: karldw Date: Thu, 26 Aug 2021 21:45:23 -0700 Subject: [PATCH 07/27] Re-add system requirements in nixlibs - That is, use the `with_mimalloc()` and `with_s3_support()` functions - Remove an unused function - Add a task `test-r-offline-minimal` that sets `TEST_OFFLINE_BUILD` --- dev/tasks/tasks.yml | 8 ++++++++ r/tools/nixlibs.R | 25 ++++--------------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 8b87c982983..98a535ecfbc 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1033,6 +1033,14 @@ tasks: flags: '-e ARROW_SOURCE_HOME="/arrow" -e FORCE_BUNDLED_BUILD=TRUE -e LIBARROW_BUILD=TRUE -e ARROW_DEPENDENCY_SOURCE=SYSTEM' image: ubuntu-r-only-r + test-r-offline-minimal: + ci: azure + template: r/azure.linux.yml + params: + r_org: rocker + r_image: r-base + r_tag: latest + flags: '-e TEST_OFFLINE_BUILD=true' {% for r_org, r_image, r_tag in [("rhub", "ubuntu-gcc-release", "latest"), ("rocker", "r-base", "latest"), diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index d32ecaf5b30..5f525ffe69b 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -288,10 +288,13 @@ build_libarrow <- function(src_dir, dst_dir) { LDFLAGS = R_CMD_config("LDFLAGS") ) env_vars <- 
paste0(names(env_var_list), '="', env_var_list, '"', collapse = " ") + env_vars <- with_s3_support(env_vars) + env_vars <- with_mimalloc(env_vars) + # turn_off_thirdparty_features() needs to happen after with_mimalloc() and + # with_s3_support(), since those might turn features ON. thirdparty_deps_unavailable <- !download_ok && !dir.exists(Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR")) && !env_is("ARROW_DEPENDENCY_SOURCE", "system") - if (thirdparty_deps_unavailable || is_solaris()) { # Note that JSON support does work on Solaris, but will be turned off with # the rest of the thirdparty dependencies (when ARROW-13768 is resolved and @@ -390,7 +393,6 @@ cmake_version <- function(cmd = "cmake") { } turn_off_thirdparty_features <- function(env_vars) { - # Because these are done as environment variables (as opposed to build flags), # setting these to "OFF" overrides any previous setting. We don't need to # check the existing value. @@ -482,25 +484,6 @@ set_thirdparty_urls <- function(env_vars) { paste(env_vars, url_env_vars) } -remote_download_unavailable <- function(url_env_vars) { - # Check the env vars - # e.g. ARROW_MIMALLOC_URL should point to an existing file if !download_ok - # Some dependencies require multiple downloads - check that all are available. - # https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds - missing_local <- FALSE - for (v in url_env_vars) { - local_url <- Sys.getenv(v) - missing_local <- missing_local || (local_url == "") || (!file.exists(local_url)) - } - # This check is only relevant when Cmake would try to download things - # (This check would change if we were using individual dependency resolution.) 
- # https://arrow.apache.org/docs/developers/cpp/building.html#individual-dependency-resolution) - download_required <- missing_local && - (toupper(Sys.getenv("ARROW_DEPENDENCY_SOURCE")) %in% c("", "BUNDLED", "AUTO")) - download_unavailable <- download_required && (!download_ok) - download_unavailable -} - with_mimalloc <- function(env_vars) { arrow_mimalloc <- env_is("ARROW_MIMALLOC", "on") || env_is("LIBARROW_MINIMAL", "false") if (arrow_mimalloc) { From 4e2ef525f1c9706cca5b49e9031f21389733b598 Mon Sep 17 00:00:00 2001 From: karldw Date: Fri, 27 Aug 2021 11:38:30 -0700 Subject: [PATCH 08/27] Tweaks to offline build - Move `download_optional_dependencies()` function - Change output of `download_optional_dependencies` to directory used, and input to `ARROW_THIRDPARTY_DEPENDENCY_DIR` if it's set. - Enable `ARROW_VERBOSE_THIRDPARTY_BUILD` if `ARROW_R_DEV` is true and we're setting `*_SOURCE_URL` flags so the printed log shows the file used. - Change env var management in `nixlibs.R` to work with a vector, rather than adding to one long string. - Add checks to env vars: names must be standard, values can't contain `'` --- r/R/install-arrow.R | 61 +++++++ r/R/util.R | 60 ------- r/inst/build_arrow_static.sh | 1 + r/man/download_optional_dependencies.Rd | 12 +- r/tests/testthat/test-install-arrow.R | 17 -- r/tools/nixlibs.R | 212 +++++++++++++----------- r/vignettes/install.Rmd | 8 + 7 files changed, 188 insertions(+), 183 deletions(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index b07ac60f50a..de8210738c6 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -136,3 +136,64 @@ reload_arrow <- function() { message("Please restart R to use the 'arrow' package.") } } + + +#' Download all optional Arrow dependencies +#' +#' @param deps_dir Directory to save files into. Will be created if necessary. +#' Defaults to the value of `ARROW_THIRDPARTY_DEPENDENCY_DIR`, if that +#' environment variable is set.
+#' +#' @return `deps_dir`, invisibly +#' +#' This function is used for setting up an offline build. If it's possible to +#' download at build time, don't use this function. Instead, let `cmake` +#' download them for you. +#' If the files already exist in `deps_dir`, they will be re-downloaded and +#' overwritten. Do not put other files in this directory. +#' These saved files are only used in the build if `ARROW_DEPENDENCY_SOURCE` +#' is `BUNDLED` or `AUTO`. +#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +#' +#' Steps for an offline install with optional dependencies: +#' - Install the `arrow` package on a computer with internet access +#' - Run this function +#' - Copy the saved dependency files to a computer without internet access +#' - Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that +#' points to the folder. +#' - Install the `arrow` package on the computer without internet access +#' - Run [arrow_info()] to check installed capabilities +#' +#' @examples +#' \dontrun{ +#' download_optional_dependencies("arrow-thirdparty") +#' list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar +#' } +#' @export +download_optional_dependencies <- function(deps_dir = NULL) { + # This script is copied over from arrow/cpp/... to arrow/r/inst/... 
+ download_dependencies_sh <- system.file( + "thirdparty/download_dependencies.sh", + package = "arrow", + mustWork = TRUE + ) + if (is.null(deps_dir) && Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") != "") { + deps_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") + } + + dir.create(deps_dir, showWarnings = FALSE, recursive = TRUE) + # Run download_dependencies.sh + cat(paste0("*** Downloading optional dependencies to ", deps_dir, "\n")) + return_status <- system2(download_dependencies_sh, + args = deps_dir, stdout = FALSE, stderr = FALSE + ) + if (isTRUE(return_status == 0)) { + cat(paste0( + "**** Set environment variable on offline machine and re-build arrow:\n", + "export ARROW_THIRDPARTY_DEPENDENCY_DIR=\n" + )) + } else { + stop("Failed to download optional dependencies", .call = FALSE) + } + invisible(deps_dir) +} diff --git a/r/R/util.R b/r/R/util.R index 39912a5dfa8..5958b0b3111 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -183,63 +183,3 @@ repeat_value_as_array <- function(object, n) { } return(Scalar$create(object)$as_array(n)) } - - -#' Download all optional Arrow dependencies -#' -#' @param deps_dir Directory to save files into. Will be created if necessary. -#' -#' @return TRUE/FALSE for whether the downloads were successful -#' -#' This function is used for setting up an offline build. If it's possible to -#' download at build time, don't use this function. Instead, let `cmake` -#' download them for you. -#' If the files already exist in `deps_dir`, they will be re-downloaded and -#' overwritten. Other files are not changed. -#' These saved files are only used in the build if `ARROW_DEPENDENCY_SOURCE` -#' is `BUNDLED` or `AUTO`. 
-#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds -#' -#' Steps for an offline install with optional dependencies: -#' - Install the `arrow` package on a computer with internet access -#' - Run this function -#' - Copy the saved dependency files to a computer without internet access -#' - Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that -#' points to the folder. -#' - Install the `arrow` package on the computer without internet access -#' - Run [arrow_info()] to check installed capabilities -#' -#' @examples -#' \dontrun{ -#' download_optional_dependencies("arrow-thirdparty") -#' list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar -#' } -#' @export -download_optional_dependencies <- function(deps_dir) { - # This script is copied over from arrow/cpp/... to arrow/r/tools/cpp/... - download_dependencies_sh <- system.file( - "thirdparty/download_dependencies.sh", - package = "arrow", - mustWork = TRUE - ) - # Make sure the directory is sort of reasonable before creating it - deps_dir <- trimws(deps_dir) - stopifnot(nchar(deps_dir) >= 1) - dir.create(deps_dir, showWarnings = FALSE, recursive = TRUE) - - # Run download_dependencies.sh - cat(paste0("*** Downloading optional dependencies to ", deps_dir, "\n")) - return_status <- system2(download_dependencies_sh, - args = deps_dir, stdout = FALSE, stderr = FALSE - ) - download_successful <- isTRUE(return_status == 0) - if (download_successful) { - cat(paste0( - "**** Set environment variable on offline machine and re-build arrow:\n", - "export ARROW_THIRDPARTY_DEPENDENCY_DIR=\n" - )) - } else { - warning("Failed to download optional dependencies") - } - invisible(download_successful) -} diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 84a9f971246..8a4a8fa5667 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -70,6 +70,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ 
-DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-$ARROW_DEFAULT_PARAM} \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=${DEST_DIR} \ diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd index 99862c06abc..2d203269c93 100644 --- a/r/man/download_optional_dependencies.Rd +++ b/r/man/download_optional_dependencies.Rd @@ -1,22 +1,24 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/util.R +% Please edit documentation in R/install-arrow.R \name{download_optional_dependencies} \alias{download_optional_dependencies} \title{Download all optional Arrow dependencies} \usage{ -download_optional_dependencies(deps_dir) +download_optional_dependencies(deps_dir = NULL) } \arguments{ -\item{deps_dir}{Directory to save files into. Will be created if necessary.} +\item{deps_dir}{Directory to save files into. Will be created if necessary. +Defaults to the value of \code{ARROW_THIRDPARTY_DEPENDENCY_DIR}, if that +environment variable is set.} } \value{ -TRUE/FALSE for whether the downloads were successful +\code{deps_dir}, invisibly This function is used for setting up an offline build. If it's possible to download at build time, don't use this function. Instead, let \code{cmake} download them for you. If the files already exist in \code{deps_dir}, they will be re-downloaded and -overwritten. Other files are not changed. +overwritten. Do not put other files in this directory. These saved files are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} is \code{BUNDLED} or \code{AUTO}. 
https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds diff --git a/r/tests/testthat/test-install-arrow.R b/r/tests/testthat/test-install-arrow.R index 9681d41d108..c53ee829829 100644 --- a/r/tests/testthat/test-install-arrow.R +++ b/r/tests/testthat/test-install-arrow.R @@ -37,20 +37,3 @@ r_only({ }) }) }) - - -r_only({ - test_that("download_optional_dependencies", { - skip_if_offline() - deps_dir <- tempfile() - download_successful <- expect_output( - download_optional_dependencies(deps_dir), - "export ARROW_THRIFT_URL" - ) - expect_true(download_successful) - env_var_file <- file.path(deps_dir, "DEFINE_ENV_VARS.sh") - expect_true(file.exists(env_var_file)) - env_var_lines <- readLines(env_var_file) - expect_true(any(grepl("export ARROW_THRIFT_URL", env_var_lines))) - }) -}) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 5f525ffe69b..781662aa4e8 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -239,6 +239,25 @@ find_local_source <- function() { NULL } +env_vars_as_string <- function(env_var_list) { + # Do some basic checks on env_var_list: + # Check that env_var_list has names, that those names are valid POSIX + # environment variables, and that none of the values contain `'`. + stopifnot( + length(env_var_list) == length(names(env_var_list)), + all(grepl("^[^0-9]", names(env_var_list))), + all(grepl("^[A-Z0-9_]+$", names(env_var_list))), + !any(grepl("'", env_var_list, fixed = TRUE)) + ) + env_var_string <- paste0(names(env_var_list), "='", env_var_list, "'", collapse = " ") + if (nchar(env_var_string) > 30000) { + # This could happen if the full paths in *_SOURCE_URL were *very* long. + # A more formal check would look at getconf ARG_MAX, but this shouldn't matter + cat("*** Warning: Environment variables are very long. 
This could cause issues on some shells.\n") + } + env_var_string +} + build_libarrow <- function(src_dir, dst_dir) { # We'll need to compile R bindings with these libs, so delete any .o files system("rm src/*.o", ignore.stdout = TRUE, ignore.stderr = TRUE) @@ -281,30 +300,41 @@ build_libarrow <- function(src_dir, dst_dir) { BUILD_DIR = build_dir, DEST_DIR = dst_dir, CMAKE = cmake, + # EXTRA_CMAKE_FLAGS will often be "", but it's convenient later to have it defined + EXTRA_CMAKE_FLAGS = Sys.getenv("EXTRA_CMAKE_FLAGS"), # Make sure we build with the same compiler settings that R is using CC = R_CMD_config("CC"), CXX = paste(R_CMD_config("CXX11"), R_CMD_config("CXX11STD")), # CXXFLAGS = R_CMD_config("CXX11FLAGS"), # We don't want the same debug symbols LDFLAGS = R_CMD_config("LDFLAGS") ) - env_vars <- paste0(names(env_var_list), '="', env_var_list, '"', collapse = " ") - env_vars <- with_s3_support(env_vars) - env_vars <- with_mimalloc(env_vars) + env_var_list <- with_s3_support(env_var_list) + env_var_list <- with_mimalloc(env_var_list) # turn_off_thirdparty_features() needs to happen after with_mimalloc() and # with_s3_support(), since those might turn features ON. thirdparty_deps_unavailable <- !download_ok && !dir.exists(Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR")) && !env_is("ARROW_DEPENDENCY_SOURCE", "system") - if (thirdparty_deps_unavailable || is_solaris()) { + if (is_solaris()) { # Note that JSON support does work on Solaris, but will be turned off with # the rest of the thirdparty dependencies (when ARROW-13768 is resolved and # JSON can be turned off at all). All other dependencies don't compile # (e.g thrift, jemalloc, and xsimd) or do compile but `ar` fails to build # libarrow_bundled_dependencies (e.g. re2 and utf8proc). 
- env_vars <- turn_off_thirdparty_features(env_vars) + env_var_list <- turn_off_thirdparty_features(env_var_list) + } else if (thirdparty_deps_unavailable) { + cat(paste0( + "*** Building C++ library from source, but downloading thirdparty dependencies\n", + " is not possible, so this build will turn off all thirdparty features.\n", + " See install vignette for details:\n", + " https://cran.r-project.org/web/packages/arrow/vignettes/install.html\n" + )) + env_var_list <- turn_off_thirdparty_features(env_var_list) + } else { + # If $ARROW_THIRDPARTY_DEPENDENCY_DIR has files, add their *_SOURCE_URL env vars + env_var_list <- set_thirdparty_urls(env_var_list) } - # If $ARROW_THIRDPARTY_DEPENDENCY_DIR has files, add their *_SOURCE_URL env vars - env_vars <- set_thirdparty_urls(env_vars) + env_vars <- env_vars_as_string(env_var_list) cat("**** arrow", ifelse(quietly, "", paste("with", env_vars)), "\n") status <- suppressWarnings(system( @@ -346,12 +376,12 @@ ensure_cmake <- function() { cmake_dir <- tempfile() download_successful <- try_download(cmake_binary_url, cmake_tar) if (!download_successful) { - stop( - "cmake was not found locally and download failed.\n", - "Make sure cmake is installed and available on your PATH\n", - "(or download '", cmake_binary_url, - "' and define the CMAKE environment variable)." - ) + cat(paste0( + "*** cmake was not found locally and download failed.\n", + " Make sure cmake is installed and available on your PATH\n", + " (or download '", cmake_binary_url, + "' and define the CMAKE environment variable).\n" + )) } untar(cmake_tar, exdir = cmake_dir) unlink(cmake_tar) @@ -392,111 +422,92 @@ cmake_version <- function(cmd = "cmake") { ) } -turn_off_thirdparty_features <- function(env_vars) { +turn_off_thirdparty_features <- function(env_var_list) { # Because these are done as environment variables (as opposed to build flags), # setting these to "OFF" overrides any previous setting. We don't need to # check the existing value. 
turn_off <- c( - "ARROW_MIMALLOC=OFF", - "ARROW_JEMALLOC=OFF", - "ARROW_PARQUET=OFF", # depends on thrift - "ARROW_DATASET=OFF", # depends on parquet - "ARROW_S3=OFF", - "ARROW_WITH_BROTLI=OFF", - "ARROW_WITH_BZ2=OFF", - "ARROW_WITH_LZ4=OFF", - "ARROW_WITH_SNAPPY=OFF", - "ARROW_WITH_ZLIB=OFF", - "ARROW_WITH_ZSTD=OFF", - "ARROW_WITH_RE2=OFF", - "ARROW_WITH_UTF8PROC=OFF", + "ARROW_MIMALLOC" = "OFF", + "ARROW_JEMALLOC" = "OFF", + "ARROW_PARQUET" = "OFF", # depends on thrift + "ARROW_DATASET" = "OFF", # depends on parquet + "ARROW_S3" = "OFF", + "ARROW_WITH_BROTLI" = "OFF", + "ARROW_WITH_BZ2" = "OFF", + "ARROW_WITH_LZ4" = "OFF", + "ARROW_WITH_SNAPPY" = "OFF", + "ARROW_WITH_ZLIB" = "OFF", + "ARROW_WITH_ZSTD" = "OFF", + "ARROW_WITH_RE2" = "OFF", + "ARROW_WITH_UTF8PROC" = "OFF", # NOTE: this code sets the environment variable ARROW_JSON to "OFF", but # that setting is will *not* be honored by build_arrow_static.sh until # ARROW-13768 is resolved. - "ARROW_JSON=OFF", + "ARROW_JSON" = "OFF", # The syntax to turn off XSIMD is different. - 'EXTRA_CMAKE_FLAGS="-DARROW_SIMD_LEVEL=NONE"' + # Pull existing value of EXTRA_CMAKE_FLAGS first (must be defined) + "EXTRA_CMAKE_FLAGS" = paste(env_var_list[["EXTRA_CMAKE_FLAGS"]], "-DARROW_SIMD_LEVEL=NONE") ) - if (Sys.getenv("EXTRA_CMAKE_FLAGS") != "") { - # Error rather than overwriting EXTRA_CMAKE_FLAGS - # (Correctly inserting the flag into an existing quoted string is tricky) - stop("Sorry, setting EXTRA_CMAKE_FLAGS is not supported at this time.") - } - paste(env_vars, paste(turn_off, collapse = " ")) + # Create a new env_var_list, with the values of turn_off set. + # replace() also adds new values if they didn't exist before + replace(env_var_list, names(turn_off), turn_off) } -set_thirdparty_urls <- function(env_vars) { +set_thirdparty_urls <- function(env_var_list) { + # This function is run in most typical cases -- when download_ok is TRUE *or* + # ARROW_THIRDPARTY_DEPENDENCY_DIR is set. 
It does *not* check if existing + # *_SOURCE_URL variables are set. (It is also run whenever ARROW_DEPENDENCY_SOURCE + # is "SYSTEM", but doesn't affect the build in that case.) deps_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") + if (deps_dir == "") { + return(env_var_list) + } files <- list.files(deps_dir, full.names = FALSE) if (length(files) == 0) { - # This will be true if the variable is unset, if it's set but the directory - # doesn't exist, or if it exists but is empty. - return(env_vars) + # This will be true if the directory doesn't exist, or if it exists but is empty. + # Here the build will continue, but will likely fail when the downloads are + # unavailable. The user will end up with the arrow-without-arrow package. + cat(paste0( + "*** Error: ARROW_THIRDPARTY_DEPENDENCY_DIR was set but has no files.\n", + " Have you run download_optional_dependencies()?" + )) + return(env_var_list) } - dep_names <- c( - "absl", # not used; seems to be a dependency of gRPC - "aws-sdk-cpp", - "aws-checksums", - "aws-c-common", - "aws-c-event-stream", - "boost", - "brotli", - "bzip2", - "cares", # not used; "a dependency of gRPC" - "gbenchmark", # not used; "Google benchmark, for testing" - "gflags", # not used; "for command line utilities (formerly Googleflags)" - "glog", # not used; "for logging" - "grpc", # not used; "for remote procedure calls" - "gtest", # not used; "Googletest, for testing" - "jemalloc", - "lz4", - "mimalloc", - "orc", # not used; "for Apache ORC format support" - "protobuf", # not used; "Google Protocol Buffers, for data serialization" - "rapidjson", - "re2", - "snappy", - "thrift", - "utf8proc", - "xsimd", - "zlib", - "zstd" - ) - dep_regex <- paste0("^(", paste(dep_names, collapse = "|"), ").*") - # If there were extra files in the folder (not matching our regex) drop them. - files <- files[grepl(dep_regex, files, perl = TRUE)] - # Convert e.g. 
"thrift-0.13.0.tar.gz" to ARROW_THRIFT_URL - # Note that if there's no file called thrift*, we won't add - # ARROW_THRIFT_URL to env_vars. - url_env_varname <- sub(dep_regex, "ARROW_\\1_URL", files, perl = TRUE) - url_env_varname <- toupper(gsub("-", "_", url_env_varname, fixed = TRUE)) - # Special case: ARROW_AWSSDK_URL for aws-sdk-cpp-.tar.gz - url_env_varname <- sub("ARROW_AWS_SDK_CPP_URL", "ARROW_AWSSDK_URL", url_env_varname, fixed = TRUE) - if (anyDuplicated(url_env_varname)) { - warning("Unexpected files in ", deps_dir, - "\nDo you have multiple copies of a dependency?", - .call = FALSE + url_env_varname <- toupper(sub("(.*?)-.*", "ARROW_\\1_URL", files)) + # Special handling for the aws dependencies, which have extra `-` + aws <- grepl("^aws", files) + url_env_varname[aws] <- sub( + "AWS_SDK_CPP", "AWSSDK", + gsub( + "-", "_", + sub( + "(AWS.*)-.*", "ARROW_\\1_URL", + toupper(files[aws]) + ) ) - return(env_vars) - } + ) full_filenames <- file.path(normalizePath(deps_dir), files) - url_env_vars <- paste(url_env_varname, full_filenames, sep = "=", collapse = " ") - paste(env_vars, url_env_vars) + + env_var_list <- replace(env_var_list, url_env_varname, full_filenames) + if (env_is("ARROW_R_DEV", "true")) { + env_var_list <- replace(env_var_list, "ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") + } + env_var_list } -with_mimalloc <- function(env_vars) { +with_mimalloc <- function(env_var_list) { arrow_mimalloc <- env_is("ARROW_MIMALLOC", "on") || env_is("LIBARROW_MINIMAL", "false") if (arrow_mimalloc) { # User wants mimalloc. 
If they're using gcc, let's make sure the version is >= 4.9 - if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { + if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") arrow_mimalloc <- FALSE } } - paste(env_vars, ifelse(arrow_mimalloc, "ARROW_MIMALLOC=ON", "ARROW_MIMALLOC=OFF")) + replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF")) } -with_s3_support <- function(env_vars) { +with_s3_support <- function(env_var_list) { arrow_s3 <- env_is("ARROW_S3", "on") || env_is("LIBARROW_MINIMAL", "false") # but if ARROW_S3=OFF explicitly, we are definitely off, so override if (env_is("ARROW_S3", "off")) { @@ -505,32 +516,33 @@ with_s3_support <- function(env_vars) { if (arrow_s3) { # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9 # and make sure that we have curl and openssl system libs - if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { + if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { cat("**** S3 support not available for gcc < 4.9; building with ARROW_S3=OFF\n") arrow_s3 <- FALSE - } else if (!cmake_find_package("CURL", NULL, env_vars)) { + } else if (!cmake_find_package("CURL", NULL, env_var_list)) { # curl on macos should be installed, so no need to alter this for macos cat("**** S3 support requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE - } else if (!cmake_find_package("OpenSSL", "1.0.2", env_vars)) { + } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) { cat("**** S3 support requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE } } - paste(env_vars, ifelse(arrow_s3, "ARROW_S3=ON", "ARROW_S3=OFF")) + replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) } -cmake_gcc_version <- function(env_vars) { +cmake_gcc_version <- function(env_var_list) { 
# This function returns NA if using a non-gcc compiler # Always enclose calls to it in isTRUE() or isFALSE() - vals <- cmake_cxx_compiler_vars(env_vars) + vals <- cmake_cxx_compiler_vars(env_var_list) if (!identical(vals[["CMAKE_CXX_COMPILER_ID"]], "GNU")) { return(NA) } package_version(vals[["CMAKE_CXX_COMPILER_VERSION"]]) } -cmake_cxx_compiler_vars <- function(env_vars) { +cmake_cxx_compiler_vars <- function(env_var_list) { + env_vars <- env_vars_as_string(env_var_list) info <- system(paste("export", env_vars, "&& $CMAKE --system-information"), intern = TRUE) info <- grep("^[A-Z_]* .*$", info, value = TRUE) vals <- as.list(sub('^.*? "?(.*?)"?$', "\\1", info)) @@ -538,12 +550,13 @@ cmake_cxx_compiler_vars <- function(env_vars) { vals[grepl("^CMAKE_CXX_COMPILER_?", names(vals))] } -cmake_find_package <- function(pkg, version = NULL, env_vars) { +cmake_find_package <- function(pkg, version = NULL, env_var_list) { td <- tempfile() dir.create(td) options(.arrow.cleanup = c(getOption(".arrow.cleanup"), td)) find_package <- paste0("find_package(", pkg, " ", version, " REQUIRED)") writeLines(find_package, file.path(td, "CMakeLists.txt")) + env_vars <- env_vars_as_string(env_var_list) cmake_cmd <- paste0( "export ", env_vars, " && cd ", td, @@ -573,9 +586,6 @@ if (!file.exists(paste0(dst_dir, "/include/arrow/api.h"))) { } else if (build_ok) { # (2) Find source and build it src_dir <- find_local_source() - if (is.null(src_dir) && download_ok) { - src_dir <- download_source() - } if (!is.null(src_dir)) { cat("*** Building C++ libraries\n") build_libarrow(src_dir, dst_dir) diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 9dc85d0a32b..83ad37c7c7b 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -102,6 +102,14 @@ satisfy C++ dependencies. > Note that, unlike packages like `tensorflow`, `blogdown`, and others that require external dependencies, you do not need to run `install_arrow()` after a successful `arrow` installation. 
+The `install-arrow.R` file also includes the `download_optional_dependencies()` +function. Normally, when installing on a computer with internet access, the +build process will download third-party dependencies as needed. This function +provides a way to download them in advance. Relevant environment variables are +`ARROW_THIRDPARTY_DEPENDENCY_DIR` for the directory of downloaded dependencies +and `TEST_OFFLINE_BUILD` to force the build process not to download. +(Note: `cmake` will still be downloaded if not available locally.) + ## S3 support The `arrow` package allows you to work with data in AWS S3 or in other cloud From 5a13cbf81ee66172b63341d20acf51efc03d0c97 Mon Sep 17 00:00:00 2001 From: karldw Date: Sat, 28 Aug 2021 11:00:17 -0700 Subject: [PATCH 09/27] Set ARROW_RUNTIME_SIMD_LEVEL=NONE as well --- r/tools/nixlibs.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 781662aa4e8..0cc7f11abcb 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -446,7 +446,10 @@ turn_off_thirdparty_features <- function(env_var_list) { "ARROW_JSON" = "OFF", # The syntax to turn off XSIMD is different. # Pull existing value of EXTRA_CMAKE_FLAGS first (must be defined) - "EXTRA_CMAKE_FLAGS" = paste(env_var_list[["EXTRA_CMAKE_FLAGS"]], "-DARROW_SIMD_LEVEL=NONE") + "EXTRA_CMAKE_FLAGS" = paste( + env_var_list[["EXTRA_CMAKE_FLAGS"]], + "-DARROW_SIMD_LEVEL=NONE -DARROW_RUNTIME_SIMD_LEVEL=NONE" + ) ) # Create a new env_var_list, with the values of turn_off set. 
# replace() also adds new values if they didn't exist before From 98b5601f94ff0f0caf240c6e1b914d4e8e49f98e Mon Sep 17 00:00:00 2001 From: karldw Date: Mon, 30 Aug 2021 08:43:22 -0700 Subject: [PATCH 10/27] Fix identify_os() logic, clarify offline text --- r/R/install-arrow.R | 14 +++++++++----- r/man/download_optional_dependencies.Rd | 20 ++++++++++++++------ r/tools/nixlibs.R | 2 +- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index de8210738c6..63939dd2433 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -155,13 +155,17 @@ reload_arrow <- function() { #' is `BUNDLED` or `AUTO`. #' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds #' -#' Steps for an offline install with optional dependencies: -#' - Install the `arrow` package on a computer with internet access +#' ## Steps for an offline install with optional dependencies: +#' +#' ### On a computer with internet access: +#' - Install the `arrow` package #' - Run this function -#' - Copy the saved dependency files to a computer without internet access +#' - Copy the saved dependency files to the computer with internet access +#' +#' ### On the computer without internet access: #' - Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that -#' points to the folder. -#' - Install the `arrow` package on the computer without internet access +#' points to the newly copied folder of dependency files. +#' - Install the `arrow` package #' - Run [arrow_info()] to check installed capabilities #' #' @examples diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd index 2d203269c93..73d4f034038 100644 --- a/r/man/download_optional_dependencies.Rd +++ b/r/man/download_optional_dependencies.Rd @@ -22,16 +22,24 @@ overwritten. Do not put other files in this directory. 
These saved files are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} is \code{BUNDLED} or \code{AUTO}. https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds - -Steps for an offline install with optional dependencies: +\subsection{Steps for an offline install with optional dependencies:}{ +\subsection{On a computer with internet access:}{ \itemize{ -\item Install the \code{arrow} package on a computer with internet access +\item Install the \code{arrow} package \item Run this function -\item Copy the saved dependency files to a computer without internet access +\item Copy the saved dependency files to the computer with internet access +} +} + +\subsection{On the computer without internet access:}{ +\itemize{ \item Create a environment variable called \code{ARROW_THIRDPARTY_DEPENDENCY_DIR} that -points to the folder. -\item Install the \code{arrow} package on the computer without internet access +points to the newly copied folder of dependency files. +\item Install the \code{arrow} package \item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities +} +} + } } \description{ diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 0cc7f11abcb..c4af6c6edf4 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -91,7 +91,7 @@ download_binary <- function(os = identify_os()) { # * `TRUE` (not case-sensitive), to try to discover your current OS, or # * some other string, presumably a related "distro-version" that has binaries # built that work for your OS -identify_os <- function(os = Sys.getenv("LIBARROW_BINARY", Sys.getenv("TEST_OFFLINE_BUILD"))) { +identify_os <- function(os = Sys.getenv("LIBARROW_BINARY")) { if (tolower(os) %in% c("", "false")) { # Env var says not to download a binary return(NULL) From 5b9bb771830d9734b60ac17c9b6f74984ab7cd43 Mon Sep 17 00:00:00 2001 From: karldw Date: Mon, 30 Aug 2021 13:58:58 -0700 Subject: [PATCH 11/27] Clarify build/download explanations - Also simplify 
`download_optional_dependencies` --- r/R/install-arrow.R | 32 ++++++-------- r/man/download_optional_dependencies.Rd | 19 +++++---- r/tools/nixlibs.R | 30 ++++++-------- r/vignettes/install.Rmd | 55 ++++++++++++++++++------- 4 files changed, 77 insertions(+), 59 deletions(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 63939dd2433..d7c62367ef2 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -152,21 +152,22 @@ reload_arrow <- function() { #' If the files already exist in `deps_dir`, they will be re-downloaded and #' overwritten. Do not put other files in this directory. #' These saved files are only used in the build if `ARROW_DEPENDENCY_SOURCE` -#' is `BUNDLED` or `AUTO`. +#' is unset, `BUNDLED`, or `AUTO`. #' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds #' #' ## Steps for an offline install with optional dependencies: #' -#' ### On a computer with internet access: -#' - Install the `arrow` package -#' - Run this function -#' - Copy the saved dependency files to the computer with internet access +#' ### Using a computer with internet access, pre-download the dependencies: +#' * Install the `arrow` package +#' * Run `download_optional_dependencies(my_dependencies)` +#' * Copy the directory `my-arrow-dependencies` to the computer without internet access #' -#' ### On the computer without internet access: -#' - Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that -#' points to the newly copied folder of dependency files. -#' - Install the `arrow` package -#' - Run [arrow_info()] to check installed capabilities +#' ### On the computer without internet access, use the pre-downloaded dependencies: +#' * Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that +#' points to the newly copied `my_dependencies`. 
+#' * Install the `arrow` package +#' * This installation will build from source, so `cmake` must be available +#' * Run [arrow_info()] to check installed capabilities #' #' @examples #' \dontrun{ @@ -174,17 +175,13 @@ reload_arrow <- function() { #' list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar #' } #' @export -download_optional_dependencies <- function(deps_dir = NULL) { +download_optional_dependencies <- function(deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR")) { # This script is copied over from arrow/cpp/... to arrow/r/inst/... download_dependencies_sh <- system.file( "thirdparty/download_dependencies.sh", package = "arrow", mustWork = TRUE ) - if (is.null(deps_dir) && Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") != "") { - deps_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") - } - dir.create(deps_dir, showWarnings = FALSE, recursive = TRUE) # Run download_dependencies.sh cat(paste0("*** Downloading optional dependencies to ", deps_dir, "\n")) @@ -192,10 +189,7 @@ download_optional_dependencies <- function(deps_dir = NULL) { args = deps_dir, stdout = FALSE, stderr = FALSE ) if (isTRUE(return_status == 0)) { - cat(paste0( - "**** Set environment variable on offline machine and re-build arrow:\n", - "export ARROW_THIRDPARTY_DEPENDENCY_DIR=\n" - )) + } else { stop("Failed to download optional dependencies", .call = FALSE) } diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd index 73d4f034038..fc49abf7532 100644 --- a/r/man/download_optional_dependencies.Rd +++ b/r/man/download_optional_dependencies.Rd @@ -4,7 +4,9 @@ \alias{download_optional_dependencies} \title{Download all optional Arrow dependencies} \usage{ -download_optional_dependencies(deps_dir = NULL) +download_optional_dependencies( + deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") +) } \arguments{ \item{deps_dir}{Directory to save files into. Will be created if necessary. 
@@ -20,22 +22,25 @@ download them for you. If the files already exist in \code{deps_dir}, they will be re-downloaded and overwritten. Do not put other files in this directory. These saved files are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} -is \code{BUNDLED} or \code{AUTO}. +is unset, \code{BUNDLED}, or \code{AUTO}. https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds \subsection{Steps for an offline install with optional dependencies:}{ -\subsection{On a computer with internet access:}{ +\subsection{Using a computer with internet access, pre-download the dependencies:}{ \itemize{ \item Install the \code{arrow} package -\item Run this function -\item Copy the saved dependency files to the computer with internet access +\item Run \code{download_optional_dependencies(my_dependencies)} +\item Copy the directory \code{my-arrow-dependencies} to the computer without internet access } } -\subsection{On the computer without internet access:}{ +\subsection{On the computer without internet access, use the pre-downloaded dependencies:}{ \itemize{ \item Create a environment variable called \code{ARROW_THIRDPARTY_DEPENDENCY_DIR} that -points to the newly copied folder of dependency files. +points to the newly copied \code{my_dependencies}. 
\item Install the \code{arrow} package +\itemize{ +\item This installation will build from source, so \code{cmake} must be available +} \item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities } } diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index c4af6c6edf4..b231edd0f37 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -29,7 +29,6 @@ if (getRversion() < 3.4 && is.null(getOption("download.file.method"))) { options(.arrow.cleanup = character()) # To collect dirs to rm on exit on.exit(unlink(getOption(".arrow.cleanup"))) - env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value) try_download <- function(from_url, to_file) { @@ -43,21 +42,16 @@ try_download <- function(from_url, to_file) { !inherits(status, "try-error") && status == 0 } -build_ok <- !env_is("LIBARROW_BUILD", "false") -# But binary defaults to not OK -binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false") # For local debugging, set ARROW_R_DEV=TRUE to make this script print more +quietly <- !env_is("ARROW_R_DEV", "true") + +# Default is build from source, not download a binary +build_ok <- !env_is("LIBARROW_BUILD", "false") +binary_ok <- !(env_is("LIBARROW_BINARY", "false") || env_is("LIBARROW_BINARY", "")) -quietly <- !env_is("ARROW_R_DEV", "true") # try_download uses quietly global -# * download_ok, build_ok: Use prebuilt binary, if found, otherwise try to build -# * !download_ok, build_ok: Build with local git checkout, if available, or -# sources included in r/tools/cpp/. Optional dependencies are not included, -# and will not be automatically downloaded. -# cmake will still be downloaded if necessary -# https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds -# * download_ok, !build_ok: Only use prebuilt binary, if found -# * neither: Get the arrow-without-arrow package -# Download and build are OK unless you say not to (or can't access github) +# Check if we're doing an offline build. 
+# (Note that cmake will still be downloaded if necessary +# https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds) download_ok <- !env_is("TEST_OFFLINE_BUILD", "true") && try_download("https://github.com", tempfile()) @@ -378,9 +372,9 @@ ensure_cmake <- function() { if (!download_successful) { cat(paste0( "*** cmake was not found locally and download failed.\n", - " Make sure cmake is installed and available on your PATH\n", - " (or download '", cmake_binary_url, - "' and define the CMAKE environment variable).\n" + " Make sure cmake >= 3.10 is installed and available on your PATH,\n", + " or download ", cmake_binary_url, "\n", + " and define the CMAKE environment variable.\n" )) } untar(cmake_tar, exdir = cmake_dir) @@ -471,7 +465,7 @@ set_thirdparty_urls <- function(env_var_list) { # Here the build will continue, but will likely fail when the downloads are # unavailable. The user will end up with the arrow-without-arrow package. cat(paste0( - "*** Error: ARROW_THIRDPARTY_DEPENDENCY_DIR was set but has no files.\n", + "*** Warning: ARROW_THIRDPARTY_DEPENDENCY_DIR was set but has no files.\n", " Have you run download_optional_dependencies()?" )) return(env_var_list) diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 83ad37c7c7b..8aae54aeb47 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -104,11 +104,27 @@ satisfy C++ dependencies. The `install-arrow.R` file also includes the `download_optional_dependencies()` function. Normally, when installing on a computer with internet access, the -build process will download third-party dependencies as needed. This function -provides a way to download them in advance. Relevant environment variables are -`ARROW_THIRDPARTY_DEPENDENCY_DIR` for the directory of downloaded dependencies -and `TEST_OFFLINE_BUILD` to force the build process not to download. -(Note: `cmake` will still be downloaded if not available locally.) 
+build process will download third-party dependencies as needed. +This function provides a way to download them in advance. +Doing so may be useful when installing Arrow on a computer without internet access. +Note that Arrow _can_ be installed on a computer without internet access, but +many useful features will be disabled, as they depend on third-party components. +More precisely, `arrow::arrow_info()$capabilities()` will be `FALSE` for every +capability. +One approach to add more capabilities in an offline install is to pre-download +the required files. + +### Using a computer with internet access, pre-download the dependencies: +* Install the `arrow` package +* Run `download_optional_dependencies(my_dependencies)` +* Copy the directory `my-arrow-dependencies` to the computer without internet access + +### On the computer without internet access, use the pre-downloaded dependencies: +* Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that + points to the newly copied `my_dependencies`. +* Install the `arrow` package + * This installation will build from source, so `cmake` must be available +* Run `arrow_info()` to check installed capabilities ## S3 support @@ -164,10 +180,10 @@ If found, they will be downloaded and bundled when your R package compiles. For a list of supported distributions and versions, see the [arrow-r-nightly](https://github.com/ursa-labs/arrow-r-nightly/blob/master/README.md) project. -If no binary is found, it will download the Arrow C++ source that matches the R package version -(CRAN release or nightly build) and attempt to build it locally. -If no matching source bundle is found, it will also look to see if you are in +If no C++ library binary is found, it will attempt to build it locally. +First, it will also look to see if you are in a checkout of the `apache/arrow` git repository and thus have the C++ source there. +Otherwise, it builds from the C++ files included in the package. 
Depending on your system, building Arrow C++ from source may be slow. For the specific mechanics of how all this works, see the R package `configure` script, @@ -293,6 +309,13 @@ setting `ARROW_WITH_ZSTD=OFF` to build without `zstd`; or (3) uninstalling the conflicting `zstd`. See discussion [here](https://issues.apache.org/jira/browse/ARROW-8556). +* Offline installation fails when dependencies haven't been downloaded to +`ARROW_THIRDPARTY_DEPENDENCY_DIR`. The package currently depends on the +third-party project RapidJSON. See `?download_optional_dependencies`. +See discussion [here](https://issues.apache.org/jira/browse/ARROW-13768) on +allowing the project to build without JSON support. + + ## Summary of build environment variables Some features are optional when you build Arrow from source. With the exception of `ARROW_S3`, these are all `ON` by default in the bundled C++ build, but you can set them to `OFF` to disable them. @@ -316,12 +339,6 @@ By default, these are all unset. All boolean variables are case-insensitive. won't look for Arrow libraries on your system and instead will look to download/build them. Use this if you have a version mismatch between installed system libraries and the version of the R package you're installing. -* `TEST_OFFLINE_BUILD`: Unless set to `true`, the build script - will download prebuilt C++ binary or third-party source bundles as necessary. - If you're in a checkout of the `apache/arrow` git repository - and want to build the C++ library from the local source, make this `false` or - not set. If building the C++ library from source with cmake unavailable, cmake - will still be downloaded, regardless of this flag's value. * `LIBARROW_BINARY`: If set to `true`, the script will try to download a binary C++ library built for your operating system. You may also set it to some other string, @@ -356,7 +373,15 @@ By default, these are all unset. All boolean variables are case-insensitive. 
The directory will be created if it does not exist. * `CMAKE`: When building the C++ library from source, you can specify a `/path/to/cmake` to use a different version than whatever is found on the `$PATH` - +* `ARROW_THIRDPARTY_DEPENDENCY_DIR`: Directory with downloaded third-party + dependency files. Run `download_optional_dependencies(my-dir)` to download. +* `TEST_OFFLINE_BUILD`: When set to `true`, the build script will not download + prebuilt the C++ library binary. + It will turn off any features that require a download, unless they're available + in `ARROW_THIRDPARTY_DEPENDENCY_DIR`. + Regardless of this flag's value, `cmake` will be downloaded if it's unavailable. + (Currently `RapidJSON` will also be downloaded. + See discussion [here](https://issues.apache.org/jira/browse/ARROW-13768).) # Contributing From e08b3066e27ab848a767732807ad0bc3918063c9 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 30 Aug 2021 17:02:26 -0500 Subject: [PATCH 12/27] Allow overriding download_dependencies_sh, first stab at CI job for maximal offline build --- dev/tasks/r/github.linux.offline.build.yml | 111 +++++++++++++++++++++ dev/tasks/tasks.yml | 5 + r/R/install-arrow.R | 6 +- 3 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 dev/tasks/r/github.linux.offline.build.yml diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml new file mode 100644 index 00000000000..d1fa054b0c1 --- /dev/null +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# NOTE: must set "Crossbow" as name to have the badge links working in the +# github comment reports! +name: Crossbow + +on: + push + +jobs: + grab-dependencies: + name: "Download thirdparty dependencies" + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + env: + ARROW_R_DEV: "TRUE" + RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" + steps: + - name: Checkout Arrow + run: | + git clone --no-checkout {{ arrow.remote }} arrow + git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} + git -C arrow checkout FETCH_HEAD + git -C arrow submodule update --init --recursive + - name: Free Up Disk Space + shell: bash + run: arrow/ci/scripts/util_cleanup.sh + - name: Fetch Submodules and Tags + shell: bash + run: cd arrow && ci/scripts/util_checkout.sh + - uses: r-lib/actions/setup-r@v1 + - name: Pull Arrow dependencies + run: | + cd arrow/r + # copy the two files we will need + # TODO: allow manually specifying `download_dependencies.sh` in `download_optional_dependencies()` then we won't need to install + mkdir -p inst/thirdparty/ + cp -p ../cpp/thirdparty/download_dependencies.sh inst/thirdparty/ + cp -p ../cpp/thirdparty/versions.txt inst/thirdparty/ + mkdir thirdparty_deps + R -e "download_optional_dependencies("thirdparty_deps", download_dependencies_sh = "./inst/thirdparty/download_dependencies.sh")" + shell: bash + - name: Upload the third party dependency artifacts + uses: actions/upload-artifact@v2 + with: + name: thirdparty_deps + path: arrow/r/thirdparty_deps + + intall-offline: + name: "Install offline" + needs: 
[grab-dependencies] + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + env: + ARROW_R_DEV: "TRUE" + RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" + steps: + - name: Checkout Arrow + run: | + git clone --no-checkout {{ arrow.remote }} arrow + git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} + git -C arrow checkout FETCH_HEAD + git -C arrow submodule update --init --recursive + - uses: r-lib/actions/setup-r@v1 + - name: Download artifacts + uses: actions/download-artifact@v2 + with: + name: thirdparty_deps + path: thirdparty_deps + - name: Install dependencies + run: | + install.packages(c("remotes", "glue", "sys")) + remotes::install_deps("arrow/r", dependencies = TRUE) + shell: Rscript {0} + - name: Install + env: + TEST_OFFLINE_BUILD: true + LIBARROW_MINIMAL: false + ARROW_R_DEV: TRUE + ARROW_THIRDPARTY_DEPENDENCY_DIR: ~/thirdparty_deps + run: | + cd arrow/r + R CMD INSTALL . --install-tests + - name: Run the tests + run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' + - name: Dump test logs + run: cat arrow-tests/testthat.Rout* + if: failure() + - name: Save the test output + uses: actions/upload-artifact@v2 + with: + name: test-output + path: arrow-tests/testthat.Rout* + if: always() diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 98a535ecfbc..6b11286188f 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1042,6 +1042,11 @@ tasks: r_tag: latest flags: '-e TEST_OFFLINE_BUILD=true' + test-r-offline-maximal: + ci: github + template: r/github.linux.offline.build.yml + + {% for r_org, r_image, r_tag in [("rhub", "ubuntu-gcc-release", "latest"), ("rocker", "r-base", "latest"), ("rstudio", "r-base", "3.6-bionic"), diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index d7c62367ef2..c8651275c8e 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -175,13 +175,15 @@ reload_arrow <- function() { #' list.files("arrow-thirdparty", "thrift-*") # 
"thrift-0.13.0.tar.gz" or similar #' } #' @export -download_optional_dependencies <- function(deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR")) { +download_optional_dependencies <- function( + deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR"), # This script is copied over from arrow/cpp/... to arrow/r/inst/... - download_dependencies_sh <- system.file( + download_dependencies_sh = system.file( "thirdparty/download_dependencies.sh", package = "arrow", mustWork = TRUE ) +) { dir.create(deps_dir, showWarnings = FALSE, recursive = TRUE) # Run download_dependencies.sh cat(paste0("*** Downloading optional dependencies to ", deps_dir, "\n")) From 479b054f7549d1265dfec308e46aabc79844fee0 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 30 Aug 2021 19:37:15 -0500 Subject: [PATCH 13/27] a few more tweaks --- dev/tasks/r/github.linux.offline.build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index d1fa054b0c1..2e73760cf4d 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -54,7 +54,7 @@ jobs: cp -p ../cpp/thirdparty/download_dependencies.sh inst/thirdparty/ cp -p ../cpp/thirdparty/versions.txt inst/thirdparty/ mkdir thirdparty_deps - R -e "download_optional_dependencies("thirdparty_deps", download_dependencies_sh = "./inst/thirdparty/download_dependencies.sh")" + R -e 'source("R/install-arrow.R"); download_optional_dependencies("thirdparty_deps", download_dependencies_sh = "./inst/thirdparty/download_dependencies.sh")' shell: bash - name: Upload the third party dependency artifacts uses: actions/upload-artifact@v2 @@ -94,8 +94,8 @@ jobs: TEST_OFFLINE_BUILD: true LIBARROW_MINIMAL: false ARROW_R_DEV: TRUE - ARROW_THIRDPARTY_DEPENDENCY_DIR: ~/thirdparty_deps run: | + export ARROW_THIRDPARTY_DEPENDENCY_DIR=$(pwd)/thirdparty_deps cd arrow/r R CMD INSTALL . 
--install-tests - name: Run the tests From 2410d5574db9c03d41c7faf70480c8189bcf69c9 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 30 Aug 2021 20:17:17 -0500 Subject: [PATCH 14/27] docs --- r/R/install-arrow.R | 2 ++ r/man/download_optional_dependencies.Rd | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index c8651275c8e..123e7621623 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -143,6 +143,8 @@ reload_arrow <- function() { #' @param deps_dir Directory to save files into. Will be created if necessary. #' Defaults to the value of `ARROW_THIRDPARTY_DEPENDENCY_DIR`, if that #' environment variable is set. +#' @param download_dependencies_sh location of the dependency download script, +#' defaults to the one included with the arrow package. #' #' @return `deps_dir`, invisibly #' diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd index fc49abf7532..1c62841988b 100644 --- a/r/man/download_optional_dependencies.Rd +++ b/r/man/download_optional_dependencies.Rd @@ -5,13 +5,18 @@ \title{Download all optional Arrow dependencies} \usage{ download_optional_dependencies( - deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") + deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR"), + download_dependencies_sh = system.file("thirdparty/download_dependencies.sh", package + = "arrow", mustWork = TRUE) ) } \arguments{ \item{deps_dir}{Directory to save files into. Will be created if necessary. 
Defaults to the value of \code{ARROW_THIRDPARTY_DEPENDENCY_DIR}, if that environment variable is set.} + +\item{download_dependencies_sh}{location of the dependency download script, +defaults to the one included with the arrow package.} } \value{ \code{deps_dir}, invisibly From adb089eb42b793987819f7be88f42e90e2575ea2 Mon Sep 17 00:00:00 2001 From: karldw Date: Tue, 31 Aug 2021 12:07:45 -0700 Subject: [PATCH 15/27] Change to packing all dependencies into one file --- r/.gitignore | 1 - r/Makefile | 3 - r/NAMESPACE | 2 +- r/R/install-arrow.R | 100 +++++++++++------- r/_pkgdown.yml | 2 +- r/man/create_package_with_all_dependencies.Rd | 56 ++++++++++ r/man/download_optional_dependencies.Rd | 63 ----------- r/tools/nixlibs.R | 34 +++--- r/vignettes/developing.Rmd | 15 +++ r/vignettes/install.Rmd | 27 ++--- 10 files changed, 163 insertions(+), 140 deletions(-) create mode 100644 r/man/create_package_with_all_dependencies.Rd delete mode 100644 r/man/download_optional_dependencies.Rd diff --git a/r/.gitignore b/r/.gitignore index 4837920768a..fbc5c8c3bfd 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -26,4 +26,3 @@ extra-tests/files /tools/.env /tools/LICENSE.txt /tools/NOTICE.txt -/inst/thirdparty/ diff --git a/r/Makefile b/r/Makefile index 525858c0fdb..9bb37730728 100644 --- a/r/Makefile +++ b/r/Makefile @@ -45,9 +45,6 @@ build: doc cp -p ../.env tools/ cp -p ../NOTICE.txt tools/ cp -p ../LICENSE.txt tools/ - mkdir -p inst/thirdparty - cp -p ../cpp/thirdparty/download_dependencies.sh inst/thirdparty/ - cp -p ../cpp/thirdparty/versions.txt inst/thirdparty/ R CMD build . 
check: build diff --git a/r/NAMESPACE b/r/NAMESPACE index 217990bae1a..c61a95cf8d4 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -200,13 +200,13 @@ export(codec_is_available) export(contains) export(copy_files) export(cpu_count) +export(create_package_with_all_dependencies) export(dataset_factory) export(date32) export(date64) export(decimal) export(default_memory_pool) export(dictionary) -export(download_optional_dependencies) export(ends_with) export(everything) export(field) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 123e7621623..40a9704d37e 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -138,64 +138,82 @@ reload_arrow <- function() { } -#' Download all optional Arrow dependencies +#' Create an install package with all thirdparty dependencies #' -#' @param deps_dir Directory to save files into. Will be created if necessary. -#' Defaults to the value of `ARROW_THIRDPARTY_DEPENDENCY_DIR`, if that -#' environment variable is set. -#' @param download_dependencies_sh location of the dependency download script, -#' defaults to the one included with the arrow package. -#' -#' @return `deps_dir`, invisibly +#' @param outfile File path for the new tar.gz package. Defaults to +#' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version) +#' @param package_source File path for the input tar.gz package. Defaults to +#' downloading from CRAN. +#' @return The full path to `outfile`, invisibly #' #' This function is used for setting up an offline build. If it's possible to #' download at build time, don't use this function. Instead, let `cmake` -#' download them for you. -#' If the files already exist in `deps_dir`, they will be re-downloaded and -#' overwritten. Do not put other files in this directory. -#' These saved files are only used in the build if `ARROW_DEPENDENCY_SOURCE` -#' is unset, `BUNDLED`, or `AUTO`. +#' download the required dependencies for you. 
+#' These downloaded dependencies are only used in the build if +#' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`. #' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds #' #' ## Steps for an offline install with optional dependencies: #' #' ### Using a computer with internet access, pre-download the dependencies: #' * Install the `arrow` package -#' * Run `download_optional_dependencies(my_dependencies)` -#' * Copy the directory `my-arrow-dependencies` to the computer without internet access +#' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` +#' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access #' -#' ### On the computer without internet access, use the pre-downloaded dependencies: -#' * Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that -#' points to the newly copied `my_dependencies`. -#' * Install the `arrow` package +#' ### On the computer without internet access, install the prepared package: +#' * Install the `arrow` package from the copied file (`install.packages("my_arrow_pkg.tar.gz")`) #' * This installation will build from source, so `cmake` must be available #' * Run [arrow_info()] to check installed capabilities #' +#' #' @examples #' \dontrun{ -#' download_optional_dependencies("arrow-thirdparty") -#' list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar +#' new_pkg <- create_package_with_all_dependencies() +#' # Note: this works when run in the same R session, but it's meant to be +#' # copied to a different computer. +#' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) #' } #' @export -download_optional_dependencies <- function( - deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR"), - # This script is copied over from arrow/cpp/... to arrow/r/inst/... 
- download_dependencies_sh = system.file( - "thirdparty/download_dependencies.sh", - package = "arrow", - mustWork = TRUE - ) -) { - dir.create(deps_dir, showWarnings = FALSE, recursive = TRUE) - # Run download_dependencies.sh - cat(paste0("*** Downloading optional dependencies to ", deps_dir, "\n")) - return_status <- system2(download_dependencies_sh, - args = deps_dir, stdout = FALSE, stderr = FALSE - ) - if (isTRUE(return_status == 0)) { - - } else { - stop("Failed to download optional dependencies", .call = FALSE) +create_package_with_all_dependencies <- function(outfile = NULL, package_source = NULL) { + if (is.null(package_source)) { + pkg_download_dir <- tempfile() + dir.create(pkg_download_dir) + on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE) + downloaded <- download.packages("arrow", destdir = pkg_download_dir, type = "source") + package_source <- downloaded[1, 2, drop = TRUE] + } + if (!file.exists(package_source) || !endsWith(package_source, "tar.gz")) { + stop("Arrow package .tar.gz file not found") + } + if (is.null(outfile)) { + # e.g. convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz + # (add 'with_deps' for clarity if the file was downloaded locally) + outfile <- paste0(gsub(".tar.gz$", "", basename(package_source)), "_with_deps.tar.gz") + } + untar_dir <- tempfile() + on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE) + untar(package_source, exdir = untar_dir) + thirdparty_dir <- file.path(untar_dir, "arrow/tools/cpp/thirdparty") + download_dependencies_sh <- file.path(thirdparty_dir, "download_dependencies.sh") + download_dir <- file.path(thirdparty_dir, "download") + dir.create(download_dir) + download_successful <- system2(download_dependencies_sh, download_dir, stdout = FALSE) == 0 + if (!download_successful) { + stop("Failed to download thirdparty dependencies") + } + # Need to change directory to untar_dir so tar() will use relative paths. 
That + # means we'll need a full, non-relative path for outfile. (extra_flags="-C" + # doesn't work with R's internal tar) + orig_wd <- getwd() + on.exit(setwd(orig_wd), add = TRUE) + # normalizePath() may return the input unchanged if outfile doesn't exist, so + # create it first. + file.create(outfile) + outfile <- normalizePath(outfile, mustWork = TRUE) + setwd(untar_dir) + tar_successful <- tar(outfile, compression = "gz") == 0 + if (!tar_successful) { + stop("Failed to create new tar.gz file") } - invisible(deps_dir) + invisible(outfile) } diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 0bbbc827779..c0127a8b53a 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -175,7 +175,7 @@ reference: - arrow_available - install_arrow - install_pyarrow - - download_optional_dependencies + - create_package_with_all_dependencies repo: jira_projects: [ARROW] diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd new file mode 100644 index 00000000000..6a03c4cf9f6 --- /dev/null +++ b/r/man/create_package_with_all_dependencies.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/install-arrow.R +\name{create_package_with_all_dependencies} +\alias{create_package_with_all_dependencies} +\title{Create an install package with all thirdparty dependencies} +\usage{ +create_package_with_all_dependencies(outfile = NULL, package_source = NULL) +} +\arguments{ +\item{outfile}{File path for the new tar.gz package. Defaults to +\code{arrow_V.V.V_with_deps.tar.gz} in the current directory (\code{V.V.V} is the version)} + +\item{package_source}{File path for the input tar.gz package. Defaults to +downloading from CRAN.} +} +\value{ +The full path to \code{outfile}, invisibly + +This function is used for setting up an offline build. If it's possible to +download at build time, don't use this function. Instead, let \code{cmake} +download the required dependencies for you. 
+These downloaded dependencies are only used in the build if +\code{ARROW_DEPENDENCY_SOURCE} is unset, \code{BUNDLED}, or \code{AUTO}. +https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +\subsection{Steps for an offline install with optional dependencies:}{ +\subsection{Using a computer with internet access, pre-download the dependencies:}{ +\itemize{ +\item Install the \code{arrow} package +\item Run \code{create_package_with_all_dependencies("my_arrow_pkg.tar.gz")} +\item Copy the newly created \code{my_arrow_pkg.tar.gz} to the computer without internet access +} +} + +\subsection{On the computer without internet access, install the prepared package:}{ +\itemize{ +\item Install the \code{arrow} package from the copied file (\code{install.packages("my_arrow_pkg.tar.gz")}) +\itemize{ +\item This installation will build from source, so \code{cmake} must be available +} +\item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities +} +} + +} +} +\description{ +Create an install package with all thirdparty dependencies +} +\examples{ +\dontrun{ +new_pkg <- create_package_with_all_dependencies() +# Note: this works when run in the same R session, but it's meant to be +# copied to a different computer. 
+install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) +} +} diff --git a/r/man/download_optional_dependencies.Rd b/r/man/download_optional_dependencies.Rd deleted file mode 100644 index 1c62841988b..00000000000 --- a/r/man/download_optional_dependencies.Rd +++ /dev/null @@ -1,63 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/install-arrow.R -\name{download_optional_dependencies} -\alias{download_optional_dependencies} -\title{Download all optional Arrow dependencies} -\usage{ -download_optional_dependencies( - deps_dir = Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR"), - download_dependencies_sh = system.file("thirdparty/download_dependencies.sh", package - = "arrow", mustWork = TRUE) -) -} -\arguments{ -\item{deps_dir}{Directory to save files into. Will be created if necessary. -Defaults to the value of \code{ARROW_THIRDPARTY_DEPENDENCY_DIR}, if that -environment variable is set.} - -\item{download_dependencies_sh}{location of the dependency download script, -defaults to the one included with the arrow package.} -} -\value{ -\code{deps_dir}, invisibly - -This function is used for setting up an offline build. If it's possible to -download at build time, don't use this function. Instead, let \code{cmake} -download them for you. -If the files already exist in \code{deps_dir}, they will be re-downloaded and -overwritten. Do not put other files in this directory. -These saved files are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} -is unset, \code{BUNDLED}, or \code{AUTO}. 
-https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds -\subsection{Steps for an offline install with optional dependencies:}{ -\subsection{Using a computer with internet access, pre-download the dependencies:}{ -\itemize{ -\item Install the \code{arrow} package -\item Run \code{download_optional_dependencies(my_dependencies)} -\item Copy the directory \code{my-arrow-dependencies} to the computer without internet access -} -} - -\subsection{On the computer without internet access, use the pre-downloaded dependencies:}{ -\itemize{ -\item Create a environment variable called \code{ARROW_THIRDPARTY_DEPENDENCY_DIR} that -points to the newly copied \code{my_dependencies}. -\item Install the \code{arrow} package -\itemize{ -\item This installation will build from source, so \code{cmake} must be available -} -\item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities -} -} - -} -} -\description{ -Download all optional Arrow dependencies -} -\examples{ -\dontrun{ -download_optional_dependencies("arrow-thirdparty") -list.files("arrow-thirdparty", "thrift-*") # "thrift-0.13.0.tar.gz" or similar -} -} diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index b231edd0f37..f61baf3f237 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -54,6 +54,11 @@ binary_ok <- !(env_is("LIBARROW_BINARY", "false") || env_is("LIBARROW_BINARY", " # https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds) download_ok <- !env_is("TEST_OFFLINE_BUILD", "true") && try_download("https://github.com", tempfile()) +# This path, within the tar file, might exist if +# create_package_with_all_dependencies() was run. Otherwise, it won't, but +# tools/cpp/thirdparty/ still will. 
+thirdparty_dependency_dir <- "tools/cpp/thirdparty/download" + download_binary <- function(os = identify_os()) { libfile <- tempfile() @@ -307,7 +312,7 @@ build_libarrow <- function(src_dir, dst_dir) { # turn_off_thirdparty_features() needs to happen after with_mimalloc() and # with_s3_support(), since those might turn features ON. thirdparty_deps_unavailable <- !download_ok && - !dir.exists(Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR")) && + !dir.exists(thirdparty_dependency_dir) && !env_is("ARROW_DEPENDENCY_SOURCE", "system") if (is_solaris()) { # Note that JSON support does work on Solaris, but will be turned off with @@ -325,7 +330,7 @@ build_libarrow <- function(src_dir, dst_dir) { )) env_var_list <- turn_off_thirdparty_features(env_var_list) } else { - # If $ARROW_THIRDPARTY_DEPENDENCY_DIR has files, add their *_SOURCE_URL env vars + # If thirdparty_dependency_dir exists, the *_SOURCE_URL env vars env_var_list <- set_thirdparty_urls(env_var_list) } env_vars <- env_vars_as_string(env_var_list) @@ -451,25 +456,18 @@ turn_off_thirdparty_features <- function(env_var_list) { } set_thirdparty_urls <- function(env_var_list) { - # This function is run in most typical cases -- when download_ok is TRUE *or* - # ARROW_THIRDPARTY_DEPENDENCY_DIR is set. It does *not* check if existing - # *_SOURCE_URL variables are set. (It is also run whenever ARROW_DEPENDENCY_SOURCE - # is "SYSTEM", but doesn't affect the build in that case.) - deps_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR") - if (deps_dir == "") { + # This function does *not* check if existing *_SOURCE_URL variables are set. + # The directory tools/cpp/thirdparty/download is created by + # create_package_with_all_dependencies() and saved in the tar file. + # In all other cases, where we're not installing from that offline tar file, + # that directory won't exist, but tools/cpp/thirdparty/ still should. + # Test tools/cpp/thirdparty to avoid false negatives. 
+ deps_dir <- thirdparty_dependency_dir # defined at the top + stopifnot(dir.exists(dirname(thirdparty_dependency_dir))) + if (!dir.exists(deps_dir)) { return(env_var_list) } files <- list.files(deps_dir, full.names = FALSE) - if (length(files) == 0) { - # This will be true if the directory doesn't exist, or if it exists but is empty. - # Here the build will continue, but will likely fail when the downloads are - # unavailable. The user will end up with the arrow-without-arrow package. - cat(paste0( - "*** Warning: ARROW_THIRDPARTY_DEPENDENCY_DIR was set but has no files.\n", - " Have you run download_optional_dependencies()?" - )) - return(env_var_list) - } url_env_varname <- toupper(sub("(.*?)-.*", "ARROW_\\1_URL", files)) # Special handling for the aws dependencies, which have extra `-` aws <- grepl("^aws", files) diff --git a/r/vignettes/developing.Rmd b/r/vignettes/developing.Rmd index 3d7f82e3619..53bc5e232e2 100644 --- a/r/vignettes/developing.Rmd +++ b/r/vignettes/developing.Rmd @@ -107,6 +107,7 @@ You can choose to build and then install the Arrow library into a user-defined d It is recommended that you install the arrow library to a user-level directory to be used in development. This is so that the development version you are using doesn't overwrite a released version of Arrow you may have installed. You are also able to have more than one version of the Arrow library to link to with this approach (by using different `ARROW_HOME` directories for the different versions). This approach also matches the recommendations for other Arrow bindings like [Python](http://arrow.apache.org/docs/developers/python.html). + #### Configure for installing to a user directory In this example we will install it to a directory called `dist` that has the same parent as our `arrow` checkout, but it could be named or located anywhere you would like. 
However, note that your installation of the Arrow R package will point to this directory and need it to remain intact for the package to continue to work. This is one reason we recommend *not* placing it inside of the arrow git checkout. @@ -537,3 +538,17 @@ from the command line (`make test`, `make doc`, `make clean`, etc.) R CMD build . R CMD check arrow_*.tar.gz --as-cran ``` + +## Summary of environment variables + +* See the user-facing [Install vignette](install.html) for a large number of + environment variables that determine how the build works and what features + get built. +* `TEST_OFFLINE_BUILD`: When set to `true`, the build script will not download + the prebuilt C++ library binary. + It will turn off any features that require a download, unless they're available + in the `tools/cpp/thirdparty/download/` subfolder of the tar.gz file. + `create_package_with_all_dependencies()` creates that subfolder. + Regardless of this flag's value, `cmake` will be downloaded if it's unavailable. +* `TEST_R_WITHOUT_LIBARROW`: When set to `true`, skip tests that would require + the C++ Arrow library (that is, almost everything). diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 8aae54aeb47..30b53a77eb1 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -102,7 +102,7 @@ satisfy C++ dependencies. > Note that, unlike packages like `tensorflow`, `blogdown`, and others that require external dependencies, you do not need to run `install_arrow()` after a successful `arrow` installation. -The `install-arrow.R` file also includes the `download_optional_dependencies()` +The `install-arrow.R` file also includes the `create_package_with_all_dependencies()` function. Normally, when installing on a computer with internet access, the build process will download third-party dependencies as needed. This function provides a way to download them in advance.
@@ -111,8 +111,20 @@ Note that Arrow _can_ be installed on a computer without internet access, but many useful features will be disabled, as they depend on third-party components. More precisely, `arrow::arrow_info()$capabilities()` will be `FALSE` for every capability. -One approach to add more capabilities in an offline install is to pre-download -the required files. +One approach to add more capabilities in an offline install is to prepare a +package with pre-downloaded dependencies. The +`create_package_with_all_dependencies()` function does this preparation. + +### Using a computer with internet access, pre-download the dependencies: +* Install the `arrow` package +* Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` +* Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access + +### On the computer without internet access, install the prepared package: +* Install the `arrow` package from the copied file (`install.packages("my_arrow_pkg.tar.gz")`) + * This installation will build from source, so `cmake` must be available +* Run `arrow_info()` to check installed capabilities + ### Using a computer with internet access, pre-download the dependencies: * Install the `arrow` package @@ -373,15 +385,6 @@ By default, these are all unset. All boolean variables are case-insensitive. The directory will be created if it does not exist. * `CMAKE`: When building the C++ library from source, you can specify a `/path/to/cmake` to use a different version than whatever is found on the `$PATH` -* `ARROW_THIRDPARTY_DEPENDENCY_DIR`: Directory with downloaded third-party - dependency files. Run `download_optional_dependencies(my-dir)` to download. -* `TEST_OFFLINE_BUILD`: When set to `true`, the build script will not download - prebuilt the C++ library binary. - It will turn off any features that require a download, unless they're available - in `ARROW_THIRDPARTY_DEPENDENCY_DIR`. 
- Regardless of this flag's value, `cmake` will be downloaded if it's unavailable. - (Currently `RapidJSON` will also be downloaded. - See discussion [here](https://issues.apache.org/jira/browse/ARROW-13768).) # Contributing From 6daff455ad1e4c5ac4c84bda5711bdb5c30b6156 Mon Sep 17 00:00:00 2001 From: karldw Date: Tue, 31 Aug 2021 12:57:36 -0700 Subject: [PATCH 16/27] Add narration to create_package_with_all_dependencies --- r/R/install-arrow.R | 8 +++++++- r/man/create_package_with_all_dependencies.Rd | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 40a9704d37e..696e7cda61a 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -174,7 +174,7 @@ reload_arrow <- function() { #' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) #' } #' @export -create_package_with_all_dependencies <- function(outfile = NULL, package_source = NULL) { +create_package_with_all_dependencies <- function(outfile = NULL, package_source = NULL, quietly = TRUE) { if (is.null(package_source)) { pkg_download_dir <- tempfile() dir.create(pkg_download_dir) @@ -197,6 +197,9 @@ create_package_with_all_dependencies <- function(outfile = NULL, package_source download_dependencies_sh <- file.path(thirdparty_dir, "download_dependencies.sh") download_dir <- file.path(thirdparty_dir, "download") dir.create(download_dir) + if (!quietly) { + message("Downloading files to ", download_dir) + } download_successful <- system2(download_dependencies_sh, download_dir, stdout = FALSE) == 0 if (!download_successful) { stop("Failed to download thirdparty dependencies") @@ -211,6 +214,9 @@ create_package_with_all_dependencies <- function(outfile = NULL, package_source file.create(outfile) outfile <- normalizePath(outfile, mustWork = TRUE) setwd(untar_dir) + if (!quietly) { + message("Repacking tar.gz file to ", outfile) + } tar_successful <- tar(outfile, compression = "gz") == 0 if (!tar_successful) { 
stop("Failed to create new tar.gz file") diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd index 6a03c4cf9f6..9124dee25c6 100644 --- a/r/man/create_package_with_all_dependencies.Rd +++ b/r/man/create_package_with_all_dependencies.Rd @@ -4,7 +4,11 @@ \alias{create_package_with_all_dependencies} \title{Create an install package with all thirdparty dependencies} \usage{ -create_package_with_all_dependencies(outfile = NULL, package_source = NULL) +create_package_with_all_dependencies( + outfile = NULL, + package_source = NULL, + quietly = TRUE +) } \arguments{ \item{outfile}{File path for the new tar.gz package. Defaults to From 74093dec3cc829f362cad5668f94fb3570205dc4 Mon Sep 17 00:00:00 2001 From: karldw Date: Tue, 31 Aug 2021 16:25:11 -0700 Subject: [PATCH 17/27] Fix check warnings --- r/R/install-arrow.R | 7 ++++--- r/man/create_package_with_all_dependencies.Rd | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 696e7cda61a..90d94eb4683 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -144,6 +144,7 @@ reload_arrow <- function() { #' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version) #' @param package_source File path for the input tar.gz package. Defaults to #' downloading from CRAN. +#' @param quietly boolean, default `TRUE`. If `FALSE`, narrate progress. #' @return The full path to `outfile`, invisibly #' #' This function is used for setting up an offline build. 
If it's possible to @@ -179,7 +180,7 @@ create_package_with_all_dependencies <- function(outfile = NULL, package_source pkg_download_dir <- tempfile() dir.create(pkg_download_dir) on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE) - downloaded <- download.packages("arrow", destdir = pkg_download_dir, type = "source") + downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source") package_source <- downloaded[1, 2, drop = TRUE] } if (!file.exists(package_source) || !endsWith(package_source, "tar.gz")) { @@ -192,7 +193,7 @@ create_package_with_all_dependencies <- function(outfile = NULL, package_source } untar_dir <- tempfile() on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE) - untar(package_source, exdir = untar_dir) + utils::untar(package_source, exdir = untar_dir) thirdparty_dir <- file.path(untar_dir, "arrow/tools/cpp/thirdparty") download_dependencies_sh <- file.path(thirdparty_dir, "download_dependencies.sh") download_dir <- file.path(thirdparty_dir, "download") @@ -217,7 +218,7 @@ create_package_with_all_dependencies <- function(outfile = NULL, package_source if (!quietly) { message("Repacking tar.gz file to ", outfile) } - tar_successful <- tar(outfile, compression = "gz") == 0 + tar_successful <- utils::tar(outfile, compression = "gz") == 0 if (!tar_successful) { stop("Failed to create new tar.gz file") } diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd index 9124dee25c6..d7978d16e0c 100644 --- a/r/man/create_package_with_all_dependencies.Rd +++ b/r/man/create_package_with_all_dependencies.Rd @@ -16,6 +16,8 @@ create_package_with_all_dependencies( \item{package_source}{File path for the input tar.gz package. Defaults to downloading from CRAN.} + +\item{quietly}{boolean, default \code{TRUE}. 
If \code{FALSE}, narrate progress.} } \value{ The full path to \code{outfile}, invisibly From 2d73ac19dcfaec0127c7590ce5809e9f15e874a1 Mon Sep 17 00:00:00 2001 From: karldw Date: Thu, 2 Sep 2021 14:59:49 -0700 Subject: [PATCH 18/27] Cleanup create_package_with_all_dependencies --- dev/tasks/r/github.linux.offline.build.yml | 23 +++---- r/Makefile | 1 - r/R/install-arrow.R | 60 +++++++++---------- r/man/create_package_with_all_dependencies.Rd | 22 +++---- r/tools/nixlibs.R | 30 ++++------ r/vignettes/developing.Rmd | 3 +- r/vignettes/install.Rmd | 30 +++------- 7 files changed, 73 insertions(+), 96 deletions(-) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index 2e73760cf4d..5b091168b07 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -48,19 +48,21 @@ jobs: - name: Pull Arrow dependencies run: | cd arrow/r - # copy the two files we will need - # TODO: allow manually specifying `download_dependencies.sh` in `download_optional_dependencies()` then we won't need to install - mkdir -p inst/thirdparty/ - cp -p ../cpp/thirdparty/download_dependencies.sh inst/thirdparty/ - cp -p ../cpp/thirdparty/versions.txt inst/thirdparty/ - mkdir thirdparty_deps - R -e 'source("R/install-arrow.R"); download_optional_dependencies("thirdparty_deps", download_dependencies_sh = "./inst/thirdparty/download_dependencies.sh")' + # This is `make build`, but with no vignettes and not running `make doc` + cp ../NOTICE.txt inst/NOTICE.txt + rsync --archive --delete ../cpp tools/ + cp -p ../.env tools/ + cp -p ../NOTICE.txt tools/ + cp -p ../LICENSE.txt tools/ + R CMD build --no-build-vignettes --no-manual . 
+ built_tar=$(ls -1 arrow*.tar.gz | head -n 1) + R -e "source('R/install-arrow.R'); create_package_with_all_dependencies(dest_file = 'arrow_with_deps.tar.gz', source_file = \"${built_tar}\")" shell: bash - name: Upload the third party dependency artifacts uses: actions/upload-artifact@v2 with: name: thirdparty_deps - path: arrow/r/thirdparty_deps + path: arrow/r/arrow_with_deps.tar.gz intall-offline: name: "Install offline" @@ -83,7 +85,7 @@ jobs: uses: actions/download-artifact@v2 with: name: thirdparty_deps - path: thirdparty_deps + path: arrow/r/arrow_with_deps.tar.gz - name: Install dependencies run: | install.packages(c("remotes", "glue", "sys")) @@ -95,9 +97,8 @@ jobs: LIBARROW_MINIMAL: false ARROW_R_DEV: TRUE run: | - export ARROW_THIRDPARTY_DEPENDENCY_DIR=$(pwd)/thirdparty_deps cd arrow/r - R CMD INSTALL . --install-tests + R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz - name: Run the tests run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' - name: Dump test logs diff --git a/r/Makefile b/r/Makefile index 9bb37730728..f493cc49ffe 100644 --- a/r/Makefile +++ b/r/Makefile @@ -37,7 +37,6 @@ deps: R -s -e 'lib <- Sys.getenv("R_LIB", .libPaths()[1]); install.packages("devtools", repo="https://cloud.r-project.org", lib=lib); devtools::install_dev_deps(lib=lib)' # Note: files in tools are available at build time, but not at run time. 
The thirdparty -# scripts need to be in inst/ so they're available to download_optional_dependencies() # cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up from cpp/ build: doc cp ../NOTICE.txt inst/NOTICE.txt diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 90d94eb4683..458806d0a9a 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -140,12 +140,11 @@ reload_arrow <- function() { #' Create an install package with all thirdparty dependencies #' -#' @param outfile File path for the new tar.gz package. Defaults to +#' @param dest_file File path for the new tar.gz package. Defaults to #' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version) -#' @param package_source File path for the input tar.gz package. Defaults to -#' downloading from CRAN. -#' @param quietly boolean, default `TRUE`. If `FALSE`, narrate progress. -#' @return The full path to `outfile`, invisibly +#' @param source_file File path for the input tar.gz package. Defaults to +#' downloading the package. +#' @return The full path to `dest_file`, invisibly #' #' This function is used for setting up an offline build. If it's possible to #' download at build time, don't use this function. 
Instead, let `cmake` @@ -157,12 +156,14 @@ reload_arrow <- function() { #' ## Steps for an offline install with optional dependencies: #' #' ### Using a computer with internet access, pre-download the dependencies: -#' * Install the `arrow` package +#' * Install the `arrow` package _or_ run +#' `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")` #' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` #' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access #' #' ### On the computer without internet access, install the prepared package: -#' * Install the `arrow` package from the copied file (`install.packages("my_arrow_pkg.tar.gz")`) +#' * Install the `arrow` package from the copied file +#' * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))` #' * This installation will build from source, so `cmake` must be available #' * Run [arrow_info()] to check installed capabilities #' @@ -175,52 +176,51 @@ reload_arrow <- function() { #' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) #' } #' @export -create_package_with_all_dependencies <- function(outfile = NULL, package_source = NULL, quietly = TRUE) { - if (is.null(package_source)) { +create_package_with_all_dependencies <- function(dest_file = NULL, source_file = NULL) { + if (is.null(source_file)) { pkg_download_dir <- tempfile() dir.create(pkg_download_dir) on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE) downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source") - package_source <- downloaded[1, 2, drop = TRUE] + source_file <- downloaded[1, 2, drop = TRUE] } - if (!file.exists(package_source) || !endsWith(package_source, "tar.gz")) { + if (!file.exists(source_file) || !endsWith(source_file, "tar.gz")) { stop("Arrow package .tar.gz file not found") } - if (is.null(outfile)) { + if (is.null(dest_file)) { # e.g. 
convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz # (add 'with_deps' for clarity if the file was downloaded locally) - outfile <- paste0(gsub(".tar.gz$", "", basename(package_source)), "_with_deps.tar.gz") + dest_file <- paste0(gsub(".tar.gz$", "", basename(source_file)), "_with_deps.tar.gz") } untar_dir <- tempfile() on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE) - utils::untar(package_source, exdir = untar_dir) - thirdparty_dir <- file.path(untar_dir, "arrow/tools/cpp/thirdparty") - download_dependencies_sh <- file.path(thirdparty_dir, "download_dependencies.sh") - download_dir <- file.path(thirdparty_dir, "download") + utils::untar(source_file, exdir = untar_dir) + tools_dir <- file.path(untar_dir, "arrow/tools") + download_dependencies_sh <- file.path(tools_dir, "cpp/thirdparty/download_dependencies.sh") + # If you change this path, also need to edit nixlibs.R + download_dir <- file.path(tools_dir, "thirdparty_dependencies") dir.create(download_dir) - if (!quietly) { - message("Downloading files to ", download_dir) - } + + message("Downloading files to ", download_dir) download_successful <- system2(download_dependencies_sh, download_dir, stdout = FALSE) == 0 if (!download_successful) { stop("Failed to download thirdparty dependencies") } # Need to change directory to untar_dir so tar() will use relative paths. That - # means we'll need a full, non-relative path for outfile. (extra_flags="-C" + # means we'll need a full, non-relative path for dest_file. (extra_flags="-C" # doesn't work with R's internal tar) orig_wd <- getwd() on.exit(setwd(orig_wd), add = TRUE) - # normalizePath() may return the input unchanged if outfile doesn't exist, so - # create it first. - file.create(outfile) - outfile <- normalizePath(outfile, mustWork = TRUE) + # normalizePath() may return the input unchanged if dest_file doesn't exist, + # so create it first. 
+ file.create(dest_file) + dest_file <- normalizePath(dest_file, mustWork = TRUE) setwd(untar_dir) - if (!quietly) { - message("Repacking tar.gz file to ", outfile) - } - tar_successful <- utils::tar(outfile, compression = "gz") == 0 + + message("Repacking tar.gz file to ", dest_file) + tar_successful <- utils::tar(dest_file, compression = "gz") == 0 if (!tar_successful) { stop("Failed to create new tar.gz file") } - invisible(outfile) + invisible(dest_file) } diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd index d7978d16e0c..a6773712e5b 100644 --- a/r/man/create_package_with_all_dependencies.Rd +++ b/r/man/create_package_with_all_dependencies.Rd @@ -4,23 +4,17 @@ \alias{create_package_with_all_dependencies} \title{Create an install package with all thirdparty dependencies} \usage{ -create_package_with_all_dependencies( - outfile = NULL, - package_source = NULL, - quietly = TRUE -) +create_package_with_all_dependencies(dest_file = NULL, source_file = NULL) } \arguments{ -\item{outfile}{File path for the new tar.gz package. Defaults to +\item{dest_file}{File path for the new tar.gz package. Defaults to \code{arrow_V.V.V_with_deps.tar.gz} in the current directory (\code{V.V.V} is the version)} -\item{package_source}{File path for the input tar.gz package. Defaults to -downloading from CRAN.} - -\item{quietly}{boolean, default \code{TRUE}. If \code{FALSE}, narrate progress.} +\item{source_file}{File path for the input tar.gz package. Defaults to +downloading the package.} } \value{ -The full path to \code{outfile}, invisibly +The full path to \code{dest_file}, invisibly This function is used for setting up an offline build. If it's possible to download at build time, don't use this function. 
Instead, let \code{cmake} @@ -31,7 +25,8 @@ https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds \subsection{Steps for an offline install with optional dependencies:}{ \subsection{Using a computer with internet access, pre-download the dependencies:}{ \itemize{ -\item Install the \code{arrow} package +\item Install the \code{arrow} package \emph{or} run +\code{source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")} \item Run \code{create_package_with_all_dependencies("my_arrow_pkg.tar.gz")} \item Copy the newly created \code{my_arrow_pkg.tar.gz} to the computer without internet access } @@ -39,8 +34,9 @@ https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds \subsection{On the computer without internet access, install the prepared package:}{ \itemize{ -\item Install the \code{arrow} package from the copied file (\code{install.packages("my_arrow_pkg.tar.gz")}) +\item Install the \code{arrow} package from the copied file \itemize{ +\item \code{install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))} \item This installation will build from source, so \code{cmake} must be available } \item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index f61baf3f237..a3f78ef0197 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -54,10 +54,12 @@ binary_ok <- !(env_is("LIBARROW_BINARY", "false") || env_is("LIBARROW_BINARY", " # https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds) download_ok <- !env_is("TEST_OFFLINE_BUILD", "true") && try_download("https://github.com", tempfile()) -# This path, within the tar file, might exist if -# create_package_with_all_dependencies() was run. Otherwise, it won't, but -# tools/cpp/thirdparty/ still will. 
-thirdparty_dependency_dir <- "tools/cpp/thirdparty/download" +# This "tools/thirdparty_dependencies" path, within the tar file, might exist if +# create_package_with_all_dependencies() was run, or if someone has created it +# manually before running make build. +# If you change this path, you also need to edit +# `create_package_with_all_dependencies()` in install-arrow.R +thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tools/thirdparty_dependencies") download_binary <- function(os = identify_os()) { @@ -329,8 +331,8 @@ build_libarrow <- function(src_dir, dst_dir) { " https://cran.r-project.org/web/packages/arrow/vignettes/install.html\n" )) env_var_list <- turn_off_thirdparty_features(env_var_list) - } else { - # If thirdparty_dependency_dir exists, the *_SOURCE_URL env vars + } else if (dir.exists(thirdparty_dependency_dir)){ + # Add the *_SOURCE_URL env vars env_var_list <- set_thirdparty_urls(env_var_list) } env_vars <- env_vars_as_string(env_var_list) @@ -457,17 +459,9 @@ turn_off_thirdparty_features <- function(env_var_list) { set_thirdparty_urls <- function(env_var_list) { # This function does *not* check if existing *_SOURCE_URL variables are set. - # The directory tools/cpp/thirdparty/download is created by + # The directory tools/thirdparty_dependencies is created by # create_package_with_all_dependencies() and saved in the tar file. - # In all other cases, where we're not installing from that offline tar file, - # that directory won't exist, but tools/cpp/thirdparty/ still should. - # Test tools/cpp/thirdparty to avoid false negatives. 
- deps_dir <- thirdparty_dependency_dir # defined at the top - stopifnot(dir.exists(dirname(thirdparty_dependency_dir))) - if (!dir.exists(deps_dir)) { - return(env_var_list) - } - files <- list.files(deps_dir, full.names = FALSE) + files <- list.files(thirdparty_dependency_dir, full.names = FALSE) url_env_varname <- toupper(sub("(.*?)-.*", "ARROW_\\1_URL", files)) # Special handling for the aws dependencies, which have extra `-` aws <- grepl("^aws", files) @@ -481,10 +475,10 @@ set_thirdparty_urls <- function(env_var_list) { ) ) ) - full_filenames <- file.path(normalizePath(deps_dir), files) + full_filenames <- file.path(normalizePath(thirdparty_dependency_dir), files) env_var_list <- replace(env_var_list, url_env_varname, full_filenames) - if (env_is("ARROW_R_DEV", "true")) { + if (!quietly) { env_var_list <- replace(env_var_list, "ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") } env_var_list diff --git a/r/vignettes/developing.Rmd b/r/vignettes/developing.Rmd index 53bc5e232e2..5b628f8d9a5 100644 --- a/r/vignettes/developing.Rmd +++ b/r/vignettes/developing.Rmd @@ -107,7 +107,6 @@ You can choose to build and then install the Arrow library into a user-defined d It is recommended that you install the arrow library to a user-level directory to be used in development. This is so that the development version you are using doesn't overwrite a released version of Arrow you may have installed. You are also able to have more than one version of the Arrow library to link to with this approach (by using different `ARROW_HOME` directories for the different versions). This approach also matches the recommendations for other Arrow bindings like [Python](http://arrow.apache.org/docs/developers/python.html). - #### Configure for installing to a user directory In this example we will install it to a directory called `dist` that has the same parent as our `arrow` checkout, but it could be named or located anywhere you would like. 
However, note that your installation of the Arrow R package will point to this directory and need it to remain intact for the package to continue to work. This is one reason we recommend *not* placing it inside of the arrow git checkout. @@ -551,4 +550,4 @@ R CMD check arrow_*.tar.gz --as-cran `create_package_with_all_dependencies()` creates that subfolder. Regardless of this flag's value, `cmake` will be downloaded if it's unavailable. * `TEST_R_WITHOUT_LIBARROW`: When set to `true`, skip tests that would require - the C++ Arrow library (that is, almost everything). + the C++ Arrow library (that is, almost everything). diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 30b53a77eb1..2718af5e31b 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -116,27 +116,22 @@ package with pre-downloaded dependencies. The `create_package_with_all_dependencies()` function does this preparation. ### Using a computer with internet access, pre-download the dependencies: -* Install the `arrow` package +* Install the `arrow` package _or_ run + `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")` * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access ### On the computer without internet access, install the prepared package: -* Install the `arrow` package from the copied file (`install.packages("my_arrow_pkg.tar.gz")`) +* Install the `arrow` package from the copied file + * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))` * This installation will build from source, so `cmake` must be available * Run `arrow_info()` to check installed capabilities - -### Using a computer with internet access, pre-download the dependencies: -* Install the `arrow` package -* Run `download_optional_dependencies(my_dependencies)` -* Copy the directory `my-arrow-dependencies` to the computer 
without internet access - -### On the computer without internet access, use the pre-downloaded dependencies: -* Create a environment variable called `ARROW_THIRDPARTY_DEPENDENCY_DIR` that - points to the newly copied `my_dependencies`. -* Install the `arrow` package - * This installation will build from source, so `cmake` must be available -* Run `arrow_info()` to check installed capabilities +#### Alternative, hands-on approach +* Download the dependency files (`cpp/thirdparty/download_dependencies.sh` may be helpful) +* Copy the directory of dependencies to the offline computer +* Create the environment variable `ARROW_THIRDPARTY_DEPENDENCY_DIR` on the offline computer, pointing to the copied directory. +* Install the `arrow` package as usual. ## S3 support @@ -321,13 +316,6 @@ setting `ARROW_WITH_ZSTD=OFF` to build without `zstd`; or (3) uninstalling the conflicting `zstd`. See discussion [here](https://issues.apache.org/jira/browse/ARROW-8556). -* Offline installation fails when dependencies haven't been downloaded to -`ARROW_THIRDPARTY_DEPENDENCY_DIR`. The package currently depends on the -third-party project RapidJSON. See `?download_optional_dependencies`. -See discussion [here](https://issues.apache.org/jira/browse/ARROW-13768) on -allowing the project to build without JSON support. - - ## Summary of build environment variables Some features are optional when you build Arrow from source. With the exception of `ARROW_S3`, these are all `ON` by default in the bundled C++ build, but you can set them to `OFF` to disable them. 
From 130683aee3e41060d0ba54d746d28ea30337ef7d Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 2 Sep 2021 19:24:10 -0500 Subject: [PATCH 19/27] Update dev/tasks/r/github.linux.offline.build.yml --- dev/tasks/r/github.linux.offline.build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index 5b091168b07..fd346ad402c 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -85,7 +85,7 @@ jobs: uses: actions/download-artifact@v2 with: name: thirdparty_deps - path: arrow/r/arrow_with_deps.tar.gz + path: arrow/r/ - name: Install dependencies run: | install.packages(c("remotes", "glue", "sys")) From 13d8c4e0c8f5819ef73095603ac8d005d0d86db9 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 3 Sep 2021 08:26:37 -0500 Subject: [PATCH 20/27] fix testthat output display/uploading --- dev/tasks/r/github.linux.offline.build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index fd346ad402c..e879862e360 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -98,15 +98,15 @@ jobs: ARROW_R_DEV: TRUE run: | cd arrow/r - R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz + R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz |& tee testthat.out - name: Run the tests run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' - name: Dump test logs - run: cat arrow-tests/testthat.Rout* - if: failure() + run: cat arrow/r/testthat.Rout* + if: always() - name: Save the test output uses: actions/upload-artifact@v2 with: name: test-output - path: arrow-tests/testthat.Rout* + path: arrow/r/testthat.Rout* if: 
always() From ad9adb4c1f1711a7061354eab1f5a8ab3c583cd1 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 3 Sep 2021 09:44:23 -0500 Subject: [PATCH 21/27] oops, revert back to original setup --- dev/tasks/r/github.linux.offline.build.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index e879862e360..ef5d257663b 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -71,7 +71,7 @@ jobs: strategy: fail-fast: false env: - ARROW_R_DEV: "TRUE" + ARROW_R_DEV: TRUE RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" steps: - name: Checkout Arrow @@ -95,18 +95,17 @@ jobs: env: TEST_OFFLINE_BUILD: true LIBARROW_MINIMAL: false - ARROW_R_DEV: TRUE run: | cd arrow/r - R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz |& tee testthat.out + R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz - name: Run the tests run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' - name: Dump test logs - run: cat arrow/r/testthat.Rout* - if: always() + run: cat arrow-tests/testthat.Rout* + if: failure() - name: Save the test output uses: actions/upload-artifact@v2 with: name: test-output - path: arrow/r/testthat.Rout* + path: arrow-tests/testthat.Rout* if: always() From 939cb87a1cd7774054e6aca7a8d87a184cc663ae Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 3 Sep 2021 09:45:30 -0500 Subject: [PATCH 22/27] always print the testthat output --- dev/tasks/r/github.linux.offline.build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index ef5d257663b..d2a8d1fe1f9 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ 
b/dev/tasks/r/github.linux.offline.build.yml @@ -102,7 +102,7 @@ jobs: run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' - name: Dump test logs run: cat arrow-tests/testthat.Rout* - if: failure() + if: always() - name: Save the test output uses: actions/upload-artifact@v2 with: From 0fc5600ba056158e7ebbc1948f8f42177d9e4a0e Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 3 Sep 2021 10:09:07 -0500 Subject: [PATCH 23/27] disable building on git tags --- dev/tasks/r/github.linux.offline.build.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index d2a8d1fe1f9..71cdac60058 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -20,7 +20,9 @@ name: Crossbow on: - push + push: + branches: + - "*-github-*" jobs: grab-dependencies: From 6bf7b8515a69c35d6e3ca76d120711c8307117fd Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 3 Sep 2021 10:17:42 -0500 Subject: [PATCH 24/27] add openssl + libssl dependencies --- dev/tasks/r/github.linux.offline.build.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index 71cdac60058..60685b18c5c 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -88,6 +88,10 @@ jobs: with: name: thirdparty_deps path: arrow/r/ + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt install libcurl4-openssl-dev libssl-dev - name: Install dependencies run: | install.packages(c("remotes", "glue", "sys")) From ec726d6b0bc1ad4562faec3196e954ca75b1b7b0 Mon Sep 17 00:00:00 2001 From: karldw Date: Fri, 3 Sep 2021 10:23:03 -0700 Subject: [PATCH 25/27] Fix JSON comment --- r/tools/nixlibs.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/r/tools/nixlibs.R b/r/tools/nixlibs.R index a3f78ef0197..336efe87197 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -318,9 +318,9 @@ build_libarrow <- function(src_dir, dst_dir) { !env_is("ARROW_DEPENDENCY_SOURCE", "system") if (is_solaris()) { # Note that JSON support does work on Solaris, but will be turned off with - # the rest of the thirdparty dependencies (when ARROW-13768 is resolved and - # JSON can be turned off at all). All other dependencies don't compile - # (e.g thrift, jemalloc, and xsimd) or do compile but `ar` fails to build + # the rest of the thirdparty dependencies. + # All other dependencies don't compile (e.g. thrift, jemalloc, and xsimd) + # or do compile but `ar` fails to build # libarrow_bundled_dependencies (e.g. re2 and utf8proc). env_var_list <- turn_off_thirdparty_features(env_var_list) } else if (thirdparty_deps_unavailable) { From c0b7f9636e87f22b653cc7f9a7b5264bc71e97e6 Mon Sep 17 00:00:00 2001 From: karldw Date: Fri, 3 Sep 2021 17:10:48 -0700 Subject: [PATCH 26/27] Docs tweaks --- r/R/install-arrow.R | 6 +++-- r/man/create_package_with_all_dependencies.Rd | 7 +++--- r/tools/nixlibs.R | 24 ++++++++++++------- r/vignettes/install.Rmd | 2 +- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 458806d0a9a..0185ff54a2a 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -138,12 +138,13 @@ reload_arrow <- function() { } -#' Create an install package with all thirdparty dependencies +#' Create a source bundle that includes all thirdparty dependencies #' #' @param dest_file File path for the new tar.gz package. Defaults to #' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version) #' @param source_file File path for the input tar.gz package. Defaults to -#' downloading the package. 
+#' downloading the package from CRAN (or whatever you have set as the first in +#' `getOption("repos")`) #' @return The full path to `dest_file`, invisibly #' #' This function is used for setting up an offline build. If it's possible to @@ -181,6 +182,7 @@ create_package_with_all_dependencies <- function(dest_file = NULL, source_file = pkg_download_dir <- tempfile() dir.create(pkg_download_dir) on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE) + message("Downloading Arrow source file") downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source") source_file <- downloaded[1, 2, drop = TRUE] } diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd index a6773712e5b..ee04fe28e1a 100644 --- a/r/man/create_package_with_all_dependencies.Rd +++ b/r/man/create_package_with_all_dependencies.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/install-arrow.R \name{create_package_with_all_dependencies} \alias{create_package_with_all_dependencies} -\title{Create an install package with all thirdparty dependencies} +\title{Create a source bundle that includes all thirdparty dependencies} \usage{ create_package_with_all_dependencies(dest_file = NULL, source_file = NULL) } @@ -11,7 +11,8 @@ create_package_with_all_dependencies(dest_file = NULL, source_file = NULL) \code{arrow_V.V.V_with_deps.tar.gz} in the current directory (\code{V.V.V} is the version)} \item{source_file}{File path for the input tar.gz package. 
Defaults to -downloading the package.} +downloading the package from CRAN (or whatever you have set as the first in +\code{getOption("repos")})} } \value{ The full path to \code{dest_file}, invisibly @@ -46,7 +47,7 @@ https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds } } \description{ -Create an install package with all thirdparty dependencies +Create a source bundle that includes all thirdparty dependencies } \examples{ \dontrun{ diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 336efe87197..d3bf9879500 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -441,9 +441,6 @@ turn_off_thirdparty_features <- function(env_var_list) { "ARROW_WITH_ZSTD" = "OFF", "ARROW_WITH_RE2" = "OFF", "ARROW_WITH_UTF8PROC" = "OFF", - # NOTE: this code sets the environment variable ARROW_JSON to "OFF", but - # that setting is will *not* be honored by build_arrow_static.sh until - # ARROW-13768 is resolved. "ARROW_JSON" = "OFF", # The syntax to turn off XSIMD is different. # Pull existing value of EXTRA_CMAKE_FLAGS first (must be defined) @@ -484,8 +481,21 @@ set_thirdparty_urls <- function(env_var_list) { env_var_list } +is_feature_requested <- function(env_varname, default = env_is("LIBARROW_MINIMAL", "false")) { + env_value <- tolower(Sys.getenv(env_varname)) + if (identical(env_value, "off")) { + # If e.g. ARROW_MIMALLOC=OFF explicitly, override default + requested <- FALSE + } else if (identical(env_value, "on")) { + requested <- TRUE + } else { + requested <- default + } + requested +} + with_mimalloc <- function(env_var_list) { - arrow_mimalloc <- env_is("ARROW_MIMALLOC", "on") || env_is("LIBARROW_MINIMAL", "false") + arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC") if (arrow_mimalloc) { # User wants mimalloc. 
If they're using gcc, let's make sure the version is >= 4.9 if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { @@ -497,11 +507,7 @@ with_mimalloc <- function(env_var_list) { } with_s3_support <- function(env_var_list) { - arrow_s3 <- env_is("ARROW_S3", "on") || env_is("LIBARROW_MINIMAL", "false") - # but if ARROW_S3=OFF explicitly, we are definitely off, so override - if (env_is("ARROW_S3", "off")) { - arrow_s3 <- FALSE - } + arrow_s3 <- is_feature_requested("ARROW_S3") if (arrow_s3) { # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9 # and make sure that we have curl and openssl system libs diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 9030e38c971..c6761c1e328 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -107,7 +107,7 @@ function. Normally, when installing on a computer with internet access, the build process will download third-party dependencies as needed. This function provides a way to download them in advance. Doing so may be useful when installing Arrow on a computer without internet access. -Note that Arrow _can_ be installed on a computer without internet access, but +Note that Arrow _can_ be installed on a computer without internet access without doing this, but many useful features will be disabled, as they depend on third-party components. More precisely, `arrow::arrow_info()$capabilities()` will be `FALSE` for every capability. 
From 7a0bdd413570680d6ce0316e2a86990e587fb620 Mon Sep 17 00:00:00 2001 From: karldw Date: Mon, 6 Sep 2021 07:53:45 -0700 Subject: [PATCH 27/27] Clarify binary packages --- r/R/install-arrow.R | 11 +++++++++++ r/man/create_package_with_all_dependencies.Rd | 11 +++++++++++ r/vignettes/install.Rmd | 13 +++++++++++++ 3 files changed, 35 insertions(+) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 0185ff54a2a..3e295c543cf 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -154,6 +154,17 @@ reload_arrow <- function() { #' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`. #' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds #' +#' If you're using binary packages you shouldn't need to use this function. You +#' should download the appropriate binary from your package repository, transfer +#' that to the offline computer, and install that. Any OS can create the source +#' bundle, but it cannot be installed on Windows. (Instead, use a standard +#' Windows binary package.) +#' +#' Note if you're using RStudio Package Manager on Linux: If you still want to +#' make a source bundle with this function, make sure to set the first repo in +#' `options("repos")` to be a mirror that contains source packages (that is: +#' something other than the RSPM binary mirror URLs). +#' #' ## Steps for an offline install with optional dependencies: #' #' ### Using a computer with internet access, pre-download the dependencies: diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd index ee04fe28e1a..b2da8c2491a 100644 --- a/r/man/create_package_with_all_dependencies.Rd +++ b/r/man/create_package_with_all_dependencies.Rd @@ -23,6 +23,17 @@ download the required dependencies for you. These downloaded dependencies are only used in the build if \code{ARROW_DEPENDENCY_SOURCE} is unset, \code{BUNDLED}, or \code{AUTO}. 
https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds + +If you're using binary packages you shouldn't need to use this function. You +should download the appropriate binary from your package repository, transfer +that to the offline computer, and install that. Any OS can create the source +bundle, but it cannot be installed on Windows. (Instead, use a standard +Windows binary package.) + +Note if you're using RStudio Package Manager on Linux: If you still want to +make a source bundle with this function, make sure to set the first repo in +\code{options("repos")} to be a mirror that contains source packages (that is: +something other than the RSPM binary mirror URLs). \subsection{Steps for an offline install with optional dependencies:}{ \subsection{Using a computer with internet access, pre-download the dependencies:}{ \itemize{ diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index c6761c1e328..3be7c2cb475 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -102,6 +102,8 @@ satisfy C++ dependencies. > Note that, unlike packages like `tensorflow`, `blogdown`, and others that require external dependencies, you do not need to run `install_arrow()` after a successful `arrow` installation. +## Offline installation + The `install-arrow.R` file also includes the `create_package_with_all_dependencies()` function. Normally, when installing on a computer with internet access, the build process will download third-party dependencies as needed. @@ -115,6 +117,17 @@ One approach to add more capabilities in an offline install is to prepare a package with pre-downloaded dependencies. The `create_package_with_all_dependencies()` function does this preparation. +If you're using binary packages you shouldn't need to follow these steps. You +should download the appropriate binary from your package repository, transfer +that to the offline computer, and install that. 
Any OS can create the source +bundle, but it cannot be installed on Windows. (Instead, use a standard +Windows binary package.) + +Note if you're using RStudio Package Manager on Linux: If you still want to +make a source bundle with this function, make sure to set the first repo in +`options("repos")` to be a mirror that contains source packages (that is: +something other than the RSPM binary mirror URLs). + ### Using a computer with internet access, pre-download the dependencies: * Install the `arrow` package _or_ run `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")`