diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml
new file mode 100644
index 00000000000..60685b18c5c
--- /dev/null
+++ b/dev/tasks/r/github.linux.offline.build.yml
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE: must set "Crossbow" as name to have the badge links working in the
+# github comment reports!
+name: Crossbow
+
+on:
+  push:
+    branches:
+      - "*-github-*"
+
+jobs:
+  # Online job: build the R source package and repack it, together with the
+  # downloaded thirdparty C++ dependencies, as arrow_with_deps.tar.gz.
+  grab-dependencies:
+    name: "Download thirdparty dependencies"
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+    env:
+      ARROW_R_DEV: "TRUE"
+      RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"
+    steps:
+      - name: Checkout Arrow
+        run: |
+          git clone --no-checkout {{ arrow.remote }} arrow
+          git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }}
+          git -C arrow checkout FETCH_HEAD
+          git -C arrow submodule update --init --recursive
+      - name: Free Up Disk Space
+        shell: bash
+        run: arrow/ci/scripts/util_cleanup.sh
+      - name: Fetch Submodules and Tags
+        shell: bash
+        run: cd arrow && ci/scripts/util_checkout.sh
+      - uses: r-lib/actions/setup-r@v1
+      - name: Pull Arrow dependencies
+        run: |
+          cd arrow/r
+          # This is `make build`, but with no vignettes and not running `make doc`
+          cp ../NOTICE.txt inst/NOTICE.txt
+          rsync --archive --delete ../cpp tools/
+          cp -p ../.env tools/
+          cp -p ../NOTICE.txt tools/
+          cp -p ../LICENSE.txt tools/
+          R CMD build --no-build-vignettes --no-manual .
+          built_tar=$(ls -1 arrow*.tar.gz | head -n 1)
+          R -e "source('R/install-arrow.R'); create_package_with_all_dependencies(dest_file = 'arrow_with_deps.tar.gz', source_file = \"${built_tar}\")"
+        shell: bash
+      - name: Upload the third party dependency artifacts
+        uses: actions/upload-artifact@v2
+        with:
+          name: thirdparty_deps
+          path: arrow/r/arrow_with_deps.tar.gz
+
+  # Offline job: install arrow_with_deps.tar.gz with TEST_OFFLINE_BUILD set, so
+  # r/tools/nixlibs.R treats downloads as unavailable, then run the tests.
+  install-offline:
+    name: "Install offline"
+    needs: [grab-dependencies]
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+    env:
+      # Quoted so the env var is the literal string, as in grab-dependencies.
+      ARROW_R_DEV: "TRUE"
+      RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"
+    steps:
+      - name: Checkout Arrow
+        run: |
+          git clone --no-checkout {{ arrow.remote }} arrow
+          git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }}
+          git -C arrow checkout FETCH_HEAD
+          git -C arrow submodule update --init --recursive
+      - uses: r-lib/actions/setup-r@v1
+      - name: Download artifacts
+        uses: actions/download-artifact@v2
+        with:
+          name: thirdparty_deps
+          path: arrow/r/
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          # -y answers apt's confirmation prompt; CI has nobody to answer it.
+          sudo apt-get install -y libcurl4-openssl-dev libssl-dev
+      - name: Install dependencies
+        run: |
+          install.packages(c("remotes", "glue", "sys"))
+          remotes::install_deps("arrow/r", dependencies = TRUE)
+        shell: Rscript {0}
+      - name: Install
+        env:
+          # Env vars are strings; quote so YAML does not type them as booleans.
+          TEST_OFFLINE_BUILD: "true"
+          LIBARROW_MINIMAL: "false"
+        run: |
+          cd arrow/r
+          R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz
+      - name: Run the tests
+        run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")'
+      - name: Dump test logs
+        run: cat arrow-tests/testthat.Rout*
+        if: always()
+      - name: Save the test output
+        uses: actions/upload-artifact@v2
+        with:
+          name: test-output
+          path: arrow-tests/testthat.Rout*
+        if: always()
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 176d44ec35f..b2f0a1dcae8 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -1033,6 +1033,19 @@ tasks:
flags: '-e ARROW_SOURCE_HOME="/arrow" -e FORCE_BUNDLED_BUILD=TRUE -e LIBARROW_BUILD=TRUE -e ARROW_DEPENDENCY_SOURCE=SYSTEM' image: ubuntu-r-only-r + test-r-offline-minimal: + ci: azure + template: r/azure.linux.yml + params: + r_org: rocker + r_image: r-base + r_tag: latest + flags: '-e TEST_OFFLINE_BUILD=true' + + test-r-offline-maximal: + ci: github + template: r/github.linux.offline.build.yml + {% for r_org, r_image, r_tag in [("rhub", "ubuntu-gcc-release", "latest"), ("rocker", "r-base", "latest"), diff --git a/r/.gitignore b/r/.gitignore index 76e8a8dd0bd..fbc5c8c3bfd 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -18,3 +18,11 @@ vignettes/nyc-taxi/ arrow_*.tar.gz arrow_*.tgz extra-tests/files + +# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here. +/tools/cpp/ +# cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up +# from cpp/, but again, they're just copies +/tools/.env +/tools/LICENSE.txt +/tools/NOTICE.txt diff --git a/r/Makefile b/r/Makefile index 7a51cbd5188..f493cc49ffe 100644 --- a/r/Makefile +++ b/r/Makefile @@ -36,8 +36,14 @@ test: deps: R -s -e 'lib <- Sys.getenv("R_LIB", .libPaths()[1]); install.packages("devtools", repo="https://cloud.r-project.org", lib=lib); devtools::install_dev_deps(lib=lib)' +# Note: files in tools are available at build time, but not at run time. The thirdparty +# cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up from cpp/ build: doc cp ../NOTICE.txt inst/NOTICE.txt + rsync --archive --delete ../cpp tools/ + cp -p ../.env tools/ + cp -p ../NOTICE.txt tools/ + cp -p ../LICENSE.txt tools/ R CMD build . check: build @@ -56,4 +62,5 @@ clean: -rm src/Makevars.win -rm -rf arrow.Rcheck/ -rm -rf libarrow/ + -rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt -find . 
-name "*.orig" -delete diff --git a/r/NAMESPACE b/r/NAMESPACE index 5e78d04de52..5164e7c9f20 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -201,6 +201,7 @@ export(codec_is_available) export(contains) export(copy_files) export(cpu_count) +export(create_package_with_all_dependencies) export(dataset_factory) export(date32) export(date64) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 63db8ede910..3e295c543cf 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -70,7 +70,6 @@ install_arrow <- function(nightly = FALSE, } } else { Sys.setenv( - LIBARROW_DOWNLOAD = "true", LIBARROW_BINARY = binary, LIBARROW_MINIMAL = minimal, ARROW_R_DEV = verbose, @@ -137,3 +136,104 @@ reload_arrow <- function() { message("Please restart R to use the 'arrow' package.") } } + + +#' Create a source bundle that includes all thirdparty dependencies +#' +#' @param dest_file File path for the new tar.gz package. Defaults to +#' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version) +#' @param source_file File path for the input tar.gz package. Defaults to +#' downloading the package from CRAN (or whatever you have set as the first in +#' `getOption("repos")`) +#' @return The full path to `dest_file`, invisibly +#' +#' This function is used for setting up an offline build. If it's possible to +#' download at build time, don't use this function. Instead, let `cmake` +#' download the required dependencies for you. +#' These downloaded dependencies are only used in the build if +#' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`. +#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +#' +#' If you're using binary packages you shouldn't need to use this function. You +#' should download the appropriate binary from your package repository, transfer +#' that to the offline computer, and install that. Any OS can create the source +#' bundle, but it cannot be installed on Windows. 
(Instead, use a standard +#' Windows binary package.) +#' +#' Note if you're using RStudio Package Manager on Linux: If you still want to +#' make a source bundle with this function, make sure to set the first repo in +#' `options("repos")` to be a mirror that contains source packages (that is: +#' something other than the RSPM binary mirror URLs). +#' +#' ## Steps for an offline install with optional dependencies: +#' +#' ### Using a computer with internet access, pre-download the dependencies: +#' * Install the `arrow` package _or_ run +#' `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")` +#' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` +#' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access +#' +#' ### On the computer without internet access, install the prepared package: +#' * Install the `arrow` package from the copied file +#' * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))` +#' * This installation will build from source, so `cmake` must be available +#' * Run [arrow_info()] to check installed capabilities +#' +#' +#' @examples +#' \dontrun{ +#' new_pkg <- create_package_with_all_dependencies() +#' # Note: this works when run in the same R session, but it's meant to be +#' # copied to a different computer. 
+#' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo"))
+#' }
+#' @export
+create_package_with_all_dependencies <- function(dest_file = NULL, source_file = NULL) {
+  if (is.null(source_file)) {
+    pkg_download_dir <- tempfile()
+    dir.create(pkg_download_dir)
+    on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE)
+    message("Downloading Arrow source file")
+    downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source")
+    source_file <- c(downloaded[, 2], "")[1] # "" (rejected below) if nothing was downloaded
+  }
+  if (!file.exists(source_file) || !endsWith(source_file, ".tar.gz")) {
+    stop("Arrow package .tar.gz file not found")
+  }
+  if (is.null(dest_file)) {
+    # e.g. convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz
+    # (add 'with_deps' for clarity if the file was downloaded locally)
+    dest_file <- paste0(sub("\\.tar\\.gz$", "", basename(source_file)), "_with_deps.tar.gz")
+  }
+  untar_dir <- tempfile()
+  on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE)
+  utils::untar(source_file, exdir = untar_dir)
+  tools_dir <- file.path(untar_dir, "arrow/tools")
+  download_dependencies_sh <- file.path(tools_dir, "cpp/thirdparty/download_dependencies.sh")
+  # If you change this path, also need to edit nixlibs.R
+  download_dir <- file.path(tools_dir, "thirdparty_dependencies")
+  dir.create(download_dir)
+
+  message("Downloading files to ", download_dir)
+  download_successful <- system2(download_dependencies_sh, shQuote(download_dir), stdout = FALSE) == 0
+  if (!download_successful) {
+    stop("Failed to download thirdparty dependencies")
+  }
+  # Need to change directory to untar_dir so tar() will use relative paths. That
+  # means we'll need a full, non-relative path for dest_file. (extra_flags="-C"
+  # doesn't work with R's internal tar)
+  orig_wd <- getwd()
+  on.exit(setwd(orig_wd), add = TRUE)
+  # normalizePath() may return the input unchanged if dest_file doesn't exist,
+  # so create it first.
+  file.create(dest_file)
+  dest_file <- normalizePath(dest_file, mustWork = TRUE)
+  setwd(untar_dir)
+
+  message("Repacking tar.gz file to ", dest_file)
+  tar_successful <- utils::tar(dest_file, compression = "gz") == 0
+  if (!tar_successful) {
+    stop("Failed to create new tar.gz file")
+  }
+  invisible(dest_file)
+}
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 90d900ddf28..c0127a8b53a 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -175,6 +175,7 @@ reference:
       - arrow_available
       - install_arrow
       - install_pyarrow
+      - create_package_with_all_dependencies
 
 repo:
   jira_projects: [ARROW]
diff --git a/r/configure b/r/configure
index 88aef7e1d35..c36e13388c2 100755
--- a/r/configure
+++ b/r/configure
@@ -39,7 +39,7 @@ FORCE_AUTOBREW=`echo $FORCE_AUTOBREW | tr '[:upper:]' '[:lower:]'`
 FORCE_BUNDLED_BUILD=`echo $FORCE_BUNDLED_BUILD | tr '[:upper:]' '[:lower:]'`
 ARROW_USE_PKG_CONFIG=`echo $ARROW_USE_PKG_CONFIG | tr '[:upper:]' '[:lower:]'`
 LIBARROW_MINIMAL=`echo $LIBARROW_MINIMAL | tr '[:upper:]' '[:lower:]'`
-LIBARROW_DOWNLOAD=`echo $LIBARROW_DOWNLOAD | tr '[:upper:]' '[:lower:]'`
+TEST_OFFLINE_BUILD=`echo $TEST_OFFLINE_BUILD | tr '[:upper:]' '[:lower:]'`
 NOT_CRAN=`echo $NOT_CRAN | tr '[:upper:]' '[:lower:]'`
 
 VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //`
@@ -129,18 +129,15 @@ else
         # autobrew sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS`
       fi
     else
+
       # Set some default values/backwards compatibility
-      if [ "${LIBARROW_DOWNLOAD}" = "" ] && [ "${NOT_CRAN}" != "" ]; then
-        LIBARROW_DOWNLOAD=$NOT_CRAN; export LIBARROW_DOWNLOAD
-      fi
-      if [ "${LIBARROW_BINARY}" = "" ] && [ "${LIBARROW_DOWNLOAD}" != "" ]; then
-        LIBARROW_BINARY=$LIBARROW_DOWNLOAD; export LIBARROW_BINARY
-      fi
-      if [ "${LIBARROW_MINIMAL}" = "" ] && [ "${LIBARROW_DOWNLOAD}" = "true" ]; then
-        LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL
-      fi
-      if [ "${LIBARROW_MINIMAL}" = "" ] && [ "${NOT_CRAN}" = "true" ]; then
-        LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL
+      if [ "${NOT_CRAN}" = "true" ]; then
+ if [ "${LIBARROW_BINARY}" = "" ]; then + LIBARROW_BINARY=true; export LIBARROW_BINARY + fi + if [ "${LIBARROW_MINIMAL}" = "" ]; then + LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL + fi fi # find openssl on macos. macOS ships with libressl. openssl is installable diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 578d8b6e5b2..5f01ae0a75f 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -70,6 +70,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-$ARROW_DEFAULT_PARAM} \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=${DEST_DIR} \ diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd new file mode 100644 index 00000000000..b2da8c2491a --- /dev/null +++ b/r/man/create_package_with_all_dependencies.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/install-arrow.R +\name{create_package_with_all_dependencies} +\alias{create_package_with_all_dependencies} +\title{Create a source bundle that includes all thirdparty dependencies} +\usage{ +create_package_with_all_dependencies(dest_file = NULL, source_file = NULL) +} +\arguments{ +\item{dest_file}{File path for the new tar.gz package. Defaults to +\code{arrow_V.V.V_with_deps.tar.gz} in the current directory (\code{V.V.V} is the version)} + +\item{source_file}{File path for the input tar.gz package. Defaults to +downloading the package from CRAN (or whatever you have set as the first in +\code{getOption("repos")})} +} +\value{ +The full path to \code{dest_file}, invisibly + +This function is used for setting up an offline build. 
If it's possible to +download at build time, don't use this function. Instead, let \code{cmake} +download the required dependencies for you. +These downloaded dependencies are only used in the build if +\code{ARROW_DEPENDENCY_SOURCE} is unset, \code{BUNDLED}, or \code{AUTO}. +https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds + +If you're using binary packages you shouldn't need to use this function. You +should download the appropriate binary from your package repository, transfer +that to the offline computer, and install that. Any OS can create the source +bundle, but it cannot be installed on Windows. (Instead, use a standard +Windows binary package.) + +Note if you're using RStudio Package Manager on Linux: If you still want to +make a source bundle with this function, make sure to set the first repo in +\code{options("repos")} to be a mirror that contains source packages (that is: +something other than the RSPM binary mirror URLs). +\subsection{Steps for an offline install with optional dependencies:}{ +\subsection{Using a computer with internet access, pre-download the dependencies:}{ +\itemize{ +\item Install the \code{arrow} package \emph{or} run +\code{source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")} +\item Run \code{create_package_with_all_dependencies("my_arrow_pkg.tar.gz")} +\item Copy the newly created \code{my_arrow_pkg.tar.gz} to the computer without internet access +} +} + +\subsection{On the computer without internet access, install the prepared package:}{ +\itemize{ +\item Install the \code{arrow} package from the copied file +\itemize{ +\item \code{install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))} +\item This installation will build from source, so \code{cmake} must be available +} +\item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities +} +} + +} +} +\description{ +Create a source bundle that includes all thirdparty 
dependencies +} +\examples{ +\dontrun{ +new_pkg <- create_package_with_all_dependencies() +# Note: this works when run in the same R session, but it's meant to be +# copied to a different computer. +install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) +} +} diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index e28dae79f5d..d3bf9879500 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -30,16 +30,6 @@ options(.arrow.cleanup = character()) # To collect dirs to rm on exit on.exit(unlink(getOption(".arrow.cleanup"))) env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value) -# * no download, build_ok: Only build with local git checkout -# * download_ok, no build: Only use prebuilt binary, if found -# * neither: Get the arrow-without-arrow package -# Download and build are OK unless you say not to -download_ok <- !env_is("LIBARROW_DOWNLOAD", "false") -build_ok <- !env_is("LIBARROW_BUILD", "false") -# But binary defaults to not OK -binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false") -# For local debugging, set ARROW_R_DEV=TRUE to make this script print more -quietly <- !env_is("ARROW_R_DEV", "true") try_download <- function(from_url, to_file) { status <- try( @@ -52,6 +42,26 @@ try_download <- function(from_url, to_file) { !inherits(status, "try-error") && status == 0 } +# For local debugging, set ARROW_R_DEV=TRUE to make this script print more +quietly <- !env_is("ARROW_R_DEV", "true") + +# Default is build from source, not download a binary +build_ok <- !env_is("LIBARROW_BUILD", "false") +binary_ok <- !(env_is("LIBARROW_BINARY", "false") || env_is("LIBARROW_BINARY", "")) + +# Check if we're doing an offline build. 
+# (Note that cmake will still be downloaded if necessary +# https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds) +download_ok <- !env_is("TEST_OFFLINE_BUILD", "true") && try_download("https://github.com", tempfile()) + +# This "tools/thirdparty_dependencies" path, within the tar file, might exist if +# create_package_with_all_dependencies() was run, or if someone has created it +# manually before running make build. +# If you change this path, you also need to edit +# `create_package_with_all_dependencies()` in install-arrow.R +thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tools/thirdparty_dependencies") + + download_binary <- function(os = identify_os()) { libfile <- tempfile() if (!is.null(os)) { @@ -82,7 +92,7 @@ download_binary <- function(os = identify_os()) { # * `TRUE` (not case-sensitive), to try to discover your current OS, or # * some other string, presumably a related "distro-version" that has binaries # built that work for your OS -identify_os <- function(os = Sys.getenv("LIBARROW_BINARY", Sys.getenv("LIBARROW_DOWNLOAD"))) { +identify_os <- function(os = Sys.getenv("LIBARROW_BINARY")) { if (tolower(os) %in% c("", "false")) { # Env var says not to download a binary return(NULL) @@ -193,6 +203,10 @@ system_release <- function() { read_system_release <- function() utils::head(readLines("/etc/system-release"), 1) +is_solaris <- function() { + tolower(Sys.info()[["sysname"]]) %in% "sunos" +} + #### end distro #### find_available_binary <- function(os) { @@ -209,73 +223,40 @@ find_available_binary <- function(os) { os } -download_source <- function() { - tf1 <- tempfile() - src_dir <- tempfile() - - # Given VERSION as x.y.z.p - p <- package_version(VERSION)[1, 4] - if (is.na(p)) { - # This is just x.y.z so download the official Apache release - if (apache_download(VERSION, tf1)) { - untar(tf1, exdir = src_dir) - unlink(tf1) - src_dir <- paste0(src_dir, "/apache-arrow-", VERSION, "/cpp") - } - } else if (p 
!= 9000) { - # This is a custom dev version (x.y.z.9999) or a nightly (x.y.z.20210505) - # (Don't try to download on the default dev .9000 version) - if (nightly_download(VERSION, tf1)) { - unzip(tf1, exdir = src_dir) - unlink(tf1) - src_dir <- paste0(src_dir, "/cpp") - } - } - - if (dir.exists(src_dir)) { - cat("*** Successfully retrieved C++ source\n") - options(.arrow.cleanup = c(getOption(".arrow.cleanup"), src_dir)) - # These scripts need to be executable - system( - sprintf("chmod 755 %s/build-support/*.sh", src_dir), - ignore.stdout = quietly, ignore.stderr = quietly - ) - return(src_dir) - } else { - return(NULL) - } -} - -nightly_download <- function(version, destfile) { - source_url <- paste0(arrow_repo, "src/arrow-", version, ".zip") - try_download(source_url, destfile) -} - -apache_download <- function(version, destfile, n_mirrors = 3) { - apache_path <- paste0("arrow/arrow-", version, "/apache-arrow-", version, ".tar.gz") - apache_urls <- c( - # This returns a different mirror each time - rep("https://www.apache.org/dyn/closer.lua?action=download&filename=", n_mirrors), - "https://downloads.apache.org/" # The backup +find_local_source <- function() { + # We'll take the first of these that exists + # The first case probably occurs if we're in the arrow git repo + # The second probably occurs if we're installing the arrow R package + cpp_dir_options <- c( + file.path(Sys.getenv("ARROW_SOURCE_HOME", ".."), "cpp"), + "tools/cpp" ) - downloaded <- FALSE - for (u in apache_urls) { - downloaded <- try_download(paste0(u, apache_path), destfile) - if (downloaded) { - break + for (cpp_dir in cpp_dir_options) { + if (file.exists(file.path(cpp_dir, "src/arrow/api.h"))) { + cat(paste0("*** Found local C++ source: '", cpp_dir, "'\n")) + return(cpp_dir) } } - downloaded + NULL } -find_local_source <- function(arrow_home = Sys.getenv("ARROW_SOURCE_HOME", "..")) { - if (file.exists(paste0(arrow_home, "/cpp/src/arrow/api.h"))) { - # We're in a git checkout of arrow, so 
we can build it - cat("*** Found local C++ source\n") - return(paste0(arrow_home, "/cpp")) - } else { - return(NULL) +env_vars_as_string <- function(env_var_list) { + # Do some basic checks on env_var_list: + # Check that env_var_list has names, that those names are valid POSIX + # environment variables, and that none of the values contain `'`. + stopifnot( + length(env_var_list) == length(names(env_var_list)), + all(grepl("^[^0-9]", names(env_var_list))), + all(grepl("^[A-Z0-9_]+$", names(env_var_list))), + !any(grepl("'", env_var_list, fixed = TRUE)) + ) + env_var_string <- paste0(names(env_var_list), "='", env_var_list, "'", collapse = " ") + if (nchar(env_var_string) > 30000) { + # This could happen if the full paths in *_SOURCE_URL were *very* long. + # A more formal check would look at getconf ARG_MAX, but this shouldn't matter + cat("*** Warning: Environment variables are very long. This could cause issues on some shells.\n") } + env_var_string } build_libarrow <- function(src_dir, dst_dir) { @@ -320,25 +301,42 @@ build_libarrow <- function(src_dir, dst_dir) { BUILD_DIR = build_dir, DEST_DIR = dst_dir, CMAKE = cmake, + # EXTRA_CMAKE_FLAGS will often be "", but it's convenient later to have it defined + EXTRA_CMAKE_FLAGS = Sys.getenv("EXTRA_CMAKE_FLAGS"), # Make sure we build with the same compiler settings that R is using CC = R_CMD_config("CC"), CXX = paste(R_CMD_config("CXX11"), R_CMD_config("CXX11STD")), # CXXFLAGS = R_CMD_config("CXX11FLAGS"), # We don't want the same debug symbols LDFLAGS = R_CMD_config("LDFLAGS") ) - env_vars <- paste0(names(env_var_list), '="', env_var_list, '"', collapse = " ") - env_vars <- with_s3_support(env_vars) - env_vars <- with_mimalloc(env_vars) - if (tolower(Sys.info()[["sysname"]]) %in% "sunos") { - # jemalloc doesn't seem to build on Solaris - # nor does thrift, so turn off parquet, - # and arrowExports.cpp requires parquet for dataset (ARROW-11994), so turn that off - # xsimd doesn't compile, so set SIMD level to NONE to 
skip it - # re2 and utf8proc do compile, - # but `ar` fails to build libarrow_bundled_dependencies, so turn them off - # so that there are no bundled deps - env_vars <- paste(env_vars, "ARROW_JEMALLOC=OFF ARROW_PARQUET=OFF ARROW_DATASET=OFF ARROW_WITH_RE2=OFF ARROW_WITH_UTF8PROC=OFF EXTRA_CMAKE_FLAGS=-DARROW_SIMD_LEVEL=NONE") + env_var_list <- with_s3_support(env_var_list) + env_var_list <- with_mimalloc(env_var_list) + # turn_off_thirdparty_features() needs to happen after with_mimalloc() and + # with_s3_support(), since those might turn features ON. + thirdparty_deps_unavailable <- !download_ok && + !dir.exists(thirdparty_dependency_dir) && + !env_is("ARROW_DEPENDENCY_SOURCE", "system") + if (is_solaris()) { + # Note that JSON support does work on Solaris, but will be turned off with + # the rest of the thirdparty dependencies. + # All other dependencies don't compile (e.g thrift, jemalloc, and xsimd) + # or do compile but `ar` fails to build + # libarrow_bundled_dependencies (e.g. re2 and utf8proc). + env_var_list <- turn_off_thirdparty_features(env_var_list) + } else if (thirdparty_deps_unavailable) { + cat(paste0( + "*** Building C++ library from source, but downloading thirdparty dependencies\n", + " is not possible, so this build will turn off all thirdparty features.\n", + " See install vignette for details:\n", + " https://cran.r-project.org/web/packages/arrow/vignettes/install.html\n" + )) + env_var_list <- turn_off_thirdparty_features(env_var_list) + } else if (dir.exists(thirdparty_dependency_dir)){ + # Add the *_SOURCE_URL env vars + env_var_list <- set_thirdparty_urls(env_var_list) } + env_vars <- env_vars_as_string(env_var_list) + cat("**** arrow", ifelse(quietly, "", paste("with", env_vars)), "\n") status <- suppressWarnings(system( paste(env_vars, "inst/build_arrow_static.sh"), @@ -346,7 +344,11 @@ build_libarrow <- function(src_dir, dst_dir) { )) if (status != 0) { # It failed :( - cat("**** Error building Arrow C++. 
Re-run with ARROW_R_DEV=true for debug information.\n") + cat( + "**** Error building Arrow C++.", + ifelse(env_is("ARROW_R_DEV", "true"), "", "Re-run with ARROW_R_DEV=true for debug information."), + "\n" + ) } invisible(status) } @@ -373,7 +375,15 @@ ensure_cmake <- function() { ) cmake_tar <- tempfile() cmake_dir <- tempfile() - try_download(cmake_binary_url, cmake_tar) + download_successful <- try_download(cmake_binary_url, cmake_tar) + if (!download_successful) { + cat(paste0( + "*** cmake was not found locally and download failed.\n", + " Make sure cmake >= 3.10 is installed and available on your PATH,\n", + " or download ", cmake_binary_url, "\n", + " and define the CMAKE environment variable.\n" + )) + } untar(cmake_tar, exdir = cmake_dir) unlink(cmake_tar) options(.arrow.cleanup = c(getOption(".arrow.cleanup"), cmake_dir)) @@ -413,53 +423,121 @@ cmake_version <- function(cmd = "cmake") { ) } -with_s3_support <- function(env_vars) { - arrow_s3 <- toupper(Sys.getenv("ARROW_S3")) == "ON" || tolower(Sys.getenv("LIBARROW_MINIMAL")) == "false" - # but if ARROW_S3=OFF explicitly, we are definitely off, so override - if (toupper(Sys.getenv("ARROW_S3")) == "OFF") { - arrow_s3 <- FALSE +turn_off_thirdparty_features <- function(env_var_list) { + # Because these are done as environment variables (as opposed to build flags), + # setting these to "OFF" overrides any previous setting. We don't need to + # check the existing value. + turn_off <- c( + "ARROW_MIMALLOC" = "OFF", + "ARROW_JEMALLOC" = "OFF", + "ARROW_PARQUET" = "OFF", # depends on thrift + "ARROW_DATASET" = "OFF", # depends on parquet + "ARROW_S3" = "OFF", + "ARROW_WITH_BROTLI" = "OFF", + "ARROW_WITH_BZ2" = "OFF", + "ARROW_WITH_LZ4" = "OFF", + "ARROW_WITH_SNAPPY" = "OFF", + "ARROW_WITH_ZLIB" = "OFF", + "ARROW_WITH_ZSTD" = "OFF", + "ARROW_WITH_RE2" = "OFF", + "ARROW_WITH_UTF8PROC" = "OFF", + "ARROW_JSON" = "OFF", + # The syntax to turn off XSIMD is different. 
+ # Pull existing value of EXTRA_CMAKE_FLAGS first (must be defined) + "EXTRA_CMAKE_FLAGS" = paste( + env_var_list[["EXTRA_CMAKE_FLAGS"]], + "-DARROW_SIMD_LEVEL=NONE -DARROW_RUNTIME_SIMD_LEVEL=NONE" + ) + ) + # Create a new env_var_list, with the values of turn_off set. + # replace() also adds new values if they didn't exist before + replace(env_var_list, names(turn_off), turn_off) +} + +set_thirdparty_urls <- function(env_var_list) { + # This function does *not* check if existing *_SOURCE_URL variables are set. + # The directory tools/thirdparty_dependencies is created by + # create_package_with_all_dependencies() and saved in the tar file. + files <- list.files(thirdparty_dependency_dir, full.names = FALSE) + url_env_varname <- toupper(sub("(.*?)-.*", "ARROW_\\1_URL", files)) + # Special handling for the aws dependencies, which have extra `-` + aws <- grepl("^aws", files) + url_env_varname[aws] <- sub( + "AWS_SDK_CPP", "AWSSDK", + gsub( + "-", "_", + sub( + "(AWS.*)-.*", "ARROW_\\1_URL", + toupper(files[aws]) + ) + ) + ) + full_filenames <- file.path(normalizePath(thirdparty_dependency_dir), files) + + env_var_list <- replace(env_var_list, url_env_varname, full_filenames) + if (!quietly) { + env_var_list <- replace(env_var_list, "ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") + } + env_var_list +} + +is_feature_requested <- function(env_varname, default = env_is("LIBARROW_MINIMAL", "false")) { + env_value <- tolower(Sys.getenv(env_varname)) + if (identical(env_value, "off")) { + # If e.g. ARROW_MIMALLOC=OFF explicitly, override default + requested <- FALSE + } else if (identical(env_value, "on")) { + requested <- TRUE + } else { + requested <- default + } + requested +} + +with_mimalloc <- function(env_var_list) { + arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC") + if (arrow_mimalloc) { + # User wants mimalloc. 
If they're using gcc, let's make sure the version is >= 4.9 + if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { + cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") + arrow_mimalloc <- FALSE + } } + replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF")) +} + +with_s3_support <- function(env_var_list) { + arrow_s3 <- is_feature_requested("ARROW_S3") if (arrow_s3) { # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9 # and make sure that we have curl and openssl system libs - if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { + if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { cat("**** S3 support not available for gcc < 4.9; building with ARROW_S3=OFF\n") arrow_s3 <- FALSE - } else if (!cmake_find_package("CURL", NULL, env_vars)) { + } else if (!cmake_find_package("CURL", NULL, env_var_list)) { # curl on macos should be installed, so no need to alter this for macos cat("**** S3 support requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE - } else if (!cmake_find_package("OpenSSL", "1.0.2", env_vars)) { + } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) { cat("**** S3 support requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE } } - paste(env_vars, ifelse(arrow_s3, "ARROW_S3=ON", "ARROW_S3=OFF")) -} - -with_mimalloc <- function(env_vars) { - arrow_mimalloc <- toupper(Sys.getenv("ARROW_MIMALLOC")) == "ON" || tolower(Sys.getenv("LIBARROW_MINIMAL")) == "false" - if (arrow_mimalloc) { - # User wants mimalloc. 
If they're using gcc, let's make sure the version is >= 4.9 - if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { - cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") - arrow_mimalloc <- FALSE - } - } - paste(env_vars, ifelse(arrow_mimalloc, "ARROW_MIMALLOC=ON", "ARROW_MIMALLOC=OFF")) + replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) } -cmake_gcc_version <- function(env_vars) { +cmake_gcc_version <- function(env_var_list) { # This function returns NA if using a non-gcc compiler # Always enclose calls to it in isTRUE() or isFALSE() - vals <- cmake_cxx_compiler_vars(env_vars) + vals <- cmake_cxx_compiler_vars(env_var_list) if (!identical(vals[["CMAKE_CXX_COMPILER_ID"]], "GNU")) { return(NA) } package_version(vals[["CMAKE_CXX_COMPILER_VERSION"]]) } -cmake_cxx_compiler_vars <- function(env_vars) { +cmake_cxx_compiler_vars <- function(env_var_list) { + env_vars <- env_vars_as_string(env_var_list) info <- system(paste("export", env_vars, "&& $CMAKE --system-information"), intern = TRUE) info <- grep("^[A-Z_]* .*$", info, value = TRUE) vals <- as.list(sub('^.*? 
"?(.*?)"?$', "\\1", info)) @@ -467,12 +545,13 @@ cmake_cxx_compiler_vars <- function(env_vars) { vals[grepl("^CMAKE_CXX_COMPILER_?", names(vals))] } -cmake_find_package <- function(pkg, version = NULL, env_vars) { +cmake_find_package <- function(pkg, version = NULL, env_var_list) { td <- tempfile() dir.create(td) options(.arrow.cleanup = c(getOption(".arrow.cleanup"), td)) find_package <- paste0("find_package(", pkg, " ", version, " REQUIRED)") writeLines(find_package, file.path(td, "CMakeLists.txt")) + env_vars <- env_vars_as_string(env_var_list) cmake_cmd <- paste0( "export ", env_vars, " && cd ", td, @@ -501,12 +580,7 @@ if (!file.exists(paste0(dst_dir, "/include/arrow/api.h"))) { unlink(bin_file) } else if (build_ok) { # (2) Find source and build it - if (download_ok) { - src_dir <- download_source() - } - if (is.null(src_dir)) { - src_dir <- find_local_source() - } + src_dir <- find_local_source() if (!is.null(src_dir)) { cat("*** Building C++ libraries\n") build_libarrow(src_dir, dst_dir) diff --git a/r/vignettes/developing.Rmd b/r/vignettes/developing.Rmd index 59c231724aa..d1d7998de32 100644 --- a/r/vignettes/developing.Rmd +++ b/r/vignettes/developing.Rmd @@ -50,13 +50,13 @@ This document is a work in progress and will grow and change as the Apache Arrow We welcome any feedback you have about things that are confusing or additions you would like to see here - please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) if you have any suggestions or requests. -# Developer environment setup +# Developer environment setup ## R-only {.tabset} Windows and macOS users who wish to contribute to the R package and don't need to alter libarrow (Arrow's C++ library) may be able to obtain a -recent version of the library without building from source. +recent version of the library without building from source. ### Linux @@ -71,7 +71,7 @@ nightly$ls("libarrow/bin") ``` Version numbers in that repository correspond to dates. 
-You'll need to create a `libarrow` directory inside the R package directory and unzip the zip file containing the compiled libarrow binary files into it. +You'll need to create a `libarrow` directory inside the R package directory and unzip the zip file containing the compiled libarrow binary files into it. ### macOS On macOS, you can install libarrow using [Homebrew](https://brew.sh/): @@ -95,7 +95,7 @@ nightly$ls("libarrow/bin") ``` Version numbers in that repository correspond to dates. -You can set the `RWINLIB_LOCAL` environment variable to point to the zip file containing libarrow before installing the arrow R package. +You can set the `RWINLIB_LOCAL` environment variable to point to the zip file containing libarrow before installing the arrow R package. ## R and C++ @@ -258,7 +258,7 @@ cmake \ .. ```

- + ## Installing a version of the R package with a specific git reference @@ -283,13 +283,13 @@ remotes::install_github("apache/arrow/r@bugfix", build = FALSE) Developers may wish to use this method of installing a specific commit separate from another Arrow development environment or system installation (e.g. we use this in [arrowbench](https://github.com/ursacomputing/arrowbench) -to install development versions of libarrow isolated from the system install). If -you already have libarrow installed system-wide, you may need to set +to install development versions of libarrow isolated from the system install). If +you already have libarrow installed system-wide, you may need to set some additional variables in order to isolate this build from your system libraries: * Setting the environment variable `FORCE_BUNDLED_BUILD` to `true` will skip the `pkg-config` search for libarrow and attempt to build from the same source at the repository+ref given. -* You may also need to set the Makevars `CPPFLAGS` and `LDFLAGS` to `""` in order to prevent the installation process from attempting to link to already installed system versions of libarrow. One way to do this temporarily is wrapping your `remotes::install_github()` call like so: +* You may also need to set the Makevars `CPPFLAGS` and `LDFLAGS` to `""` in order to prevent the installation process from attempting to link to already installed system versions of libarrow. One way to do this temporarily is wrapping your `remotes::install_github()` call like so: ```{r} withr::with_makevars(list(CPPFLAGS = "", LDFLAGS = ""), remotes::install_github(...)) ``` @@ -304,7 +304,7 @@ You can load the R package via `devtools::load_all()`. ## Rebuilding the documentation -The R documentation uses the [`@examplesIf`](https://roxygen2.r-lib.org/articles/rd.html#functions) tag introduced in `roxygen2` version 7.1.1.9001, which hasn't yet been released on CRAN at the time of writing. 
If you are making changes which require updating the documentation, please install the development version of `roxygen2` from GitHub. +The R documentation uses the [`@examplesIf`](https://roxygen2.r-lib.org/articles/rd.html#functions) tag introduced in `roxygen2` version 7.1.1.9001, which hasn't yet been released on CRAN at the time of writing. If you are making changes which require updating the documentation, please install the development version of `roxygen2` from GitHub. ```{r} remotes::install_github("r-lib/roxygen2") @@ -326,7 +326,7 @@ pkgdown::build_site(preview=TRUE) The R code in the package follows [the tidyverse style](https://style.tidyverse.org/). On PR submission (and on pushes) our CI will run linting and will flag possible errors on the pull request with annotations. -To run the [lintr](https://github.com/jimhester/lintr) locally, install the lintr package (note, we currently use a fork that includes fixes not yet accepted upstream, see how lintr is being installed in the file `ci/docker/linux-apt-lint.dockerfile` for the current status) and then run +To run the [lintr](https://github.com/jimhester/lintr) locally, install the lintr package (note, we currently use a fork that includes fixes not yet accepted upstream, see how lintr is being installed in the file `ci/docker/linux-apt-lint.dockerfile` for the current status) and then run ```{r} lintr::lint_package("arrow/r") @@ -360,12 +360,12 @@ C++ code in `src/`. This is because there are some features that are only enable and built conditionally during build time. If you change C++ code in the R package, you will need to set the `ARROW_R_DEV` environment variable to `true` (optionally, add it to your `~/.Renviron` file to persist across sessions) so -that the `data-raw/codegen.R` file is used for code generation. The `Makefile` +that the `data-raw/codegen.R` file is used for code generation. The `Makefile` commands also handles this automatically. We use Google C++ style in our C++ code. 
The easiest way to accomplish this is -use an editors/IDE that formats your code for you. Many popular editors/IDEs -have support for running `clang-format` on C++ files when you save them. +use an editors/IDE that formats your code for you. Many popular editors/IDEs +have support for running `clang-format` on C++ files when you save them. Installing/enabling the appropriate plugin may save you much frustration. Check for style errors with @@ -392,7 +392,7 @@ On macOS, you can get this by installing LLVM via Homebrew and running the scrip CLANG_FORMAT=$(brew --prefix llvm@8)/bin/clang-format ./lint.sh ``` -_Note_ that the lint script requires Python 3 and the Python dependencies +_Note_ that the lint script requires Python 3 and the Python dependencies (note that `cmake_format is pinned to a specific version): * autopep8 @@ -419,16 +419,16 @@ variables or other settings: * All tests are skipped on Linux if the package builds without the C++ libarrow. To make the build fail if libarrow is not available (as in, to test that the C++ build was successful), set `TEST_R_WITH_ARROW=true` - + * Some tests are disabled unless `ARROW_R_DEV=true` * Tests that require allocating >2GB of memory to test Large types are disabled unless `ARROW_LARGE_MEMORY_TESTS=true` - + * Integration tests against a real S3 bucket are disabled unless credentials are set in `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`; these are available on request - + * S3 tests using [MinIO](https://min.io/) locally are enabled if the `minio server` process is found running. If you're running MinIO with custom settings, you can set `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, and @@ -436,7 +436,7 @@ variables or other settings: ## Running checks -You can run package checks by using `devtools::check()` and check test coverage +You can run package checks by using `devtools::check()` and check test coverage with `covr::package_coverage()`. 
```r @@ -458,10 +458,10 @@ R CMD check arrow_*.tar.gz --as-cran ## Running additional CI checks -On a pull request, there are some actions you can trigger by commenting on the +On a pull request, there are some actions you can trigger by commenting on the PR. We have additional CI checks that run nightly and can be requested on demand -using an internal tool called -[crossbow](https://arrow.apache.org/docs/developers/crossbow.html). +using an internal tool called +[crossbow](https://arrow.apache.org/docs/developers/crossbow.html). A few important GitHub comment commands are shown below. #### Run all extended R CI tasks @@ -476,7 +476,7 @@ This runs each of the R-related CI tasks. @github-actions crossbow submit {task-name} ``` -See the `r:` group definition near the beginning of the [crossbow configuration](https://github.com/apache/arrow/blob/master/dev/tasks/tasks.yml) +See the `r:` group definition near the beginning of the [crossbow configuration](https://github.com/apache/arrow/blob/master/dev/tasks/tasks.yml) for a list of glob expression patterns that match names of items in the `tasks:` list below it. @@ -486,10 +486,24 @@ list below it. @github-actions autotune ``` -This will run and fix lint C++ linting errors, run R documentation (among other -cleanup tasks), run styler on any changed R code, and commit the resulting +This will run and fix lint C++ linting errors, run R documentation (among other +cleanup tasks), run styler on any changed R code, and commit the resulting updates to the branch. +# Summary of environment variables + +* See the user-facing [Install vignette](install.html) for a large number of + environment variables that determine how the build works and what features + get built. +* `TEST_OFFLINE_BUILD`: When set to `true`, the build script will not download + the prebuilt C++ library binary. 
+ It will turn off any features that require a download, unless they're available + in the `tools/cpp/thirdparty/download/` subfolder of the tar.gz file. + `create_package_with_all_dependencies()` creates that subfolder. + Regardless of this flag's value, `cmake` will be downloaded if it's unavailable. +* `TEST_R_WITHOUT_LIBARROW`: When set to `true`, skip tests that would require + the C++ Arrow library (that is, almost everything). + # Troubleshooting Note that after any change to libarrow, you must reinstall it and @@ -519,8 +533,8 @@ To resolve this, try [rebuilding the Arrow library](#step-3-building-arrow). ## Multiple versions of libarrow -If you are installing from a user-level directory, and you already have a -previous installation of libarrow in a system directory, you get you may get +If you are installing from a user-level directory, and you already have a +previous installation of libarrow in a system directory, you may get errors like the following when you install the R package: ``` @@ -531,7 +545,7 @@ Error: package or namespace load failed for ‘arrow' in dyn.load(file, DLLpath Reason: image not found ``` -If this happens, you need to make sure that you don't let R link to your system +If this happens, you need to make sure that you don't let R link to your system library when building arrow. You can do this a number of different ways: * Setting the `MAKEFLAGS` environment variable to `"LDFLAGS="` (see below for an example) this is the recommended way to accomplish this @@ -588,4 +602,4 @@ guide](https://arrow.apache.org/docs/developers/cpp/building.html). ## Other installation issues -There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). 
However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or things go wrong in an install. See [the installation vignette](./install.html#how-dependencies-are-resolved) for more information. \ No newline at end of file +There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or things go wrong in an install. See [the installation vignette](./install.html#how-dependencies-are-resolved) for more information. diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 013e63a113e..66f3e8e2e6e 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -102,6 +102,50 @@ satisfy C++ dependencies. > Note that, unlike packages like `tensorflow`, `blogdown`, and others that require external dependencies, you do not need to run `install_arrow()` after a successful `arrow` installation. +## Offline installation + +The `install-arrow.R` file also includes the `create_package_with_all_dependencies()` +function. Normally, when installing on a computer with internet access, the +build process will download third-party dependencies as needed. +This function provides a way to download them in advance. +Doing so may be useful when installing Arrow on a computer without internet access. +Note that Arrow _can_ be installed on a computer without internet access without doing this, but +many useful features will be disabled, as they depend on third-party components. +More precisely, `arrow::arrow_info()$capabilities` will be `FALSE` for every +capability. +One approach to add more capabilities in an offline install is to prepare a +package with pre-downloaded dependencies. 
The +`create_package_with_all_dependencies()` function does this preparation. + +If you're using binary packages you shouldn't need to follow these steps. You +should download the appropriate binary from your package repository, transfer +that to the offline computer, and install that. Any OS can create the source +bundle, but it cannot be installed on Windows. (Instead, use a standard +Windows binary package.) + +Note if you're using RStudio Package Manager on Linux: If you still want to +make a source bundle with this function, make sure to set the first repo in +`options("repos")` to be a mirror that contains source packages (that is: +something other than the RSPM binary mirror URLs). + +### Using a computer with internet access, pre-download the dependencies: +* Install the `arrow` package _or_ run + `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")` +* Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` +* Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access + +### On the computer without internet access, install the prepared package: +* Install the `arrow` package from the copied file + * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))` + * This installation will build from source, so `cmake` must be available +* Run `arrow_info()` to check installed capabilities + +#### Alternative, hands-on approach +* Download the dependency files (`cpp/thirdparty/download_dependencies.sh` may be helpful) +* Copy the directory of dependencies to the offline computer +* Create the environment variable `ARROW_THIRDPARTY_DEPENDENCY_DIR` on the offline computer, pointing to the copied directory. +* Install the `arrow` package as usual. + ## S3 support The `arrow` package allows you to work with data in AWS S3 or in other cloud @@ -156,10 +200,10 @@ If found, they will be downloaded and bundled when your R package compiles. 
For a list of supported distributions and versions, see the [arrow-r-nightly](https://github.com/ursa-labs/arrow-r-nightly/blob/master/README.md) project. -If no binary is found, it will download the Arrow C++ source that matches the R package version -(CRAN release or nightly build) and attempt to build it locally. -If no matching source bundle is found, it will also look to see if you are in +If no C++ library binary is found, it will attempt to build it locally. +First, it will look to see if you are in a checkout of the `apache/arrow` git repository and thus have the C++ source there. +Otherwise, it builds from the C++ files included in the package. Depending on your system, building Arrow C++ from source may be slow. For the specific mechanics of how all this works, see the R package `configure` script, @@ -329,11 +373,15 @@ Some features are optional when you build Arrow from source. With the exception * `ARROW_S3`: If set to `ON` S3 support will be built as long as the dependencies are met; if they are not met, the build script will turn this `OFF` * `ARROW_JEMALLOC` for the `jemalloc` memory allocator +* `ARROW_MIMALLOC` for the `mimalloc` memory allocator * `ARROW_PARQUET` * `ARROW_DATASET` * `ARROW_JSON` for the JSON parsing library * `ARROW_WITH_RE2` for the RE2 regular expression library, used in some string compute functions * `ARROW_WITH_UTF8PROC` for the UTF8Proc string library, used in many other string compute functions +* `ARROW_JSON` for JSON parsing +* `ARROW_WITH_BROTLI`, `ARROW_WITH_BZ2`, `ARROW_WITH_LZ4`, `ARROW_WITH_SNAPPY`, `ARROW_WITH_ZLIB`, and `ARROW_WITH_ZSTD` for various compression algorithms + There are a number of other variables that affect the `configure` script and the bundled build script. By default, these are all unset. All boolean variables are case-insensitive. @@ -342,10 +390,6 @@ By default, these are all unset. All boolean variables are case-insensitive. 
won't look for Arrow libraries on your system and instead will look to download/build them. Use this if you have a version mismatch between installed system libraries and the version of the R package you're installing. -* `LIBARROW_DOWNLOAD`: Unless set to `false`, the build script - will attempt to download C++ binary or source bundles. - If you're in a checkout of the `apache/arrow` git repository - and want to build the C++ library from the local source, make this `false`. * `LIBARROW_BINARY`: If set to `true`, the script will try to download a binary C++ library built for your operating system. You may also set it to some other string,