diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 7275baca2bf..9173f0e530b 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -211,7 +211,6 @@ jobs: - name: Build Arrow C++ shell: bash env: - RTOOLS_VERSION: ${{ matrix.config.rtools }} MINGW_ARCH: ${{ matrix.config.arch }} run: ci/scripts/r_windows_build.sh - name: Rename libarrow.zip diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 853f03267bd..c801f90d414 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -72,11 +72,18 @@ fi if [[ -n "$DEVTOOLSET_VERSION" ]]; then $PACKAGE_MANAGER install -y centos-release-scl $PACKAGE_MANAGER install -y "devtoolset-$DEVTOOLSET_VERSION" - - # Only add make var if not set - if ! grep -Fq "CXX17=" ~/.R/Makevars &> /dev/null; then + + # Enable devtoolset here so that `which gcc` finds the right compiler below + source /opt/rh/devtoolset-${DEVTOOLSET_VERSION}/enable + + # Build images which require the devtoolset don't have CXX17 variables + # set as the system compiler doesn't support C++17 + if [ ! 
"`${R_BIN} CMD config CXX17`" ]; then mkdir -p ~/.R - echo "CXX17=g++ -std=gnu++17 -g -O2 -fpic" >> ~/.R/Makevars + echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars + echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars + echo "CXX17STD = -std=c++17" >> ~/.R/Makevars + echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars fi fi diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index f532bc7cf0a..d7df44e2e43 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -26,19 +26,6 @@ pushd ${source_dir} printenv -if [[ -n "$DEVTOOLSET_VERSION" ]]; then - # enable the devtoolset version to use it - source /opt/rh/devtoolset-$DEVTOOLSET_VERSION/enable - - # Build images which require the devtoolset don't have CXX17 variables - # set as the system compiler doesn't support C++17 - mkdir -p ~/.R - echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars - echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars - echo "CXX17STD = -std=c++17" >> ~/.R/Makevars - echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars -fi - # Run the nixlibs.R test suite, which is not included in the installed package ${R_BIN} -e 'setwd("tools"); testthat::test_dir(".")' diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index c361af1d267..6b6a5dd0c99 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -23,26 +23,15 @@ set -ex # Make sure it is absolute and exported export ARROW_HOME="$(cd "${ARROW_HOME}" && pwd)" -if [ "$RTOOLS_VERSION" = "35" ]; then - # Use rtools-backports if building with rtools35 - curl https://raw.githubusercontent.com/r-windows/rtools-backports/master/pacman.conf > /etc/pacman.conf - pacman --noconfirm -Syy - # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5) - RWINLIB_LIB_DIR="lib-4.9.3" - # This is the default (will build for each arch) but we can set up CI to - # do these in parallel - : ${MINGW_ARCH:="mingw32 mingw64"} -else - # Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to 
CRAN - # curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf - # curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" - # pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz - # pacman --noconfirm -Scc - - pacman --noconfirm -Syy - RWINLIB_LIB_DIR="lib" - : ${MINGW_ARCH:="mingw32 mingw64 ucrt64"} -fi +# Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to CRAN +# curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf +# curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" +# pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz +# pacman --noconfirm -Scc + +pacman --noconfirm -Syy +RWINLIB_LIB_DIR="lib" +: ${MINGW_ARCH:="mingw32 mingw64 ucrt64"} export MINGW_ARCH @@ -78,26 +67,19 @@ fi if [ -d mingw64/lib/ ]; then ls $MSYS_LIB_DIR/mingw64/lib/ # Make the rest of the directory structure - # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5) - mkdir -p $DST_DIR/${RWINLIB_LIB_DIR}/x64 - # lib is for the new gcc 8 toolchain (Rtools 4.0) mkdir -p $DST_DIR/lib/x64 # Move the 64-bit versions of libarrow into the expected location - mv mingw64/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/x64 - # These may be from https://dl.bintray.com/rtools/backports/ - cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x64 + mv mingw64/lib/*.a $DST_DIR/lib/x64 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ - cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 + cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 fi # Same for the 32-bit versions if [ -d mingw32/lib/ ]; then ls 
$MSYS_LIB_DIR/mingw32/lib/ - mkdir -p $DST_DIR/${RWINLIB_LIB_DIR}/i386 mkdir -p $DST_DIR/lib/i386 - mv mingw32/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 - cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 - cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 + mv mingw32/lib/*.a $DST_DIR/lib/i386 + cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 fi # Do the same also for ucrt64 diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 222dbab3a08..efe071a766a 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -163,7 +163,7 @@ jobs: rig default {{ '${{ matrix.r_version.r }}' }}$rig_arch rig system setup-user-lib - rig system add-pak + rig system add-pak {{ macros.github_setup_local_r_repo(false, true)|indent }} - name: Prepare Dependency Installation @@ -275,18 +275,13 @@ jobs: ARROW_R_DEV: "TRUE" LIBARROW_BUILD: "FALSE" LIBARROW_BINARY: {{ '${{ matrix.config.libarrow_binary }}' }} - DEVTOOLSET_VERSION: {{ '${{ matrix.config.devtoolset }}' }} shell: bash run: | - if [[ "$DEVTOOLSET_VERSION" -gt 0 ]]; then - # enable the devtoolset version to use it - source /opt/rh/devtoolset-$DEVTOOLSET_VERSION/enable - fi Rscript -e ' {{ macros.github_test_r_src_pkg()|indent(8) }} ' - name: Upload binary artifact - if: matrix.config.devtoolset + if: matrix.config.devtoolset uses: actions/upload-artifact@v3 with: name: r-pkg_centos7 @@ -307,11 +302,11 @@ jobs: pkg <- pkg[[1]] warning("Multiple packages found! 
Using first one.") } - + # Install dependencies from RSPM install.packages("arrow", repos = "https://packagemanager.rstudio.com/all/__linux__/centos7/latest") remove.packages("arrow") - + install.packages(pkg) library(arrow) read_parquet(system.file("v0.7.1.parquet", package = "arrow")) diff --git a/r/README.md b/r/README.md index 2a85a82aeb3..edfa4678f3a 100644 --- a/r/README.md +++ b/r/README.md @@ -29,8 +29,8 @@ access to the Arrow C++ library API and higher-level access through a efficiency** (`read_csv_arrow()`, `read_json_arrow()`) - Write CSV files (`write_csv_arrow()`) - Manipulate and analyze Arrow data with **`dplyr` verbs** -- Read and write files in **Amazon S3** buckets with no additional - function calls +- Read and write files in **Amazon S3** and **Google Cloud Storage** + buckets with no additional function calls - Exercise **fine control over column types** for seamless interoperability with databases and data warehouse systems - Use **compression codecs** including Snappy, gzip, Brotli, @@ -64,9 +64,18 @@ additional system dependencies. For macOS and Windows, CRAN hosts binary packages that contain the Arrow C++ library. On Linux, source package installation will also build necessary C++ dependencies. For a faster, more complete installation, set the environment variable -`NOT_CRAN=true`. See `vignette("install", package = "arrow")` for -details. Note that version 9.0.0 was the last version to support -R 3.6 and lower on Windows. +`NOT_CRAN=true`. See `vignette("install", package = "arrow")` for details. + +As of version 10.0.0, `arrow` requires C++17 to build. This means that: + +* On Windows, you need `R >= 4.0`. Version 9.0.0 was the last version to support +R 3.6. +* On CentOS 7, you can build the latest version of `arrow`, +but you first need to install a newer compiler than the default system compiler, +gcc 4.8. See `vignette("install", package = "arrow")` for guidance. 
+Note that you only need the newer compiler to build `arrow`: +installing a binary package, as from RStudio Package Manager, +or loading a package you've already installed works fine with the system defaults. ### Installing a development version @@ -134,7 +143,7 @@ returns an R `data.frame`. To return an Arrow `Table`, set argument - `read_json_arrow()`: read a JSON data file For writing data to single files, the `arrow` package provides the -functions `write_parquet()`, `write_feather()`, and `write_csv_arrow()`. +functions `write_parquet()`, `write_feather()`, and `write_csv_arrow()`. These can be used with R `data.frame` and Arrow `Table` objects. For example, let’s write the Star Wars characters data that’s included @@ -266,7 +275,7 @@ sw %>% ``` Additionally, equality joins (e.g. `left_join()`, `inner_join()`) are supported -for joining multiple tables. +for joining multiple tables. ```r jedi <- data.frame( diff --git a/r/configure b/r/configure index 95328fd080f..eae33be57a4 100755 --- a/r/configure +++ b/r/configure @@ -51,6 +51,14 @@ if [ "$ARROW_R_DEV" = "true" ] && [ -f "data-raw/codegen.R" ]; then ${R_HOME}/bin/Rscript data-raw/codegen.R fi +if [ ! "`${R_HOME}/bin/R CMD config CXX17`" ]; then + echo "------------------------- NOTE ---------------------------" + echo "Cannot install arrow: a C++17 compiler is required." + echo "See https://arrow.apache.org/docs/r/articles/install.html" + echo "---------------------------------------------------------" + exit 1 +fi + if [ -f "tools/apache-arrow.rb" ]; then # If you want to use a local apache-arrow.rb formula, do # $ cp ../dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb tools/apache-arrow.rb @@ -177,7 +185,7 @@ else # Assume nixlibs.R has handled and messaged about its failure already # # TODO: what about non-bundled deps? 
- # Set CDPATH locally to prevent interference from global CDPATH (if set) + # Set CDPATH locally to prevent interference from global CDPATH (if set) BUNDLED_LIBS=`CDPATH=''; cd $LIB_DIR && ls *.a` BUNDLED_LIBS=`echo "$BUNDLED_LIBS" | sed -e "s/\\.a lib/ -l/g" | sed -e "s/\\.a$//" | sed -e "s/^lib/-l/" | tr '\n' ' ' | sed -e "s/ $//"` PKG_DIRS="-L`pwd`/$LIB_DIR" diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 025cf059f8a..f066e4494d3 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -98,8 +98,8 @@ download_binary <- function(lib) { # * Some other string: a "distro-version" that corresponds to a binary that is # available, to override what this function may discover by default. # Possible values are: -# * "centos-7" (gcc 4.8, no AWS/GCS support) -# * "ubuntu-18.04" (gcc 8, openssl 1) +# * "centos-7" (gcc 8 (devtoolset), openssl 1, glibc 2.17) +# * "ubuntu-18.04" (gcc 8, openssl 1, glibc 2.27) # * "ubuntu-22.04" (openssl 3) # These string values, along with `NULL`, are the potential return values of # this function. 
@@ -137,28 +137,21 @@ check_allowlist <- function(os, allowed = "https://raw.githubusercontent.com/apa select_binary <- function(os = tolower(Sys.info()[["sysname"]]), arch = tolower(Sys.info()[["machine"]]), - compiler_version = compiler_version_string(), test_program = test_for_curl_and_openssl) { if (identical(os, "linux") && identical(arch, "x86_64")) { # We only host x86 linux binaries today - is_gcc4 <- any(grepl("^g\\+\\+.*[^\\d.]4(\\.\\d){2}", compiler_version)) - if (is_gcc4) { - cat("*** Some features are not available with gcc 4\n") - return("centos-7") - } else { - tryCatch( - # Somehow the test program system2 call errors on the sanitizer builds - # so globally handle the possibility that this could fail - { - errs <- compile_test_program(test_program) - determine_binary_from_stderr(errs) - }, - error = function(e) { - cat("*** Unable to find libcurl and openssl\n") - NULL - } - ) - } + tryCatch( + # Somehow the test program system2 call errors on the sanitizer builds + # so globally handle the possibility that this could fail + { + errs <- compile_test_program(test_program) + determine_binary_from_stderr(errs) + }, + error = function(e) { + cat("*** Unable to find libcurl and openssl\n") + NULL + } + ) } else { # No binary available for arch cat(sprintf("*** Building on %s %s\n", os, arch)) @@ -196,30 +189,20 @@ compile_test_program <- function(code) { suppressWarnings(system2("echo", sprintf('"%s" | %s -', code, runner), stdout = FALSE, stderr = TRUE)) } -# TODO(ARROW-16976): build "ubuntu-18.04" on centos7 with newer devtoolset (but glibc is 2.17) for broader compatibility (like manylinux2014)? 
+# TODO(ARROW-16976): drop "ubuntu-18.04" and just use "centos-7" +# (built with newer devtoolset but older glibc (2.17) for broader compatibility, like manylinux2014) determine_binary_from_stderr <- function(errs) { if (is.null(attr(errs, "status"))) { # There was no error in compiling: so we found libcurl and openssl > 1.0.2, # openssl is < 3.0, glibc is >= 2.27, and we're not using a strict libc++ cat("*** Found libcurl and openssl >= 1.0.2\n") return("ubuntu-18.04") + # Else, check for dealbreakers: } else if (any(grepl("Using libc++", errs, fixed = TRUE))) { # Our binaries are all built with GNU stdlib so they fail with libc++ cat("*** Found libc++\n") return(NULL) - } else if (any(grepl("glibc version too old", errs))) { - # ubuntu-18.04 has glibc 2.27, so even if you install newer compilers - # (e.g. devtoolset on centos) and have curl/openssl, you run into problems - # TODO(ARROW-16976): build binaries with older glibc - cat("*** Checking glibc version\n") - # If we're here, we're on an older OS but with a newer compiler than gcc 4.8 - # (we already checked), so it is possible to build with more features on. - # We just can't use our binaries because they were built with newer glibc. - return("centos-7") } else if (header_not_found("curl/curl", errs)) { - # TODO(ARROW-16985): should these next 3 NULL cases return centos-7? A source build - # won't be able to include more features. - # Could check if build_ok (also for glibc?) cat("*** libcurl not found\n") return(NULL) } else if (header_not_found("openssl/opensslv", errs)) { @@ -228,6 +211,15 @@ determine_binary_from_stderr <- function(errs) { } else if (any(grepl("OpenSSL version too old", errs))) { cat("*** openssl found but version >= 1.0.2 is required for some features\n") return(NULL) + # Else, determine which other binary will work + } else if (any(grepl("glibc version too old", errs))) { + # ubuntu-18.04 has glibc 2.27, so even if you install newer compilers + # (e.g. 
devtoolset on centos) and have curl/openssl, you run into problems + # TODO(ARROW-16976): build binaries with older glibc + cat("*** Checking glibc version\n") + # If we're here, we're on an older OS but with a new enough compiler + # (e.g. CentOS 7 with devtoolset-8) + return("centos-7") } else if (any(grepl("Using OpenSSL version 3", errs))) { cat("*** Found libcurl and openssl >= 3.0.0\n") return("ubuntu-22.04") @@ -240,10 +232,6 @@ header_not_found <- function(header, errs) { any(grepl(regex, errs)) } -compiler_version_string <- function(compiler = R_CMD_config("CXX17")) { - system(paste(compiler, "--version"), intern = TRUE) -} - #### start distro #### distro <- function() { @@ -436,10 +424,9 @@ build_libarrow <- function(src_dir, dst_dir) { LDFLAGS = R_CMD_config("LDFLAGS") ) env_var_list <- with_cloud_support(env_var_list) - env_var_list <- with_mimalloc(env_var_list) - # turn_off_all_optional_features() needs to happen after with_mimalloc() and - # with_cloud_support(), since those might turn features ON. + # turn_off_all_optional_features() needs to happen after + # with_cloud_support(), since it might turn features ON. thirdparty_deps_unavailable <- !download_ok && !dir.exists(thirdparty_dependency_dir) && !env_is("ARROW_DEPENDENCY_SOURCE", "system") @@ -654,26 +641,12 @@ is_feature_requested <- function(env_varname, default = env_is("LIBARROW_MINIMAL requested } -with_mimalloc <- function(env_var_list) { - arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC") - if (arrow_mimalloc) { - # User wants mimalloc. 
If they're using gcc, let's make sure the version is >= 4.9 - if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { - cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") - arrow_mimalloc <- FALSE - } - } - replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF")) -} - with_cloud_support <- function(env_var_list) { arrow_s3 <- is_feature_requested("ARROW_S3") arrow_gcs <- is_feature_requested("ARROW_GCS") if (arrow_s3 || arrow_gcs) { # User wants S3 or GCS support. - # If they're using gcc, let's make sure the version is >= 4.9 - # (aws-sdk-cpp requires that; google-cloud-cpp only tests with >= 6.3) - # and make sure that we have curl and openssl system libs + # Make sure that we have curl and openssl system libs feats <- c( if (arrow_s3) "S3", if (arrow_gcs) "GCS" @@ -690,11 +663,7 @@ with_cloud_support <- function(env_var_list) { # capabilities for using binaries. We could consider consolidating this # logic, though these use cmake in order to match exactly what we do in the # libarrow build, and maybe that increases the fidelity. - if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { - print_warning("not available for gcc < 4.9") - arrow_s3 <- FALSE - arrow_gcs <- FALSE - } else if (!cmake_find_package("CURL", NULL, env_var_list)) { + if (!cmake_find_package("CURL", NULL, env_var_list)) { # curl on macos should be installed, so no need to alter this for macos # TODO: check for apt/yum/etc. and message the right thing? 
print_warning("requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb)") @@ -712,25 +681,6 @@ with_cloud_support <- function(env_var_list) { replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF")) } -cmake_gcc_version <- function(env_var_list) { - # This function returns NA if using a non-gcc compiler - # Always enclose calls to it in isTRUE() or isFALSE() - vals <- cmake_cxx_compiler_vars(env_var_list) - if (!identical(vals[["CMAKE_CXX_COMPILER_ID"]], "GNU")) { - return(NA) - } - package_version(vals[["CMAKE_CXX_COMPILER_VERSION"]]) -} - -cmake_cxx_compiler_vars <- function(env_var_list) { - env_vars <- env_vars_as_string(env_var_list) - info <- system(paste("export", env_vars, "&& $CMAKE --system-information"), intern = TRUE) - info <- grep("^[A-Z_]* .*$", info, value = TRUE) - vals <- as.list(sub('^.*? "?(.*?)"?$', "\\1", info)) - names(vals) <- sub("^(.*?) .*$", "\\1", info) - vals[grepl("^CMAKE_CXX_COMPILER_?", names(vals))] -} - cmake_find_package <- function(pkg, version = NULL, env_var_list) { td <- tempfile() dir.create(td) diff --git a/r/tools/test-nixlibs.R b/r/tools/test-nixlibs.R index d5e83b13058..631ff7a3e35 100644 --- a/r/tools/test-nixlibs.R +++ b/r/tools/test-nixlibs.R @@ -39,19 +39,6 @@ test_that("select_binary() based on system", { expect_null(select_binary("linux", arch = "aarch64")), # Not built today "Building on linux aarch64" ) - gcc48 <- c( - "g++-4.8 (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4", - "Copyright (C) 2013 Free Software Foundation, Inc.", - "This is free software; see the source for copying conditions. There is NO", - "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." 
- ) - expect_output( - expect_identical( - select_binary("linux", "x86_64", compiler_version = gcc48), - "centos-7" - ), - "Some features are not available with gcc 4" - ) }) test_that("compile_test_program()", { @@ -87,14 +74,14 @@ test_that("determine_binary_from_stderr", { test_that("select_binary() with test program", { expect_output( expect_identical( - select_binary("linux", "x86_64", "clang", "int a;"), + select_binary("linux", "x86_64", "int a;"), "ubuntu-18.04" ), "Found libcurl and openssl >= 1.0.2" ) expect_output( expect_identical( - select_binary("linux", "x86_64", "clang", "#error Using OpenSSL version 3"), + select_binary("linux", "x86_64", "#error Using OpenSSL version 3"), "ubuntu-22.04" ), "Found libcurl and openssl >= 3.0.0" diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index 54575d14cf8..9a4bf43b4a1 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -47,9 +47,9 @@ recent version of the library without building from source. On Linux, you can download a .zip file containing libarrow from the [nightly repository](https://nightlies.apache.org/arrow/r/libarrow/bin/). -The directory names correspond to the OS the binaries where built on: -- "centos-7" (gcc 4.8, no AWS/GCS support) -- "ubuntu-18.04" (gcc 8, openssl 1) +The directory names correspond to the OS the binaries were built on: +- "centos-7" (gcc 8 via devtoolset, openssl 1, glibc 2.17) +- "ubuntu-18.04" (gcc 8, openssl 1, glibc 2.27) - "ubuntu-22.04" (openssl 3) Version numbers in that repository correspond to dates. @@ -68,7 +68,7 @@ brew install apache-arrow --HEAD ### Windows -On Windows, you can download a .zip file containing libarrow from the +On Windows, you can download a .zip file containing libarrow from the [nightly repository](https://nightlies.apache.org/arrow/r/libarrow/bin/windows/). Version numbers in that repository correspond to dates. 
diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 36c973289b2..953f3c41bfc 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -25,6 +25,57 @@ a more detailed discussion of the code run during the installation process in th > Having trouble installing arrow? See the "Troubleshooting" section below. +# System dependencies + +The arrow package is designed to work with very minimal system requirements, +but there are a few things to note. + +## Compilers + +As of version 10.0.0, arrow requires a C++17 compiler to build. +For `gcc`, this generally means version 7 or newer. Most contemporary Linux +distributions have a new enough compiler; however, CentOS 7 is a notable +exception, as it ships with gcc 4.8. + +If you are on CentOS 7, to build arrow you will need to install a newer `devtoolset`, and you'll need to update R's Makevars to define the `CXX17` variables. This script installs `devtoolset-8` and configures R to be able to use C++17: + +``` +#!/usr/bin/env bash + +yum install -y centos-release-scl +yum install -y devtoolset-8 +# Optional: also install cloud storage dependencies, as described below +yum install -y libcurl-devel openssl-devel + +source /opt/rh/devtoolset-8/enable + +if [ ! "`R CMD config CXX17`" ]; then + mkdir -p ~/.R + echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars + echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars + echo "CXX17STD = -std=c++17" >> ~/.R/Makevars + echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars +fi +``` + +Note that the C++17 compiler is only required at *build* time. You don't need +to enable the devtoolset every time you load the package. What's more, if you +install a binary package from RStudio Package Manager (see method 1a below), you +do not need to set up any of this. Likewise, if you `R CMD INSTALL --build` +arrow on a CentOS machine with the newer compilers, you can take the binary +package it produces and install it on any other CentOS machine without those +compilers. 
+ +## Libraries + +Optional support for reading from cloud storage--AWS S3 and +Google Cloud Storage (GCS)--requires additional system dependencies: + +* CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb) +* OpenSSL >= 1.0.2: install `openssl-devel` (rpm) or `libssl-dev` (deb) + +The prebuilt binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support. + # Installing a release version (the easy way) ## Method 1 - Installation with a precompiled libarrow binary @@ -85,7 +136,12 @@ install.packages("arrow") This installs the source version of the R package, but during the installation process will check for compatible libarrow binaries that we host and use those if available. If no binary is available or can't be found, then this option falls back onto method 2 below (full source build), but setting the environment variable results in a more fully-featured build than default. -Except for the those built for gcc 4.8 (default on CentOS 7), the binaries include support for AWS S3 and Google Cloud Storage (GCS). These features require libcurl and openssl libraries installed separately; see below on how to install them. If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build. +The libarrow binaries include support for AWS S3 and GCS, so they require the +libcurl and openssl libraries installed separately, as noted above. 
+If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3 and GCS support disabled). + +Users on CentOS 7 will also need to install and configure a C++17 compiler. +See "System dependencies" above. # Installing a release version (the less easy way) @@ -172,20 +228,17 @@ If downloading dependencies at build time is not an option, as when building on #### Dependencies for S3 and GCS support -The arrow package allows you to work with data in AWS S3 or in other cloud -storage system that emulate S3, as well as Google Cloud Storage. -However, support for working with S3 and GCS is not -enabled in the default source build, and it has additional system requirements. To +Support for working with data in S3 and GCS is not enabled in the default +source build, and it has additional system requirements as described above. To enable it, set the environment variable `LIBARROW_MINIMAL=false` or `NOT_CRAN=true` to choose the full-featured build, or more selectively set `ARROW_S3=ON` and/or `ARROW_GCS=ON`. -You also need the following system dependencies: - -* `gcc` >= 4.9 or `clang` >= 3.3; note that the default compiler on CentOS 7 is gcc 4.8.5, which is not sufficient -* CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb) -* OpenSSL >= 1.0.2: install `openssl-devel` (rpm) or `libssl-dev` (deb) -The prebuilt libarrow binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support. 
+When either feature is enabled, the install script will check for the presence +of the required dependencies, and if the prerequisites are not met, it will turn +off S3 and GCS support--installation will succeed but without S3 or GCS +functionality. If afterwards you install the missing system requirements, +you'll need to reinstall the package in order to enable S3 and GCS support. ### Advanced configuration for building from source @@ -239,8 +292,8 @@ See below for more in-depth explanations of these environment variables. will work with your system. You can set it to `false` to skip this option altogether, or you can specify a string "distro-version" that corresponds to a binary that is available, to override what this function may discover by - default. Possible values are: "centos-7" (gcc 4.8, no AWS/GCS support); - "ubuntu-18.04" (gcc 8, openssl 1); "ubuntu-22.04" (openssl 3). + default. Possible values are: "centos-7", + "ubuntu-18.04" (both with gcc 8, and openssl 1), "ubuntu-22.04" (openssl 3). * `LIBARROW_BUILD` : If set to `false`, the build script will not attempt to build the C++ from source. This means you will only get a working arrow R package if a prebuilt binary is found. @@ -477,19 +530,7 @@ so that we can improve the script. ## Known installation issues -* On CentOS, if you are using a more modern `devtoolset`, you may need to set -the environment variables `CC` and `CXX` either in the shell or in R's `Makeconf`. -For CentOS 7 and above, both the Arrow system packages and the C++ binaries -for R are built with the default system compilers. If you want to use either of these -and you have a `devtoolset` installed, set `CC=/usr/bin/gcc CXX=/usr/bin/g++` -to use the system compilers instead of the `devtoolset`. -Alternatively, if you want to build arrow with the newer `devtoolset` compilers, -set both `ARROW_USE_PKG_CONFIG` and `LIBARROW_BINARY` to `false` so that -you build the Arrow C++ from source using those compilers. 
-Compiler mismatch between the arrow system libraries and the R -package may cause R to segfault when arrow package functions are used. -See discussions [here](https://issues.apache.org/jira/browse/ARROW-8586) -and [here](https://issues.apache.org/jira/browse/ARROW-10780). +* On CentOS, building the package requires a more modern `devtoolset` than the default system compilers. See "System dependencies" above. * If you have multiple versions of `zstd` installed on your system, installation by building libarrow from source may fail with an "undefined symbols"