diff --git a/.env b/.env index ccaecc9ea84..7deac551dc1 100644 --- a/.env +++ b/.env @@ -30,7 +30,6 @@ GO=1.12 NODE=11 MAVEN=3.5.4 JDK=8 -R=3.6.1 PANDAS=latest DASK=latest TURBODBC=latest @@ -38,3 +37,7 @@ HDFS=2.9.2 SPARK=master DOTNET=2.1 R=3.6 +# These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-gcc-release:latest +R_ORG=rhub +R_IMAGE=ubuntu-gcc-release +R_TAG=latest diff --git a/.github/workflows/dev_cron.yml b/.github/workflows/dev_cron.yml index 8c74ceb86c0..afbb96ac7db 100644 --- a/.github/workflows/dev_cron.yml +++ b/.github/workflows/dev_cron.yml @@ -18,6 +18,12 @@ name: Dev Cron on: + push: + paths: + - '.github/workflows/dev_cron.yml' + pull_request: + paths: + - '.github/workflows/dev_cron.yml' schedule: - cron: | */15 * * * * diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 18d2ba8f264..fda436759da 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -32,7 +32,6 @@ on: - 'r/**' jobs: - ubuntu: name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} runs-on: ubuntu-latest @@ -71,34 +70,32 @@ jobs: -p ${{ secrets.DOCKERHUB_TOKEN }} docker-compose push ubuntu-r - conda: - name: AMD64 Conda R ${{ matrix.r }} + rstudio: + name: "rstudio/r-base:${{ matrix.r_version }}-${{ matrix.r_image }}" runs-on: ubuntu-latest - if: github.event_name == 'push' strategy: fail-fast: false matrix: - r: [3.6] + # See https://hub.docker.com/r/rstudio/r-base + r_version: ["3.6"] + r_image: + - centos7 env: - R: ${{ matrix.r }} + R_ORG: "rstudio" + R_IMAGE: "r-base" + R_TAG: ${{ matrix.r_version }}-${{ matrix.r_image }} steps: - name: Checkout Arrow uses: actions/checkout@v1 - with: - submodules: true - name: Docker Pull shell: bash - run: | - docker-compose pull --ignore-pull-failures conda-cpp - docker-compose pull --ignore-pull-failures conda-r + run: docker-compose pull --ignore-pull-failures r - name: Docker Build shell: bash - run: | - docker-compose build conda-cpp - docker-compose build conda-r + run: 
docker-compose build r - name: Docker Run shell: bash - run: docker-compose run conda-r + run: docker-compose run r - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' continue-on-error: true @@ -106,4 +103,4 @@ jobs: run: | docker login -u ${{ secrets.DOCKERHUB_USER }} \ -p ${{ secrets.DOCKERHUB_TOKEN }} - docker-compose push conda-r + docker-compose push r diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 15320a63e70..100a1f4c018 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -54,16 +54,10 @@ RUN apt-get update -y && \ # Ensure parallel R package installation, set CRAN repo mirror, # and use pre-built binaries where possible -RUN printf "\ - options(Ncpus = parallel::detectCores(), \ - repos = 'https://demo.rstudiopm.com/all/__linux__/"$(lsb_release -cs)"/latest', \ - HTTPUserAgent = sprintf(\ - 'R/%%s R (%%s)', getRversion(), \ - paste(getRversion(), R.version\$platform, R.version\$arch, R.version\$os)))\n" \ - >> /etc/R/Rprofile.site - +COPY ci/etc/rprofile /arrow/ci/etc/ +RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site # Also ensure parallel compilation of C/C++ code -RUN echo "MAKEFLAGS=-j$(R --slave -e 'cat(parallel::detectCores())')" >> /usr/lib/R/etc/Makeconf +RUN echo "MAKEFLAGS=-j$(R --slave -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Makeconf COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile new file mode 100644 index 00000000000..d23b802c55f --- /dev/null +++ b/ci/docker/linux-r.dockerfile @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# General purpose Dockerfile to take a Docker image containing R +# and install Arrow R package dependencies + +ARG base +FROM ${base} + +# Make sure R is on the path for the R-hub devel versions (where RPREFIX is set in its dockerfile) +ENV PATH "${RPREFIX}/bin:${PATH}" +# Ensure parallel R package installation, set CRAN repo mirror, +# and use pre-built binaries where possible +COPY ci/etc/rprofile /arrow/ci/etc/ +RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site +# Also ensure parallel compilation of C/C++ code +RUN echo "MAKEFLAGS=-j$(R --slave -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Makeconf +# Workaround for html help install failure; see https://github.com/r-lib/devtools/issues/2084#issuecomment-530912786 +RUN Rscript -e 'x <- file.path(R.home("doc"), "html"); if (!file.exists(x)) {dir.create(x, recursive=TRUE); file.copy(system.file("html/R.css", package="stats"), x)}' + +COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ +COPY r/DESCRIPTION /arrow/r/ +RUN /arrow/ci/scripts/r_deps.sh /arrow diff --git a/ci/etc/rprofile b/ci/etc/rprofile new file mode 100644 index 00000000000..8b7e401388c --- /dev/null +++ b/ci/etc/rprofile @@ -0,0 +1,51 @@ +.pick_cran <- function() { + # Return a CRAN repo URL, preferring RSPM binaries if available for this OS + rspm_template <- "https://demo.rstudiopm.com/all/__linux__/%s/latest" + supported_os <- c("xenial", "bionic", "centos7", 
"opensuse42", "opensuse15") + + if (nzchar(Sys.which("lsb_release"))) { + os <- tolower(system("lsb_release -cs", intern = TRUE)) + if (os %in% supported_os) { + return(sprintf(rspm_template, os)) + } + } + if (file.exists("/etc/os-release")) { + os_release <- readLines("/etc/os-release") + vals <- sub("^.*=(.*)$", "\\1", os_release) + os <- intersect(vals, supported_os) + if (length(os)) { + # e.g. "bionic" + return(sprintf(rspm_template, os)) + } else { + names(vals) <- sub("^(.*)=.*$", "\\1", os_release) + if (vals["ID"] == "opensuse") { + version <- sub('^"?([0-9]+).*"?.*$', "\\1", vals["VERSION_ID"]) + os <- paste0("opensuse", version) + if (os %in% supported_os) { + return(sprintf(rspm_template, os)) + } + } + } + } + if (file.exists("/etc/system-release")) { + # Something like "CentOS Linux release 7.7.1908 (Core)" + system_release <- tolower(utils::head(readLines("/etc/system-release"), 1)) + # Extract from that the distro and the major version number + os <- sub("^([a-z]+) .* ([0-9]+).*$", "\\1\\2", system_release) + if (os %in% supported_os) { + return(sprintf(rspm_template, os)) + } + } + + return("https://cloud.r-project.org") +} + +options( + Ncpus = parallel::detectCores(), + repos = tryCatch(.pick_cran(), error = function(e) "https://cloud.r-project.org"), + HTTPUserAgent = sprintf( + 'R/%s R (%s)', + getRversion(), + paste(getRversion(), R.version$platform, R.version$arch, R.version$os) + ) +) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 05a2abac6b7..567fd0e90ca 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1038,20 +1038,25 @@ macro(build_thrift) # Thrift also uses boost. Forward important boost settings if there were ones passed. 
if(DEFINED BOOST_ROOT) - set(THRIFT_CMAKE_ARGS ${THRIFT_CMAKE_ARGS} "-DBOOST_ROOT=${BOOST_ROOT}") + list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}") endif() if(DEFINED Boost_NAMESPACE) - set(THRIFT_CMAKE_ARGS ${THRIFT_CMAKE_ARGS} "-DBoost_NAMESPACE=${Boost_NAMESPACE}") + list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}") + endif() + + if(DEFINED FLEX_ROOT) + # thrift hasn't set the cmake policy that lets us use _ROOT, so work around + list(APPEND THRIFT_CMAKE_ARGS "-DFLEX_EXECUTABLE=${FLEX_ROOT}/flex") endif() set(THRIFT_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}thrift") if(MSVC) if(ARROW_USE_STATIC_CRT) set(THRIFT_STATIC_LIB_NAME "${THRIFT_STATIC_LIB_NAME}mt") - set(THRIFT_CMAKE_ARGS ${THRIFT_CMAKE_ARGS} "-DWITH_MT=ON") + list(APPEND THRIFT_CMAKE_ARGS "-DWITH_MT=ON") else() set(THRIFT_STATIC_LIB_NAME "${THRIFT_STATIC_LIB_NAME}md") - set(THRIFT_CMAKE_ARGS ${THRIFT_CMAKE_ARGS} "-DWITH_MT=OFF") + list(APPEND THRIFT_CMAKE_ARGS "-DWITH_MT=OFF") endif() endif() if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index deadc82abc5..ea54278ede7 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -8,6 +8,7 @@ *.json *.snap .github/ISSUE_TEMPLATE.md +ci/etc/rprofile cpp/CHANGELOG_PARQUET.md cpp/src/arrow/io/mman.h cpp/src/arrow/util/random.h diff --git a/dev/tasks/r/azure.linux.yml b/dev/tasks/r/azure.linux.yml new file mode 100644 index 00000000000..94ddc08d265 --- /dev/null +++ b/dev/tasks/r/azure.linux.yml @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +jobs: +- job: linux + pool: + vmImage: ubuntu-latest + timeoutInMinutes: 360 + steps: + - script: | + set -ex + git clone --no-checkout {{ arrow.remote }} arrow + git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} + git -C arrow checkout FETCH_HEAD + git -C arrow submodule update --init --recursive + displayName: Clone arrow + + - script: | + set -ex + docker -v + docker-compose -v + cd arrow + export R_ORG={{ r_org }} + export R_IMAGE={{ r_image }} + export R_TAG={{ r_tag }} + docker-compose pull --ignore-pull-failures r + docker-compose build r + displayName: Docker build + + - script: | + set -ex + cd arrow + export R_ORG={{ r_org }} + export R_IMAGE={{ r_image }} + export R_TAG={{ r_tag }} + docker-compose run r + displayName: Docker run diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index af59d8cb467..884072e8f54 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -111,6 +111,12 @@ groups: - test-debian-10-python-3 - test-ubuntu-18.04-python-3 - test-fedora-29-python-3 + - test-r-rhub-ubuntu-gcc-release + - test-r-rhub-debian-gcc-devel + - test-r-rstudio-r-base-3.6-bionic + - test-r-rstudio-r-base-3.6-centos6 + - test-r-rstudio-r-base-3.6-opensuse15 + - test-r-rstudio-r-base-3.6-opensuse42 - test-ubuntu-18.04-r-3.6 - test-conda-r-3.6 - test-ubuntu-18.04-r-sanitizer @@ -151,6 +157,12 @@ groups: - test-debian-10-python-3 - test-ubuntu-18.04-python-3 - test-fedora-29-python-3 + - test-r-rhub-ubuntu-gcc-release + - test-r-rhub-debian-gcc-devel + - test-r-rstudio-r-base-3.6-bionic + - test-r-rstudio-r-base-3.6-centos6 + - test-r-rstudio-r-base-3.6-opensuse15 + -
test-r-rstudio-r-base-3.6-opensuse42 - test-ubuntu-18.04-r-3.6 - test-conda-r-3.6 - test-ubuntu-18.04-r-sanitizer @@ -181,9 +193,15 @@ groups: - test-fedora-29-python-3 r: - - test-ubuntu-18.04-r-3.6 - test-conda-r-3.6 - test-ubuntu-18.04-r-sanitizer + - test-r-rhub-ubuntu-gcc-release + - test-r-rhub-debian-gcc-devel + - test-r-rstudio-r-base-3.6-bionic + - test-r-rstudio-r-base-3.6-centos6 + - test-r-rstudio-r-base-3.6-opensuse15 + - test-r-rstudio-r-base-3.6-opensuse42 + - macos-r-autobrew ruby: - test-debian-ruby @@ -271,7 +289,12 @@ groups: - test-debian-10-python-3 - test-ubuntu-18.04-python-3 - test-fedora-29-python-3 - - test-ubuntu-18.04-r-3.6 + - test-r-rhub-ubuntu-gcc-release + - test-r-rhub-debian-gcc-devel + - test-r-rstudio-r-base-3.6-bionic + - test-r-rstudio-r-base-3.6-centos6 + - test-r-rstudio-r-base-3.6-opensuse15 + - test-r-rstudio-r-base-3.6-opensuse42 - test-conda-r-3.6 - test-ubuntu-18.04-r-sanitizer - test-debian-10-go-1.12 @@ -1907,6 +1930,60 @@ tasks: - docker-compose build fedora-python - docker-compose run fedora-python + test-r-rhub-debian-gcc-devel: + ci: azure + platform: linux + template: r/azure.linux.yml + params: + r_org: rhub + r_image: debian-gcc-devel + r_tag: latest + + test-r-rhub-ubuntu-gcc-release: + ci: azure + platform: linux + template: r/azure.linux.yml + params: + r_org: rhub + r_image: ubuntu-gcc-release + r_tag: latest + + test-r-rstudio-r-base-3.6-bionic: + ci: azure + platform: linux + template: r/azure.linux.yml + params: + r_org: rstudio + r_image: r-base + r_tag: 3.6-bionic + + test-r-rstudio-r-base-3.6-centos6: + ci: azure + platform: linux + template: r/azure.linux.yml + params: + r_org: rstudio + r_image: r-base + r_tag: 3.6-centos6 + + test-r-rstudio-r-base-3.6-opensuse15: + ci: azure + platform: linux + template: r/azure.linux.yml + params: + r_org: rstudio + r_image: r-base + r_tag: 3.6-opensuse15 + + test-r-rstudio-r-base-3.6-opensuse42: + ci: azure + platform: linux + template: r/azure.linux.yml + params: 
+ r_org: rstudio + r_image: r-base + r_tag: 3.6-opensuse42 + test-ubuntu-18.04-r-3.6: ci: circle platform: linux diff --git a/docker-compose.yml b/docker-compose.yml index 6bf5f67e73a..6c2904e3bca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -662,6 +662,35 @@ services: /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/r_test.sh /arrow" + r: + # This lets you test building/installing the arrow R package + # (including building the C++ library) on any Docker image that contains R + # + # Usage: + # R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose build r + # R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose run r + image: ${REPO}:r-${R_ORG}-${R_IMAGE}-${R_TAG} + build: + context: . + dockerfile: ci/docker/linux-r.dockerfile + cache_from: + - ${REPO}:r-${R_ORG}-${R_IMAGE}-${R_TAG} + args: + base: ${R_ORG}/${R_IMAGE}:${R_TAG} + shm_size: *shm-size + environment: + TEST_R_WITH_ARROW: "true" + NOT_CRAN: "true" + LIBARROW_DOWNLOAD: "false" + volumes: + - .:/arrow:delegated + command: > + /bin/bash -c " + cd /arrow/r && + R CMD INSTALL . 
&& + cd tests && + R --no-save --no-restore -e 'library(testthat); library(arrow); library(tibble); test_check(\"arrow\", reporter = \"summary\", stop_on_warning = TRUE)'" + ubuntu-r-sanitizer: # Only 18.04 and amd64 supported # Usage: diff --git a/r/.Rbuildignore b/r/.Rbuildignore index 8f0680510be..6830c9019cd 100644 --- a/r/.Rbuildignore +++ b/r/.Rbuildignore @@ -8,6 +8,7 @@ lint.sh Dockerfile .*\.tar\.gz ^windows +^libarrow clang_format.sh ^cran-comments\.md$ ^arrow_.*.tar.gz$ diff --git a/r/.gitignore b/r/.gitignore index a137f77756c..63a9dc70959 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -12,5 +12,6 @@ inst/doc src/Makevars src/Makevars.win windows/ +libarrow/ arrow_*.tar.gz arrow_*.tgz diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 465dba72681..c4cd26a1704 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -98,6 +98,7 @@ THEN_REINSTALL <- paste( SEE_README <- paste( "Refer to the R package README", "", + "and `vignette('install', package = 'arrow')`", "for further details." ) diff --git a/r/README.Rmd b/r/README.Rmd index 5ba6689eee4..ab0ddf8c0bc 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -36,9 +36,16 @@ Conda users on Linux and macOS can install `arrow` from conda-forge with conda install -c conda-forge r-arrow ``` -On macOS and Windows, installing a binary package from CRAN will handle Arrow's C++ dependencies for you. On Linux, unless you use `conda` you'll need to first install the C++ library. See the [Arrow project installation page](https://arrow.apache.org/install/) to find pre-compiled binary packages for some common Linux distributions, including Debian, Ubuntu, and CentOS. You'll need to install `libparquet-dev` on Debian and Ubuntu, or `parquet-devel` on CentOS. This will also automatically install the Arrow C++ library as a dependency. Other Linux distributions must install the C++ library from source. +On macOS and Windows, installing a binary package from CRAN will handle +Arrow's C++ dependencies for you. 
On Linux, unless you use `conda`, +the R package will have to compile its bindings from source +and it will need to find or download the C++ dependencies. +As of the 0.16.0 release, this dependency resolution is automatic on most +common Linux distributions. See `vignette("install", package = "arrow")` +for details. -If you install the `arrow` package from source and the C++ library is not found, the R package functions will notify you that Arrow is not available. Call +If you install the `arrow` package from source and the C++ library is not found, +the R package functions will notify you that Arrow is not available. Call ```r arrow::install_arrow() @@ -46,7 +53,9 @@ arrow::install_arrow() for version- and platform-specific guidance on installing the Arrow C++ library. -When installing from source, if the R and C++ library versions do not match, installation may fail. If you've previously installed the libraries and want to upgrade the R package, you'll need to update the Arrow C++ library first. +When installing from source, if the R and C++ library versions do not match, +installation may fail. If you've previously installed the libraries +and want to upgrade the R package, you'll need to update the Arrow C++ library first. ## Example @@ -69,13 +78,11 @@ as.data.frame(tab) Binary R packages for macOS and Windows are built daily and hosted at https://dl.bintray.com/ursalabs/arrow-r/. To install from there: ```r -install.packages("arrow", repos="https://dl.bintray.com/ursalabs/arrow-r") +install.packages("arrow", repos = "https://dl.bintray.com/ursalabs/arrow-r") ``` These daily package builds are not official Apache releases and are not recommended for production use. They may be useful for testing bug fixes and new features under active development. -Linux users will need to build the Arrow C++ library from source, then install the R package from source, as described in the next section. 
- ## Developing Windows and macOS users who wish to contribute to the R package and don't need to alter the Arrow C++ library may be able to obtain a recent version of the library without building from source. On macOS, you may install the C++ library using [Homebrew](https://brew.sh/): @@ -131,7 +138,8 @@ was put in `make install`, e.g. `export R_LD_LIBRARY_PATH=/usr/local/lib`, and retry installing the R package. For any other build/configuration challenges, see the [C++ developer -guide](https://arrow.apache.org/docs/developers/cpp.html#building). +guide](https://arrow.apache.org/docs/developers/cpp.html#building) +and `vignette("install", package = "arrow")`. ### Editing Rcpp code @@ -187,8 +195,8 @@ command line (`make test`, `make doc`, `make clean`, etc.) ### Full package validation ```shell -R CMD build --keep-empty-dirs . -R CMD check arrow_*.tar.gz --as-cran --no-manual +R CMD build . +R CMD check arrow_*.tar.gz --as-cran ``` [1]: https://github.com/apache/arrow/blob/master/docs/source/developers/cpp.rst diff --git a/r/configure b/r/configure index 003e1fb43ac..b281528992f 100755 --- a/r/configure +++ b/r/configure @@ -31,7 +31,8 @@ PKG_DEB_NAME="(unsuppored)" PKG_RPM_NAME="(unsuppored)" PKG_BREW_NAME="apache-arrow" PKG_TEST_HEADER="" -PKG_LIBS="-larrow -lparquet -larrow_dataset" +# These must be the same order as $(pkg-config --libs arrow-dataset) +PKG_LIBS="-larrow_dataset -lparquet -larrow" # generate code if [ "$ARROW_R_DEV" = "TRUE" ]; then @@ -50,6 +51,7 @@ if [ "$LOCAL_AUTOBREW" = "TRUE" ]; then cp tools/autobrew . cp tools/apache-arrow.rb . FORCE_AUTOBREW="TRUE" + ARROW_USE_PKG_CONFIG="FALSE" fi # Note that cflags may be empty in case of success @@ -58,9 +60,9 @@ if [ "$INCLUDE_DIR" ] || [ "$LIB_DIR" ]; then PKG_CFLAGS="-I$INCLUDE_DIR $PKG_CFLAGS" PKG_LIBS="-L$LIB_DIR $PKG_LIBS" else - # Use pkg-config if available + # Use pkg-config if available and allowed pkg-config --version >/dev/null 2>&1 - if [ "$FORCE_AUTOBREW" != "TRUE" ] && [ $? 
-eq 0 ]; then + if [ $? -eq 0 ] && [ "$ARROW_USE_PKG_CONFIG" != "FALSE" ]; then PKGCONFIG_CFLAGS=`pkg-config --cflags --silence-errors ${PKG_CONFIG_NAME}` PKGCONFIG_LIBS=`pkg-config --libs --silence-errors ${PKG_CONFIG_NAME}` fi @@ -91,6 +93,14 @@ else fi PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include" PKG_LIBS="-L$BREWDIR/opt/$PKG_BREW_NAME/lib $PKG_LIBS" + elif [ "$UNAME" = "Linux" ]; then + VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) + ${R_HOME}/bin/Rscript tools/linuxlibs.R $VERSION + PKG_CFLAGS="-I$(pwd)/libarrow/arrow-${VERSION}/include $PKG_CFLAGS" + PKG_LIBS="-L$(pwd)/libarrow/arrow-${VERSION}/lib $PKG_LIBS" + # Also enumerate the static libs included in there + # TODO: this should be generated based on what's in the lib dir + PKG_LIBS="$PKG_LIBS -lthrift -lsnappy -lz -lzstd -llz4 -lbrotlidec-static -lbrotlienc-static -lbrotlicommon-static -lboost_filesystem -lboost_regex -lboost_system -ljemalloc_pic" fi fi fi diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh new file mode 100755 index 00000000000..5ea7a9bb7a4 --- /dev/null +++ b/r/inst/build_arrow_static.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Quit on failure +# set -e + +# Print commands for debugging +set -x + +# By default, this script assumes it's in the top-level dir of the apache/arrow +# git repository. Set any of the following env vars to customize where to read +# and write from +: ${ARROW_HOME:="$(pwd)"} # Only used in default SOURCE/BUILD dirs +: ${SOURCE_DIR:="${ARROW_HOME}/cpp"} # Where the C++ source is +: ${BUILD_DIR:="${ARROW_HOME}/r/libarrow/dist"} # Where cmake should build +: ${DEST_DIR:="$BUILD_DIR"} # Where the resulting /lib and /include should be +: ${CMAKE:="$(which cmake)"} + +# Make sure SOURCE and DEST dirs are absolute and exist +SOURCE_DIR="$(cd "${SOURCE_DIR}" && pwd)" +DEST_DIR="$(mkdir -p "${DEST_DIR}" && cd "${DEST_DIR}" && pwd)" + +if [ "$CMAKE_GENERATOR" = "" ]; then + # Look for ninja, prefer it + if which ninja >/dev/null 2>&1; then + CMAKE_GENERATOR="Ninja" + fi +fi + +if [ "$FLEX_ROOT" != "" ]; then + EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DFLEX_ROOT=${FLEX_ROOT}" +fi +if [ "$BISON_ROOT" != "" ]; then + # Thrift can't find this as a cmake flag, so put it on the PATH + export PATH="${BISON_ROOT}:${PATH}" +fi +if [ "$ARROW_R_DEV" = "TRUE" ]; then + # Print more verbosity + EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DARROW_VERBOSE_THIRDPARTY_BUILD=ON" +fi + +mkdir -p "${BUILD_DIR}" +pushd "${BUILD_DIR}" +${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_STATIC=ON \ + -DARROW_COMPUTE=ON \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DARROW_DEPENDENCY_SOURCE=BUNDLED \ + -DARROW_FILESYSTEM=ON \ + -DARROW_JEMALLOC=ON \ + -DARROW_JSON=ON \ + -DARROW_PARQUET=ON \ + -DARROW_WITH_BROTLI=ON \ + -DARROW_WITH_BZ2=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_WITH_ZSTD=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${DEST_DIR} \ + -DOPENSSL_USE_STATIC_LIBS=ON \ + ${EXTRA_CMAKE_FLAGS} \ + -G ${CMAKE_GENERATOR:-"Unix 
Makefiles"} \ + ${SOURCE_DIR} +${CMAKE} --build . --target install + +if [ $? -ne 0 ] && [ "${DEBUG_DIR}" != "" ]; then + # For debugging installation problems, copy the build contents somewhere not tmp + mkdir -p ${DEBUG_DIR} + cp -r ./* ${DEBUG_DIR} +fi + +# Copy the bundled static libs from the build to the install dir +# See https://issues.apache.org/jira/browse/ARROW-7499 for moving this to CMake +find . -regex .*/.*/lib/.*\\.a\$ | xargs -I{} cp -u {} ${DEST_DIR}/lib +popd diff --git a/r/tests/testthat/test-arrow.R b/r/tests/testthat/test-arrow.R index 41a4df932b9..7ef25ac672e 100644 --- a/r/tests/testthat/test-arrow.R +++ b/r/tests/testthat/test-arrow.R @@ -17,7 +17,7 @@ context("General checks") -if (identical(Sys.getenv("TEST_R_WITH_ARROW"), "TRUE")) { +if (identical(tolower(Sys.getenv("TEST_R_WITH_ARROW")), "true")) { testthat::test_that("Arrow C++ is available", { expect_true(arrow_available()) }) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 140ce554eb4..fe9c1725991 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -127,7 +127,7 @@ test_that("Partition scheme inference", { }) test_that("filter() on a dataset won't auto-collect", { - ds <- open_dataset(dataset_dir) + ds <- open_dataset(dataset_dir, partition = "part") expect_error( ds %>% filter(int > 6, dbl > max(dbl)), "Filter expression not supported for Arrow Datasets: dbl > max(dbl)", diff --git a/r/tests/testthat/test-install-arrow.R b/r/tests/testthat/test-install-arrow.R index b529b01274f..fe983f862ef 100644 --- a/r/tests/testthat/test-install-arrow.R +++ b/r/tests/testthat/test-install-arrow.R @@ -19,7 +19,7 @@ context("install_arrow()") i_have_arrow_msg <- "It appears you already have Arrow installed successfully: are you trying to install a different version of the library? -Refer to the R package README for further details. +Refer to the R package README and `vignette('install', package = 'arrow')` for further details. 
If you have other trouble, or if you think this message could be improved, please report an issue here: " diff --git a/r/tools/linuxlibs.R b/r/tools/linuxlibs.R new file mode 100644 index 00000000000..8c65d514af0 --- /dev/null +++ b/r/tools/linuxlibs.R @@ -0,0 +1,341 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +args <- commandArgs(TRUE) +VERSION <- args[1] +dst_dir <- paste0("libarrow/arrow-", VERSION) + +arrow_repo <- "https://dl.bintray.com/ursalabs/arrow-r/libarrow/" +apache_src_url <- paste0( + "https://www.apache.org/dyn/closer.cgi/arrow/arrow-", VERSION, + "/apache-arrow-", VERSION, ".tar.gz" +) + +options(.arrow.cleanup = character()) # To collect dirs to rm on exit +on.exit(unlink(getOption(".arrow.cleanup"), recursive = TRUE)) + +env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value) + +# This condition identifies when you're installing locally, either by presence +# of NOT_CRAN or by absence of a `check` env var. So even if you don't have +# NOT_CRAN set in your dev environment, this will still pass outside R CMD check +locally_installing <- env_is("NOT_CRAN", "true") || env_is("_R_CHECK_SIZE_OF_TARBALL_", "") +# Combine with explicit vars to turn off downloading and building. I.e. 
only +# downloads/builds when local, but can turn off either with these env vars. +# * no download, build_ok: Only build with local git checkout +# * download_ok, no build: Only use prebuilt binary, if found +# * neither: Get the arrow-without-arrow package +download_ok <- locally_installing && !env_is("LIBARROW_DOWNLOAD", "false") +build_ok <- locally_installing && !env_is("LIBARROW_BUILD", "false") +# TODO: allow LIBARROW_DOWNLOAD=true to override locally_installing? or just set NOT_CRAN? + +# For local debugging, set ARROW_R_DEV=TRUE to make this script print more +quietly <- !env_is("ARROW_R_DEV", "true") + +download_binary <- function(os = identify_os()) { + libfile <- tempfile() + if (!is.null(os)) { + binary_url <- paste0(arrow_repo, "bin/", os, "/arrow-", VERSION, ".zip") + try( + download.file(binary_url, libfile, quiet = quietly), + silent = quietly + ) + if (!file.exists(libfile)) { + cat(sprintf("*** No C++ binaries found for %s\n", os)) + libfile <- NULL + } + } else { + libfile <- NULL + } + libfile +} + +# Function to figure out which flavor of binary we should download. 
+# By default, it will try to discover from the OS what distro-version we're on +# but you can override this by setting the env var LIBARROW_BINARY_DISTRO to: +# * `FALSE` (not case-sensitive), which tells the script not to download a binary, +# * some other string, presumably a related "distro-version" that has binaries +# built that work for your OS +identify_os <- function(os = Sys.getenv("LIBARROW_BINARY_DISTRO")) { + if (nzchar(os)) { + if (identical(tolower(os), "false")) { + # Env var says not to download a binary + return(NULL) + } else { + # Env var provided an os-version to use--maybe you're on Ubuntu 18.10 but + # we only build for 18.04 and that's fine--so use what the user set + return(os) + } + } + + if (nzchar(Sys.which("lsb_release"))) { + distro <- tolower(system("lsb_release -is", intern = TRUE)) + os_version <- system("lsb_release -rs", intern = TRUE) + # In the future, we may be able to do some mapping of distro-versions to + # versions we built for, since there's no way we'll build for everything. 
+ os <- paste0(distro, "-", os_version) + } else if (file.exists("/etc/system-release")) { + # Something like "CentOS Linux release 7.7.1908 (Core)" + system_release <- tolower(utils::head(readLines("/etc/system-release"), 1)) + # Extract from that the distro and the major version number + os <- sub("^([a-z]+) .* ([0-9]+).*$", "\\1-\\2", system_release) + } else { + cat("*** Unable to identify current OS/version\n") + os <- NULL + } + os +} + +download_source <- function() { + tf1 <- tempfile() + src_dir <- NULL + source_url <- paste0(arrow_repo, "src/arrow-", VERSION, ".zip") + try( + download.file(source_url, tf1, quiet = quietly), + silent = quietly + ) + if (!file.exists(tf1)) { + # Try for an official release + try( + download.file(apache_src_url, tf1, quiet = quietly), + silent = quietly + ) + } + if (file.exists(tf1)) { + cat("*** Successfully retrieved C++ source\n") + src_dir <- tempfile() + unzip(tf1, exdir = src_dir) + unlink(tf1) + # These scripts need to be executable + system(sprintf("chmod 755 %s/cpp/build-support/*.sh", src_dir)) + options(.arrow.cleanup = c(getOption(".arrow.cleanup"), src_dir)) + # The actual src is in cpp + src_dir <- paste0(src_dir, "/cpp") + } + src_dir +} + +find_local_source <- function() { + if (file.exists("../cpp/src/arrow/api.h")) { + # We're in a git checkout of arrow, so we can build it + cat("*** Found local C++ source\n") + return("../cpp") + } else { + return(NULL) + } +} + +build_libarrow <- function(src_dir, dst_dir) { + # We'll need to compile R bindings with these libs, so delete any .o files + system("rm src/*.o", ignore.stdout = quietly, ignore.stderr = quietly) + # Check for libarrow build dependencies: + # * cmake + # * flex and bison (for building thrift) + # * m4 (for building flex and bison) + cmake <- ensure_cmake() + m4 <- ensure_m4() + flex <- ensure_flex(m4) + bison <- ensure_bison(m4) + env_vars <- sprintf( + "SOURCE_DIR=%s BUILD_DIR=libarrow/build DEST_DIR=%s CMAKE=%s", + src_dir, dst_dir, cmake + ) 
+ if (!is.null(flex)) { + system(paste0(flex, "/flex --version")) + env_vars <- paste0(env_vars, " FLEX_ROOT=", flex) + } + if (!is.null(bison)) { + system(paste0(bison, "/bin/bison --version")) + env_vars <- sprintf( + "PATH=%s/bin:$PATH %s BISON_PKGDATADIR=%s/share/bison", + bison, env_vars, bison + ) + } + if (!quietly) { + cat("*** Building with ", env_vars, "\n") + } + system(paste(env_vars, "inst/build_arrow_static.sh")) +} + +ensure_cmake <- function() { + cmake <- Sys.which("cmake") + if (!nzchar(cmake)) { + # If not found, download it + cat("*** Downloading cmake\n") + CMAKE_VERSION <- Sys.getenv("CMAKE_VERSION", "3.16.2") + cmake_binary_url <- paste0( + "https://github.com/Kitware/CMake/releases/download/v", CMAKE_VERSION, + "/cmake-", CMAKE_VERSION, "-Linux-x86_64.tar.gz" + ) + cmake_tar <- tempfile() + cmake_dir <- tempfile() + try( + download.file(cmake_binary_url, cmake_tar, quiet = quietly), + silent = quietly + ) + untar(cmake_tar, exdir = cmake_dir) + unlink(cmake_tar) + options(.arrow.cleanup = c(getOption(".arrow.cleanup"), cmake_dir)) + cmake <- paste0( + cmake_dir, + "/cmake-", CMAKE_VERSION, "-Linux-x86_64", + "/bin/cmake" + ) + } + cmake +} + +# TODO: move ensure_flex/bison/m4 to cmake: https://issues.apache.org/jira/browse/ARROW-7501 +ensure_flex <- function(m4 = ensure_m4()) { + if (nzchar(Sys.which("flex"))) { + # We already have flex. + # NULL will tell the caller not to append FLEX_ROOT to env vars bc it's not needed + return(NULL) + } + # If not found, download it + cat("*** Downloading and building flex\n") + # Flex 2.6.4 (latest release) causes segfaults on some platforms (ubuntu bionic, debian e.g.) 
+ # See https://github.com/westes/flex/issues/219 + # Allegedly it has been fixed in master but there hasn't been a release since May 2017 + FLEX_VERSION <- Sys.getenv("FLEX_VERSION", "2.6.3") + flex_source_url <- paste0( + "https://github.com/westes/flex/releases/download/v", FLEX_VERSION, + "/flex-", FLEX_VERSION, ".tar.gz" + ) + flex_tar <- tempfile() + flex_dir <- tempfile() + try( + download.file(flex_source_url, flex_tar, quiet = quietly), + silent = quietly + ) + untar(flex_tar, exdir = flex_dir) + unlink(flex_tar) + options(.arrow.cleanup = c(getOption(".arrow.cleanup"), flex_dir)) + # flex also needs m4 + if (!is.null(m4)) { + # If we just built it, put it on PATH for building bison + path <- sprintf('export PATH="%s:$PATH" && ', m4) + } else { + path <- "" + } + # Now, build flex + flex_dir <- paste0(flex_dir, "/flex-", FLEX_VERSION) + cmd <- sprintf("cd %s && ./configure && make", shQuote(flex_dir)) + system(paste0(path, cmd)) + # The built flex should be in ./src. Return that so we can set as FLEX_ROOT + paste0(flex_dir, "/src") +} + +ensure_bison <- function(m4 = ensure_m4()) { + if (nzchar(Sys.which("bison"))) { + # We already have bison. 
+ # NULL will tell the caller not to append BISON_ROOT to env vars bc it's not needed + return(NULL) + } + # If not found, download it + cat("*** Downloading and building bison\n") + BISON_VERSION <- Sys.getenv("BISON_VERSION", "3.5") + source_url <- paste0("https://ftp.gnu.org/gnu/bison/bison-", BISON_VERSION, ".tar.gz") + tar_file <- tempfile() + build_dir <- tempfile() + install_dir <- tempfile() + try( + download.file(source_url, tar_file, quiet = quietly), + silent = quietly + ) + untar(tar_file, exdir = build_dir) + unlink(tar_file) + on.exit(unlink(build_dir)) + options(.arrow.cleanup = c(getOption(".arrow.cleanup"), install_dir)) + # bison also needs m4, so let's make sure we have that too + # (we probably don't if we're here) + if (!is.null(m4)) { + # If we just built it, put it on PATH for building bison + path <- sprintf('export PATH="%s:$PATH" && ', m4) + } else { + path <- "" + } + # Now, build bison + build_dir <- paste0(build_dir, "/bison-", BISON_VERSION) + cmd <- sprintf( + "cd %s && ./configure --prefix=%s && make && make install", + shQuote(build_dir), install_dir + ) + system(paste0(path, cmd)) + install_dir +} + +ensure_m4 <- function() { + if (nzchar(Sys.which("m4"))) { + # We already have m4. + return(NULL) + } + # If not found, download it + cat("*** Downloading and building m4\n") + M4_VERSION <- Sys.getenv("M4_VERSION", "1.4.18") + source_url <- paste0("https://ftp.gnu.org/gnu/m4/m4-", M4_VERSION, ".tar.gz") + tar_file <- tempfile() + dst_dir <- tempfile() + try( + download.file(source_url, tar_file, quiet = quietly), + silent = quietly + ) + untar(tar_file, exdir = dst_dir) + unlink(tar_file) + options(.arrow.cleanup = c(getOption(".arrow.cleanup"), dst_dir)) + # bison also needs m4, so let's make sure we have that too + # (we probably don't if we're here) + + # Now, build it + dst_dir <- paste0(dst_dir, "/m4-", M4_VERSION) + system(sprintf("cd %s && ./configure && make", shQuote(dst_dir))) + # The built m4 should be in ./src. 
Return that so we can put that on the PATH + paste0(dst_dir, "/src") +} + +##### + +if (!file.exists(paste0(dst_dir, "/include/arrow/api.h"))) { + # If we're working in a local checkout and have already built the libs, we + # don't need to do anything. Otherwise, + # (1) Look for a prebuilt binary for this version + bin_file <- src_dir <- NULL + if (download_ok) { + bin_file <- download_binary() + } + if (!is.null(bin_file)) { + cat(sprintf("*** Successfully retrieved C++ binaries for %s\n", os)) + # Extract them + dir.create(dst_dir, showWarnings = !quietly, recursive = TRUE) + unzip(bin_file, exdir = dst_dir) + unlink(bin_file) + } else if (build_ok) { + # (2) Find source and build it + if (download_ok) { + src_dir <- download_source() + } + if (is.null(src_dir)) { + src_dir <- find_local_source() + } + if (!is.null(src_dir)) { + cat("*** Building C++ libraries\n") + build_libarrow(src_dir, dst_dir) + } + } else { + cat("*** Proceeding without C++ dependencies\n") + } +} diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd new file mode 100644 index 00000000000..766076e0016 --- /dev/null +++ b/r/vignettes/install.Rmd @@ -0,0 +1,269 @@ +--- +title: "Installing the Arrow Package on Linux" +description: "" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Installing the Arrow Package on Linux} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +On macOS and Windows, when you `install.packages("arrow")`, +you get a binary package that contains Arrow’s C++ dependencies along with it. +On Linux, `install.packages()` retrieves a source package that has to be compiled locally, +and C++ dependencies need to be resolved as well. +Generally for R packages with C++ dependencies, +this requires either installing system packages, which you may not have privileges to do, +or building the C++ dependencies separately, +which introduces all sorts of additional ways for things to go wrong. 
+ +Our goal is to make `install.packages("arrow")` "just work" for as many Linux distributions, +versions, and configurations as possible. +This document describes how it works and the options for fine-tuning Linux installation. +The intended audience for this document is `arrow` R package users on Linux, not developers. +If you're contributing to the Arrow project, +you'll probably want to manage your C++ installation more directly. +Note also that if you use `conda` to manage your R environment, this document does not apply. +You can `conda install -c conda-forge r-arrow` and you'll get the latest official +release of the R package along with any C++ dependencies. + +# Installation basics + +Install the latest release of `arrow` from CRAN with + +```r +install.packages("arrow") +``` + +Daily development builds, which are not official releases, +can be installed from the Ursa Labs repository: + +```r +install.packages("arrow", repos = "https://dl.bintray.com/ursalabs/arrow-r") +``` + +There currently are no daily `conda` builds. + +You can also install the R package from a git checkout: + +```shell +git clone https://github.com/apache/arrow +cd arrow/r +R CMD INSTALL . +``` + + + +# How dependencies are resolved + +In order for the `arrow` R package to work, it needs the Arrow C++ library. +There are a number of ways you can get it: a system package; a library you've +built yourself outside of the context of installing the R package; +or, if you don't already have it, the R package will attempt to resolve it +automatically when it installs. + +If you are authorized to install system packages, and you're installing a CRAN release, +you may want to use the official Apache Arrow release packages corresponding to the R package version. +See the [Arrow project installation page](https://arrow.apache.org/install/) +to find pre-compiled binary packages for some common Linux distributions, +including Debian, Ubuntu, and CentOS. 
+You'll need to install `libparquet-dev` on Debian and Ubuntu, or `parquet-devel` on CentOS. +This will also automatically install the Arrow C++ library as a dependency. + +When you install the `arrow` R package on Linux, +it will first attempt to find the Arrow C++ libraries on your system using +the `pkg-config` command. +This will find either installed system packages or libraries you've built yourself. +In order for `install.packages("arrow")` to work with these system packages, +you'll need to install them before installing the R package. + +If no Arrow C++ libraries are found on the system, +the R package installation script will next attempt to download +prebuilt static Arrow C++ libraries, hosted by Ursa Labs, +that match both your local operating system and `arrow` R package version. +If found, they will be downloaded and bundled when your R package compiles. + + +If no binary is found, it will download the Arrow C++ source that matches the R package version +(CRAN release or nightly build) and attempt to build it locally. +If no matching source bundle is found, it will also look to see if you are in +a checkout of the `apache/arrow` git repository and thus have the C++ source there. +Depending on your system, building Arrow C++ from source likely will be slow; +consequently, it is designed to happen only when you +run `install.packages("arrow")` or `R CMD INSTALL` but not when running `R CMD check`, +unless you've set the `NOT_CRAN=true` environment variable. + +For the mechanics of how all this works, see the R package `configure` script, +which calls `tools/linuxlibs.R`. +If the C++ library is built from source, `inst/build_arrow_static.sh` is executed. +This build script is also what is used to generate the prebuilt binaries.
+ +# Troubleshooting and additional options + +The intent is that `install.packages("arrow")` will just work and handle all C++ +dependencies, but depending on your system, you may have better results if you +tune one of several parameters. Here are some known complications and ways to address them. + +## Using system libraries + +If a system library or other installed Arrow is found but it doesn't match the R package version +(for example, you have libarrow 0.14 on your system and are installing R package 0.15.1), +it is likely that the R bindings will fail to compile. +Because the Apache Arrow project is under active development, +it is essential that versions of the C++ and R libraries match. +When `install.packages("arrow")` has to download the C++ libraries, +the install script ensures that you fetch the C++ libraries that correspond to your R package version. +However, if you are using Arrow libraries already on your system, version match isn't guaranteed. + +To fix a version mismatch, you can either update your system packages to match the R package version, +or set the environment variable `ARROW_USE_PKG_CONFIG=FALSE` +to tell the configure script not to look for system Arrow packages. +System packages are available corresponding to all CRAN releases +but not for nightly or dev versions, so depending on the R package version you're installing, +system packages may not be an option. + +Note also that once you have a working R package installation based on system (shared) libraries, +if you update your system Arrow, you'll need to reinstall the R package to match its version. + +## Using a local Arrow C++ build + +If you've built the Arrow C++ libraries locally from source +but haven't installed them where `pkg-config` will find them, +there are a few options for telling the R package how to locate them.
+You can set `PKG_CONFIG_PATH` to `/path/to/your/installation/pkgconfig` +(that is, `PKG_CONFIG_PATH=${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig`, +if you've set those variables). +Alternatively, you can set the `INCLUDE_DIR` and `LIB_DIR` environment variables +to point to their location. + +If the package fails to install/load with an error like this: + +``` +** testing if installed package can be loaded from temporary location +Error: package or namespace load failed for 'arrow' in dyn.load(file, DLLpath = DLLpath, ...): +unable to load shared object '/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so': +dlopen(/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: @rpath/libarrow.14.dylib +``` + +try setting the environment variable `R_LD_LIBRARY_PATH` to wherever Arrow C++ +was put in `make install`, e.g. `export R_LD_LIBRARY_PATH=/usr/local/lib`, and +retry installing the R package. + +## Using prebuilt binaries + +If the R package finds and downloads a prebuilt binary of the C++ library, +but then the `arrow` package can't be loaded, perhaps with "undefined symbols" errors, +please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues). +This is likely a compiler mismatch and may be resolvable by setting some +environment variables to instruct R to compile the packages to match the C++ library. + +A workaround would be to set the environment variable `LIBARROW_BINARY_DISTRO=FALSE` +and retry installation: this value instructs the package to build the C++ library from source +instead of downloading the prebuilt binary. +That should guarantee that the compiler settings match. + +If a prebuilt binary wasn't found for your operating system but you think it should have been, +check the logs for a message that says `*** Unable to identify current OS/version`, +or a message that says `*** No C++ binaries found for` an invalid OS. 
+If you see either, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues). +You may also set the environment variable `ARROW_R_DEV=TRUE` for additional +debug messages. + +A workaround would be to set the environment variable `LIBARROW_BINARY_DISTRO` +to a `distribution-version` that exists in the Ursa Labs repository. +Setting `LIBARROW_BINARY_DISTRO` is also an option when there's not an exact match +for your OS but a similar version would work, +such as if you're on `ubuntu-18.10` and there's only a binary for `ubuntu-18.04`. + + + + + +## Building C++ from source + +If building the C++ library from source fails, check the error message. +The install script attempts to install any necessary build dependencies, +but it's possible that some operating systems may require additional ones. +You may be able to install them and retry. +Regardless, if the C++ library fails to compile, +please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) +so that we can attempt to improve the script. + +### Known C++ build issues + +(1) `m4` (build dependency for `flex` and `bison`, which are build dependencies for `thrift`) +fails to build with a message like: + +``` +freadahead.c: In function 'freadahead': +freadahead.c:92:3: error: #error "Please port gnulib freadahead.c to your platform! Look at the definition of fflush, fread, ungetc on your system, then report this to bug-gnulib." +``` + +This has been observed on CentOS 8 (Docker image `rstudio/r-base:3.6-centos8`) +and Fedora (`rhub/fedora-clang-devel`). + +A solution is to install `m4` using your system package manager; +if that's an option for you, you may as well just install `flex` and `bison` +and avoid this build step entirely. + + + +## Summary of build environment variables + +By default, these are all unset. + +* `ARROW_USE_PKG_CONFIG`: If set to `FALSE`, the configure script + won't look for Arrow libraries on your system and instead will look to download/build them. 
+ Use this if you have a version mismatch between installed system libraries + and the version of the R package you're installing. +* `LIBARROW_DOWNLOAD`: If set to `FALSE` (case insensitive), the build script + will not attempt to download C++ binary or source bundles. + Use this if you're in a checkout of the `apache/arrow` git repository + and want to build the C++ library from the local source. +* `LIBARROW_BUILD`: If set to `FALSE` (case insensitive), the build script + will not attempt to build the C++ library from source. This means you will only get + a working `arrow` R package if a prebuilt binary is found. + Use this if you want to avoid compiling the C++ library, which may be slow + and resource-intensive, and ensure that you only use a prebuilt binary. +* `LIBARROW_BINARY_DISTRO`: If set to `FALSE` (case insensitive), + the script will not download a binary, but it may still download a source bundle. + You may also set it to some other string, + a related "distro-version" that has binaries built that work for your OS. +* `NOT_CRAN`: If this variable is set to `true`, as the `devtools` package does, + the build script will attempt to download and/or build the Arrow C++ library, if necessary, + even when running `R CMD check`. Otherwise, it will only download/build the C++ library + if you're not running `R CMD check`. + The purpose of this protection is to avoid expensive compilation in + automated testing environments (unless you opt in). +* `ARROW_R_DEV`: If set to `TRUE`, more verbose messaging will be printed + in the build script. This variable also is needed if you're modifying `Rcpp` + code in the package: see "Editing Rcpp code" in the README. +* `DEBUG_DIR`: If the C++ library building from source fails (`cmake`), + there may be messages telling you to check some log file in the build directory. + However, when the library is built during R package installation, + that location is in a temp directory that is already deleted.
+ To capture those logs, set this variable to an absolute (not relative) path + and the log files will be copied there. + The directory will be created if it does not exist. + +# Contributing + +As mentioned above, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) +if you encounter ways to improve this. If you find that your Linux distribution +or version is not supported, we welcome the contribution of Docker images +(hosted on Docker Hub) that we can use in our continuous integration. These +Docker images should be minimal, containing only R and the dependencies it +requires. (For reference, see the images that +[R-hub](https://github.com/r-hub/rhub-linux-builders) uses.) + +You can test the `arrow` R package installation using the `docker-compose` +setup included in the `apache/arrow` git repository. For example, + +``` +R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose build r +R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose run r +``` + +installs the `arrow` R package, including the C++ source build, on the +[rhub/ubuntu-gcc-release](https://hub.docker.com/r/rhub/ubuntu-gcc-release) +image.