From 865c24cc079cfca90c79229aff4180739676550a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 17 Jan 2023 05:51:31 +0100 Subject: [PATCH 01/11] Add numpy version pin to docker-compose.yml --- docker-compose.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 12071a57bd3..31d193d655f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1783,6 +1783,10 @@ services: repo: ${REPO} arch: ${ARCH} python: ${PYTHON} + # https://github.com/apache/arrow/issues/33697 + # numpy version pin should be removed with new apache spark release + # that includes https://github.com/apache/spark/pull/37817 + numpy: 1.23 jdk: ${JDK} # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should # be set to ${MAVEN} From 4d5915a48ae7fb0cc9ad43264408396a1f7f4463 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 17 Jan 2023 08:16:30 +0100 Subject: [PATCH 02/11] Add numpy to mamba install in conda-python-spark.dockerfile --- ci/docker/conda-python-spark.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 861d83fe607..05d93807d43 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -26,6 +26,7 @@ ARG maven=3.5 RUN mamba install -q -y \ openjdk=${jdk} \ maven=${maven} \ + numpy=${numpy} \ pandas && \ mamba clean --all From daec93619c424bdaf10e7a1060ddb4da5700238f Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 17 Jan 2023 10:01:19 +0100 Subject: [PATCH 03/11] Remove change in docker-compose.yml and add version pin in conda-python-spark.dockerfile --- ci/docker/conda-python-spark.dockerfile | 4 ++++ docker-compose.yml | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 05d93807d43..a7244560249 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -22,6 +22,10 @@ FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 ARG maven=3.5 +# https://github.com/apache/arrow/issues/33697 +# numpy version pin should be removed with new apache spark release +# that includes https://github.com/apache/spark/pull/37817 +ARG numpy=1.23 RUN mamba install -q -y \ openjdk=${jdk} \ diff --git a/docker-compose.yml b/docker-compose.yml index 31d193d655f..12071a57bd3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1783,10 +1783,6 @@ services: repo: ${REPO} arch: ${ARCH} python: ${PYTHON} - # https://github.com/apache/arrow/issues/33697 - # numpy version pin should be removed with new apache spark release - # that includes https://github.com/apache/spark/pull/37817 - numpy: 1.23 jdk: ${JDK} # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should # be set to ${MAVEN} From d86c6a90387852e7dd9ea4f819bbda980106bf2d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 23 Feb 2023 12:08:41 +0100 Subject: [PATCH 04/11] Remove numpy pin from ci/docker/conda-python-spark.dockerfile and add it to tasks.yml --- ci/docker/conda-python-spark.dockerfile | 5 ----- dev/tasks/tasks.yml | 7 ++++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index a7244560249..861d83fe607 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -22,15 +22,10 @@ FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 ARG maven=3.5 -# https://github.com/apache/arrow/issues/33697 -# numpy version pin should be removed with new apache spark release -# that includes https://github.com/apache/spark/pull/37817 -ARG numpy=1.23 RUN mamba install -q -y \ openjdk=${jdk} \ maven=${maven} \ - numpy=${numpy} \ pandas && \ mamba clean --all diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 5fe33dc28b0..d2471af8e83 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1587,9 +1587,9 @@ tasks: image: conda-python-hdfs {% endfor %} -{% for python_version, spark_version, test_pyarrow_only in [("3.7", "v3.1.2", "false"), - ("3.8", "v3.2.0", "false"), - ("3.9", "master", "false")] %} +{% for python_version, spark_version, test_pyarrow_only, numpy_version in [("3.7", "v3.1.2", "false", "latest"), + ("3.8", "v3.2.0", "false", "1.23"), + ("3.9", "master", "false", "latest")] %} test-conda-python-{{ python_version }}-spark-{{ spark_version }}: ci: github template: docker-tests/github.linux.yml @@ -1598,6 +1598,7 @@ tasks: PYTHON: "{{ python_version }}" SPARK: "{{ spark_version }}" TEST_PYARROW_ONLY: "{{ test_pyarrow_only }}" + NUMPY: "{{ numpy_version }}" # use the branch-3.0 of spark, so prevent reusing any layers flags: --no-leaf-cache image: conda-python-spark From 7f768993ecca24cd01e76c8f552282f163491f3e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 23 Feb 2023 14:56:54 +0100 Subject: [PATCH 05/11] Pass numpy to docker-compose.yml and conda-python-spark.dockerfile --- ci/docker/conda-python-spark.dockerfile | 3 +++ docker-compose.yml | 1 + 2 files changed, 4 insertions(+) diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 861d83fe607..3e14fc68422 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -22,10 +22,13 @@ FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 ARG maven=3.5 +ARG numpy=latest +RUN mamba uninstall -q -y numpy RUN mamba install -q -y \ openjdk=${jdk} \ maven=${maven} \ + numpy=${numpy} \ pandas && \ mamba clean --all diff --git a/docker-compose.yml b/docker-compose.yml index 12071a57bd3..c9b02c45d1c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1788,6 +1788,7 @@ services: # be set to ${MAVEN} maven: 3.5 spark: ${SPARK} + numpy: ${NUMPY} shm_size: *shm-size environment: <<: *ccache From b1b776d66fb6326984eb9b87558d347ab1ab3b9a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 27 Feb 2023 13:27:55 +0100 Subject: [PATCH 06/11] Try installing numpy with /ci/scripts/install_numpy.sh --- ci/docker/conda-python-spark.dockerfile | 5 +++-- ci/scripts/install_numpy.sh | 28 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 ci/scripts/install_numpy.sh diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 3e14fc68422..6994a1110dc 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -22,13 +22,14 @@ FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 ARG maven=3.5 + ARG numpy=latest +RUN mamba uninstall -q -y numpy && \ + /arrow/ci/scripts/install_numpy.sh ${numpy} -RUN mamba uninstall -q -y numpy RUN mamba install -q -y \ openjdk=${jdk} \ maven=${maven} \ - numpy=${numpy} \ pandas && \ mamba clean --all diff --git a/ci/scripts/install_numpy.sh b/ci/scripts/install_numpy.sh new file mode 100644 index 00000000000..741605a3320 --- /dev/null +++ b/ci/scripts/install_numpy.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +numpy=${1:-"latest"} + +if [ "${numpy}" = "latest" ]; then + pip install numpy +else + pip install numpy==${numpy} +fi From 9743842dd5c6a56275ad2750fa9556efa8803069 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 27 Feb 2023 13:41:28 +0100 Subject: [PATCH 07/11] Maybe this will help installing numpy correctly --- ci/scripts/install_numpy.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/scripts/install_numpy.sh b/ci/scripts/install_numpy.sh index 741605a3320..34f07eed1bd 100644 --- a/ci/scripts/install_numpy.sh +++ b/ci/scripts/install_numpy.sh @@ -19,6 +19,11 @@ set -e +if [ "$#" -lt 1 ]; then + echo "Usage: $0 " + exit 1 +fi + numpy=${1:-"latest"} if [ "${numpy}" = "latest" ]; then From efdf9fce559aa9ed8f386868ecca1b56e96179f8 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 27 Feb 2023 14:04:31 +0100 Subject: [PATCH 08/11] One more try --- ci/docker/conda-python-spark.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 6994a1110dc..356419958d4 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -24,6 +24,7 @@ ARG jdk=8 ARG maven=3.5 ARG numpy=latest +COPY ci/scripts/install_numpy.sh /arrow/ci/scripts/ RUN mamba uninstall -q -y numpy && \ /arrow/ci/scripts/install_numpy.sh ${numpy} From b1b3b996e4544c99687693b03b22e61709b081e4 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 28 Feb 2023 12:20:12 +0100 Subject: [PATCH 09/11] Change install_numpy.sh file permission --- ci/scripts/install_numpy.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 ci/scripts/install_numpy.sh diff --git a/ci/scripts/install_numpy.sh b/ci/scripts/install_numpy.sh old mode 100644 new mode 100755 From d56f4b722aa95b71ab941d637e3f8e030980537e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 28 Feb 2023 17:17:46 +0100 Subject: [PATCH 10/11] Try running mamba before uninstalling&installing numpy --- ci/docker/conda-python-spark.dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 356419958d4..0d89c960621 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -25,14 +25,14 @@ ARG maven=3.5 ARG numpy=latest COPY ci/scripts/install_numpy.sh /arrow/ci/scripts/ -RUN mamba uninstall -q -y numpy && \ - /arrow/ci/scripts/install_numpy.sh ${numpy} RUN mamba install -q -y \ openjdk=${jdk} \ maven=${maven} \ pandas && \ mamba clean --all +RUN mamba uninstall -q -y numpy && \ + /arrow/ci/scripts/install_numpy.sh ${numpy} # installing specific version of spark ARG spark=master From 1ebe2768a20d774329aa68dff7867b531c9bc3e0 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 1 Mar 2023 08:45:54 +0100 Subject: [PATCH 11/11] Apply suggestions from code review - kou Co-authored-by: Sutou Kouhei --- ci/docker/conda-python-spark.dockerfile | 4 ++-- ci/scripts/install_numpy.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 0d89c960621..58e3d5e5d56 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -30,8 +30,8 @@ RUN mamba install -q -y \ openjdk=${jdk} \ maven=${maven} \ pandas && \ - mamba clean --all -RUN mamba uninstall -q -y numpy && \ + mamba clean --all && \ + mamba uninstall -q -y numpy && \ /arrow/ci/scripts/install_numpy.sh ${numpy} # installing specific version of spark diff --git a/ci/scripts/install_numpy.sh b/ci/scripts/install_numpy.sh index 34f07eed1bd..f04fe81b669 100755 --- a/ci/scripts/install_numpy.sh +++ b/ci/scripts/install_numpy.sh @@ -19,7 +19,7 @@ set -e -if [ "$#" -lt 1 ]; then +if [ $# -gt 1 ]; then echo "Usage: $0 " exit 1 fi