From 496e89bc95d8c85c63278a5a742fd2f8b87c386f Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 10 Sep 2021 17:28:42 -1000 Subject: [PATCH 01/12] ARROW-13685: Changed CreateDir to test for bucket existence explicitly instead of relying on CreateBucket to do so as CreateBucket can fail for permission denied reasons --- ci/etc/minio-no-create-bucket-policy.json | 20 ++++++++++++++++++ cpp/src/arrow/filesystem/s3fs.cc | 25 ++++++++++++++++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 ci/etc/minio-no-create-bucket-policy.json diff --git a/ci/etc/minio-no-create-bucket-policy.json b/ci/etc/minio-no-create-bucket-policy.json new file mode 100644 index 00000000000..53db194cfa2 --- /dev/null +++ b/ci/etc/minio-no-create-bucket-policy.json @@ -0,0 +1,20 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:ListAllMyBuckets", + "s3:PutObject", + "s3:GetObject", + "s3:ListBucket", + "s3:PutObjectTagging", + "s3:DeleteObject", + "s3:GetObjectVersion" + ], + "Resource": [ + "arn:aws:s3:::*" + ] + } + ] +} diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 3c443ed5518..12a48c5c545 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -66,8 +66,6 @@ #include #include -#include "arrow/util/windows_fixup.h" - #include "arrow/buffer.h" #include "arrow/filesystem/filesystem.h" #include "arrow/filesystem/path_util.h" @@ -87,6 +85,7 @@ #include "arrow/util/optional.h" #include "arrow/util/task_group.h" #include "arrow/util/thread_pool.h" +#include "arrow/util/windows_fixup.h" namespace arrow { @@ -1530,6 +1529,23 @@ class S3FileSystem::Impl : public std::enable_shared_from_this BucketExists(const std::string& bucket) { + S3Model::HeadBucketRequest req; + req.SetBucket(ToAwsString(bucket)); + + auto outcome = client_->HeadBucket(req); + if (!outcome.IsSuccess()) { + if (!IsNotFound(outcome.GetError())) { + return ErrorToStatus( + std::forward_as_tuple("When testing for bucket existence '", bucket, "': "), + outcome.GetError()); + } + return false; + } + return true; + } + // Create a bucket. Successful if bucket already exists. Status CreateBucket(const std::string& bucket) { S3Model::CreateBucketConfiguration config; @@ -2159,7 +2175,10 @@ Status S3FileSystem::CreateDir(const std::string& s, bool recursive) { // Create object if (recursive) { // Ensure bucket exists - RETURN_NOT_OK(impl_->CreateBucket(path.bucket)); + ARROW_ASSIGN_OR_RAISE(bool bucket_exists, impl_->BucketExists(path.bucket)); + if (!bucket_exists) { + RETURN_NOT_OK(impl_->CreateBucket(path.bucket)); + } // Ensure that all parents exist, then the directory itself std::string parent_key; for (const auto& part : path.key_parts) { From 8911f3fcbed48708396c51e1686906a586ef43ed Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 16 Sep 2021 00:49:59 -1000 Subject: [PATCH 02/12] ARROW-13685: Created an integration test for minio with limited permissions --- ci/docker/conda-python-minio.dockerfile | 22 ++++++++ ci/etc/minio-no-create-bucket-policy.json | 20 ------- ci/scripts/install_minio.sh | 2 + ci/scripts/integration_minio.py | 24 +++++++++ ci/scripts/integration_minio.sh | 63 +++++++++++++++++++++++ docker-compose.yml | 29 +++++++++++ 6 files changed, 140 insertions(+), 20 deletions(-) create mode 100644 ci/docker/conda-python-minio.dockerfile delete mode 100644 ci/etc/minio-no-create-bucket-policy.json create mode 100644 ci/scripts/integration_minio.py create mode 100755 ci/scripts/integration_minio.sh diff --git a/ci/docker/conda-python-minio.dockerfile b/ci/docker/conda-python-minio.dockerfile new file mode 100644 index 00000000000..fa26f70c039 --- /dev/null +++ b/ci/docker/conda-python-minio.dockerfile @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG repo +ARG arch=amd64 +ARG python=3.6 +FROM ${repo}:${arch}-conda-python-${python} + diff --git a/ci/etc/minio-no-create-bucket-policy.json b/ci/etc/minio-no-create-bucket-policy.json deleted file mode 100644 index 53db194cfa2..00000000000 --- a/ci/etc/minio-no-create-bucket-policy.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:ListAllMyBuckets", - "s3:PutObject", - "s3:GetObject", - "s3:ListBucket", - "s3:PutObjectTagging", - "s3:DeleteObject", - "s3:GetObjectVersion" - ], - "Resource": [ - "arn:aws:s3:::*" - ] - } - ] -} diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 42f7ce040e0..4b61a35b208 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -49,4 +49,6 @@ elif [[ ${version} != "latest" ]]; then fi wget -nv -P ${prefix}/bin https://dl.min.io/server/minio/release/${platform}-${arch}/minio +wget -nv -P ${prefix}/bin https://dl.min.io/client/mc/release/${platform}-${arch}/mc chmod +x ${prefix}/bin/minio +chmod +x ${prefix}/bin/mc diff --git a/ci/scripts/integration_minio.py b/ci/scripts/integration_minio.py new file mode 100644 index 00000000000..9bd01f3a27e --- /dev/null +++ b/ci/scripts/integration_minio.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file is called from the integration_minio crossbow job + +import pyarrow.fs as fs + +def test_can_create_bucket(): + filesystem = fs.S3FileSystem(access_key='limited', secret_key='limited123', endpoint_override='http://127.0.0.1:9000') + filesystem.create_dir('existing-bucket/foo') # This line fails without the change diff --git a/ci/scripts/integration_minio.sh b/ci/scripts/integration_minio.sh new file mode 100755 index 00000000000..0e488b8d94d --- /dev/null +++ b/ci/scripts/integration_minio.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +source_dir=${1} +build_dir=${2} + +mkdir ./minio_integration + +# Not ideal but need to run minio server in the background +# and ensure it is up and running. +minio server ./minio_integration --address 127.0.0.1:9000 & +sleep 30 + +echo '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:ListAllMyBuckets", + "s3:PutObject", + "s3:GetObject", + "s3:ListBucket", + "s3:PutObjectTagging", + "s3:DeleteObject", + "s3:GetObjectVersion" + ], + "Resource": [ + "arn:aws:s3:::*" + ] + } + ] +}' > ./limited_policy.json + +# Create a user with limited permissions +mc alias set myminio http://127.0.0.1:9000 minioadmin minioadmin +mc admin policy add myminio/ no-create-buckets ./limited_policy.json +mc admin user add myminio/ limited limited123 +mc admin policy set myminio no-create-buckets user=limited +# Create a bucket +mc mb myminio/existing-bucket + +# Run Arrow tests relying on limited permissions user + +python -mpytest ${source_dir}/ci/scripts/integration_minio.py diff --git a/docker-compose.yml b/docker-compose.yml index 482a9e0af31..ed66e0c2dbc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -104,6 +104,7 @@ x-hierarchy: - conda-python-dask - conda-python-hdfs - conda-python-jpype + - conda-python-minio - conda-python-turbodbc - conda-python-kartothek - conda-python-spark @@ -967,6 +968,34 @@ services: /arrow/ci/scripts/java_build.sh /arrow /build && /arrow/ci/scripts/python_test.sh /arrow"] + conda-python-minio: + # Usage: + # docker-compose build conda + # docker-compose build conda-cpp + # docker-compose build conda-python + # docker-compose build conda-python-minio + # docker-compose run --rm conda-python-minio + image: ${REPO}:${ARCH}-conda-python-${PYTHON}-minio + build: + context: . + dockerfile: ci/docker/conda-python-minio.dockerfile + cache_from: + - ${REPO}:${ARCH}-conda-python-${PYTHON}-minio + args: + repo: ${REPO} + arch: ${ARCH} + python: ${PYTHON} + shm_size: *shm-size + environment: + <<: *ccache + ARROW_FLIGHT: "OFF" + ARROW_GANDIVA: "OFF" + volumes: *conda-volumes + command: + ["/arrow/ci/scripts/cpp_build.sh /arrow /build && + /arrow/ci/scripts/python_build.sh /arrow /build && + /arrow/ci/scripts/integration_minio.sh /arrow /build"] + conda-python-turbodbc: # Possible $TURBODBC parameters: # - `latest`: latest release From 2eea041855f724df594a2fe183c01f8d0654e2b1 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 16 Sep 2021 00:51:31 -1000 Subject: [PATCH 03/12] ARROW-13685: Reverted header ordering change --- cpp/src/arrow/filesystem/s3fs.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 12a48c5c545..84dbcf8c7e4 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -66,6 +66,8 @@ #include #include +#include "arrow/util/windows_fixup.h" + #include "arrow/buffer.h" #include "arrow/filesystem/filesystem.h" #include "arrow/filesystem/path_util.h" @@ -85,7 +87,6 @@ #include "arrow/util/optional.h" #include "arrow/util/task_group.h" #include "arrow/util/thread_pool.h" -#include "arrow/util/windows_fixup.h" namespace arrow { From 06deba448969ef48b06d6ce0ef404e9f6114c235 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 16 Sep 2021 00:53:05 -1000 Subject: [PATCH 04/12] ARROW-13685: Touched up error per PR review --- cpp/src/arrow/filesystem/s3fs.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 84dbcf8c7e4..314abdf3393 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -1538,9 +1538,9 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisHeadBucket(req); if (!outcome.IsSuccess()) { if (!IsNotFound(outcome.GetError())) { - return ErrorToStatus( - std::forward_as_tuple("When testing for bucket existence '", bucket, "': "), - outcome.GetError()); + return ErrorToStatus(std::forward_as_tuple( + "When testing for existence of bucket '", bucket, "': "), + outcome.GetError()); } return false; } From bc43ad1b0890cc5a4596f2020c767c6895adc97d Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 17 Sep 2021 13:14:55 -1000 Subject: [PATCH 05/12] ARROW-13685: Removed needless minio dockerfile. Added minio to nightly tasks --- dev/tasks/tasks.yml | 6 ++++++ docker-compose.yml | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 70f7c6c6d15..39a69147afe 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -942,6 +942,12 @@ tasks: image: conda-python {% endfor %} + test-conda-python-minio: + ci: github + template: docker-tests/github.linux.yml + params: + image: conda-python-minio + test-conda-python-3.8-hypothesis: ci: github template: docker-tests/github.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index ed66e0c2dbc..efef5be2fff 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -973,12 +973,11 @@ services: # docker-compose build conda # docker-compose build conda-cpp # docker-compose build conda-python - # docker-compose build conda-python-minio # docker-compose run --rm conda-python-minio image: ${REPO}:${ARCH}-conda-python-${PYTHON}-minio build: context: . - dockerfile: ci/docker/conda-python-minio.dockerfile + dockerfile: ci/docker/conda-python.dockerfile cache_from: - ${REPO}:${ARCH}-conda-python-${PYTHON}-minio args: From 54c6aa5bf0c61c18288f268484cfd9daaeb4029a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 17 Sep 2021 15:16:51 -1000 Subject: [PATCH 06/12] ARROW-13685: Updated comments and cleaned up test --- ci/scripts/integration_minio.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ci/scripts/integration_minio.py b/ci/scripts/integration_minio.py index 9bd01f3a27e..1b54801c2d9 100644 --- a/ci/scripts/integration_minio.py +++ b/ci/scripts/integration_minio.py @@ -16,9 +16,16 @@ # under the License. # This file is called from the integration_minio crossbow job +# It is intended to test S3 integration features that cannot +# be tested in unit tests against a simple minio setup. + +# For example, issues which require a specific configuration +# of API permissions to reproduce. import pyarrow.fs as fs def test_can_create_bucket(): - filesystem = fs.S3FileSystem(access_key='limited', secret_key='limited123', endpoint_override='http://127.0.0.1:9000') - filesystem.create_dir('existing-bucket/foo') # This line fails without the change + filesystem = fs.S3FileSystem(access_key='limited', + secret_key='limited123', + endpoint_override='http://127.0.0.1:9000') + filesystem.create_dir('existing-bucket/foo') From 23e56a23d657cff67a94d18a2752c85329a24031 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 17 Sep 2021 15:20:05 -1000 Subject: [PATCH 07/12] ARROW-13685: Forgot to remove unneeded dockerfile from the repo --- ci/docker/conda-python-minio.dockerfile | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 ci/docker/conda-python-minio.dockerfile diff --git a/ci/docker/conda-python-minio.dockerfile b/ci/docker/conda-python-minio.dockerfile deleted file mode 100644 index fa26f70c039..00000000000 --- a/ci/docker/conda-python-minio.dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG repo -ARG arch=amd64 -ARG python=3.6 -FROM ${repo}:${arch}-conda-python-${python} - From 765a2c036bd0757586937ef1e55ba20e9adc743c Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 21 Sep 2021 13:26:09 -1000 Subject: [PATCH 08/12] ARROW-13685: Moved mc-based integration tests into the main python test suite instead of a dedicated integration test. --- ci/scripts/integration_minio.py | 31 ------ ci/scripts/integration_minio.sh | 63 ------------ docker-compose.yml | 28 ------ python/pyarrow/tests/conftest.py | 6 +- python/pyarrow/tests/parquet/conftest.py | 12 +-- python/pyarrow/tests/test_dataset.py | 8 +- python/pyarrow/tests/test_fs.py | 122 +++++++++++++++++++++-- 7 files changed, 131 insertions(+), 139 deletions(-) delete mode 100644 ci/scripts/integration_minio.py delete mode 100755 ci/scripts/integration_minio.sh diff --git a/ci/scripts/integration_minio.py b/ci/scripts/integration_minio.py deleted file mode 100644 index 1b54801c2d9..00000000000 --- a/ci/scripts/integration_minio.py +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This file is called from the integration_minio crossbow job -# It is intended to test S3 integration features that cannot -# be tested in unit tests against a simple minio setup. - -# For example, issues which require a specific configuration -# of API permissions to reproduce. - -import pyarrow.fs as fs - -def test_can_create_bucket(): - filesystem = fs.S3FileSystem(access_key='limited', - secret_key='limited123', - endpoint_override='http://127.0.0.1:9000') - filesystem.create_dir('existing-bucket/foo') diff --git a/ci/scripts/integration_minio.sh b/ci/scripts/integration_minio.sh deleted file mode 100755 index 0e488b8d94d..00000000000 --- a/ci/scripts/integration_minio.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -source_dir=${1} -build_dir=${2} - -mkdir ./minio_integration - -# Not ideal but need to run minio server in the background -# and ensure it is up and running. -minio server ./minio_integration --address 127.0.0.1:9000 & -sleep 30 - -echo '{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:ListAllMyBuckets", - "s3:PutObject", - "s3:GetObject", - "s3:ListBucket", - "s3:PutObjectTagging", - "s3:DeleteObject", - "s3:GetObjectVersion" - ], - "Resource": [ - "arn:aws:s3:::*" - ] - } - ] -}' > ./limited_policy.json - -# Create a user with limited permissions -mc alias set myminio http://127.0.0.1:9000 minioadmin minioadmin -mc admin policy add myminio/ no-create-buckets ./limited_policy.json -mc admin user add myminio/ limited limited123 -mc admin policy set myminio no-create-buckets user=limited -# Create a bucket -mc mb myminio/existing-bucket - -# Run Arrow tests relying on limited permissions user - -python -mpytest ${source_dir}/ci/scripts/integration_minio.py diff --git a/docker-compose.yml b/docker-compose.yml index efef5be2fff..482a9e0af31 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -104,7 +104,6 @@ x-hierarchy: - conda-python-dask - conda-python-hdfs - conda-python-jpype - - conda-python-minio - conda-python-turbodbc - conda-python-kartothek - conda-python-spark @@ -968,33 +967,6 @@ services: /arrow/ci/scripts/java_build.sh /arrow /build && /arrow/ci/scripts/python_test.sh /arrow"] - conda-python-minio: - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose run --rm conda-python-minio - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-minio - build: - context: . - dockerfile: ci/docker/conda-python.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-minio - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - shm_size: *shm-size - environment: - <<: *ccache - ARROW_FLIGHT: "OFF" - ARROW_GANDIVA: "OFF" - volumes: *conda-volumes - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/integration_minio.sh /arrow /build"] - conda-python-turbodbc: # Possible $TURBODBC parameters: # - `latest`: latest release diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 40836867f5f..8fa520b9398 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -292,7 +292,11 @@ def s3_server(s3_connection): except OSError: pytest.skip('`minio` command cannot be located') else: - yield proc + yield { + 'connection': s3_connection, + 'process': proc, + 'tempdir': tempdir + } finally: if proc is not None: proc.kill() diff --git a/python/pyarrow/tests/parquet/conftest.py b/python/pyarrow/tests/parquet/conftest.py index ead9affc3cc..1e75493cdae 100644 --- a/python/pyarrow/tests/parquet/conftest.py +++ b/python/pyarrow/tests/parquet/conftest.py @@ -26,11 +26,11 @@ def datadir(base_datadir): @pytest.fixture -def s3_bucket(request, s3_connection, s3_server): +def s3_bucket(s3_server): boto3 = pytest.importorskip('boto3') botocore = pytest.importorskip('botocore') - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] s3 = boto3.resource( 's3', endpoint_url='http://{}:{}'.format(host, port), @@ -49,10 +49,10 @@ def s3_bucket(request, s3_connection, s3_server): @pytest.fixture -def s3_example_s3fs(s3_connection, s3_server, s3_bucket): +def s3_example_s3fs(s3_server, s3_bucket): s3fs = pytest.importorskip('s3fs') - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] fs = s3fs.S3FileSystem( key=access_key, secret=secret_key, @@ -72,10 +72,10 @@ def s3_example_s3fs(s3_connection, s3_server, s3_bucket): @pytest.fixture -def s3_example_fs(s3_connection, s3_server): +def s3_example_fs(s3_server): from pyarrow.fs import FileSystem - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] uri = ( "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" .format(access_key, secret_key, host, port) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 5cbe0d9ab10..900cd650db2 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2242,11 +2242,11 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): @pytest.fixture -def s3_example_simple(s3_connection, s3_server): +def s3_example_simple(s3_server): from pyarrow.fs import FileSystem import pyarrow.parquet as pq - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] uri = ( "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" .format(access_key, secret_key, host, port) @@ -2305,11 +2305,11 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): @pytest.mark.parquet @pytest.mark.s3 -def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): +def test_open_dataset_from_s3_with_filesystem_uri(s3_server): from pyarrow.fs import FileSystem import pyarrow.parquet as pq - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] bucket = 'theirbucket' path = 'nested/folder/data.parquet' uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format( diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 16e73a1e286..e61cdec382e 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -20,7 +20,9 @@ import os import pathlib import pickle +import subprocess import sys +import time import pytest import weakref @@ -263,11 +265,11 @@ def subtree_localfs(request, tempdir, localfs): @pytest.fixture -def s3fs(request, s3_connection, s3_server): +def s3fs(request, s3_server): request.config.pyarrow.requires('s3') from pyarrow.fs import S3FileSystem - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] bucket = 'pyarrow-filesystem/' fs = S3FileSystem( @@ -298,6 +300,99 @@ def subtree_s3fs(request, s3fs): ) +__minio_limited_policy = """{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:ListAllMyBuckets", + "s3:PutObject", + "s3:GetObject", + "s3:ListBucket", + "s3:PutObjectTagging", + "s3:DeleteObject", + "s3:GetObjectVersion" + ], + "Resource": [ + "arn:aws:s3:::*" + ] + } + ] +}""" + + +def __run_mc_command(mcdir, *args): + full_args = ['mc', '-C', mcdir] + list(args) + proc = subprocess.Popen(full_args, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, encoding='utf-8') + retval = proc.wait(10) + cmd_str = ' '.join(full_args) + print(f'Cmd: {cmd_str}') + print(f' Return: {retval}') + print(f' Stdout: {proc.stdout.read()}') + print(f' Stderr: {proc.stderr.read()}') + if retval != 0: + raise ChildProcessError("Could not run mc") + + +def __wait_for_minio_startup(mcdir, address, access_key, secret_key): + start = time.time() + while time.time() - start < 10: + try: + __run_mc_command(mcdir, 'alias', 'set', 'myminio', + f'http://{address}', access_key, secret_key) + return + except ChildProcessError: + time.sleep(1) + raise Exception("mc command could not connect to local minio") + + +def __configure_limited_user(tmpdir, address, access_key, secret_key): + """ + Attempts to use the mc command to configure the minio server + with a special user limited:limited123 which does not have + permission to create buckets. This mirrors some real life S3 + configurations where users are given strict permissions. + + Arrow S3 operations should still work in such a configuration + (e.g. see ARROW-13685) + """ + try: + mcdir = os.path.join(tmpdir, 'mc') + os.mkdir(mcdir) + policy_path = os.path.join(tmpdir, 'limited-buckets-policy.json') + with open(policy_path, mode='w') as policy_file: + policy_file.write(__minio_limited_policy) + # The s3_server fixture starts the minio process but + # it takes a few moments for the process to become available + __wait_for_minio_startup(mcdir, address, access_key, secret_key) + # These commands create a limited user with a specific + # policy and creates a sample bucket for that user to + # write to + __run_mc_command(mcdir, 'admin', 'policy', 'add', + 'myminio/', 'no-create-buckets', policy_path) + __run_mc_command(mcdir, 'admin', 'user', 'add', + 'myminio/', 'limited', 'limited123') + __run_mc_command(mcdir, 'admin', 'policy', 'set', + 'myminio', 'no-create-buckets', 'user=limited') + __run_mc_command(mcdir, 'mb', 'myminio/existing-bucket') + return True + except FileNotFoundError: + # If mc is not found, skip these tests + return False + + +@pytest.fixture(scope='session') +def limited_s3_user(request, s3_server): + request.config.pyarrow.requires('s3') + tempdir = s3_server['tempdir'] + host, port, access_key, secret_key = s3_server['connection'] + address = '{}:{}'.format(host, port) + if not __configure_limited_user(tempdir, address, access_key, secret_key): + pytest.skip('Could not locate mc command to configure limited user') + + @pytest.fixture def hdfs(request, hdfs_connection): request.config.pyarrow.requires('hdfs') @@ -345,13 +440,13 @@ def py_fsspec_memoryfs(request, tempdir): @pytest.fixture -def py_fsspec_s3fs(request, s3_connection, s3_server): +def py_fsspec_s3fs(request, s3_server): s3fs = pytest.importorskip("s3fs") if (sys.version_info < (3, 7) and Version(s3fs.__version__) >= Version("0.5")): pytest.skip("s3fs>=0.5 version is async and requires Python >= 3.7") - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] bucket = 'pyarrow-filesystem/' fs = s3fs.S3FileSystem( @@ -473,6 +568,21 @@ def skip_fsspec_s3fs(fs): pytest.xfail(reason="Not working with fsspec's s3fs") +@pytest.mark.s3 +def test_s3fs_limited_permissions_create_bucket(s3_server, limited_s3_user): + from pyarrow.fs import S3FileSystem + + host, port, _, _ = s3_server['connection'] + + fs = S3FileSystem( + access_key='limited', + secret_key='limited123', + endpoint_override='{}:{}'.format(host, port), + scheme='http' + ) + fs.create_dir('existing-bucket/test') + + def test_file_info_constructor(): dt = datetime.fromtimestamp(1568799826, timezone.utc) @@ -1319,10 +1429,10 @@ def test_filesystem_from_path_object(path): @pytest.mark.s3 -def test_filesystem_from_uri_s3(s3_connection, s3_server): +def test_filesystem_from_uri_s3(s3_server): from pyarrow.fs import S3FileSystem - host, port, access_key, secret_key = s3_connection + host, port, access_key, secret_key = s3_server['connection'] uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}" \ .format(access_key, secret_key, host, port) From c83c1661aeaafee51e05fa58243db3cd10c8b05e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 21 Sep 2021 13:28:22 -1000 Subject: [PATCH 09/12] ARROW-13685: Forgot to revert change to tasks.yml --- dev/tasks/tasks.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 39a69147afe..70f7c6c6d15 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -942,12 +942,6 @@ tasks: image: conda-python {% endfor %} - test-conda-python-minio: - ci: github - template: docker-tests/github.linux.yml - params: - image: conda-python-minio - test-conda-python-3.8-hypothesis: ci: github template: docker-tests/github.linux.yml From b28f4157ec750e7e7637d22ca2eaadad2d1d528b Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 22 Sep 2021 09:10:30 -1000 Subject: [PATCH 10/12] ARROW-13685: Convert leading __ to leading _ --- python/pyarrow/tests/test_fs.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index e61cdec382e..622f64c251c 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -300,7 +300,7 @@ def subtree_s3fs(request, s3fs): ) -__minio_limited_policy = """{ +_minio_limited_policy = """{ "Version": "2012-10-17", "Statement": [ { @@ -322,7 +322,7 @@ def subtree_s3fs(request, s3fs): }""" -def __run_mc_command(mcdir, *args): +def _run_mc_command(mcdir, *args): full_args = ['mc', '-C', mcdir] + list(args) proc = subprocess.Popen(full_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8') @@ -336,11 +336,11 @@ def __run_mc_command(mcdir, *args): raise ChildProcessError("Could not run mc") -def __wait_for_minio_startup(mcdir, address, access_key, secret_key): +def _wait_for_minio_startup(mcdir, address, access_key, secret_key): start = time.time() while time.time() - start < 10: try: - __run_mc_command(mcdir, 'alias', 'set', 'myminio', + _run_mc_command(mcdir, 'alias', 'set', 'myminio', f'http://{address}', access_key, secret_key) return except ChildProcessError: @@ -348,7 +348,7 @@ def __wait_for_minio_startup(mcdir, address, access_key, secret_key): raise Exception("mc command could not connect to local minio") -def __configure_limited_user(tmpdir, address, access_key, secret_key): +def _configure_limited_user(tmpdir, address, access_key, secret_key): """ Attempts to use the mc command to configure the minio server with a special user limited:limited123 which does not have @@ -363,20 +363,20 @@ def __configure_limited_user(tmpdir, address, access_key, secret_key): os.mkdir(mcdir) policy_path = os.path.join(tmpdir, 'limited-buckets-policy.json') with open(policy_path, mode='w') as policy_file: - policy_file.write(__minio_limited_policy) + policy_file.write(_minio_limited_policy) # The s3_server fixture starts the minio process but # it takes a few moments for the process to become available - __wait_for_minio_startup(mcdir, address, access_key, secret_key) + _wait_for_minio_startup(mcdir, address, access_key, secret_key) # These commands create a limited user with a specific # policy and creates a sample bucket for that user to # write to - __run_mc_command(mcdir, 'admin', 'policy', 'add', + _run_mc_command(mcdir, 'admin', 'policy', 'add', 'myminio/', 'no-create-buckets', policy_path) - __run_mc_command(mcdir, 'admin', 'user', 'add', + _run_mc_command(mcdir, 'admin', 'user', 'add', 'myminio/', 'limited', 'limited123') - __run_mc_command(mcdir, 'admin', 'policy', 'set', + _run_mc_command(mcdir, 'admin', 'policy', 'set', 'myminio', 'no-create-buckets', 'user=limited') - __run_mc_command(mcdir, 'mb', 'myminio/existing-bucket') + _run_mc_command(mcdir, 'mb', 'myminio/existing-bucket') return True except FileNotFoundError: # If mc is not found, skip these tests @@ -389,7 +389,7 @@ def limited_s3_user(request, s3_server): tempdir = s3_server['tempdir'] host, port, access_key, secret_key = s3_server['connection'] address = '{}:{}'.format(host, port) - if not __configure_limited_user(tempdir, address, access_key, secret_key): + if not _configure_limited_user(tempdir, address, access_key, secret_key): pytest.skip('Could not locate mc command to configure limited user') From 249a499145419e2710d90e6348e67e94612b9cb5 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 30 Sep 2021 01:42:49 -1000 Subject: [PATCH 11/12] ARROW-13685: Lint errors --- python/pyarrow/tests/test_fs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 622f64c251c..035a8a62949 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -341,7 +341,7 @@ def _wait_for_minio_startup(mcdir, address, access_key, secret_key): while time.time() - start < 10: try: _run_mc_command(mcdir, 'alias', 'set', 'myminio', - f'http://{address}', access_key, secret_key) + f'http://{address}', access_key, secret_key) return except ChildProcessError: time.sleep(1) @@ -371,11 +371,11 @@ def _configure_limited_user(tmpdir, address, access_key, secret_key): # policy and creates a sample bucket for that user to # write to _run_mc_command(mcdir, 'admin', 'policy', 'add', - 'myminio/', 'no-create-buckets', policy_path) + 'myminio/', 'no-create-buckets', policy_path) _run_mc_command(mcdir, 'admin', 'user', 'add', - 'myminio/', 'limited', 'limited123') + 'myminio/', 'limited', 'limited123') _run_mc_command(mcdir, 'admin', 'policy', 'set', - 'myminio', 'no-create-buckets', 'user=limited') + 'myminio', 'no-create-buckets', 'user=limited') _run_mc_command(mcdir, 'mb', 'myminio/existing-bucket') return True except FileNotFoundError: From 6b8a57c651cbbbbf5735287becaa203b27166308 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 30 Sep 2021 12:04:05 -1000 Subject: [PATCH 12/12] ARROW-13685: Proactively disabling tests requiring mc on Windows so as not to be confused with mc.exe which is the Windows message compiler --- python/pyarrow/tests/test_fs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 035a8a62949..684d89f5b0d 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -385,6 +385,11 @@ def _configure_limited_user(tmpdir, address, access_key, secret_key): @pytest.fixture(scope='session') def limited_s3_user(request, s3_server): + if sys.platform == 'win32': + # Can't rely on FileNotFound check because + # there is sometimes an mc command on Windows + # which is unrelated to the minio mc + pytest.skip('The mc command is not installed on Windows') request.config.pyarrow.requires('s3') tempdir = s3_server['tempdir'] host, port, access_key, secret_key = s3_server['connection']