-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-9718: [Python] ParquetWriter to work with new FileSystem API #7991
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6a329f4
775c5e9
842aae9
5d344cd
e309ce1
fd27aad
5bafd1d
c619a5b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| from pyarrow.tests import util | ||
| from pyarrow.util import guid | ||
| from pyarrow.filesystem import LocalFileSystem, FileSystem | ||
| from pyarrow import fs | ||
|
|
||
|
|
||
| try: | ||
|
|
@@ -2152,7 +2153,7 @@ def s3_bucket(request, s3_connection, s3_server): | |
|
|
||
|
|
||
| @pytest.fixture | ||
| def s3_example(s3_connection, s3_server, s3_bucket): | ||
| def s3_example_s3fs(s3_connection, s3_server, s3_bucket): | ||
| s3fs = pytest.importorskip('s3fs') | ||
|
|
||
| host, port, access_key, secret_key = s3_connection | ||
|
|
@@ -2175,10 +2176,10 @@ def s3_example(s3_connection, s3_server, s3_bucket): | |
| @pytest.mark.pandas | ||
| @pytest.mark.s3 | ||
| @parametrize_legacy_dataset | ||
| def test_read_partitioned_directory_s3fs(s3_example, use_legacy_dataset): | ||
| def test_read_partitioned_directory_s3fs(s3_example_s3fs, use_legacy_dataset): | ||
| from pyarrow.filesystem import S3FSWrapper | ||
|
|
||
| fs, bucket_uri = s3_example | ||
| fs, bucket_uri = s3_example_s3fs | ||
| wrapper = S3FSWrapper(fs) | ||
| _partition_test_for_filesystem(wrapper, bucket_uri) | ||
|
|
||
|
|
@@ -3510,6 +3511,88 @@ def test_parquet_file_pass_directory_instead_of_file(tempdir): | |
| pq.ParquetFile(path) | ||
|
|
||
|
|
||
| @pytest.mark.pandas | ||
| @pytest.mark.parametrize("filesystem", [ | ||
| None, | ||
| LocalFileSystem.get_instance(), | ||
| fs.LocalFileSystem(), | ||
| ]) | ||
| def test_parquet_writer_filesystem_local(tempdir, filesystem): | ||
| df = _test_dataframe(100) | ||
| table = pa.Table.from_pandas(df, preserve_index=False) | ||
| path = str(tempdir / 'data.parquet') | ||
|
|
||
| with pq.ParquetWriter( | ||
| path, table.schema, filesystem=filesystem, version='2.0' | ||
| ) as writer: | ||
| writer.write_table(table) | ||
|
|
||
| result = _read_table(path).to_pandas() | ||
| tm.assert_frame_equal(result, df) | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def s3_example_fs(s3_connection, s3_server): | ||
| from pyarrow.fs import FileSystem | ||
|
|
||
| host, port, access_key, secret_key = s3_connection | ||
| uri = ( | ||
| "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" | ||
| .format(access_key, secret_key, host, port) | ||
| ) | ||
| fs, path = FileSystem.from_uri(uri) | ||
|
|
||
| fs.create_dir("mybucket") | ||
|
|
||
| yield fs, uri, path | ||
|
|
||
|
|
||
| @pytest.mark.pandas | ||
| @pytest.mark.s3 | ||
| def test_parquet_writer_filesystem_s3(s3_example_fs): | ||
| df = _test_dataframe(100) | ||
| table = pa.Table.from_pandas(df, preserve_index=False) | ||
|
|
||
| fs, uri, path = s3_example_fs | ||
|
|
||
| with pq.ParquetWriter( | ||
| path, table.schema, filesystem=fs, version='2.0' | ||
| ) as writer: | ||
| writer.write_table(table) | ||
|
||
|
|
||
| result = _read_table(uri).to_pandas() | ||
| tm.assert_frame_equal(result, df) | ||
|
|
||
|
|
||
| # TODO segfaulting (ARROW-9814?) | ||
| # @pytest.mark.pandas | ||
| # @pytest.mark.s3 | ||
| # def test_parquet_writer_filesystem_s3_uri(s3_example_fs): | ||
| # df = _test_dataframe(100) | ||
| # table = pa.Table.from_pandas(df, preserve_index=False) | ||
|
|
||
| # fs, uri, path = s3_example_fs | ||
|
|
||
| # with pq.ParquetWriter(uri, table.schema, version='2.0') as writer: | ||
| # writer.write_table(table) | ||
|
|
||
| # result = _read_table(path, filesystem=fs).to_pandas() | ||
| # tm.assert_frame_equal(result, df) | ||
|
|
||
|
|
||
| @pytest.mark.pandas | ||
| def test_parquet_writer_filesystem_buffer_raises(): | ||
| df = _test_dataframe(100) | ||
| table = pa.Table.from_pandas(df, preserve_index=False) | ||
| filesystem = fs.LocalFileSystem() | ||
|
|
||
| # Should raise ValueError when filesystem is passed with file-like object | ||
| with pytest.raises(ValueError, match="specified path is file-like"): | ||
| pq.ParquetWriter( | ||
| pa.BufferOutputStream(), table.schema, filesystem=filesystem | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.pandas | ||
| @parametrize_legacy_dataset | ||
| def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also `from pyarrow import filesystem as legacyfs`?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here I am going to leave it as is for now, because the old ones are still used a lot (would make the diff much larger, will keep that for a next PR, eg when actually deprecating)