diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 34856beb78d..16ef9d09a0a 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1175,6 +1175,10 @@ def open(self):
         reader = self.open_file_func(self.path)
         if not isinstance(reader, ParquetFile):
             reader = ParquetFile(reader, **self.file_options)
+
+        # ensure reader knows it's responsible for closing source
+        # since we opened the source here internally.
+        reader._close_source = True
         return reader
 
     def read(self, columns=None, use_threads=True, partitions=None,
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index a06ac92095b..154ef79dcd5 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -174,6 +174,7 @@ def s3_server(s3_connection):
     finally:
         if proc is not None:
             proc.kill()
+            proc.wait()
 
 
 @pytest.fixture(scope='session')
@@ -194,3 +195,4 @@ def gcs_server():
     finally:
         if proc is not None:
             proc.kill()
+            proc.wait()
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
index 62ea19d422d..05321a937b5 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -17,6 +17,7 @@
 from collections import OrderedDict
 import io
+import warnings
 
 import numpy as np
 import pytest
@@ -617,15 +618,16 @@ def test_read_non_existent_file(tempdir, use_legacy_dataset):
 
 @parametrize_legacy_dataset
 def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
-    with pytest.warns(None) as record:
-        pq.read_table(datadir / 'v0.7.1.parquet',
-                      use_legacy_dataset=use_legacy_dataset)
-
     if use_legacy_dataset:
-        # FutureWarning: 'use_legacy_dataset=True'
-        assert len(record) == 1
+        msg = "Passing 'use_legacy_dataset=True'"
+        with pytest.warns(FutureWarning, match=msg):
+            pq.read_table(datadir / 'v0.7.1.parquet',
+                          use_legacy_dataset=use_legacy_dataset)
     else:
-        assert len(record) == 0
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="error")
+            pq.read_table(datadir / 'v0.7.1.parquet',
+                          use_legacy_dataset=use_legacy_dataset)
 
 
 @pytest.mark.pandas
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
index 654fd4ddc11..0fabbb5c977 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -81,6 +81,7 @@ def test_parquet_piece_open_and_get_metadata(tempdir):
 
     with pytest.warns(FutureWarning):
         piece = pq.ParquetDatasetPiece(path)
+
     table1 = piece.read()
     assert isinstance(table1, pa.Table)
     meta1 = piece.get_metadata()
diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py
index e10d4fd776d..52f3f5ce4a0 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -17,6 +17,7 @@
 
 import datetime
 import io
+import warnings
 
 import numpy as np
 import pytest
@@ -321,7 +322,11 @@ def get_table(pq_reader_method, filename, **kwargs):
     # with the default resolution of ns, we get wrong values for INT96
     # that are out of bounds for nanosecond range
     tab_error = get_table(pq_reader_method, filename)
-    assert tab_error["a"].to_pylist() != oob_dts
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore",
+                                "Discarding nonzero nanoseconds in conversion",
+                                UserWarning)
+        assert tab_error["a"].to_pylist() != oob_dts
 
     # avoid this overflow by specifying the resolution to use for INT96 values
     tab_correct = get_table(
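For context on the recurring change above: `pytest.warns(None)` was deprecated in pytest 7 as a way of asserting that *no* warning fires, so these tests switch to escalating warnings to errors for the duration of the block. A minimal sketch of the replacement idiom, with `roundtrip` as a hypothetical stand-in for the code under test:

```python
import warnings


def roundtrip():
    # hypothetical stand-in for the code under test
    return 42


def test_roundtrip_doesnt_warn():
    # simplefilter("error") turns any warning raised inside the block
    # into an exception, so the test fails loudly instead of counting
    # recorded warnings after the fact.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        roundtrip()
```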
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index 2c6f250452d..fef1cc564b4 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -400,6 +400,7 @@ def test_multi_dataset_metadata(tempdir):
     assert md['serialized_size'] > 0
 
 
+@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
 def test_write_metadata(tempdir):
     path = str(tempdir / "metadata")
     schema = pa.schema([("a", "int64"), ("b", "float64")])
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
index 3bc204c978a..c153db4aa00 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -314,6 +314,7 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
 
 
 @pytest.mark.pandas
+@pytest.mark.filterwarnings("ignore:Parquet format '2.0':FutureWarning")
 def test_spark_flavor_preserves_pandas_metadata():
     df = _test_dataframe(size=100)
     df.index = np.arange(0, 10 * len(df), 10)
diff --git a/python/pyarrow/tests/test_filesystem.py b/python/pyarrow/tests/test_filesystem.py
index 33ac4b8517f..9862c5990d7 100644
--- a/python/pyarrow/tests/test_filesystem.py
+++ b/python/pyarrow/tests/test_filesystem.py
@@ -63,6 +63,7 @@ def test_resolve_local_path():
     assert path == uri
 
 
+@pytest.mark.filterwarnings("ignore:pyarrow.filesystem.LocalFileSystem")
 def test_resolve_home_directory():
     uri = '~/myfile.parquet'
     fs, path = filesystem.resolve_filesystem_and_path(uri)
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 5237e3b4f03..d9abe987ae6 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -846,6 +846,7 @@ def run(self):
                     self._batches.append(batch)
         finally:
             connection.close()
+            self._sock.close()
 
     def get_result(self):
         return (self._schema, self._table if self._do_read_all
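The `@pytest.mark.filterwarnings` markers added in these files use the standard `action:message:category` filter spec, where the `message` part is matched against the beginning of the warning text. A small self-contained sketch (the warning raised here is made up for illustration):

```python
import warnings

import pytest


@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
def test_filter_spec():
    # Ignored: the message starts with "Parquet format" and the
    # category matches FutureWarning.
    warnings.warn("Parquet format '2.0' is deprecated", FutureWarning)
```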
diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py
index a109ab3472f..1b467d52330 100644
--- a/python/pyarrow/tests/test_orc.py
+++ b/python/pyarrow/tests/test_orc.py
@@ -52,8 +52,10 @@ def fix_example_values(actual_cols, expected_cols):
         if (name == "map" and
                 [d.keys() == {'key', 'value'} for m in expected for d in m]):
             # convert [{'key': k, 'value': v}, ...] to [(k, v), ...]
+            col = expected_cols[name].copy()
             for i, m in enumerate(expected):
-                expected_cols[name][i] = [(d['key'], d['value']) for d in m]
+                col[i] = [(d['key'], d['value']) for d in m]
+            expected_cols[name] = col
             continue
 
         typ = actual[0].__class__
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index a1ab4d43881..c7b0bf783dc 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -20,6 +20,7 @@
 import json
 import multiprocessing as mp
 import sys
+import warnings
 from collections import OrderedDict
 from datetime import date, datetime, time, timedelta, timezone
@@ -95,9 +96,12 @@ def _check_pandas_roundtrip(df, expected=None, use_threads=False,
     if expected is None:
         expected = df
 
-    tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
-                          check_index_type=('equiv' if preserve_index
-                                            else False))
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore", "elementwise comparison failed", DeprecationWarning)
+        tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
+                              check_index_type=('equiv' if preserve_index
+                                                else False))
 
 
 def _check_series_roundtrip(s, type_=None, expected_pa_type=None):
@@ -239,11 +243,10 @@ def test_rangeindex_doesnt_warn(self):
         # attributes -> can be removed if support < pd 0.25 is dropped
         df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
 
-        with pytest.warns(None) as record:
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="error")
             _check_pandas_roundtrip(df, preserve_index=True)
 
-        assert len(record) == 0, [r.message for r in record]
-
     def test_multiindex_columns(self):
         columns = pd.MultiIndex.from_arrays([
             ['one', 'two'], ['X', 'Y']
@@ -290,11 +293,10 @@ def test_multiindex_doesnt_warn(self):
         columns = pd.MultiIndex.from_arrays([['one', 'two'], ['X', 'Y']])
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
 
-        with pytest.warns(None) as record:
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="error")
             _check_pandas_roundtrip(df, preserve_index=True)
 
-        assert len(record) == 0, [r.message for r in record]
-
     def test_integer_index_column(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
         _check_pandas_roundtrip(df, preserve_index=True)
@@ -2111,7 +2113,13 @@ def test_to_list_of_structs_pandas(self):
         ])
         series = pd.Series(data.to_pandas())
-        tm.assert_series_equal(series, expected)
+
+        # pandas.testing generates a
+        # DeprecationWarning: elementwise comparison failed
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                    DeprecationWarning)
+            tm.assert_series_equal(series, expected)
 
     @pytest.mark.parametrize('t,data,expected', [
         (
@@ -2164,9 +2172,16 @@ def test_nested_large_list(self):
         s = (pa.array([[[1, 2, 3], [4]], None],
                       type=pa.large_list(pa.large_list(pa.int64())))
              .to_pandas())
-        tm.assert_series_equal(
-            s, pd.Series([[[1, 2, 3], [4]], None], dtype=object),
-            check_names=False)
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore",
+                                    "Creating an ndarray from ragged nested",
+                                    np.VisibleDeprecationWarning)
+            warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                    DeprecationWarning)
+            tm.assert_series_equal(
+                s, pd.Series([[[1, 2, 3], [4]], None], dtype=object),
+                check_names=False)
 
     def test_large_binary_list(self):
         for list_type_factory in (pa.list_, pa.large_list):
@@ -4378,6 +4393,7 @@ def make_df_with_timestamps():
 
 @pytest.mark.parquet
+@pytest.mark.filterwarnings("ignore:Parquet format '2.0':FutureWarning")
 def test_timestamp_as_object_parquet(tempdir):
     # Timestamps can be stored as Parquet and reloaded into Pandas with no loss
    # of information if the timestamp_as_object option is True.
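The test_orc.py hunk above swaps an in-place write through `expected_cols[name][i]` for a copy-modify-reassign, which sidesteps the chained-assignment warnings newer pandas emits when `expected_cols` is a DataFrame. A sketch of the same shape with hypothetical data, assuming an object-dtype column:

```python
import pandas as pd

# hypothetical expected data in the shape the ORC test uses
expected_cols = pd.DataFrame(
    {"map": pd.Series([[{"key": "a", "value": 1}]], dtype=object)})

# Writing expected_cols["map"][i] = ... is a chained assignment that
# newer pandas flags; instead modify a copy and assign the whole
# column back in one step.
expected = expected_cols["map"]
col = expected.copy()
for i, m in enumerate(expected):
    col[i] = [(d["key"], d["value"]) for d in m]
expected_cols["map"] = col
```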
diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py
index cc5fd0357dd..9b5862e6483 100644
--- a/python/pyarrow/tests/test_plasma.py
+++ b/python/pyarrow/tests/test_plasma.py
@@ -30,9 +30,13 @@
 
 import pyarrow as pa
 
-# ignore all Plasma deprecation warnings in this file, we test that the
-# warnings are actually raised in test_plasma_deprecated.py
-pytestmark = pytest.mark.filterwarnings("ignore:Plasma:DeprecationWarning")
+pytestmark = [
+    # ignore all Plasma deprecation warnings in this file, we test that the
+    # warnings are actually raised in test_plasma_deprecated.py
+    pytest.mark.filterwarnings("ignore:Plasma:DeprecationWarning"),
+    # Ignore other ResourceWarning as plasma is soon to be removed in ~12.0.0
+    pytest.mark.filterwarnings("ignore:subprocess:ResourceWarning")
+]
 
 DEFAULT_PLASMA_STORE_MEMORY = 10 ** 8
 USE_VALGRIND = os.getenv("PLASMA_VALGRIND") == "1"
diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py
index 030e4aad820..7cd4459f6fd 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -17,6 +17,7 @@
 
 import os
 import pathlib
+
 import pytest
 
 import pyarrow as pa
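For the test_plasma.py change: pytest accepts a module-level `pytestmark` that is either a single mark or a list of marks, so the rewrite stacks a second warning filter onto the existing one. A minimal sketch of a hypothetical test module:

```python
import warnings

import pytest

# every mark in this list is applied to every test in the module
pytestmark = [
    pytest.mark.filterwarnings("ignore:Plasma:DeprecationWarning"),
    pytest.mark.filterwarnings("ignore:subprocess:ResourceWarning"),
]


def test_deprecation_is_silenced():
    # silenced by the first module-level filter above
    warnings.warn("Plasma is deprecated", DeprecationWarning)
```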
diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py
index ddeca128791..df7936371ee 100644
--- a/python/pyarrow/tests/util.py
+++ b/python/pyarrow/tests/util.py
@@ -357,18 +357,19 @@ def signal_wakeup_fd(*, warn_on_full_buffer=False):
 
 def _ensure_minio_component_version(component, minimum_year):
     full_args = [component, '--version']
-    proc = subprocess.Popen(full_args, stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE, encoding='utf-8')
-    if proc.wait(10) != 0:
-        return False
-    stdout = proc.stdout.read()
-    pattern = component + r' version RELEASE\.(\d+)-.*'
-    version_match = re.search(pattern, stdout)
-    if version_match:
-        version_year = version_match.group(1)
-        return int(version_year) >= minimum_year
-    else:
-        raise FileNotFoundError("minio component older than the minimum year")
+    with subprocess.Popen(full_args, stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE, encoding='utf-8') as proc:
+        if proc.wait(10) != 0:
+            return False
+        stdout = proc.stdout.read()
+        pattern = component + r' version RELEASE\.(\d+)-.*'
+        version_match = re.search(pattern, stdout)
+        if version_match:
+            version_year = version_match.group(1)
+            return int(version_year) >= minimum_year
+        else:
+            raise FileNotFoundError(
+                "minio component older than the minimum year")
 
 
 def _wait_for_minio_startup(mcdir, address, access_key, secret_key):
@@ -385,16 +386,16 @@ def _wait_for_minio_startup(mcdir, address, access_key, secret_key):
 
 def _run_mc_command(mcdir, *args):
     full_args = ['mc', '-C', mcdir] + list(args)
-    proc = subprocess.Popen(full_args, stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE, encoding='utf-8')
-    retval = proc.wait(10)
-    cmd_str = ' '.join(full_args)
-    print(f'Cmd: {cmd_str}')
-    print(f'  Return: {retval}')
-    print(f'  Stdout: {proc.stdout.read()}')
-    print(f'  Stderr: {proc.stderr.read()}')
-    if retval != 0:
-        raise ChildProcessError("Could not run mc")
+    with subprocess.Popen(full_args, stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE, encoding='utf-8') as proc:
+        retval = proc.wait(10)
+        cmd_str = ' '.join(full_args)
+        print(f'Cmd: {cmd_str}')
+        print(f'  Return: {retval}')
+        print(f'  Stdout: {proc.stdout.read()}')
+        print(f'  Stderr: {proc.stderr.read()}')
+        if retval != 0:
+            raise ChildProcessError("Could not run mc")
 
 
 def _configure_s3_limited_user(s3_server, policy):
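The util.py rewrites rely on `subprocess.Popen` being a context manager (since Python 3.2): on exit it closes the stdout/stderr pipes and waits on the child, which is what silences the unclosed-file and still-running ResourceWarnings; the `proc.wait()` added after `proc.kill()` in conftest.py reaps the child for the same reason. A minimal runnable sketch of the pattern:

```python
import subprocess
import sys

# On leaving the with-block, Popen.__exit__ closes the pipe handles and
# waits for the child, so nothing leaks even on an early return/raise.
with subprocess.Popen([sys.executable, '-c', 'print("hello")'],
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      encoding='utf-8') as proc:
    if proc.wait(10) != 0:
        raise ChildProcessError("child process failed")
    print(proc.stdout.read())
```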