Merged

20 commits
9452f02
Fix resource warnings originating from tests/util.py
milesgranger Nov 24, 2022
c2d6a91
Ignore filesystem.LocalFileSystem warning
milesgranger Nov 24, 2022
860c309
Fix pytest.warns(None) deprecation warning
milesgranger Nov 24, 2022
b6b30d7
Ignore PytestUnraisableExceptionWarning spawning from pyx __del__
milesgranger Nov 24, 2022
df21ec2
Ignore warnings for explicitly used deprecated parquet versions
milesgranger Nov 24, 2022
3038aea
Fix ResourceWarning unclosed socket in StreamReaderServer
milesgranger Nov 24, 2022
2d8fb4a
Fix process still running warning - wait
milesgranger Nov 25, 2022
cbcd6c1
Ignore plasma ResourceWarnings
milesgranger Nov 25, 2022
5842379
Fix ResourceWarning unclosed file in get_metadata
milesgranger Nov 25, 2022
4308f13
Ignore UserWarning on overflow during explicit testing
milesgranger Nov 25, 2022
2146ba4
Fix pandas UserWarning - ORC value set on copy
milesgranger Nov 25, 2022
23692a3
Ignore DeprecationWarning in explicit test of parsing tz aware datetimes
milesgranger Nov 25, 2022
38c93ec
get_metadata: Flag reader should close source
milesgranger Nov 28, 2022
eb3feab
Add FutureWarning message to match against in read_table_doesnt_warn
milesgranger Nov 28, 2022
eb4962f
Add back ignoring VisibleDeprecationWarning
milesgranger Nov 29, 2022
8b576c1
pandas.testing specific element-wise comparison warnings
milesgranger Nov 29, 2022
a599d65
CI Test: are 'discarding nonzero nanoseconds' and 'parsing timezone a…
milesgranger Nov 29, 2022
20b1a30
Don't ignore substrait invalid_table_name PytestUnraisableExceptionWa…
milesgranger Nov 29, 2022
ee97c6b
Add specific message to warning filters
milesgranger Nov 29, 2022
ab4a31a
Move warnings catch to specific overflow line
milesgranger Nov 29, 2022
4 changes: 4 additions & 0 deletions python/pyarrow/parquet/core.py
@@ -1175,6 +1175,10 @@ def open(self):
         reader = self.open_file_func(self.path)
         if not isinstance(reader, ParquetFile):
             reader = ParquetFile(reader, **self.file_options)
+
+        # ensure reader knows it's responsible for closing source
+        # since we opened the source here internally.
+        reader._close_source = True
         return reader
 
     def read(self, columns=None, use_threads=True, partitions=None,
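The new flag records ownership: open() created the source itself, so the reader it hands back must close that source rather than leave it to the caller. A minimal sketch of the pattern with illustrative names (not the actual pyarrow internals):

class Reader:
    # illustrative stand-in for ParquetFile
    def __init__(self, source):
        self.source = source
        self._close_source = False  # default: the caller owns the handle

    def close(self):
        if self._close_source:
            self.source.close()  # we opened it, so we close it

def open_reader(path):
    source = open(path, 'rb')    # opened internally
    reader = Reader(source)
    reader._close_source = True  # mirrors the change above
    return reader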
2 changes: 2 additions & 0 deletions python/pyarrow/tests/conftest.py
@@ -174,6 +174,7 @@ def s3_server(s3_connection):
     finally:
         if proc is not None:
             proc.kill()
+            proc.wait()
 
 
 @pytest.fixture(scope='session')
@@ -194,3 +195,4 @@ def gcs_server():
     finally:
         if proc is not None:
             proc.kill()
+            proc.wait()
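kill() only sends the signal; until the child is reaped, Popen.__del__ reports "subprocess N is still running" as a ResourceWarning at garbage collection. A minimal sketch of the fixture teardown pattern:

import subprocess

proc = subprocess.Popen(['sleep', '60'])
try:
    pass  # exercise the server ...
finally:
    proc.kill()   # ask the child to die
    proc.wait()   # reap it, so Popen.__del__ has nothing to warn about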
16 changes: 9 additions & 7 deletions python/pyarrow/tests/parquet/test_basic.py
@@ -17,6 +17,7 @@
 
 from collections import OrderedDict
 import io
+import warnings
 
 import numpy as np
 import pytest
@@ -617,15 +618,16 @@ def test_read_non_existent_file(tempdir, use_legacy_dataset):
 
 @parametrize_legacy_dataset
 def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
-    with pytest.warns(None) as record:
-        pq.read_table(datadir / 'v0.7.1.parquet',
-                      use_legacy_dataset=use_legacy_dataset)
-
     if use_legacy_dataset:
-        # FutureWarning: 'use_legacy_dataset=True'
-        assert len(record) == 1
+        msg = "Passing 'use_legacy_dataset=True'"
+        with pytest.warns(FutureWarning, match=msg):
+            pq.read_table(datadir / 'v0.7.1.parquet',
+                          use_legacy_dataset=use_legacy_dataset)
     else:
-        assert len(record) == 0
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="error")
+            pq.read_table(datadir / 'v0.7.1.parquet',
+                          use_legacy_dataset=use_legacy_dataset)
 
 
 @pytest.mark.pandas
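pytest.warns(None) was how a test asserted that no warning fired, but pytest 7 deprecated it (it now means "expect some warning"). The replacement idiom used above escalates every warning to an error instead. A minimal self-contained sketch:

import warnings

def quiet():
    # stand-in for the code under test; emits no warnings
    return 1 + 1

def test_doesnt_warn():
    # replacement for the deprecated `pytest.warns(None)`: any warning
    # raised inside this block becomes an error and fails the test
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        quiet()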
1 change: 1 addition & 0 deletions python/pyarrow/tests/parquet/test_dataset.py
@@ -81,6 +81,7 @@ def test_parquet_piece_open_and_get_metadata(tempdir):
 
     with pytest.warns(FutureWarning):
         piece = pq.ParquetDatasetPiece(path)
+
     table1 = piece.read()
     assert isinstance(table1, pa.Table)
     meta1 = piece.get_metadata()
7 changes: 6 additions & 1 deletion python/pyarrow/tests/parquet/test_datetime.py
@@ -17,6 +17,7 @@
 
 import datetime
 import io
+import warnings
 
 import numpy as np
 import pytest
@@ -321,7 +322,11 @@ def get_table(pq_reader_method, filename, **kwargs):
     # with the default resolution of ns, we get wrong values for INT96
     # that are out of bounds for nanosecond range
    tab_error = get_table(pq_reader_method, filename)
-    assert tab_error["a"].to_pylist() != oob_dts
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore",
+                                "Discarding nonzero nanoseconds in conversion",
+                                UserWarning)
+        assert tab_error["a"].to_pylist() != oob_dts
 
     # avoid this overflow by specifying the resolution to use for INT96 values
     tab_correct = get_table(
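warnings.filterwarnings is called positionally above: the action, then a regex that must match the start of the warning message, then the category. Only warnings matching all parts are suppressed; everything else still surfaces. A small sketch of the same call shape:

import warnings

with warnings.catch_warnings():
    # action, message-prefix regex, category
    warnings.filterwarnings("ignore",
                            "Discarding nonzero nanoseconds",
                            UserWarning)
    # matches the filter -> silenced
    warnings.warn("Discarding nonzero nanoseconds in conversion", UserWarning)
    # different category -> still shown
    warnings.warn("Discarding nonzero nanoseconds", FutureWarning)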
1 change: 1 addition & 0 deletions python/pyarrow/tests/parquet/test_metadata.py
@@ -400,6 +400,7 @@ def test_multi_dataset_metadata(tempdir):
     assert md['serialized_size'] > 0
 
 
+@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
 def test_write_metadata(tempdir):
     path = str(tempdir / "metadata")
     schema = pa.schema([("a", "int64"), ("b", "float64")])
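The filterwarnings mark takes the same action:message:category string as pytest's filterwarnings ini option, where the message part is a regex matched against the start of the warning text, so "ignore:Parquet format" covers both the '1.0' and '2.0' deprecation messages. A small usage sketch:

import warnings
import pytest

@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
def test_legacy_format_versions():
    # both messages start with "Parquet format", so both are ignored
    warnings.warn("Parquet format '2.0' is deprecated", FutureWarning)
    warnings.warn("Parquet format '1.0' is deprecated", FutureWarning)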
1 change: 1 addition & 0 deletions python/pyarrow/tests/parquet/test_pandas.py
@@ -314,6 +314,7 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
 
 
 @pytest.mark.pandas
+@pytest.mark.filterwarnings("ignore:Parquet format '2.0':FutureWarning")
 def test_spark_flavor_preserves_pandas_metadata():
     df = _test_dataframe(size=100)
     df.index = np.arange(0, 10 * len(df), 10)
1 change: 1 addition & 0 deletions python/pyarrow/tests/test_filesystem.py
@@ -63,6 +63,7 @@ def test_resolve_local_path():
     assert path == uri
 
 
+@pytest.mark.filterwarnings("ignore:pyarrow.filesystem.LocalFileSystem")
 def test_resolve_home_directory():
     uri = '~/myfile.parquet'
     fs, path = filesystem.resolve_filesystem_and_path(uri)
1 change: 1 addition & 0 deletions python/pyarrow/tests/test_ipc.py
@@ -846,6 +846,7 @@ def run(self):
                 self._batches.append(batch)
         finally:
             connection.close()
+            self._sock.close()
 
     def get_result(self):
         return (self._schema, self._table if self._do_read_all
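The server thread previously closed only the accepted connection; the listening socket itself stayed open and was reported by the garbage collector. A minimal sketch of closing both:

import socket

listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
listener.bind(('127.0.0.1', 0))
listener.listen(1)
try:
    pass  # accept and service one connection ...
finally:
    # close the listening socket too; an unclosed socket shows up as
    # "ResourceWarning: unclosed <socket ...>" when it is collected
    listener.close()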
4 changes: 3 additions & 1 deletion python/pyarrow/tests/test_orc.py
@@ -52,8 +52,10 @@ def fix_example_values(actual_cols, expected_cols):
         if (name == "map" and
                 [d.keys() == {'key', 'value'} for m in expected for d in m]):
             # convert [{'key': k, 'value': v}, ...] to [(k, v), ...]
+            col = expected_cols[name].copy()
             for i, m in enumerate(expected):
-                expected_cols[name][i] = [(d['key'], d['value']) for d in m]
+                col[i] = [(d['key'], d['value']) for d in m]
+            expected_cols[name] = col
             continue
 
         typ = actual[0].__class__
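Writing through expected_cols[name][i] is chained assignment: when expected_cols is a pandas DataFrame, expected_cols[name] may be a view, and pandas warns that a value is being set on a copy of a slice. The change above copies the column, mutates the copy, and assigns it back. A minimal sketch with hypothetical data:

import pandas as pd

df = pd.DataFrame({"map": [[1], [2], [3]]})

# chained assignment (df["map"][0] = ...) can trigger
# SettingWithCopyWarning; mutate an explicit copy instead
col = df["map"].copy()
for i in range(len(col)):
    col[i] = [v * 10 for v in col[i]]
df["map"] = col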
42 changes: 29 additions & 13 deletions python/pyarrow/tests/test_pandas.py
@@ -20,6 +20,7 @@
 import json
 import multiprocessing as mp
 import sys
+import warnings
 
 from collections import OrderedDict
 from datetime import date, datetime, time, timedelta, timezone
@@ -95,9 +96,12 @@ def _check_pandas_roundtrip(df, expected=None, use_threads=False,
     if expected is None:
         expected = df
 
-    tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
-                          check_index_type=('equiv' if preserve_index
-                                            else False))
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore", "elementwise comparison failed", DeprecationWarning)
+        tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
+                              check_index_type=('equiv' if preserve_index
+                                                else False))
 
 
 def _check_series_roundtrip(s, type_=None, expected_pa_type=None):
@@ -239,11 +243,10 @@ def test_rangeindex_doesnt_warn(self):
         # attributes -> can be removed if support < pd 0.25 is dropped
         df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
 
-        with pytest.warns(None) as record:
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="error")
             _check_pandas_roundtrip(df, preserve_index=True)
 
-        assert len(record) == 0, [r.message for r in record]
-
     def test_multiindex_columns(self):
         columns = pd.MultiIndex.from_arrays([
             ['one', 'two'], ['X', 'Y']
@@ -290,11 +293,10 @@ def test_multiindex_doesnt_warn(self):
         columns = pd.MultiIndex.from_arrays([['one', 'two'], ['X', 'Y']])
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
 
-        with pytest.warns(None) as record:
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="error")
             _check_pandas_roundtrip(df, preserve_index=True)
 
-        assert len(record) == 0, [r.message for r in record]
-
     def test_integer_index_column(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
         _check_pandas_roundtrip(df, preserve_index=True)
@@ -2111,7 +2113,13 @@ def test_to_list_of_structs_pandas(self):
         ])
 
         series = pd.Series(data.to_pandas())
-        tm.assert_series_equal(series, expected)
+
+        # pandas.testing generates a
+        # DeprecationWarning: elementwise comparison failed
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                    DeprecationWarning)
+            tm.assert_series_equal(series, expected)
 
     @pytest.mark.parametrize('t,data,expected', [
         (
@@ -2164,9 +2172,16 @@ def test_nested_large_list(self):
         s = (pa.array([[[1, 2, 3], [4]], None],
                       type=pa.large_list(pa.large_list(pa.int64())))
             .to_pandas())
-        tm.assert_series_equal(
-            s, pd.Series([[[1, 2, 3], [4]], None], dtype=object),
-            check_names=False)
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore",
+                                    "Creating an ndarray from ragged nested",
+                                    np.VisibleDeprecationWarning)
+            warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                    DeprecationWarning)
+            tm.assert_series_equal(
+                s, pd.Series([[[1, 2, 3], [4]], None], dtype=object),
+                check_names=False)
 
     def test_large_binary_list(self):
         for list_type_factory in (pa.list_, pa.large_list):
@@ -4378,6 +4393,7 @@ def make_df_with_timestamps():
 
 
 @pytest.mark.parquet
+@pytest.mark.filterwarnings("ignore:Parquet format '2.0':FutureWarning")
 def test_timestamp_as_object_parquet(tempdir):
     # Timestamps can be stored as Parquet and reloaded into Pandas with no loss
     # of information if the timestamp_as_object option is True.
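Two suppressions recur in this file: NumPy's VisibleDeprecationWarning fires when an ndarray is built from ragged nested sequences, and comparing such object arrays is what makes pandas.testing emit "elementwise comparison failed" as a DeprecationWarning. A rough reproduction, assuming a NumPy version (roughly 1.19 to 1.23) where ragged array creation still warns rather than raises:

import warnings
import numpy as np

with warnings.catch_warnings():
    warnings.filterwarnings("ignore",
                            "Creating an ndarray from ragged nested",
                            np.VisibleDeprecationWarning)
    # rows of unequal length -> object array; warns on the NumPy
    # versions assumed above, raises ValueError on newer ones
    arr = np.array([[1, 2, 3], [4]])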
10 changes: 7 additions & 3 deletions python/pyarrow/tests/test_plasma.py
@@ -30,9 +30,13 @@
 import pyarrow as pa
 
 
-# ignore all Plasma deprecation warnings in this file, we test that the
-# warnings are actually raised in test_plasma_deprecated.py
-pytestmark = pytest.mark.filterwarnings("ignore:Plasma:DeprecationWarning")
+pytestmark = [
+    # ignore all Plasma deprecation warnings in this file, we test that the
+    # warnings are actually raised in test_plasma_deprecated.py
+    pytest.mark.filterwarnings("ignore:Plasma:DeprecationWarning"),
+    # Ignore other ResourceWarning as plasma is soon to be removed in ~12.0.0
+    pytest.mark.filterwarnings("ignore:subprocess:ResourceWarning")
+]
 
 DEFAULT_PLASMA_STORE_MEMORY = 10 ** 8
 USE_VALGRIND = os.getenv("PLASMA_VALGRIND") == "1"
1 change: 1 addition & 0 deletions python/pyarrow/tests/test_substrait.py
@@ -17,6 +17,7 @@
 
 import os
 import pathlib
+
 import pytest
 
 import pyarrow as pa
45 changes: 23 additions & 22 deletions python/pyarrow/tests/util.py
@@ -357,18 +357,19 @@ def signal_wakeup_fd(*, warn_on_full_buffer=False):
 
 def _ensure_minio_component_version(component, minimum_year):
     full_args = [component, '--version']
-    proc = subprocess.Popen(full_args, stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE, encoding='utf-8')
-    if proc.wait(10) != 0:
-        return False
-    stdout = proc.stdout.read()
-    pattern = component + r' version RELEASE\.(\d+)-.*'
-    version_match = re.search(pattern, stdout)
-    if version_match:
-        version_year = version_match.group(1)
-        return int(version_year) >= minimum_year
-    else:
-        raise FileNotFoundError("minio component older than the minimum year")
+    with subprocess.Popen(full_args, stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE, encoding='utf-8') as proc:
+        if proc.wait(10) != 0:
+            return False
+        stdout = proc.stdout.read()
+        pattern = component + r' version RELEASE\.(\d+)-.*'
+        version_match = re.search(pattern, stdout)
+        if version_match:
+            version_year = version_match.group(1)
+            return int(version_year) >= minimum_year
+        else:
+            raise FileNotFoundError(
+                "minio component older than the minimum year")
 
 
 def _wait_for_minio_startup(mcdir, address, access_key, secret_key):
@@ -385,16 +386,16 @@ def _wait_for_minio_startup(mcdir, address, access_key, secret_key):
 
 def _run_mc_command(mcdir, *args):
     full_args = ['mc', '-C', mcdir] + list(args)
-    proc = subprocess.Popen(full_args, stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE, encoding='utf-8')
-    retval = proc.wait(10)
-    cmd_str = ' '.join(full_args)
-    print(f'Cmd: {cmd_str}')
-    print(f'  Return: {retval}')
-    print(f'  Stdout: {proc.stdout.read()}')
-    print(f'  Stderr: {proc.stderr.read()}')
-    if retval != 0:
-        raise ChildProcessError("Could not run mc")
+    with subprocess.Popen(full_args, stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE, encoding='utf-8') as proc:
+        retval = proc.wait(10)
+        cmd_str = ' '.join(full_args)
+        print(f'Cmd: {cmd_str}')
+        print(f'  Return: {retval}')
+        print(f'  Stdout: {proc.stdout.read()}')
+        print(f'  Stderr: {proc.stderr.read()}')
+        if retval != 0:
+            raise ChildProcessError("Could not run mc")
 
 
 def _configure_s3_limited_user(s3_server, policy):
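Using Popen as a context manager is what fixes the unclosed-file warnings here: on exit it closes the stdout/stderr pipe objects and waits for the process. A minimal sketch:

import subprocess

# __exit__ closes the pipe file objects and reaps the process, so no
# "unclosed file" ResourceWarning is emitted for stdout/stderr
with subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE,
                      stderr=subprocess.PIPE, encoding='utf-8') as proc:
    proc.wait(10)
    print(proc.stdout.read())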