diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 413f3db534e..74beb23994d 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -45,6 +45,7 @@ 'gandiva', 'hdfs', 'large_memory', + 'memory_leak', 'nopandas', 'orc', 'pandas', @@ -65,6 +66,7 @@ 'gandiva': False, 'hdfs': False, 'large_memory': False, + 'memory_leak': False, 'orc': False, 'nopandas': False, 'pandas': False, diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py new file mode 100644 index 00000000000..d95444d2d46 --- /dev/null +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +import numpy as np +import pyarrow as pa + +import pyarrow.tests.util as test_util + +try: + import pandas as pd +except ImportError: + pass + + +@pytest.mark.memory_leak +@pytest.mark.pandas +def test_deserialize_pandas_arrow_7956(): + df = pd.DataFrame({'a': np.arange(10000), + 'b': [pd.util.testing.rands(5) for _ in range(10000)]}) + + def action(): + df_bytes = pa.ipc.serialize_pandas(df).to_pybytes() + buf = pa.py_buffer(df_bytes) + pa.ipc.deserialize_pandas(buf) + + # Abort at 128MB threshold + test_util.memory_leak_check(action, threshold=1 << 27, iterations=100) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 2b270b9bbd7..027133784e4 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -21,6 +21,7 @@ import contextlib import decimal +import gc import numpy as np import os import random @@ -122,6 +123,49 @@ def make_dataframe(): return df +def memory_leak_check(f, metric='rss', threshold=1 << 17, iterations=10, + check_interval=1): + """ + Execute the function and try to detect a clear memory leak either internal + to Arrow or caused by a reference counting problem in the Python binding + implementation. Raises exception if a leak detected + + Parameters + ---------- + f : callable + Function to invoke on each iteration + metric : {'rss', 'vms', 'shared'}, default 'rss' + Attribute of psutil.Process.memory_info to use for determining current + memory use + threshold : int, default 128K + Threshold in number of bytes to consider a leak + iterations : int, default 10 + Total number of invocations of f + check_interval : int, default 1 + Number of invocations of f in between each memory use check + """ + import psutil + proc = psutil.Process() + + def _get_use(): + gc.collect() + return getattr(proc.memory_info(), metric) + + baseline_use = _get_use() + + def _leak_check(): + current_use = _get_use() + if current_use - baseline_use > threshold: + raise Exception("Memory leak detected. " + "Departure from baseline {} after {} iterations" + .format(current_use - baseline_use, i)) + + for i in range(iterations): + f() + if i % check_interval == 0: + _leak_check() + + def get_modified_env_with_pythonpath(): # Prepend pyarrow root directory to PYTHONPATH env = os.environ.copy()