From a863425302a640adad6d0518178bf8f684c9413b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 5 Mar 2020 19:58:20 -0600 Subject: [PATCH 1/6] Add utility to help find "macro" memory leaks that are undetectable through Arrow memory pools --- python/pyarrow/tests/util.py | 44 ++++++++++++++++++++++++++++++++++++ python/requirements-test.txt | 1 + 2 files changed, 45 insertions(+) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 2b270b9bbd7..5f902dadd9a 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -21,6 +21,7 @@ import contextlib import decimal +import gc import numpy as np import os import random @@ -122,6 +123,49 @@ def make_dataframe(): return df +def memory_leak_check(f, metric='rss', threshold=1 << 17, iterations=10, + check_interval=1): + """ + Execute the function and try to detect a clear memory leak either internal + to Arrow or cause by a reference counting problem in the Python binding + implementation. Raises exception if a leak detected + + Parameters + ---------- + f : callable + Function to invoke on each iteration + metric : {'rss', 'vms', 'shared'}, default 'rss' + Attribute of psutil.Process.memory_info to use for determining current + memory use + threshold : int, default 128K + Threshold in number of bytes to consider a leak + iterations : int, default 10 + Total number of invocations of f + check_interval : int, default 1 + Number of invocations of f in between each memory use check + """ + import psutil + proc = psutil.Process() + + def _get_use(): + gc.collect() + return getattr(proc.memory_info(), metric) + + baseline_use = _get_use() + + def _leak_check(): + current_use = _get_use() + if current_use - baseline_use > threshold: + raise Exception("Memory leak detected. 
Baseline use {}, " + "current use after {} iterations is {}" + .format(baseline_use, i, current_use)) + + for i in range(iterations): + f() + if i % check_interval != 0: + continue + + def get_modified_env_with_pythonpath(): # Prepend pyarrow root directory to PYTHONPATH env = os.environ.copy() diff --git a/python/requirements-test.txt b/python/requirements-test.txt index b019eac0020..367dbd850b3 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -5,6 +5,7 @@ hypothesis; python_version > "3.5.2" pandas==0.24; python_version <= "3.5.2" pandas; python_version > "3.5.2" pickle5; python_version == "3.6" or python_version == "3.7" +psutil pytest pytest-lazy-fixture pytz From 9ae59b9c1289eec593a9d72ea1d52e30e188cefc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 5 Mar 2020 20:19:44 -0600 Subject: [PATCH 2/6] Add utility for testing for macro memory use leaks not detectable through Arrow memory pools. Tests for ARROW-7956 --- python/pyarrow/tests/conftest.py | 2 + .../pyarrow/tests/test_adhoc_memory_leak.py | 38 +++++++++++++++++++ python/pyarrow/tests/util.py | 7 ++-- 3 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 python/pyarrow/tests/test_adhoc_memory_leak.py diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 413f3db534e..74beb23994d 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -45,6 +45,7 @@ 'gandiva', 'hdfs', 'large_memory', + 'memory_leak', 'nopandas', 'orc', 'pandas', @@ -65,6 +66,7 @@ 'gandiva': False, 'hdfs': False, 'large_memory': False, + 'memory_leak': False, 'orc': False, 'nopandas': False, 'pandas': False, diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py new file mode 100644 index 00000000000..6371b0b70a4 --- /dev/null +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +import numpy as np +import pyarrow as pa +import pandas as pd + +import pyarrow.tests.util as test_util + + +@pytest.mark.memory_leak +def test_deserialize_pandas_arrow_7956(): + df = pd.DataFrame({'a': np.arange(10000), + 'b': [pd.util.testing.rands(5) for _ in range(10000)]}) + + def action(): + df_bytes = pa.ipc.serialize_pandas(df).to_pybytes() + buf = pa.py_buffer(df_bytes) + pa.ipc.deserialize_pandas(buf) + + # Abort at 128MB threshold + test_util.memory_leak_check(action, threshold=1 << 27, iterations=100) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 5f902dadd9a..9f6e7696880 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -156,14 +156,15 @@ def _get_use(): def _leak_check(): current_use = _get_use() if current_use - baseline_use > threshold: - raise Exception("Memory leak detected. Baseline use {}, " - "current use after {} iterations is {}" - .format(baseline_use, i, current_use)) + raise Exception("Memory leak detected. 
" + "Departure from baseline {} after {} iterations" + .format(current_use - baseline_use, i)) for i in range(iterations): f() if i % check_interval != 0: continue + _leak_check() def get_modified_env_with_pythonpath(): From abf9997edb1473c3d507a733e49a59683eb9c06b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 5 Mar 2020 20:22:10 -0600 Subject: [PATCH 3/6] docstring typo --- python/pyarrow/tests/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 9f6e7696880..7d591948b32 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -127,7 +127,7 @@ def memory_leak_check(f, metric='rss', threshold=1 << 17, iterations=10, check_interval=1): """ Execute the function and try to detect a clear memory leak either internal - to Arrow or cause by a reference counting problem in the Python binding + to Arrow or caused by a reference counting problem in the Python binding implementation. 
Raises exception if a leak detected Parameters From e9e1451f758e06656a3a9d3af8379ff5c3442d5e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 5 Mar 2020 22:54:46 -0600 Subject: [PATCH 4/6] Remove psutil since optional --- python/requirements-test.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 367dbd850b3..b019eac0020 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -5,7 +5,6 @@ hypothesis; python_version > "3.5.2" pandas==0.24; python_version <= "3.5.2" pandas; python_version > "3.5.2" pickle5; python_version == "3.6" or python_version == "3.7" -psutil pytest pytest-lazy-fixture pytz From 2c28ccbffba6bf44f5fff023837cbb61c1a5dd7f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 5 Mar 2020 23:08:09 -0600 Subject: [PATCH 5/6] Fix non-idiomatic code --- python/pyarrow/tests/util.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 7d591948b32..027133784e4 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -162,9 +162,8 @@ def _leak_check(): for i in range(iterations): f() - if i % check_interval != 0: - continue - _leak_check() + if i % check_interval == 0: + _leak_check() def get_modified_env_with_pythonpath(): From 94e077f30d164ef8c11a322dd85cce4c89baf6fa Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 6 Mar 2020 08:21:45 -0600 Subject: [PATCH 6/6] Allow for pandas-free build --- python/pyarrow/tests/test_adhoc_memory_leak.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py index 6371b0b70a4..d95444d2d46 100644 --- a/python/pyarrow/tests/test_adhoc_memory_leak.py +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -19,12 +19,17 @@ import numpy as np import pyarrow as pa -import pandas as pd import 
pyarrow.tests.util as test_util +try: + import pandas as pd +except ImportError: + pass + @pytest.mark.memory_leak +@pytest.mark.pandas def test_deserialize_pandas_arrow_7956(): df = pd.DataFrame({'a': np.arange(10000), 'b': [pd.util.testing.rands(5) for _ in range(10000)]})