From 84591238ca8648f54f45fd23fa4fcafcaa956c03 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 14 May 2017 11:52:53 -0400 Subject: [PATCH] Fix memory leak caused by list append ref count, lack of setting NPY_ARRAY_OWNDATA Change-Id: I22b284add9268c48278e56e54f1b8be0ab6a2593 --- cpp/src/arrow/python/common.h | 7 +++++- cpp/src/arrow/python/pandas_convert.cc | 25 +++++++++--------- python/scripts/test_leak.py | 35 ++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 14 deletions(-) create mode 100644 python/scripts/test_leak.py diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index f6e706b6948..ec40d0eafa3 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -69,7 +69,7 @@ class ARROW_EXPORT OwnedRef { ~OwnedRef() { PyAcquireGIL lock; - Py_XDECREF(obj_); + release(); } void reset(PyObject* obj) { @@ -80,6 +80,11 @@ class ARROW_EXPORT OwnedRef { obj_ = obj; } + void release() { + Py_XDECREF(obj_); + obj_ = nullptr; + } + PyObject* obj() const { return obj_; } private: diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 264bed11b04..b6fb05e1e5e 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -1023,7 +1023,8 @@ static inline PyObject* NewArray1DFromType( } set_numpy_metadata(type, arrow_type, descr); - return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, 0, nullptr); + return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, + NPY_ARRAY_OWNDATA | NPY_ARRAY_CARRAY, nullptr); } class PandasBlock { @@ -1078,12 +1079,10 @@ class PandasBlock { PyObject* block_arr; if (ndim == 2) { npy_intp block_dims[2] = {num_columns_, num_rows_}; - block_arr = PyArray_NewFromDescr( - &PyArray_Type, descr, 2, block_dims, nullptr, nullptr, 0, nullptr); + block_arr = PyArray_SimpleNewFromDescr(2, block_dims, descr); } else { npy_intp block_dims[1] = {num_rows_}; - block_arr = PyArray_NewFromDescr( - &PyArray_Type, descr, 1, block_dims, nullptr, nullptr, 0, nullptr); + block_arr = PyArray_SimpleNewFromDescr(1, block_dims, descr); } if (block_arr == NULL) { @@ -1091,6 +1090,8 @@ class PandasBlock { return Status::OK(); } + PyArray_ENABLEFLAGS(reinterpret_cast(block_arr), NPY_ARRAY_OWNDATA); + npy_intp placement_dims[1] = {num_columns_}; PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); if (placement_arr == NULL) { @@ -1357,8 +1358,6 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) class ObjectBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - virtual ~ObjectBlock() {} - Status Allocate() override { return AllocateNDArray(NPY_OBJECT); } Status Write(const std::shared_ptr& col, int64_t abs_placement, @@ -1416,7 +1415,6 @@ template class IntBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(arrow_traits::npy_type); } @@ -1450,7 +1448,6 @@ using Int64Block = IntBlock; class Float32Block : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); } Status Write(const std::shared_ptr& col, int64_t abs_placement, @@ -1470,7 +1467,6 @@ class Float32Block : public PandasBlock { class Float64Block : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); } Status Write(const std::shared_ptr& col, int64_t abs_placement, @@ -1523,7 +1519,6 @@ class Float64Block : public PandasBlock { class BoolBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(NPY_BOOL); } Status Write(const std::shared_ptr& col, int64_t abs_placement, @@ -1544,7 +1539,6 @@ class BoolBlock : public PandasBlock { class DatetimeBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - Status AllocateDatetime(int ndim) { RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME, ndim)); @@ -1629,7 +1623,6 @@ template class CategoricalBlock : public PandasBlock { public: explicit CategoricalBlock(int64_t num_rows) : PandasBlock(num_rows, 1) {} - Status Allocate() override { constexpr int npy_type = arrow_traits::npy_type; @@ -1960,6 +1953,9 @@ class DataFrameBlockCreator { PyObject* item; RETURN_NOT_OK(it.second->GetPyResult(&item)); if (PyList_Append(list, item) < 0) { RETURN_IF_PYERROR(); } + + // ARROW-1017; PyList_Append increments object refcount + Py_DECREF(item); } return Status::OK(); } @@ -2045,6 +2041,9 @@ class ArrowDeserializer { // Arrow data is immutable. PyArray_CLEARFLAGS(arr_, NPY_ARRAY_WRITEABLE); + // Arrow data is owned by another + PyArray_CLEARFLAGS(arr_, NPY_ARRAY_OWNDATA); + return Status::OK(); } diff --git a/python/scripts/test_leak.py b/python/scripts/test_leak.py new file mode 100644 index 00000000000..2b197b6c130 --- /dev/null +++ b/python/scripts/test_leak.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import numpy as np +import memory_profiler +import gc + + +def leak(): + data = [pa.array(np.concatenate([np.random.randn(100000)] * 1000))] + table = pa.Table.from_arrays(data, ['foo']) + while True: + print('calling to_pandas') + print('memory_usage: {0}'.format(memory_profiler.memory_usage())) + table.to_pandas() + gc.collect() + +leak()