From b08a2fe2f5ff0a7cbcfeb639309fbc3247630a4e Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Jan 2017 15:11:12 -0500 Subject: [PATCH] Count bytes in numpy serialization, not length When determining if we should compress an array we take a few samples of length 10000. However, previously this 10000 was the number of elements rather than the number of bytes. This resulted in odd behavior when an array's size was less than the sample it was trying to extract. We have resolved this by counting by bytes rather than elements --- distributed/protocol/numpy.py | 4 ++-- distributed/protocol/tests/test_numpy.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/distributed/protocol/numpy.py b/distributed/protocol/numpy.py index 5ed22fb554d..e146df937e8 100644 --- a/distributed/protocol/numpy.py +++ b/distributed/protocol/numpy.py @@ -50,7 +50,7 @@ def serialize_numpy_ndarray(x): data = x.view('u1').data - if blosc and len(data) > 1e5: + if blosc and data.nbytes > 1e5: frames = frame_split_size([data]) if sys.version_info.major == 2: frames = [ensure_bytes(frame) for frame in frames] @@ -58,7 +58,7 @@ def serialize_numpy_ndarray(x): out = [] compression = [] for frame in frames: - sample = byte_sample(frame, 10000 * size, 5) + sample = byte_sample(frame, 10000 // size * size, 5) csample = blosc.compress(sample, typesize=size, cname='lz4', clevel=3) if len(csample) < 0.8 * len(sample): compressed = blosc.compress(frame, typesize=size, cname='lz4', clevel=5) diff --git a/distributed/protocol/tests/test_numpy.py b/distributed/protocol/tests/test_numpy.py index ebecd8ee6e1..8b976dedbb2 100644 --- a/distributed/protocol/tests/test_numpy.py +++ b/distributed/protocol/tests/test_numpy.py @@ -47,6 +47,8 @@ def test_serialize(): np.ones(shape=(5,), dtype=('f8', 32)), np.ones(shape=(5,), dtype=[('x', 'f8', 32)]), np.array([(1, 'abc')], dtype=[('x', 'i4'), ('s', object)]), + np.zeros(5000, dtype=[('x%d'%i,'