From b08a2fe2f5ff0a7cbcfeb639309fbc3247630a4e Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Jan 2017 15:11:12 -0500
Subject: [PATCH] Count bytes in numpy serialization, not length

When determining whether we should compress an array we take a few samples of
size 10000.  However, previously this 10000 was a number of elements rather
than a number of bytes.  This resulted in odd behavior when an array's size in
bytes was less than the size of the sample we were trying to extract from it.

We have resolved this by counting in bytes rather than elements, rounding the
sample size down to a whole multiple of the item size.
---
 distributed/protocol/numpy.py            | 4 ++--
 distributed/protocol/tests/test_numpy.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/distributed/protocol/numpy.py b/distributed/protocol/numpy.py
index 5ed22fb554d..e146df937e8 100644
--- a/distributed/protocol/numpy.py
+++ b/distributed/protocol/numpy.py
@@ -50,7 +50,7 @@ def serialize_numpy_ndarray(x):
 
     data = x.view('u1').data
 
-    if blosc and len(data) > 1e5:
+    if blosc and data.nbytes > 1e5:
         frames = frame_split_size([data])
         if sys.version_info.major == 2:
             frames = [ensure_bytes(frame) for frame in frames]
@@ -58,7 +58,7 @@ def serialize_numpy_ndarray(x):
         out = []
         compression = []
         for frame in frames:
-            sample = byte_sample(frame, 10000 * size, 5)
+            sample = byte_sample(frame, 10000 // size * size, 5)
             csample = blosc.compress(sample, typesize=size, cname='lz4', clevel=3)
             if len(csample) < 0.8 * len(sample):
                 compressed = blosc.compress(frame, typesize=size, cname='lz4', clevel=5)
diff --git a/distributed/protocol/tests/test_numpy.py b/distributed/protocol/tests/test_numpy.py
index ebecd8ee6e1..8b976dedbb2 100644
--- a/distributed/protocol/tests/test_numpy.py
+++ b/distributed/protocol/tests/test_numpy.py
@@ -47,6 +47,8 @@ def test_serialize():
          np.ones(shape=(5,), dtype=('f8', 32)),
          np.ones(shape=(5,), dtype=[('x', 'f8', 32)]),
          np.array([(1, 'abc')], dtype=[('x', 'i4'), ('s', object)]),
+         np.zeros(5000, dtype=[('x%d'%i,'<f8') for i in range(4)]),
+         np.zeros(5000, dtype='S32'),
          np.ones(shape=(5, 6)).astype(dtype=[('total', '<f8'), ('n', '<f8')])])
 def test_dumps_serialize_numpy(x):
     header, frames = serialize(x)