python/pyspark/serializers.py (3 changes: 2 additions & 1 deletion)

@@ -62,11 +62,12 @@
 if sys.version < '3':
     import cPickle as pickle
     from itertools import izip as zip, imap as map
+    pickle_protocol = 2
 else:
     import pickle
     basestring = unicode = str
     xrange = range
-    pickle_protocol = pickle.HIGHEST_PROTOCOL
+    pickle_protocol = 3
Member Author commented:
We could use pickle.DEFAULT_PROTOCOL too, but let me stick with a constant, since protocol 4 seems to have this bug.
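
For context, a minimal sketch of the distinction, assuming CPython 3.4-3.7 (where pickle.DEFAULT_PROTOCOL is 3 and pickle.HIGHEST_PROTOCOL is 4); the PR itself only pins the module-level constant, and the payload here is illustrative:

    import pickle

    # Assumption: CPython 3.4-3.7, where DEFAULT_PROTOCOL == 3 and
    # HIGHEST_PROTOCOL == 4. Pinning to 3 avoids protocol-4 opcodes
    # entirely; DEFAULT_PROTOCOL happens to be 3 today but can change
    # across Python releases.
    data = [[1, 2, 3, 4]] * 100

    p3 = pickle.dumps(data, protocol=3)
    p4 = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)

    # Both round-trip fine in CPython itself; the suspected bug is in
    # the external (JVM-side) unpickler, not in pickle.
    assert pickle.loads(p3) == data
    assert pickle.loads(p4) == data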

dongjoon-hyun (Member) commented on May 3, 2019:

It looks nice and solid. BTW, do you think we could have a pointer to the upstream bug issue against pickle?

Member Author replied:
Do you mean a bug issue tracking the root cause somewhere? I think it's more likely an issue within the Pyrolite library, but I am not 100% sure yet. I am looking into this to identify the cause and will add a pointer once I have it.
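
One way to dig into that (a sketch, not from the PR; the payload and protocol list are illustrative) is to compare the opcode streams the JVM-side unpickler has to handle under each protocol, since protocol 4 introduced new opcodes such as FRAME, SHORT_BINUNICODE, and MEMOIZE where an external unpickler could diverge from CPython:

    import pickle
    import pickletools

    # Illustrative payload mirroring the failing case (lists of ints).
    payload = [[1, 2, 3, 4]] * 2

    # Dump the opcode stream for each protocol; the protocol-4 output
    # shows FRAME/MEMOIZE opcodes that protocols 2 and 3 lack.
    for proto in (2, 3, 4):
        print("--- protocol %d ---" % proto)
        pickletools.dis(pickle.dumps(payload, protocol=proto))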


 from pyspark import cloudpickle
 from pyspark.util import _exception_message
python/pyspark/sql/tests/test_serde.py (6 changes: 6 additions & 0 deletions)

@@ -126,6 +126,12 @@ def test_BinaryType_serialization(self):
         df = self.spark.createDataFrame(data, schema=schema)
         df.collect()

+    def test_int_array_serialization(self):
+        # Note that this test seems dependent on parallelism.
+        data = self.spark.sparkContext.parallelize([[1, 2, 3, 4]] * 100, numSlices=12)
+        df = self.spark.createDataFrame(data, "array<integer>")
+        self.assertEqual(len(list(filter(lambda r: None in r.value, df.collect()))), 0)
+

 if __name__ == "__main__":
     import unittest
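
As a usage note (the exact command is an assumption based on Spark's Python developer tooling, not part of this diff), the new test can be exercised on its own through the dedicated test runner:

    # Run only the serde test module via Spark's Python test runner.
    # Assumes you are at the root of a Spark checkout with pyspark built.
    ./python/run-tests --testnames 'pyspark.sql.tests.test_serde'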