From b8bf7b93d5851f9562329ce265c002c74ece945f Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Wed, 18 Mar 2026 07:41:11 +0000 Subject: [PATCH] test: add tests for SPARK-55056 triple nested array empty outer segfault --- python/pyspark/sql/tests/arrow/test_arrow.py | 35 ++++++++++++++++ .../execution/arrow/ArrowWriterSuite.scala | 41 +++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/python/pyspark/sql/tests/arrow/test_arrow.py b/python/pyspark/sql/tests/arrow/test_arrow.py index 4189860181a8d..db74f303e3d43 100644 --- a/python/pyspark/sql/tests/arrow/test_arrow.py +++ b/python/pyspark/sql/tests/arrow/test_arrow.py @@ -1871,6 +1871,41 @@ def test_toArrow_with_compression_codec_large_dataset(self): self.assertEqual(t.num_rows, 10000) self.assertEqual(t.column_names, ["id", "str_col", "mod_col"]) + def test_toPandas_double_nested_array_empty_outer(self): + schema = StructType([StructField("data", ArrayType(ArrayType(StringType())))]) + df = self.spark.createDataFrame([Row(data=[])], schema=schema) + pdf = df.toPandas() + self.assertEqual(len(pdf), 1) + self.assertEqual(len(pdf["data"][0]), 0) + + def test_toPandas_array_of_map_empty_outer(self): + schema = StructType([StructField("data", ArrayType(MapType(StringType(), StringType())))]) + df = self.spark.createDataFrame([Row(data=[])], schema=schema) + pdf = df.toPandas() + self.assertEqual(len(pdf), 1) + self.assertEqual(len(pdf["data"][0]), 0) + + def test_toPandas_triple_nested_array_empty_outer(self): + # SPARK-55056: This used to trigger SIGSEGV before the upstream arrow-java fix. + # When the outer array is empty, the second-level ArrayWriter is never + # invoked, so its count stays 0. Arrow format requires ListArray offset + # buffer to have N+1 entries even when N=0, but getBufferSizeFor(0) + # returns 0 and the buffer is omitted in IPC serialization.
+ schema = StructType([StructField("data", ArrayType(ArrayType(ArrayType(StringType()))))]) + df = self.spark.createDataFrame([Row(data=[])], schema=schema) + pdf = df.toPandas() + self.assertEqual(len(pdf), 1) + self.assertEqual(len(pdf["data"][0]), 0) + + def test_toPandas_nested_array_with_map_empty_outer(self): + schema = StructType( + [StructField("data", ArrayType(ArrayType(MapType(StringType(), StringType()))))] + ) + df = self.spark.createDataFrame([Row(data=[])], schema=schema) + pdf = df.toPandas() + self.assertEqual(len(pdf), 1) + self.assertEqual(len(pdf["data"][0]), 0) + @unittest.skipIf( not have_pandas or not have_pyarrow, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index 2c0c0494bbacf..c3e9af54d431e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -875,4 +875,45 @@ class ArrowWriterSuite extends SparkFunSuite { assert(map2.keyArray().array().mkString(",") == Array(1).mkString(",")) assert(stringRepr(map2) == Array("bob", "40").mkString(",")) } + + test("SPARK-55056: triple nested array with empty outer array") { + // Schema: array<array<array<string>>> + // This used to trigger SIGSEGV before the upstream arrow-java fix. + // When the outer array is empty, the second-level ArrayWriter is never + // invoked, so its count stays 0. Arrow format requires ListArray offset + // buffer to have N+1 entries even when N=0, but getBufferSizeFor(0) + // returns 0 and the buffer is omitted.
+ val schema = new StructType() + .add("data", ArrayType(ArrayType(ArrayType(StringType)))) + val writer = ArrowWriter.create(schema, null) + assert(writer.schema === schema) + + // Write a row with an empty outer array + writer.write(InternalRow(ArrayData.toArrayData(Array.empty))) + writer.finish() + + val reader = new ArrowColumnVector(writer.root.getFieldVectors().get(0)) + val array0 = reader.getArray(0) + assert(array0.numElements() === 0) + + writer.root.close() + } + + test("SPARK-55056: nested array with map inside empty outer array") { + // Schema: array<array<map<string, string>>> + val schema = new StructType() + .add("data", ArrayType(ArrayType(MapType(StringType, StringType)))) + val writer = ArrowWriter.create(schema, null) + assert(writer.schema === schema) + + // Write a row with an empty outer array + writer.write(InternalRow(ArrayData.toArrayData(Array.empty))) + writer.finish() + + val reader = new ArrowColumnVector(writer.root.getFieldVectors().get(0)) + val array0 = reader.getArray(0) + assert(array0.numElements() === 0) + + writer.root.close() + } }