From 7c62c5b6daec1d35f0f75bac56163323e8559596 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Mon, 5 Oct 2020 16:14:45 -0700
Subject: [PATCH 1/3] Remove patch to fix Spark master build

---
 ci/docker/conda-python-spark.dockerfile   |  4 --
 ci/etc/integration_spark_ARROW-9438.patch | 72 -----------------------
 2 files changed, 76 deletions(-)
 delete mode 100644 ci/etc/integration_spark_ARROW-9438.patch

diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile
index a20f1ff3521..d3f0a224582 100644
--- a/ci/docker/conda-python-spark.dockerfile
+++ b/ci/docker/conda-python-spark.dockerfile
@@ -36,10 +36,6 @@ ARG spark=master
 COPY ci/scripts/install_spark.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_spark.sh ${spark} /spark
 
-# patch spark to build with current Arrow Java
-COPY ci/etc/integration_spark_ARROW-9438.patch /arrow/ci/etc/
-RUN patch -d /spark -p1 -i /arrow/ci/etc/integration_spark_ARROW-9438.patch
-
 # build cpp with tests
 ENV CC=gcc \
     CXX=g++ \
diff --git a/ci/etc/integration_spark_ARROW-9438.patch b/ci/etc/integration_spark_ARROW-9438.patch
deleted file mode 100644
index 2baed303717..00000000000
--- a/ci/etc/integration_spark_ARROW-9438.patch
+++ /dev/null
@@ -1,72 +0,0 @@
-From 0b5388a945a7e5c5706cf00d0754540a6c68254d Mon Sep 17 00:00:00 2001
-From: Bryan Cutler
-Date: Mon, 13 Jul 2020 23:12:25 -0700
-Subject: [PATCH] Update Arrow Java for 1.0.0
-
----
- pom.xml              | 17 ++++++++++++++---
- sql/catalyst/pom.xml |  4 ++++
- 2 files changed, 18 insertions(+), 3 deletions(-)
-
-diff --git a/pom.xml b/pom.xml
-index 08ca13bfe9..6619fca200 100644
---- a/pom.xml
-+++ b/pom.xml
-@@ -199,7 +199,7 @@
-         If you are changing Arrow version specification, please check ./python/pyspark/sql/utils.py,
-         and ./python/setup.py too.
-    -->
--    <arrow.version>0.15.1</arrow.version>
-+    <arrow.version>1.0.0-SNAPSHOT</arrow.version>
-
-    org.fusesource.leveldbjni
-
-@@ -2288,7 +2288,7 @@
-
-
-          <groupId>com.fasterxml.jackson.core</groupId>
--          <artifactId>jackson-databind</artifactId>
-+          <artifactId>jackson-core</artifactId>
-
-
-          <groupId>io.netty</groupId>
-@@ -2298,9 +2298,20 @@
-          <groupId>io.netty</groupId>
-          <artifactId>netty-common</artifactId>
-
-+
-+
-+
-+        <groupId>org.apache.arrow</groupId>
-+        <artifactId>arrow-memory-netty</artifactId>
-+        <version>${arrow.version}</version>
-+
-
-
-          <groupId>io.netty</groupId>
--          <artifactId>netty-handler</artifactId>
-+          <artifactId>netty-buffer</artifactId>
-+
-+
-+        <groupId>io.netty</groupId>
-+        <artifactId>netty-common</artifactId>
-
-
-diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml
-index 9edbb7fec9..6b79eb722f 100644
---- a/sql/catalyst/pom.xml
-+++ b/sql/catalyst/pom.xml
-@@ -117,6 +117,10 @@
-      <groupId>org.apache.arrow</groupId>
-      <artifactId>arrow-vector</artifactId>
-
-+
-+      <groupId>org.apache.arrow</groupId>
-+      <artifactId>arrow-memory-netty</artifactId>
-+
-
-
-     target/scala-${scala.binary.version}/classes
--- 
-2.17.1
-

From ed31bb2fea84d753539752aae3e80ae800c69ab6 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Tue, 6 Oct 2020 15:44:45 -0700
Subject: [PATCH 2/3] Add Spark patch to correct nested timestamps

---
 ci/docker/conda-python-spark.dockerfile    |  4 +++
 ci/etc/integration_spark_ARROW-10178.patch | 37 ++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 ci/etc/integration_spark_ARROW-10178.patch

diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile
index d3f0a224582..4d1a6933f03 100644
--- a/ci/docker/conda-python-spark.dockerfile
+++ b/ci/docker/conda-python-spark.dockerfile
@@ -36,6 +36,10 @@ ARG spark=master
 COPY ci/scripts/install_spark.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_spark.sh ${spark} /spark
 
+# patch spark for nested timestamp correction, remove after SPARK-32285
+COPY ci/etc/integration_spark_ARROW-10178.patch /arrow/ci/etc/
+RUN patch -d /spark -p1 -i /arrow/ci/etc/integration_spark_ARROW-10178.patch
+
 # build cpp with tests
 ENV CC=gcc \
     CXX=g++ \
diff --git a/ci/etc/integration_spark_ARROW-10178.patch b/ci/etc/integration_spark_ARROW-10178.patch
new file mode 100644
index 00000000000..39bcc0bac3a
--- /dev/null
+++ b/ci/etc/integration_spark_ARROW-10178.patch
@@ -0,0 +1,37 @@
+diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
+index 42562e1fb9..d00b67e99b 100644
+--- a/python/pyspark/sql/pandas/serializers.py
++++ b/python/pyspark/sql/pandas/serializers.py
+@@ -120,15 +120,30 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
+ 
+     def arrow_to_pandas(self, arrow_column):
+         from pyspark.sql.pandas.types import _check_series_localize_timestamps
+-        import pyarrow
++        import pyarrow as pa
+ 
+         # If the given column is a date type column, creates a series of datetime.date directly
+         # instead of creating datetime64[ns] as intermediate data to avoid overflow caused by
+         # datetime64[ns] type handling.
+         s = arrow_column.to_pandas(date_as_object=True)
+ 
+-        if pyarrow.types.is_timestamp(arrow_column.type):
++        if pa.types.is_timestamp(arrow_column.type):
+             return _check_series_localize_timestamps(s, self._timezone)
++        elif pa.types.is_struct(arrow_column.type):
++            if isinstance(arrow_column, pa.ChunkedArray):
++                arrow_column = pa.concat_arrays(arrow_column.iterchunks())
++            children = []
++            names = []
++            for f in arrow_column.type:
++                child = arrow_column.field(f.name)
++                if pa.types.is_timestamp(child.type):
++                    child_series = child.to_pandas()
++                    child_series = _check_series_localize_timestamps(child_series, self._timezone)
++                    child = pa.array(child_series, type=pa.timestamp('us'))
++                children.append(child)
++                names.append(f.name)
++            arr = pa.StructArray.from_arrays(children, names)
++            return arr.to_pandas(date_as_object=True)
+         else:
+             return s
+ 

From 6a064e5084029ac8407836a1bd56b3437599761a Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Tue, 6 Oct 2020 23:06:32 -0700
Subject: [PATCH 3/3] Revert "Add Spark patch to correct nested timestamps"

This reverts commit ed31bb2fea84d753539752aae3e80ae800c69ab6.
---
 ci/docker/conda-python-spark.dockerfile    |  4 ---
 ci/etc/integration_spark_ARROW-10178.patch | 37 ----------------------
 2 files changed, 41 deletions(-)
 delete mode 100644 ci/etc/integration_spark_ARROW-10178.patch

diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile
index 4d1a6933f03..d3f0a224582 100644
--- a/ci/docker/conda-python-spark.dockerfile
+++ b/ci/docker/conda-python-spark.dockerfile
@@ -36,10 +36,6 @@ ARG spark=master
 COPY ci/scripts/install_spark.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_spark.sh ${spark} /spark
 
-# patch spark for nested timestamp correction, remove after SPARK-32285
-COPY ci/etc/integration_spark_ARROW-10178.patch /arrow/ci/etc/
-RUN patch -d /spark -p1 -i /arrow/ci/etc/integration_spark_ARROW-10178.patch
-
 # build cpp with tests
 ENV CC=gcc \
     CXX=g++ \
diff --git a/ci/etc/integration_spark_ARROW-10178.patch b/ci/etc/integration_spark_ARROW-10178.patch
deleted file mode 100644
index 39bcc0bac3a..00000000000
--- a/ci/etc/integration_spark_ARROW-10178.patch
+++ /dev/null
@@ -1,37 +0,0 @@
-diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
-index 42562e1fb9..d00b67e99b 100644
---- a/python/pyspark/sql/pandas/serializers.py
-+++ b/python/pyspark/sql/pandas/serializers.py
-@@ -120,15 +120,30 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
- 
-     def arrow_to_pandas(self, arrow_column):
-         from pyspark.sql.pandas.types import _check_series_localize_timestamps
--        import pyarrow
-+        import pyarrow as pa
- 
-         # If the given column is a date type column, creates a series of datetime.date directly
-         # instead of creating datetime64[ns] as intermediate data to avoid overflow caused by
-         # datetime64[ns] type handling.
-         s = arrow_column.to_pandas(date_as_object=True)
- 
--        if pyarrow.types.is_timestamp(arrow_column.type):
-+        if pa.types.is_timestamp(arrow_column.type):
-             return _check_series_localize_timestamps(s, self._timezone)
-+        elif pa.types.is_struct(arrow_column.type):
-+            if isinstance(arrow_column, pa.ChunkedArray):
-+                arrow_column = pa.concat_arrays(arrow_column.iterchunks())
-+            children = []
-+            names = []
-+            for f in arrow_column.type:
-+                child = arrow_column.field(f.name)
-+                if pa.types.is_timestamp(child.type):
-+                    child_series = child.to_pandas()
-+                    child_series = _check_series_localize_timestamps(child_series, self._timezone)
-+                    child = pa.array(child_series, type=pa.timestamp('us'))
-+                children.append(child)
-+                names.append(f.name)
-+            arr = pa.StructArray.from_arrays(children, names)
-+            return arr.to_pandas(date_as_object=True)
-         else:
-             return s
-
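
A note on the technique in the reverted ARROW-10178 patch: arrow_to_pandas round-trips each
timestamp child of a struct column through pandas so the session timezone can be applied, then
rebuilds the struct before the final conversion. Below is a minimal standalone sketch of that
idea, assuming pyarrow >= 1.0.0. The function name localize_struct_timestamps and the explicit
UTC-to-local conversion are illustrative stand-ins for PySpark's internal
_check_series_localize_timestamps; they are not part of either codebase.

import pandas as pd
import pyarrow as pa


def localize_struct_timestamps(arrow_column, timezone):
    # Collapse chunks so struct children are addressable via .field(),
    # mirroring the pa.concat_arrays step in the patch above.
    if isinstance(arrow_column, pa.ChunkedArray):
        arrow_column = pa.concat_arrays(arrow_column.iterchunks())
    children, names = [], []
    for f in arrow_column.type:
        child = arrow_column.field(f.name)
        if pa.types.is_timestamp(child.type):
            # Stand-in for PySpark's _check_series_localize_timestamps:
            # interpret the naive values as UTC, shift to the target zone,
            # then drop the tz so the values are naive local times again.
            series = (child.to_pandas()
                      .dt.tz_localize('UTC')
                      .dt.tz_convert(timezone)
                      .dt.tz_localize(None))
            child = pa.array(series, type=pa.timestamp('us'))
        children.append(child)
        names.append(f.name)
    # Rebuild the struct and convert once, keeping dates as objects.
    return pa.StructArray.from_arrays(children, names).to_pandas(date_as_object=True)


struct_type = pa.struct([('ts', pa.timestamp('us')), ('id', pa.int64())])
col = pa.chunked_array([pa.array(
    [{'ts': pd.Timestamp('2020-10-06 12:00:00'), 'id': 1}], type=struct_type)])
print(localize_struct_timestamps(col, 'America/Los_Angeles'))

The per-field copy is part of why this stayed a CI-only workaround: the Dockerfile comment in
patch 2 ties its removal to the upstream fix tracked as SPARK-32285.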