From bd08d0ecbe355b9e0de7d07e8b9ff6ccdb150e73 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 31 Jan 2020 07:07:54 +0900 Subject: [PATCH 01/26] ARROW-7712: [CI] [Crossbow] Delete fuzzit jobs We will focus on OSS-Fuzz, now that we have been accepted there. Also add an OSS-Fuzz badge in the README. Closes #6328 from pitrou/ARROW-7712-delete-fuzzit-jobs and squashes the following commits: 6c6d2c462 Remove remaining fuzzit-related configuration a6abaa9ba ARROW-7712: Delete fuzzit jobs Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- README.md | 2 +- ci/docker/linux-apt-fuzzit.dockerfile | 47 --------------------------- ci/scripts/fuzzit_build.sh | 46 -------------------------- dev/tasks/tasks.yml | 29 ----------------- docker-compose.yml | 24 -------------- 5 files changed, 1 insertion(+), 147 deletions(-) delete mode 100644 ci/docker/linux-apt-fuzzit.dockerfile delete mode 100755 ci/scripts/fuzzit_build.sh diff --git a/README.md b/README.md index f7dbb5852c1..e6b5718a8c6 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ [![Build Status](https://ci.appveyor.com/api/projects/status/github/apache/arrow/branch/master?svg=true)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/arrow/branch/master) [![Coverage Status](https://codecov.io/gh/apache/arrow/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow?branch=master) -[![Fuzzit Status](https://app.fuzzit.dev/badge?org_id=yMxZh42xl9qy6bvg3EiJ&branch=master)](https://app.fuzzit.dev/admin/yMxZh42xl9qy6bvg3EiJ/dashboard) +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow) [![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/master/LICENSE.txt) [![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow) diff --git a/ci/docker/linux-apt-fuzzit.dockerfile b/ci/docker/linux-apt-fuzzit.dockerfile deleted file mode 100644 index 5cb14020ae4..00000000000 --- a/ci/docker/linux-apt-fuzzit.dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG base -FROM ${base} - -RUN apt-get update && \ - apt-get install -y -q \ - clang-7 \ - libclang-7-dev \ - clang-format-7 \ - clang-tidy-7 \ - clang-tools-7 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN wget -O /usr/local/bin/fuzzit https://github.com/fuzzitdev/fuzzit/releases/latest/download/fuzzit_Linux_x86_64 && \ - chmod a+x /usr/local/bin/fuzzit - -ENV ARROW_FUZZING="ON" \ - ARROW_USE_ASAN="ON" \ - CC="clang-7" \ - CXX="clang++-7" \ - ARROW_BUILD_TYPE="RelWithDebInfo" \ - ARROW_FLIGHT="OFF" \ - ARROW_GANDIVA="OFF" \ - ARROW_ORC="OFF" \ - ARROW_PARQUET="OFF" \ - ARROW_PLASMA="OFF" \ - ARROW_WITH_BZ2="OFF" \ - ARROW_WITH_ZSTD="OFF" \ - ARROW_BUILD_BENCHMARKS="OFF" \ - ARROW_BUILD_UTILITIES="OFF" diff --git a/ci/scripts/fuzzit_build.sh b/ci/scripts/fuzzit_build.sh deleted file mode 100755 index b73c49cddb4..00000000000 --- a/ci/scripts/fuzzit_build.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -euxo pipefail - -export TARGET_ID=apache-arrow/arrow-ipc-fuzzing - -source_dir=${1}/cpp -build_dir=${2}/cpp - -pushd ${build_dir}/relwithdebinfo - -mkdir -p out - -cp arrow-ipc-fuzzing-test out/fuzzer -ldd arrow-ipc-fuzzing-test | grep "=> /" | awk '{print $3}' | xargs -I '{}' cp -v '{}' out/ - -pushd out -tar -czvf fuzzer.tar.gz * -stat fuzzer.tar.gz -popd - -fuzzit create job \ - --type $FUZZIT_JOB_TYPE \ - --host $FUZZIT_HOST \ - --revision $CI_ARROW_SHA \ - --branch $CI_ARROW_BRANCH \ - $TARGET_ID \ - out/fuzzer.tar.gz - -popd diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 7142491fea6..38023f7f6a4 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -122,8 +122,6 @@ groups: - test-ubuntu-18.04-r-sanitizer - test-debian-10-go-1.12 - test-ubuntu-18.04-docs - - test-ubuntu-fuzzit-fuzzing - - test-ubuntu-fuzzit-regression - test-conda-python-2.7-pandas-latest - test-conda-python-2.7-pandas-master - test-conda-python-3.7-pandas-latest @@ -300,9 +298,6 @@ groups: - test-ubuntu-18.04-r-sanitizer - test-debian-10-go-1.12 - test-ubuntu-18.04-docs - # https://issues.apache.org/jira/browse/ARROW-7712 - # - test-ubuntu-fuzzit-fuzzing - # - test-ubuntu-fuzzit-regression - test-conda-python-2.7-pandas-latest - test-conda-python-3.7-pandas-latest - test-conda-python-3.7-pandas-master @@ -2058,30 +2053,6 @@ tasks: - docker-compose build ubuntu-docs - docker-compose run ubuntu-docs - test-ubuntu-fuzzit-fuzzing: - ci: circle - platform: linux - template: docker-tests/circle.linux.yml - params: - commands: - - docker-compose pull --ignore-pull-failures ubuntu-cpp - - docker-compose pull --ignore-pull-failures ubuntu-fuzzit - - docker-compose build ubuntu-cpp - - docker-compose build ubuntu-fuzzit - - docker-compose run -e FUZZIT_JOB_TYPE=fuzzing ubuntu-fuzzit - - 
test-ubuntu-fuzzit-regression: - ci: circle - platform: linux - template: docker-tests/circle.linux.yml - params: - commands: - - docker-compose pull --ignore-pull-failures ubuntu-cpp - - docker-compose pull --ignore-pull-failures ubuntu-fuzzit - - docker-compose build ubuntu-cpp - - docker-compose build ubuntu-fuzzit - - docker-compose run -e FUZZIT_JOB_TYPE=regression ubuntu-fuzzit - ############################## Integration tests ############################ test-conda-python-2.7-pandas-latest: diff --git a/docker-compose.yml b/docker-compose.yml index 4eb92bfa14c..108a039adca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -951,30 +951,6 @@ services: command: > /bin/bash -c "pip install -e /arrow/dev/archery && archery lint" - ubuntu-fuzzit: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-fuzzit - # docker-compose run ubuntu-fuzzit - build: - context: . - dockerfile: ci/docker/linux-apt-fuzzit.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-fuzzit - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - environment: - <<: *ccache - FUZZIT_API_KEY: c0b760d37db6012fcaafd8ca5f412ba7bcd297ac969650502994b51aa11798153824442f999a067e1ef67821989ed664 - FUZZIT_HOST: bionic-llvm7 - CI_ARROW_SHA: ${CI_ARROW_SHA:-UNSET} - CI_ARROW_BRANCH: ${CI_ARROW_BRANCH:-UNSET} - volumes: *debian-volumes - command: > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/fuzzit_build.sh /arrow /build" - ######################### Integration Tests ################################# postgres: From 942a4d0324a2d8d77a0f06eb54e6e5f5f45e3ce0 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Sun, 2 Feb 2020 19:39:48 -0800 Subject: [PATCH 02/26] ARROW-6738: [Java] Fix problems with current union comparison logic There are some problems with the current union comparison logic. For example: 1. For type check, we should not require fields to be equal. It is possible that two vectors' value ranges are equal but their fields are different. 2. We should not compare the number of sub vectors, as it is possible that two union vectors have different numbers of sub vectors, but have equal values in the range. 
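To make the element-wise comparison idea above concrete, here is a minimal, self-contained Java sketch. It is not the Arrow implementation (see the RangeEqualsVisitor changes in this patch for that); the UnionSlot and rangeEquals names are hypothetical, and the point is only that each position in the compared range is resolved to its own type tag and value, so two unions can match over a sub-range even when their children or values outside the range differ.

import java.util.List;
import java.util.Objects;

// Illustrative model only: a "union slot" is a (type tag, value) pair, similar in
// spirit to one entry of a union vector. Names here are hypothetical, not Arrow API.
final class UnionSlot {
  final String typeTag; // which child type holds the value, e.g. "INT" or "UINT4"
  final Object value;   // the value itself, null when the slot is null

  UnionSlot(String typeTag, Object value) {
    this.typeTag = typeTag;
    this.value = value;
  }
}

public final class UnionRangeEqualsSketch {
  // Compares `length` slots starting at leftStart/rightStart, element by element:
  // each pair must carry the same type tag and an equal value. The number of child
  // types and any positions outside the range are deliberately ignored.
  static boolean rangeEquals(List<UnionSlot> left, int leftStart,
                             List<UnionSlot> right, int rightStart, int length) {
    for (int i = 0; i < length; i++) {
      UnionSlot l = left.get(leftStart + i);
      UnionSlot r = right.get(rightStart + i);
      if (!l.typeTag.equals(r.typeTag) || !Objects.equals(l.value, r.value)) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    List<UnionSlot> a = List.of(
        new UnionSlot("UINT4", 10), new UnionSlot("INT", 20),
        new UnionSlot("INT", 20), new UnionSlot("INT", 20));
    List<UnionSlot> b = List.of(
        new UnionSlot("UINT4", 10), new UnionSlot("INT", 20),
        new UnionSlot("INT", 20), new UnionSlot("UINT4", 10));
    System.out.println(rangeEquals(a, 0, b, 0, 4)); // false: last slot differs
    System.out.println(rangeEquals(a, 1, b, 1, 2)); // true: the sub-range matches
  }
}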
Closes #5544 from liyafan82/fly_0930_share and squashes the following commits: d6ef3d220 Refine test case c008289ec Resolve test failure after rebasing c5153931e Rule out the change for union type comparison bab74028f Compare fields for all vectors except union vectors 5b2225e3c Fix the bug with decimal vector 4d8b570df Fix problems with current union comparison logic Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../main/codegen/templates/UnionVector.java | 2 +- .../vector/compare/ApproxEqualsVisitor.java | 6 ++- .../vector/compare/RangeEqualsVisitor.java | 46 +++++++++-------- .../compare/TestRangeEqualsVisitor.java | 51 +++++++++++++++++++ 4 files changed, 82 insertions(+), 23 deletions(-) diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index db3c8a89f5e..2c28c66beb3 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -550,7 +550,7 @@ public Iterator iterator() { return vectors.iterator(); } - private ValueVector getVector(int index) { + public ValueVector getVector(int index) { int type = typeBuffer.getByte(index * TYPE_WIDTH); switch (MinorType.values()[type]) { case NULL: diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java index ab5ce94e7e3..bcf8c64e0ce 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java @@ -109,8 +109,10 @@ public Boolean visit(BaseFixedWidthVector left, Range range) { } @Override - protected ApproxEqualsVisitor createInnerVisitor(ValueVector left, ValueVector right) { - return new ApproxEqualsVisitor(left, right, floatDiffFunction.clone(), doubleDiffFunction.clone()); + protected ApproxEqualsVisitor createInnerVisitor( + ValueVector left, ValueVector right, + BiFunction typeComparator) { + return new ApproxEqualsVisitor(left, right, floatDiffFunction.clone(), doubleDiffFunction.clone(), typeComparator); } private boolean float4ApproxEquals(Range range) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index cbba0b82039..49fa69f5ac7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -24,7 +24,6 @@ import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; -import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; @@ -193,12 +192,9 @@ public Boolean visit(NullVector left, Range range) { return true; } - /** - * Creates a visitor to visit child vectors. - * It is used for complex vector types. - * @return the visitor for child vecors. 
- */ - protected RangeEqualsVisitor createInnerVisitor(ValueVector leftInner, ValueVector rightInner) { + protected RangeEqualsVisitor createInnerVisitor( + ValueVector leftInner, ValueVector rightInner, + BiFunction typeComparator) { return new RangeEqualsVisitor(leftInner, rightInner, typeComparator); } @@ -206,16 +202,23 @@ protected boolean compareUnionVectors(Range range) { UnionVector leftVector = (UnionVector) left; UnionVector rightVector = (UnionVector) right; - List leftChildren = leftVector.getChildrenFromFields(); - List rightChildren = rightVector.getChildrenFromFields(); - - if (leftChildren.size() != rightChildren.size()) { - return false; - } - - for (int k = 0; k < leftChildren.size(); k++) { - RangeEqualsVisitor visitor = createInnerVisitor(leftChildren.get(k), rightChildren.get(k)); - if (!visitor.rangeEquals(range)) { + Range subRange = new Range(0, 0, 1); + for (int i = 0; i < range.getLength(); i++) { + subRange.setLeftStart(range.getLeftStart() + i).setRightStart(range.getRightStart() + i); + ValueVector leftSubVector = leftVector.getVector(range.getLeftStart() + i); + ValueVector rightSubVector = rightVector.getVector(range.getRightStart() + i); + + if (leftSubVector == null || rightSubVector == null) { + if (leftSubVector == rightSubVector) { + continue; + } else { + return false; + } + } + TypeEqualsVisitor typeVisitor = new TypeEqualsVisitor(rightSubVector); + RangeEqualsVisitor visitor = + createInnerVisitor(leftSubVector, rightSubVector, (left, right) -> typeVisitor.equals(left)); + if (!visitor.rangeEquals(subRange)) { return false; } } @@ -232,7 +235,8 @@ protected boolean compareStructVectors(Range range) { } for (String name : leftChildNames) { - RangeEqualsVisitor visitor = createInnerVisitor(leftVector.getChild(name), rightVector.getChild(name)); + RangeEqualsVisitor visitor = + createInnerVisitor(leftVector.getChild(name), rightVector.getChild(name), /*type comparator*/ null); if (!visitor.rangeEquals(range)) { return false; } @@ -311,7 +315,8 @@ protected boolean compareListVectors(Range range) { ListVector leftVector = (ListVector) left; ListVector rightVector = (ListVector) right; - RangeEqualsVisitor innerVisitor = createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector()); + RangeEqualsVisitor innerVisitor = + createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); Range innerRange = new Range(); for (int i = 0; i < range.getLength(); i++) { @@ -357,7 +362,8 @@ protected boolean compareFixedSizeListVectors(Range range) { } int listSize = leftVector.getListSize(); - RangeEqualsVisitor innerVisitor = createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector()); + RangeEqualsVisitor innerVisitor = + createInnerVisitor(leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); Range innerRange = new Range(0, 0, listSize); for (int i = 0; i < range.getLength(); i++) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index 183c8abac18..c35fd4107a9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -282,6 +282,57 @@ public void testUnionVectorRangeEquals() { } } + /** + * Test comparing two union vectors. 
+ * The two vectors are different in total, but have a range with equal values. + */ + @Test + public void testUnionVectorSubRangeEquals() { + try (final UnionVector vector1 = new UnionVector("union", allocator, null, null); + final UnionVector vector2 = new UnionVector("union", allocator, null, null);) { + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 10; + uInt4Holder.isSet = 1; + + final NullableIntHolder intHolder = new NullableIntHolder(); + intHolder.value = 20; + intHolder.isSet = 1; + + vector1.setType(0, Types.MinorType.UINT4); + vector1.setSafe(0, uInt4Holder); + + vector1.setType(1, Types.MinorType.INT); + vector1.setSafe(1, intHolder); + + vector1.setType(2, Types.MinorType.INT); + vector1.setSafe(2, intHolder); + + vector1.setType(3, Types.MinorType.INT); + vector1.setSafe(3, intHolder); + + vector1.setValueCount(4); + + vector2.setType(0, Types.MinorType.UINT4); + vector2.setSafe(0, uInt4Holder); + + vector2.setType(1, Types.MinorType.INT); + vector2.setSafe(1, intHolder); + + vector2.setType(2, Types.MinorType.INT); + vector2.setSafe(2, intHolder); + + vector2.setType(3, Types.MinorType.UINT4); + vector2.setSafe(3, uInt4Holder); + + vector2.setValueCount(4); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertFalse(visitor.rangeEquals(new Range(0, 0, 4))); + assertTrue(visitor.rangeEquals(new Range(1, 1, 2))); + } + } + @Ignore @Test public void testEqualsWithOutTypeCheck() { From 49aada21d6f4a0e0e0a8cc651884fe402eaeb5fd Mon Sep 17 00:00:00 2001 From: David Li Date: Sun, 2 Feb 2020 19:42:55 -0800 Subject: [PATCH 03/26] ARROW-7734: [C++] check status details for nullptr in equality When checking statuses for equality, check to make sure that both have status detail objects before proceeding to compare those objects. Closes #6332 from lidavidm/arrow-7734 and squashes the following commits: 9084bcec7 ARROW-7734: check status details for nullptr in equality Authored-by: David Li Signed-off-by: Micah Kornfield --- cpp/src/arrow/status.h | 8 ++++++-- cpp/src/arrow/status_test.cc | 13 +++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index c44e311b7b2..df3ea6b9af6 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -292,6 +292,7 @@ class ARROW_EXPORT Status : public util::EqualityComparable, } bool IsExecutionError() const { return code() == StatusCode::ExecutionError; } + bool IsAlreadyExists() const { return code() == StatusCode::AlreadyExists; } /// \brief Return a string representation of this status suitable for printing. 
/// @@ -385,8 +386,11 @@ bool Status::Equals(const Status& s) const { return false; } - if (detail() != s.detail() && !(*detail() == *s.detail())) { - return false; + if (detail() != s.detail()) { + if ((detail() && !s.detail()) || (!detail() && s.detail())) { + return false; + } + return *detail() == *s.detail(); } return code() == s.code() && message() == s.message(); diff --git a/cpp/src/arrow/status_test.cc b/cpp/src/arrow/status_test.cc index 593ad2194bf..fc5a7ec45cf 100644 --- a/cpp/src/arrow/status_test.cc +++ b/cpp/src/arrow/status_test.cc @@ -114,4 +114,17 @@ TEST(StatusTest, TestEquality) { ASSERT_NE(Status::Invalid("error"), Status::Invalid("other error")); } +TEST(StatusTest, TestDetailEquality) { + const auto status_with_detail = + arrow::Status(StatusCode::IOError, "", std::make_shared()); + const auto status_with_detail2 = + arrow::Status(StatusCode::IOError, "", std::make_shared()); + const auto status_without_detail = arrow::Status::IOError(""); + + ASSERT_EQ(*status_with_detail.detail(), *status_with_detail2.detail()); + ASSERT_EQ(status_with_detail, status_with_detail2); + ASSERT_NE(status_with_detail, status_without_detail); + ASSERT_NE(status_without_detail, status_with_detail); +} + } // namespace arrow From af24bb7ec0a73b725d5b0ac54f14a27b938eb377 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sun, 2 Feb 2020 19:47:01 -0800 Subject: [PATCH 04/26] ARROW-6724: [C++] Allow simpler BufferOutputStream creation Make the initial capacity argument optional. Closes #6327 from pitrou/ARROW-6724-bufferoutstream-simpler-ctor and squashes the following commits: b563f4601 ARROW-6724: Allow simpler BufferOutputStream creation Authored-by: Antoine Pitrou Signed-off-by: Micah Kornfield --- cpp/src/arrow/dataset/file_ipc_test.cc | 7 ++----- cpp/src/arrow/extension_type_test.cc | 4 ++-- cpp/src/arrow/io/compressed_test.cc | 2 +- cpp/src/arrow/io/memory.h | 2 +- cpp/src/arrow/ipc/feather_test.cc | 4 ++-- cpp/src/parquet/reader_test.cc | 2 +- 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/dataset/file_ipc_test.cc b/cpp/src/arrow/dataset/file_ipc_test.cc index 29cfc384a42..49fb6879008 100644 --- a/cpp/src/arrow/dataset/file_ipc_test.cc +++ b/cpp/src/arrow/dataset/file_ipc_test.cc @@ -34,7 +34,6 @@ namespace arrow { namespace dataset { -constexpr int64_t kDefaultOutputStreamSize = 1024; constexpr int64_t kBatchSize = 1UL << 12; constexpr int64_t kBatchRepetitions = 1 << 5; constexpr int64_t kNumRows = kBatchSize * kBatchRepetitions; @@ -42,8 +41,7 @@ constexpr int64_t kNumRows = kBatchSize * kBatchRepetitions; class ArrowIpcWriterMixin : public ::testing::Test { public: std::shared_ptr Write(std::vector readers) { - EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create( - kDefaultOutputStreamSize, default_memory_pool())); + EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create()); auto writer_schema = readers[0]->schema(); EXPECT_OK_AND_ASSIGN(auto writer, @@ -69,8 +67,7 @@ class ArrowIpcWriterMixin : public ::testing::Test { } std::shared_ptr Write(const Table& table) { - EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create( - kDefaultOutputStreamSize, default_memory_pool())); + EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create()); EXPECT_OK_AND_ASSIGN(auto writer, ipc::RecordBatchFileWriter::Open(sink.get(), table.schema())); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index aead8ab25e4..a26f57e667f 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ 
b/cpp/src/arrow/extension_type_test.cc @@ -217,7 +217,7 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { auto RoundtripBatch = [](const std::shared_ptr& batch, std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(), out_stream.get())); @@ -255,7 +255,7 @@ TEST_F(TestExtensionType, UnrecognizedExtension) { // Write full IPC stream including schema, then unregister type, then read // and ensure that a plain instance of the storage type is created - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(), out_stream.get())); diff --git a/cpp/src/arrow/io/compressed_test.cc b/cpp/src/arrow/io/compressed_test.cc index 71b2012b67d..df969feb3b6 100644 --- a/cpp/src/arrow/io/compressed_test.cc +++ b/cpp/src/arrow/io/compressed_test.cc @@ -127,7 +127,7 @@ void CheckCompressedInputStream(Codec* codec, const std::vector& data) void CheckCompressedOutputStream(Codec* codec, const std::vector& data, bool do_flush) { // Create compressed output stream - ASSERT_OK_AND_ASSIGN(auto buffer_writer, BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(auto buffer_writer, BufferOutputStream::Create()); ASSERT_OK_AND_ASSIGN(auto stream, CompressedOutputStream::Make(codec, buffer_writer)); ASSERT_OK_AND_EQ(0, stream->Tell()); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 185a972a6b6..0b710aff325 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -46,7 +46,7 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { /// \param[in,out] pool a MemoryPool to use for allocations /// \return the created stream static Result> Create( - int64_t initial_capacity, MemoryPool* pool = default_memory_pool()); + int64_t initial_capacity = 4096, MemoryPool* pool = default_memory_pool()); ARROW_DEPRECATED("Use Result-returning overload") static Status Create(int64_t initial_capacity, MemoryPool* pool, diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 74b94a10169..b2c2994aece 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -281,7 +281,7 @@ void CheckBatches(const RecordBatch& expected, const RecordBatch& result) { class TestTableReader : public ::testing::Test { public: void SetUp() { - ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); ASSERT_OK(TableWriter::Open(stream_, &writer_)); } @@ -356,7 +356,7 @@ TEST_F(TestTableReader, ReadNames) { class TestTableWriter : public ::testing::Test { public: void SetUp() { - ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create(1024)); + ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); ASSERT_OK(TableWriter::Open(stream_, &writer_)); } diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 3ec4246eac4..f28b22639d3 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -412,7 +412,7 @@ TEST(TestFileReader, BufferedReads) { std::shared_ptr writer_props = WriterProperties::Builder().write_batch_size(64)->data_pagesize(128)->build(); - ASSERT_OK_AND_ASSIGN(auto out_file, arrow::io::BufferOutputStream::Create(1024)); + 
ASSERT_OK_AND_ASSIGN(auto out_file, arrow::io::BufferOutputStream::Create()); std::shared_ptr file_writer = ParquetFileWriter::Open(out_file, schema, writer_props); From 85996e6c2e9af4700a3410b68d5a917605e24cf1 Mon Sep 17 00:00:00 2001 From: tianchen Date: Sun, 2 Feb 2020 19:51:30 -0800 Subject: [PATCH 05/26] ARROW-6871: [Java] Enhance TransferPair related parameters check and tests Related to [ARROW-6871](https://issues.apache.org/jira/browse/ARROW-6871). TransferPair related param checks in different classes have potential problems: i. splitAndTansfer has no indices check in classes like VarcharVector ii. splitAndTranser indices check in classes like UnionVector is not correct (Preconditions.checkArgument(startIndex + length <= valueCount)), should check params separately. iii. should add more UT to cover corner cases. Closes #5645 from tianchen92/ARROW-6871 and squashes the following commits: f3b897ddd fix style 0d3c7eab3 add benchmark 01f9a48f2 revert changes in copyFrom a22d58abb ARROW-6871: Enhance TransferPair related parameters check and tests Authored-by: tianchen Signed-off-by: Micah Kornfield --- .../sort/FixedWidthInPlaceVectorSorter.java | 1 + .../vector/util/TransferPairBenchmarks.java | 123 +++++++++++++ .../main/codegen/templates/UnionVector.java | 5 +- .../arrow/vector/BaseFixedWidthVector.java | 5 +- .../arrow/vector/BaseVariableWidthVector.java | 5 +- .../org/apache/arrow/vector/BitVector.java | 5 +- .../vector/complex/FixedSizeListVector.java | 5 +- .../arrow/vector/complex/ListVector.java | 5 +- .../arrow/vector/complex/StructVector.java | 5 +- .../arrow/vector/TestSplitAndTransfer.java | 166 ++++++++++++++++-- 10 files changed, 306 insertions(+), 19 deletions(-) create mode 100644 java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java index bac25cf8165..c5ecd8acb9b 100644 --- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java +++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthInPlaceVectorSorter.java @@ -46,6 +46,7 @@ public void sortInPlace(V vec, VectorValueComparator comparator) { this.comparator = comparator; this.pivotBuffer = (V) vec.getField().createVector(vec.getAllocator()); this.pivotBuffer.allocateNew(1); + this.pivotBuffer.setValueCount(1); comparator.attachVectors(vec, pivotBuffer); quickSort(); diff --git a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java new file mode 100644 index 00000000000..235eca53c84 --- /dev/null +++ b/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +/** + * Benchmarks for {@link TransferPair}. + */ +@State(Scope.Benchmark) +public class TransferPairBenchmarks { + + private static final int VECTOR_LENGTH = 1024; + + private static final int ALLOCATOR_CAPACITY = 1024 * 1024; + + private BufferAllocator allocator; + + private IntVector intVector; + + private VarCharVector varCharVector; + + /** + * Setup benchmarks. + */ + @Setup + public void prepare() { + allocator = new RootAllocator(ALLOCATOR_CAPACITY); + intVector = new IntVector("intVector", allocator); + varCharVector = new VarCharVector("varcharVector", allocator); + + intVector.allocateNew(VECTOR_LENGTH); + varCharVector.allocateNew(VECTOR_LENGTH); + + for (int i = 0; i < VECTOR_LENGTH; i++) { + if (i % 3 == 0) { + intVector.setNull(i); + varCharVector.setNull(i); + } else { + intVector.setSafe(i, i * i); + varCharVector.setSafe(i, ("teststring" + i).getBytes(StandardCharsets.UTF_8)); + } + } + intVector.setValueCount(VECTOR_LENGTH); + varCharVector.setValueCount(VECTOR_LENGTH); + } + + /** + * Tear down benchmarks. 
+ */ + @TearDown + public void tearDown() { + intVector.close(); + varCharVector.close();; + allocator.close(); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MICROSECONDS) + public int splitAndTransferIntVector() { + IntVector toVector = new IntVector("intVector", allocator); + toVector.setValueCount(VECTOR_LENGTH); + TransferPair transferPair = intVector.makeTransferPair(toVector); + transferPair.splitAndTransfer(0, VECTOR_LENGTH); + toVector.close(); + return 0; + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MICROSECONDS) + public int splitAndTransferVarcharVector() { + VarCharVector toVector = new VarCharVector("varcharVector", allocator); + toVector.setValueCount(VECTOR_LENGTH); + TransferPair transferPair = varCharVector.makeTransferPair(toVector); + transferPair.splitAndTransfer(0, VECTOR_LENGTH); + toVector.close(); + return 0; + } + + public static void main(String [] args) throws RunnerException { + Options opt = new OptionsBuilder() + .include(TransferPairBenchmarks.class.getSimpleName()) + .forks(1) + .build(); + + new Runner(opt).run(); + } +} diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 2c28c66beb3..d760fe53fc6 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -469,7 +469,10 @@ public void transfer() { @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); to.clear(); internalStructVectorTransferPair.splitAndTransfer(startIndex, length); final int startPoint = startIndex * TYPE_WIDTH; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index ed91d6e659b..a4e94bcac09 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -584,7 +584,10 @@ public void transferTo(BaseFixedWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseFixedWidthVector target) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 7e276445643..89395ef7e22 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -714,6 +714,10 @@ public void transferTo(BaseVariableWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseVariableWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", 
startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); @@ -750,7 +754,6 @@ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariab */ private void splitAndTransferValidityBuffer(int startIndex, int length, BaseVariableWidthVector target) { - Preconditions.checkArgument(startIndex + length <= valueCount); int firstByteSource = BitVectorHelper.byteIndex(startIndex); int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); int byteSizeTarget = getValidityBufferSizeFromCount(length); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 2fa5fdef4ef..b41c62677fc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -158,6 +158,10 @@ public int getBufferSize() { * @param target destination vector */ public void splitAndTransferTo(int startIndex, int length, BaseFixedWidthVector target) { + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); compareTypes(target, "splitAndTransferTo"); target.clear(); target.validityBuffer = splitAndTransferBuffer(startIndex, length, target, @@ -174,7 +178,6 @@ private ArrowBuf splitAndTransferBuffer( BaseFixedWidthVector target, ArrowBuf sourceBuffer, ArrowBuf destBuffer) { - assert startIndex + length <= valueCount; int firstByteSource = BitVectorHelper.byteIndex(startIndex); int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); int byteSizeTarget = getValidityBufferSizeFromCount(length); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index 653719feafe..8fa43fb06b7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -590,7 +590,10 @@ public void transfer() { @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); final int startPoint = listSize * startIndex; final int sliceLength = listSize * length; to.clear(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 59481d39399..312a3556082 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -493,7 +493,10 @@ public void transfer() { */ @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", 
length); final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index ec922a032a9..7b22835963a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -195,7 +195,10 @@ public void copyValueSafe(int fromIndex, int toIndex) { @Override public void splitAndTransfer(int startIndex, int length) { - Preconditions.checkArgument(startIndex + length <= valueCount); + Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, + "Invalid startIndex: %s", startIndex); + Preconditions.checkArgument(startIndex + length <= valueCount, + "Invalid length: %s", length); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); super.splitAndTransfer(startIndex, length); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java index 6405e256b22..d5e4a00d599 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -18,16 +18,17 @@ package org.apache.arrow.vector; import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; import org.junit.Before; import org.junit.Test; +import org.junit.jupiter.api.Assertions; public class TestSplitAndTransfer { private BufferAllocator allocator; @@ -41,6 +42,17 @@ public void init() { public void terminate() throws Exception { allocator.close(); } + + private void populateVarcharVector(final VarCharVector vector, int valueCount, String[] compareArray) { + for (int i = 0; i < valueCount; i += 3) { + final String s = String.format("%010d", i); + vector.set(i, s.getBytes()); + if (compareArray != null) { + compareArray[i] = s; + } + } + vector.setValueCount(valueCount); + } @Test /* VarCharVector */ public void test() throws Exception { @@ -50,12 +62,7 @@ public void test() throws Exception { final int valueCount = 500; final String[] compareArray = new String[valueCount]; - for (int i = 0; i < valueCount; i += 3) { - final String s = String.format("%010d", i); - varCharVector.set(i, s.getBytes()); - compareArray[i] = s; - } - varCharVector.setValueCount(valueCount); + populateVarcharVector(varCharVector, valueCount, compareArray); final TransferPair tp = varCharVector.getTransferPair(allocator); final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); @@ -89,11 +96,7 @@ public void testMemoryConstrainedTransfer() { final int valueCount = 1000; - for (int i = 0; i < valueCount; i += 3) { - final String s = String.format("%010d", i); - varCharVector.set(i, s.getBytes()); - } - varCharVector.setValueCount(valueCount); + populateVarcharVector(varCharVector, valueCount, null); final TransferPair tp = varCharVector.getTransferPair(allocator); final VarCharVector newVarCharVector = 
(VarCharVector) tp.getTo(); @@ -107,4 +110,143 @@ public void testMemoryConstrainedTransfer() { } } } + + @Test + public void testTransfer() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + final String[] compareArray = new String[valueCount]; + populateVarcharVector(varCharVector, valueCount, compareArray); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + tp.transfer(); + + assertEquals(0, varCharVector.valueCount); + assertEquals(valueCount, newVarCharVector.valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + final byte[] expectedValue = compareArray[i].getBytes(); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + + newVarCharVector.clear(); + } + } + + @Test + public void testCopyValueSafe() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + varCharVector.allocateNew(10000, 1000); + + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + // new vector memory is not pre-allocated, we expect copyValueSafe work fine. + for (int i = 0; i < valueCount; i++) { + tp.copyValueSafe(i, i); + } + newVarCharVector.setValueCount(valueCount); + + for (int i = 0; i < valueCount; i++) { + final boolean expectedSet = (i % 3) == 0; + if (expectedSet) { + assertFalse(varCharVector.isNull(i)); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(varCharVector.get(i), newVarCharVector.get(i)); + } else { + assertTrue(newVarCharVector.isNull(i)); + } + } + } + } + + @Test + public void testSplitAndTransferNon() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, 0); + assertEquals(0, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testSplitAndTransferAll() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.getTransferPair(allocator); + VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); + + tp.splitAndTransfer(0, valueCount); + assertEquals(valueCount, newVarCharVector.getValueCount()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidStartIndex() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = 
Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(valueCount, 10)); + + assertEquals("Invalid startIndex: 500", e.getMessage()); + + newVarCharVector.clear(); + } + } + + @Test + public void testInvalidLength() { + try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator); + final VarCharVector newVarCharVector = new VarCharVector("newvector", allocator)) { + + varCharVector.allocateNew(10000, 1000); + final int valueCount = 500; + populateVarcharVector(varCharVector, valueCount, null); + + final TransferPair tp = varCharVector.makeTransferPair(newVarCharVector); + + IllegalArgumentException e = Assertions.assertThrows( + IllegalArgumentException.class, + () -> tp.splitAndTransfer(0, valueCount * 2)); + + assertEquals("Invalid length: 1000", e.getMessage()); + + newVarCharVector.clear(); + } + } } From 4c7bfc7e1ca9efef00a0cb0d5539b13fd67acf7b Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Sun, 2 Feb 2020 20:28:40 -0800 Subject: [PATCH 06/26] ARROW-7301: [Java] Sql type DATE should correspond to DateDayVector According to SQL convertion, sql type DATE should correspond to a format of YYYY-MM-DD, without the components for hour/minute/second/millis Therefore, JDBC type DATE should correspond to DateDayVector, with a type width of 4, instead of 8. Closes #5944 from liyafan82/fly_1203_date and squashes the following commits: a6de37756 Remove division in time conversion be731925d Resolve comments eea8b7911 Sql type DATE should correspond to DateDayVector Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../arrow/adapter/jdbc/JdbcToArrowUtils.java | 8 ++-- .../adapter/jdbc/consumer/DateConsumer.java | 46 ++++++++++++++----- .../adapter/jdbc/JdbcToArrowTestHelper.java | 12 ++--- .../jdbc/h2/JdbcToArrowDataTypesTest.java | 6 +-- .../adapter/jdbc/h2/JdbcToArrowNullTest.java | 10 ++-- .../adapter/jdbc/h2/JdbcToArrowTest.java | 6 +-- .../jdbc/h2/JdbcToArrowTimeZoneTest.java | 6 +-- .../h2/JdbcToArrowVectorIteratorTest.java | 14 +++--- .../resources/h2/test1_all_datatypes_h2.yml | 2 +- ...t1_all_datatypes_selected_null_rows_h2.yml | 2 +- .../src/test/resources/h2/test1_date_h2.yml | 20 ++++---- .../test/resources/h2/test1_est_date_h2.yml | 20 ++++---- .../test/resources/h2/test1_gmt_date_h2.yml | 20 ++++---- .../test/resources/h2/test1_pst_date_h2.yml | 20 ++++---- .../apache/arrow/vector/DateDayVector.java | 1 + 15 files changed, 110 insertions(+), 83 deletions(-) diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java index 21278e4cabd..f7bf3248eb4 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java @@ -59,7 +59,7 @@ import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.Float4Vector; @@ -210,7 +210,7 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig *
  BINARY --> ArrowType.Binary
 *   VARBINARY --> ArrowType.Binary
 *   LONGVARBINARY --> ArrowType.Binary
- *   DATE --> ArrowType.Date(DateUnit.MILLISECOND)
+ *   DATE --> ArrowType.Date(DateUnit.DAY)
 *   TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)
 *   TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, calendar timezone)
 *   CLOB --> ArrowType.Utf8
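The mapping change above switches DATE from millisecond to day granularity. As a hedged illustration (the DateDayExample class name is hypothetical and not part of this patch), the sketch below shows the conversion the new day-based mapping implies, mirroring the TimeUnit.MILLISECONDS.toDays call used by the consumer later in this patch: a 4-byte day count replaces the 8-byte millisecond value.

import java.util.concurrent.TimeUnit;

// Illustrative only: converts a JDBC date's epoch milliseconds into the whole-day
// count that a day-granularity date vector would store.
public class DateDayExample {
  public static void main(String[] args) {
    java.sql.Date date = java.sql.Date.valueOf("2020-02-02");
    long millis = date.getTime();                            // epoch milliseconds
    int days = (int) TimeUnit.MILLISECONDS.toDays(millis);   // whole days since epoch
    System.out.println(millis + " ms -> " + days + " days");
  }
}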
  • @@ -265,7 +265,7 @@ public static ArrowType getArrowTypeForJdbcField(JdbcFieldInfo fieldInfo, Calend case Types.CLOB: return new ArrowType.Utf8(); case Types.DATE: - return new ArrowType.Date(DateUnit.MILLISECOND); + return new ArrowType.Date(DateUnit.DAY); case Types.TIME: return new ArrowType.Time(TimeUnit.MILLISECOND, 32); case Types.TIMESTAMP: @@ -402,7 +402,7 @@ static JdbcConsumer getConsumer(ResultSet resultSet, int columnIndex, int jdbcCo case Types.LONGNVARCHAR: return VarCharConsumer.createConsumer((VarCharVector) vector, columnIndex, nullable); case Types.DATE: - return DateConsumer.createConsumer((DateMilliVector) vector, columnIndex, nullable, calendar); + return DateConsumer.createConsumer((DateDayVector) vector, columnIndex, nullable, calendar); case Types.TIME: return TimeConsumer.createConsumer((TimeMilliVector) vector, columnIndex, nullable, calendar); case Types.TIMESTAMP: diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java index 2fbffa08cf7..bb9ab1cf0b9 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java @@ -17,11 +17,15 @@ package org.apache.arrow.adapter.jdbc.consumer; +import java.sql.Date; import java.sql.ResultSet; import java.sql.SQLException; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Calendar; -import java.util.Date; +import java.util.concurrent.TimeUnit; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; /** @@ -30,11 +34,23 @@ */ public class DateConsumer { + public static final int MAX_DAY; + + static { + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); + try { + java.util.Date date = dateFormat.parse("9999-12-31"); + MAX_DAY = (int) TimeUnit.MILLISECONDS.toDays(date.getTime()); + } catch (ParseException e) { + throw new IllegalArgumentException("Failed to parse max day", e); + } + } + /** * Creates a consumer for {@link DateMilliVector}. */ - public static JdbcConsumer createConsumer( - DateMilliVector vector, int index, boolean nullable, Calendar calendar) { + public static JdbcConsumer createConsumer( + DateDayVector vector, int index, boolean nullable, Calendar calendar) { if (nullable) { return new NullableDateConsumer(vector, index, calendar); } else { @@ -45,21 +61,21 @@ public static JdbcConsumer createConsumer( /** * Nullable consumer for date. */ - static class NullableDateConsumer extends BaseConsumer { + static class NullableDateConsumer extends BaseConsumer { protected final Calendar calendar; /** * Instantiate a DateConsumer. */ - public NullableDateConsumer(DateMilliVector vector, int index) { + public NullableDateConsumer(DateDayVector vector, int index) { this(vector, index, /* calendar */null); } /** * Instantiate a DateConsumer. */ - public NullableDateConsumer(DateMilliVector vector, int index, Calendar calendar) { + public NullableDateConsumer(DateDayVector vector, int index, Calendar calendar) { super(vector, index); this.calendar = calendar; } @@ -69,7 +85,11 @@ public void consume(ResultSet resultSet) throws SQLException { Date date = calendar == null ? 
resultSet.getDate(columnIndexInResultSet) : resultSet.getDate(columnIndexInResultSet, calendar); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex, date.getTime()); + int day = (int) TimeUnit.MILLISECONDS.toDays(date.getTime()); + if (day < 0 || day > MAX_DAY) { + throw new IllegalArgumentException("Day overflow: " + day); + } + vector.setSafe(currentIndex, day); } currentIndex++; } @@ -78,21 +98,21 @@ public void consume(ResultSet resultSet) throws SQLException { /** * Non-nullable consumer for date. */ - static class NonNullableDateConsumer extends BaseConsumer { + static class NonNullableDateConsumer extends BaseConsumer { protected final Calendar calendar; /** * Instantiate a DateConsumer. */ - public NonNullableDateConsumer(DateMilliVector vector, int index) { + public NonNullableDateConsumer(DateDayVector vector, int index) { this(vector, index, /* calendar */null); } /** * Instantiate a DateConsumer. */ - public NonNullableDateConsumer(DateMilliVector vector, int index, Calendar calendar) { + public NonNullableDateConsumer(DateDayVector vector, int index, Calendar calendar) { super(vector, index); this.calendar = calendar; } @@ -101,7 +121,11 @@ public NonNullableDateConsumer(DateMilliVector vector, int index, Calendar calen public void consume(ResultSet resultSet) throws SQLException { Date date = calendar == null ? resultSet.getDate(columnIndexInResultSet) : resultSet.getDate(columnIndexInResultSet, calendar); - vector.setSafe(currentIndex, date.getTime()); + int day = (int) TimeUnit.MILLISECONDS.toDays(date.getTime()); + if (day < 0 || day > MAX_DAY) { + throw new IllegalArgumentException("Day overflow: " + day); + } + vector.setSafe(currentIndex, day); currentIndex++; } } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java index 3e21144c0ca..c194dfbc355 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java @@ -32,7 +32,7 @@ import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -173,14 +173,14 @@ public static void assertTimeVectorValues(TimeMilliVector timeMilliVector, int r } } - public static void assertDateVectorValues(DateMilliVector dateMilliVector, int rowCount, Long[] values) { - assertEquals(rowCount, dateMilliVector.getValueCount()); + public static void assertDateVectorValues(DateDayVector dateDayVector, int rowCount, Integer[] values) { + assertEquals(rowCount, dateDayVector.getValueCount()); - for (int j = 0; j < dateMilliVector.getValueCount(); j++) { + for (int j = 0; j < dateDayVector.getValueCount(); j++) { if (values[j] == null) { - assertTrue(dateMilliVector.isNull(j)); + assertTrue(dateDayVector.isNull(j)); } else { - assertEquals(values[j].longValue(), dateMilliVector.get(j)); + assertEquals(values[j].longValue(), dateDayVector.get(j)); } } } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java index 
650c766e02a..cafb7a050d7 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java @@ -50,7 +50,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -202,8 +202,8 @@ public void testDataSets(VectorSchemaRoot root) { table.getCharValues()); break; case DATE: - assertDateVectorValues((DateMilliVector) root.getVector(table.getVector()), table.getValues().length, - table.getLongValues()); + assertDateVectorValues((DateDayVector) root.getVector(table.getVector()), table.getValues().length, + table.getIntValues()); break; case TIME: assertTimeVectorValues((TimeMilliVector) root.getVector(table.getVector()), table.getValues().length, diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java index 199b821ba19..57fcf566d7d 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java @@ -58,7 +58,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -206,8 +206,8 @@ private void testAllVectorValues(VectorSchemaRoot root) { assertBooleanVectorValues((BitVector) root.getVector(BOOL), table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateVectorValues((DateMilliVector) root.getVector(DATE), table.getRowCount(), - getLongValues(table.getValues(), DATE)); + assertDateVectorValues((DateDayVector) root.getVector(DATE), table.getRowCount(), + getIntValues(table.getValues(), DATE)); assertTimeVectorValues((TimeMilliVector) root.getVector(TIME), table.getRowCount(), getLongValues(table.getValues(), TIME)); @@ -242,7 +242,7 @@ public void sqlToArrowTestNullValues(String[] vectors, VectorSchemaRoot root, in assertNullValues((Float8Vector) root.getVector(vectors[6]), rowCount); assertNullValues((Float4Vector) root.getVector(vectors[7]), rowCount); assertNullValues((TimeMilliVector) root.getVector(vectors[8]), rowCount); - assertNullValues((DateMilliVector) root.getVector(vectors[9]), rowCount); + assertNullValues((DateDayVector) root.getVector(vectors[9]), rowCount); assertNullValues((TimeStampVector) root.getVector(vectors[10]), rowCount); assertNullValues((VarBinaryVector) root.getVector(vectors[11]), rowCount); assertNullValues((VarCharVector) root.getVector(vectors[12]), rowCount); @@ -265,7 +265,7 @@ public void sqlToArrowTestSelectedNullColumnsValues(String[] vectors, VectorSche assertNullValues((Float8Vector) root.getVector(vectors[2]), rowCount); assertNullValues((Float4Vector) root.getVector(vectors[3]), rowCount); assertNullValues((TimeMilliVector) root.getVector(vectors[4]), rowCount); - assertNullValues((DateMilliVector) 
root.getVector(vectors[5]), rowCount); + assertNullValues((DateDayVector) root.getVector(vectors[5]), rowCount); assertNullValues((TimeStampVector) root.getVector(vectors[6]), rowCount); assertNullValues((VarBinaryVector) root.getVector(vectors[7]), rowCount); assertNullValues((VarCharVector) root.getVector(vectors[8]), rowCount); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java index dfcdb90be7c..b7d517f26e1 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java @@ -79,7 +79,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -202,8 +202,8 @@ public void testDataSets(VectorSchemaRoot root) { assertBooleanVectorValues((BitVector) root.getVector(BOOL), table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateVectorValues((DateMilliVector) root.getVector(DATE), table.getRowCount(), - getLongValues(table.getValues(), DATE)); + assertDateVectorValues((DateDayVector) root.getVector(DATE), table.getRowCount(), + getIntValues(table.getValues(), DATE)); assertTimeVectorValues((TimeMilliVector) root.getVector(TIME), table.getRowCount(), getLongValues(table.getValues(), TIME)); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java index 002a1239101..ef2b406d120 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java @@ -37,7 +37,7 @@ import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; import org.apache.arrow.adapter.jdbc.Table; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.TimeMilliVector; import org.apache.arrow.vector.TimeStampVector; import org.apache.arrow.vector.VectorSchemaRoot; @@ -145,8 +145,8 @@ public void testDataSets(VectorSchemaRoot root) { case EST_DATE: case GMT_DATE: case PST_DATE: - assertDateVectorValues((DateMilliVector) root.getVector(table.getVector()), table.getValues().length, - table.getLongValues()); + assertDateVectorValues((DateDayVector) root.getVector(table.getVector()), table.getValues().length, + table.getIntValues()); break; case EST_TIME: case GMT_TIME: diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java index d041f082b0b..fa75c1cdbaf 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java @@ -38,7 +38,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import 
org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -51,6 +51,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @@ -66,6 +67,7 @@ public JdbcToArrowVectorIteratorTest(Table table) { super(table); } + @Test @Override public void testJdbcToArrowValues() throws SQLException, IOException { @@ -91,7 +93,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep List vectorsForChar = new ArrayList<>(); List vectorsForBit = new ArrayList<>(); List vectorsForBool = new ArrayList<>(); - List dateMilliVectors = new ArrayList<>(); + List dateDayVectors = new ArrayList<>(); List timeMilliVectors = new ArrayList<>(); List timeStampVectors = new ArrayList<>(); List decimalVectors = new ArrayList<>(); @@ -115,7 +117,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep vectorsForChar.add((VarCharVector) root.getVector(CHAR)); vectorsForBit.add((BitVector) root.getVector(BIT)); vectorsForBool.add((BitVector) root.getVector(BOOL)); - dateMilliVectors.add((DateMilliVector) root.getVector(DATE)); + dateDayVectors.add((DateDayVector) root.getVector(DATE)); timeMilliVectors.add((TimeMilliVector) root.getVector(TIME)); timeStampVectors.add((TimeStampVector) root.getVector(TIMESTAMP)); decimalVectors.add((DecimalVector) root.getVector(DECIMAL)); @@ -134,7 +136,7 @@ private void validate(ArrowVectorIterator iterator) throws SQLException, IOExcep assertVarCharVectorValues(vectorsForChar, table.getRowCount(), getCharArray(table.getValues(), CHAR)); assertBitVectorValues(vectorsForBit, table.getRowCount(), getIntValues(table.getValues(), BIT)); assertBooleanVectorValues(vectorsForBool, table.getRowCount(), getBooleanValues(table.getValues(), BOOL)); - assertDateMilliVectorValues(dateMilliVectors, table.getRowCount(), getLongValues(table.getValues(), DATE)); + assertDateDayVectorValues(dateDayVectors, table.getRowCount(), getLongValues(table.getValues(), DATE)); assertTimeMilliVectorValues(timeMilliVectors, table.getRowCount(), getLongValues(table.getValues(), TIME)); assertTimeStampVectorValues(timeStampVectors, table.getRowCount(), getLongValues(table.getValues(), TIMESTAMP)); assertDecimalVectorValues(decimalVectors, table.getRowCount(), getDecimalValues(table.getValues(), DECIMAL)); @@ -205,12 +207,12 @@ private void assertTimeMilliVectorValues(List vectors, int rowC } } - private void assertDateMilliVectorValues(List vectors, int rowCount, Long[] values) { + private void assertDateDayVectorValues(List vectors, int rowCount, Long[] values) { int valueCount = vectors.stream().mapToInt(ValueVector::getValueCount).sum(); assertEquals(rowCount, valueCount); int index = 0; - for (DateMilliVector vector : vectors) { + for (DateDayVector vector : vectors) { for (int i = 0; i < vector.getValueCount(); i++) { assertEquals(values[index++].longValue(), vector.get(i)); } diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml index fc9784df7ff..9baae643aef 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml +++ 
b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml @@ -85,7 +85,7 @@ values: - 'DECIMAL_FIELD6=17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23,17345667789.23' - 'DOUBLE_FIELD7=56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345,56478356785.345' - 'TIME_FIELD9=45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000,45935000' - - 'DATE_FIELD10=1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000,1518393600000' + - 'DATE_FIELD10=17574,17574,17574,17574,17574,17574,17574,17574,17574,17574' - 'TIMESTAMP_FIELD11=1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000,1518439535000' - 'CHAR_FIELD16=some char text,some char text,some char text,some char text,some char text, some char text,some char text,some char text,some char text,some char text' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml index bb9bfb81548..4be8ab86ecc 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml @@ -71,7 +71,7 @@ values: - 'DECIMAL_FIELD6=null,17345667789.23,null,17345667789.23,null' - 'DOUBLE_FIELD7=null,56478356785.345,null,56478356785.345,null' - 'TIME_FIELD9=null,45935000,null,45935000,null' - - 'DATE_FIELD10=null,1518393600000,null,1518393600000,null' + - 'DATE_FIELD10=null,17574,null,17574,null' - 'TIMESTAMP_FIELD11=null,1518439535000,null,1518439535000,null' - 'CHAR_FIELD16=null,some char text,null,some char text,null' - 'VARCHAR_FIELD13=null,some text that needs to be converted to varchar,null, diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml index 45aa56c7417..da33d9d47cd 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_date_h2.yml @@ -34,13 +34,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml index 290c32ebafa..1868db3adeb 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_est_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' - - '1518411600000' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' diff --git 
a/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml index 03929c936e4..65824861a8a 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_gmt_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' - - '1518393600000' \ No newline at end of file + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' \ No newline at end of file diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml index 81a668f37a4..798cfc7d67d 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_pst_date_h2.yml @@ -36,13 +36,13 @@ query: 'select date_field10 from table1;' drop: 'DROP table table1;' values: - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' - - '1518422400000' \ No newline at end of file + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' + - '17574' \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java index 7f244bfa658..9448a1a1453 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java @@ -37,6 +37,7 @@ * maintained to track which elements in the vector are null. */ public final class DateDayVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 4; private final FieldReader reader; From ab3d86e70adb4ac95b5fa7e3f6e46048dd7648f4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 3 Feb 2020 11:56:47 +0100 Subject: [PATCH 07/26] ARROW-7736: [Release] Retry binary download on transient error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes binary verification more robust. 
Closes #6335 from kou/release-verification-binaries-retry and squashes the following commits: 38944a9c7 Retry binary download on transient error Authored-by: Sutou Kouhei Signed-off-by: Krisztián Szűcs --- dev/release/download_rc_binaries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 dev/release/download_rc_binaries.py diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py old mode 100644 new mode 100755 index b2dfc867bc0..da66e483d37 --- a/dev/release/download_rc_binaries.py +++ b/dev/release/download_rc_binaries.py @@ -112,7 +112,7 @@ def _download_file(self, dest, info): bintray_abspath = os.path.join(BINTRAY_DL_ROOT, self.repo, relpath) cmd = [ - 'curl', '--fail', '--location', + 'curl', '--fail', '--location', '--retry', '5', '--output', dest_path, bintray_abspath ] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, From 19b0d4b1164a7f73cf46b298426f1d242f29dd62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 3 Feb 2020 08:35:27 -0500 Subject: [PATCH 08/26] ARROW-7466: [CI][Java] Fix gandiva-jar-osx nightly build failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #6331 from kszucs/ARROW-7466 and squashes the following commits: d50b8c8975 don't install already installed python3 d956297c74 re-enable steps 421142dba8 start all arguments at the same column... 2131836b6f travis multi line string b50865e70c use travis_build_dir 6f1beb6851 debug paths 7b054ecabf queue path 6929f3d2c7 fix crossbow path 2a2d7c378b deploy using crossbow Authored-by: Krisztián Szűcs Signed-off-by: François Saint-Jacques --- dev/tasks/gandiva-jars/travis.osx.yml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dev/tasks/gandiva-jars/travis.osx.yml b/dev/tasks/gandiva-jars/travis.osx.yml index 5626b1b795d..8a43e3a2429 100644 --- a/dev/tasks/gandiva-jars/travis.osx.yml +++ b/dev/tasks/gandiva-jars/travis.osx.yml @@ -33,10 +33,10 @@ env: - ARROW_TRAVIS_GANDIVA=1 before_script: + - pwd - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - if [ $CROSSBOW_USE_COMMIT_ID = true ]; then git -C arrow checkout {{ arrow.head }}; else git -C arrow checkout FETCH_HEAD; fi - - export TRAVIS_BUILD_DIR=$TRAVIS_BUILD_DIR/arrow - brew update - brew install bison flex llvm@7 @@ -52,15 +52,17 @@ script: - mkdir -p dist - dev/tasks/gandiva-jars/build-cpp-osx.sh || travis_terminate 1 - dev/tasks/gandiva-jars/build-java.sh || travis_terminate 1 - -deploy: - provider: releases - api_key: $CROSSBOW_GITHUB_TOKEN - file_glob: true - file: dist/*.jar - skip_cleanup: true - on: - tags: true + # deploy using crossbow + - brew install libgit2 + - pip3 install click github3.py jinja2 jira pygit2 ruamel.yaml setuptools_scm toolz + - > + python3 dev/tasks/crossbow.py + --queue-path $TRAVIS_BUILD_DIR + --queue-remote {{ queue.remote_url }} + upload-artifacts + --sha {{ task.branch }} + --tag {{ task.tag }} + --pattern "dist/*.jar" notifications: email: From d091894a6d5c24543e57722e59224a76ca04e7b0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 3 Feb 2020 08:18:12 -0700 Subject: [PATCH 09/26] ARROW-7684: [Rust] Example Flight client and server for DataFusion This PR adds DataFusion examples for a Flight client and server where the client can send a SQL query to the server and then receive the results. I have manually tested with a Java client as well to confirm that it works. 
Closes #6308 from andygrove/datafusion-flight-example and squashes the following commits: 788feef8c code cleanup 9c473389a Complete flight client's record batch reader 1337b9890 parse recordbatch 459bef386 client parses schema from ipc batches 31c894bbc update release test script efe05aef5 update release test script 5ecea83b1 formatting 8b419dab3 update release test script 03d2c8437 client streams results 0a39a513d client can stream batches e72c60545 add starting point for flight-client example ab28da82e get schema from query plan instead of from first batch 0901a3f88 Merge branch 'datafusion-flight-example' of https://github.com/andygrove/arrow into datafusion-flight-example ad2e3b066 send schema before batches 996f2a4a2 Use PARQUET_TEST_DATA env var 260f9cac1 fix license violation 516b66dc8 add helpers to convert record batch to flight data proto message 6beb4eafb WIP example Flight server for DataFusion Lead-authored-by: Andy Grove Co-authored-by: Neville Dipale Signed-off-by: Andy Grove --- dev/release/00-prepare-test.rb | 8 +- rust/arrow/Cargo.toml | 4 +- rust/arrow/src/flight/mod.rs | 83 +++++++++++ rust/arrow/src/ipc/convert.rs | 6 + rust/arrow/src/ipc/reader.rs | 2 +- rust/arrow/src/ipc/writer.rs | 37 +++-- rust/arrow/src/lib.rs | 2 + rust/datafusion/Cargo.toml | 6 + rust/datafusion/examples/README.md | 28 ++++ rust/datafusion/examples/flight-client.rs | 62 ++++++++ rust/datafusion/examples/flight-server.rs | 174 ++++++++++++++++++++++ 11 files changed, 394 insertions(+), 18 deletions(-) create mode 100644 rust/arrow/src/flight/mod.rs create mode 100644 rust/datafusion/examples/README.md create mode 100644 rust/datafusion/examples/flight-client.rs create mode 100644 rust/datafusion/examples/flight-server.rs diff --git a/dev/release/00-prepare-test.rb b/dev/release/00-prepare-test.rb index e9894d0bc8a..afabe88fa7e 100644 --- a/dev/release/00-prepare-test.rb +++ b/dev/release/00-prepare-test.rb @@ -276,7 +276,9 @@ def test_version_pre_tag ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", "-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\" }", "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"] + "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"], + ["-arrow-flight = { path = \"../arrow-flight\", version = \"#{@snapshot_version}\" }", + "+arrow-flight = { path = \"../arrow-flight\", version = \"#{@release_version}\" }"] ], }, { @@ -458,7 +460,9 @@ def test_version_post_tag ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", "-parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"] + "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"], + ["-arrow-flight = { path = \"../arrow-flight\", version = \"#{@release_version}\" }", + "+arrow-flight = { path = \"../arrow-flight\", version = \"#{@next_snapshot_version}\" }"] ], }, { diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 29829610889..57fc261985c 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -50,10 +50,12 @@ packed_simd = { version = "0.3.1", optional = true } chrono = "0.4" flatbuffers = "0.6.0" hex = "0.4" +arrow-flight = { path = "../arrow-flight", optional = true } [features] simd = ["packed_simd"] -default = ["simd"] 
+flight = ["arrow-flight"] +default = ["simd", "flight"] [dev-dependencies] criterion = "0.2" diff --git a/rust/arrow/src/flight/mod.rs b/rust/arrow/src/flight/mod.rs new file mode 100644 index 00000000000..6d09ca48c2a --- /dev/null +++ b/rust/arrow/src/flight/mod.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities to assist with reading and writing Arrow data as Flight messages + +use std::convert::TryFrom; +use std::sync::Arc; + +use flight::FlightData; + +use crate::datatypes::Schema; +use crate::error::{ArrowError, Result}; +use crate::ipc::{convert, reader, writer}; +use crate::record_batch::RecordBatch; + +/// Convert a `RecordBatch` to `FlightData` by getting the header and body as bytes +impl From<&RecordBatch> for FlightData { + fn from(batch: &RecordBatch) -> Self { + let (header, body) = writer::record_batch_to_bytes(batch); + Self { + flight_descriptor: None, + app_metadata: vec![], + data_header: header, + data_body: body, + } + } +} + +/// Convert a `Schema` to `FlightData` by converting to an IPC message +impl From<&Schema> for FlightData { + fn from(schema: &Schema) -> Self { + let schema = writer::schema_to_bytes(schema); + Self { + flight_descriptor: None, + app_metadata: vec![], + data_header: schema, + data_body: vec![], + } + } +} + +/// Try convert `FlightData` into an Arrow Schema +/// +/// Returns an error if the `FlightData` header is not a valid IPC schema +impl TryFrom<&FlightData> for Schema { + type Error = ArrowError; + fn try_from(data: &FlightData) -> Result { + convert::schema_from_bytes(&data.data_header[..]).ok_or(ArrowError::ParseError( + "Unable to convert flight data to Arrow schema".to_string(), + )) + } +} + +/// Convert a FlightData message to a RecordBatch +pub fn flight_data_to_batch( + data: &FlightData, + schema: Arc, +) -> Result> { + // check that the data_header is a record batch message + let message = crate::ipc::get_root_as_message(&data.data_header[..]); + let batch_header = message + .header_as_record_batch() + .ok_or(ArrowError::ParseError( + "Unable to convert flight data header to a record batch".to_string(), + ))?; + reader::read_record_batch(&data.data_body, batch_header, schema) +} + +// TODO: add more explicit conversion that expoess flight descriptor and metadata options diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index a38975fbf1c..f53914e31d2 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -152,6 +152,12 @@ pub(crate) fn fb_to_schema(fb: ipc::Schema) -> Schema { Schema::new_with_metadata(fields, metadata) } +/// Deserialize an IPC message into a schema +pub(crate) fn schema_from_bytes(bytes: &[u8]) -> Option { + let ipc = ipc::get_root_as_message(bytes); + 
ipc.header_as_schema().map(|schema| fb_to_schema(schema)) +} + /// Get the Arrow data type from the flatbuffer Field table pub(crate) fn get_data_type(field: ipc::Field) -> DataType { match field.type_type() { diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index 99173641d32..25ba0cd3df1 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -348,7 +348,7 @@ fn create_list_array( } /// Creates a record batch from binary data using the `ipc::RecordBatch` indexes and the `Schema` -fn read_record_batch( +pub(crate) fn read_record_batch( buf: &Vec, batch: ipc::RecordBatch, schema: Arc, diff --git a/rust/arrow/src/ipc/writer.rs b/rust/arrow/src/ipc/writer.rs index 36c89c7a666..c872c8286d7 100644 --- a/rust/arrow/src/ipc/writer.rs +++ b/rust/arrow/src/ipc/writer.rs @@ -209,8 +209,7 @@ impl Drop for StreamWriter { } } -/// Convert the schema to its IPC representation, and write it to the `writer` -fn write_schema(writer: &mut BufWriter, schema: &Schema) -> Result { +pub(crate) fn schema_to_bytes(schema: &Schema) -> Vec { let mut fbb = FlatBufferBuilder::new(); let schema = { let fb = ipc::convert::schema_to_fb_offset(&mut fbb, schema); @@ -227,9 +226,13 @@ fn write_schema(writer: &mut BufWriter, schema: &Schema) -> Result< fbb.finish(data, None); let data = fbb.finished_data(); - let written = write_padded_data(writer, data, WriteDataType::Header); + data.to_vec() +} - written +/// Convert the schema to its IPC representation, and write it to the `writer` +fn write_schema(writer: &mut BufWriter, schema: &Schema) -> Result { + let data = schema_to_bytes(schema); + write_padded_data(writer, &data[..], WriteDataType::Header) } /// The message type being written. This determines whether to write the data length or not. 
@@ -266,13 +269,8 @@ fn write_padded_data( Ok(total_len as usize) } -/// Write a record batch to the writer, writing the message size before the message -/// if the record batch is being written to a stream -fn write_record_batch( - writer: &mut BufWriter, - batch: &RecordBatch, - is_stream: bool, -) -> Result<(usize, usize)> { +/// Write a `RecordBatch` into a tuple of bytes, one for the header (ipc::Message) and the other for the batch's data +pub(crate) fn record_batch_to_bytes(batch: &RecordBatch) -> (Vec, Vec) { let mut fbb = FlatBufferBuilder::new(); let mut nodes: Vec = vec![]; @@ -313,13 +311,24 @@ fn write_record_batch( let root = message.finish(); fbb.finish(root, None); let finished_data = fbb.finished_data(); + + (finished_data.to_vec(), arrow_data) +} + +/// Write a record batch to the writer, writing the message size before the message +/// if the record batch is being written to a stream +fn write_record_batch( + writer: &mut BufWriter, + batch: &RecordBatch, + is_stream: bool, +) -> Result<(usize, usize)> { + let (meta_data, arrow_data) = record_batch_to_bytes(batch); // write the length of data if writing to stream if is_stream { - let total_len: u32 = finished_data.len() as u32; + let total_len: u32 = meta_data.len() as u32; writer.write(&total_len.to_le_bytes()[..])?; } - let meta_written = - write_padded_data(writer, fbb.finished_data(), WriteDataType::Body)?; + let meta_written = write_padded_data(writer, &meta_data[..], WriteDataType::Body)?; let arrow_data_written = write_padded_data(writer, &arrow_data[..], WriteDataType::Body)?; Ok((meta_written, arrow_data_written)) diff --git a/rust/arrow/src/lib.rs b/rust/arrow/src/lib.rs index 899bb62f08a..4383922be09 100644 --- a/rust/arrow/src/lib.rs +++ b/rust/arrow/src/lib.rs @@ -33,6 +33,8 @@ pub mod compute; pub mod csv; pub mod datatypes; pub mod error; +#[cfg(feature = "flight")] +pub mod flight; pub mod ipc; pub mod json; pub mod memory; diff --git a/rust/datafusion/Cargo.toml b/rust/datafusion/Cargo.toml index 14fc4e54548..f489d2b0d6d 100644 --- a/rust/datafusion/Cargo.toml +++ b/rust/datafusion/Cargo.toml @@ -56,6 +56,12 @@ crossbeam = "0.7.1" [dev-dependencies] criterion = "0.2.0" tempdir = "0.3.7" +futures = "0.3" +prost = "0.6" +tokio = { version = "0.2", features = ["macros"] } +tonic = "0.1" +flatbuffers = "0.6.0" +arrow-flight = { path = "../arrow-flight", version = "0.16.0-SNAPSHOT" } [[bench]] name = "aggregate_query_sql" diff --git a/rust/datafusion/examples/README.md b/rust/datafusion/examples/README.md new file mode 100644 index 00000000000..163ef3d952b --- /dev/null +++ b/rust/datafusion/examples/README.md @@ -0,0 +1,28 @@ + + +# DataFusion Examples + +## Single Process + +The examples `csv_sql.rs` and `parquet_sql.rs` demonstrate building a query plan from a SQL statement and then executing the query plan against local CSV and Parquet files, respectively. + +## Distributed + +The `flight-client.rs` and `flight-server.rs` examples demonstrate how to run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol. \ No newline at end of file diff --git a/rust/datafusion/examples/flight-client.rs b/rust/datafusion/examples/flight-client.rs new file mode 100644 index 00000000000..97db7934620 --- /dev/null +++ b/rust/datafusion/examples/flight-client.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::convert::TryFrom; +use std::sync::Arc; + +use arrow::array::Int32Array; +use arrow::datatypes::Schema; +use arrow::flight::flight_data_to_batch; +use flight::flight_service_client::FlightServiceClient; +use flight::Ticket; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let mut client = FlightServiceClient::connect("http://localhost:50051").await?; + + let request = tonic::Request::new(Ticket { + ticket: "SELECT id FROM alltypes_plain".into(), + }); + + let mut stream = client.do_get(request).await?.into_inner(); + + // the schema should be the first message returned, else client should error + let flight_data = stream.message().await?.unwrap(); + // convert FlightData to a stream + let schema = Arc::new(Schema::try_from(&flight_data)?); + println!("Schema: {:?}", schema); + + // all the remaining stream messages should be dictionary and record batches + while let Some(flight_data) = stream.message().await? { + // the unwrap is infallible and thus safe + let record_batch = flight_data_to_batch(&flight_data, schema.clone())?.unwrap(); + + println!( + "record_batch has {} columns and {} rows", + record_batch.num_columns(), + record_batch.num_rows() + ); + let column = record_batch.column(0); + let column = column + .as_any() + .downcast_ref::() + .expect("Unable to get column"); + println!("Column 1: {:?}", column); + } + + Ok(()) +} diff --git a/rust/datafusion/examples/flight-server.rs b/rust/datafusion/examples/flight-server.rs new file mode 100644 index 00000000000..e6cc22ac503 --- /dev/null +++ b/rust/datafusion/examples/flight-server.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::pin::Pin; + +use futures::Stream; +use tonic::transport::Server; +use tonic::{Request, Response, Status, Streaming}; + +use datafusion::execution::context::ExecutionContext; + +use flight::{ + flight_service_server::FlightService, flight_service_server::FlightServiceServer, + Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, +}; + +#[derive(Clone)] +pub struct FlightServiceImpl {} + +#[tonic::async_trait] +impl FlightService for FlightServiceImpl { + type HandshakeStream = Pin< + Box> + Send + Sync + 'static>, + >; + type ListFlightsStream = + Pin> + Send + Sync + 'static>>; + type DoGetStream = + Pin> + Send + Sync + 'static>>; + type DoPutStream = + Pin> + Send + Sync + 'static>>; + type DoActionStream = Pin< + Box> + Send + Sync + 'static>, + >; + type ListActionsStream = + Pin> + Send + Sync + 'static>>; + + async fn do_get( + &self, + request: Request, + ) -> Result, Status> { + let ticket = request.into_inner(); + match String::from_utf8(ticket.ticket.to_vec()) { + Ok(sql) => { + println!("do_get: {}", sql); + + // create local execution context + let mut ctx = ExecutionContext::new(); + + let testdata = std::env::var("PARQUET_TEST_DATA") + .expect("PARQUET_TEST_DATA not defined"); + + // register parquet file with the execution context + ctx.register_parquet( + "alltypes_plain", + &format!("{}/alltypes_plain.parquet", testdata), + ) + .map_err(|e| to_tonic_err(&e))?; + + // create the query plan + let plan = ctx + .create_logical_plan(&sql) + .and_then(|plan| ctx.optimize(&plan)) + .and_then(|plan| ctx.create_physical_plan(&plan, 1024 * 1024)) + .map_err(|e| to_tonic_err(&e))?; + + // execute the query + let results = ctx.collect(plan.as_ref()).map_err(|e| to_tonic_err(&e))?; + if results.is_empty() { + return Err(Status::internal("There were no results from ticket")); + } + + // add an initial FlightData message that sends schema + let schema = plan.schema(); + let mut flights: Vec> = + vec![Ok(FlightData::from(schema.as_ref()))]; + + let mut batches: Vec> = results + .iter() + .map(|batch| Ok(FlightData::from(batch))) + .collect(); + + // append batch vector to schema vector, so that the first message sent is the schema + flights.append(&mut batches); + + let output = futures::stream::iter(flights); + + Ok(Response::new(Box::pin(output) as Self::DoGetStream)) + } + Err(e) => Err(Status::invalid_argument(format!("Invalid ticket: {:?}", e))), + } + } + + async fn handshake( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn list_flights( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn get_flight_info( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn get_schema( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn do_put( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn do_action( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + + async fn list_actions( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not yet implemented")) + } +} + +fn to_tonic_err(e: &datafusion::error::ExecutionError) -> Status { + 
Status::internal(format!("{:?}", e)) +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let addr = "0.0.0.0:50051".parse()?; + let service = FlightServiceImpl {}; + + let svc = FlightServiceServer::new(service); + + println!("Listening on {:?}", addr); + + Server::builder().add_service(svc).serve(addr).await?; + + Ok(()) +} From 68c2f3c3edc04f4478853b9045962ff138e7ad24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 3 Feb 2020 17:13:40 +0100 Subject: [PATCH 10/26] ARROW-7729: [Python][CI] Pin pandas version to 0.25 in the dask integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additionally test agains dask's latest release not just the master revision. Closes #6326 from kszucs/dask-pandas-pin and squashes the following commits: b5cb40eec pin pandas depending on dask's version 083221ce9 pin pandas in the dask integration test Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- ci/docker/conda-python-dask.dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/docker/conda-python-dask.dockerfile b/ci/docker/conda-python-dask.dockerfile index e567fecfd13..cf19c05237f 100644 --- a/ci/docker/conda-python-dask.dockerfile +++ b/ci/docker/conda-python-dask.dockerfile @@ -23,3 +23,6 @@ FROM ${repo}:${arch}-conda-python-${python} ARG dask=latest COPY ci/scripts/install_dask.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_dask.sh ${dask} + +# The Spark tests currently break with pandas >= 1.0 +RUN if [ ${dask} == "latest" ]; then conda install pandas=0.25.3; fi \ No newline at end of file From cb81f7d84c26cd75f4568ae1b29744cb4218b578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 3 Feb 2020 17:30:19 +0100 Subject: [PATCH 11/26] ARROW-7735: [Release][Python] Use pip to install dependencies for wheel verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wheel verification script fails for python 3.5. At the same time the wheel properly works for python 3.5 docker images without conda environment. Conda forge doesn't maintain packages for python 3.5 anymore and something must have mixed with the numpy versions. This change fixed the wheel verification locally for me. 
Closes #6339 from kszucs/wheel-verification and squashes the following commits: 3e9694958 remove pytest verbose flags 026e5fb34 use pip to install dependencies for wheel verification Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- dev/release/verify-release-candidate.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 000af83a172..9d68b7ce18b 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -609,8 +609,10 @@ test_linux_wheels() { pip install python-rc/${VERSION}-rc${RC_NUMBER}/pyarrow-${VERSION}-cp${py_arch//[mu.]/}-cp${py_arch//./}-manylinux${ml_spec}_x86_64.whl check_python_imports py_arch - # execute the python unit tests - conda install -y --file ${ARROW_DIR}/ci/conda_env_python.yml pandas + # install test requirements + pip install -r ${ARROW_DIR}/python/requirements-test.txt + + # execute the tests pytest --pyargs pyarrow done @@ -640,8 +642,10 @@ test_macos_wheels() { pip install python-rc/${VERSION}-rc${RC_NUMBER}/pyarrow-${VERSION}-cp${py_arch//[m.]/}-cp${py_arch//./}-${macos_suffix}.whl check_python_imports py_arch - # execute the python unit tests - conda install -y --file ${ARROW_DIR}/ci/conda_env_python.yml pandas + # install test requirements + pip install -r ${ARROW_DIR}/python/requirements-test.txt + + # execute the tests pytest --pyargs pyarrow conda deactivate From 4b549977fa0ddc9bf0a6031486b7b1987e66baec Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 4 Feb 2020 06:44:19 +0900 Subject: [PATCH 12/26] ARROW-7726: [CI] [C++] Use boost binaries on Windows GHA build The binaries are installed using Chocolatey, which takes a bit of time (it's a 2+GB install...), but less so than recompiling Boost from scratch during the CMake build. [skip appveyor] Closes #6325 from pitrou/ARROW-7726-download-boost-gha and squashes the following commits: e877622b9 Revert "Try a more flexible way of finding Boost" eb5db8fc1 Try a more flexible way of finding Boost d57064960 ARROW-7726: Use boost binaries on Windows GHA build Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index abc3b8f4d91..cf746c327db 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -175,10 +175,15 @@ jobs: ARROW_BUILD_STATIC: OFF ARROW_BOOST_USE_SHARED: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF - BOOST_SOURCE: BUNDLED + BOOST_ROOT: C:\local\boost_1_67_0 + BOOST_LIBRARYDIR: C:\local\boost_1_67_0\lib64-msvc-14.1\ steps: - name: Disable Crash Dialogs run: reg add "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" /v DontShowUI /t REG_DWORD /d 1 /f + - name: Installed Packages + run: choco list -l + - name: Install Boost + run: choco install -y boost-msvc-14.1 - name: Checkout Arrow uses: actions/checkout@v1 with: From bc261d1e43aea04f5277ff0c3252a21582e7ab61 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Mon, 3 Feb 2020 21:08:10 -0800 Subject: [PATCH 13/26] ARROW-7073: [Java] Support concating vectors values in batch We need a way to copy vector values in batch. Currently, we have copyFrom and copyFromSafe APIs. However, they are not enough, as copying values individually is not performant. 
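As a rough usage sketch only (not part of this patch): assuming the VectorBatchAppender utility introduced below, and with hypothetical vector names and values, appending one IntVector to another in batch might look like this:

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.util.VectorBatchAppender;

    public class BatchAppendExample {
      public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator();
             IntVector target = new IntVector("target", allocator);
             IntVector delta = new IntVector("delta", allocator)) {
          // populate the target vector with [0, 1, 2]
          target.allocateNew(3);
          for (int i = 0; i < 3; i++) {
            target.setSafe(i, i);
          }
          target.setValueCount(3);

          // populate the delta vector with [10, 20]
          delta.allocateNew(2);
          delta.setSafe(0, 10);
          delta.setSafe(1, 20);
          delta.setValueCount(2);

          // append the delta vector's values to the target in a single batch call
          VectorBatchAppender.batchAppend(target, delta);

          // target should now hold [0, 1, 2, 10, 20] with a value count of 5
          System.out.println("value count after append: " + target.getValueCount());
        }
      }
    }

Because the appender is implemented as a VectorVisitor, the same batchAppend call covers fixed-width, variable-width, list, fixed-size list, struct and union vectors, provided the target and delta vectors have matching types.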
Closes #5916 from liyafan82/fly_1125_veccat and squashes the following commits: 94b407c85 Support dense union vector ee49dc678 Add tests with null values ad33e234c Rewrite tests with vector populator for result verification c89211abc Rewrite tests with vector populator and provide static utility 7c13ede14 Support concating vectors values in batch Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../main/codegen/templates/UnionVector.java | 12 +- .../arrow/vector/util/VectorAppender.java | 326 +++++++++++++++++ .../vector/util/VectorBatchAppender.java | 39 ++ .../testing/ValueVectorDataPopulator.java | 73 +++- .../arrow/vector/util/TestVectorAppender.java | 346 ++++++++++++++++++ .../vector/util/TestVectorBatchAppender.java | 72 ++++ 6 files changed, 863 insertions(+), 5 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index d760fe53fc6..8bf5d8a9b20 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -553,9 +553,13 @@ public Iterator iterator() { return vectors.iterator(); } - public ValueVector getVector(int index) { - int type = typeBuffer.getByte(index * TYPE_WIDTH); - switch (MinorType.values()[type]) { + public ValueVector getVector(int index) { + int type = typeBuffer.getByte(index * TYPE_WIDTH); + return getVectorByType(type); + } + + public ValueVector getVectorByType(int typeId) { + switch (MinorType.values()[typeId]) { case NULL: return null; <#list vv.types as type> @@ -574,7 +578,7 @@ public ValueVector getVector(int index) { case LIST: return getList(); default: - throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]); + throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[typeId]); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java new file mode 100644 index 00000000000..8dac7676858 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import java.util.HashSet; + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; + +import io.netty.util.internal.PlatformDependent; + +/** + * Utility to append two vectors together. + */ +class VectorAppender implements VectorVisitor { + + /** + * The targetVector to be appended. + */ + private final ValueVector targetVector; + + private final TypeEqualsVisitor typeVisitor; + + /** + * Constructs a new targetVector appender, with the given targetVector. + * @param targetVector the targetVector to be appended. + */ + VectorAppender(ValueVector targetVector) { + this.targetVector = targetVector; + typeVisitor = new TypeEqualsVisitor(targetVector, false, true); + } + + @Override + public ValueVector visit(BaseFixedWidthVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + deltaVector.getTypeWidth() * targetVector.getValueCount(), + deltaVector.getTypeWidth() * deltaVector.getValueCount()); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetDataSize = targetVector.getOffsetBuffer().getInt( + targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int deltaDataSize = deltaVector.getOffsetBuffer().getInt( + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int newValueCapacity = targetDataSize + deltaDataSize; + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + while (targetVector.getDataBuffer().capacity() < newValueCapacity) { + ((BaseVariableWidthVector) targetVector).reallocDataBuffer(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + 
deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append data buffer + PlatformDependent.copyMemory(deltaVector.getDataBuffer().memoryAddress(), + targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); + + // copy offset buffer + PlatformDependent.copyMemory( + deltaVector.getOffsetBuffer().memoryAddress() + BaseVariableWidthVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + BaseVariableWidthVector.OFFSET_WIDTH, + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = targetVector.getOffsetBuffer().getInt((targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt( + (targetVector.getValueCount() + 1 + i) * + BaseVariableWidthVector.OFFSET_WIDTH, oldOffset + targetDataSize); + } + ((BaseVariableWidthVector) targetVector).setLastSet(newValueCount - 1); + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(ListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The targetVector to append must have the same type as the targetVector being appended"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetVector.getOffsetBuffer().getInt( + targetVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + int deltaListSize = deltaVector.getOffsetBuffer().getInt( + deltaVector.getValueCount() * BaseVariableWidthVector.OFFSET_WIDTH); + + ListVector targetListVector = (ListVector) targetVector; + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append offset buffer + PlatformDependent.copyMemory(deltaVector.getOffsetBuffer().memoryAddress() + ListVector.OFFSET_WIDTH, + targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * + ListVector.OFFSET_WIDTH, + deltaVector.getValueCount() * ListVector.OFFSET_WIDTH); + + // increase each offset from the second buffer + for (int i = 0; i < deltaVector.getValueCount(); i++) { + int oldOffset = + targetVector.getOffsetBuffer().getInt((targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH); + targetVector.getOffsetBuffer().setInt((targetVector.getValueCount() + 1 + i) * ListVector.OFFSET_WIDTH, + oldOffset + targetListSize); + } + targetListVector.setLastSet(newValueCount - 1); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(FixedSizeListVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the 
targetVector being appended"); + + FixedSizeListVector targetListVector = (FixedSizeListVector) targetVector; + + Preconditions.checkArgument(targetListVector.getListSize() == deltaVector.getListSize(), + "FixedSizeListVector must have the same list size to append"); + + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + int targetListSize = targetListVector.getValueCount() * targetListVector.getListSize(); + int deltaListSize = deltaVector.getValueCount() * deltaVector.getListSize(); + + // make sure the underlying vector has value count set + targetListVector.getDataVector().setValueCount(targetListSize); + deltaVector.getDataVector().setValueCount(deltaListSize); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append underlying vectors + VectorAppender innerAppender = new VectorAppender(targetListVector.getDataVector()); + deltaVector.getDataVector().accept(innerAppender, null); + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(NonNullableStructVector deltaVector, Void value) { + Preconditions.checkArgument(typeVisitor.equals(deltaVector), + "The vector to append must have the same type as the targetVector being appended"); + + NonNullableStructVector targetStructVector = (NonNullableStructVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), targetVector.getValueCount(), + deltaVector.getValidityBuffer(), deltaVector.getValueCount(), targetVector.getValidityBuffer()); + + // append child vectors + for (int i = 0; i < targetStructVector.getChildrenFromFields().size(); i++) { + ValueVector targetChild = targetStructVector.getVectorById(i); + ValueVector deltaChild = deltaVector.getVectorById(i); + + targetChild.setValueCount(targetStructVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + deltaChild.accept(innerAppender, null); + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(UnionVector deltaVector, Void value) { + // we only make sure that both vectors are union vectors. 
+ Preconditions.checkArgument(targetVector.getMinorType() == deltaVector.getMinorType(), + "The vector to append must have the same type as the targetVector being appended"); + + UnionVector targetUnionVector = (UnionVector) targetVector; + int newValueCount = targetVector.getValueCount() + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetUnionVector.getValueCapacity() < newValueCount) { + targetUnionVector.reAlloc(); + } + + // append type buffers + PlatformDependent.copyMemory(deltaVector.getValidityBufferAddress(), + targetUnionVector.getValidityBufferAddress() + targetVector.getValueCount(), + deltaVector.getValueCount()); + + // build the hash set for all types + HashSet targetTypes = new HashSet<>(); + for (int i = 0; i < targetUnionVector.getValueCount(); i++) { + targetTypes.add((int) targetUnionVector.getValidityBuffer().getByte(i)); + } + HashSet deltaTypes = new HashSet<>(); + for (int i = 0; i < deltaVector.getValueCount(); i++) { + deltaTypes.add((int) deltaVector.getValidityBuffer().getByte(i)); + } + + // append child vectors + for (int i = 0; i < Byte.MAX_VALUE; i++) { + if (targetTypes.contains(i) || deltaTypes.contains(i)) { + ValueVector targetChild = targetUnionVector.getVectorByType(i); + if (!targetTypes.contains(i)) { + // if the vector type does not exist in the target, it must be newly created + // and we must make sure it has enough capacity. + while (targetChild.getValueCapacity() < newValueCount) { + targetChild.reAlloc(); + } + } + + if (deltaTypes.contains(i)) { + // append child vectors + ValueVector deltaChild = deltaVector.getVectorByType(i); + + targetChild.setValueCount(targetUnionVector.getValueCount()); + deltaChild.setValueCount(deltaVector.getValueCount()); + + VectorAppender innerAppender = new VectorAppender(targetChild); + deltaChild.accept(innerAppender, null); + } + targetChild.setValueCount(newValueCount); + } + } + + targetVector.setValueCount(newValueCount); + return targetVector; + } + + @Override + public ValueVector visit(DenseUnionVector left, Void value) { + throw new UnsupportedOperationException(); + } + + @Override + public ValueVector visit(NullVector deltaVector, Void value) { + Preconditions.checkArgument(targetVector.getField().getType().equals(deltaVector.getField().getType()), + "The targetVector to append must have the same type as the targetVector being appended"); + return targetVector; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java new file mode 100644 index 00000000000..570783d1070 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorBatchAppender.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import org.apache.arrow.vector.ValueVector; + +/** + * Utility to add vector values in batch. + */ +public class VectorBatchAppender { + + /** + * Add value vectors in batch. + * @param targetVector the target vector. + * @param vectorsToAppend the vectors to append. + * @param the vector type. + */ + public static void batchAppend(V targetVector, V... vectorsToAppend) { + VectorAppender appender = new VectorAppender(targetVector); + for (V delta : vectorsToAppend) { + delta.accept(appender, null); + } + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index ab3f81a6446..b50a3441322 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -17,10 +17,14 @@ package org.apache.arrow.vector.testing; +import static org.junit.Assert.assertEquals; + import java.nio.charset.StandardCharsets; +import java.util.List; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.DateDayVector; import org.apache.arrow.vector.DateMilliVector; import org.apache.arrow.vector.DecimalVector; @@ -51,7 +55,12 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.holders.IntervalDayHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; /** * Utility for populating {@link org.apache.arrow.vector.ValueVector}. @@ -533,10 +542,72 @@ public static void setVector(VarCharVector vector, String... values) { vector.allocateNewSafe(); for (int i = 0; i < length; i++) { if (values[i] != null) { - vector.set(i, values[i].getBytes(StandardCharsets.UTF_8)); + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); } } vector.setValueCount(length); } + /** + * Populate values for {@link ListVector}. + */ + public static void setVector(ListVector vector, List... values) { + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + vector.getOffsetBuffer().setInt(0, curPos); + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.set(curPos, value); + curPos += 1; + } + } + vector.getOffsetBuffer().setInt((i + 1) * BaseRepeatedValueVector.OFFSET_WIDTH, curPos); + } + dataVector.setValueCount(curPos); + vector.setLastSet(values.length - 1); + vector.setValueCount(values.length); + } + + /** + * Populate values for {@link FixedSizeListVector}. + */ + public static void setVector(FixedSizeListVector vector, List... 
values) { + for (int i = 0; i < values.length; i++) { + if (values[i] != null) { + assertEquals(vector.getListSize(), values[i].size()); + } + } + + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.set(curPos, value); + curPos += 1; + } + } + } + dataVector.setValueCount(curPos); + vector.setValueCount(values.length); + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java new file mode 100644 index 00000000000..83fc24fe90a --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.compare.TypeEqualsVisitor; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorAppender}. 
+ */ +public class TestVectorAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testAppendFixedWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (IntVector target = new IntVector("", allocator); + IntVector delta = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta.allocateNew(length2); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, null, 8, 9); + ValueVectorDataPopulator.setVector(delta, null, 11, 12, 13, 14); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + + try (IntVector expected = new IntVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, 0, 1, 2, 3, 4, 5, 6, null, 8, 9, null, 11, 12, 13, 14); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendVariableWidthVector() { + final int length1 = 10; + final int length2 = 5; + try (VarCharVector target = new VarCharVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(5, length1); + delta.allocateNew(5, length2); + + ValueVectorDataPopulator.setVector(target, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + ValueVectorDataPopulator.setVector(delta, "a10", "a11", "a12", "a13", null); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (VarCharVector expected = new VarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector(expected, + "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9", "a10", "a11", "a12", "a13", null); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendListVector() { + final int length1 = 5; + final int length2 = 2; + try (ListVector target = ListVector.empty("target", allocator); + ListVector delta = ListVector.empty("delta", allocator)) { + + target.allocateNew(); + ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1), + Arrays.asList(2, 3), + null, + Arrays.asList(6, 7), + Arrays.asList(8, 9)); + assertEquals(length1, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(length2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(7, target.getValueCount()); + + List expected = Arrays.asList(0, 1); + assertEquals(expected, target.getObject(0)); + + expected = Arrays.asList(2, 3); + assertEquals(expected, target.getObject(1)); + + assertTrue(target.isNull(2)); + + expected = Arrays.asList(6, 7); + assertEquals(expected, target.getObject(3)); + + expected = Arrays.asList(8, 9); + assertEquals(expected, target.getObject(4)); + + expected = Arrays.asList(10, 11, 12, 13, 14); + assertEquals(expected, target.getObject(5)); + + expected = Arrays.asList(15, 16, 17, 18, 19); + assertEquals(expected, target.getObject(6)); + } + } + + @Test + public void testAppendFixedSizeListVector() { + try (FixedSizeListVector target = FixedSizeListVector.empty("target", 5, allocator); + FixedSizeListVector delta = FixedSizeListVector.empty("delta", 5, allocator)) { + + target.allocateNew(); + 
ValueVectorDataPopulator.setVector(target, + Arrays.asList(0, 1, 2, 3, 4), + null); + assertEquals(2, target.getValueCount()); + + delta.allocateNew(); + ValueVectorDataPopulator.setVector(delta, + Arrays.asList(10, 11, 12, 13, 14), + Arrays.asList(15, 16, 17, 18, 19)); + assertEquals(2, delta.getValueCount()); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(4, target.getValueCount()); + + assertEquals(Arrays.asList(0, 1, 2, 3, 4), target.getObject(0)); + assertTrue(target.isNull(1)); + assertEquals(Arrays.asList(10, 11, 12, 13, 14), target.getObject(2)); + assertEquals(Arrays.asList(15, 16, 17, 18, 19), target.getObject(3)); + } + } + + @Test + public void testAppendStructVector() { + final int length1 = 10; + final int length2 = 5; + try (final StructVector target = StructVector.empty("target", allocator); + final StructVector delta = StructVector.empty("delta", allocator)) { + + IntVector targetChild1 = target.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector targetChild2 = target.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + targetChild1.allocateNew(); + targetChild2.allocateNew(); + ValueVectorDataPopulator.setVector(targetChild1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(targetChild2, "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9"); + target.setValueCount(length1); + + IntVector deltaChild1 = delta.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class); + VarCharVector deltaChild2 = delta.addOrGet("f1", FieldType.nullable(new ArrowType.Utf8()), VarCharVector.class); + deltaChild1.allocateNew(); + deltaChild2.allocateNew(); + ValueVectorDataPopulator.setVector(deltaChild1, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(deltaChild2, "a10", "a11", "a12", "a13", "a14"); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 + length2, target.getValueCount()); + IntVector child1 = (IntVector) target.getVectorById(0); + VarCharVector child2 = (VarCharVector) target.getVectorById(1); + + try (IntVector expected1 = new IntVector("expected1", allocator); + VarCharVector expected2 = new VarCharVector("expected2", allocator)) { + expected1.allocateNew(); + expected2.allocateNew(); + + ValueVectorDataPopulator.setVector(expected1, 0, 1, 2, 3, 4, null, 6, 7, 8, 9, 10, 11, 12, null, 14); + ValueVectorDataPopulator.setVector(expected2, + "a0", "a1", "a2", "a3", "a4", "a5", "a6", null, "a8", "a9", "a10", "a11", "a12", "a13", "a14"); + + assertVectorsEqual(expected1, target.getChild("f0")); + assertVectorsEqual(expected2, target.getChild("f1")); + } + } + } + + @Test + public void testAppendUnionVector() { + final int length1 = 10; + final int length2 = 5; + + try (final UnionVector target = UnionVector.empty("target", allocator); + final UnionVector delta = UnionVector.empty("delta", allocator)) { + + // alternating ints and big ints + target.setType(0, Types.MinorType.INT); + target.setType(1, Types.MinorType.BIGINT); + target.setType(2, Types.MinorType.INT); + target.setType(3, Types.MinorType.BIGINT); + target.setType(4, Types.MinorType.INT); + target.setType(5, Types.MinorType.BIGINT); + target.setType(6, Types.MinorType.INT); + target.setType(7, Types.MinorType.BIGINT); + target.setType(8, Types.MinorType.INT); + target.setType(9, Types.MinorType.BIGINT); + target.setType(10, 
Types.MinorType.INT); + target.setType(11, Types.MinorType.BIGINT); + target.setType(12, Types.MinorType.INT); + target.setType(13, Types.MinorType.BIGINT); + target.setType(14, Types.MinorType.INT); + target.setType(15, Types.MinorType.BIGINT); + target.setType(16, Types.MinorType.INT); + target.setType(17, Types.MinorType.BIGINT); + target.setType(18, Types.MinorType.INT); + target.setType(19, Types.MinorType.BIGINT); + + IntVector targetIntVec = target.getIntVector(); + targetIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetIntVec, + 0, null, 1, null, 2, null, 3, null, 4, null, 5, null, 6, null, 7, null, 8, null, 9, null); + assertEquals(length1 * 2, targetIntVec.getValueCount()); + + BigIntVector targetBigIntVec = target.getBigIntVector(); + targetBigIntVec.allocateNew(); + ValueVectorDataPopulator.setVector( + targetBigIntVec, + null, 0L, null, 1L, null, 2L, null, 3L, null, 4L, null, 5L, null, 6L, null, 7L, null, 8L, null, 9L); + assertEquals(length1 * 2, targetBigIntVec.getValueCount()); + + target.setValueCount(length1 * 2); + + // populate the delta vector + delta.setType(0, Types.MinorType.FLOAT4); + delta.setType(1, Types.MinorType.FLOAT4); + delta.setType(2, Types.MinorType.FLOAT4); + delta.setType(3, Types.MinorType.FLOAT4); + delta.setType(4, Types.MinorType.FLOAT4); + + Float4Vector deltaFloatVector = delta.getFloat4Vector(); + deltaFloatVector.allocateNew(); + ValueVectorDataPopulator.setVector(deltaFloatVector, 10f, 11f, 12f, 13f, 14f); + assertEquals(length2, deltaFloatVector.getValueCount()); + delta.setValueCount(length2); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(length1 * 2 + length2, target.getValueCount()); + + for (int i = 0; i < length1; i++) { + Object intObj = target.getObject(i * 2); + assertTrue(intObj instanceof Integer); + assertEquals(i, ((Integer) intObj).intValue()); + + Object longObj = target.getObject(i * 2 + 1); + assertTrue(longObj instanceof Long); + assertEquals(i, ((Long) longObj).longValue()); + } + + for (int i = 0; i < length2; i++) { + Object floatObj = target.getObject(length1 * 2 + i); + assertTrue(floatObj instanceof Float); + assertEquals(i + length1, ((Float) floatObj).intValue()); + } + } + } + + @Test + public void testAppendVectorNegative() { + final int vectorLength = 10; + try (IntVector target = new IntVector("", allocator); + VarCharVector delta = new VarCharVector("", allocator)) { + + target.allocateNew(vectorLength); + delta.allocateNew(vectorLength); + + VectorAppender appender = new VectorAppender(target); + + assertThrows(IllegalArgumentException.class, + () -> delta.accept(appender, null)); + } + } + + public static void assertVectorsEqual(ValueVector vector1, ValueVector vector2) { + assertEquals(vector1.getValueCount(), vector2.getValueCount()); + + TypeEqualsVisitor typeEqualsVisitor = new TypeEqualsVisitor(vector1, false, false); + RangeEqualsVisitor equalsVisitor = + new RangeEqualsVisitor(vector1, vector2, (v1, v2) -> typeEqualsVisitor.equals(vector2)); + assertTrue(equalsVisitor.rangeEquals(new Range(0, 0, vector1.getValueCount()))); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java new file mode 100644 index 00000000000..799c25c0ad7 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorBatchAppender.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static junit.framework.TestCase.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link VectorBatchAppender}. + */ +public class TestVectorBatchAppender { + + private BufferAllocator allocator; + + @Before + public void prepare() { + allocator = new RootAllocator(1024 * 1024); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testBatchAppendIntVector() { + final int length1 = 10; + final int length2 = 5; + final int length3 = 7; + try (IntVector target = new IntVector("", allocator); + IntVector delta1 = new IntVector("", allocator); + IntVector delta2 = new IntVector("", allocator)) { + + target.allocateNew(length1); + delta1.allocateNew(length2); + delta2.allocateNew(length3); + + ValueVectorDataPopulator.setVector(target, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + ValueVectorDataPopulator.setVector(delta1, 10, 11, 12, 13, 14); + ValueVectorDataPopulator.setVector(delta2, 15, 16, 17, 18, 19, 20, 21); + + VectorBatchAppender.batchAppend(target, delta1, delta2); + + assertEquals(length1 + length2 + length3, target.getValueCount()); + for (int i = 0; i < target.getValueCount(); i++) { + assertEquals(i, target.get(i)); + } + } + } +} From a6054174f0fe8d8058e9b0c04355bda32eb7a80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 4 Feb 2020 12:50:50 +0100 Subject: [PATCH 14/26] ARROW-7750: [Release] Make the source release verification script restartable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Executing the verification script can take quite some time, so creating a new environment in case if anything fails is time consuming. Let the script reuse the same build directory for source release verification. Need to export `TMPDIR` environment variable. @kou shall we use an argument instead? 
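Concretely, pointing TMPDIR at a persistent scratch directory before invoking the script makes later runs reuse that directory (the miniconda installation, the cloned arrow-testing and parquet-testing repositories, and the existing cpp/build tree) instead of starting over in a fresh mktemp directory.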
Closes #6344 from kszucs/restartable-verification and squashes the following commits: 6d4723da3 Support for restarting the release verification script Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- dev/release/verify-release-candidate.sh | 35 ++++++++++++++++++------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 9d68b7ce18b..24c60ea632b 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -28,6 +28,9 @@ # # If using a non-system Boost, set BOOST_ROOT and add Boost libraries to # LD_LIBRARY_PATH. +# +# To reuse build artifacts between runs set TMPDIR environment variable to +# a directory where the temporary files should be placed to. case $# in 3) ARTIFACT="$1" @@ -205,7 +208,12 @@ setup_tempdir() { fi } trap cleanup EXIT - TMPDIR=$(mktemp -d -t "$1.XXXXX") + + if [ -z "${TMPDIR}" ]; then + TMPDIR=$(mktemp -d -t "$1.XXXXX") + else + mkdir -p "${TMPDIR}" + fi } @@ -219,9 +227,12 @@ setup_miniconda() { MINICONDA=$PWD/test-miniconda - wget -O miniconda.sh $MINICONDA_URL - bash miniconda.sh -b -p $MINICONDA - rm -f miniconda.sh + if [ ! -d "${MINICONDA}" ]; then + # Setup miniconda only if the directory doesn't exist yet + wget -O miniconda.sh $MINICONDA_URL + bash miniconda.sh -b -p $MINICONDA + rm -f miniconda.sh + fi . $MINICONDA/etc/profile.d/conda.sh } @@ -249,7 +260,7 @@ test_and_install_cpp() { cython conda activate arrow-test - mkdir cpp/build + mkdir -p cpp/build pushd cpp/build ARROW_CMAKE_OPTIONS=" @@ -523,10 +534,14 @@ test_source_distribution() { NPROC=$(nproc) fi - git clone https://github.com/apache/arrow-testing.git + # Clone testing repositories if not cloned already + if [ ! -d "arrow-testing" ]; then + git clone https://github.com/apache/arrow-testing.git + fi + if [ ! -d "parquet-testing" ]; then + git clone https://github.com/apache/parquet-testing.git + fi export ARROW_TEST_DATA=$PWD/arrow-testing/data - - git clone https://github.com/apache/parquet-testing.git export PARQUET_TEST_DATA=$PWD/parquet-testing/data if [ ${TEST_JAVA} -gt 0 ]; then @@ -612,7 +627,7 @@ test_linux_wheels() { # install test requirements pip install -r ${ARROW_DIR}/python/requirements-test.txt - # execute the tests + # execute the python unit tests pytest --pyargs pyarrow done @@ -645,7 +660,7 @@ test_macos_wheels() { # install test requirements pip install -r ${ARROW_DIR}/python/requirements-test.txt - # execute the tests + # execute the python unit tests pytest --pyargs pyarrow conda deactivate From 0326ea34b63ae399582a99d60f0d23cc03aaa628 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Feb 2020 13:42:03 +0100 Subject: [PATCH 15/26] ARROW-7751: [Release] macOS wheel verification also needs arrow-testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This addition follows the pattern of `test_source_distribution` (as it is in #6344). There are also two error message patches to make them consistent with everywhere else that references this env var (though FWIW `testing/data` is still not correct in the verification script, it's `arrow-testing/data` 🤷‍♂). 
Closes #6345 from nealrichardson/flight-testing-data and squashes the following commits: a29e88a7e factor out testing repository cloning df9ef255d Move addition and fix lint e165d54f1 Make sure macOS wheel verification has test data Lead-authored-by: Neal Richardson Co-authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- cpp/src/arrow/flight/test_util.cc | 3 ++- dev/release/verify-release-candidate.sh | 24 +++++++++++++++--------- python/pyarrow/tests/test_flight.py | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/flight/test_util.cc b/cpp/src/arrow/flight/test_util.cc index 5ad7a899c66..5d83353d281 100644 --- a/cpp/src/arrow/flight/test_util.cc +++ b/cpp/src/arrow/flight/test_util.cc @@ -454,7 +454,8 @@ Status TestClientBasicAuthHandler::GetToken(std::string* token) { Status GetTestResourceRoot(std::string* out) { const char* c_root = std::getenv("ARROW_TEST_DATA"); if (!c_root) { - return Status::IOError("Test resources not found, set ARROW_TEST_DATA"); + return Status::IOError( + "Test resources not found, set ARROW_TEST_DATA to /testing/data"); } *out = std::string(c_root); return Status::OK(); diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 24c60ea632b..bae762d3274 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -522,6 +522,18 @@ test_integration() { $INTEGRATION_TEST_ARGS } +clone_testing_repositories() { + # Clone testing repositories if not cloned already + if [ ! -d "arrow-testing" ]; then + git clone https://github.com/apache/arrow-testing.git + fi + if [ ! -d "parquet-testing" ]; then + git clone https://github.com/apache/parquet-testing.git + fi + export ARROW_TEST_DATA=$PWD/arrow-testing/data + export PARQUET_TEST_DATA=$PWD/parquet-testing/data +} + test_source_distribution() { export ARROW_HOME=$TMPDIR/install export PARQUET_HOME=$TMPDIR/install @@ -534,15 +546,7 @@ test_source_distribution() { NPROC=$(nproc) fi - # Clone testing repositories if not cloned already - if [ ! -d "arrow-testing" ]; then - git clone https://github.com/apache/arrow-testing.git - fi - if [ ! -d "parquet-testing" ]; then - git clone https://github.com/apache/parquet-testing.git - fi - export ARROW_TEST_DATA=$PWD/arrow-testing/data - export PARQUET_TEST_DATA=$PWD/parquet-testing/data + clone_testing_repositories if [ ${TEST_JAVA} -gt 0 ]; then test_package_java @@ -668,6 +672,8 @@ test_macos_wheels() { } test_wheels() { + clone_testing_repositories + local download_dir=binaries mkdir -p ${download_dir} diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index ff31b64105c..fe0470f7e5d 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -60,7 +60,7 @@ def resource_root(): """Get the path to the test resources directory.""" if not os.environ.get("ARROW_TEST_DATA"): raise RuntimeError("Test resources not found; set " - "ARROW_TEST_DATA to /testing") + "ARROW_TEST_DATA to /testing/data") return pathlib.Path(os.environ["ARROW_TEST_DATA"]) / "flight" From d6b6f878d951833bbe594131de9c093f37fbfdf1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 4 Feb 2020 15:41:19 +0100 Subject: [PATCH 16/26] ARROW-7691: [C++] Check non-scalar Flatbuffers fields are not null We're discussing whether to make those fields required in the schema definitions (which would make validation automatic by the flatbuffers generated verifier), but in the meantime we can check those fields manually. 
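To make the failure mode concrete: non-scalar Flatbuffers fields (strings, vectors, sub-tables) are optional in the wire format, so a malformed or fuzzer-generated IPC message can simply omit them, and the generated accessor then returns a null pointer. A minimal sketch of the manual guard, using a hypothetical ReadFieldName helper rather than the actual code in this patch:

    // Hypothetical helper; flatbuf::Field and Status::IOError are the real
    // Arrow types, the function itself is only illustrative.
    Status ReadFieldName(const flatbuf::Field* field, std::string* out) {
      auto name = field->name();   // may be null for malformed metadata
      if (name == nullptr) {
        return Status::IOError("Unexpected null Field.name in flatbuffer-encoded metadata");
      }
      *out = name->str();          // safe to dereference after the check
      return Status::OK();
    }

The CHECK_FLATBUFFERS_NOT_NULL macro added by this patch centralizes exactly this pattern.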
This should fix a bunch of issues detected by OSS-Fuzz. Closes #6293 from pitrou/ARROW-7691-check-fb-fields-not-null and squashes the following commits: 02478a620 Use a function rather than a macro e6d3d8852 ARROW-7691: Check non-scalar Flatbuffers fields are not null Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/ipc/metadata_internal.cc | 77 ++++++-------------------- cpp/src/arrow/ipc/metadata_internal.h | 25 +++++++-- cpp/src/arrow/ipc/reader.cc | 21 +++---- 3 files changed, 46 insertions(+), 77 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 05f508df31c..94bb6cbd839 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -286,11 +286,7 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::Timestamp: { auto ts_type = static_cast(type_data); TimeUnit::type unit = FromFlatbufferUnit(ts_type->unit()); - if (ts_type->timezone() != 0 && ts_type->timezone()->Length() > 0) { - *out = timestamp(unit, ts_type->timezone()->str()); - } else { - *out = timestamp(unit); - } + *out = timestamp(unit, StringFromFlatbuffers(ts_type->timezone())); return Status::OK(); } case flatbuf::Type::Duration: { @@ -369,10 +365,7 @@ static Status TypeFromFlatbuffer(const flatbuf::Field* field, const KeyValueMetadata* field_metadata, std::shared_ptr* out) { auto type_data = field->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(type_data, "Field.type"); RETURN_NOT_OK(ConcreteTypeFromFlatbuffer(field->type_type(), type_data, children, out)); // Look for extension metadata in custom_metadata field @@ -474,14 +467,8 @@ Status KeyValueMetadataFromFlatbuffer(const KVVector* fb_metadata, metadata->reserve(fb_metadata->size()); for (const auto& pair : *fb_metadata) { - if (pair->key() == nullptr) { - return Status::IOError( - "Key-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } - if (pair->value() == nullptr) { - return Status::IOError( - "Value-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key"); + CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value"); metadata->Append(pair->key()->str(), pair->value()->str()); } @@ -776,9 +763,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona // Reconstruct the data type auto children = field->children(); - if (children == nullptr) { - return Status::IOError("Children-pointer of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(children, "Field.children"); std::vector> child_fields(children->size()); for (int i = 0; i < static_cast(children->size()); ++i) { RETURN_NOT_OK( @@ -786,6 +771,8 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona } RETURN_NOT_OK(TypeFromFlatbuffer(field, child_fields, metadata.get(), &type)); + auto field_name = StringFromFlatbuffers(field->name()); + const flatbuf::DictionaryEncoding* encoding = field->dictionary(); if (encoding != nullptr) { @@ -794,22 +781,14 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona // dictionary_memo std::shared_ptr index_type; auto int_data = encoding->indexType(); - if (int_data == nullptr) { - return Status::IOError( - "indexType-pointer in custom metadata of flatbuffer-encoded 
DictionaryEncoding " - "is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(int_data, "DictionaryEncoding.indexType"); RETURN_NOT_OK(IntFromFlatbuffer(int_data, &index_type)); ARROW_ASSIGN_OR_RAISE(type, DictionaryType::Make(index_type, type, encoding->isOrdered())); - *out = ::arrow::field(field->name()->str(), type, field->nullable(), metadata); + *out = ::arrow::field(field_name, type, field->nullable(), metadata); RETURN_NOT_OK(dictionary_memo->AddField(encoding->id(), *out)); } else { - auto name = field->name(); - if (name == nullptr) { - return Status::IOError("Name-pointer of flatbuffer-encoded Field is null."); - } - *out = ::arrow::field(name->str(), type, field->nullable(), metadata); + *out = ::arrow::field(field_name, type, field->nullable(), metadata); } return Status::OK(); } @@ -1183,17 +1162,15 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, std::shared_ptr* out) { auto schema = static_cast(opaque_schema); - if (schema->fields() == nullptr) { - return Status::IOError("Fields-pointer of flatbuffer-encoded Schema is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(schema, "schema"); + CHECK_FLATBUFFERS_NOT_NULL(schema->fields(), "Schema.fields"); int num_fields = static_cast(schema->fields()->size()); std::vector> fields(num_fields); for (int i = 0; i < num_fields; ++i) { const flatbuf::Field* field = schema->fields()->Get(i); - if (field == nullptr) { - return Status::IOError("Field-pointer of flatbuffer-encoded Schema is null."); - } + // XXX I don't think this check is necessary (AP) + CHECK_FLATBUFFERS_NOT_NULL(field, "DictionaryEncoding.indexType"); RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i])); } @@ -1225,12 +1202,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type auto dim = tensor->shape()->Get(i); shape->push_back(dim->size()); - auto fb_name = dim->name(); - if (fb_name == 0) { - dim_names->push_back(""); - } else { - dim_names->push_back(fb_name->str()); - } + dim_names->push_back(StringFromFlatbuffers(dim->name())); } if (tensor->strides() && tensor->strides()->size() > 0) { @@ -1239,11 +1211,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type } } - auto type_data = tensor->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded Tensor is null."); - } + auto type_data = tensor->type(); // Required return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type); } @@ -1283,12 +1251,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr } if (dim_names) { - auto fb_name = dim->name(); - if (fb_name == 0) { - dim_names->push_back(""); - } else { - dim_names->push_back(fb_name->str()); - } + dim_names->push_back(StringFromFlatbuffers(dim->name())); } } } @@ -1324,11 +1287,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr } } - auto type_data = sparse_tensor->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded SparseTensor is null."); - } + auto type_data = sparse_tensor->type(); // Required if (type) { return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {}, type); } else { diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index b2ac81f86e7..f5ffee3fa75 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -31,20 
+31,16 @@ #include "arrow/buffer.h" #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" -#include "arrow/memory_pool.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" #include "generated/Message_generated.h" #include "generated/Schema_generated.h" namespace arrow { -class DataType; -class Schema; -class Tensor; -class SparseTensor; - namespace flatbuf = org::apache::arrow::flatbuf; namespace io { @@ -92,6 +88,23 @@ struct FileBlock { int64_t body_length; }; +// Low-level utilities to help with reading Flatbuffers data. + +#define CHECK_FLATBUFFERS_NOT_NULL(fb_value, name) \ + if ((fb_value) == NULLPTR) { \ + return Status::IOError("Unexpected null field ", name, \ + " in flatbuffer-encoded metadata"); \ + } + +template +inline uint32_t FlatBuffersVectorSize(const flatbuffers::Vector* vec) { + return (vec == NULLPTR) ? 0 : vec->size(); +} + +inline std::string StringFromFlatbuffers(const flatbuffers::String* s) { + return (s == NULLPTR) ? "" : s->str(); +} + // Read interface classes. We do not fully deserialize the flatbuffers so that // individual fields metadata can be retrieved from very large schema without // diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8712d46d1c9..f99037a0d34 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -102,10 +102,7 @@ class IpcComponentSource { Status GetBuffer(int buffer_index, std::shared_ptr* out) { auto buffers = metadata_->buffers(); - if (buffers == nullptr) { - return Status::IOError( - "Buffers-pointer of flatbuffer-encoded RecordBatch is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (buffer_index >= static_cast(buffers->size())) { return Status::IOError("buffer_index out of range."); } @@ -127,9 +124,7 @@ class IpcComponentSource { Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); - if (nodes == nullptr) { - return Status::IOError("Nodes-pointer of flatbuffer-encoded Table is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); // pop off a field if (field_index >= static_cast(nodes->size())) { return Status::Invalid("Ran out of field metadata, likely malformed"); @@ -441,6 +436,7 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, // The dictionary is embedded in a record batch with a single column std::shared_ptr batch; auto batch_meta = dictionary_batch->data(); + CHECK_FLATBUFFERS_NOT_NULL(batch_meta, "DictionaryBatch.data"); RETURN_NOT_OK(ReadRecordBatch(batch_meta, ::arrow::schema({value_field}), dictionary_memo, options, file, &batch)); if (batch->num_columns() != 1) { @@ -475,9 +471,6 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl { } CHECK_MESSAGE_TYPE(Message::SCHEMA, message->type()); CHECK_HAS_NO_BODY(*message); - if (message->header() == nullptr) { - return Status::IOError("Header-pointer of flatbuffer-encoded Message is null."); - } return internal::GetSchema(message->header(), &dictionary_memo_, &schema_); } @@ -663,9 +656,13 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { return Status::OK(); } - int num_dictionaries() const { return footer_->dictionaries()->size(); } + int num_dictionaries() const { + return static_cast(internal::FlatBuffersVectorSize(footer_->dictionaries())); + } - int num_record_batches() const { return footer_->recordBatches()->size(); } + int num_record_batches() const { + return 
static_cast(internal::FlatBuffersVectorSize(footer_->recordBatches())); + } MetadataVersion version() const { return internal::GetMetadataVersion(footer_->version()); From 09059d51e42d4c930e72016a09e551e04c6f6b60 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 4 Feb 2020 12:02:29 -0600 Subject: [PATCH 17/26] ARROW-7760: [Release] Fix verify-release-candidate.sh since pip3 seems to no longer be in miniconda, install miniconda unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change was necessary for me to get the script to finish to completion. Closes #6348 from wesm/pip3-to-pip and squashes the following commits: fcf3ae643 conda environment must be activated for other steps to work if C++ is disabled. Fix selective disabling of integration test components in archery 26da75982 always set up miniconda 490ceacfe pip3 no longer in miniconda Lead-authored-by: Wes McKinney Co-authored-by: Krisztián Szűcs Signed-off-by: Wes McKinney --- dev/archery/archery/integration/runner.py | 12 +++---- dev/release/verify-release-candidate.sh | 44 +++++++++++++---------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 9fa3f6194c7..60ee8f3b856 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -227,23 +227,23 @@ def get_static_json_files(): ] -def run_all_tests(enable_cpp=True, enable_java=True, enable_js=True, - enable_go=True, run_flight=False, +def run_all_tests(with_cpp=True, with_java=True, with_js=True, + with_go=True, run_flight=False, tempdir=None, **kwargs): tempdir = tempdir or tempfile.mkdtemp() testers = [] - if enable_cpp: + if with_cpp: testers.append(CPPTester(kwargs)) - if enable_java: + if with_java: testers.append(JavaTester(kwargs)) - if enable_js: + if with_js: testers.append(JSTester(kwargs)) - if enable_go: + if with_go: testers.append(GoTester(kwargs)) static_json_files = get_static_json_files() diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index bae762d3274..88974529ecf 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -30,7 +30,8 @@ # LD_LIBRARY_PATH. # # To reuse build artifacts between runs set TMPDIR environment variable to -# a directory where the temporary files should be placed to. +# a directory where the temporary files should be placed to, note that this +# directory is not cleaned up automatically. case $# in 3) ARTIFACT="$1" @@ -126,7 +127,7 @@ test_binary() { local download_dir=binaries mkdir -p ${download_dir} - python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} + python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} verify_dir_artifact_signatures ${download_dir} } @@ -207,11 +208,13 @@ setup_tempdir() { echo "Failed to verify release candidate. See ${TMPDIR} for details." fi } - trap cleanup EXIT if [ -z "${TMPDIR}" ]; then + # clean up automatically if TMPDIR is not defined TMPDIR=$(mktemp -d -t "$1.XXXXX") + trap cleanup EXIT else + # don't clean up automatically mkdir -p "${TMPDIR}" fi } @@ -235,6 +238,15 @@ setup_miniconda() { fi . 
$MINICONDA/etc/profile.d/conda.sh + + conda create -n arrow-test -y -q -c conda-forge \ + python=3.6 \ + nomkl \ + numpy \ + pandas \ + six \ + cython + conda activate arrow-test } # Build and test Java (Requires newer Maven -- I used 3.3.9) @@ -251,15 +263,6 @@ test_package_java() { # Build and test C++ test_and_install_cpp() { - conda create -n arrow-test -y -q -c conda-forge \ - python=3.6 \ - nomkl \ - numpy \ - pandas \ - six \ - cython - conda activate arrow-test - mkdir -p cpp/build pushd cpp/build @@ -503,7 +506,7 @@ test_integration() { export ARROW_JAVA_INTEGRATION_JAR=$JAVA_DIR/tools/target/arrow-tools-$VERSION-jar-with-dependencies.jar export ARROW_CPP_EXE_PATH=$CPP_BUILD_DIR/release - pip3 install -e dev/archery + pip install -e dev/archery INTEGRATION_TEST_ARGS="" @@ -552,7 +555,6 @@ test_source_distribution() { test_package_java fi if [ ${TEST_CPP} -gt 0 ]; then - setup_miniconda test_and_install_cpp fi if [ ${TEST_CSHARP} -gt 0 ]; then @@ -686,9 +688,9 @@ test_wheels() { conda create -yq -n py3-base python=3.7 conda activate py3-base - python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ - --regex=${filter_regex} \ - --dest=${download_dir} + python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ + --regex=${filter_regex} \ + --dest=${download_dir} verify_dir_artifact_signatures ${download_dir} @@ -751,6 +753,9 @@ setup_tempdir "arrow-${VERSION}" echo "Working in sandbox ${TMPDIR}" cd ${TMPDIR} +setup_miniconda +echo "Using miniconda environment ${MINICONDA}" + if [ "${ARTIFACT}" == "source" ]; then dist_name="apache-arrow-${VERSION}" if [ ${TEST_SOURCE} -gt 0 ]; then @@ -759,6 +764,10 @@ if [ "${ARTIFACT}" == "source" ]; then tar xf ${dist_name}.tar.gz else mkdir -p ${dist_name} + if [ ! -f ${TEST_ARCHIVE} ]; then + echo "${TEST_ARCHIVE} not found, did you mean to pass TEST_SOURCE=1?" + exit 1 + fi tar xf ${TEST_ARCHIVE} -C ${dist_name} --strip-components=1 fi pushd ${dist_name} @@ -766,7 +775,6 @@ if [ "${ARTIFACT}" == "source" ]; then popd elif [ "${ARTIFACT}" == "wheels" ]; then import_gpg_keys - setup_miniconda test_wheels else import_gpg_keys From b7dbbccae93839e3d5a65286d366a56d92548061 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 4 Feb 2020 12:35:14 -0600 Subject: [PATCH 18/26] ARROW-6757: [Release] Use same CMake generator for C++ and Python when verifying RC, remove Python 3.5 from wheel verification This resolves the issues I was having as described in ARROW-6757. 
This does not fix the Python 3.8 wheel, though Closes #6350 from wesm/windows-rc-verify-fixes and squashes the following commits: a9d4c6656 Fixes for Windows release verification scripts Authored-by: Wes McKinney Signed-off-by: Wes McKinney --- dev/release/verify-release-candidate-wheels.bat | 7 ++----- dev/release/verify-release-candidate.bat | 12 +++++++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat index 0f8e047752e..19056cd77b6 100644 --- a/dev/release/verify-release-candidate-wheels.bat +++ b/dev/release/verify-release-candidate-wheels.bat @@ -25,7 +25,7 @@ @echo on set _CURRENT_DIR=%CD% -set _VERIFICATION_DIR=C:\tmp\arrow-verify-release +set _VERIFICATION_DIR=C:\tmp\arrow-verify-release-wheels if not exist "C:\tmp\" mkdir C:\tmp if exist %_VERIFICATION_DIR% rd %_VERIFICATION_DIR% /s /q @@ -33,9 +33,6 @@ if not exist %_VERIFICATION_DIR% mkdir %_VERIFICATION_DIR% cd %_VERIFICATION_DIR% -CALL :verify_wheel 3.5 %1 %2 -if errorlevel 1 GOTO error - CALL :verify_wheel 3.6 %1 %2 if errorlevel 1 GOTO error @@ -62,7 +59,7 @@ set ARROW_VERSION=%2 set RC_NUMBER=%3 set PY_VERSION_NO_PERIOD=%PY_VERSION:.=% -set CONDA_ENV_PATH=C:\tmp\arrow-verify-release\_verify-wheel-%PY_VERSION% +set CONDA_ENV_PATH=%_VERIFICATION_DIR%\_verify-wheel-%PY_VERSION% call conda create -p %CONDA_ENV_PATH% ^ --no-shortcuts -f -q -y python=%PY_VERSION% ^ || EXIT /B 1 diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 70d47347823..8596b96f870 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -71,7 +71,10 @@ pushd %ARROW_SOURCE%\cpp\build @rem This is the path for Visual Studio Community 2017 call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64 -cmake -G "Ninja" ^ +@rem NOTE(wesm): not using Ninja for now to be able to more easily control the +@rem generator used + +cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BOOST_USE_SHARED=ON ^ @@ -90,11 +93,13 @@ cmake -G "Ninja" ^ -DARROW_PARQUET=ON ^ .. || exit /B +cmake --build . --target INSTALL --config Release + @rem NOTE(wesm): Building googletest is flaky for me with ninja. Building it @rem first fixes the problem -ninja googletest_ep || exit /B -ninja install || exit /B +@rem ninja googletest_ep || exit /B +@rem ninja install || exit /B @rem Get testing datasets for Parquet unit tests git clone https://github.com/apache/parquet-testing.git %_VERIFICATION_DIR%\parquet-testing @@ -112,6 +117,7 @@ popd @rem Build and import pyarrow pushd %ARROW_SOURCE%\python +set PYARROW_CMAKE_GENERATOR=%GENERATOR% set PYARROW_WITH_FLIGHT=1 set PYARROW_WITH_PARQUET=1 python setup.py build_ext --inplace --bundle-arrow-cpp bdist_wheel || exit /B From 992d9fcff72c8ee3bd22292ce44a450acffe044f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 5 Feb 2020 06:42:37 +0900 Subject: [PATCH 19/26] ARROW-7752: [Release] Enable and test dataset in the verification script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're not testing the dataset feature in the verifications scripts yet. 
Closes #6346 from kszucs/dataset-verification and squashes the following commits: b8530ea39 Test dataset during the verification Authored-by: Krisztián Szűcs Signed-off-by: Sutou Kouhei --- dev/release/verify-release-candidate-wheels.bat | 1 + dev/release/verify-release-candidate.bat | 2 ++ dev/release/verify-release-candidate.sh | 5 ++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat index 19056cd77b6..6bf1c111218 100644 --- a/dev/release/verify-release-candidate-wheels.bat +++ b/dev/release/verify-release-candidate-wheels.bat @@ -75,6 +75,7 @@ python -c "import pyarrow" || EXIT /B 1 python -c "import pyarrow.flight" || EXIT /B 1 python -c "import pyarrow.gandiva" || EXIT /B 1 python -c "import pyarrow.parquet" || EXIT /B 1 +python -c "import pyarrow.dataset" || EXIT /B 1 call deactivate diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 8596b96f870..bcef9f51f6e 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -90,6 +90,7 @@ cmake -G "%GENERATOR%" ^ -DARROW_WITH_BROTLI=ON ^ -DARROW_FLIGHT=ON ^ -DARROW_PYTHON=ON ^ + -DARROW_DATASET=ON ^ -DARROW_PARQUET=ON ^ .. || exit /B @@ -120,6 +121,7 @@ pushd %ARROW_SOURCE%\python set PYARROW_CMAKE_GENERATOR=%GENERATOR% set PYARROW_WITH_FLIGHT=1 set PYARROW_WITH_PARQUET=1 +set PYARROW_WITH_DATASET=1 python setup.py build_ext --inplace --bundle-arrow-cpp bdist_wheel || exit /B py.test pyarrow -v -s --parquet || exit /B diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 88974529ecf..60a9d544bd9 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -276,6 +276,7 @@ ${ARROW_CMAKE_OPTIONS:-} -DARROW_PYTHON=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON +-DARROW_DATASET=ON -DPARQUET_REQUIRE_ENCRYPTION=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_ZLIB=ON @@ -365,6 +366,7 @@ test_python() { pip install -r requirements.txt -r requirements-test.txt + export PYARROW_WITH_DATASET=1 export PYARROW_WITH_GANDIVA=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 @@ -605,7 +607,8 @@ check_python_imports() { python -c "import pyarrow.fs" if [[ "$py_arch" =~ ^3 ]]; then - # Flight and Gandiva are only available for py3 + # Flight, Gandiva and Dataset are only available for py3 + python -c "import pyarrow.dataset" python -c "import pyarrow.flight" python -c "import pyarrow.gandiva" fi From 67e34c53b3be4c88348369f8109626b4a8a997aa Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 4 Feb 2020 18:57:12 -0600 Subject: [PATCH 20/26] ARROW-7766: [Python][Packaging] Windows py38 wheels are built with wrong ABI tag This is fixed in the latest release of `wheel` but we were pinning to an old version Closes #6353 from nealrichardson/fix-win-38-wheels and squashes the following commits: e3a865bc0 Remove wheel pin in requirements-wheel.txt Authored-by: Neal Richardson Signed-off-by: Wes McKinney --- python/requirements-wheel.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/requirements-wheel.txt b/python/requirements-wheel.txt index 99a2666dd42..18e79636e39 100644 --- a/python/requirements-wheel.txt +++ b/python/requirements-wheel.txt @@ -6,7 +6,6 @@ pandas setuptools_scm==3.2.0 six>=1.0.0 # TODO: TensorFlow doesn't support Python 3.8 yet. 
+# See https://github.com/tensorflow/tensorflow/issues/33374 tensorflow; python_version >= "3.0" and python_version < "3.8" -# pin wheel, because auditwheel is not compatible with wheel=0.32 -# TODO(kszucs): remove after auditwheel properly supports wheel>0.31 -wheel==0.31.1 +wheel From 184f8288d2ed18175c3a030dcc268aed0946eb8c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 Feb 2020 18:58:37 -0600 Subject: [PATCH 21/26] ARROW-7762: [Python] Do not ignore exception for invalid version in ParquetWriter Closes #6352 from jorisvandenbossche/ARROW-7762 and squashes the following commits: 935365831 ARROW-7762: Do not ignore exception for invalid version in ParquetWriter Authored-by: Joris Van den Bossche Signed-off-by: Wes McKinney --- python/pyarrow/_parquet.pyx | 5 +++-- python/pyarrow/tests/test_parquet.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 256c3cd2179..cc2b25da72b 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1304,14 +1304,15 @@ cdef class ParquetWriter: else: props.disallow_truncated_timestamps() - cdef void _set_version(self, WriterProperties.Builder* props): + cdef int _set_version(self, WriterProperties.Builder* props) except -1: if self.version is not None: if self.version == "1.0": props.version(ParquetVersion_V1) elif self.version == "2.0": props.version(ParquetVersion_V2) else: - raise ArrowException("Unsupported Parquet format version") + raise ValueError("Unsupported Parquet format version: {0}" + .format(self.version)) cdef void _set_compression_props(self, WriterProperties.Builder* props): if isinstance(self.compression, basestring): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f106da3267c..e18075e4aab 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -177,6 +177,12 @@ def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): tm.assert_frame_equal(df, df_read) +def test_parquet_invalid_version(tempdir): + table = pa.table({'a': [1, 2, 3]}) + with pytest.raises(ValueError, match="Unsupported Parquet format version"): + _write_table(table, tempdir / 'test_version.parquet', version="2.2") + + def test_set_data_page_size(): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) From 25fd97b81a65df7aca5d5f3ce482a1126bb01b83 Mon Sep 17 00:00:00 2001 From: Martin Radev Date: Tue, 4 Feb 2020 19:10:53 -0600 Subject: [PATCH 22/26] PARQUET-1716: [C++] Add BYTE_STREAM_SPLIT encoder and decoder The patch implements an encoder and decoder for Parquet's BYTE_STREAM_SPLIT encoding. The patch also adds tests for the new encoding. 
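For context, BYTE_STREAM_SPLIT performs no compression on its own; it only transposes the bytes of the values so that byte j of every value is stored contiguously in stream j, which tends to make a subsequent general-purpose compressor much more effective on floating-point columns. A minimal standalone sketch of that transposition for floats, with an illustrative function name rather than the encoder class added by this patch:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scatter byte j of each value into stream j; the encoded buffer is then
    // [byte 0 of all values][byte 1 of all values]... as in FlushValues below.
    std::vector<uint8_t> ByteStreamSplitEncode(const float* values, size_t n) {
      constexpr size_t kNumStreams = sizeof(float);  // 4 streams for float
      std::vector<uint8_t> out(n * kNumStreams);
      const uint8_t* raw = reinterpret_cast<const uint8_t*>(values);
      for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < kNumStreams; ++j) {
          out[j * n + i] = raw[i * kNumStreams + j];
        }
      }
      return out;
    }

Decoding is the inverse gather of one byte from each stream per value, as in ByteStreamSplitDecoder::Decode below.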
Closes #6005 from martinradev/byte_stream_split_submit and squashes the following commits: 5a78f8b6b ARROW-5913: Add BYTE_STREAM_SPLIT encoder and decoder Authored-by: Martin Radev Signed-off-by: Wes McKinney --- cpp/src/parquet/column_reader.cc | 6 + cpp/src/parquet/encoding.cc | 188 +++++++++++++++++++++++++++++++ cpp/src/parquet/encoding_test.cc | 187 ++++++++++++++++++++++++++++++ cpp/src/parquet/types.cc | 2 + cpp/src/parquet/types.h | 1 + 5 files changed, 384 insertions(+) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index dded399240e..3cfb8938175 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -577,6 +577,12 @@ class ColumnReaderImplBase { decoders_[static_cast(encoding)] = std::move(decoder); break; } + case Encoding::BYTE_STREAM_SPLIT: { + auto decoder = MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT, descr_); + current_decoder_ = decoder.get(); + decoders_[static_cast(encoding)] = std::move(decoder); + break; + } case Encoding::RLE_DICTIONARY: throw ParquetException("Dictionary page must be before data page."); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a61bcecc974..dc06436ff9c 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -815,6 +815,104 @@ void DictEncoderImpl::PutDictionary(const arrow::Array& values) { } } +// ---------------------------------------------------------------------- +// ByteStreamSplitEncoder implementations + +template +class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + using T = typename DType::c_type; + using TypedEncoder::Put; + + explicit ByteStreamSplitEncoder( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + int64_t EstimatedDataEncodedSize() override; + std::shared_ptr FlushValues() override; + + void Put(const T* buffer, int num_values) override; + void Put(const arrow::Array& values) override; + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override; + + protected: + arrow::TypedBufferBuilder values_; + + private: + void PutArrowArray(const arrow::Array& values); +}; + +template +ByteStreamSplitEncoder::ByteStreamSplitEncoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool) + : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool), values_{pool} {} + +template +int64_t ByteStreamSplitEncoder::EstimatedDataEncodedSize() { + return values_.length() * sizeof(T); +} + +template +std::shared_ptr ByteStreamSplitEncoder::FlushValues() { + constexpr size_t num_streams = sizeof(T); + std::shared_ptr output_buffer = + AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize()); + uint8_t* output_buffer_raw = output_buffer->mutable_data(); + const size_t num_values = values_.length(); + const uint8_t* raw_values = reinterpret_cast(values_.data()); + for (size_t i = 0; i < num_values; ++i) { + for (size_t j = 0U; j < num_streams; ++j) { + const uint8_t byte_in_value = raw_values[i * num_streams + j]; + output_buffer_raw[j * num_values + i] = byte_in_value; + } + } + values_.Reset(); + return std::move(output_buffer); +} + +template +void ByteStreamSplitEncoder::Put(const T* buffer, int num_values) { + PARQUET_THROW_NOT_OK(values_.Append(buffer, num_values)); +} + +template +void ByteStreamSplitEncoder::Put(const ::arrow::Array& values) { + PutArrowArray(values); +} + +template <> +void ByteStreamSplitEncoder::PutArrowArray(const ::arrow::Array& values) { + 
DirectPutImpl(values, + reinterpret_cast(&values_)); +} + +template <> +void ByteStreamSplitEncoder::PutArrowArray(const ::arrow::Array& values) { + DirectPutImpl(values, + reinterpret_cast(&values_)); +} + +template +void ByteStreamSplitEncoder::PutSpaced(const T* src, int num_values, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); +} + // ---------------------------------------------------------------------- // Encoder and decoder factory functions @@ -863,6 +961,18 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin DCHECK(false) << "Encoder not implemented"; break; } + } else if (encoding == Encoding::BYTE_STREAM_SPLIT) { + switch (type_num) { + case Type::FLOAT: + return std::unique_ptr( + new ByteStreamSplitEncoder(descr, pool)); + case Type::DOUBLE: + return std::unique_ptr( + new ByteStreamSplitEncoder(descr, pool)); + default: + throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); + break; + } } else { ParquetException::NYI("Selected encoding is not supported"); } @@ -2236,6 +2346,74 @@ class DeltaByteArrayDecoder : public DecoderImpl, ByteArray last_value_; }; +// ---------------------------------------------------------------------- +// BYTE_STREAM_SPLIT + +template +class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder { + public: + using T = typename DType::c_type; + explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr); + + int Decode(T* buffer, int max_values) override; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override; + + void SetData(int num_values, const uint8_t* data, int len) override; + + private: + int num_values_in_buffer{0U}; +}; + +template +ByteStreamSplitDecoder::ByteStreamSplitDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {} + +template +void ByteStreamSplitDecoder::SetData(int num_values, const uint8_t* data, + int len) { + DecoderImpl::SetData(num_values, data, len); + num_values_in_buffer = num_values; +} + +template +int ByteStreamSplitDecoder::Decode(T* buffer, int max_values) { + constexpr size_t num_streams = sizeof(T); + const int values_to_decode = std::min(num_values_, max_values); + const int num_decoded_previously = num_values_in_buffer - num_values_; + for (int i = 0; i < values_to_decode; ++i) { + uint8_t gathered_byte_data[num_streams]; + for (size_t b = 0; b < num_streams; ++b) { + const size_t byte_index = b * num_values_in_buffer + num_decoded_previously + i; + gathered_byte_data[b] = data_[byte_index]; + } + buffer[i] = arrow::util::SafeLoadAs(&gathered_byte_data[0]); + } + num_values_ -= values_to_decode; + len_ -= sizeof(T) * values_to_decode; + return values_to_decode; +} + +template +int ByteStreamSplitDecoder::DecodeArrow( + 
int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) { + ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder"); +} + +template +int ByteStreamSplitDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) { + ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder"); +} + // ---------------------------------------------------------------------- std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, @@ -2261,6 +2439,16 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin default: break; } + } else if (encoding == Encoding::BYTE_STREAM_SPLIT) { + switch (type_num) { + case Type::FLOAT: + return std::unique_ptr(new ByteStreamSplitDecoder(descr)); + case Type::DOUBLE: + return std::unique_ptr(new ByteStreamSplitDecoder(descr)); + default: + throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); + break; + } } else { ParquetException::NYI("Selected encoding is not supported"); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 473b985b35a..4a40d0162f8 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -29,6 +29,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "parquet/encoding.h" #include "parquet/platform.h" @@ -38,6 +39,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::internal::checked_cast; // TODO(hatemhelal): investigate whether this can be replaced with GTEST_SKIP in a future // gtest release that contains https://github.com/google/googletest/pull/1544 @@ -862,5 +864,190 @@ TEST_F(DictEncoding, CheckDecodeIndicesNoNulls) { CheckDict(actual_num_values, *builder); } +// ---------------------------------------------------------------------- +// BYTE_STREAM_SPLIT encode/decode tests. 
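// For reference, the transform exercised by the tests below can be written as a
// few lines of standalone C++. ByteStreamSplitEncode is a hypothetical helper
// used only for illustration here, not the encoder defined above: with
// K = sizeof(T) byte streams and N values, stream j holds the j-th byte of every
// value, so encoded[j * N + i] = raw[i * K + j].

#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
std::vector<std::uint8_t> ByteStreamSplitEncode(const std::vector<T>& values) {
  constexpr std::size_t K = sizeof(T);  // one byte stream per byte of T
  const std::size_t N = values.size();
  std::vector<std::uint8_t> raw(N * K), encoded(N * K);
  std::memcpy(raw.data(), values.data(), raw.size());
  for (std::size_t i = 0; i < N; ++i) {
    for (std::size_t j = 0; j < K; ++j) {
      encoded[j * N + i] = raw[i * K + j];  // byte j of value i goes to stream j
    }
  }
  return encoded;
}

// Decoding is the inverse gather, raw[i * K + j] = encoded[j * N + i], which is
// what ByteStreamSplitDecoder::Decode() above performs.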
+ +template +void TestByteStreamSplitDecodePath(const uint8_t* encoded_data, + const int64_t encoded_data_size, + const typename DType::c_type* expected_decoded_data, + const int num_elements, + const bool request_more_values) { + std::unique_ptr> decoder = + MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT); + decoder->SetData(num_elements, encoded_data, static_cast(encoded_data_size)); + std::vector decoded_data(num_elements); + int num_elements_to_decode = num_elements; + if (request_more_values) { + num_elements_to_decode += 100; + } + int num_decoded_elements = decoder->Decode(decoded_data.data(), num_elements_to_decode); + ASSERT_EQ(num_elements, num_decoded_elements); + for (size_t i = 0U; i < decoded_data.size(); ++i) { + ASSERT_EQ(expected_decoded_data[i], decoded_data[i]); + } + ASSERT_EQ(0, decoder->values_left()); +} + +template +void TestByteStreamSplitRoundTrip(const typename DType::c_type* input_data, const int n) { + auto encoder = MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT); + encoder->Put(input_data, n); + const int64_t estimated_num_bytes = encoder->EstimatedDataEncodedSize(); + ASSERT_EQ(static_cast(n) * sizeof(typename DType::c_type), + estimated_num_bytes); + std::shared_ptr buffer = encoder->FlushValues(); + TestByteStreamSplitDecodePath(buffer->data(), buffer->size(), input_data, n, + false); +} + +template +void TestEncodeDecodeWithBigInput() { + const int nvalues = 10000; + using T = typename DType::c_type; + std::vector data(nvalues); + GenerateData(nvalues, data.data(), nullptr); + TestByteStreamSplitRoundTrip(data.data(), nvalues); +} + +// Check that the encoder can handle input with one element. +TEST(ByteStreamSplitEncodeDecode, EncodeOneLenInput) { + const float value = 1.0f; + TestByteStreamSplitRoundTrip(&value, 1); +} + +// Check that the decoder can handle empty input. +TEST(ByteStreamSplitEncodeDecode, DecodeZeroLenInput) { + std::unique_ptr> decoder = + MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT); + decoder->SetData(0, NULL, 0); + ASSERT_EQ(0U, decoder->Decode(NULL, 0)); +} + +TEST(ByteStreamSplitEncodeDecode, DecodeOneLenInput) { + const uint8_t data[] = {0x47U, 0x24U, 0xa7U, 0x44U}; + TestByteStreamSplitDecodePath( + data, 4, reinterpret_cast(&data[0]), 1, false); +} + +// Check that requesting to decode more elements than is available in the storage +// of the decoder works correctly. +TEST(ByteStreamSplitEncodeDecode, DecodeLargerPortion) { + const uint8_t data[] = {0xDEU, 0xC0U, 0x37U, 0x13U, 0x11U, 0x22U, 0x33U, 0x44U, + 0xAAU, 0xBBU, 0xCCU, 0xDDU, 0x55U, 0x66U, 0x77U, 0x88U}; + const uint64_t expected_output[2] = {0x7755CCAA331137DEULL, 0x8866DDBB442213C0ULL}; + TestByteStreamSplitDecodePath( + data, sizeof(data), reinterpret_cast(&expected_output[0]), 2, true); +} + +// Check that the decoder can decode the input in smaller steps. 
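// Successive Decode() calls are expected to pick up where the previous call
// stopped within every byte stream, since the decoder tracks how many values it
// has already emitted.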
+TEST(ByteStreamSplitEncodeDecode, DecodeMultipleTimes) { + std::unique_ptr> decoder = + MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT); + const int num_values = 100; + std::vector data(num_values * 4); + for (size_t i = 0; i < data.size(); ++i) { + data[i] = static_cast(i & 0xFFU); + } + decoder->SetData(num_values, data.data(), num_values * 4); + + const int step = 25; + std::vector decoded_data(step); + for (int i = 0; i < num_values; i += step) { + int num_decoded = decoder->Decode(decoded_data.data(), step); + ASSERT_EQ(step, num_decoded); + for (int j = 0; j < step; ++j) { + const uint32_t assembled_value = + static_cast(data[i + j]) | + (static_cast(data[(i + j) + num_values]) << 8U) | + (static_cast(data[(i + j) + num_values * 2]) << 16U) | + (static_cast(data[(i + j) + num_values * 3]) << 24U); + const float assembled_value_as_float = + *reinterpret_cast(&assembled_value); + ASSERT_EQ(assembled_value_as_float, decoded_data[j]); + } + } +} + +// Check that an encode-decode pipeline produces the original small input. +// This small-input test is added to ease debugging in case of changes to +// the encoder/decoder implementation. +TEST(ByteStreamSplitEncodeDecode, SmallInput) { + const float data[] = {-166.166f, -0.2566f, .0f, 322.0f, 178888.189f}; + const int num_values = sizeof(data) / sizeof(data[0U]); + TestByteStreamSplitRoundTrip(data, num_values); +} + +TEST(ByteStreamSplitEncodeDecode, PutSpaced) { + const float data[] = {-1.0f, .0f, .0f, 3.0f, .0f, 22.1234f, + .0f, 198891.0f, .0f, -223345.4455f, 24443.124f}; + const float valid_data[] = {-1.0f, 3.0f, 22.1234f, + 198891.0f, -223345.4455f, 24443.124f}; + // The valid ones are the ones which are non-zero. + // The enable bits are: 10010101 011. + const uint8_t valid_bits[2] = {0xA9U, 0x6U}; + const int num_values = sizeof(data) / sizeof(data[0U]); + const int num_valid_values = sizeof(valid_data) / sizeof(valid_data[0U]); + std::unique_ptr> encoder = + MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT); + encoder->PutSpaced(data, num_values, valid_bits, 0); + std::shared_ptr buffer = encoder->FlushValues(); + + TestByteStreamSplitDecodePath(buffer->data(), buffer->size(), valid_data, + num_valid_values, false); +} + +TEST(ByteStreamSplitEncodeDecode, PutArrow) { + arrow::random::RandomArrayGenerator rag{1337}; + const int num_values = 123; + auto arr = rag.Float32(num_values, -2048.0f, 2048.0f, 0); + std::unique_ptr> encoder = + MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT); + encoder->Put(*arr); + std::shared_ptr buffer = encoder->FlushValues(); + + auto raw_values = checked_cast(*arr).raw_values(); + TestByteStreamSplitDecodePath(buffer->data(), buffer->size(), raw_values, + num_values, false); +} + +// Test that the encode-decode pipeline can handle big 32-bit FP input. +TEST(ByteStreamSplitEncodeDecode, BigInputFloat) { + TestEncodeDecodeWithBigInput(); +} + +// Test that the encode-decode pipeline can handle big 64-bit FP input. +TEST(ByteStreamSplitEncodeDecode, BigInputDouble) { + TestEncodeDecodeWithBigInput(); +} + +TEST(ByteStreamSplitEncodeDecode, InvalidDataTypes) { + // First check encoders. 
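// Every non-floating-point Parquet type should be rejected, since the factories
// above only wire BYTE_STREAM_SPLIT up for FLOAT and DOUBLE.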
+ ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder(Encoding::BYTE_STREAM_SPLIT), ParquetException); + + // Then check decoders. + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder(Encoding::BYTE_STREAM_SPLIT), ParquetException); +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index e7efe6a263c..9dc13e42191 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -157,6 +157,8 @@ std::string EncodingToString(Encoding::type t) { return "DELTA_BYTE_ARRAY"; case Encoding::RLE_DICTIONARY: return "RLE_DICTIONARY"; + case Encoding::BYTE_STREAM_SPLIT: + return "BYTE_STREAM_SPLIT"; default: return "UNKNOWN"; } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index ebb8c2446e3..420494a8951 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -452,6 +452,7 @@ struct Encoding { DELTA_LENGTH_BYTE_ARRAY = 6, DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, + BYTE_STREAM_SPLIT = 9, UNKNOWN = 999 }; }; From c02d376ed31d51dce0f37e13b9a2f4cb0c39a792 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 5 Feb 2020 15:45:32 +0100 Subject: [PATCH 23/26] ARROW-4226: [C++] Add sparse CSF tensor support This is to resolve [ARROW-4226](https://issues.apache.org/jira/browse/ARROW-4226). Closes #5716 from rok/ARROW-4226 and squashes the following commits: 9ca93ab60 Implementing review feedback. 1b922f6ae Implementing review feedback. 11b81bb04 Factoring out index incrementing for dense to COO and CSF indices. 6f4f4a8f9 Implementing feedback review. 28d38cb5e Removing backslashes from comments. 3291abc90 Marking indptrBuffers, indicesBuffers and axisOrder required. d9ff47e67 Further work and implementing review feedback. 24a831f3e Style. 4f2bf00dd Work on CSF index tests. 6ceb406b6 Implementing review feedback. bd0d8c2f8 Dense to sparse CSF conversion now in order of dimension size. eb519471d Switching SparseCSFIndex to '2D' data structure. a322ff5b2 Adding tests for multiple index value types for SparseCSFIndex. f44d92cfd Adding SparseCSFIndex::Make. 7d17995a4 Adding Tensor to SparseCSFTensor conversion. 05a47a546 Using axis_order in CSF. 6b938f7da Documentation. 
2d1010405 WIP Authored-by: Rok Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compare.cc | 12 + cpp/src/arrow/python/serialize.cc | 5 + cpp/src/arrow/sparse_tensor.cc | 345 ++++++++++++++++++++++++++-- cpp/src/arrow/sparse_tensor.h | 67 ++++++ cpp/src/arrow/sparse_tensor_test.cc | 222 ++++++++++++++++++ format/SparseTensor.fbs | 2 +- 6 files changed, 628 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f7431f80f5f..d2322009ea8 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1196,6 +1196,13 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_csf); + } + default: return false; } @@ -1232,6 +1239,11 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { return SparseTensorEqualsImplDispatch(left_csc, right); } + case SparseTensorFormat::CSF: { + const auto& left_csf = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_csf, right); + } + default: return false; } diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 09a322b1060..06c85648591 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -654,6 +654,7 @@ Status CountSparseTensors( OwnedRef num_sparse_tensors(PyDict_New()); size_t num_coo = 0; size_t num_csr = 0; + size_t num_csf = 0; for (const auto& sparse_tensor : sparse_tensors) { switch (sparse_tensor->format_id()) { @@ -663,6 +664,9 @@ Status CountSparseTensors( case SparseTensorFormat::CSR: ++num_csr; break; + case SparseTensorFormat::CSF: + ++num_csf; + break; case SparseTensorFormat::CSC: // TODO(mrkn): support csc break; @@ -671,6 +675,7 @@ Status CountSparseTensors( PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo)); PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr)); + PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf)); RETURN_IF_PYERROR(); *out = num_sparse_tensors.detach(); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index d42bdf4ca61..549223c0798 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -23,9 +23,11 @@ #include #include +#include "arrow/buffer_builder.h" #include "arrow/compare.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" #include "arrow/visitor_inline.h" namespace arrow { @@ -55,6 +57,38 @@ class SparseTensorConverter { Status Convert() { return Status::Invalid("Unsupported sparse index"); } }; +// ---------------------------------------------------------------------- +// IncrementIndex for SparseCOOIndex and SparseCSFIndex + +inline void IncrementIndex(std::vector& coord, + const std::vector& shape) { + const int64_t ndim = shape.size(); + ++coord[ndim - 1]; + if (coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } +} + +inline void IncrementIndex(std::vector& coord, const std::vector& shape, + const std::vector& axis_order) { + const int64_t ndim = shape.size(); + const int64_t last_axis = axis_order[ndim - 1]; + ++coord[last_axis]; + if (coord[last_axis] == shape[last_axis]) { + int64_t d = ndim - 1; + while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) { + coord[axis_order[d]] = 0; + ++coord[axis_order[d - 1]]; + --d; + } + } +} + // 
---------------------------------------------------------------------- // SparseTensorConverter for SparseCOOIndex @@ -128,17 +162,7 @@ class SparseTensorConverter *indices++ = static_cast(coord[i]); } } - - // increment index - ++coord[ndim - 1]; - if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { - int64_t d = ndim - 1; - while (d > 0 && coord[d] == shape[d]) { - coord[d] = 0; - ++coord[d - 1]; - --d; - } - } + IncrementIndex(coord, shape); } } @@ -419,6 +443,144 @@ class SparseTensorConverter inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } }; +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSFIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using typename BaseClass::NumericTensorType; + using typename BaseClass::value_type; + + SparseTensorConverter(const NumericTensorType& tensor, + const std::shared_ptr& index_value_type, + MemoryPool* pool) + : BaseClass(tensor, index_value_type, pool) {} + + template + Status Convert() { + using c_index_value_type = typename IndexValueType::c_type; + RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); + + const int64_t ndim = tensor_.ndim(); + // Axis order as ascending order of dimension size is a good heuristic but is not + // necessarily optimal. + std::vector axis_order = internal::ArgSort(tensor_.shape()); + int64_t nonzero_count = -1; + RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count)); + + std::shared_ptr values_buffer; + RETURN_NOT_OK( + AllocateBuffer(pool_, sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + std::vector counts(ndim, 0); + std::vector coord(ndim, 0); + std::vector previous_coord(ndim, -1); + std::vector> indptr_buffer_builders(ndim - 1); + std::vector> indices_buffer_builders(ndim); + + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + const std::vector& shape = tensor_.shape(); + for (int64_t n = tensor_.size(); n > 0; n--) { + const value_type x = tensor_.Value(coord); + + if (x != 0) { + bool tree_split = false; + *values++ = x; + + for (int64_t i = 0; i < ndim; ++i) { + int64_t dimension = axis_order[i]; + + tree_split = tree_split || (coord[dimension] != previous_coord[dimension]); + if (tree_split) { + if (i < ndim - 1) { + RETURN_NOT_OK(indptr_buffer_builders[i].Append( + static_cast(counts[i + 1]))); + } + RETURN_NOT_OK(indices_buffer_builders[i].Append( + static_cast(coord[dimension]))); + ++counts[i]; + } + } + previous_coord = coord; + } + IncrementIndex(coord, shape, axis_order); + } + } + + for (int64_t column = 0; column < ndim - 1; ++column) { + RETURN_NOT_OK(indptr_buffer_builders[column].Append( + static_cast(counts[column + 1]))); + } + + // make results + data = values_buffer; + + std::vector> indptr_buffers(ndim - 1); + std::vector> indices_buffers(ndim); + std::vector indptr_shapes(counts.begin(), counts.end() - 1); + std::vector indices_shapes = counts; + + for (int64_t column = 0; column < ndim; ++column) { + RETURN_NOT_OK( + indices_buffer_builders[column].Finish(&indices_buffers[column], true)); + } + for (int64_t column = 0; column < ndim - 1; ++column) { + RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true)); + } + + ARROW_ASSIGN_OR_RAISE( + sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order, + indptr_buffers, 
indices_buffers)); + return Status::OK(); + } + +#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return Convert(); + + Status Convert() { + switch (index_value_type_->id()) { + ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); + // LCOV_EXCL_START: The following invalid causes program failure. + default: + return Status::TypeError("Unsupported SparseTensor index value type"); + // LCOV_EXCL_STOP + } + } + +#undef CALL_TYPE_SPECIFIC_CONVERT + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::index_value_type_; + using BaseClass::pool_; + using BaseClass::tensor_; + + template + inline Status CheckMaximumValue(const c_value_type type_max) const { + auto max_dimension = + *std::max_element(tensor_.shape().begin(), tensor_.shape().end()); + if (static_cast(type_max) < max_dimension) { + // LCOV_EXCL_START: The following invalid causes program failure. + return Status::Invalid("The bit width of the index value type is too small"); + // LCOV_EXCL_STOP + } + return Status::OK(); + } + + inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); } + + inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } +}; + // ---------------------------------------------------------------------- // Instantiate templates @@ -438,6 +600,7 @@ class SparseTensorConverter INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSCIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSFIndex); } // namespace @@ -500,6 +663,10 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, case SparseTensorFormat::CSC: return MakeSparseTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); + case SparseTensorFormat::CSF: + return MakeSparseTensorFromTensor(tensor, index_value_type, pool, + out_sparse_index, out_data); + // LCOV_EXCL_START: ignore program failure default: return Status::Invalid("Invalid sparse tensor format"); @@ -507,6 +674,35 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, } } +namespace { + +template +void ExpandSparseCSFTensorValues(int64_t dimension, int64_t dense_offset, + int64_t first_ptr, int64_t last_ptr, + const SparseCSFIndex& sparse_index, const TYPE* raw_data, + const std::vector& strides, + const std::vector& axis_order, TYPE* out) { + int64_t ndim = axis_order.size(); + + for (int64_t i = first_ptr; i < last_ptr; ++i) { + int64_t tmp_dense_offset = + dense_offset + sparse_index.indices()[dimension]->Value({i}) * + strides[axis_order[dimension]]; + + if (dimension < ndim - 1) { + ExpandSparseCSFTensorValues( + dimension + 1, tmp_dense_offset, + sparse_index.indptr()[dimension]->Value({i}), + sparse_index.indptr()[dimension]->Value({i + 1}), sparse_index, + raw_data, strides, axis_order, out); + } else { + out[tmp_dense_offset] = raw_data[i]; + } + } +} + +} // namespace + template Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor, std::shared_ptr* out) { @@ -521,18 +717,20 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t std::fill_n(values, sparse_tensor->size(), static_cast(0)); + std::vector strides(sparse_tensor->ndim(), 1); + for (int i = sparse_tensor->ndim() - 1; i > 0; --i) { + strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; + } + std::vector empty_strides; + + const auto raw_data = reinterpret_cast(sparse_tensor->raw_data()); + switch 
(sparse_tensor->format_id()) { case SparseTensorFormat::COO: { const auto& sparse_index = internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr coords = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); - std::vector strides(sparse_tensor->ndim(), 1); - for (int i = sparse_tensor->ndim() - 1; i > 0; --i) { - strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; - } for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) { std::vector coord(sparse_tensor->ndim()); int64_t offset = 0; @@ -543,7 +741,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t values[offset] = raw_data[i]; } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -552,8 +751,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t i = 0; i < indptr->size() - 1; ++i) { @@ -565,7 +762,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -574,8 +772,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t j = 0; j < indptr->size() - 1; ++j) { @@ -587,7 +783,21 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); + return Status::OK(); + } + + case SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + + ExpandSparseCSFTensorValues( + 0, 0, 0, sparse_index.indptr()[0]->size() - 1, sparse_index, raw_data, strides, + sparse_index.axis_order(), values); + *out = std::make_shared(sparse_tensor->type(), values_buffer, + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } } @@ -625,6 +835,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t const std::shared_ptr indices = sparse_index.indices(); type = indices->type(); break; + } + case SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + const std::vector> indices = sparse_index.indices(); + type = indices[0]->type(); + break; } // LCOV_EXCL_START: ignore program failure default: @@ -754,6 +971,86 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, } // namespace internal +// ---------------------------------------------------------------------- +// SparseCSFIndex + +namespace { + +inline Status CheckSparseCSFIndexValidity(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const int64_t num_indptrs, + 
const int64_t num_indices, + const std::vector& indptr_shape, + const std::vector& indices_shape, + const int64_t axis_order_size) { + if (!is_integer(indptr_type->id())) { + return Status::TypeError("Type of SparseCSFIndex indptr must be integer"); + } + if (!is_integer(indices_type->id())) { + return Status::TypeError("Type of SparseCSFIndex indices must be integer"); + } + if (num_indptrs + 1 != num_indices) { + return Status::Invalid( + "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex."); + } + if (axis_order_size != num_indices) { + return Status::Invalid( + "Length of indices must be equal to number of dimensions for SparseCSFIndex."); + } + return Status::OK(); +} + +} // namespace + +Result> SparseCSFIndex::Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data) { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) + indptr[i] = std::make_shared(indptr_type, indptr_data[i], + std::vector({indices_shapes[i] + 1})); + for (int64_t i = 0; i < ndim; ++i) + indices[i] = std::make_shared(indices_type, indices_data[i], + std::vector({indices_shapes[i]})); + + RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), + indices.size(), indptr.back()->shape(), + indices.back()->shape(), axis_order.size())); + + return std::make_shared(indptr, indices, axis_order); +} + +// Constructor with two index vectors +SparseCSFIndex::SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, + const std::vector& axis_order) + : SparseIndexBase(indices.back()->size()), + indptr_(indptr), + indices_(indices), + axis_order_(axis_order) { + ARROW_CHECK_OK(CheckSparseCSFIndexValidity( + indptr_.front()->type(), indices_.front()->type(), indptr_.size(), indices_.size(), + indptr_.back()->shape(), indices_.back()->shape(), axis_order_.size())); +} + +std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } + +bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const { + for (int64_t i = 0; i < static_cast(indices().size()); ++i) { + if (!indices()[i]->Equals(*other.indices()[i])) return false; + } + for (int64_t i = 0; i < static_cast(indptr().size()); ++i) { + if (!indptr()[i]->Equals(*other.indptr()[i])) return false; + } + return axis_order() == other.axis_order(); +} + // ---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index f736f7b7576..33a53761e14 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -40,6 +40,8 @@ struct SparseTensorFormat { CSR, /// Compressed sparse column (CSC) format. CSC, + /// Compressed sparse fiber (CSF) format. + CSF }; }; @@ -329,6 +331,68 @@ class ARROW_EXPORT SparseCSCIndex using SparseCSXIndex::SparseCSXIndex; }; +// ---------------------------------------------------------------------- +// SparseCSFIndex class + +/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor +/// +/// A CSF sparse index manages the location of its non-zero values by set of +/// prefix trees. Each path from a root to leaf forms one tensor non-zero index. +/// CSF is implemented with three vectors. 
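///
/// As an illustration (a sketch, not drawn from the code below): a 3 x 4 matrix
/// with non-zero values a at (0, 1), b at (0, 3) and c at (2, 2), traversed in
/// axis order {0, 1}, is described by
///   indices[0] = {0, 2}       rows containing non-zero values
///   indptr[0]  = {0, 2, 3}    child range of each such row within indices[1]
///   indices[1] = {1, 3, 2}    column of each non-zero value, grouped by row
/// with the value buffer holding {a, b, c}.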
+/// +/// Vectors inptr and indices contain N-1 and N buffers respectively, where N is the +/// number of dimensions. Axis_order is a vector of integers of legth N. Indptr and +/// indices describe the set of prefix trees. Trees traverse dimensions in order given by +/// axis_order. +class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase { + public: + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF; + static constexpr char const* kTypeName = "SparseCSFIndex"; + + /// \brief Make SparseCSFIndex from raw properties + static Result> Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data); + + /// \brief Make SparseCSFIndex from raw properties + static Result> Make( + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data) { + return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data, + indices_data); + } + + /// \brief Construct SparseCSFIndex from two index vectors + explicit SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, + const std::vector& axis_order); + + /// \brief Return a 1D vector of indptr tensors + const std::vector>& indptr() const { return indptr_; } + + /// \brief Return a 1D vector of indices tensors + const std::vector>& indices() const { return indices_; } + + /// \brief Return a 1D vector specifying the order of axes + const std::vector& axis_order() const { return axis_order_; } + + /// \brief Return a string representation of the sparse index + std::string ToString() const override; + + /// \brief Return whether the CSF indices are equal + bool Equals(const SparseCSFIndex& other) const; + + protected: + std::vector> indptr_; + std::vector> indices_; + std::vector axis_order_; +}; + // ---------------------------------------------------------------------- // SparseTensor class @@ -527,6 +591,9 @@ using SparseCSRMatrix = SparseTensorImpl; /// \brief EXPERIMENTAL: Type alias for CSC sparse matrix using SparseCSCMatrix = SparseTensorImpl; +/// \brief EXPERIMENTAL: Type alias for CSF sparse matrix +using SparseCSFTensor = SparseTensorImpl; + } // namespace arrow #endif // ARROW_SPARSE_TENSOR_H diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 198aa8f5f8d..45cb8dcc8f3 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -30,6 +30,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/sort.h" namespace arrow { @@ -910,4 +911,225 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } +template +class TestSparseCSFTensorBase : public ::testing::Test { + public: + void SetUp() { + dim_names_ = {"a", "b", "c", "d"}; + shape_ = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor_(int16(), dense_buffer, shape_, {}, dim_names_); + ASSERT_OK_AND_ASSIGN( + 
sparse_tensor_from_dense_, + SparseCSFTensor::Make(dense_tensor_, + TypeTraits::type_singleton())); + } + + protected: + std::vector shape_; + std::vector dim_names_; + std::shared_ptr sparse_tensor_from_dense_; +}; + +class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; + +template +class TestSparseCSFTensorForIndexValueType + : public TestSparseCSFTensorBase { + protected: + std::shared_ptr MakeSparseCSFIndex( + const std::vector& axis_order, + const std::vector>& indptr_values, + const std::vector>& indices_values) + const { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) { + indptr[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indptr_values[i]), + std::vector({static_cast(indptr_values[i].size())})); + } + for (int64_t i = 0; i < ndim; ++i) { + indices[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indices_values[i]), + std::vector({static_cast(indices_values[i].size())})); + } + return std::make_shared(indptr, indices, axis_order); + } + + template + std::shared_ptr MakeSparseTensor( + const std::shared_ptr& si, std::vector& sparse_values, + const std::vector& shape, + const std::vector& dim_names) const { + auto data_buffer = Buffer::Wrap(sparse_values); + return std::make_shared( + si, CTypeTraits::type_singleton(), data_buffer, shape, dim_names); + } +}; + +TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType); + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector shape = {2, 3, 4, 5}; + std::vector dim_names = {"a", "b", "c", "d"}; + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector> indptr_values = { + {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + std::vector> indices_values = { + {0, 1}, {0, 1, 1}, {0, 0, 1, 1}, {1, 2, 0, 2, 0, 0, 1, 2}}; + + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, dim_names); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestTensorToSparseTensor) { + std::vector dim_names = {"a", "b", "c", "d"}; + ASSERT_EQ(8, this->sparse_tensor_from_dense_->non_zero_length()); + ASSERT_TRUE(this->sparse_tensor_from_dense_->is_mutable()); + ASSERT_EQ(dim_names, this->sparse_tensor_from_dense_->dim_names()); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { + std::vector shape = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); + + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + ASSERT_TRUE(dense_tensor.Equals(*dt)); + ASSERT_EQ(dense_tensor.dim_names(), dt->dim_names()); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestRoundTrip) { + using IndexValueType = TypeParam; + + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + 
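  // Re-encode the reconstructed dense tensor and check that it compares equal to
  // the original sparse CSF tensor.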
std::shared_ptr st; + ASSERT_OK_AND_ASSIGN( + st, SparseCSFTensor::Make(*dt, TypeTraits::type_singleton())); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestAlternativeAxisOrder) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector dense_values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; + std::vector shape = {4, 6}; + std::vector dim_names = {"a", "b"}; + std::shared_ptr dense_buffer = Buffer::Wrap(dense_values); + Tensor tensor(int16(), dense_buffer, shape, {}, dim_names); + + // Axis order 1 + std::vector axis_order_1 = {0, 1}; + std::vector sparse_values_1 = {1, 3, 2, 4, 5}; + std::vector> indptr_values_1 = {{0, 2, 3, 5}}; + std::vector> indices_values_1 = {{0, 1, 3}, + {0, 3, 1, 3, 5}}; + auto si_1 = this->MakeSparseCSFIndex(axis_order_1, indptr_values_1, indices_values_1); + auto st_1 = this->MakeSparseTensor(si_1, sparse_values_1, shape, dim_names); + + // Axis order 2 + std::vector axis_order_2 = {1, 0}; + std::vector sparse_values_2 = {1, 2, 3, 4, 5}; + std::vector> indptr_values_2 = {{0, 1, 2, 4, 5}}; + std::vector> indices_values_2 = {{0, 1, 3, 5}, + {0, 1, 0, 3, 3}}; + auto si_2 = this->MakeSparseCSFIndex(axis_order_2, indptr_values_2, indices_values_2); + auto st_2 = this->MakeSparseTensor(si_2, sparse_values_2, shape, dim_names); + + std::shared_ptr dt_1, dt_2; + ASSERT_OK(st_1->ToTensor(&dt_1)); + ASSERT_OK(st_2->ToTensor(&dt_2)); + + ASSERT_FALSE(st_1->Equals(*st_2)); + ASSERT_TRUE(dt_1->Equals(*dt_2)); + ASSERT_TRUE(dt_1->Equals(tensor)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestNonAscendingShape) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector shape = {5, 2, 3, 4}; + int16_t dense_values[5][2][3][4] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); + + std::shared_ptr sparse_tensor; + ASSERT_OK_AND_ASSIGN( + sparse_tensor, + SparseCSFTensor::Make(dense_tensor, TypeTraits::type_singleton())); + + std::vector> indptr_values = { + {0, 1, 3}, {0, 2, 4, 7}, {0, 1, 2, 3, 4, 6, 7, 8}}; + std::vector> indices_values = { + {0, 1}, {0, 0, 1}, {1, 2, 0, 2, 0, 1, 2}, {0, 0, 0, 0, 0, 1, 1, 1}}; + std::vector axis_order = {1, 2, 3, 0}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, this->dim_names_); + + std::shared_ptr dt; + ASSERT_OK(st->ToTensor(&dt)); + ASSERT_TRUE(dt->Equals(dense_tensor)); + ASSERT_TRUE(st->Equals(*sparse_tensor)); +} + +REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor, + TestTensorToSparseTensor, TestSparseTensorToTensor, + TestAlternativeAxisOrder, TestNonAscendingShape, + TestRoundTrip); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCSFTensorForIndexValueType, 
Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseCSFTensorForIndexValueType, + UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCSFTensorForIndexValueType, Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCSFTensorForIndexValueType, + UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSFTensorForIndexValueType, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCSFTensorForIndexValueType, + UInt64Type); } // namespace arrow diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 1de67eed19a..9c8ddae0b7c 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -116,7 +116,7 @@ table SparseMatrixIndexCSX { union SparseTensorIndex { SparseTensorIndexCOO, - SparseMatrixIndexCSX, + SparseMatrixIndexCSX } table SparseTensor { From a1ba1cbe1713d252e473a56a3595df58dcbd8a28 Mon Sep 17 00:00:00 2001 From: Takashi Hashida Date: Wed, 5 Feb 2020 12:10:31 -0600 Subject: [PATCH 24/26] ARROW-7514: [C#] Make GetValueOffset Obsolete * Add an [Obsolete] attribute to `BinaryArray.GetValueOffset` * `ListArray.GetValueOffset` already has the [Obsolete] attribute, so it is not changed * Avoid using `GetValueOffset` in the product source code As a precaution, I added tests for `ValueOffsets` and left tests for `GetValueOffset`. Closes #6333 from HashidaTKS/ARROW-7514_make_getvalueoffset_obsolete and squashes the following commits: 1dbaf39f0 ARROW-7514_make_getvalueoffset_obsolete 92b14c055 ARROW-7514_make_getvalueoffset_obsolete 07d106c48 ARROW-7514_make_getvalueoffset_obsolete Authored-by: Takashi Hashida Signed-off-by: Eric Erhardt --- csharp/src/Apache.Arrow/Arrays/BinaryArray.cs | 9 ++++++--- csharp/src/Apache.Arrow/Arrays/ListArray.cs | 2 +- csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs | 8 ++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs index 49c483fc41f..e2cf0bc772a 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -171,6 +171,7 @@ public BinaryArray(IArrowType dataType, int length, public ReadOnlySpan Values => ValueBuffer.Span.CastTo(); [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Obsolete("This method has been deprecated. Please use ValueOffsets[index] instead.")] public int GetValueOffset(int index) { if (index < 0 || index > Length) @@ -193,10 +194,12 @@ public int GetValueLength(int index) public ReadOnlySpan GetBytes(int index) { - var offset = GetValueOffset(index); - var length = GetValueLength(index); + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } - return ValueBuffer.Span.Slice(offset, length); + return ValueBuffer.Span.Slice(ValueOffsets[index], GetValueLength(index)); } } diff --git a/csharp/src/Apache.Arrow/Arrays/ListArray.cs b/csharp/src/Apache.Arrow/Arrays/ListArray.cs index ff7095d5da6..bc171e6ffbd 100644 --- a/csharp/src/Apache.Arrow/Arrays/ListArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/ListArray.cs @@ -123,7 +123,7 @@ private ListArray(ArrayData data, IArrowArray values) : base(data) public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); - [Obsolete] + [Obsolete("This method has been deprecated. 
Please use ValueOffsets[index] instead.")] public int GetValueOffset(int index) { if (index < 0 || index > Length) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index f2a8e58c89f..a1a7b231314 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -40,11 +40,19 @@ public void ThrowsWhenGetValueAndOffsetIndexOutOfBounds() Assert.Equal(1, array.GetValueLength(1)); Assert.Throws(() => array.GetValueLength(2)); +#pragma warning disable 618 Assert.Throws(() => array.GetValueOffset(-1)); Assert.Equal(0, array.GetValueOffset(0)); Assert.Equal(1, array.GetValueOffset(1)); Assert.Equal(2, array.GetValueOffset(2)); Assert.Throws(() => array.GetValueOffset(3)); +#pragma warning restore 618 + + Assert.Throws(() => array.ValueOffsets[-1]); + Assert.Equal(0, array.ValueOffsets[0]); + Assert.Equal(1, array.ValueOffsets[1]); + Assert.Equal(2, array.ValueOffsets[2]); + Assert.Throws(() => array.ValueOffsets[3]); } From 12d58e3239302e84febddadc90971cb5a6d521c6 Mon Sep 17 00:00:00 2001 From: Richard Artoul Date: Wed, 5 Feb 2020 20:39:17 -0500 Subject: [PATCH 25/26] Add Reset method to buffer and cleanup comments --- go/arrow/memory/buffer.go | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/go/arrow/memory/buffer.go b/go/arrow/memory/buffer.go index 234f5d4337f..006e45577b1 100644 --- a/go/arrow/memory/buffer.go +++ b/go/arrow/memory/buffer.go @@ -22,6 +22,7 @@ import ( "github.com/apache/arrow/go/arrow/internal/debug" ) +// Buffer is a wrapper type for a buffer of bytes. type Buffer struct { refCount int64 buf []byte @@ -35,7 +36,7 @@ func NewBufferBytes(data []byte) *Buffer { return &Buffer{refCount: 0, buf: data, length: len(data)} } -// NewBuffer creates a mutable, resizable buffer with an Allocator for managing memory. +// NewResizableBuffer creates a mutable, resizable buffer with an Allocator for managing memory. func NewResizableBuffer(mem Allocator) *Buffer { return &Buffer{refCount: 1, mutable: true, mem: mem} } @@ -60,15 +61,29 @@ func (b *Buffer) Release() { } } +// Reset resets the buffer for reuse. +func (b *Buffer) Reset(buf []byte) { + b.refCount = 0 + b.buf = buf + b.length = len(buf) +} + // Buf returns the slice of memory allocated by the Buffer, which is adjusted by calling Reserve. func (b *Buffer) Buf() []byte { return b.buf } // Bytes returns a slice of size Len, which is adjusted by calling Resize. func (b *Buffer) Bytes() []byte { return b.buf[:b.length] } + +// Mutable returns a bool indicating whether the buffer is mutable or not. func (b *Buffer) Mutable() bool { return b.mutable } -func (b *Buffer) Len() int { return b.length } -func (b *Buffer) Cap() int { return len(b.buf) } +// Len returns the length of the buffer. +func (b *Buffer) Len() int { return b.length } + +// Cap returns the capacity of the buffer. +func (b *Buffer) Cap() int { return len(b.buf) } + +// Reserve reserves the provided amount of capacity for the buffer. func (b *Buffer) Reserve(capacity int) { if capacity > len(b.buf) { newCap := roundUpToMultipleOf64(capacity) @@ -80,10 +95,13 @@ func (b *Buffer) Reserve(capacity int) { } } +// Resize resizes the buffer to the target size. func (b *Buffer) Resize(newSize int) { b.resize(newSize, true) } +// ResizeNoShrink resizes the buffer to the target size, but will not +// shrink it. 
func (b *Buffer) ResizeNoShrink(newSize int) { b.resize(newSize, false) } From fb575241b4130c068c1153613cacd8c9c508077a Mon Sep 17 00:00:00 2001 From: Richard Artoul Date: Wed, 5 Feb 2020 20:43:35 -0500 Subject: [PATCH 26/26] Dont reset refcount --- go/arrow/memory/buffer.go | 1 - 1 file changed, 1 deletion(-) diff --git a/go/arrow/memory/buffer.go b/go/arrow/memory/buffer.go index 006e45577b1..57c0db487d1 100644 --- a/go/arrow/memory/buffer.go +++ b/go/arrow/memory/buffer.go @@ -63,7 +63,6 @@ func (b *Buffer) Release() { // Reset resets the buffer for reuse. func (b *Buffer) Reset(buf []byte) { - b.refCount = 0 b.buf = buf b.length = len(buf) }