From e49f8a70f4a8e72efb781c0acdc98a5002108c68 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 23 Mar 2020 13:53:51 -0400 Subject: [PATCH 01/10] ARROW-8164: [C++][Dataset] Provide Dataset::ReplaceSchema() --- cpp/src/arrow/dataset/dataset.cc | 17 +++++ cpp/src/arrow/dataset/dataset.h | 27 +++++-- cpp/src/arrow/dataset/dataset_test.cc | 80 ++++++++++++++++++-- cpp/src/arrow/dataset/file_base.cc | 8 ++ cpp/src/arrow/dataset/file_base.h | 3 + cpp/src/arrow/dataset/file_test.cc | 24 ++++++ cpp/src/arrow/dataset/projector.cc | 43 +++++++---- cpp/src/arrow/dataset/projector.h | 2 + cpp/src/arrow/result.h | 6 ++ python/pyarrow/_dataset.pyx | 14 +++- python/pyarrow/includes/libarrow_dataset.pxd | 2 + r/R/arrowExports.R | 4 + r/R/dataset.R | 6 +- r/src/arrowExports.cpp | 17 +++++ r/src/dataset.cpp | 7 ++ 15 files changed, 229 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc index 2e17cc3ec67..37d0da0bbc5 100644 --- a/cpp/src/arrow/dataset/dataset.cc +++ b/cpp/src/arrow/dataset/dataset.cc @@ -135,6 +135,12 @@ InMemoryDataset::InMemoryDataset(std::shared_ptr table) : Dataset(table->schema()), get_batches_(new TableRecordBatchGenerator(std::move(table))) {} +Result> InMemoryDataset::ReplaceSchema( + std::shared_ptr schema) const { + RETURN_NOT_OK(CheckProjectable(*schema_, *schema)); + return std::make_shared(std::move(schema), get_batches_); +} + FragmentIterator InMemoryDataset::GetFragmentsImpl( std::shared_ptr scan_options) { auto schema = this->schema(); @@ -175,6 +181,17 @@ Result> UnionDataset::Make(std::shared_ptr new UnionDataset(std::move(schema), std::move(children))); } +Result> UnionDataset::ReplaceSchema( + std::shared_ptr schema) const { + auto children = children_; + for (auto& child : children) { + ARROW_ASSIGN_OR_RAISE(child, child->ReplaceSchema(schema)); + } + + return std::shared_ptr( + new UnionDataset(std::move(schema), std::move(children))); +} + FragmentIterator UnionDataset::GetFragmentsImpl(std::shared_ptr options) { return GetFragmentsFromDatasets(children_, options); } diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 9f0e26feabe..740263df3f8 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -122,6 +122,13 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { /// \brief The name identifying the kind of Dataset virtual std::string type_name() const = 0; + /// \brief Return a copy of this Dataset with a different schema. + /// + /// The copy will view the same Fragments. If the new schema is not compatible with the + /// original dataset's schema then an error will be raised. + virtual Result> ReplaceSchema( + std::shared_ptr schema) const = 0; + virtual ~Dataset() = default; protected: @@ -155,7 +162,7 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset { }; InMemoryDataset(std::shared_ptr schema, - std::unique_ptr get_batches) + std::shared_ptr get_batches) : Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {} // Convenience constructor taking a fixed list of batches @@ -163,12 +170,15 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset { explicit InMemoryDataset(std::shared_ptr
table); - FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; - std::string type_name() const override { return "in-memory"; } - private: - std::unique_ptr get_batches_; + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + protected: + FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; + + std::shared_ptr get_batches_; }; /// \brief A Dataset wrapping child Datasets. @@ -182,13 +192,16 @@ class ARROW_DS_EXPORT UnionDataset : public Dataset { static Result> Make(std::shared_ptr schema, DatasetVector children); - FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; - const DatasetVector& children() const { return children_; } std::string type_name() const override { return "union"; } + Result> ReplaceSchema( + std::shared_ptr schema) const override; + protected: + FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; + explicit UnionDataset(std::shared_ptr schema, DatasetVector children) : Dataset(std::move(schema)), children_(std::move(children)) {} diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index 01c5b5439b2..c4db812d16a 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -52,6 +52,35 @@ TEST_F(TestInMemoryFragment, Scan) { class TestInMemoryDataset : public DatasetFixtureMixin {}; +TEST_F(TestInMemoryDataset, ReplaceSchema) { + constexpr int64_t kBatchSize = 1; + constexpr int64_t kNumberBatches = 1; + + SetSchema({field("i32", int32()), field("f64", float64())}); + auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); + auto reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch); + + auto dataset = std::make_shared( + schema_, RecordBatchVector{static_cast(kNumberBatches), batch}); + + // drop field + ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status()); + // add field (will be materialized as null during projection) + ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status()); + // incompatible type + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("i32", utf8())})).status()); + // incompatible nullability + ASSERT_RAISES( + TypeError, + dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)})) + .status()); + // add non-nullable field + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)})) + .status()); +} + TEST_F(TestInMemoryDataset, GetFragments) { constexpr int64_t kBatchSize = 1024; constexpr int64_t kNumberBatches = 16; @@ -60,8 +89,6 @@ TEST_F(TestInMemoryDataset, GetFragments) { auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); auto reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch); - // It is safe to copy fragment multiple time since Scan() does not consume - // the internal array. 
auto dataset = std::make_shared( schema_, RecordBatchVector{static_cast(kNumberBatches), batch}); @@ -70,6 +97,45 @@ TEST_F(TestInMemoryDataset, GetFragments) { class TestUnionDataset : public DatasetFixtureMixin {}; +TEST_F(TestUnionDataset, ReplaceSchema) { + constexpr int64_t kBatchSize = 1; + constexpr int64_t kNumberBatches = 1; + + SetSchema({field("i32", int32()), field("f64", float64())}); + auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); + + std::vector> batches{static_cast(kNumberBatches), + batch}; + + DatasetVector children = { + std::make_shared(schema_, batches), + std::make_shared(schema_, batches), + }; + + const int64_t total_batches = children.size() * kNumberBatches; + auto reader = ConstantArrayGenerator::Repeat(total_batches, batch); + + ASSERT_OK_AND_ASSIGN(auto dataset, UnionDataset::Make(schema_, children)); + AssertDatasetEquals(reader.get(), dataset.get()); + + // drop field + ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status()); + // add nullable field (will be materialized as null during projection) + ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status()); + // incompatible type + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("i32", utf8())})).status()); + // incompatible nullability + ASSERT_RAISES( + TypeError, + dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)})) + .status()); + // add non-nullable field + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)})) + .status()); +} + TEST_F(TestUnionDataset, GetFragments) { constexpr int64_t kBatchSize = 1024; constexpr int64_t kChildPerNode = 2; @@ -105,9 +171,7 @@ TEST_F(TestUnionDataset, GetFragments) { AssertDatasetEquals(reader.get(), root_dataset.get()); } -class TestDataset : public DatasetFixtureMixin {}; - -TEST_F(TestDataset, TrivialScan) { +TEST_F(TestUnionDataset, TrivialScan) { constexpr int64_t kNumberBatches = 16; constexpr int64_t kBatchSize = 1024; @@ -229,8 +293,8 @@ TEST(TestProjector, NonTrivial) { AssertBatchesEqual(*expected_batch, *reconciled_batch); } -class TestEndToEnd : public TestDataset { - void SetUp() { +class TestEndToEnd : public TestUnionDataset { + void SetUp() override { bool nullable = false; SetSchema({ field("region", utf8(), nullable), @@ -413,7 +477,7 @@ inline std::shared_ptr SchemaFromNames(const std::vector na return schema(fields); } -class TestSchemaUnification : public TestDataset { +class TestSchemaUnification : public TestUnionDataset { public: using i32 = util::optional; using PathAndContent = std::vector>; diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 66e0e1dd68d..fee471d975f 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -118,6 +118,14 @@ Result> FileSystemDataset::Make( std::move(filesystem), std::move(forest), std::move(partitions))); } +Result> FileSystemDataset::ReplaceSchema( + std::shared_ptr schema) const { + RETURN_NOT_OK(CheckProjectable(*schema_, *schema)); + return std::shared_ptr( + new FileSystemDataset(std::move(schema), partition_expression_, format_, + filesystem_, forest_, partitions_)); +} + std::vector FileSystemDataset::files() const { std::vector files; diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 157a4256e1c..e6d893193ff 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -244,6 +244,9 @@ class ARROW_DS_EXPORT 
FileSystemDataset : public Dataset { std::string type_name() const override { return "filesystem"; } + Result> ReplaceSchema( + std::shared_ptr schema) const override; + const std::shared_ptr& format() const { return format_; } std::vector files() const; diff --git a/cpp/src/arrow/dataset/file_test.cc b/cpp/src/arrow/dataset/file_test.cc index 4d6d6f6348c..699ba989587 100644 --- a/cpp/src/arrow/dataset/file_test.cc +++ b/cpp/src/arrow/dataset/file_test.cc @@ -92,6 +92,30 @@ TEST_F(TestFileSystemDataset, Basic) { AssertFilesAre(dataset_, {"A/a", "A/B/b"}); } +TEST_F(TestFileSystemDataset, ReplaceSchema) { + auto schm = schema({field("i32", int32()), field("f64", float64())}); + auto format = std::make_shared(schm); + ASSERT_OK_AND_ASSIGN(auto dataset, + FileSystemDataset::Make(schm, scalar(true), format, fs_, {})); + + // drop field + ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status()); + // add nullable field (will be materialized as null during projection) + ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status()); + // incompatible type + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("i32", utf8())})).status()); + // incompatible nullability + ASSERT_RAISES( + TypeError, + dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)})) + .status()); + // add non-nullable field + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)})) + .status()); +} + TEST_F(TestFileSystemDataset, RootPartitionPruning) { auto root_partition = ("a"_ == 5).Copy(); MakeDataset({fs::File("a"), fs::File("b")}, root_partition); diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 531c4a56694..0d0f2eef47b 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -34,6 +34,30 @@ namespace arrow { namespace dataset { +Status CheckProjectable(const Schema& from, const Schema& to) { + for (const auto& to_field : to.fields()) { + ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(to_field->name()).FindOneOrNone(from)); + DCHECK_LE(match.indices().size(), 1); // name FieldRef returns non nested path + + if (match.indices().empty()) { + if (to_field->nullable()) continue; + + return Status::TypeError("field ", to_field->ToString(), + " is not nullable and does not exist in origin schema ", + from); + } + + const auto& from_field = from.field(match.indices()[0]); + if (!from_field->Equals(to_field, /*check_metadata=*/false)) { + return Status::TypeError( + "fields had matching names but were not equivalent. From: ", + from_field->ToString(), " To: ", to_field->ToString()); + } + } + + return Status::OK(); +} + RecordBatchProjector::RecordBatchProjector(std::shared_ptr to) : to_(std::move(to)), missing_columns_(to_->num_fields(), nullptr), @@ -86,32 +110,23 @@ Result> RecordBatchProjector::Project( Status RecordBatchProjector::SetInputSchema(std::shared_ptr from, MemoryPool* pool) { + RETURN_NOT_OK(CheckProjectable(*from, *to_)); from_ = std::move(from); for (int i = 0; i < to_->num_fields(); ++i) { - const auto& field = to_->field(i); - FieldRef ref(field->name()); - auto matches = ref.FindAll(*from_); + ARROW_ASSIGN_OR_RAISE(auto match, + FieldRef(to_->field(i)->name()).FindOneOrNone(*from_)); - if (matches.empty()) { + if (match.indices().empty()) { // Mark column i as missing by setting missing_columns_[i] // to a non-null placeholder. 
ARROW_ASSIGN_OR_RAISE(missing_columns_[i], MakeArrayOfNull(to_->field(i)->type(), 0, pool)); column_indices_[i] = kNoMatch; } else { - RETURN_NOT_OK(ref.CheckNonMultiple(matches, *from_)); - int matching_index = matches[0].indices()[0]; - - if (!from_->field(matching_index)->Equals(field, /*check_metadata=*/false)) { - return Status::TypeError("fields had matching names but were not equivalent ", - from_->field(matching_index)->ToString(), " vs ", - field->ToString()); - } - // Mark column i as not missing by setting missing_columns_[i] to nullptr missing_columns_[i] = nullptr; - column_indices_[i] = matching_index; + column_indices_[i] = match.indices()[0]; } } return Status::OK(); diff --git a/cpp/src/arrow/dataset/projector.h b/cpp/src/arrow/dataset/projector.h index 13a0ffb1938..8fd157f7ece 100644 --- a/cpp/src/arrow/dataset/projector.h +++ b/cpp/src/arrow/dataset/projector.h @@ -27,6 +27,8 @@ namespace arrow { namespace dataset { +ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to); + /// \brief Project a RecordBatch to a given schema. /// /// Projected record batches will reorder columns from input record batches when possible, diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 13dd3870b1a..90d74ae6497 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -45,6 +45,12 @@ ARROW_EXPORT void InvalidValueOrDie(const Status& st); } // namespace internal +#if defined(__clang__) +// Only clang supports warn_unused_result as a type annotation. +template +class ARROW_MUST_USE_RESULT Result; +#endif + // A class for representing either a usable value, or an error. /// /// A Result object either contains a value of type `T` or a Status object diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 556faa74539..ab42aa7d99a 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -62,7 +62,7 @@ cdef class Dataset: self.dataset = sp.get() @staticmethod - cdef wrap(shared_ptr[CDataset]& sp): + cdef wrap(const shared_ptr[CDataset]& sp): cdef Dataset self typ = frombytes(sp.get().type_name()) @@ -92,6 +92,18 @@ cdef class Dataset: else: return Expression.wrap(expr) + def replace_schema(self, Schema schema not None): + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + """ + cdef shared_ptr[CDataset] copy = GetResultValue( + self.dataset.ReplaceSchema(pyarrow_unwrap_schema(schema))) + return Dataset.wrap(move(copy)) + def get_fragments(self, columns=None, filter=None): """Returns an iterator over the fragments in this dataset. 
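For orientation, a hedged usage sketch of the Python binding introduced above. Only replace_schema() itself comes from this series; the Parquet file path and the pyarrow.dataset.dataset() factory call are illustrative assumptions, not part of this patch.

    # Hedged sketch: assumes a local Parquet file and the ds.dataset() factory.
    import pyarrow as pa
    import pyarrow.parquet as pq
    import pyarrow.dataset as ds

    table = pa.table({"i32": pa.array([1, 2, 3], pa.int32()),
                      "f64": pa.array([0.1, 0.2, 0.3], pa.float64())})
    pq.write_table(table, "/tmp/example.parquet")

    dataset = ds.dataset("/tmp/example.parquet", format="parquet")

    # Dropping a field is always projectable.
    narrower = dataset.replace_schema(pa.schema([pa.field("i32", pa.int32())]))

    # Adding a nullable field is projectable; the new column is materialized
    # as nulls when the dataset is scanned.
    wider = dataset.replace_schema(pa.schema([pa.field("i32", pa.int32()),
                                              pa.field("str", pa.string())]))

Incompatible requests (changing a field's type, or making a field non-nullable) are rejected; the sketch at the end of this series walks through those cases.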
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 467a84c9319..53d5cde9dc9 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -187,6 +187,8 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: const shared_ptr[CExpression] & partition_expression() c_string type_name() + CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema]) + CResult[shared_ptr[CScannerBuilder]] NewScanWithContext "NewScan"( shared_ptr[CScanContext] context) CResult[shared_ptr[CScannerBuilder]] NewScan() diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 0f6b758df14..86ae03e1c63 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -356,6 +356,10 @@ dataset___Dataset__type_name <- function(dataset){ .Call(`_arrow_dataset___Dataset__type_name` , dataset) } +dataset___Dataset__ReplaceSchema <- function(dataset, schm){ + .Call(`_arrow_dataset___Dataset__ReplaceSchema` , dataset, schm) +} + dataset___UnionDataset__create <- function(datasets, schm){ .Call(`_arrow_dataset___UnionDataset__create` , datasets, schm) } diff --git a/r/R/dataset.R b/r/R/dataset.R index 78dec8e4b92..9c240bcdec7 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -119,7 +119,11 @@ Dataset <- R6Class("Dataset", inherit = ArrowObject, # Start a new scan of the data # @return A [ScannerBuilder] NewScan = function() unique_ptr(ScannerBuilder, dataset___Dataset__NewScan(self)), - ToString = function() self$schema$ToString() + ToString = function() self$schema$ToString(), + ReplaceSchema = function(schema) { + assert_is(schema, "Schema") + shared_ptr(Dataset, dataset___Dataset__ReplaceSchema(self, schema)) + } ), active = list( schema = function() shared_ptr(Schema, dataset___Dataset__schema(self)), diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index b38edfc4ded..43264b62bf0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1406,6 +1406,22 @@ RcppExport SEXP _arrow_dataset___Dataset__type_name(SEXP dataset_sexp){ } #endif +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr dataset___Dataset__ReplaceSchema(const std::shared_ptr& dataset, const std::shared_ptr& schm); +RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type dataset(dataset_sexp); + Rcpp::traits::input_parameter&>::type schm(schm_sexp); + return Rcpp::wrap(dataset___Dataset__ReplaceSchema(dataset, schm)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ + Rf_error("Cannot call dataset___Dataset__ReplaceSchema(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + // dataset.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr dataset___UnionDataset__create(const ds::DatasetVector& datasets, const std::shared_ptr& schm); @@ -5838,6 +5854,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, + { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 00c480f0ec9..d2d207aef7d 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -41,6 +41,13 @@ std::string dataset___Dataset__type_name(const std::shared_ptr& dat return dataset->type_name(); } +// [[arrow::export]] +std::shared_ptr dataset___Dataset__ReplaceSchema( + const std::shared_ptr& dataset, + const std::shared_ptr& schm) { + return VALUE_OR_STOP(dataset->ReplaceSchema(schm)); +} + // [[arrow::export]] std::shared_ptr dataset___UnionDataset__create( const ds::DatasetVector& datasets, const std::shared_ptr& schm) { From d86846886d3bd7b16a3d5a4e789c64b5d1bd4448 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 27 Mar 2020 16:03:47 -0400 Subject: [PATCH 02/10] address review comments --- cpp/src/arrow/dataset/dataset_test.cc | 60 +++++++++++++++++++++++++-- cpp/src/arrow/dataset/projector.cc | 19 +++++---- cpp/src/arrow/result.h | 10 +---- cpp/src/arrow/status.h | 9 +--- cpp/src/arrow/testing/gtest_util.h | 11 ++--- cpp/src/arrow/util/macros.h | 5 +++ 6 files changed, 82 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index c4db812d16a..7470efac20a 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -193,6 +193,57 @@ TEST_F(TestUnionDataset, TrivialScan) { AssertDatasetEquals(reader.get(), dataset.get()); } +TEST(TestProjector, CheckProjectable) { + struct Assert { + explicit Assert(FieldVector from) : from_(from) {} + Schema from_; + + void ProjectableTo(FieldVector to) { + ARROW_EXPECT_OK(CheckProjectable(from_, Schema(to))); + } + + void NotProjectableTo(FieldVector to, std::string substr = "") { + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, testing::HasSubstr(substr), + CheckProjectable(from_, Schema(to))); + } + }; + + auto i8 = field("i8", int8()); + auto u16 = field("u16", uint16()); + auto str = field("str", utf8()); + auto i8_req = field("i8", int8(), false); + auto u16_req = field("u16", uint16(), false); + auto str_req = field("str", utf8(), false); + + // trivial + Assert({}).ProjectableTo({}); + Assert({i8}).ProjectableTo({i8}); + Assert({i8, u16_req}).ProjectableTo({i8, u16_req}); + + // reorder + Assert({i8, u16}).ProjectableTo({u16, i8}); + Assert({i8, str, u16}).ProjectableTo({u16, i8, str}); + + // drop field(s) + Assert({i8}).ProjectableTo({}); + + // add field(s) + Assert({}).ProjectableTo({i8}); + Assert({}).ProjectableTo({i8, u16}); + Assert({}).NotProjectableTo({u16_req}, + "is not nullable and does not exist in origin schema"); + Assert({i8}).NotProjectableTo({u16_req, 
i8}); + + // change nullability + Assert({i8}).NotProjectableTo({i8_req}, + "not nullable but is not required in origin schema"); + Assert({i8_req}).ProjectableTo({i8}); + + // change field type + Assert({i8}).NotProjectableTo({field("i8", utf8())}, + "fields had matching names but differing types"); +} + TEST(TestProjector, MismatchedType) { constexpr int64_t kBatchSize = 1024; @@ -441,9 +492,9 @@ TEST_F(TestEndToEnd, EndToEndSingleDataset) { ASSERT_OK(scanner_builder->Project(columns)); // An optional filter expression may also be specified. The filter expression - // is evaluated against input rows. Only rows for which the filter evaluates to true are - // yielded. Predicate pushdown optimizations are applied using partition information if - // available. + // is evaluated against input rows. Only rows for which the filter evaluates to true + // are yielded. Predicate pushdown optimizations are applied using partition + // information if available. // // This API decouples predicate pushdown from the Dataset implementation // and partition discovery. @@ -551,7 +602,8 @@ class TestSchemaUnification : public TestUnionDataset { ASSERT_OK_AND_ASSIGN(auto ds1, get_source("/dataset/alpha", {ds1_df1, ds1_df2})); ASSERT_OK_AND_ASSIGN(auto ds2, get_source("/dataset/beta", {ds2_df1, ds2_df2})); - // FIXME(bkietz) this is a hack: allow differing schemas for the purposes of this test + // FIXME(bkietz) this is a hack: allow differing schemas for the purposes of this + // test class DisparateSchemasUnionDataset : public UnionDataset { public: DisparateSchemasUnionDataset(std::shared_ptr schema, DatasetVector children) diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 0d0f2eef47b..9ce90ad0ed3 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -36,10 +36,9 @@ namespace dataset { Status CheckProjectable(const Schema& from, const Schema& to) { for (const auto& to_field : to.fields()) { - ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(to_field->name()).FindOneOrNone(from)); - DCHECK_LE(match.indices().size(), 1); // name FieldRef returns non nested path + ARROW_ASSIGN_OR_RAISE(auto from_field, FieldRef(to_field->name()).GetOneOrNone(from)); - if (match.indices().empty()) { + if (from_field == nullptr) { if (to_field->nullable()) continue; return Status::TypeError("field ", to_field->ToString(), @@ -47,11 +46,15 @@ Status CheckProjectable(const Schema& from, const Schema& to) { from); } - const auto& from_field = from.field(match.indices()[0]); - if (!from_field->Equals(to_field, /*check_metadata=*/false)) { - return Status::TypeError( - "fields had matching names but were not equivalent. From: ", - from_field->ToString(), " To: ", to_field->ToString()); + if (!from_field->type()->Equals(to_field->type())) { + return Status::TypeError("fields had matching names but differing types. From: ", + from_field->ToString(), " To: ", to_field->ToString()); + } + + if (from_field->nullable() && !to_field->nullable()) { + return Status::TypeError("field ", to_field->ToString(), + " is not nullable but is not required in origin schema ", + from); } } diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 90d74ae6497..b492057a8b2 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -45,13 +45,7 @@ ARROW_EXPORT void InvalidValueOrDie(const Status& st); } // namespace internal -#if defined(__clang__) -// Only clang supports warn_unused_result as a type annotation. 
-template -class ARROW_MUST_USE_RESULT Result; -#endif - -// A class for representing either a usable value, or an error. +/// A class for representing either a usable value, or an error. /// /// A Result object either contains a value of type `T` or a Status object /// explaining why such a value is not present. The type `T` must be @@ -104,7 +98,7 @@ class ARROW_MUST_USE_RESULT Result; /// arrow::Result CalculateFoo(); /// ``` template -class Result : public util::EqualityComparable> { +class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable> { template friend class Result; diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 195ed71cce9..aa1f2e151e5 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -95,11 +95,6 @@ enum class StatusCode : char { AlreadyExists = 45 }; -#if defined(__clang__) -// Only clang supports warn_unused_result as a type annotation. -class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; -#endif - /// \brief An opaque class that allows subsystems to retain /// additional information inside the Status. class ARROW_EXPORT StatusDetail { @@ -124,8 +119,8 @@ class ARROW_EXPORT StatusDetail { /// /// Additionally, if an error occurred, a specific error message is generally /// attached. -class ARROW_EXPORT Status : public util::EqualityComparable, - public util::ToStringOstreamable { +class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable, + public util::ToStringOstreamable { public: // Create a success status. Status() noexcept : state_(NULLPTR) {} diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 6119e0f0fc7..14da481f0bb 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -97,11 +97,12 @@ class Result; #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) -#define ARROW_EXPECT_OK(expr) \ - do { \ - auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ - EXPECT_TRUE(_st.ok()); \ +#define ARROW_EXPECT_OK(expr) \ + do { \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + EXPECT_TRUE(_st.ok()) << "'" ARROW_STRINGIFY(expr) "' failed with " \ + << _st.ToString(); \ } while (false) #define ABORT_NOT_OK(expr) \ diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index edb03d31c14..69b3fbab9c7 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -68,6 +68,11 @@ #define ARROW_MUST_USE_RESULT #endif +#if defined(__clang__) +// Only clang supports warn_unused_result as a type annotation. +#define ARROW_MUST_USE_TYPE ARROW_MUST_USE_RESULT +#endif + // ---------------------------------------------------------------------- // C++/CLI support macros (see ARROW-1134) From 2ee096c5ba24a37fe29c531cc1e4ba5ac610330b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 27 Mar 2020 18:06:53 -0400 Subject: [PATCH 03/10] add #else #define ARROW_MUST_USE_TYPE --- cpp/src/arrow/util/macros.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 69b3fbab9c7..ae8d56d098b 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -71,6 +71,8 @@ #if defined(__clang__) // Only clang supports warn_unused_result as a type annotation. 
#define ARROW_MUST_USE_TYPE ARROW_MUST_USE_RESULT +#else +#define ARROW_MUST_USE_TYPE #endif // ---------------------------------------------------------------------- From d0e2e04b21c7289c7dcdb4f975031e4424efda3c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 12:50:02 -0400 Subject: [PATCH 04/10] try enabling debug build with tests on mingw32 --- ci/scripts/PKGBUILD | 8 +++++++- cpp/cmake_modules/DefineOptions.cmake | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 23a78cb2fb2..c3c797c344f 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -75,6 +75,7 @@ build() { ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_COMPUTE=ON \ @@ -91,13 +92,18 @@ build() { -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE="release" \ + -DCMAKE_BUILD_TYPE="debug" \ -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} make -j2 popd } +check() { + export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" + make -C ${cpp_build_dir} test +} + package() { make -C ${cpp_build_dir} DESTDIR="${pkgdir}" install diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index dbcc5b6ff01..b9d2e5b9ff8 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -138,9 +138,15 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_BUILD_BENCHMARKS_REFERENCE "Build the Arrow micro reference benchmarks" OFF) + if(ARROW_BUILD_SHARED) + set(ARROW_TEST_LINKAGE_DEFAULT "shared") + else() + set(ARROW_TEST_LINKAGE_DEFAULT "static") + endif() + define_option_string(ARROW_TEST_LINKAGE "Linkage of Arrow libraries with unit tests executables." 
- "shared" + "${ARROW_TEST_LINKAGE_DEFAULT}" "shared" "static") From bc767e1b9e13094d94c90a165ccc35c0c8046e63 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 14:08:05 -0400 Subject: [PATCH 05/10] revert changes to R bindings --- r/R/arrowExports.R | 4 ---- r/R/dataset.R | 6 +----- r/man/enums.Rd | 20 +------------------- r/man/write_parquet.Rd | 3 +-- r/src/arrowExports.cpp | 17 ----------------- r/src/dataset.cpp | 7 ------- 6 files changed, 3 insertions(+), 54 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 86ae03e1c63..0f6b758df14 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -356,10 +356,6 @@ dataset___Dataset__type_name <- function(dataset){ .Call(`_arrow_dataset___Dataset__type_name` , dataset) } -dataset___Dataset__ReplaceSchema <- function(dataset, schm){ - .Call(`_arrow_dataset___Dataset__ReplaceSchema` , dataset, schm) -} - dataset___UnionDataset__create <- function(datasets, schm){ .Call(`_arrow_dataset___UnionDataset__create` , datasets, schm) } diff --git a/r/R/dataset.R b/r/R/dataset.R index 9c240bcdec7..78dec8e4b92 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -119,11 +119,7 @@ Dataset <- R6Class("Dataset", inherit = ArrowObject, # Start a new scan of the data # @return A [ScannerBuilder] NewScan = function() unique_ptr(ScannerBuilder, dataset___Dataset__NewScan(self)), - ToString = function() self$schema$ToString(), - ReplaceSchema = function(schema) { - assert_is(schema, "Schema") - shared_ptr(Dataset, dataset___Dataset__ReplaceSchema(self, schema)) - } + ToString = function() self$schema$ToString() ), active = list( schema = function() shared_ptr(Schema, dataset___Dataset__schema(self)), diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 862942a46a0..7f7358f760b 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -13,25 +13,7 @@ \alias{FileType} \alias{ParquetVersionType} \title{Arrow enums} -\format{ -An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. - -An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2. - -An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 28. - -An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17. - -An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3. - -An object of class \code{Message::Type} (inherits from \code{arrow-enum}) of length 5. - -An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 9. - -An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4. - -An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2. 
-} +\format{An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4.} \usage{ TimeUnit diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index 3ff60df30f9..acd5e8d48c9 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -20,8 +20,7 @@ write_parquet( use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, allow_truncated_timestamps = FALSE, - - arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps + arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, allow_truncated_timestamps = allow_truncated_timestamps) ) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 43264b62bf0..b38edfc4ded 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1406,22 +1406,6 @@ RcppExport SEXP _arrow_dataset___Dataset__type_name(SEXP dataset_sexp){ } #endif -// dataset.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr dataset___Dataset__ReplaceSchema(const std::shared_ptr& dataset, const std::shared_ptr& schm); -RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type dataset(dataset_sexp); - Rcpp::traits::input_parameter&>::type schm(schm_sexp); - return Rcpp::wrap(dataset___Dataset__ReplaceSchema(dataset, schm)); -END_RCPP -} -#else -RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ - Rf_error("Cannot call dataset___Dataset__ReplaceSchema(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - // dataset.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr dataset___UnionDataset__create(const ds::DatasetVector& datasets, const std::shared_ptr& schm); @@ -5854,7 +5838,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, - { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index d2d207aef7d..00c480f0ec9 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -41,13 +41,6 @@ std::string dataset___Dataset__type_name(const std::shared_ptr& dat return dataset->type_name(); } -// [[arrow::export]] -std::shared_ptr dataset___Dataset__ReplaceSchema( - const std::shared_ptr& dataset, - const std::shared_ptr& schm) { - return VALUE_OR_STOP(dataset->ReplaceSchema(schm)); -} - // [[arrow::export]] std::shared_ptr dataset___UnionDataset__create( const ds::DatasetVector& datasets, const std::shared_ptr& schm) { From b36ed5fe07dce1bde98f19738ffb8dc1dde935b7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 15:43:29 -0400 Subject: [PATCH 06/10] add boost as a dependency in PKGBUILD --- ci/scripts/PKGBUILD | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 
c3c797c344f..023a0c59dfb 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -24,7 +24,8 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") -depends=("${MINGW_PACKAGE_PREFIX}-thrift" +depends=("${MINGW_PACKAGE_PREFIX}-boost" + "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" "${MINGW_PACKAGE_PREFIX}-lz4" From 1fef351e03d264684b7676632f062914bcf0b2ca Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Apr 2020 13:12:50 -0700 Subject: [PATCH 07/10] Re-fix R docs --- r/man/enums.Rd | 20 +++++++++++++++++++- r/man/write_parquet.Rd | 3 ++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 7f7358f760b..862942a46a0 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -13,7 +13,25 @@ \alias{FileType} \alias{ParquetVersionType} \title{Arrow enums} -\format{An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4.} +\format{ +An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. + +An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2. + +An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 28. + +An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17. + +An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3. + +An object of class \code{Message::Type} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 9. + +An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4. + +An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2. +} \usage{ TimeUnit diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index acd5e8d48c9..3ff60df30f9 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -20,7 +20,8 @@ write_parquet( use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, allow_truncated_timestamps = FALSE, - arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps + + arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, allow_truncated_timestamps = allow_truncated_timestamps) ) From b0ebb362c44d0ab716654ddfb7dc201fa0dfbdc7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 16:36:39 -0400 Subject: [PATCH 08/10] Re fix boost dependency --- ci/scripts/PKGBUILD | 2 ++ ci/scripts/r_windows_build.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 023a0c59dfb..0b864f89114 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -24,6 +24,7 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") +# NOTE boost dependency added to support unittests; not required by arrow core. 
ARROW-8222 depends=("${MINGW_PACKAGE_PREFIX}-boost" "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" @@ -75,6 +76,7 @@ build() { ${MINGW_PREFIX}/bin/cmake.exe \ ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ + -DBOOST_USE_SHARED=OFF \ -DARROW_BUILD_SHARED=OFF \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index 145525c2389..d1ddb5e95e8 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -43,6 +43,7 @@ export PKG_CONFIG="/${MINGW_PREFIX}/bin/pkg-config --static" cp $ARROW_HOME/ci/scripts/PKGBUILD . export PKGEXT='.pkg.tar.xz' # pacman default changed to .zst in 2020, but keep the old ext for compat +unset BOOST_ROOT printenv makepkg-mingw --noconfirm --noprogressbar --skippgpcheck --nocheck --syncdeps --cleanbuild From c0cc5c4debfaf86d8484c2391c93ab1814303bdf Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Apr 2020 13:50:28 -0700 Subject: [PATCH 09/10] fixes --- ci/scripts/PKGBUILD | 2 +- ci/scripts/r_windows_build.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 0b864f89114..5bc6d748021 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -76,7 +76,7 @@ build() { ${MINGW_PREFIX}/bin/cmake.exe \ ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ - -DBOOST_USE_SHARED=OFF \ + -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_BUILD_SHARED=OFF \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index d1ddb5e95e8..77d3528f749 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -set -x +set -ex : ${ARROW_HOME:=$(pwd)} # Make sure it is absolute and exported From d9ab1bf87b72b13ae70d8285358fa84dd49bcb0e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Apr 2020 14:23:02 -0700 Subject: [PATCH 10/10] Undo Rtools test/boost changes --- ci/scripts/PKGBUILD | 13 ++----------- ci/scripts/r_windows_build.sh | 1 - 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 5bc6d748021..23a78cb2fb2 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -24,9 +24,7 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") -# NOTE boost dependency added to support unittests; not required by arrow core. 
ARROW-8222 -depends=("${MINGW_PACKAGE_PREFIX}-boost" - "${MINGW_PACKAGE_PREFIX}-thrift" +depends=("${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" "${MINGW_PACKAGE_PREFIX}-lz4" @@ -76,9 +74,7 @@ build() { ${MINGW_PREFIX}/bin/cmake.exe \ ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ - -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_BUILD_SHARED=OFF \ - -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_COMPUTE=ON \ @@ -95,18 +91,13 @@ build() { -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE="debug" \ + -DCMAKE_BUILD_TYPE="release" \ -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} make -j2 popd } -check() { - export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" - make -C ${cpp_build_dir} test -} - package() { make -C ${cpp_build_dir} DESTDIR="${pkgdir}" install diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index 77d3528f749..d263d51dc86 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -43,7 +43,6 @@ export PKG_CONFIG="/${MINGW_PREFIX}/bin/pkg-config --static" cp $ARROW_HOME/ci/scripts/PKGBUILD . export PKGEXT='.pkg.tar.xz' # pacman default changed to .zst in 2020, but keep the old ext for compat -unset BOOST_ROOT printenv makepkg-mingw --noconfirm --noprogressbar --skippgpcheck --nocheck --syncdeps --cleanbuild
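
To recap the compatibility rules enforced by CheckProjectable() in patches 01 and 02, the hedged sketch below continues the earlier Python sketch (the `dataset` variable is assumed to be open over a {i32: int32, f64: float64} schema) and shows the schema changes that are rejected.

    import pyarrow as pa

    # Each of these fails CheckProjectable(); the TypeError status surfaces in
    # Python as ArrowTypeError (the mapping is assumed here for illustration).
    incompatible = [
        pa.schema([pa.field("i32", pa.string())]),                    # type change
        pa.schema([pa.field("f64", pa.float64(), nullable=False)]),   # tighten nullability
        pa.schema([pa.field("str", pa.string(), nullable=False)]),    # new required field
    ]
    for schm in incompatible:
        try:
            dataset.replace_schema(schm)
        except pa.ArrowTypeError as exc:
            print("rejected:", exc)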