From e49f8a70f4a8e72efb781c0acdc98a5002108c68 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 23 Mar 2020 13:53:51 -0400 Subject: [PATCH 01/10] ARROW-8164: [C++][Dataset] Provide Dataset::ReplaceSchema() --- cpp/src/arrow/dataset/dataset.cc | 17 +++++ cpp/src/arrow/dataset/dataset.h | 27 +++++-- cpp/src/arrow/dataset/dataset_test.cc | 80 ++++++++++++++++++-- cpp/src/arrow/dataset/file_base.cc | 8 ++ cpp/src/arrow/dataset/file_base.h | 3 + cpp/src/arrow/dataset/file_test.cc | 24 ++++++ cpp/src/arrow/dataset/projector.cc | 43 +++++++---- cpp/src/arrow/dataset/projector.h | 2 + cpp/src/arrow/result.h | 6 ++ python/pyarrow/_dataset.pyx | 14 +++- python/pyarrow/includes/libarrow_dataset.pxd | 2 + r/R/arrowExports.R | 4 + r/R/dataset.R | 6 +- r/src/arrowExports.cpp | 17 +++++ r/src/dataset.cpp | 7 ++ 15 files changed, 229 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc index 2e17cc3ec67..37d0da0bbc5 100644 --- a/cpp/src/arrow/dataset/dataset.cc +++ b/cpp/src/arrow/dataset/dataset.cc @@ -135,6 +135,12 @@ InMemoryDataset::InMemoryDataset(std::shared_ptr table) : Dataset(table->schema()), get_batches_(new TableRecordBatchGenerator(std::move(table))) {} +Result> InMemoryDataset::ReplaceSchema( + std::shared_ptr schema) const { + RETURN_NOT_OK(CheckProjectable(*schema_, *schema)); + return std::make_shared(std::move(schema), get_batches_); +} + FragmentIterator InMemoryDataset::GetFragmentsImpl( std::shared_ptr scan_options) { auto schema = this->schema(); @@ -175,6 +181,17 @@ Result> UnionDataset::Make(std::shared_ptr new UnionDataset(std::move(schema), std::move(children))); } +Result> UnionDataset::ReplaceSchema( + std::shared_ptr schema) const { + auto children = children_; + for (auto& child : children) { + ARROW_ASSIGN_OR_RAISE(child, child->ReplaceSchema(schema)); + } + + return std::shared_ptr( + new UnionDataset(std::move(schema), std::move(children))); +} + FragmentIterator UnionDataset::GetFragmentsImpl(std::shared_ptr options) { return GetFragmentsFromDatasets(children_, options); } diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 9f0e26feabe..740263df3f8 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -122,6 +122,13 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { /// \brief The name identifying the kind of Dataset virtual std::string type_name() const = 0; + /// \brief Return a copy of this Dataset with a different schema. + /// + /// The copy will view the same Fragments. If the new schema is not compatible with the + /// original dataset's schema then an error will be raised. + virtual Result> ReplaceSchema( + std::shared_ptr schema) const = 0; + virtual ~Dataset() = default; protected: @@ -155,7 +162,7 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset { }; InMemoryDataset(std::shared_ptr schema, - std::unique_ptr get_batches) + std::shared_ptr get_batches) : Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {} // Convenience constructor taking a fixed list of batches @@ -163,12 +170,15 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset { explicit InMemoryDataset(std::shared_ptr
table); - FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; - std::string type_name() const override { return "in-memory"; } - private: - std::unique_ptr get_batches_; + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + protected: + FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; + + std::shared_ptr get_batches_; }; /// \brief A Dataset wrapping child Datasets. @@ -182,13 +192,16 @@ class ARROW_DS_EXPORT UnionDataset : public Dataset { static Result> Make(std::shared_ptr schema, DatasetVector children); - FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; - const DatasetVector& children() const { return children_; } std::string type_name() const override { return "union"; } + Result> ReplaceSchema( + std::shared_ptr schema) const override; + protected: + FragmentIterator GetFragmentsImpl(std::shared_ptr options) override; + explicit UnionDataset(std::shared_ptr schema, DatasetVector children) : Dataset(std::move(schema)), children_(std::move(children)) {} diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index 01c5b5439b2..c4db812d16a 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -52,6 +52,35 @@ TEST_F(TestInMemoryFragment, Scan) { class TestInMemoryDataset : public DatasetFixtureMixin {}; +TEST_F(TestInMemoryDataset, ReplaceSchema) { + constexpr int64_t kBatchSize = 1; + constexpr int64_t kNumberBatches = 1; + + SetSchema({field("i32", int32()), field("f64", float64())}); + auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); + auto reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch); + + auto dataset = std::make_shared( + schema_, RecordBatchVector{static_cast(kNumberBatches), batch}); + + // drop field + ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status()); + // add field (will be materialized as null during projection) + ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status()); + // incompatible type + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("i32", utf8())})).status()); + // incompatible nullability + ASSERT_RAISES( + TypeError, + dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)})) + .status()); + // add non-nullable field + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)})) + .status()); +} + TEST_F(TestInMemoryDataset, GetFragments) { constexpr int64_t kBatchSize = 1024; constexpr int64_t kNumberBatches = 16; @@ -60,8 +89,6 @@ TEST_F(TestInMemoryDataset, GetFragments) { auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); auto reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch); - // It is safe to copy fragment multiple time since Scan() does not consume - // the internal array. 
auto dataset = std::make_shared( schema_, RecordBatchVector{static_cast(kNumberBatches), batch}); @@ -70,6 +97,45 @@ TEST_F(TestInMemoryDataset, GetFragments) { class TestUnionDataset : public DatasetFixtureMixin {}; +TEST_F(TestUnionDataset, ReplaceSchema) { + constexpr int64_t kBatchSize = 1; + constexpr int64_t kNumberBatches = 1; + + SetSchema({field("i32", int32()), field("f64", float64())}); + auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); + + std::vector> batches{static_cast(kNumberBatches), + batch}; + + DatasetVector children = { + std::make_shared(schema_, batches), + std::make_shared(schema_, batches), + }; + + const int64_t total_batches = children.size() * kNumberBatches; + auto reader = ConstantArrayGenerator::Repeat(total_batches, batch); + + ASSERT_OK_AND_ASSIGN(auto dataset, UnionDataset::Make(schema_, children)); + AssertDatasetEquals(reader.get(), dataset.get()); + + // drop field + ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status()); + // add nullable field (will be materialized as null during projection) + ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status()); + // incompatible type + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("i32", utf8())})).status()); + // incompatible nullability + ASSERT_RAISES( + TypeError, + dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)})) + .status()); + // add non-nullable field + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)})) + .status()); +} + TEST_F(TestUnionDataset, GetFragments) { constexpr int64_t kBatchSize = 1024; constexpr int64_t kChildPerNode = 2; @@ -105,9 +171,7 @@ TEST_F(TestUnionDataset, GetFragments) { AssertDatasetEquals(reader.get(), root_dataset.get()); } -class TestDataset : public DatasetFixtureMixin {}; - -TEST_F(TestDataset, TrivialScan) { +TEST_F(TestUnionDataset, TrivialScan) { constexpr int64_t kNumberBatches = 16; constexpr int64_t kBatchSize = 1024; @@ -229,8 +293,8 @@ TEST(TestProjector, NonTrivial) { AssertBatchesEqual(*expected_batch, *reconciled_batch); } -class TestEndToEnd : public TestDataset { - void SetUp() { +class TestEndToEnd : public TestUnionDataset { + void SetUp() override { bool nullable = false; SetSchema({ field("region", utf8(), nullable), @@ -413,7 +477,7 @@ inline std::shared_ptr SchemaFromNames(const std::vector na return schema(fields); } -class TestSchemaUnification : public TestDataset { +class TestSchemaUnification : public TestUnionDataset { public: using i32 = util::optional; using PathAndContent = std::vector>; diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 66e0e1dd68d..fee471d975f 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -118,6 +118,14 @@ Result> FileSystemDataset::Make( std::move(filesystem), std::move(forest), std::move(partitions))); } +Result> FileSystemDataset::ReplaceSchema( + std::shared_ptr schema) const { + RETURN_NOT_OK(CheckProjectable(*schema_, *schema)); + return std::shared_ptr( + new FileSystemDataset(std::move(schema), partition_expression_, format_, + filesystem_, forest_, partitions_)); +} + std::vector FileSystemDataset::files() const { std::vector files; diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 157a4256e1c..e6d893193ff 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -244,6 +244,9 @@ class ARROW_DS_EXPORT 
FileSystemDataset : public Dataset { std::string type_name() const override { return "filesystem"; } + Result> ReplaceSchema( + std::shared_ptr schema) const override; + const std::shared_ptr& format() const { return format_; } std::vector files() const; diff --git a/cpp/src/arrow/dataset/file_test.cc b/cpp/src/arrow/dataset/file_test.cc index 4d6d6f6348c..699ba989587 100644 --- a/cpp/src/arrow/dataset/file_test.cc +++ b/cpp/src/arrow/dataset/file_test.cc @@ -92,6 +92,30 @@ TEST_F(TestFileSystemDataset, Basic) { AssertFilesAre(dataset_, {"A/a", "A/B/b"}); } +TEST_F(TestFileSystemDataset, ReplaceSchema) { + auto schm = schema({field("i32", int32()), field("f64", float64())}); + auto format = std::make_shared(schm); + ASSERT_OK_AND_ASSIGN(auto dataset, + FileSystemDataset::Make(schm, scalar(true), format, fs_, {})); + + // drop field + ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status()); + // add nullable field (will be materialized as null during projection) + ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status()); + // incompatible type + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("i32", utf8())})).status()); + // incompatible nullability + ASSERT_RAISES( + TypeError, + dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)})) + .status()); + // add non-nullable field + ASSERT_RAISES(TypeError, + dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)})) + .status()); +} + TEST_F(TestFileSystemDataset, RootPartitionPruning) { auto root_partition = ("a"_ == 5).Copy(); MakeDataset({fs::File("a"), fs::File("b")}, root_partition); diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 531c4a56694..0d0f2eef47b 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -34,6 +34,30 @@ namespace arrow { namespace dataset { +Status CheckProjectable(const Schema& from, const Schema& to) { + for (const auto& to_field : to.fields()) { + ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(to_field->name()).FindOneOrNone(from)); + DCHECK_LE(match.indices().size(), 1); // name FieldRef returns non nested path + + if (match.indices().empty()) { + if (to_field->nullable()) continue; + + return Status::TypeError("field ", to_field->ToString(), + " is not nullable and does not exist in origin schema ", + from); + } + + const auto& from_field = from.field(match.indices()[0]); + if (!from_field->Equals(to_field, /*check_metadata=*/false)) { + return Status::TypeError( + "fields had matching names but were not equivalent. From: ", + from_field->ToString(), " To: ", to_field->ToString()); + } + } + + return Status::OK(); +} + RecordBatchProjector::RecordBatchProjector(std::shared_ptr to) : to_(std::move(to)), missing_columns_(to_->num_fields(), nullptr), @@ -86,32 +110,23 @@ Result> RecordBatchProjector::Project( Status RecordBatchProjector::SetInputSchema(std::shared_ptr from, MemoryPool* pool) { + RETURN_NOT_OK(CheckProjectable(*from, *to_)); from_ = std::move(from); for (int i = 0; i < to_->num_fields(); ++i) { - const auto& field = to_->field(i); - FieldRef ref(field->name()); - auto matches = ref.FindAll(*from_); + ARROW_ASSIGN_OR_RAISE(auto match, + FieldRef(to_->field(i)->name()).FindOneOrNone(*from_)); - if (matches.empty()) { + if (match.indices().empty()) { // Mark column i as missing by setting missing_columns_[i] // to a non-null placeholder. 
ARROW_ASSIGN_OR_RAISE(missing_columns_[i], MakeArrayOfNull(to_->field(i)->type(), 0, pool)); column_indices_[i] = kNoMatch; } else { - RETURN_NOT_OK(ref.CheckNonMultiple(matches, *from_)); - int matching_index = matches[0].indices()[0]; - - if (!from_->field(matching_index)->Equals(field, /*check_metadata=*/false)) { - return Status::TypeError("fields had matching names but were not equivalent ", - from_->field(matching_index)->ToString(), " vs ", - field->ToString()); - } - // Mark column i as not missing by setting missing_columns_[i] to nullptr missing_columns_[i] = nullptr; - column_indices_[i] = matching_index; + column_indices_[i] = match.indices()[0]; } } return Status::OK(); diff --git a/cpp/src/arrow/dataset/projector.h b/cpp/src/arrow/dataset/projector.h index 13a0ffb1938..8fd157f7ece 100644 --- a/cpp/src/arrow/dataset/projector.h +++ b/cpp/src/arrow/dataset/projector.h @@ -27,6 +27,8 @@ namespace arrow { namespace dataset { +ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to); + /// \brief Project a RecordBatch to a given schema. /// /// Projected record batches will reorder columns from input record batches when possible, diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 13dd3870b1a..90d74ae6497 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -45,6 +45,12 @@ ARROW_EXPORT void InvalidValueOrDie(const Status& st); } // namespace internal +#if defined(__clang__) +// Only clang supports warn_unused_result as a type annotation. +template +class ARROW_MUST_USE_RESULT Result; +#endif + // A class for representing either a usable value, or an error. /// /// A Result object either contains a value of type `T` or a Status object diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 556faa74539..ab42aa7d99a 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -62,7 +62,7 @@ cdef class Dataset: self.dataset = sp.get() @staticmethod - cdef wrap(shared_ptr[CDataset]& sp): + cdef wrap(const shared_ptr[CDataset]& sp): cdef Dataset self typ = frombytes(sp.get().type_name()) @@ -92,6 +92,18 @@ cdef class Dataset: else: return Expression.wrap(expr) + def replace_schema(self, Schema schema not None): + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + """ + cdef shared_ptr[CDataset] copy = GetResultValue( + self.dataset.ReplaceSchema(pyarrow_unwrap_schema(schema))) + return Dataset.wrap(move(copy)) + def get_fragments(self, columns=None, filter=None): """Returns an iterator over the fragments in this dataset. 
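For orientation, a hedged usage sketch of the Python binding introduced above. Only replace_schema() itself comes from this series; the Parquet file path and the pyarrow.dataset.dataset() factory call are illustrative assumptions, not part of this patch.

    # Hedged sketch: assumes a local Parquet file and the ds.dataset() factory.
    import pyarrow as pa
    import pyarrow.parquet as pq
    import pyarrow.dataset as ds

    table = pa.table({"i32": pa.array([1, 2, 3], pa.int32()),
                      "f64": pa.array([0.1, 0.2, 0.3], pa.float64())})
    pq.write_table(table, "/tmp/example.parquet")

    dataset = ds.dataset("/tmp/example.parquet", format="parquet")

    # Dropping a field is always projectable.
    narrower = dataset.replace_schema(pa.schema([pa.field("i32", pa.int32())]))

    # Adding a nullable field is projectable; the new column is materialized
    # as nulls when the dataset is scanned.
    wider = dataset.replace_schema(pa.schema([pa.field("i32", pa.int32()),
                                              pa.field("str", pa.string())]))

Incompatible requests (changing a field's type, or making a field non-nullable) are rejected; the sketch at the end of this series walks through those cases.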
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 467a84c9319..53d5cde9dc9 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -187,6 +187,8 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: const shared_ptr[CExpression] & partition_expression() c_string type_name() + CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema]) + CResult[shared_ptr[CScannerBuilder]] NewScanWithContext "NewScan"( shared_ptr[CScanContext] context) CResult[shared_ptr[CScannerBuilder]] NewScan() diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 0f6b758df14..86ae03e1c63 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -356,6 +356,10 @@ dataset___Dataset__type_name <- function(dataset){ .Call(`_arrow_dataset___Dataset__type_name` , dataset) } +dataset___Dataset__ReplaceSchema <- function(dataset, schm){ + .Call(`_arrow_dataset___Dataset__ReplaceSchema` , dataset, schm) +} + dataset___UnionDataset__create <- function(datasets, schm){ .Call(`_arrow_dataset___UnionDataset__create` , datasets, schm) } diff --git a/r/R/dataset.R b/r/R/dataset.R index 78dec8e4b92..9c240bcdec7 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -119,7 +119,11 @@ Dataset <- R6Class("Dataset", inherit = ArrowObject, # Start a new scan of the data # @return A [ScannerBuilder] NewScan = function() unique_ptr(ScannerBuilder, dataset___Dataset__NewScan(self)), - ToString = function() self$schema$ToString() + ToString = function() self$schema$ToString(), + ReplaceSchema = function(schema) { + assert_is(schema, "Schema") + shared_ptr(Dataset, dataset___Dataset__ReplaceSchema(self, schema)) + } ), active = list( schema = function() shared_ptr(Schema, dataset___Dataset__schema(self)), diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index b38edfc4ded..43264b62bf0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1406,6 +1406,22 @@ RcppExport SEXP _arrow_dataset___Dataset__type_name(SEXP dataset_sexp){ } #endif +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr dataset___Dataset__ReplaceSchema(const std::shared_ptr& dataset, const std::shared_ptr& schm); +RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type dataset(dataset_sexp); + Rcpp::traits::input_parameter&>::type schm(schm_sexp); + return Rcpp::wrap(dataset___Dataset__ReplaceSchema(dataset, schm)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ + Rf_error("Cannot call dataset___Dataset__ReplaceSchema(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + // dataset.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr dataset___UnionDataset__create(const ds::DatasetVector& datasets, const std::shared_ptr& schm); @@ -5838,6 +5854,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, + { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 00c480f0ec9..d2d207aef7d 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -41,6 +41,13 @@ std::string dataset___Dataset__type_name(const std::shared_ptr& dat return dataset->type_name(); } +// [[arrow::export]] +std::shared_ptr dataset___Dataset__ReplaceSchema( + const std::shared_ptr& dataset, + const std::shared_ptr& schm) { + return VALUE_OR_STOP(dataset->ReplaceSchema(schm)); +} + // [[arrow::export]] std::shared_ptr dataset___UnionDataset__create( const ds::DatasetVector& datasets, const std::shared_ptr& schm) { From d86846886d3bd7b16a3d5a4e789c64b5d1bd4448 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 27 Mar 2020 16:03:47 -0400 Subject: [PATCH 02/10] address review comments --- cpp/src/arrow/dataset/dataset_test.cc | 60 +++++++++++++++++++++++++-- cpp/src/arrow/dataset/projector.cc | 19 +++++---- cpp/src/arrow/result.h | 10 +---- cpp/src/arrow/status.h | 9 +--- cpp/src/arrow/testing/gtest_util.h | 11 ++--- cpp/src/arrow/util/macros.h | 5 +++ 6 files changed, 82 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index c4db812d16a..7470efac20a 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -193,6 +193,57 @@ TEST_F(TestUnionDataset, TrivialScan) { AssertDatasetEquals(reader.get(), dataset.get()); } +TEST(TestProjector, CheckProjectable) { + struct Assert { + explicit Assert(FieldVector from) : from_(from) {} + Schema from_; + + void ProjectableTo(FieldVector to) { + ARROW_EXPECT_OK(CheckProjectable(from_, Schema(to))); + } + + void NotProjectableTo(FieldVector to, std::string substr = "") { + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, testing::HasSubstr(substr), + CheckProjectable(from_, Schema(to))); + } + }; + + auto i8 = field("i8", int8()); + auto u16 = field("u16", uint16()); + auto str = field("str", utf8()); + auto i8_req = field("i8", int8(), false); + auto u16_req = field("u16", uint16(), false); + auto str_req = field("str", utf8(), false); + + // trivial + Assert({}).ProjectableTo({}); + Assert({i8}).ProjectableTo({i8}); + Assert({i8, u16_req}).ProjectableTo({i8, u16_req}); + + // reorder + Assert({i8, u16}).ProjectableTo({u16, i8}); + Assert({i8, str, u16}).ProjectableTo({u16, i8, str}); + + // drop field(s) + Assert({i8}).ProjectableTo({}); + + // add field(s) + Assert({}).ProjectableTo({i8}); + Assert({}).ProjectableTo({i8, u16}); + Assert({}).NotProjectableTo({u16_req}, + "is not nullable and does not exist in origin schema"); + Assert({i8}).NotProjectableTo({u16_req, 
i8}); + + // change nullability + Assert({i8}).NotProjectableTo({i8_req}, + "not nullable but is not required in origin schema"); + Assert({i8_req}).ProjectableTo({i8}); + + // change field type + Assert({i8}).NotProjectableTo({field("i8", utf8())}, + "fields had matching names but differing types"); +} + TEST(TestProjector, MismatchedType) { constexpr int64_t kBatchSize = 1024; @@ -441,9 +492,9 @@ TEST_F(TestEndToEnd, EndToEndSingleDataset) { ASSERT_OK(scanner_builder->Project(columns)); // An optional filter expression may also be specified. The filter expression - // is evaluated against input rows. Only rows for which the filter evaluates to true are - // yielded. Predicate pushdown optimizations are applied using partition information if - // available. + // is evaluated against input rows. Only rows for which the filter evaluates to true + // are yielded. Predicate pushdown optimizations are applied using partition + // information if available. // // This API decouples predicate pushdown from the Dataset implementation // and partition discovery. @@ -551,7 +602,8 @@ class TestSchemaUnification : public TestUnionDataset { ASSERT_OK_AND_ASSIGN(auto ds1, get_source("/dataset/alpha", {ds1_df1, ds1_df2})); ASSERT_OK_AND_ASSIGN(auto ds2, get_source("/dataset/beta", {ds2_df1, ds2_df2})); - // FIXME(bkietz) this is a hack: allow differing schemas for the purposes of this test + // FIXME(bkietz) this is a hack: allow differing schemas for the purposes of this + // test class DisparateSchemasUnionDataset : public UnionDataset { public: DisparateSchemasUnionDataset(std::shared_ptr schema, DatasetVector children) diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 0d0f2eef47b..9ce90ad0ed3 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -36,10 +36,9 @@ namespace dataset { Status CheckProjectable(const Schema& from, const Schema& to) { for (const auto& to_field : to.fields()) { - ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(to_field->name()).FindOneOrNone(from)); - DCHECK_LE(match.indices().size(), 1); // name FieldRef returns non nested path + ARROW_ASSIGN_OR_RAISE(auto from_field, FieldRef(to_field->name()).GetOneOrNone(from)); - if (match.indices().empty()) { + if (from_field == nullptr) { if (to_field->nullable()) continue; return Status::TypeError("field ", to_field->ToString(), @@ -47,11 +46,15 @@ Status CheckProjectable(const Schema& from, const Schema& to) { from); } - const auto& from_field = from.field(match.indices()[0]); - if (!from_field->Equals(to_field, /*check_metadata=*/false)) { - return Status::TypeError( - "fields had matching names but were not equivalent. From: ", - from_field->ToString(), " To: ", to_field->ToString()); + if (!from_field->type()->Equals(to_field->type())) { + return Status::TypeError("fields had matching names but differing types. From: ", + from_field->ToString(), " To: ", to_field->ToString()); + } + + if (from_field->nullable() && !to_field->nullable()) { + return Status::TypeError("field ", to_field->ToString(), + " is not nullable but is not required in origin schema ", + from); } } diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 90d74ae6497..b492057a8b2 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -45,13 +45,7 @@ ARROW_EXPORT void InvalidValueOrDie(const Status& st); } // namespace internal -#if defined(__clang__) -// Only clang supports warn_unused_result as a type annotation. 
-template -class ARROW_MUST_USE_RESULT Result; -#endif - -// A class for representing either a usable value, or an error. +/// A class for representing either a usable value, or an error. /// /// A Result object either contains a value of type `T` or a Status object /// explaining why such a value is not present. The type `T` must be @@ -104,7 +98,7 @@ class ARROW_MUST_USE_RESULT Result; /// arrow::Result CalculateFoo(); /// ``` template -class Result : public util::EqualityComparable> { +class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable> { template friend class Result; diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 195ed71cce9..aa1f2e151e5 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -95,11 +95,6 @@ enum class StatusCode : char { AlreadyExists = 45 }; -#if defined(__clang__) -// Only clang supports warn_unused_result as a type annotation. -class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; -#endif - /// \brief An opaque class that allows subsystems to retain /// additional information inside the Status. class ARROW_EXPORT StatusDetail { @@ -124,8 +119,8 @@ class ARROW_EXPORT StatusDetail { /// /// Additionally, if an error occurred, a specific error message is generally /// attached. -class ARROW_EXPORT Status : public util::EqualityComparable, - public util::ToStringOstreamable { +class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable, + public util::ToStringOstreamable { public: // Create a success status. Status() noexcept : state_(NULLPTR) {} diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 6119e0f0fc7..14da481f0bb 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -97,11 +97,12 @@ class Result; #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) -#define ARROW_EXPECT_OK(expr) \ - do { \ - auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ - EXPECT_TRUE(_st.ok()); \ +#define ARROW_EXPECT_OK(expr) \ + do { \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + EXPECT_TRUE(_st.ok()) << "'" ARROW_STRINGIFY(expr) "' failed with " \ + << _st.ToString(); \ } while (false) #define ABORT_NOT_OK(expr) \ diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index edb03d31c14..69b3fbab9c7 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -68,6 +68,11 @@ #define ARROW_MUST_USE_RESULT #endif +#if defined(__clang__) +// Only clang supports warn_unused_result as a type annotation. +#define ARROW_MUST_USE_TYPE ARROW_MUST_USE_RESULT +#endif + // ---------------------------------------------------------------------- // C++/CLI support macros (see ARROW-1134) From 2ee096c5ba24a37fe29c531cc1e4ba5ac610330b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 27 Mar 2020 18:06:53 -0400 Subject: [PATCH 03/10] add #else #define ARROW_MUST_USE_TYPE --- cpp/src/arrow/util/macros.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 69b3fbab9c7..ae8d56d098b 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -71,6 +71,8 @@ #if defined(__clang__) // Only clang supports warn_unused_result as a type annotation. 
#define ARROW_MUST_USE_TYPE ARROW_MUST_USE_RESULT +#else +#define ARROW_MUST_USE_TYPE #endif // ---------------------------------------------------------------------- From d0e2e04b21c7289c7dcdb4f975031e4424efda3c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 12:50:02 -0400 Subject: [PATCH 04/10] try enabling debug build with tests on mingw32 --- ci/scripts/PKGBUILD | 8 +++++++- cpp/cmake_modules/DefineOptions.cmake | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 23a78cb2fb2..c3c797c344f 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -75,6 +75,7 @@ build() { ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_COMPUTE=ON \ @@ -91,13 +92,18 @@ build() { -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE="release" \ + -DCMAKE_BUILD_TYPE="debug" \ -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} make -j2 popd } +check() { + export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" + make -C ${cpp_build_dir} test +} + package() { make -C ${cpp_build_dir} DESTDIR="${pkgdir}" install diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index dbcc5b6ff01..b9d2e5b9ff8 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -138,9 +138,15 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_BUILD_BENCHMARKS_REFERENCE "Build the Arrow micro reference benchmarks" OFF) + if(ARROW_BUILD_SHARED) + set(ARROW_TEST_LINKAGE_DEFAULT "shared") + else() + set(ARROW_TEST_LINKAGE_DEFAULT "static") + endif() + define_option_string(ARROW_TEST_LINKAGE "Linkage of Arrow libraries with unit tests executables." 
- "shared" + "${ARROW_TEST_LINKAGE_DEFAULT}" "shared" "static") From bc767e1b9e13094d94c90a165ccc35c0c8046e63 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 14:08:05 -0400 Subject: [PATCH 05/10] revert changes to R bindings --- r/R/arrowExports.R | 4 ---- r/R/dataset.R | 6 +----- r/man/enums.Rd | 20 +------------------- r/man/write_parquet.Rd | 3 +-- r/src/arrowExports.cpp | 17 ----------------- r/src/dataset.cpp | 7 ------- 6 files changed, 3 insertions(+), 54 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 86ae03e1c63..0f6b758df14 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -356,10 +356,6 @@ dataset___Dataset__type_name <- function(dataset){ .Call(`_arrow_dataset___Dataset__type_name` , dataset) } -dataset___Dataset__ReplaceSchema <- function(dataset, schm){ - .Call(`_arrow_dataset___Dataset__ReplaceSchema` , dataset, schm) -} - dataset___UnionDataset__create <- function(datasets, schm){ .Call(`_arrow_dataset___UnionDataset__create` , datasets, schm) } diff --git a/r/R/dataset.R b/r/R/dataset.R index 9c240bcdec7..78dec8e4b92 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -119,11 +119,7 @@ Dataset <- R6Class("Dataset", inherit = ArrowObject, # Start a new scan of the data # @return A [ScannerBuilder] NewScan = function() unique_ptr(ScannerBuilder, dataset___Dataset__NewScan(self)), - ToString = function() self$schema$ToString(), - ReplaceSchema = function(schema) { - assert_is(schema, "Schema") - shared_ptr(Dataset, dataset___Dataset__ReplaceSchema(self, schema)) - } + ToString = function() self$schema$ToString() ), active = list( schema = function() shared_ptr(Schema, dataset___Dataset__schema(self)), diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 862942a46a0..7f7358f760b 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -13,25 +13,7 @@ \alias{FileType} \alias{ParquetVersionType} \title{Arrow enums} -\format{ -An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. - -An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2. - -An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 28. - -An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17. - -An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3. - -An object of class \code{Message::Type} (inherits from \code{arrow-enum}) of length 5. - -An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 9. - -An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4. - -An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2. 
-} +\format{An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4.} \usage{ TimeUnit diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index 3ff60df30f9..acd5e8d48c9 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -20,8 +20,7 @@ write_parquet( use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, allow_truncated_timestamps = FALSE, - - arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps + arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, allow_truncated_timestamps = allow_truncated_timestamps) ) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 43264b62bf0..b38edfc4ded 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1406,22 +1406,6 @@ RcppExport SEXP _arrow_dataset___Dataset__type_name(SEXP dataset_sexp){ } #endif -// dataset.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr dataset___Dataset__ReplaceSchema(const std::shared_ptr& dataset, const std::shared_ptr& schm); -RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type dataset(dataset_sexp); - Rcpp::traits::input_parameter&>::type schm(schm_sexp); - return Rcpp::wrap(dataset___Dataset__ReplaceSchema(dataset, schm)); -END_RCPP -} -#else -RcppExport SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){ - Rf_error("Cannot call dataset___Dataset__ReplaceSchema(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - // dataset.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr dataset___UnionDataset__create(const ds::DatasetVector& datasets, const std::shared_ptr& schm); @@ -5854,7 +5838,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, - { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index d2d207aef7d..00c480f0ec9 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -41,13 +41,6 @@ std::string dataset___Dataset__type_name(const std::shared_ptr& dat return dataset->type_name(); } -// [[arrow::export]] -std::shared_ptr dataset___Dataset__ReplaceSchema( - const std::shared_ptr& dataset, - const std::shared_ptr& schm) { - return VALUE_OR_STOP(dataset->ReplaceSchema(schm)); -} - // [[arrow::export]] std::shared_ptr dataset___UnionDataset__create( const ds::DatasetVector& datasets, const std::shared_ptr& schm) { From b36ed5fe07dce1bde98f19738ffb8dc1dde935b7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 15:43:29 -0400 Subject: [PATCH 06/10] add boost as a dependency in PKGBUILD --- ci/scripts/PKGBUILD | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 
c3c797c344f..023a0c59dfb 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -24,7 +24,8 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") -depends=("${MINGW_PACKAGE_PREFIX}-thrift" +depends=("${MINGW_PACKAGE_PREFIX}-boost" + "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" "${MINGW_PACKAGE_PREFIX}-lz4" From 1fef351e03d264684b7676632f062914bcf0b2ca Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Apr 2020 13:12:50 -0700 Subject: [PATCH 07/10] Re-fix R docs --- r/man/enums.Rd | 20 +++++++++++++++++++- r/man/write_parquet.Rd | 3 ++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 7f7358f760b..862942a46a0 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -13,7 +13,25 @@ \alias{FileType} \alias{ParquetVersionType} \title{Arrow enums} -\format{An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4.} +\format{ +An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. + +An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2. + +An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 28. + +An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17. + +An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3. + +An object of class \code{Message::Type} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 9. + +An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4. + +An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2. +} \usage{ TimeUnit diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index acd5e8d48c9..3ff60df30f9 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -20,7 +20,8 @@ write_parquet( use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, allow_truncated_timestamps = FALSE, - arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps + + arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, allow_truncated_timestamps = allow_truncated_timestamps) ) From b0ebb362c44d0ab716654ddfb7dc201fa0dfbdc7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Apr 2020 16:36:39 -0400 Subject: [PATCH 08/10] Re fix boost dependency --- ci/scripts/PKGBUILD | 2 ++ ci/scripts/r_windows_build.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 023a0c59dfb..0b864f89114 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -24,6 +24,7 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") +# NOTE boost dependency added to support unittests; not required by arrow core. 
ARROW-8222 depends=("${MINGW_PACKAGE_PREFIX}-boost" "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" @@ -75,6 +76,7 @@ build() { ${MINGW_PREFIX}/bin/cmake.exe \ ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ + -DBOOST_USE_SHARED=OFF \ -DARROW_BUILD_SHARED=OFF \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index 145525c2389..d1ddb5e95e8 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -43,6 +43,7 @@ export PKG_CONFIG="/${MINGW_PREFIX}/bin/pkg-config --static" cp $ARROW_HOME/ci/scripts/PKGBUILD . export PKGEXT='.pkg.tar.xz' # pacman default changed to .zst in 2020, but keep the old ext for compat +unset BOOST_ROOT printenv makepkg-mingw --noconfirm --noprogressbar --skippgpcheck --nocheck --syncdeps --cleanbuild From c0cc5c4debfaf86d8484c2391c93ab1814303bdf Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Apr 2020 13:50:28 -0700 Subject: [PATCH 09/10] fixes --- ci/scripts/PKGBUILD | 2 +- ci/scripts/r_windows_build.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 0b864f89114..5bc6d748021 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -76,7 +76,7 @@ build() { ${MINGW_PREFIX}/bin/cmake.exe \ ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ - -DBOOST_USE_SHARED=OFF \ + -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_BUILD_SHARED=OFF \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index d1ddb5e95e8..77d3528f749 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -set -x +set -ex : ${ARROW_HOME:=$(pwd)} # Make sure it is absolute and exported From d9ab1bf87b72b13ae70d8285358fa84dd49bcb0e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Apr 2020 14:23:02 -0700 Subject: [PATCH 10/10] Undo Rtools test/boost changes --- ci/scripts/PKGBUILD | 13 ++----------- ci/scripts/r_windows_build.sh | 1 - 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 5bc6d748021..23a78cb2fb2 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -24,9 +24,7 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") -# NOTE boost dependency added to support unittests; not required by arrow core. 
ARROW-8222 -depends=("${MINGW_PACKAGE_PREFIX}-boost" - "${MINGW_PACKAGE_PREFIX}-thrift" +depends=("${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" "${MINGW_PACKAGE_PREFIX}-lz4" @@ -76,9 +74,7 @@ build() { ${MINGW_PREFIX}/bin/cmake.exe \ ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ - -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_BUILD_SHARED=OFF \ - -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_STATIC=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_COMPUTE=ON \ @@ -95,18 +91,13 @@ build() { -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE="debug" \ + -DCMAKE_BUILD_TYPE="release" \ -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} make -j2 popd } -check() { - export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" - make -C ${cpp_build_dir} test -} - package() { make -C ${cpp_build_dir} DESTDIR="${pkgdir}" install diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index 77d3528f749..d263d51dc86 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -43,7 +43,6 @@ export PKG_CONFIG="/${MINGW_PREFIX}/bin/pkg-config --static" cp $ARROW_HOME/ci/scripts/PKGBUILD . export PKGEXT='.pkg.tar.xz' # pacman default changed to .zst in 2020, but keep the old ext for compat -unset BOOST_ROOT printenv makepkg-mingw --noconfirm --noprogressbar --skippgpcheck --nocheck --syncdeps --cleanbuild
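
To recap the compatibility rules enforced by CheckProjectable() in patches 01 and 02, the hedged sketch below continues the earlier Python sketch (the `dataset` variable is assumed to be open over a {i32: int32, f64: float64} schema) and shows the schema changes that are rejected.

    import pyarrow as pa

    # Each of these fails CheckProjectable(); the TypeError status surfaces in
    # Python as ArrowTypeError (the mapping is assumed here for illustration).
    incompatible = [
        pa.schema([pa.field("i32", pa.string())]),                    # type change
        pa.schema([pa.field("f64", pa.float64(), nullable=False)]),   # tighten nullability
        pa.schema([pa.field("str", pa.string(), nullable=False)]),    # new required field
    ]
    for schm in incompatible:
        try:
            dataset.replace_schema(schm)
        except pa.ArrowTypeError as exc:
            print("rejected:", exc)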