diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh
index 145525c2389..d263d51dc86 100755
--- a/ci/scripts/r_windows_build.sh
+++ b/ci/scripts/r_windows_build.sh
@@ -17,7 +17,7 @@
# specific language governing permissions and limitations
# under the License.
-set -x
+set -ex
: ${ARROW_HOME:=$(pwd)}
# Make sure it is absolute and exported
diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index dbcc5b6ff01..b9d2e5b9ff8 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -138,9 +138,15 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
define_option(ARROW_BUILD_BENCHMARKS_REFERENCE
"Build the Arrow micro reference benchmarks" OFF)
+ if(ARROW_BUILD_SHARED)
+ set(ARROW_TEST_LINKAGE_DEFAULT "shared")
+ else()
+ set(ARROW_TEST_LINKAGE_DEFAULT "static")
+ endif()
+
define_option_string(ARROW_TEST_LINKAGE
"Linkage of Arrow libraries with unit tests executables."
- "shared"
+ "${ARROW_TEST_LINKAGE_DEFAULT}"
"shared"
"static")
diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc
index 2e17cc3ec67..37d0da0bbc5 100644
--- a/cpp/src/arrow/dataset/dataset.cc
+++ b/cpp/src/arrow/dataset/dataset.cc
@@ -135,6 +135,12 @@ InMemoryDataset::InMemoryDataset(std::shared_ptr<Table> table)
: Dataset(table->schema()),
get_batches_(new TableRecordBatchGenerator(std::move(table))) {}
+Result<std::shared_ptr<Dataset>> InMemoryDataset::ReplaceSchema(
+    std::shared_ptr<Schema> schema) const {
+  RETURN_NOT_OK(CheckProjectable(*schema_, *schema));
+  return std::make_shared<InMemoryDataset>(std::move(schema), get_batches_);
+}
+
FragmentIterator InMemoryDataset::GetFragmentsImpl(
    std::shared_ptr<ScanOptions> scan_options) {
auto schema = this->schema();
@@ -175,6 +181,17 @@ Result<std::shared_ptr<UnionDataset>> UnionDataset::Make(std::shared_ptr<Schema>
new UnionDataset(std::move(schema), std::move(children)));
}
+Result<std::shared_ptr<Dataset>> UnionDataset::ReplaceSchema(
+    std::shared_ptr<Schema> schema) const {
+  auto children = children_;
+  for (auto& child : children) {
+    ARROW_ASSIGN_OR_RAISE(child, child->ReplaceSchema(schema));
+  }
+
+  return std::shared_ptr<Dataset>(
+      new UnionDataset(std::move(schema), std::move(children)));
+}
+
FragmentIterator UnionDataset::GetFragmentsImpl(std::shared_ptr<ScanOptions> options) {
return GetFragmentsFromDatasets(children_, options);
}
diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h
index 9f0e26feabe..740263df3f8 100644
--- a/cpp/src/arrow/dataset/dataset.h
+++ b/cpp/src/arrow/dataset/dataset.h
@@ -122,6 +122,13 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
/// \brief The name identifying the kind of Dataset
virtual std::string type_name() const = 0;
+ /// \brief Return a copy of this Dataset with a different schema.
+ ///
+ /// The copy will view the same Fragments. If the new schema is not compatible with the
+ /// original dataset's schema then an error will be raised.
+  virtual Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const = 0;
+
virtual ~Dataset() = default;
protected:
@@ -155,7 +162,7 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
};
  InMemoryDataset(std::shared_ptr<Schema> schema,
-                  std::unique_ptr<RecordBatchGenerator> get_batches)
+                  std::shared_ptr<RecordBatchGenerator> get_batches)
: Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}
// Convenience constructor taking a fixed list of batches
@@ -163,12 +170,15 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
  explicit InMemoryDataset(std::shared_ptr<Table> table);
-  FragmentIterator GetFragmentsImpl(std::shared_ptr<ScanOptions> options) override;
-
std::string type_name() const override { return "in-memory"; }
- private:
-  std::unique_ptr<RecordBatchGenerator> get_batches_;
+  Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const override;
+
+ protected:
+  FragmentIterator GetFragmentsImpl(std::shared_ptr<ScanOptions> options) override;
+
+  std::shared_ptr<RecordBatchGenerator> get_batches_;
};
/// \brief A Dataset wrapping child Datasets.
@@ -182,13 +192,16 @@ class ARROW_DS_EXPORT UnionDataset : public Dataset {
  static Result<std::shared_ptr<UnionDataset>> Make(std::shared_ptr<Schema> schema,
                                                    DatasetVector children);
-  FragmentIterator GetFragmentsImpl(std::shared_ptr<ScanOptions> options) override;
-
const DatasetVector& children() const { return children_; }
std::string type_name() const override { return "union"; }
+  Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const override;
+
protected:
+  FragmentIterator GetFragmentsImpl(std::shared_ptr<ScanOptions> options) override;
+
  explicit UnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
: Dataset(std::move(schema)), children_(std::move(children)) {}
diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc
index 01c5b5439b2..7470efac20a 100644
--- a/cpp/src/arrow/dataset/dataset_test.cc
+++ b/cpp/src/arrow/dataset/dataset_test.cc
@@ -52,6 +52,35 @@ TEST_F(TestInMemoryFragment, Scan) {
class TestInMemoryDataset : public DatasetFixtureMixin {};
+TEST_F(TestInMemoryDataset, ReplaceSchema) {
+ constexpr int64_t kBatchSize = 1;
+ constexpr int64_t kNumberBatches = 1;
+
+ SetSchema({field("i32", int32()), field("f64", float64())});
+ auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_);
+ auto reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch);
+
+  auto dataset = std::make_shared<InMemoryDataset>(
+      schema_, RecordBatchVector{static_cast<size_t>(kNumberBatches), batch});
+
+ // drop field
+ ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status());
+ // add field (will be materialized as null during projection)
+ ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status());
+ // incompatible type
+ ASSERT_RAISES(TypeError,
+ dataset->ReplaceSchema(schema({field("i32", utf8())})).status());
+ // incompatible nullability
+ ASSERT_RAISES(
+ TypeError,
+ dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)}))
+ .status());
+ // add non-nullable field
+ ASSERT_RAISES(TypeError,
+ dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)}))
+ .status());
+}
+
TEST_F(TestInMemoryDataset, GetFragments) {
constexpr int64_t kBatchSize = 1024;
constexpr int64_t kNumberBatches = 16;
@@ -60,8 +89,6 @@ TEST_F(TestInMemoryDataset, GetFragments) {
auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_);
auto reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch);
- // It is safe to copy fragment multiple time since Scan() does not consume
- // the internal array.
  auto dataset = std::make_shared<InMemoryDataset>(
      schema_, RecordBatchVector{static_cast<size_t>(kNumberBatches), batch});
@@ -70,6 +97,45 @@ TEST_F(TestInMemoryDataset, GetFragments) {
class TestUnionDataset : public DatasetFixtureMixin {};
+TEST_F(TestUnionDataset, ReplaceSchema) {
+ constexpr int64_t kBatchSize = 1;
+ constexpr int64_t kNumberBatches = 1;
+
+ SetSchema({field("i32", int32()), field("f64", float64())});
+ auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_);
+
+  std::vector<std::shared_ptr<RecordBatch>> batches{static_cast<size_t>(kNumberBatches),
+                                                    batch};
+
+  DatasetVector children = {
+      std::make_shared<InMemoryDataset>(schema_, batches),
+      std::make_shared<InMemoryDataset>(schema_, batches),
+  };
+
+ const int64_t total_batches = children.size() * kNumberBatches;
+ auto reader = ConstantArrayGenerator::Repeat(total_batches, batch);
+
+ ASSERT_OK_AND_ASSIGN(auto dataset, UnionDataset::Make(schema_, children));
+ AssertDatasetEquals(reader.get(), dataset.get());
+
+ // drop field
+ ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status());
+ // add nullable field (will be materialized as null during projection)
+ ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status());
+ // incompatible type
+ ASSERT_RAISES(TypeError,
+ dataset->ReplaceSchema(schema({field("i32", utf8())})).status());
+ // incompatible nullability
+ ASSERT_RAISES(
+ TypeError,
+ dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)}))
+ .status());
+ // add non-nullable field
+ ASSERT_RAISES(TypeError,
+ dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)}))
+ .status());
+}
+
TEST_F(TestUnionDataset, GetFragments) {
constexpr int64_t kBatchSize = 1024;
constexpr int64_t kChildPerNode = 2;
@@ -105,9 +171,7 @@ TEST_F(TestUnionDataset, GetFragments) {
AssertDatasetEquals(reader.get(), root_dataset.get());
}
-class TestDataset : public DatasetFixtureMixin {};
-
-TEST_F(TestDataset, TrivialScan) {
+TEST_F(TestUnionDataset, TrivialScan) {
constexpr int64_t kNumberBatches = 16;
constexpr int64_t kBatchSize = 1024;
@@ -129,6 +193,57 @@ TEST_F(TestDataset, TrivialScan) {
AssertDatasetEquals(reader.get(), dataset.get());
}
+TEST(TestProjector, CheckProjectable) {
+ struct Assert {
+ explicit Assert(FieldVector from) : from_(from) {}
+ Schema from_;
+
+ void ProjectableTo(FieldVector to) {
+ ARROW_EXPECT_OK(CheckProjectable(from_, Schema(to)));
+ }
+
+ void NotProjectableTo(FieldVector to, std::string substr = "") {
+ EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, testing::HasSubstr(substr),
+ CheckProjectable(from_, Schema(to)));
+ }
+ };
+
+ auto i8 = field("i8", int8());
+ auto u16 = field("u16", uint16());
+ auto str = field("str", utf8());
+ auto i8_req = field("i8", int8(), false);
+ auto u16_req = field("u16", uint16(), false);
+ auto str_req = field("str", utf8(), false);
+
+ // trivial
+ Assert({}).ProjectableTo({});
+ Assert({i8}).ProjectableTo({i8});
+ Assert({i8, u16_req}).ProjectableTo({i8, u16_req});
+
+ // reorder
+ Assert({i8, u16}).ProjectableTo({u16, i8});
+ Assert({i8, str, u16}).ProjectableTo({u16, i8, str});
+
+ // drop field(s)
+ Assert({i8}).ProjectableTo({});
+
+ // add field(s)
+ Assert({}).ProjectableTo({i8});
+ Assert({}).ProjectableTo({i8, u16});
+ Assert({}).NotProjectableTo({u16_req},
+ "is not nullable and does not exist in origin schema");
+ Assert({i8}).NotProjectableTo({u16_req, i8});
+
+ // change nullability
+ Assert({i8}).NotProjectableTo({i8_req},
+ "not nullable but is not required in origin schema");
+ Assert({i8_req}).ProjectableTo({i8});
+
+ // change field type
+ Assert({i8}).NotProjectableTo({field("i8", utf8())},
+ "fields had matching names but differing types");
+}
+
TEST(TestProjector, MismatchedType) {
constexpr int64_t kBatchSize = 1024;
@@ -229,8 +344,8 @@ TEST(TestProjector, NonTrivial) {
AssertBatchesEqual(*expected_batch, *reconciled_batch);
}
-class TestEndToEnd : public TestDataset {
- void SetUp() {
+class TestEndToEnd : public TestUnionDataset {
+ void SetUp() override {
bool nullable = false;
SetSchema({
field("region", utf8(), nullable),
@@ -377,9 +492,9 @@ TEST_F(TestEndToEnd, EndToEndSingleDataset) {
ASSERT_OK(scanner_builder->Project(columns));
// An optional filter expression may also be specified. The filter expression
- // is evaluated against input rows. Only rows for which the filter evaluates to true are
- // yielded. Predicate pushdown optimizations are applied using partition information if
- // available.
+ // is evaluated against input rows. Only rows for which the filter evaluates to true
+ // are yielded. Predicate pushdown optimizations are applied using partition
+ // information if available.
//
// This API decouples predicate pushdown from the Dataset implementation
// and partition discovery.
@@ -413,7 +528,7 @@ inline std::shared_ptr<Schema> SchemaFromNames(const std::vector<std::string> na
return schema(fields);
}
-class TestSchemaUnification : public TestDataset {
+class TestSchemaUnification : public TestUnionDataset {
public:
  using i32 = util::optional<int32_t>;
  using PathAndContent = std::vector<std::pair<std::string, std::string>>;
@@ -487,7 +602,8 @@ class TestSchemaUnification : public TestDataset {
ASSERT_OK_AND_ASSIGN(auto ds1, get_source("/dataset/alpha", {ds1_df1, ds1_df2}));
ASSERT_OK_AND_ASSIGN(auto ds2, get_source("/dataset/beta", {ds2_df1, ds2_df2}));
- // FIXME(bkietz) this is a hack: allow differing schemas for the purposes of this test
+ // FIXME(bkietz) this is a hack: allow differing schemas for the purposes of this
+ // test
class DisparateSchemasUnionDataset : public UnionDataset {
public:
    DisparateSchemasUnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc
index 66e0e1dd68d..fee471d975f 100644
--- a/cpp/src/arrow/dataset/file_base.cc
+++ b/cpp/src/arrow/dataset/file_base.cc
@@ -118,6 +118,14 @@ Result<std::shared_ptr<Dataset>> FileSystemDataset::Make(
std::move(filesystem), std::move(forest), std::move(partitions)));
}
+Result<std::shared_ptr<Dataset>> FileSystemDataset::ReplaceSchema(
+    std::shared_ptr<Schema> schema) const {
+  RETURN_NOT_OK(CheckProjectable(*schema_, *schema));
+  return std::shared_ptr<Dataset>(
+      new FileSystemDataset(std::move(schema), partition_expression_, format_,
+                            filesystem_, forest_, partitions_));
+}
+
std::vector<std::string> FileSystemDataset::files() const {
  std::vector<std::string> files;
diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h
index 157a4256e1c..e6d893193ff 100644
--- a/cpp/src/arrow/dataset/file_base.h
+++ b/cpp/src/arrow/dataset/file_base.h
@@ -244,6 +244,9 @@ class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
std::string type_name() const override { return "filesystem"; }
+  Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const override;
+
  const std::shared_ptr<FileFormat>& format() const { return format_; }
  std::vector<std::string> files() const;
diff --git a/cpp/src/arrow/dataset/file_test.cc b/cpp/src/arrow/dataset/file_test.cc
index 4d6d6f6348c..699ba989587 100644
--- a/cpp/src/arrow/dataset/file_test.cc
+++ b/cpp/src/arrow/dataset/file_test.cc
@@ -92,6 +92,30 @@ TEST_F(TestFileSystemDataset, Basic) {
AssertFilesAre(dataset_, {"A/a", "A/B/b"});
}
+TEST_F(TestFileSystemDataset, ReplaceSchema) {
+ auto schm = schema({field("i32", int32()), field("f64", float64())});
+  auto format = std::make_shared<DummyFileFormat>(schm);
+ ASSERT_OK_AND_ASSIGN(auto dataset,
+ FileSystemDataset::Make(schm, scalar(true), format, fs_, {}));
+
+ // drop field
+ ASSERT_OK(dataset->ReplaceSchema(schema({field("i32", int32())})).status());
+ // add nullable field (will be materialized as null during projection)
+ ASSERT_OK(dataset->ReplaceSchema(schema({field("str", utf8())})).status());
+ // incompatible type
+ ASSERT_RAISES(TypeError,
+ dataset->ReplaceSchema(schema({field("i32", utf8())})).status());
+ // incompatible nullability
+ ASSERT_RAISES(
+ TypeError,
+ dataset->ReplaceSchema(schema({field("f64", float64(), /*nullable=*/false)}))
+ .status());
+ // add non-nullable field
+ ASSERT_RAISES(TypeError,
+ dataset->ReplaceSchema(schema({field("str", utf8(), /*nullable=*/false)}))
+ .status());
+}
+
TEST_F(TestFileSystemDataset, RootPartitionPruning) {
auto root_partition = ("a"_ == 5).Copy();
MakeDataset({fs::File("a"), fs::File("b")}, root_partition);
diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc
index 531c4a56694..9ce90ad0ed3 100644
--- a/cpp/src/arrow/dataset/projector.cc
+++ b/cpp/src/arrow/dataset/projector.cc
@@ -34,6 +34,33 @@
namespace arrow {
namespace dataset {
+Status CheckProjectable(const Schema& from, const Schema& to) {
+ for (const auto& to_field : to.fields()) {
+ ARROW_ASSIGN_OR_RAISE(auto from_field, FieldRef(to_field->name()).GetOneOrNone(from));
+
+ if (from_field == nullptr) {
+ if (to_field->nullable()) continue;
+
+ return Status::TypeError("field ", to_field->ToString(),
+ " is not nullable and does not exist in origin schema ",
+ from);
+ }
+
+ if (!from_field->type()->Equals(to_field->type())) {
+ return Status::TypeError("fields had matching names but differing types. From: ",
+ from_field->ToString(), " To: ", to_field->ToString());
+ }
+
+ if (from_field->nullable() && !to_field->nullable()) {
+ return Status::TypeError("field ", to_field->ToString(),
+ " is not nullable but is not required in origin schema ",
+ from);
+ }
+ }
+
+ return Status::OK();
+}
+
RecordBatchProjector::RecordBatchProjector(std::shared_ptr<Schema> to)
: to_(std::move(to)),
missing_columns_(to_->num_fields(), nullptr),
@@ -86,32 +113,23 @@ Result<std::shared_ptr<RecordBatch>> RecordBatchProjector::Project(
Status RecordBatchProjector::SetInputSchema(std::shared_ptr<Schema> from,
MemoryPool* pool) {
+ RETURN_NOT_OK(CheckProjectable(*from, *to_));
from_ = std::move(from);
for (int i = 0; i < to_->num_fields(); ++i) {
- const auto& field = to_->field(i);
- FieldRef ref(field->name());
- auto matches = ref.FindAll(*from_);
+ ARROW_ASSIGN_OR_RAISE(auto match,
+ FieldRef(to_->field(i)->name()).FindOneOrNone(*from_));
- if (matches.empty()) {
+ if (match.indices().empty()) {
// Mark column i as missing by setting missing_columns_[i]
// to a non-null placeholder.
ARROW_ASSIGN_OR_RAISE(missing_columns_[i],
MakeArrayOfNull(to_->field(i)->type(), 0, pool));
column_indices_[i] = kNoMatch;
} else {
- RETURN_NOT_OK(ref.CheckNonMultiple(matches, *from_));
- int matching_index = matches[0].indices()[0];
-
- if (!from_->field(matching_index)->Equals(field, /*check_metadata=*/false)) {
- return Status::TypeError("fields had matching names but were not equivalent ",
- from_->field(matching_index)->ToString(), " vs ",
- field->ToString());
- }
-
// Mark column i as not missing by setting missing_columns_[i] to nullptr
missing_columns_[i] = nullptr;
- column_indices_[i] = matching_index;
+ column_indices_[i] = match.indices()[0];
}
}
return Status::OK();
diff --git a/cpp/src/arrow/dataset/projector.h b/cpp/src/arrow/dataset/projector.h
index 13a0ffb1938..8fd157f7ece 100644
--- a/cpp/src/arrow/dataset/projector.h
+++ b/cpp/src/arrow/dataset/projector.h
@@ -27,6 +27,8 @@
namespace arrow {
namespace dataset {
+ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to);
+
/// \brief Project a RecordBatch to a given schema.
///
/// Projected record batches will reorder columns from input record batches when possible,
diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h
index 13dd3870b1a..b492057a8b2 100644
--- a/cpp/src/arrow/result.h
+++ b/cpp/src/arrow/result.h
@@ -45,7 +45,7 @@ ARROW_EXPORT void InvalidValueOrDie(const Status& st);
} // namespace internal
-// A class for representing either a usable value, or an error.
+/// A class for representing either a usable value, or an error.
///
/// A Result object either contains a value of type `T` or a Status object
/// explaining why such a value is not present. The type `T` must be
@@ -98,7 +98,7 @@ ARROW_EXPORT void InvalidValueOrDie(const Status& st);
/// arrow::Result<Foo> CalculateFoo();
/// ```
template <class T>
-class Result : public util::EqualityComparable<Result<T>> {
+class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
  template <typename U>
  friend class Result;
diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h
index 195ed71cce9..aa1f2e151e5 100644
--- a/cpp/src/arrow/status.h
+++ b/cpp/src/arrow/status.h
@@ -95,11 +95,6 @@ enum class StatusCode : char {
AlreadyExists = 45
};
-#if defined(__clang__)
-// Only clang supports warn_unused_result as a type annotation.
-class ARROW_MUST_USE_RESULT ARROW_EXPORT Status;
-#endif
-
/// \brief An opaque class that allows subsystems to retain
/// additional information inside the Status.
class ARROW_EXPORT StatusDetail {
@@ -124,8 +119,8 @@ class ARROW_EXPORT StatusDetail {
///
/// Additionally, if an error occurred, a specific error message is generally
/// attached.
-class ARROW_EXPORT Status : public util::EqualityComparable<Status>,
-                            public util::ToStringOstreamable<Status> {
+class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<Status>,
+                                                public util::ToStringOstreamable<Status> {
public:
// Create a success status.
Status() noexcept : state_(NULLPTR) {}
diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h
index 6119e0f0fc7..14da481f0bb 100644
--- a/cpp/src/arrow/testing/gtest_util.h
+++ b/cpp/src/arrow/testing/gtest_util.h
@@ -97,11 +97,12 @@ class Result;
#define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr))
-#define ARROW_EXPECT_OK(expr) \
- do { \
- auto _res = (expr); \
- ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \
- EXPECT_TRUE(_st.ok()); \
+#define ARROW_EXPECT_OK(expr) \
+ do { \
+ auto _res = (expr); \
+ ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \
+ EXPECT_TRUE(_st.ok()) << "'" ARROW_STRINGIFY(expr) "' failed with " \
+ << _st.ToString(); \
} while (false)
#define ABORT_NOT_OK(expr) \
diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h
index edb03d31c14..ae8d56d098b 100644
--- a/cpp/src/arrow/util/macros.h
+++ b/cpp/src/arrow/util/macros.h
@@ -68,6 +68,13 @@
#define ARROW_MUST_USE_RESULT
#endif
+#if defined(__clang__)
+// Only clang supports warn_unused_result as a type annotation.
+#define ARROW_MUST_USE_TYPE ARROW_MUST_USE_RESULT
+#else
+#define ARROW_MUST_USE_TYPE
+#endif
+
// ----------------------------------------------------------------------
// C++/CLI support macros (see ARROW-1134)
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 556faa74539..ab42aa7d99a 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -62,7 +62,7 @@ cdef class Dataset:
self.dataset = sp.get()
@staticmethod
- cdef wrap(shared_ptr[CDataset]& sp):
+ cdef wrap(const shared_ptr[CDataset]& sp):
cdef Dataset self
typ = frombytes(sp.get().type_name())
@@ -92,6 +92,18 @@ cdef class Dataset:
else:
return Expression.wrap(expr)
+ def replace_schema(self, Schema schema not None):
+ """
+ Return a copy of this Dataset with a different schema.
+
+ The copy will view the same Fragments. If the new schema is not
+ compatible with the original dataset's schema then an error will
+ be raised.
+ """
+ cdef shared_ptr[CDataset] copy = GetResultValue(
+ self.dataset.ReplaceSchema(pyarrow_unwrap_schema(schema)))
+ return Dataset.wrap(move(copy))
+
def get_fragments(self, columns=None, filter=None):
"""Returns an iterator over the fragments in this dataset.
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index 467a84c9319..53d5cde9dc9 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -187,6 +187,8 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
const shared_ptr[CExpression] & partition_expression()
c_string type_name()
+ CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema])
+
CResult[shared_ptr[CScannerBuilder]] NewScanWithContext "NewScan"(
shared_ptr[CScanContext] context)
CResult[shared_ptr[CScannerBuilder]] NewScan()