From 3c621c14bdb0fd9d9a361ac3ac2465e96b7caf20 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 25 Mar 2021 11:31:07 -0400
Subject: [PATCH 1/9] ARROW-10882: [C++] Create InMemoryDataset from RecordBatchReader

---
 cpp/src/arrow/dataset/dataset.cc      | 16 ++++++++++++++++
 cpp/src/arrow/dataset/dataset.h       |  2 ++
 cpp/src/arrow/dataset/dataset_test.cc | 14 ++++++++++++++
 3 files changed, 32 insertions(+)

diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc
index df155784924..9bc24b04920 100644
--- a/cpp/src/arrow/dataset/dataset.cc
+++ b/cpp/src/arrow/dataset/dataset.cc
@@ -148,6 +148,22 @@ InMemoryDataset::InMemoryDataset(std::shared_ptr<Table> table)
     : Dataset(table->schema()),
       get_batches_(new TableRecordBatchGenerator(std::move(table))) {}

+struct ReaderRecordBatchGenerator : InMemoryDataset::RecordBatchGenerator {
+  explicit ReaderRecordBatchGenerator(std::shared_ptr<RecordBatchReader> reader)
+      : reader_(std::move(reader)) {}
+
+  RecordBatchIterator Get() const final {
+    auto reader = reader_;
+    return MakeFunctionIterator([reader] { return reader->Next(); });
+  }
+
+  std::shared_ptr<RecordBatchReader> reader_;
+};
+
+InMemoryDataset::InMemoryDataset(std::shared_ptr<RecordBatchReader> reader)
+    : Dataset(reader->schema()),
+      get_batches_(new ReaderRecordBatchGenerator(std::move(reader))) {}
+
 Result<std::shared_ptr<Dataset>> InMemoryDataset::ReplaceSchema(
     std::shared_ptr<Schema> schema) const {
   RETURN_NOT_OK(CheckProjectable(*schema_, *schema));
diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h
index a28b79840d6..169959633c4 100644
--- a/cpp/src/arrow/dataset/dataset.h
+++ b/cpp/src/arrow/dataset/dataset.h
@@ -32,6 +32,7 @@
 #include "arrow/util/mutex.h"

 namespace arrow {
+
 namespace dataset {

 /// \brief A granular piece of a Dataset, such as an individual file.
@@ -184,6 +185,7 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
   InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);

   explicit InMemoryDataset(std::shared_ptr<Table> table);
+  explicit InMemoryDataset(std::shared_ptr<RecordBatchReader> reader);

   std::string type_name() const override { return "in-memory"; }

diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc
index a3603558924..70eaf5cfa73 100644
--- a/cpp/src/arrow/dataset/dataset_test.cc
+++ b/cpp/src/arrow/dataset/dataset_test.cc
@@ -79,6 +79,20 @@ TEST_F(TestInMemoryDataset, ReplaceSchema) {
                   .status());
 }

+TEST_F(TestInMemoryDataset, FromReader) {
+  constexpr int64_t kBatchSize = 1024;
+  constexpr int64_t kNumberBatches = 16;
+
+  SetSchema({field("i32", int32()), field("f64", float64())});
+  auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_);
+  auto source_reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch);
+  auto target_reader = ConstantArrayGenerator::Repeat(kNumberBatches, batch);
+
+  auto dataset = std::make_shared<InMemoryDataset>(source_reader);
+
+  AssertDatasetEquals(target_reader.get(), dataset.get());
+}
+
 TEST_F(TestInMemoryDataset, GetFragments) {
   constexpr int64_t kBatchSize = 1024;
   constexpr int64_t kNumberBatches = 16;

From 0fa931511c5d1090b3be0aa3b1d342be3c8ea874 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 25 Mar 2021 13:28:47 -0400
Subject: [PATCH 2/9] ARROW-10882: [Python] Expose InMemoryDataset

---
 python/pyarrow/_dataset.pyx                  | 71 +++++++++++++++++++-
 python/pyarrow/dataset.py                    | 32 +++++++--
 python/pyarrow/includes/libarrow_dataset.pxd |  5 ++
 python/pyarrow/tests/test_dataset.py         | 54 ++++++++++++++-
 python/pyarrow/util.py                       |  8 +++
 5 files changed, 161 insertions(+), 9 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 12ddcee5343..de317533a21 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -26,11 +26,11 @@ import os
 import pyarrow as pa
 from pyarrow.lib cimport *
-from pyarrow.lib import frombytes, tobytes
+from pyarrow.lib import ArrowTypeError, frombytes, tobytes
 from pyarrow.includes.libarrow_dataset cimport *
 from pyarrow._fs cimport FileSystem, FileInfo, FileSelector
 from pyarrow._csv cimport ConvertOptions, ParseOptions, ReadOptions
-from pyarrow.util import _is_path_like, _stringify_path
+from pyarrow.util import _is_iterable, _is_path_like, _stringify_path
 from pyarrow._parquet cimport (
     _create_writer_properties, _create_arrow_writer_properties,
@@ -441,6 +441,73 @@ cdef class Dataset(_Weakrefable):
         return pyarrow_wrap_schema(self.dataset.schema())

+cdef class InMemoryDataset(Dataset):
+    """A Dataset wrapping in-memory data.
+
+    Parameters
+    ----------
+    source
+        The data for this dataset. Can be a RecordBatch, Table, list of
+        RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader.
+        If an iterable is provided, the schema must also be provided.
+    schema : Schema, optional
+        Only required if passing an iterable as the source.
+    """
+
+    cdef:
+        CInMemoryDataset* in_memory_dataset
+
+    def __init__(self, source, Schema schema=None):
+        cdef:
+            RecordBatchReader reader
+            shared_ptr[CInMemoryDataset] in_memory_dataset
+
+        if isinstance(source, (pa.RecordBatch, pa.Table)):
+            source = [source]
+
+        if isinstance(source, (list, tuple)):
+            batches = []
+            for item in source:
+                if isinstance(item, pa.RecordBatch):
+                    batches.append(item)
+                elif isinstance(item, pa.Table):
+                    batches.extend(item.to_batches())
+                else:
+                    raise TypeError(
+                        'Expected a list of tables or batches. 
The given list ' + 'contains a ' + type(item).__name__) + if schema is None: + schema = item.schema + elif not schema.equals(item.schema): + raise ArrowTypeError( + f'Item has schema {item.schema} which which does not ' + f'match expected schema {schema}') + if not batches and schema is None: + raise ValueError('Must provide schema to construct in-memory ' + 'dataset from an empty list') + reader = pa.ipc.RecordBatchReader.from_batches(schema, batches) + elif isinstance(source, pa.ipc.RecordBatchReader): + reader = source + elif _is_iterable(source): + if schema is None: + raise ValueError('Must provide schema to construct in-memory ' + 'dataset from an iterable') + reader = pa.ipc.RecordBatchReader.from_batches(schema, source) + else: + raise TypeError( + 'Expected a table, batch, iterable of tables/batches, or a ' + 'record batch reader instead of the given type: ' + + type(source).__name__ + ) + + in_memory_dataset = make_shared[CInMemoryDataset](reader.reader) + self.init( in_memory_dataset) + + cdef void init(self, const shared_ptr[CDataset]& sp): + Dataset.init(self, sp) + self.in_memory_dataset = sp.get() + + cdef class UnionDataset(Dataset): """A Dataset wrapping child datasets. diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 195d414b047..cf14dcb9f55 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -18,7 +18,7 @@ """Dataset is currently unstable. APIs subject to change without notice.""" import pyarrow as pa -from pyarrow.util import _stringify_path, _is_path_like +from pyarrow.util import _is_iterable, _stringify_path, _is_path_like from pyarrow._dataset import ( # noqa CsvFileFormat, @@ -37,6 +37,7 @@ HivePartitioning, IpcFileFormat, IpcFileWriteOptions, + InMemoryDataset, ParquetDatasetFactory, ParquetFactoryOptions, ParquetFileFormat, @@ -408,6 +409,13 @@ def _filesystem_dataset(source, schema=None, filesystem=None, return factory.finish(schema) +def _in_memory_dataset(source, schema=None, **kwargs): + if any(v is not None for v in kwargs.values()): + raise ValueError( + "For in-memory datasets, you cannot pass any additional arguments") + return InMemoryDataset(source, schema) + + def _union_dataset(children, schema=None, **kwargs): if any(v is not None for v in kwargs.values()): raise ValueError( @@ -508,7 +516,8 @@ def dataset(source, schema=None, format=None, filesystem=None, Parameters ---------- - source : path, list of paths, dataset, list of datasets or URI + source : path, list of paths, dataset, list of datasets, (list or + iterable of) batches or tables, RecordBatchReader, or URI Path pointing to a single file: Open a FileSystemDataset from a single file. Path pointing to a directory: @@ -524,6 +533,11 @@ def dataset(source, schema=None, format=None, filesystem=None, A nested UnionDataset gets constructed, it allows arbitrary composition of other datasets. Note that additional keyword arguments are not allowed. + (List or iterable of) batches or tables: + Create an InMemoryDataset. If an iterable or empty list is given, + a schema must also be given. + RecordBatchReader: + Create an InMemoryDataset. schema : Schema, optional Optionally provide the Schema for the Dataset, in which case it will not be inferred from the source. 
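
As an aside, a minimal usage sketch of the in-memory paths documented above (an illustration, not taken from the patch itself, assuming only the public pyarrow API):

import pyarrow as pa
import pyarrow.dataset as ds

batch = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["a"])
table = pa.Table.from_batches([batch])

# Batches, tables, and lists of them carry their own schema.
dataset = ds.dataset([batch, table])
assert dataset.to_table().num_rows == 20

# A plain iterable has no schema attribute, so one must be passed explicitly.
dataset = ds.dataset((batch for _ in range(3)), schema=batch.schema)
assert dataset.to_table().num_rows == 30
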
@@ -636,7 +650,6 @@ def dataset(source, schema=None, format=None, filesystem=None, selector_ignore_prefixes=ignore_prefixes ) - # TODO(kszucs): support InMemoryDataset for a table input if _is_path_like(source): return _filesystem_dataset(source, **kwargs) elif isinstance(source, (tuple, list)): @@ -644,13 +657,22 @@ def dataset(source, schema=None, format=None, filesystem=None, return _filesystem_dataset(source, **kwargs) elif all(isinstance(elem, Dataset) for elem in source): return _union_dataset(source, **kwargs) + elif all(isinstance(elem, (pa.RecordBatch, pa.Table)) + for elem in source): + return _in_memory_dataset(source, **kwargs) else: unique_types = set(type(elem).__name__ for elem in source) type_names = ', '.join('{}'.format(t) for t in unique_types) raise TypeError( - 'Expected a list of path-like or dataset objects. The given ' - 'list contains the following types: {}'.format(type_names) + 'Expected a list of path-like or dataset objects, or a list ' + 'of batches or tables. The given list contains the following ' + 'types: {}'.format(type_names) ) + elif isinstance(source, (pa.RecordBatch, pa.ipc.RecordBatchReader, + pa.Table)): + return _in_memory_dataset(source, **kwargs) + elif _is_iterable(source): + return _in_memory_dataset(source, **kwargs) else: raise TypeError( 'Expected a path-like, list of path-likes or a list of Datasets ' diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index f7f2a142001..db2e73acdff 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -124,6 +124,11 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: CResult[shared_ptr[CScannerBuilder]] NewScan() + cdef cppclass CInMemoryDataset "arrow::dataset::InMemoryDataset"( + CDataset): + CInMemoryDataset(shared_ptr[CRecordBatchReader]) + CInMemoryDataset(shared_ptr[CTable]) + cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"( CDataset): @staticmethod diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 36cff9958f9..ae16635b0f8 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1622,13 +1622,16 @@ def test_construct_from_invalid_sources_raise(multisourcefs): fs.FileSelector('/schema'), format=ds.ParquetFileFormat() ) + batch1 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["a"]) + batch2 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["b"]) with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'): ds.dataset([child1, child2]) expected = ( - "Expected a list of path-like or dataset objects. The given list " - "contains the following types: int" + "Expected a list of path-like or dataset objects, or a list " + "of batches or tables. 
The given list contains the following " + "types: int" ) with pytest.raises(TypeError, match=expected): ds.dataset([1, 2, 3]) @@ -1640,6 +1643,53 @@ def test_construct_from_invalid_sources_raise(multisourcefs): with pytest.raises(TypeError, match=expected): ds.dataset(None) + expected = ( + "Must provide schema to construct in-memory dataset from an iterable" + ) + with pytest.raises(ValueError, match=expected): + ds.dataset((batch1 for _ in range(3))) + + expected = ( + "Must provide schema to construct in-memory dataset from an empty list" + ) + with pytest.raises(ValueError, match=expected): + ds.InMemoryDataset([]) + + expected = ( + "Item has schema b: int64 which which does not match expected schema " + "a: int64" + ) + with pytest.raises(TypeError, match=expected): + ds.dataset([batch1, batch2]) + + expected = ( + "Expected a list of path-like or dataset objects, or a list of " + "batches or tables. The given list contains the following types:" + ) + with pytest.raises(TypeError, match=expected): + ds.dataset([batch1, 0]) + + expected = ( + "Expected a list of tables or batches. The given list contains a int" + ) + with pytest.raises(TypeError, match=expected): + ds.InMemoryDataset([batch1, 0]) + + +def test_construct_in_memory(): + batch = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["a"]) + table = pa.Table.from_batches([batch]) + reader = pa.ipc.RecordBatchReader.from_batches( + batch.schema, (batch for _ in range(1))) + iterable = (batch for _ in range(1)) + + for source in (batch, table, reader, [batch], [table]): + dataset = ds.dataset(source) + assert dataset.to_table().equals(table) + + assert ds.dataset(iterable, schema=batch.schema).to_table().equals(table) + assert ds.dataset([], schema=pa.schema([])).to_table() == pa.table([]) + @pytest.mark.parquet def test_open_dataset_partitioned_directory(tempdir): diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index e91294a3a1b..446e6733351 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -62,6 +62,14 @@ def __instancecheck__(self, other): return _DeprecatedMeta(old_name, (new_class,), {}) +def _is_iterable(obj): + try: + iter(obj) + return True + except TypeError: + return False + + def _is_path_like(path): # PEP519 filesystem path protocol is available from python 3.6, so pathlib # doesn't implement __fspath__ for earlier versions From 0900fb42bd18a31567f7f091e966cbd5fb9cab91 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 25 Mar 2021 13:46:37 -0400 Subject: [PATCH 3/9] ARROW-10882: [Python] Allow writing an iterable of batches --- python/pyarrow/dataset.py | 15 ++++++++++----- python/pyarrow/tests/test_dataset.py | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index cf14dcb9f55..c38b529f085 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -698,9 +698,11 @@ def write_dataset(data, base_dir, basename_template=None, format=None, Parameters ---------- - data : Dataset, Table/RecordBatch, or list of Table/RecordBatch + data : Dataset, Table/RecordBatch, RecordBatchReader, or list or + iterable of Table/RecordBatch The data to write. This can be a Dataset instance or - in-memory Arrow data. + in-memory Arrow data. If an iterable is given, the schema must + also be given. base_dir : str The root directory where to write the dataset. 
basename_template : str, optional @@ -735,12 +737,15 @@ def write_dataset(data, base_dir, basename_template=None, format=None, elif isinstance(data, (pa.Table, pa.RecordBatch)): schema = schema or data.schema data = [data] - elif isinstance(data, list): + elif isinstance(data, (list, tuple)): schema = schema or data[0].schema + elif isinstance(data, pa.ipc.RecordBatchReader) or _is_iterable(data): + data = InMemoryDataset(data, schema=schema) + schema = schema or data.schema else: raise ValueError( - "Only Dataset, Table/RecordBatch or a list of Table/RecordBatch " - "objects are supported." + "Only Dataset, Table/RecordBatch, RecordBatchReader, or a list " + "or iterable of Table/RecordBatch objects are supported." ) if format is None and isinstance(data, FileSystemDataset): diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ae16635b0f8..3b4b5e18eb1 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2906,6 +2906,28 @@ def test_write_table_multiple_fragments(tempdir): ) +def test_write_iterable(tempdir): + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10)) + ], names=["f1", "f2", "part"]) + + base_dir = tempdir / 'inmemory_iterable' + ds.write_dataset((batch for batch in table.to_batches()), base_dir, + schema=table.schema, + basename_template='dat_{i}.arrow', format="feather") + result = ds.dataset(base_dir, format="ipc").to_table() + assert result.equals(table) + + base_dir = tempdir / 'inmemory_reader' + reader = pa.ipc.RecordBatchReader.from_batches(table.schema, + table.to_batches()) + ds.write_dataset(reader, base_dir, + basename_template='dat_{i}.arrow', format="feather") + result = ds.dataset(base_dir, format="ipc").to_table() + assert result.equals(table) + + def test_write_table_partitioned_dict(tempdir): # ensure writing table partitioned on a dictionary column works without # specifying the dictionary values explicitly From 878d6dd9a2d75893db087da02fae1d7469384e8e Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 25 Mar 2021 13:53:13 -0400 Subject: [PATCH 4/9] ARROW-10882: [Python] Clarify that iterables of tables are not accepted --- python/pyarrow/dataset.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index c38b529f085..c343f76db63 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -516,8 +516,8 @@ def dataset(source, schema=None, format=None, filesystem=None, Parameters ---------- - source : path, list of paths, dataset, list of datasets, (list or - iterable of) batches or tables, RecordBatchReader, or URI + source : path, list of paths, dataset, list of datasets, (list of) batches + or tables, iterable of batches, RecordBatchReader, or URI Path pointing to a single file: Open a FileSystemDataset from a single file. Path pointing to a directory: @@ -533,11 +533,9 @@ def dataset(source, schema=None, format=None, filesystem=None, A nested UnionDataset gets constructed, it allows arbitrary composition of other datasets. Note that additional keyword arguments are not allowed. - (List or iterable of) batches or tables: + (List of) batches or tables, iterable of batches, or RecordBatchReader: Create an InMemoryDataset. If an iterable or empty list is given, a schema must also be given. - RecordBatchReader: - Create an InMemoryDataset. 
schema : Schema, optional Optionally provide the Schema for the Dataset, in which case it will not be inferred from the source. @@ -698,8 +696,8 @@ def write_dataset(data, base_dir, basename_template=None, format=None, Parameters ---------- - data : Dataset, Table/RecordBatch, RecordBatchReader, or list or - iterable of Table/RecordBatch + data : Dataset, Table/RecordBatch, RecordBatchReader, list of + Table/RecordBatch, or iterable of RecordBatch The data to write. This can be a Dataset instance or in-memory Arrow data. If an iterable is given, the schema must also be given. @@ -744,8 +742,8 @@ def write_dataset(data, base_dir, basename_template=None, format=None, schema = schema or data.schema else: raise ValueError( - "Only Dataset, Table/RecordBatch, RecordBatchReader, or a list " - "or iterable of Table/RecordBatch objects are supported." + "Only Dataset, Table/RecordBatch, RecordBatchReader, a list " + "of Tables/RecordBatches, or iterable of batches are supported." ) if format is None and isinstance(data, FileSystemDataset): From 906d4934b6d7dde794eea074a98535c0512fd5e5 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 25 Mar 2021 13:54:19 -0400 Subject: [PATCH 5/9] Undo unnecessary change --- cpp/src/arrow/dataset/dataset.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 169959633c4..6be83059fc1 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -32,7 +32,6 @@ #include "arrow/util/mutex.h" namespace arrow { - namespace dataset { /// \brief A granular piece of a Dataset, such as an individual file. From 87c5943d9cb194f0b46df32068780769a8f907f6 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 29 Mar 2021 10:17:13 -0400 Subject: [PATCH 6/9] Raise error when re-scanning one-shot InMemoryDataset --- cpp/src/arrow/dataset/dataset.cc | 8 ++++++- cpp/src/arrow/dataset/dataset_test.cc | 3 +++ python/pyarrow/_dataset.pyx | 11 +++++---- python/pyarrow/dataset.py | 4 +++- python/pyarrow/tests/test_dataset.py | 33 +++++++++++++++++++++++++-- 5 files changed, 51 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc index 9bc24b04920..6d4eeb5603f 100644 --- a/cpp/src/arrow/dataset/dataset.cc +++ b/cpp/src/arrow/dataset/dataset.cc @@ -150,14 +150,20 @@ InMemoryDataset::InMemoryDataset(std::shared_ptr
table) struct ReaderRecordBatchGenerator : InMemoryDataset::RecordBatchGenerator { explicit ReaderRecordBatchGenerator(std::shared_ptr reader) - : reader_(std::move(reader)) {} + : reader_(std::move(reader)), consumed_(false) {} RecordBatchIterator Get() const final { + if (consumed_) { + return MakeErrorIterator>(Status::Invalid( + "RecordBatchReader-backed InMemoryDataset was already consumed")); + } + consumed_ = true; auto reader = reader_; return MakeFunctionIterator([reader] { return reader->Next(); }); } std::shared_ptr reader_; + mutable bool consumed_; }; InMemoryDataset::InMemoryDataset(std::shared_ptr reader) diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index 70eaf5cfa73..e7b9954fec4 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -91,6 +91,9 @@ TEST_F(TestInMemoryDataset, FromReader) { auto dataset = std::make_shared(source_reader); AssertDatasetEquals(target_reader.get(), dataset.get()); + // Such datasets can only be scanned once + ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments()); + ASSERT_RAISES(Invalid, fragments.Next()); } TEST_F(TestInMemoryDataset, GetFragments) { diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index de317533a21..a86e5521bc3 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -480,19 +480,23 @@ cdef class InMemoryDataset(Dataset): schema = item.schema elif not schema.equals(item.schema): raise ArrowTypeError( - f'Item has schema {item.schema} which which does not ' - f'match expected schema {schema}') + f'Item has schema\n{item.schema}\nwhich does not ' + f'match expected schema\n{schema}') if not batches and schema is None: raise ValueError('Must provide schema to construct in-memory ' 'dataset from an empty list') - reader = pa.ipc.RecordBatchReader.from_batches(schema, batches) + table = pa.Table.from_batches(batches, schema=schema) + in_memory_dataset = make_shared[CInMemoryDataset]( + pyarrow_unwrap_table(table)) elif isinstance(source, pa.ipc.RecordBatchReader): reader = source + in_memory_dataset = make_shared[CInMemoryDataset](reader.reader) elif _is_iterable(source): if schema is None: raise ValueError('Must provide schema to construct in-memory ' 'dataset from an iterable') reader = pa.ipc.RecordBatchReader.from_batches(schema, source) + in_memory_dataset = make_shared[CInMemoryDataset](reader.reader) else: raise TypeError( 'Expected a table, batch, iterable of tables/batches, or a ' @@ -500,7 +504,6 @@ cdef class InMemoryDataset(Dataset): type(source).__name__ ) - in_memory_dataset = make_shared[CInMemoryDataset](reader.reader) self.init( in_memory_dataset) cdef void init(self, const shared_ptr[CDataset]& sp): diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index c343f76db63..68afb5321ba 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -535,7 +535,9 @@ def dataset(source, schema=None, format=None, filesystem=None, Note that additional keyword arguments are not allowed. (List of) batches or tables, iterable of batches, or RecordBatchReader: Create an InMemoryDataset. If an iterable or empty list is given, - a schema must also be given. + a schema must also be given. If an iterable or RecordBatchReader + is given, the resulting dataset can only be scanned once; further + attempts will raise an error. schema : Schema, optional Optionally provide the Schema for the Dataset, in which case it will not be inferred from the source. 
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3b4b5e18eb1..a3e56c7e6a0 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1679,8 +1679,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): def test_construct_in_memory(): batch = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["a"]) table = pa.Table.from_batches([batch]) - reader = pa.ipc.RecordBatchReader.from_batches( - batch.schema, (batch for _ in range(1))) + reader = pa.ipc.RecordBatchReader.from_batches(batch.schema, [batch]) iterable = (batch for _ in range(1)) for source in (batch, table, reader, [batch], [table]): @@ -1690,6 +1689,36 @@ def test_construct_in_memory(): assert ds.dataset(iterable, schema=batch.schema).to_table().equals(table) assert ds.dataset([], schema=pa.schema([])).to_table() == pa.table([]) + # When constructed from batches/tables, should be reusable + for source in (batch, table, [batch], [table]): + dataset = ds.dataset(source) + assert len(list(dataset.get_fragments())) == 1 + assert len(list(dataset.get_fragments())) == 1 + assert dataset.to_table().equals(table) + assert dataset.to_table().equals(table) + + # When constructed from readers/iterators, should be one-shot + match = "InMemoryDataset was already consumed" + for factory in ( + lambda: pa.ipc.RecordBatchReader.from_batches( + batch.schema, [batch]), + lambda: (batch for _ in range(1)), + ): + dataset = ds.dataset(factory(), schema=batch.schema) + # Getting fragments consumes the underlying iterator + assert len(list(dataset.get_fragments())) == 1 + with pytest.raises(pa.ArrowInvalid, match=match): + list(dataset.get_fragments()) + with pytest.raises(pa.ArrowInvalid, match=match): + dataset.to_table() + # Materializing consumes the underlying iterator + dataset = ds.dataset(factory(), schema=batch.schema) + dataset.to_table() + with pytest.raises(pa.ArrowInvalid, match=match): + list(dataset.get_fragments()) + with pytest.raises(pa.ArrowInvalid, match=match): + dataset.to_table() + @pytest.mark.parquet def test_open_dataset_partitioned_directory(tempdir): From 7508c01ef932ae3c6d5eed77496e4f73e334674e Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 29 Mar 2021 10:52:29 -0400 Subject: [PATCH 7/9] Fix InMemoryFragment being constructed with empty schema --- cpp/src/arrow/dataset/dataset.cc | 7 +++++-- cpp/src/arrow/dataset/dataset_test.cc | 15 +++++++++++++++ python/pyarrow/tests/test_dataset.py | 13 ++++++++----- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc index 6d4eeb5603f..2df34145cd9 100644 --- a/cpp/src/arrow/dataset/dataset.cc +++ b/cpp/src/arrow/dataset/dataset.cc @@ -66,8 +66,11 @@ InMemoryFragment::InMemoryFragment(std::shared_ptr schema, InMemoryFragment::InMemoryFragment(RecordBatchVector record_batches, Expression partition_expression) - : InMemoryFragment(record_batches.empty() ? schema({}) : record_batches[0]->schema(), - std::move(record_batches), std::move(partition_expression)) {} + : Fragment(std::move(partition_expression), /*schema=*/nullptr), + record_batches_(std::move(record_batches)) { + // Order of argument evaluation is undefined, so compute physical_schema here + physical_schema_ = record_batches_.empty() ? 
schema({}) : record_batches_[0]->schema(); +} Result InMemoryFragment::Scan(std::shared_ptr options) { // Make an explicit copy of record_batches_ to ensure Scan can be called diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index e7b9954fec4..b2b82561359 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -110,6 +110,21 @@ TEST_F(TestInMemoryDataset, GetFragments) { AssertDatasetEquals(reader.get(), dataset.get()); } +TEST_F(TestInMemoryDataset, InMemoryFragment) { + constexpr int64_t kBatchSize = 1024; + constexpr int64_t kNumberBatches = 16; + + SetSchema({field("i32", int32()), field("f64", float64())}); + auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); + RecordBatchVector batches{batch}; + + // Regression test: previously this constructor relied on undefined behavior (order of + // evaluation of arguments) leading to fragments being constructed with empty schemas + auto fragment = std::make_shared(batches); + ASSERT_OK_AND_ASSIGN(auto schema, fragment->ReadPhysicalSchema()); + AssertSchemaEqual(batch->schema(), schema); +} + class TestUnionDataset : public DatasetFixtureMixin {}; TEST_F(TestUnionDataset, ReplaceSchema) { diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a3e56c7e6a0..ebd99922f9e 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1684,7 +1684,7 @@ def test_construct_in_memory(): for source in (batch, table, reader, [batch], [table]): dataset = ds.dataset(source) - assert dataset.to_table().equals(table) + assert dataset.to_table() == table assert ds.dataset(iterable, schema=batch.schema).to_table().equals(table) assert ds.dataset([], schema=pa.schema([])).to_table() == pa.table([]) @@ -1694,8 +1694,9 @@ def test_construct_in_memory(): dataset = ds.dataset(source) assert len(list(dataset.get_fragments())) == 1 assert len(list(dataset.get_fragments())) == 1 - assert dataset.to_table().equals(table) - assert dataset.to_table().equals(table) + assert dataset.to_table() == table + assert dataset.to_table() == table + assert next(dataset.get_fragments()).to_table() == table # When constructed from readers/iterators, should be one-shot match = "InMemoryDataset was already consumed" @@ -1706,14 +1707,16 @@ def test_construct_in_memory(): ): dataset = ds.dataset(factory(), schema=batch.schema) # Getting fragments consumes the underlying iterator - assert len(list(dataset.get_fragments())) == 1 + fragments = list(dataset.get_fragments()) + assert len(fragments) == 1 + assert fragments[0].to_table() == table with pytest.raises(pa.ArrowInvalid, match=match): list(dataset.get_fragments()) with pytest.raises(pa.ArrowInvalid, match=match): dataset.to_table() # Materializing consumes the underlying iterator dataset = ds.dataset(factory(), schema=batch.schema) - dataset.to_table() + assert dataset.to_table() == table with pytest.raises(pa.ArrowInvalid, match=match): list(dataset.get_fragments()) with pytest.raises(pa.ArrowInvalid, match=match): From 14330416b6a92aa84848116b97ec06a64d24f3f0 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 29 Mar 2021 10:59:28 -0400 Subject: [PATCH 8/9] Use InMemoryDataset for writing --- python/pyarrow/_dataset.pyx | 26 ++++++++------------------ python/pyarrow/dataset.py | 7 +++---- python/pyarrow/tests/test_dataset.py | 2 +- 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 
a86e5521bc3..387471185a1 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2670,10 +2670,14 @@ def _get_partition_keys(Expression partition_expression): def _filesystemdataset_write( - data not None, object base_dir not None, str basename_template not None, - Schema schema not None, FileSystem filesystem not None, + Dataset data not None, + object base_dir not None, + str basename_template not None, + Schema schema not None, + FileSystem filesystem not None, Partitioning partitioning not None, - FileWriteOptions file_options not None, bint use_threads, + FileWriteOptions file_options not None, + bint use_threads, int max_partitions, ): """ @@ -2691,21 +2695,7 @@ def _filesystemdataset_write( c_options.max_partitions = max_partitions c_options.basename_template = tobytes(basename_template) - if isinstance(data, Dataset): - scanner = data._scanner(use_threads=use_threads) - else: - # data is list of batches/tables - for table in data: - if isinstance(table, Table): - for batch in table.to_batches(): - c_batches.push_back(( batch).sp_batch) - else: - c_batches.push_back(( table).sp_batch) - - data = Fragment.wrap(shared_ptr[CFragment]( - new CInMemoryFragment(move(c_batches), _true.unwrap()))) - - scanner = Scanner.from_fragment(data, schema, use_threads=use_threads) + scanner = data._scanner(use_threads=use_threads) c_scanner = ( scanner).unwrap() with nogil: diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 68afb5321ba..0c65070d872 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -734,12 +734,11 @@ def write_dataset(data, base_dir, basename_template=None, format=None, if isinstance(data, Dataset): schema = schema or data.schema - elif isinstance(data, (pa.Table, pa.RecordBatch)): - schema = schema or data.schema - data = [data] elif isinstance(data, (list, tuple)): schema = schema or data[0].schema - elif isinstance(data, pa.ipc.RecordBatchReader) or _is_iterable(data): + data = InMemoryDataset(data, schema=schema) + elif isinstance(data, (pa.RecordBatch, pa.ipc.RecordBatchReader, + pa.Table)) or _is_iterable(data): data = InMemoryDataset(data, schema=schema) schema = schema or data.schema else: diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ebd99922f9e..a7dd1520168 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1656,7 +1656,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): ds.InMemoryDataset([]) expected = ( - "Item has schema b: int64 which which does not match expected schema " + "Item has schema\nb: int64\nwhich does not match expected schema\n" "a: int64" ) with pytest.raises(TypeError, match=expected): From c63e2466b0d6b1250a39ad18eaae2a4e653da530 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 29 Mar 2021 16:42:21 -0400 Subject: [PATCH 9/9] Remove unused variable --- cpp/src/arrow/dataset/dataset_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index b2b82561359..1db96b8b5c3 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -112,7 +112,6 @@ TEST_F(TestInMemoryDataset, GetFragments) { TEST_F(TestInMemoryDataset, InMemoryFragment) { constexpr int64_t kBatchSize = 1024; - constexpr int64_t kNumberBatches = 16; SetSchema({field("i32", int32()), field("f64", float64())}); auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_);
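
Taken together, the series lets batches stream through write_dataset without first materializing a Table. A minimal end-to-end sketch (an illustration, not taken from the patches; the output directory name is a placeholder):

import pyarrow as pa
import pyarrow.dataset as ds

schema = pa.schema([("a", pa.int64())])

def generate_batches():
    # Yield four batches of ten rows each without holding them all in memory.
    for i in range(4):
        yield pa.RecordBatch.from_arrays(
            [pa.array(range(i * 10, (i + 1) * 10), type=pa.int64())],
            names=["a"])

# An iterable carries no schema of its own, so it must be given explicitly.
ds.write_dataset(generate_batches(), "streamed_dataset", schema=schema,
                 format="feather")

result = ds.dataset("streamed_dataset", format="ipc").to_table()
assert result.num_rows == 40
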