From 733a6f3a3a928122eaec38dd390ef10cc9faabf7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 11 May 2023 18:40:59 -0700 Subject: [PATCH 1/5] feat: sketch out a minimal protocol interface --- .../{dataset.py => dataset/__init__.py} | 0 python/pyarrow/dataset/protocol.py | 77 +++++++++++++++++++ 2 files changed, 77 insertions(+) rename python/pyarrow/{dataset.py => dataset/__init__.py} (100%) create mode 100644 python/pyarrow/dataset/protocol.py diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset/__init__.py similarity index 100% rename from python/pyarrow/dataset.py rename to python/pyarrow/dataset/__init__.py diff --git a/python/pyarrow/dataset/protocol.py b/python/pyarrow/dataset/protocol.py new file mode 100644 index 00000000000..4e4eb84106b --- /dev/null +++ b/python/pyarrow/dataset/protocol.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Protocol definitions for pyarrow.dataset + +These provide the abstract interface for a dataset. Other libraries may implement +this interface to expose their data, without having to extend PyArrow's classes. + +Applications and libraries that want to consume datasets should accept datasets +that implement these protocols, rather than requiring the specific +PyArrow classes. +""" +from abc import abstractmethod +from typing import Iterator, List, Optional, Protocol + +from pyarrow.dataset import Expression +from pyarrow import Table, IntegerArray, RecordBatch, RecordBatchReader, Schema + + +class Scanner(Protocol): + @abstractmethod + def count_rows(self) -> int: + ... + + @abstractmethod + def head(self, num_rows: int) -> Table: + ... + + @abstractmethod + def take(self, indices: IntegerArray) -> Table: + ... + + @abstractmethod + def to_table(self) -> Table: + ... + + @abstractmethod + def to_batches(self) -> Iterator[RecordBatch]: + ... + + @abstractmethod + def to_reader(self) -> RecordBatchReader: + ... + + +class Scannable(Protocol): + @abstractmethod + def scanner(self, columns: Optional[List[str]] = None, + filter: Optional[Expression] = None, **kwargs) -> Scanner: + ... + + @abstractmethod + def schema(self) -> Schema: + ... + + +class Fragment(Scannable): + ... + + +class Dataset(Scannable): + @abstractmethod + def get_fragments(self, filter: Optional[Expression] = None) -> Iterator[Fragment]: + ... 
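For illustration, a minimal producer written against the interface sketched in this first patch might look like the sketch below. The class names ``TableScanner`` and ``TableDataset`` are hypothetical, the code relies on duck typing rather than importing the proposed ``pyarrow.dataset.protocol`` module, and it only calls released PyArrow APIs (``Table.select``, ``Table.take``, ``RecordBatchReader.from_batches``).

.. code-block:: python

    from typing import Iterator, List, Optional

    import pyarrow as pa


    class TableScanner:
        """Scanner over an in-memory table, optionally projected to a column subset."""

        def __init__(self, table: pa.Table, columns: Optional[List[str]] = None):
            # Projection pushdown: keep only the requested columns.
            self._table = table.select(columns) if columns is not None else table

        def count_rows(self) -> int:
            return self._table.num_rows

        def head(self, num_rows: int) -> pa.Table:
            return self._table.slice(0, num_rows)

        def take(self, indices) -> pa.Table:
            return self._table.take(indices)

        def to_table(self) -> pa.Table:
            return self._table

        def to_batches(self) -> Iterator[pa.RecordBatch]:
            return iter(self._table.to_batches())

        def to_reader(self) -> pa.RecordBatchReader:
            return pa.RecordBatchReader.from_batches(
                self._table.schema, self._table.to_batches())


    class TableDataset:
        """A dataset whose only fragment is the in-memory table itself."""

        def __init__(self, table: pa.Table):
            self._table = table

        def schema(self) -> pa.Schema:
            return self._table.schema

        def scanner(self, columns: Optional[List[str]] = None,
                    filter=None, **kwargs) -> TableScanner:
            if filter is not None:
                # Filters must be applied fully; this toy producer cannot,
                # so it raises instead of returning unfiltered rows.
                raise NotImplementedError("filter pushdown is not supported here")
            return TableScanner(self._table, columns)

        def get_fragments(self, filter=None) -> Iterator["TableDataset"]:
            yield self  # a single fragment covering the whole table

A consumer that only knows the interface could then call, for example, ``TableDataset(pa.table({"x": [1, 2]})).scanner(columns=["x"]).to_reader()`` without knowing anything about the backing storage.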
From b81ca2dfe85e5ebd12198db54b29e227fcf1b972 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sun, 28 May 2023 13:47:03 -0700 Subject: [PATCH 2/5] wip: start a document --- docs/source/python/integration/dataset.rst | 90 ++++++++++++++++++++++ python/pyarrow/dataset/protocol.py | 11 ++- 2 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 docs/source/python/integration/dataset.rst diff --git a/docs/source/python/integration/dataset.rst b/docs/source/python/integration/dataset.rst new file mode 100644 index 00000000000..7e25144ce78 --- /dev/null +++ b/docs/source/python/integration/dataset.rst @@ -0,0 +1,90 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.dataset + +Extending PyArrow Datasets +========================== + +PyArrow provides a core protocol for datasets, so third-party libraries can both +produce and consume PyArrow datasets. + +Dataset Producers +----------------- + +If you are a library implementing a new data source, you'll want to be able to +produce a PyArrow-compatible dataset. Your dataset could be backed by the classes +implemented in PyArrow or you could implement your own classes. Either way, you +should implement the protocol below. + +When implementing the dataset, consider the following: + +* To scale to very large dataset, don't eagerly load all the fragments into memory. + Instead, load fragments once a filter is passed. This allows you to skip loading + metadata about fragments that aren't relevant to queries. For example, if you + have a dataset that uses Hive-style paritioning for a column ``date`` and the + user passes a filter for ``date=2023-01-01``, then you can skip listing directory + for HIVE partitions that don't match that date. +* Filters passed down should be fully executed. While other systems have scanners + that are "best-effort", only executing the parts of the filter that it can, PyArrow + datasets should always remove all rows that don't match the filter. + + +Dataset Consumers +----------------- + +If you are a query engine, you'll want to be able to +consume any PyArrow datasets. To make sure your integration is compatible +with any dataset, you should only call methods that are included in the +protocol. Dataset implementations provided by PyArrow implements additional +options and methods beyond those, but they should not be relied upon. + +There are two general patterns for consuming PyArrow datasets: reading a single +stream or reading a stream per fragment. + +If you have a streaming execution model, you can recieve a single stream +of data by calling ``dataset.scanner(filter=..., columns=...).to_reader()``. +This will return a RecordBatchReader, which can be exported over the +:ref:`C Stream Interface `. 
The record batches yield +from the stream can then be passed to worker threads for parallelism. + +If you are using a partition-based or distributed model, you can split the +dataset into fragments and then distribute those fragments into tasks that +create their own scanners and readers. In this case, the code looks more +like: + +.. code-block:: python + + fragments = list(dataset.get_fragments(filter=..., columns=...)) + + def scan_partition(i): + fragment = fragments[i] + scanner = fragment.scanner() + return reader = scanner.to_reader() + +Fragments are pickleable, so they can be passed to remote workers in a +distributed system. + +If your engine supports predicate (filter) and projection (column) pushdown, +you can pass those down to the dataset by passing them to the ``scanner``. + + +The protocol +------------ + +.. literalinclude:: ../../python/pyarrow/dataset/protocol.py + :language: python diff --git a/python/pyarrow/dataset/protocol.py b/python/pyarrow/dataset/protocol.py index 4e4eb84106b..14f783958b2 100644 --- a/python/pyarrow/dataset/protocol.py +++ b/python/pyarrow/dataset/protocol.py @@ -31,6 +31,7 @@ class Scanner(Protocol): + """A scanner implementation for a dataset.""" @abstractmethod def count_rows(self) -> int: ... @@ -43,13 +44,12 @@ def head(self, num_rows: int) -> Table: def take(self, indices: IntegerArray) -> Table: ... - @abstractmethod def to_table(self) -> Table: - ... + self.to_reader().read_all() - @abstractmethod def to_batches(self) -> Iterator[RecordBatch]: - ... + for batch in self.to_reader(): + yield batch @abstractmethod def to_reader(self) -> RecordBatchReader: @@ -68,6 +68,9 @@ def schema(self) -> Schema: class Fragment(Scannable): + """A fragment of a dataset. + + This class should be pickleable so that it can be used in a distrubuted scan.""" ... From 777031122ef3cf32c9eacaa64621681184e5462f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sun, 11 Jun 2023 18:30:02 -0700 Subject: [PATCH 3/5] add docs, diagram and tests --- docs/source/python/integration.rst | 1 + docs/source/python/integration/dataset.rst | 57 ++++++- .../integration/pyarrow_dataset_protocol.svg | 4 + python/pyarrow/dataset/protocol.py | 144 +++++++++++++++--- python/pyarrow/tests/test_dataset_protocol.py | 29 ++++ 5 files changed, 202 insertions(+), 33 deletions(-) create mode 100644 docs/source/python/integration/pyarrow_dataset_protocol.svg create mode 100644 python/pyarrow/tests/test_dataset_protocol.py diff --git a/docs/source/python/integration.rst b/docs/source/python/integration.rst index 997bc52102f..1c05b9f3e19 100644 --- a/docs/source/python/integration.rst +++ b/docs/source/python/integration.rst @@ -38,3 +38,4 @@ This allows to easily integrate PyArrow with other languages and technologies. integration/python_java integration/extending integration/cuda + integration/dataset diff --git a/docs/source/python/integration/dataset.rst b/docs/source/python/integration/dataset.rst index 7e25144ce78..9d5826f34a1 100644 --- a/docs/source/python/integration/dataset.rst +++ b/docs/source/python/integration/dataset.rst @@ -23,6 +23,39 @@ Extending PyArrow Datasets PyArrow provides a core protocol for datasets, so third-party libraries can both produce and consume PyArrow datasets. +The idea is that any library can have a method that returns their dataset as a +PyArrow dataset. Then, any query engine can consume that dataset and push down +filters and projections. + +.. 
image:: pyarrow_dataset_protocol.svg + :alt: A diagram showing the workflow for using the PyArrow Dataset protocol. + There are two flows shown, one for streams and one for tasks. The stream + case shows a linear flow from a producer class, to a dataset, to a + scanner, and finally to a RecordBatchReader. The tasks case shows a + similar diagram, except the dataset is split into fragments, which are + then distributed to tasks, which each create their own scanner and + RecordBatchReader. + +Producers are responsible for outputting a class that conforms to the protocol. + +Consumers are responsible for calling methods on the protocol to get the data +out of the dataset. The protocol supports getting data as a single stream or +as a series of tasks which may be distributed. + +From the perspective of a user, this looks something like + +.. code-block:: python + + dataset = producer_library.get_dataset(...) + df = consumer_library.read_dataset(dataset) + df.filter("x > 0").select("y") + +Here, the consumer would pass the filter ``x > 0`` and the projection of ``y`` down +to the producer through the dataset protocol. Thus, the user gets to enjoy the +performance benefits of pushing down filters and projections while being able +to specify those in their preferred query engine. + + Dataset Producers ----------------- @@ -33,15 +66,17 @@ should implement the protocol below. When implementing the dataset, consider the following: -* To scale to very large dataset, don't eagerly load all the fragments into memory. - Instead, load fragments once a filter is passed. This allows you to skip loading - metadata about fragments that aren't relevant to queries. For example, if you - have a dataset that uses Hive-style paritioning for a column ``date`` and the - user passes a filter for ``date=2023-01-01``, then you can skip listing directory - for HIVE partitions that don't match that date. * Filters passed down should be fully executed. While other systems have scanners that are "best-effort", only executing the parts of the filter that it can, PyArrow datasets should always remove all rows that don't match the filter. +* The API does not require that a dataset has metadata about all fragments + loaded into memory. Indeed, to scale to very large Datasets, don't eagerly + load all the fragment metadata into memory. Instead, load fragment metadata + once a filter is passed. This allows you to skip loading metadata about + fragments that aren't relevant to queries. For example, if you have a dataset + that uses Hive-style paritioning for a column ``date`` and the user passes a + filter for ``date=2023-01-01``, then you can skip listing directory for HIVE + partitions that don't match that date. Dataset Consumers @@ -51,7 +86,8 @@ If you are a query engine, you'll want to be able to consume any PyArrow datasets. To make sure your integration is compatible with any dataset, you should only call methods that are included in the protocol. Dataset implementations provided by PyArrow implements additional -options and methods beyond those, but they should not be relied upon. +options and methods beyond those, but they should not be relied upon without +checking for specific classes. There are two general patterns for consuming PyArrow datasets: reading a single stream or reading a stream per fragment. @@ -86,5 +122,10 @@ you can pass those down to the dataset by passing them to the ``scanner``. The protocol ------------ -.. 
literalinclude:: ../../python/pyarrow/dataset/protocol.py +This module can be imported starting in PyArrow ``13.0.0`` at +``pyarrow.dataset.protocol``. The protocol is defined with ``typing.Protocol`` +classes. They can be checked at runtime with ``isinstance`` but can also be +checked statically with Python type checkers like ``mypy``. + +.. literalinclude:: ../../../../python/pyarrow/dataset/protocol.py :language: python diff --git a/docs/source/python/integration/pyarrow_dataset_protocol.svg b/docs/source/python/integration/pyarrow_dataset_protocol.svg new file mode 100644 index 00000000000..7b6e464ca69 --- /dev/null +++ b/docs/source/python/integration/pyarrow_dataset_protocol.svg @@ -0,0 +1,4 @@ + + + +
[drawio SVG; markup lost in extraction. Text labels: "PyArrow Dataset Protocol" — Producer and Consumer lanes, a Stream flow (Producer class → Dataset → Scanner → RecordBatchReader) and a Tasks flow (Producer class → Dataset → Fragments, each with its own Scanner and RecordBatchReader).]
\ No newline at end of file diff --git a/python/pyarrow/dataset/protocol.py b/python/pyarrow/dataset/protocol.py index 14f783958b2..7380ff1eda1 100644 --- a/python/pyarrow/dataset/protocol.py +++ b/python/pyarrow/dataset/protocol.py @@ -22,59 +22,153 @@ Applications and libraries that want to consume datasets should accept datasets that implement these protocols, rather than requiring the specific PyArrow classes. + +See Extending PyArrow Datasets for more information: + +https://arrow.apache.org/docs/python/integration/dataset.html """ -from abc import abstractmethod -from typing import Iterator, List, Optional, Protocol +from abc import abstractmethod, abstractproperty +from typing import Iterator, List, Optional, Protocol, runtime_checkable from pyarrow.dataset import Expression -from pyarrow import Table, IntegerArray, RecordBatch, RecordBatchReader, Schema +from pyarrow import Table, RecordBatchReader, Schema +@runtime_checkable class Scanner(Protocol): - """A scanner implementation for a dataset.""" + """ + A scanner implementation for a dataset. + + This may be a scan of a whole dataset, or a scan of a single fragment. + """ @abstractmethod def count_rows(self) -> int: + """ + Count the number of rows in this dataset. + + Implementors may provide optimized code paths that compute this from metadata. + + Returns + ------- + int + The number of rows in the dataset. + """ ... - + @abstractmethod def head(self, num_rows: int) -> Table: - ... + """ + Get the first ``num_rows`` rows of the dataset. - @abstractmethod - def take(self, indices: IntegerArray) -> Table: + Parameters + ---------- + num_rows : int + The number of rows to return. + + Returns + ------- + Table + A table containing the first ``num_rows`` rows of the dataset. + """ ... - - def to_table(self) -> Table: - self.to_reader().read_all() - - def to_batches(self) -> Iterator[RecordBatch]: - for batch in self.to_reader(): - yield batch @abstractmethod def to_reader(self) -> RecordBatchReader: + """ + Create a Record Batch Reader for this scan. + + This is used to read the data in chunks. + + Returns + ------- + RecordBatchReader + """ ... +@runtime_checkable class Scannable(Protocol): @abstractmethod def scanner(self, columns: Optional[List[str]] = None, - filter: Optional[Expression] = None, **kwargs) -> Scanner: - ... - - @abstractmethod - def schema(self) -> Schema: + filter: Optional[Expression] = None, batch_size: Optional[int] = None, + use_threads: bool = True, + **kwargs) -> Scanner: + """Create a scanner for this dataset. + + Parameters + ---------- + columns : List[str], optional + Names of columns to include in the scan. If None, all columns are + included. + filter : Expression, optional + Filter expression to apply to the scan. If None, no filter is applied. + batch_size : int, optional + The number of rows to include in each batch. If None, the default + value is used. The default value is implementation specific. + use_threads : bool, default True + Whether to use multiple threads to read the rows. It is expected + that consumers reading a whole dataset in one scanner will keep this + as True, while consumers reading a single fragment per worker will + typically set this to False. + + Notes + ----- + The filters must be fully satisfied. If the dataset cannot satisfy the + filter, it should raise an error. + + Only the following expressions are allowed in the filter: + - Equality / inequalities (==, !=, <, >, <=, >=) + - Conjunctions (and, or) + - Field references (e.g. "a" or "a.b.c") + - Literals (e.g. 
1, 1.0, "a", True) + - cast + - is_null / not_null + - isin + - between + - negation (not) + + """ ... -class Fragment(Scannable): +@runtime_checkable +class Fragment(Scannable, Protocol): """A fragment of a dataset. - - This class should be pickleable so that it can be used in a distrubuted scan.""" + + This might be a partition, a file, a file chunk, etc. + + This class should be pickleable so that it can be used in a distributed scan.""" ... -class Dataset(Scannable): +@runtime_checkable +class Dataset(Scannable, Protocol): @abstractmethod - def get_fragments(self, filter: Optional[Expression] = None) -> Iterator[Fragment]: + def get_fragments( + self, + filter: Optional[Expression] = None, **kwargs + ) -> Iterator[Fragment]: + """Get the fragments of this dataset. + + Parameters + ---------- + filter : Expression, optional + Filter expression to use to prune which fragments are selected. + See Scannable.scanner for details on allowed filters. The filter is + just used to prune which fragments are selected. It does not need to + save the filter to apply to the scan. That is handled by the scanner. + **kwargs : dict + Additional arguments to pass to underlying implementation. + """ + ... + + @abstractproperty + def schema(self) -> Schema: + """ + Get the schema of this dataset. + + Returns + ------- + Schema + """ ... diff --git a/python/pyarrow/tests/test_dataset_protocol.py b/python/pyarrow/tests/test_dataset_protocol.py new file mode 100644 index 00000000000..f2567415492 --- /dev/null +++ b/python/pyarrow/tests/test_dataset_protocol.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test that PyArrow datasets conform to the protocol.""" +import pyarrow.dataset.protocol as protocol +import pyarrow.dataset as ds + + +def test_dataset_protocol(): + assert isinstance(ds.Dataset, protocol.Dataset) + assert isinstance(ds.Fragment, protocol.Fragment) + + assert isinstance(ds.Dataset, protocol.Scannable) + assert isinstance(ds.Fragment, protocol.Scannable) + + assert isinstance(ds.Scanner, protocol.Scanner) From 0f8a61cf165090244088e95dee762bf0f9953505 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sun, 11 Jun 2023 21:28:13 -0700 Subject: [PATCH 4/5] refinements --- docs/source/python/integration/dataset.rst | 35 ++++++++++++---------- python/pyarrow/dataset/protocol.py | 8 ++++- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/docs/source/python/integration/dataset.rst b/docs/source/python/integration/dataset.rst index 9d5826f34a1..849ae6d9272 100644 --- a/docs/source/python/integration/dataset.rst +++ b/docs/source/python/integration/dataset.rst @@ -15,21 +15,17 @@ .. specific language governing permissions and limitations .. under the License. -.. 
currentmodule:: pyarrow.dataset - Extending PyArrow Datasets ========================== PyArrow provides a core protocol for datasets, so third-party libraries can both -produce and consume PyArrow datasets. - -The idea is that any library can have a method that returns their dataset as a -PyArrow dataset. Then, any query engine can consume that dataset and push down -filters and projections. +produce and consume classes that conform to useful subset of the PyArrow dataset +API. This subset provides enough functionality to provide predicate and filter +pushdown. The subset of the API is contained in ``pyarrow.dataset.protocol``. .. image:: pyarrow_dataset_protocol.svg :alt: A diagram showing the workflow for using the PyArrow Dataset protocol. - There are two flows shown, one for streams and one for tasks. The stream + There are two flows shown, one for stream and one for tasks. The stream case shows a linear flow from a producer class, to a dataset, to a scanner, and finally to a RecordBatchReader. The tasks case shows a similar diagram, except the dataset is split into fragments, which are @@ -42,7 +38,7 @@ Consumers are responsible for calling methods on the protocol to get the data out of the dataset. The protocol supports getting data as a single stream or as a series of tasks which may be distributed. -From the perspective of a user, this looks something like +From the perspective of a user, the code looks like: .. code-block:: python @@ -68,7 +64,10 @@ When implementing the dataset, consider the following: * Filters passed down should be fully executed. While other systems have scanners that are "best-effort", only executing the parts of the filter that it can, PyArrow - datasets should always remove all rows that don't match the filter. + datasets should always remove all rows that don't match the filter. If the + implementation cannot execute the filter, it should raise an exception. A + limited set of expressions are allowed in these filters for the general + protocol. See the docstrings for ``Scannable`` below for details. * The API does not require that a dataset has metadata about all fragments loaded into memory. Indeed, to scale to very large Datasets, don't eagerly load all the fragment metadata into memory. Instead, load fragment metadata @@ -90,18 +89,17 @@ options and methods beyond those, but they should not be relied upon without checking for specific classes. There are two general patterns for consuming PyArrow datasets: reading a single -stream or reading a stream per fragment. +stream or creating a scan task per fragment. -If you have a streaming execution model, you can recieve a single stream +If you have a streaming execution model, you can receive a single stream of data by calling ``dataset.scanner(filter=..., columns=...).to_reader()``. This will return a RecordBatchReader, which can be exported over the :ref:`C Stream Interface `. The record batches yield from the stream can then be passed to worker threads for parallelism. -If you are using a partition-based or distributed model, you can split the -dataset into fragments and then distribute those fragments into tasks that -create their own scanners and readers. In this case, the code looks more -like: +If you are using a task-based model, you can split the dataset into fragments +and then distribute those fragments into tasks that create their own scanners +and readers. In this case, the code looks more like: .. code-block:: python @@ -117,6 +115,11 @@ distributed system. 
If your engine supports predicate (filter) and projection (column) pushdown, you can pass those down to the dataset by passing them to the ``scanner``. +Column pushdown is limited to selecting a subset of columns from the schema. +Some implementations, including PyArrow may also support projecting and +renaming columns, but this is not part of the protocol. Predicate pushdown +is limited to a subset of expressions. See the docstrings for ``Scannable`` +for the allowed expressions. The protocol diff --git a/python/pyarrow/dataset/protocol.py b/python/pyarrow/dataset/protocol.py index 7380ff1eda1..1f6657f916a 100644 --- a/python/pyarrow/dataset/protocol.py +++ b/python/pyarrow/dataset/protocol.py @@ -28,7 +28,13 @@ https://arrow.apache.org/docs/python/integration/dataset.html """ from abc import abstractmethod, abstractproperty -from typing import Iterator, List, Optional, Protocol, runtime_checkable +from typing import Iterator, List, Optional + +# TODO: remove once we drop support for Python 3.7 +if sys.version_info >= (3, 8): + from typing import Protocol, runtime_checkable +else: + from typing_extensions import Protocol, runtime_checkable from pyarrow.dataset import Expression from pyarrow import Table, RecordBatchReader, Schema From be648e2f601a2ea00f6178acfff962860eb008c3 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 3 Jul 2023 15:33:59 -0600 Subject: [PATCH 5/5] remove filters for now --- docs/source/python/integration/dataset.rst | 56 +++++++++------------- python/pyarrow/dataset/protocol.py | 50 ++++++------------- 2 files changed, 37 insertions(+), 69 deletions(-) diff --git a/docs/source/python/integration/dataset.rst b/docs/source/python/integration/dataset.rst index 849ae6d9272..4d12b89896a 100644 --- a/docs/source/python/integration/dataset.rst +++ b/docs/source/python/integration/dataset.rst @@ -18,9 +18,13 @@ Extending PyArrow Datasets ========================== +.. warn:: + + This protocol is currently experimental. + PyArrow provides a core protocol for datasets, so third-party libraries can both produce and consume classes that conform to useful subset of the PyArrow dataset -API. This subset provides enough functionality to provide predicate and filter +API. This subset provides enough functionality to provide projection pushdown. The subset of the API is contained in ``pyarrow.dataset.protocol``. .. image:: pyarrow_dataset_protocol.svg @@ -38,18 +42,24 @@ Consumers are responsible for calling methods on the protocol to get the data out of the dataset. The protocol supports getting data as a single stream or as a series of tasks which may be distributed. -From the perspective of a user, the code looks like: +As an example, from the perspective of the user this is what the code looks like +to retrieve a Delta Lake table as a dataset and use it in DuckDB: .. code-block:: python + :emphasize-lines: 2,6 + + from deltalake import DeltaTable + table = DeltaTable("path/to/table") + dataset = table.to_pyarrow_dataset() - dataset = producer_library.get_dataset(...) - df = consumer_library.read_dataset(dataset) - df.filter("x > 0").select("y") + import duckdb + df = duckdb.arrow(dataset) + df.project("y") -Here, the consumer would pass the filter ``x > 0`` and the projection of ``y`` down -to the producer through the dataset protocol. Thus, the user gets to enjoy the -performance benefits of pushing down filters and projections while being able -to specify those in their preferred query engine. 
+Here, the DuckDB would pass the the projection of ``y`` down to the producer +through the dataset protocol. The deltalake scanner would then only read the +column ``y``. Thus, the user gets to enjoy the performance benefits of pushing +down projections while being able to specify those in their preferred query engine. Dataset Producers @@ -60,24 +70,6 @@ produce a PyArrow-compatible dataset. Your dataset could be backed by the classe implemented in PyArrow or you could implement your own classes. Either way, you should implement the protocol below. -When implementing the dataset, consider the following: - -* Filters passed down should be fully executed. While other systems have scanners - that are "best-effort", only executing the parts of the filter that it can, PyArrow - datasets should always remove all rows that don't match the filter. If the - implementation cannot execute the filter, it should raise an exception. A - limited set of expressions are allowed in these filters for the general - protocol. See the docstrings for ``Scannable`` below for details. -* The API does not require that a dataset has metadata about all fragments - loaded into memory. Indeed, to scale to very large Datasets, don't eagerly - load all the fragment metadata into memory. Instead, load fragment metadata - once a filter is passed. This allows you to skip loading metadata about - fragments that aren't relevant to queries. For example, if you have a dataset - that uses Hive-style paritioning for a column ``date`` and the user passes a - filter for ``date=2023-01-01``, then you can skip listing directory for HIVE - partitions that don't match that date. - - Dataset Consumers ----------------- @@ -92,7 +84,7 @@ There are two general patterns for consuming PyArrow datasets: reading a single stream or creating a scan task per fragment. If you have a streaming execution model, you can receive a single stream -of data by calling ``dataset.scanner(filter=..., columns=...).to_reader()``. +of data by calling ``dataset.scanner(columns=...).to_reader()``. This will return a RecordBatchReader, which can be exported over the :ref:`C Stream Interface `. The record batches yield from the stream can then be passed to worker threads for parallelism. @@ -103,7 +95,7 @@ and readers. In this case, the code looks more like: .. code-block:: python - fragments = list(dataset.get_fragments(filter=..., columns=...)) + fragments = list(dataset.get_fragments(columns=...)) def scan_partition(i): fragment = fragments[i] @@ -113,13 +105,11 @@ and readers. In this case, the code looks more like: Fragments are pickleable, so they can be passed to remote workers in a distributed system. -If your engine supports predicate (filter) and projection (column) pushdown, +If your engine supports projection (column) pushdown, you can pass those down to the dataset by passing them to the ``scanner``. Column pushdown is limited to selecting a subset of columns from the schema. Some implementations, including PyArrow may also support projecting and -renaming columns, but this is not part of the protocol. Predicate pushdown -is limited to a subset of expressions. See the docstrings for ``Scannable`` -for the allowed expressions. +renaming columns, but this is not part of the protocol. 
The protocol diff --git a/python/pyarrow/dataset/protocol.py b/python/pyarrow/dataset/protocol.py index 1f6657f916a..d8696507ba0 100644 --- a/python/pyarrow/dataset/protocol.py +++ b/python/pyarrow/dataset/protocol.py @@ -23,11 +23,13 @@ that implement these protocols, rather than requiring the specific PyArrow classes. +The pyarrow.dataset.Dataset class itself implements this protocol. + See Extending PyArrow Datasets for more information: https://arrow.apache.org/docs/python/integration/dataset.html """ -from abc import abstractmethod, abstractproperty +from abc import abstractmethod from typing import Iterator, List, Optional # TODO: remove once we drop support for Python 3.7 @@ -50,21 +52,21 @@ class Scanner(Protocol): @abstractmethod def count_rows(self) -> int: """ - Count the number of rows in this dataset. + Count the number of rows in this dataset or fragment. Implementors may provide optimized code paths that compute this from metadata. Returns ------- int - The number of rows in the dataset. + The number of rows in the dataset or fragment. """ ... @abstractmethod def head(self, num_rows: int) -> Table: """ - Get the first ``num_rows`` rows of the dataset. + Get the first ``num_rows`` rows of the dataset or fragment. Parameters ---------- @@ -74,7 +76,7 @@ def head(self, num_rows: int) -> Table: Returns ------- Table - A table containing the first ``num_rows`` rows of the dataset. + A table containing the first ``num_rows`` rows of the dataset or fragment. """ ... @@ -96,7 +98,7 @@ def to_reader(self) -> RecordBatchReader: class Scannable(Protocol): @abstractmethod def scanner(self, columns: Optional[List[str]] = None, - filter: Optional[Expression] = None, batch_size: Optional[int] = None, + batch_size: Optional[int] = None, use_threads: bool = True, **kwargs) -> Scanner: """Create a scanner for this dataset. @@ -106,33 +108,14 @@ def scanner(self, columns: Optional[List[str]] = None, columns : List[str], optional Names of columns to include in the scan. If None, all columns are included. - filter : Expression, optional - Filter expression to apply to the scan. If None, no filter is applied. batch_size : int, optional The number of rows to include in each batch. If None, the default value is used. The default value is implementation specific. use_threads : bool, default True - Whether to use multiple threads to read the rows. It is expected - that consumers reading a whole dataset in one scanner will keep this + Whether to use multiple threads to read the rows. Often consumers + reading a whole dataset in one scanner will keep this as True, while consumers reading a single fragment per worker will - typically set this to False. - - Notes - ----- - The filters must be fully satisfied. If the dataset cannot satisfy the - filter, it should raise an error. - - Only the following expressions are allowed in the filter: - - Equality / inequalities (==, !=, <, >, <=, >=) - - Conjunctions (and, or) - - Field references (e.g. "a" or "a.b.c") - - Literals (e.g. 1, 1.0, "a", True) - - cast - - is_null / not_null - - isin - - between - - negation (not) - + set this to False. """ ... @@ -151,24 +134,19 @@ class Fragment(Scannable, Protocol): class Dataset(Scannable, Protocol): @abstractmethod def get_fragments( - self, - filter: Optional[Expression] = None, **kwargs + self, **kwargs ) -> Iterator[Fragment]: """Get the fragments of this dataset. Parameters ---------- - filter : Expression, optional - Filter expression to use to prune which fragments are selected. 
- See Scannable.scanner for details on allowed filters. The filter is - just used to prune which fragments are selected. It does not need to - save the filter to apply to the scan. That is handled by the scanner. **kwargs : dict Additional arguments to pass to underlying implementation. """ ... - @abstractproperty + @property + @abstractmethod def schema(self) -> Schema: """ Get the schema of this dataset.
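For reference, a consumer written purely against the final version of the protocol (after filters were removed in patch 5) might look like the following sketch. The helper names ``process_batch``, ``read_single_stream``, ``read_fragment`` and ``read_per_fragment`` are illustrative, not PyArrow API; the only calls made on the producer's objects are the protocol methods ``scanner(columns=..., use_threads=...)``, ``to_reader()`` and ``get_fragments()``.

.. code-block:: python

    from typing import List

    import pyarrow as pa


    def process_batch(batch: pa.RecordBatch) -> None:
        """Stand-in for whatever the engine actually does with a batch."""
        print(batch.num_rows)


    def read_single_stream(dataset, columns: List[str]) -> None:
        """Streaming model: one reader for the whole dataset."""
        reader = dataset.scanner(columns=columns).to_reader()
        for batch in reader:
            process_batch(batch)


    def read_fragment(fragment, columns: List[str]) -> pa.RecordBatchReader:
        """Task model: each (pickleable) fragment is scanned by its own worker.

        use_threads=False because parallelism comes from distributing fragments.
        """
        return fragment.scanner(columns=columns, use_threads=False).to_reader()


    def read_per_fragment(dataset, columns: List[str]) -> None:
        for fragment in dataset.get_fragments():
            for batch in read_fragment(fragment, columns):
                process_batch(batch)

Note that the column projection is passed to ``scanner()``; in the final protocol ``get_fragments()`` only accepts ``**kwargs``. Because ``pyarrow.dataset.Dataset`` itself is meant to satisfy the protocol, these helpers should work unchanged on a dataset such as ``pyarrow.dataset.dataset("path/to/data")``.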