diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 00eff7ae03b..82a91b01604 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -28,4 +28,9 @@ if (ARROW_PARQUET AND ARROW_DATASET) EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(dataset-parquet-scan-example parquet) + +endif() + +if (ARROW_CSV AND ARROW_S3) + ADD_ARROW_EXAMPLE(csv-reader-example) endif() diff --git a/cpp/examples/arrow/csv-reader-example.cc b/cpp/examples/arrow/csv-reader-example.cc new file mode 100644 index 00000000000..9922c140bcc --- /dev/null +++ b/cpp/examples/arrow/csv-reader-example.cc @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/util/task_group.h" +#include "arrow/util/thread_pool.h" + +#include +#include +#include + +// const int NUM_FILES = 20; +const int NUM_FILES = 5; + +arrow::csv::ReadOptions CreateThreadedSyncReadOptions() { + auto result = arrow::csv::ReadOptions::Defaults(); + result.use_threads = true; + return result; +} + +arrow::csv::ReadOptions CreateSerialSyncReadOptions() { + auto result = arrow::csv::ReadOptions::Defaults(); + result.use_threads = false; + return result; +} + +arrow::csv::ReadOptions CreateThreadedAsyncReadOptions() { + auto result = arrow::csv::ReadOptions::Defaults(); + result.use_threads = true; + result.read_async = true; + return result; +} + +arrow::csv::ParseOptions CreateParseOptions() { + auto result = arrow::csv::ParseOptions::Defaults(); + return result; +} + +arrow::csv::ConvertOptions CreateConvertOptions() { + auto result = arrow::csv::ConvertOptions::Defaults(); + return result; +} + +arrow::Status DoReadFile(std::shared_ptr table_reader, + std::shared_ptr input_stream, + int file_index) { + std::cout << "File At Index (" << file_index << ") START" << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + auto table = *table_reader->Read(); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + std::cout << "File At Index (" << file_index << ") " << duration.count() << std::endl; + return arrow::Status::OK(); +} + +arrow::Future DoReadFileAsync( + std::shared_ptr table_reader, + std::shared_ptr input_stream, int file_index) { + std::cout << "File At Index (" << file_index << ") START" << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + auto table_future = table_reader->ReadAsync(); + return table_future.Then( + [start, file_index](const arrow::Result>& table) { + auto end = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(end - start); + std::cout << "File At Index (" << file_index << ") " << duration.count() + << 
std::endl; + return arrow::Status::OK(); + }); +} + +double DoReadFiles(arrow::MemoryPool* memory_pool, + std::shared_ptr fs, + std::shared_ptr task_group, + arrow::csv::ReadOptions read_options, std::string bucket_name) { + // TODO: Get rid of these "keepalives" + std::vector> input_streams; + std::vector> table_readers; + + auto total_start = std::chrono::high_resolution_clock::now(); + auto total_end = std::chrono::high_resolution_clock::now(); + auto total_duration = + std::chrono::duration_cast(total_end - total_start); + + total_start = std::chrono::high_resolution_clock::now(); + for (int file_index = 0; file_index < NUM_FILES; file_index++) { + std::shared_ptr input_stream; + if (fs->type_name() == "s3") { + input_stream = + *fs->OpenInputStream(bucket_name + "/" + std::to_string(file_index) + ".csv"); + } else { + input_stream = *fs->OpenInputStream("/home/ubuntu/datasets/arrow/csv/" + + std::to_string(file_index) + ".csv"); + } + input_streams.push_back(input_stream); + auto table_reader = + *arrow::csv::TableReader::Make(memory_pool, input_stream, read_options, + CreateParseOptions(), CreateConvertOptions()); + table_readers.push_back(table_reader); + if (read_options.read_async) { + task_group->Append(DoReadFileAsync(table_reader, input_stream, file_index)); + } else { + task_group->Append([table_reader, input_stream, file_index] { + return DoReadFile(table_reader, input_stream, file_index); + }); + } + } + auto final_status = task_group->Finish(); + if (!final_status.ok()) { + std::cout << "Method failed. (err=" << final_status.message() << ")" << std::endl; + } + total_end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(total_end - total_start); + return total_duration.count() / static_cast(NUM_FILES); +} + +int main(int argc, char** argv) { + auto* thread_pool = arrow::internal::GetCpuThreadPool(); + std::cout << "Num threads: " << std::thread::hardware_concurrency() << std::endl; + auto memory_pool = arrow::default_memory_pool(); + auto access_key = std::getenv("S3_ACCESS_KEY_ID"); + auto access_secret = std::getenv("S3_ACCESS_KEY_SECRET"); + auto aws_region = std::getenv("S3_REGION"); + auto aws_bucket_name = std::getenv("S3_BUCKET_NAME"); + arrow::fs::S3GlobalOptions options; + options.log_level = arrow::fs::S3LogLevel::Fatal; + if (!InitializeS3(options).ok()) { + std::cout << "Error initializing S3 subsystem" << std::endl; + return -1; + } + // auto fs = + // std::make_shared(arrow::fs::LocalFileSystemOptions()); + auto s3_options = arrow::fs::S3Options::FromAccessKey(access_key, access_secret); + s3_options.region = aws_region; + auto fs = *arrow::fs::S3FileSystem::Make(s3_options); + double avg_duration = 0; + + // std::cout << "Serial outer loop threaded inner loop file I/O on thread pool" << + // std::endl; avg_duration = DoReadFiles(memory_pool, fs, + // arrow::internal::TaskGroup::MakeSerial(), CreateThreadedSyncReadOptions(), + // aws_bucket_name); std::cout << " Finished reading in all files (avg=" << + // avg_duration << ")" << std::endl; + + // std::cout << "Threaded outer loop threaded inner loop file I/O on thread pool + // (FAILS)" << std::endl; DoReadFiles(thread_pool, memory_pool, fs, + // arrow::internal::TaskGroup::MakeThreaded(), CreateThreadedSyncReadOptions(), + // aws_bucket_name); std::cout << " Finished reading in all files (avg=" << + // (total_duration.count() / static_cast(NUM_FILES)) << ")" << std::endl; + + std::cout << "Threaded outer loop serial inner loop file I/O on thread pool" + << std::endl; 
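+  // One task per file is submitted to the CPU thread pool, while each TableReader
+  // itself reads serially (use_threads = false).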
+ avg_duration = + DoReadFiles(memory_pool, fs, arrow::internal::TaskGroup::MakeThreaded(thread_pool), + CreateSerialSyncReadOptions(), aws_bucket_name); + std::cout << " Finished reading in all files (avg=" << avg_duration << ")" + << std::endl; + + std::cout << "Composable futures method (threaded outer, threaded inner)" << std::endl; + avg_duration = + DoReadFiles(memory_pool, fs, arrow::internal::TaskGroup::MakeThreaded(thread_pool), + CreateThreadedAsyncReadOptions(), aws_bucket_name); + std::cout << " Finished reading in all files (avg=" << avg_duration << ")" + << std::endl; + + std::cout << "Composable futures method (serial outer, threaded inner)" << std::endl; + avg_duration = DoReadFiles(memory_pool, fs, arrow::internal::TaskGroup::MakeSerial(), + CreateThreadedAsyncReadOptions(), aws_bucket_name); + std::cout << " Finished reading in all files (avg=" << avg_duration << ")" + << std::endl; + + // input_streams.clear(); + // table_readers.clear(); + // total_start = std::chrono::high_resolution_clock::now(); + // auto futures_task_group = arrow::internal::TaskGroup::MakeThreaded(thread_pool); + // for (int file_index = 0; file_index < NUM_FILES; file_index++) { + // auto input_stream = *fs->OpenInputStream("/home/ubuntu/datasets/arrow/csv/" + + // std::to_string(file_index) + ".csv"); auto table_reader = + // *arrow::csv::TableReader::Make(memory_pool, input_stream, + // CreateAsyncIOReadOptions(), CreateParseOptions(), CreateConvertOptions()); auto + // start = std::chrono::high_resolution_clock::now(); + // input_streams.push_back(input_stream); + // table_readers.push_back(table_reader); + // auto read_table_future = table_reader->ReadAsync(); + // read_table_future.Then([start, file_index] (const + // arrow::Result>& result) { + // auto end = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end - + // start); if (result.ok()) { + // auto table = result.ValueUnsafe(); + // std::cout << "Finished reading file with " << table->num_rows() << " rows (" << + // file_index << ") " << duration.count() << std::endl; + // } + // }); + // futures_task_group->Append(read_table_future); + // } + // futures_task_group->Finish(); + + // total_end = std::chrono::high_resolution_clock::now(); + // total_duration = std::chrono::duration_cast(total_end - + // total_start); std::cout << " Finished reading in all files (avg=" << + // (total_duration.count() / static_cast(NUM_FILES)) << ")" << std::endl; + + return EXIT_SUCCESS; +} diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index e94f5fc9653..7d7a170db73 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -118,6 +118,8 @@ struct ARROW_EXPORT ReadOptions { /// Whether to use the global CPU thread pool bool use_threads = true; + /// Whether to read in an async fashion + bool read_async = false; /// Block size we request from the IO layer; also determines the size of /// chunks when use_threads is true int32_t block_size = 1 << 20; // 1 MB diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index cf5047aaf16..e3fda8191d5 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -256,6 +256,58 @@ class ThreadedBlockReader : public BlockReader { } }; +// An object that reads delimited CSV blocks for threaded use. 
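+// Buffers are pushed into Next(), which yields at most one chunked CSVBlock per
+// call (and none once the end of the stream has been seen).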
+// It's pretty much the same as the threaded block reader but operates +// on a push instead of a pull basis +class AsyncBlockReader { + public: + AsyncBlockReader(std::unique_ptr chunker, std::shared_ptr first_buffer) + : chunker_(std::move(chunker)), + partial_(std::make_shared("")), + buffer_(std::move(first_buffer)) {} + + Result> Next(std::shared_ptr next_buffer) { + if (buffer_ == nullptr) { + // EOF + return util::optional(); + } + + std::shared_ptr whole, completion, next_partial; + bool is_final = (next_buffer == nullptr); + + auto current_partial = std::move(partial_); + auto current_buffer = std::move(buffer_); + + if (is_final) { + // End of file reached => compute completion from penultimate block + RETURN_NOT_OK( + chunker_->ProcessFinal(current_partial, current_buffer, &completion, &whole)); + } else { + // Get completion of partial from previous block. + std::shared_ptr starts_with_whole; + // Get completion of partial from previous block. + RETURN_NOT_OK(chunker_->ProcessWithPartial(current_partial, current_buffer, + &completion, &starts_with_whole)); + + // Get a complete CSV block inside `partial + block`, and keep + // the rest for the next iteration. + RETURN_NOT_OK(chunker_->Process(starts_with_whole, &whole, &next_partial)); + } + + partial_ = std::move(next_partial); + buffer_ = std::move(next_buffer); + + return CSVBlock{current_partial, completion, whole, block_index_++, is_final, {}}; + } + + protected: + std::unique_ptr chunker_; + std::shared_ptr partial_, buffer_; + int64_t block_index_ = 0; + // Whether there was a trailing CR at the end of last received buffer + bool trailing_cr_ = false; +}; + ///////////////////////////////////////////////////////////////////////// // Base class for common functionality @@ -449,7 +501,6 @@ class ReaderMixin { ConversionSchema conversion_schema_; std::shared_ptr input_; - Iterator> buffer_iterator_; std::shared_ptr task_group_; }; @@ -714,6 +765,7 @@ class SerialStreamingReader : public BaseStreamingReader { bool source_eof_ = false; int64_t last_block_index_ = 0; std::shared_ptr block_reader_; + Iterator> buffer_iterator_; }; ///////////////////////////////////////////////////////////////////////// @@ -765,6 +817,13 @@ class SerialTableReader : public BaseTableReader { RETURN_NOT_OK(task_group_->Finish()); return MakeTable(); } + + Future> ReadAsync() override { + return Future>::MakeFinished(Read()); + } + + protected: + Iterator> buffer_iterator_; }; ///////////////////////////////////////////////////////////////////////// @@ -836,8 +895,110 @@ class ThreadedTableReader : public BaseTableReader { return MakeTable(); } + Future> ReadAsync() override { + return Future>::MakeFinished(Read()); + } + protected: ThreadPool* thread_pool_; + Iterator> buffer_iterator_; +}; + +class AsyncTableReader : public BaseTableReader, + public std::enable_shared_from_this { + public: + using BaseTableReader::BaseTableReader; + + AsyncTableReader(MemoryPool* pool, std::shared_ptr input, + const ReadOptions& read_options, const ParseOptions& parse_options, + const ConvertOptions& convert_options, ThreadPool* thread_pool) + : BaseTableReader(pool, input, read_options, parse_options, convert_options), + thread_pool_(thread_pool) {} + + ~AsyncTableReader() override {} + + Status Init() override { + ARROW_ASSIGN_OR_RAISE(auto istream_it, + io::MakeInputStreamIterator(input_, read_options_.block_size)); + auto sync_buffer_iterator = CSVBufferIterator::Make(std::move(istream_it)); + + int32_t block_queue_size = thread_pool_->GetCapacity(); + 
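// Wrap the synchronous buffer iterator in an async readahead iterator so up to
+    // `block_queue_size` blocks can be fetched ahead of the parsing tasks.
+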
ARROW_ASSIGN_OR_RAISE( + buffer_iterator_, + MakeAsyncReadaheadIterator(std::move(sync_buffer_iterator), block_queue_size)); + return Status::OK(); + } + + Result> Read() override { + auto future = ReadAsync(); + future.Wait(); + return future.result(); + } + + Future> ReadAsync() override { + task_group_ = internal::TaskGroup::MakeThreaded(thread_pool_); + + // Read in the first block before multi-threading the rest + task_group_->Append(buffer_iterator_.NextFuture().Then( + [this](const Result>& first_buffer_res) { + // TODO: Should not have to do this, Then should allow me to take in + // std::shared_ptr, why take in result if second parameter to then is + // false? + ARROW_ASSIGN_OR_RAISE(auto first_buffer, first_buffer_res); + if (first_buffer == IterationTraits>::End()) { + return Status::Invalid("Empty CSV file"); + } + RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer)); + RETURN_NOT_OK(MakeColumnBuilders()); + AsyncBlockReader* block_reader = + new AsyncBlockReader(MakeChunker(parse_options_), std::move(first_buffer)); + std::function& buffer)> callback = + [this, block_reader](const std::shared_ptr& buffer) { + ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_reader->Next(buffer)); + if (!maybe_block.has_value()) { + // EOF, nothing to do for this block + return Status::OK(); + } + // Launch parse task + task_group_->Append([this, maybe_block] { + return ParseAndInsert(maybe_block->partial, maybe_block->completion, + maybe_block->buffer, maybe_block->block_index, + maybe_block->is_final) + .status(); + }); + return Status::OK(); + }; + auto for_each_future = + async::AsyncForEach(std::move(buffer_iterator_), callback); + // The block reader needs to be pumped one last time. The iterators pump this + // with the end signal but AsyncForEach does not pass that call on so we pump it + // here. 
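// Pumping Next(nullptr) makes the chunker flush any trailing partial data as the final block.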
We can also delete the block reader at this point + task_group_->Append(for_each_future.Then( + [this, block_reader](const Status& status) { + // TODO Check status and delete block_reader no matter what + ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_reader->Next(nullptr)); + if (maybe_block.has_value()) { + task_group_->Append([this, maybe_block] { + return ParseAndInsert(maybe_block->partial, maybe_block->completion, + maybe_block->buffer, maybe_block->block_index, + maybe_block->is_final) + .status(); + }); + } + delete block_reader; + return Status::OK(); + }, + true)); + return Status::OK(); + })); + + return task_group_->FinishAsync().Then( + [this](const Status& status) { return MakeTable(); }); + } + + protected: + ThreadPool* thread_pool_; + AsyncReadaheadIterator> buffer_iterator_; }; ///////////////////////////////////////////////////////////////////////// @@ -849,9 +1010,17 @@ Result> TableReader::Make( const ConvertOptions& convert_options) { std::shared_ptr reader; if (read_options.use_threads) { - reader = std::make_shared( - pool, input, read_options, parse_options, convert_options, GetCpuThreadPool()); + if (read_options.read_async) { + reader = std::make_shared( + pool, input, read_options, parse_options, convert_options, GetCpuThreadPool()); + } else { + reader = std::make_shared( + pool, input, read_options, parse_options, convert_options, GetCpuThreadPool()); + } } else { + if (read_options.read_async) { + return Status::Invalid("There is no support for a serial async reader right now"); + } reader = std::make_shared(pool, input, read_options, parse_options, convert_options); } diff --git a/cpp/src/arrow/csv/reader.h b/cpp/src/arrow/csv/reader.h index 652cedc8c74..1ab54c80336 100644 --- a/cpp/src/arrow/csv/reader.h +++ b/cpp/src/arrow/csv/reader.h @@ -24,6 +24,7 @@ #include "arrow/result.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/future.h" #include "arrow/util/visibility.h" namespace arrow { @@ -41,6 +42,9 @@ class ARROW_EXPORT TableReader { /// Read the entire CSV file and convert it to a Arrow Table virtual Result> Read() = 0; + /// Same as Read() but return a future + virtual Future> ReadAsync() = 0; + /// Create a TableReader instance static Result> Make(MemoryPool* pool, std::shared_ptr input, diff --git a/cpp/src/arrow/flight/flight_benchmark.cc b/cpp/src/arrow/flight/flight_benchmark.cc index 6180e05cbfd..4792fd5be06 100644 --- a/cpp/src/arrow/flight/flight_benchmark.cc +++ b/cpp/src/arrow/flight/flight_benchmark.cc @@ -273,7 +273,7 @@ Status RunPerformanceTest(FlightClient* client, bool test_put) { // } ARROW_ASSIGN_OR_RAISE(auto pool, ThreadPool::Make(FLAGS_num_threads)); - std::vector> tasks; + std::vector> tasks; for (const auto& endpoint : plan->endpoints()) { ARROW_ASSIGN_OR_RAISE(auto task, pool->Submit(ConsumeStream, endpoint)); tasks.push_back(std::move(task)); diff --git a/cpp/src/arrow/io/caching.cc b/cpp/src/arrow/io/caching.cc index b418d2cc079..a306ca7d286 100644 --- a/cpp/src/arrow/io/caching.cc +++ b/cpp/src/arrow/io/caching.cc @@ -158,7 +158,7 @@ ReadRangeCache::ReadRangeCache(std::shared_ptr file, AsyncCont impl_->options = options; } -ReadRangeCache::~ReadRangeCache() {} +ReadRangeCache::~ReadRangeCache() = default; Status ReadRangeCache::Cache(std::vector ranges) { ranges = internal::CoalesceReadRanges(std::move(ranges), impl_->options.hole_size_limit, diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 3664ac19d93..dca707c26f4 100644 --- a/cpp/src/arrow/io/hdfs.h +++ 
b/cpp/src/arrow/io/hdfs.h @@ -111,8 +111,8 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { Status MakeDirectory(const std::string& path) override; // Delete file or directory - // @param path: absolute path to data - // @param recursive: if path is a directory, delete contents as well + // @param path absolute path to data + // @param recursive if path is a directory, delete contents as well // @returns error status on failure Status Delete(const std::string& path, bool recursive = false); @@ -188,9 +188,9 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { // FileMode::WRITE options // @param path complete file path - // @param buffer_size, 0 for default - // @param replication, 0 for default - // @param default_block_size, 0 for default + // @param buffer_size 0 by default + // @param replication 0 by default + // @param default_block_size 0 by default Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, int16_t replication, int64_t default_block_size, std::shared_ptr* file); diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc index 7692097b0d7..309d487c52c 100644 --- a/cpp/src/arrow/io/interfaces.cc +++ b/cpp/src/arrow/io/interfaces.cc @@ -128,13 +128,9 @@ Future> RandomAccessFile::ReadAsync(const AsyncContext& TaskHints hints; hints.io_size = nbytes; hints.external_id = ctx.external_id; - auto maybe_fut = ctx.executor->Submit(std::move(hints), [self, position, nbytes] { + return DeferNotOk(ctx.executor->Submit(std::move(hints), [self, position, nbytes] { return self->ReadAt(position, nbytes); - }); - if (!maybe_fut.ok()) { - return Future>::MakeFinished(maybe_fut.status()); - } - return *std::move(maybe_fut); + })); } // Default WillNeed() implementation: no-op diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 9c43b324f08..eac08919286 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -18,6 +18,7 @@ #pragma once +#include #include #include #include @@ -423,6 +424,8 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable> { void Destroy() { if (ARROW_PREDICT_TRUE(status_.ok())) { + static_assert(offsetof(Result, status_) == 0, + "Status is guaranteed to be at the start of Result<>"); internal::launder(reinterpret_cast(&data_))->~T(); } } @@ -448,6 +451,9 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable> { /// WARNING: ARROW_ASSIGN_OR_RAISE expands into multiple statements; /// it cannot be used in a single statement (e.g. as the body of an if /// statement without {})! +/// +/// WARNING: ARROW_ASSIGN_OR_RAISE `std::move`s its right operand. If you have +/// an lvalue Result which you *don't* want to move out of cast appropriately. #define ARROW_ASSIGN_OR_RAISE(lhs, rexpr) \ ARROW_ASSIGN_OR_RAISE_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \ lhs, rexpr); diff --git a/cpp/src/arrow/result_test.cc b/cpp/src/arrow/result_test.cc index 8ecf5037f0b..b71af9d8531 100644 --- a/cpp/src/arrow/result_test.cc +++ b/cpp/src/arrow/result_test.cc @@ -34,10 +34,10 @@ namespace { using ::testing::Eq; StatusCode kErrorCode = StatusCode::Invalid; -constexpr char kErrorMessage[] = "Invalid argument"; +constexpr const char* kErrorMessage = "Invalid argument"; const int kIntElement = 42; -constexpr char kStringElement[] = +constexpr const char* kStringElement = "The Answer to the Ultimate Question of Life, the Universe, and Everything"; // A data type without a default constructor. 
@@ -46,6 +46,10 @@ struct Foo { std::string baz; explicit Foo(int value) : bar(value), baz(kStringElement) {} + + bool operator==(const Foo& other) const { + return (bar == other.bar) && (baz == other.baz); + } }; // A data type with only copy constructors. @@ -59,7 +63,7 @@ struct CopyOnlyDataType { }; struct ImplicitlyCopyConvertible { - ImplicitlyCopyConvertible(const CopyOnlyDataType& co) // NOLINT(runtime/explicit) + ImplicitlyCopyConvertible(const CopyOnlyDataType& co) // NOLINT runtime/explicit : copy_only(co) {} CopyOnlyDataType copy_only; @@ -72,9 +76,9 @@ struct MoveOnlyDataType { MoveOnlyDataType(const MoveOnlyDataType& other) = delete; MoveOnlyDataType& operator=(const MoveOnlyDataType& other) = delete; - MoveOnlyDataType(MoveOnlyDataType&& other) { MoveFrom(other); } + MoveOnlyDataType(MoveOnlyDataType&& other) { MoveFrom(&other); } MoveOnlyDataType& operator=(MoveOnlyDataType&& other) { - MoveFrom(other); + MoveFrom(&other); return *this; } @@ -87,17 +91,17 @@ struct MoveOnlyDataType { } } - void MoveFrom(MoveOnlyDataType& other) { + void MoveFrom(MoveOnlyDataType* other) { Destroy(); - data = other.data; - other.data = nullptr; + data = other->data; + other->data = nullptr; } int* data = nullptr; }; struct ImplicitlyMoveConvertible { - ImplicitlyMoveConvertible(MoveOnlyDataType&& mo) // NOLINT(runtime/explicit) + ImplicitlyMoveConvertible(MoveOnlyDataType&& mo) // NOLINT runtime/explicit : move_only(std::move(mo)) {} MoveOnlyDataType move_only; @@ -128,6 +132,10 @@ struct HeapAllocatedObject { } ~HeapAllocatedObject() { delete value; } + + bool operator==(const HeapAllocatedObject& other) const { + return *value == *other.value; + } }; // Constructs a Foo. @@ -165,14 +173,6 @@ struct StringVectorCtor { std::vector operator()() { return {kStringElement, kErrorMessage}; } }; -bool operator==(const Foo& lhs, const Foo& rhs) { - return (lhs.bar == rhs.bar) && (lhs.baz == rhs.baz); -} - -bool operator==(const HeapAllocatedObject& lhs, const HeapAllocatedObject& rhs) { - return *lhs.value == *rhs.value; -} - // Returns an rvalue reference to the Result object pointed to by // |result|. 
template @@ -184,9 +184,8 @@ Result&& MoveResult(Result* result) { template class ResultTest : public ::testing::Test {}; -typedef ::testing::Types - TestTypes; +using TestTypes = ::testing::Types; TYPED_TEST_SUITE(ResultTest, TestTypes); @@ -715,5 +714,15 @@ TEST(ResultTest, Equality) { } } +TEST(ResultTest, ViewAsStatus) { + Result ok(3); + Result err(Status::Invalid("error")); + + auto ViewAsStatus = [](const void* ptr) { return static_cast(ptr); }; + + EXPECT_EQ(ViewAsStatus(&ok), &ok.status()); + EXPECT_EQ(ViewAsStatus(&err), &err.status()); +} + } // namespace } // namespace arrow diff --git a/cpp/src/arrow/util/future.cc b/cpp/src/arrow/util/future.cc index c56f6166e62..de8c99a3b47 100644 --- a/cpp/src/arrow/util/future.cc +++ b/cpp/src/arrow/util/future.cc @@ -73,7 +73,7 @@ class FutureWaiterImpl : public FutureWaiter { } } - ~FutureWaiterImpl() { + ~FutureWaiterImpl() override { for (auto future : futures_) { future->RemoveWaiter(this); } @@ -174,9 +174,9 @@ FutureWaiterImpl* GetConcreteWaiter(FutureWaiter* waiter) { } // namespace -FutureWaiter::FutureWaiter() {} +FutureWaiter::FutureWaiter() = default; -FutureWaiter::~FutureWaiter() {} +FutureWaiter::~FutureWaiter() = default; std::unique_ptr FutureWaiter::Make(Kind kind, std::vector futures) { @@ -225,9 +225,30 @@ class ConcreteFutureImpl : public FutureImpl { waiter_ = nullptr; } - void DoMarkFinished() { DoMarkFinishedOrFailed(FutureState::SUCCESS); } + void DoMarkFinished() { + DoMarkFinishedOrFailed(FutureState::SUCCESS); + RunCallbacks(); + } - void DoMarkFailed() { DoMarkFinishedOrFailed(FutureState::FAILURE); } + void DoMarkFailed() { + DoMarkFinishedOrFailed(FutureState::FAILURE); + RunCallbacks(); + } + + void AddCallback(Callback callback) { + std::unique_lock lock(mutex_); + if (IsFutureFinished(state_)) { + callback(); + } else { + callbacks_.push_back(std::move(callback)); + } + } + + void RunCallbacks() const { + for (const auto& callback : callbacks_) { + callback(); + } + } void DoMarkFinishedOrFailed(FutureState state) { { @@ -277,6 +298,12 @@ std::unique_ptr FutureImpl::Make() { return std::unique_ptr(new ConcreteFutureImpl()); } +std::unique_ptr FutureImpl::MakeFinished(FutureState state) { + std::unique_ptr ptr(new ConcreteFutureImpl()); + ptr->state_ = state; + return std::move(ptr); +} + FutureImpl::FutureImpl() : state_(FutureState::PENDING) {} FutureState FutureImpl::SetWaiter(FutureWaiter* w, int future_num) { @@ -295,4 +322,8 @@ void FutureImpl::MarkFinished() { GetConcreteFuture(this)->DoMarkFinished(); } void FutureImpl::MarkFailed() { GetConcreteFuture(this)->DoMarkFailed(); } +void FutureImpl::AddCallback(Callback callback) { + GetConcreteFuture(this)->AddCallback(std::move(callback)); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/future.h b/cpp/src/arrow/util/future.h index 575f5cb3c41..a3625810eec 100644 --- a/cpp/src/arrow/util/future.h +++ b/cpp/src/arrow/util/future.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -31,17 +32,80 @@ namespace arrow { +namespace detail { + +struct Empty { + static Result ToResult(Status s) { + if (ARROW_PREDICT_TRUE(s.ok())) { + return Empty{}; + } + return s; + } + + template + using EnableIfSame = typename std::enable_if::value>::type; +}; + +} // namespace detail +class FutureWaiter; +template +class Future; +template +class FutureStorage; + /// A Future's execution or completion status enum class FutureState : int8_t { PENDING, SUCCESS, FAILURE }; +using Callback = std::function; + inline bool 
IsFutureFinished(FutureState state) { return state != FutureState::PENDING; } -// --------------------------------------------------------------------- -// Type-erased helpers +template +struct is_future : std::false_type {}; -class FutureWaiter; template -class Future; +struct is_future> : std::true_type {}; + +template +using result_of_t = typename std::result_of::type; + +namespace detail { + +template +struct ContinuedFutureImpl; + +template <> +struct ContinuedFutureImpl { + using type = Future<>; +}; + +template <> +struct ContinuedFutureImpl { + using type = Future<>; +}; + +template +struct ContinuedFutureImpl { + using type = Future; +}; + +template +struct ContinuedFutureImpl> { + using type = Future; +}; + +template +struct ContinuedFutureImpl> { + using type = Future; +}; + +template +using ContinuedFuture = typename ContinuedFutureImpl>::type; + +} // namespace detail + +// --------------------------------------------------------------------- +// Type-erased helpers class ARROW_EXPORT FutureImpl { public: @@ -52,27 +116,96 @@ class ARROW_EXPORT FutureImpl { FutureState state() { return state_.load(); } static std::unique_ptr Make(); + static std::unique_ptr MakeFinished(FutureState state); protected: FutureImpl(); - ARROW_DISALLOW_COPY_AND_ASSIGN(FutureImpl); // Future API void MarkFinished(); void MarkFailed(); void Wait(); bool Wait(double seconds); + void AddCallback(Callback callback); // Waiter API inline FutureState SetWaiter(FutureWaiter* w, int future_num); inline void RemoveWaiter(FutureWaiter* w); - std::atomic state_; + std::atomic state_{FutureState::PENDING}; + + // Type erased storage for arbitrary results + // XXX small objects could be stored alongside state_ instead of boxed in a pointer + using Storage = std::unique_ptr; + Storage result_{NULLPTR, NULLPTR}; + + template &)>> + typename std::enable_if::value>::type Continue( + ContinuedFuture next, Continuation&& continuation, bool run_on_failure) { + static_assert(std::is_same>::value, ""); + const auto& this_result = *static_cast*>(result_.get()); + + if (this_result.ok() || run_on_failure) { + std::forward(continuation)(this_result); + next.MarkFinished(Status::OK()); + } else { + next.MarkFinished(this_result.status()); + } + } + + template &)>> + typename std::enable_if::value>::type Continue( + ContinuedFuture next, Continuation&& continuation, bool run_on_failure) { + static_assert(std::is_same>::value, ""); + const auto& this_result = *static_cast*>(result_.get()); + + if (this_result.ok() || run_on_failure) { + Status next_status = std::forward(continuation)(this_result); + next.MarkFinished(std::move(next_status)); + } else { + next.MarkFinished(this_result.status()); + } + } + + template &)>> + typename std::enable_if::value && + !std::is_same::value && + !is_future::value>::type + Continue(ContinuedFuture next, Continuation&& continuation, bool run_on_failure) { + static_assert(!std::is_same>::value, ""); + const auto& this_result = *static_cast*>(result_.get()); + + if (this_result.ok() || run_on_failure) { + Result next_result = + std::forward(continuation)(this_result); + next.MarkFinished(std::move(next_result)); + } else { + next.MarkFinished(this_result.status()); + } + } + + template &)>> + typename std::enable_if::value>::type Continue( + ContinuedFuture next, Continuation&& continuation, bool run_on_failure) { + const auto& this_result = *static_cast*>(result_.get()); + + if (this_result.ok() || run_on_failure) { + auto next_future = std::forward(continuation)(this_result); + 
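// The continuation returned a Future itself: chain a callback onto it so `next`
+      // is only marked finished once that inner future completes, with its result.
+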
next_future.AddCallback( + [next_future, next]() mutable { next.MarkFinished(next_future.result()); }); + } else { + next.MarkFinished(this_result.status()); + } + } + + std::vector callbacks_; template friend class Future; - template - friend class FutureStorage; friend class FutureWaiter; friend class FutureWaiterImpl; }; @@ -107,7 +240,7 @@ class ARROW_EXPORT FutureWaiter { static std::vector ExtractFutures(const std::vector& futures) { std::vector base_futures(futures.size()); for (int i = 0; i < static_cast(futures.size()); ++i) { - base_futures[i] = futures[i].impl_; + base_futures[i] = futures[i].impl_.get(); } return base_futures; } @@ -118,7 +251,7 @@ class ARROW_EXPORT FutureWaiter { const std::vector& futures) { std::vector base_futures(futures.size()); for (int i = 0; i < static_cast(futures.size()); ++i) { - base_futures[i] = futures[i]->impl_; + base_futures[i] = futures[i]->impl_.get(); } return base_futures; } @@ -132,97 +265,6 @@ class ARROW_EXPORT FutureWaiter { friend class ConcreteFutureImpl; }; -// --------------------------------------------------------------------- -// An intermediate class for storing Future results - -class FutureStorageBase { - public: - FutureStorageBase() : impl_(FutureImpl::Make()) {} - - protected: - ARROW_DISALLOW_COPY_AND_ASSIGN(FutureStorageBase); - std::unique_ptr impl_; - - template - friend class Future; -}; - -template -class FutureStorage : public FutureStorageBase { - public: - static constexpr bool HasValue = true; - - Status status() const { return result_.status(); } - - void MarkFinished(Result result) { - result_ = std::move(result); - if (ARROW_PREDICT_TRUE(result_.ok())) { - impl_->MarkFinished(); - } else { - impl_->MarkFailed(); - } - } - - template - void ExecuteAndMarkFinished(Func&& func) { - MarkFinished(func()); - } - - protected: - Result result_; - friend class Future; -}; - -// A Future just stores a Status (always ok for now, but that could change -// if we implement cancellation). -template <> -class FutureStorage : public FutureStorageBase { - public: - static constexpr bool HasValue = false; - - Status status() const { return status_; } - - void MarkFinished(Status st = Status::OK()) { - status_ = std::move(st); - impl_->MarkFinished(); - } - - template - void ExecuteAndMarkFinished(Func&& func) { - func(); - MarkFinished(); - } - - protected: - Status status_; -}; - -// A Future just stores a Status. -template <> -class FutureStorage : public FutureStorageBase { - public: - static constexpr bool HasValue = false; - - Status status() const { return status_; } - - void MarkFinished(Status st) { - status_ = std::move(st); - if (ARROW_PREDICT_TRUE(status_.ok())) { - impl_->MarkFinished(); - } else { - impl_->MarkFailed(); - } - } - - template - void ExecuteAndMarkFinished(Func&& func) { - MarkFinished(func()); - } - - protected: - Status status_; -}; - // --------------------------------------------------------------------- // Public API @@ -239,11 +281,8 @@ class FutureStorage : public FutureStorageBase { /// WaitForAny or AsCompletedIterator). template class Future { - static constexpr bool HasValue = FutureStorage::HasValue; - template - using EnableResult = typename std::enable_if>::type; - public: + using ValueType = T; static constexpr double kInfinity = FutureImpl::kInfinity; // The default constructor creates an invalid Future. 
Use Future::Make() @@ -253,7 +292,7 @@ class Future { // Consumer API - bool is_valid() const { return storage_ != NULLPTR; } + bool is_valid() const { return impl_ != NULLPTR; } /// \brief Return the Future's current state /// @@ -265,28 +304,30 @@ class Future { } /// \brief Wait for the Future to complete and return its Result - /// - /// This function is not available on Future and Future. - /// For these specializations, please call status() instead. - template - const Result& result(EnableResult* = NULLPTR) const& { - CheckValid(); + const Result& result() const& { Wait(); - return storage_->result_; + return *GetResult(); } - - template - Result result(EnableResult* = NULLPTR) && { - CheckValid(); + Result&& result() && { Wait(); - return std::move(storage_->result_); + return std::move(*GetResult()); + } + + /// \brief In general, Then should be preferred over AddCallback. + void AddCallback(Callback callback) const { + // TODO: Get rid of this method or at least make protected somehow? + impl_->AddCallback(callback); } /// \brief Wait for the Future to complete and return its Status - Status status() const { - CheckValid(); - Wait(); - return storage_->status(); + const Status& status() const { return result().status(); } + + /// \brief Future is convertible to Future<>, which views only the + /// Status of the original. Marking this Future Finished is not supported. + explicit operator Future<>() const { + Future<> status_future; + status_future.impl_ = impl_; + return status_future; } /// \brief Wait for the Future to complete @@ -310,39 +351,17 @@ class Future { return impl_->Wait(seconds); } - /// If a Result holds an error instead of a Future, construct a finished Future - /// holding that error. - static Future DeferNotOk(Result maybe_future) { - if (ARROW_PREDICT_FALSE(!maybe_future.ok())) { - return MakeFinished(std::move(maybe_future).status()); - } - return std::move(maybe_future).MoveValueUnsafe(); - } - // Producer API - /// \brief Producer API: execute function and mark Future finished - /// - /// The function's return value is used to set the Future's result. - /// The function can have the following return types: - /// - `T` - /// - `Result`, if T is neither `void` nor `Status` - template - void ExecuteAndMarkFinished(Func&& func) { - storage_->ExecuteAndMarkFinished(std::forward(func)); - } - /// \brief Producer API: mark Future finished /// - /// The arguments are used to set the Future's result. - /// This function accepts the following signatures: - /// - `(T val)`, if T is neither `void` nor `Status` - /// - `(Result val)`, if T is neither `void` nor `Status` - /// - `(Status st)`, if T is `void` or `Status` - /// - `()`, if T is `void` - template - void MarkFinished(Args&&... args) { - storage_->MarkFinished(std::forward(args)...); + /// The Future's result is set to `res`. + void MarkFinished(Result res) { DoMarkFinished(std::move(res)); } + + /// \brief Mark a Future<> or Future<> completed with the provided Status. + template + detail::Empty::EnableIfSame MarkFinished(Status s = Status::OK()) { + return DoMarkFinished(E::ToResult(std::move(s))); } /// \brief Producer API: instantiate a valid Future @@ -350,23 +369,163 @@ class Future { /// The Future's state is initialized with PENDING. 
static Future Make() { Future fut; - fut.storage_ = std::make_shared>(); - fut.impl_ = fut.storage_->impl_.get(); + fut.impl_ = FutureImpl::Make(); return fut; } /// \brief Producer API: instantiate a finished Future - /// - /// The given arguments are passed to MarkFinished(). - template - static Future MakeFinished(Args&&... args) { - // TODO we can optimize this by directly creating a finished FutureImpl - auto fut = Make(); - fut.MarkFinished(std::forward(args)...); + static Future MakeFinished(Result res) { + Future fut; + if (ARROW_PREDICT_TRUE(res.ok())) { + fut.impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS); + } else { + fut.impl_ = FutureImpl::MakeFinished(FutureState::FAILURE); + } + fut.SetResult(std::move(res)); return fut; } + /// \brief Make a finished Future<> or Future<> with the provided Status. + template > + static Future<> MakeFinished(Status s = Status::OK()) { + return MakeFinished(E::ToResult(std::move(s))); + } + + /// \brief Consumer API: Register a continuation to run when this future completes + /// + /// The continuation will run in the same thread that called MarkFinished (whatever + /// callback is registered with this function will run before MarkFinished returns). + /// If your callback is lengthy then it is generally best to spawn a task so that the + /// callback can immediately return a future. + /// + /// The callback should receive (const Result &). + /// + /// This method returns a future which will be completed after the callback (and + /// potentially the callback task) have finished. + /// + /// If the callback returns: + /// - void, a Future<> will be produced which will complete successully as soon + /// as the callback runs. + /// - Status, a Future<> will be produced which will complete with the returned Status + /// as soon as the callback runs. + /// - V or Result, a Future will be produced which will complete with the result + /// of the callback as soon as the callback runs. + /// - Future, a Future will be produced which will be marked complete when the + /// future returned by the callback completes (and will complete with the same + /// result). + /// + /// If this future fails (results in a non-ok status) then by default the callback will + /// not run. Instead the future that this method returns will be marked failed with the + /// same status (the failure will propagate). This is analagous to throwing an + /// exception in sequential code, the failure propagates upwards, skipping the following + /// code. + /// + /// However, if run_on_failure is set to true then the callback will be run even if this + /// future fails. The callback will receive the failed status (or failed result) and + /// can attempt to salvage or continue despite the failure. As long as the callback + /// doesn't return a failed status (or failed future) then the final future (the one + /// returned by this method) will be marked with the successful result of the callback. + /// This is analagous to catching an exception in sequential code. + /// + /// If this future is already completed then the callback will be run immediately + /// (before this method returns) and the returned future may already be marked complete + /// (it will definitely be marked complete if the callback returns a non-future or a + /// completed future). + /// + /// Care should be taken when creating continuation callbacks. The callback may not run + /// for some time. 
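/// (it will not run until this future is marked finished, possibly much later and
  /// on a different thread).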
If the callback is a lambda function then capture by reference is + /// generally not advised as references are probably not going to still be in scope when + /// the callback runs. Capture by value or passing in a shared_ptr should generally be + /// safe. Capturing this may or may not be safe, you will need to ensure that the this + /// object is going to still exist when the callback is run. This can be tricky because + /// even though the "semantic instance" may still exist if the instance was moved then + /// it will no longer exist. + /// + /// Example: + /// + /// // This approach is NOT SAFE. Even though the task objects are kept around until + /// after they are finished it is not the SAME task object. + /// // The task object is created, the callback lambda is created, a copy of this is + /// copied into the callback, and then the task object is + /// // immediately moved into the vector. This move creates a new task object (with a + /// different this pointer). The this pointer that exists + /// // when the callback actually runs will be pointing to an invalid MyTask object. + /// class MyTask { + /// public: + /// MyTask(std::shared_ptr item, Future block_to_process) : + /// item_(item) { + /// task_future_ = block_to_process.Then([this] {return Process();}); + /// } + /// ARROW_DEFAULT_MOVE_AND_ASSIGN(MyTask); + /// private: + /// Status Process(); + /// std::shared_ptr item_; + /// Future<> task_future_; + /// }; + /// + /// std::vector tasks; + /// for (auto && block_future : block_futures) { + /// for (auto && item : items) { + /// tasks.push_back(MyTask(item, block_future)); + /// } + /// } + /// AwaitAllTasks(tasks); + struct LessPrefer {}; + struct Prefer : LessPrefer {}; + + template + detail::ContinuedFuture&)> ThenImpl( + Continuation&& continuation, bool run_on_failure, Prefer) const { + auto future = detail::ContinuedFuture < Continuation && (const Result&) > ::Make(); + + // We know impl_ will be valid when invoking callbacks because at least one thread + // will be waiting for MarkFinished to return. Thus it's safe to keep a non-owning + // reference to impl_ here (but *not* to `this`!) 
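+    // (`this` Future handle may have been moved or destroyed by the time the callback
+    // runs; impl_ is kept alive by whoever is calling MarkFinished.)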
+ FutureImpl* impl = impl_.get(); + impl_->AddCallback([impl, future, continuation, run_on_failure]() mutable { + impl->Continue(std::move(future), std::move(continuation), run_on_failure); + }); + + return future; + } + + template + detail::ContinuedFuture ThenImpl( + Continuation&& continuation, bool run_on_failure, LessPrefer) const { + return ThenImpl( + [continuation](const Result& result) mutable { + return std::move(continuation)(result.status()); + }, + run_on_failure, Prefer{}); + } + + template + auto Then(Continuation&& continuation, bool run_on_failure = false) const + -> decltype(ThenImpl(std::forward(continuation), run_on_failure, + Prefer{})) { + return ThenImpl(std::forward(continuation), run_on_failure, Prefer{}); + } + protected: + Result* GetResult() const { + return static_cast*>(impl_->result_.get()); + } + + void SetResult(Result res) { + impl_->result_ = {new Result(std::move(res)), + [](void* p) { delete static_cast*>(p); }}; + } + + void DoMarkFinished(Result res) { + SetResult(std::move(res)); + + if (ARROW_PREDICT_TRUE(GetResult()->ok())) { + impl_->MarkFinished(); + } else { + impl_->MarkFailed(); + } + } + void CheckValid() const { #ifndef NDEBUG if (!is_valid()) { @@ -375,12 +534,28 @@ class Future { #endif } - std::shared_ptr> storage_; - FutureImpl* impl_ = NULLPTR; + std::shared_ptr impl_; friend class FutureWaiter; + + template + friend class Future; + + FRIEND_TEST(FutureRefTest, ChainRemoved); + FRIEND_TEST(FutureRefTest, TailRemoved); + FRIEND_TEST(FutureRefTest, HeadRemoved); }; +/// If a Result holds an error instead of a Future, construct a finished Future +/// holding that error. +template +static Future DeferNotOk(Result> maybe_future) { + if (ARROW_PREDICT_FALSE(!maybe_future.ok())) { + return Future::MakeFinished(std::move(maybe_future).status()); + } + return std::move(maybe_future).MoveValueUnsafe(); +} + /// \brief Wait for all the futures to end, or for the given timeout to expire. /// /// `true` is returned if all the futures completed before the timeout was reached, diff --git a/cpp/src/arrow/util/future_test.cc b/cpp/src/arrow/util/future_test.cc index e2fd4b91a14..f6e7b07acb6 100644 --- a/cpp/src/arrow/util/future_test.cc +++ b/cpp/src/arrow/util/future_test.cc @@ -70,9 +70,9 @@ struct MoveOnlyDataType { MoveOnlyDataType(const MoveOnlyDataType& other) = delete; MoveOnlyDataType& operator=(const MoveOnlyDataType& other) = delete; - MoveOnlyDataType(MoveOnlyDataType&& other) { MoveFrom(other); } + MoveOnlyDataType(MoveOnlyDataType&& other) { MoveFrom(&other); } MoveOnlyDataType& operator=(MoveOnlyDataType&& other) { - MoveFrom(other); + MoveFrom(&other); return *this; } @@ -85,10 +85,10 @@ struct MoveOnlyDataType { } } - void MoveFrom(MoveOnlyDataType& other) { + void MoveFrom(MoveOnlyDataType* other) { Destroy(); - data = other.data; - other.data = nullptr; + data = other->data; + other->data = nullptr; } int ToInt() const { return data == nullptr ? 
-42 : *data; } @@ -122,15 +122,21 @@ void AssertFinished(const Future& fut) { // Assert the future is successful *now* template void AssertSuccessful(const Future& fut) { - ASSERT_EQ(fut.state(), FutureState::SUCCESS); - ASSERT_OK(fut.status()); + ASSERT_TRUE(fut.Wait(0.1)); + if (IsFutureFinished(fut.state())) { + ASSERT_EQ(fut.state(), FutureState::SUCCESS); + ASSERT_OK(fut.status()); + } } // Assert the future is failed *now* template void AssertFailed(const Future& fut) { - ASSERT_EQ(fut.state(), FutureState::FAILURE); - ASSERT_FALSE(fut.status().ok()); + ASSERT_TRUE(fut.Wait(0.1)); + if (IsFutureFinished(fut.state())) { + ASSERT_EQ(fut.state(), FutureState::FAILURE); + ASSERT_FALSE(fut.status().ok()); + } } template @@ -158,9 +164,9 @@ IteratorResults IteratorToResults(Iterator iterator) { } // So that main thread may wait a bit for a future to be finished -static const auto kYieldDuration = std::chrono::microseconds(50); -static const double kTinyWait = 1e-5; // seconds -static const double kLargeWait = 5.0; // seconds +constexpr auto kYieldDuration = std::chrono::microseconds(50); +constexpr double kTinyWait = 1e-5; // seconds +constexpr double kLargeWait = 5.0; // seconds template class SimpleExecutor { @@ -232,6 +238,17 @@ TEST(FutureSyncTest, Int) { ASSERT_OK(res); ASSERT_EQ(*res, 42); } + { + // MakeFinished(int) + auto fut = Future::MakeFinished(42); + AssertSuccessful(fut); + auto res = fut.result(); + ASSERT_OK(res); + ASSERT_EQ(*res, 42); + res = std::move(fut.result()); + ASSERT_OK(res); + ASSERT_EQ(*res, 42); + } { // MarkFinished(Result) auto fut = Future::Make(); @@ -249,6 +266,12 @@ TEST(FutureSyncTest, Int) { AssertFailed(fut); ASSERT_RAISES(IOError, fut.result()); } + { + // MakeFinished(Status) + auto fut = Future::MakeFinished(Status::IOError("xxx")); + AssertFailed(fut); + ASSERT_RAISES(IOError, fut.result()); + } { // MarkFinished(Status) auto fut = Future::Make(); @@ -259,6 +282,592 @@ TEST(FutureSyncTest, Int) { } } +TEST(FutureRefTest, ChainRemoved) { + // Creating a future chain should not prevent the futures from being deleted if the + // entire chain is deleted + std::weak_ptr ref; + std::weak_ptr ref2; + { + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& status) { return Status::OK(); }); + ref = fut.impl_; + ref2 = fut2.impl_; + } + ASSERT_TRUE(ref.expired()); + ASSERT_TRUE(ref2.expired()); +} + +TEST(FutureRefTest, TailRemoved) { + // Keeping the head of the future chain should keep the entire chain alive + std::shared_ptr> ref; + std::weak_ptr ref2; + bool side_effect_run = false; + { + ref = std::make_shared>(Future<>::Make()); + auto fut2 = ref->Then([&side_effect_run](const Status& status) { + side_effect_run = true; + return Status::OK(); + }); + ref2 = fut2.impl_; + } + ASSERT_FALSE(ref2.expired()); + + ref->MarkFinished(); + ASSERT_TRUE(side_effect_run); +} + +TEST(FutureRefTest, HeadRemoved) { + // Keeping the tail of the future chain should not keep the entire chain alive. If no + // one has a reference to the head then there is no need to keep it, nothing will finish + // it. In theory the intermediate futures could be finished by some external process + // but that would be highly unusual and bad practice so in reality this would just be a + // reference to a future that will never complete which is ok. 
+ std::weak_ptr ref; + std::shared_ptr> ref2; + { + auto fut = std::make_shared>(Future<>::Make()); + ref = fut->impl_; + ref2 = std::make_shared>( + fut->Then([](const Status& status) { return Status::OK(); })); + } + ASSERT_TRUE(ref.expired()); +} + +TEST(FutureCompletionTest, Void) { + { + // Simple callback + auto fut = Future::Make(); + int passed_in_result = 0; + auto fut2 = fut.Then( + [&passed_in_result](const Result& result) { passed_in_result = *result; }); + fut.MarkFinished(42); + AssertSuccessful(fut2); + ASSERT_EQ(passed_in_result, 42); + } + { + // Propagate failure + auto fut = Future::Make(); + auto fut2 = fut.Then([](const Result& result) {}); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertFailed(fut2); + ASSERT_TRUE(fut2.status().IsIOError()); + } + { + // Swallow failure + auto fut = Future::Make(); + auto fut2 = fut.Then([](const Result& result) {}, true); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertSuccessful(fut2); + } + { + // From void + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) {}); + fut.MarkFinished(); + AssertSuccessful(fut2); + } + { + // From failed status + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) {}); + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut2); + } + { + // Recover a failed status + auto fut = Future<>::Make(); + Status status_seen = Status::OK(); + auto fut2 = + fut.Then([&status_seen](const Status& result) { status_seen = result; }, true); + ASSERT_TRUE(status_seen.ok()); + fut.MarkFinished(Status::IOError("xxx")); + ASSERT_TRUE(status_seen.IsIOError()); + AssertSuccessful(fut2); + } +} + +TEST(FutureCompletionTest, NonVoid) { + { + // Simple callback + auto fut = Future::Make(); + auto fut2 = fut.Then([](const Result& result) { + auto passed_in_result = *result; + return passed_in_result * passed_in_result; + }); + fut.MarkFinished(42); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 42 * 42); + } + { + // Propagate failure + auto fut = Future::Make(); + auto fut2 = fut.Then([](const Result& result) { + auto passed_in_result = *result; + return passed_in_result * passed_in_result; + }); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertFailed(fut2); + ASSERT_TRUE(fut2.status().IsIOError()); + } + { + // Swallow failure + auto fut = Future::Make(); + bool was_io_error = false; + auto fut2 = fut.Then( + [&was_io_error](const Result& result) { + was_io_error = result.status().IsIOError(); + return 100; + }, + true); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 100); + ASSERT_TRUE(was_io_error); + } + { + // From void + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) { return 42; }); + fut.MarkFinished(); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 42); + } + { + // From failed status + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) { return 42; }); + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut2); + } + { + // Recover a failed status + auto fut = Future<>::Make(); + Status status_seen = Status::OK(); + auto fut2 = fut.Then( + [&status_seen](const Status& result) { + status_seen = result; + return 42; + }, + true); + ASSERT_TRUE(status_seen.ok()); + fut.MarkFinished(Status::IOError("xxx")); + ASSERT_TRUE(status_seen.IsIOError()); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 
42); + } +} + +TEST(FutureCompletionTest, FutureNonVoid) { + { + // Simple callback + auto fut = Future::Make(); + auto innerFut = Future::Make(); + int passed_in_result = 0; + auto fut2 = fut.Then([&passed_in_result, innerFut](const Result& result) { + passed_in_result = *result; + return innerFut; + }); + fut.MarkFinished(42); + ASSERT_EQ(passed_in_result, 42); + AssertNotFinished(fut2); + innerFut.MarkFinished("hello"); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, "hello"); + } + { + // Propagate failure + auto fut = Future::Make(); + auto innerFut = Future::Make(); + auto fut2 = fut.Then([innerFut](const Result& result) { return innerFut; }); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertFailed(fut2); + ASSERT_TRUE(fut2.status().IsIOError()); + } + { + // Swallow failure + auto fut = Future::Make(); + auto innerFut = Future::Make(); + bool was_io_error = false; + auto fut2 = fut.Then( + [&was_io_error, innerFut](const Result& result) { + was_io_error = result.status().IsIOError(); + return innerFut; + }, + true); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertNotFinished(fut2); + innerFut.MarkFinished("hello"); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, "hello"); + ASSERT_TRUE(was_io_error); + } + { + // From void + auto fut = Future<>::Make(); + auto innerFut = Future::Make(); + auto fut2 = fut.Then([&innerFut](const Status& result) { return innerFut; }); + fut.MarkFinished(); + AssertNotFinished(fut2); + innerFut.MarkFinished("hello"); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, "hello"); + } + { + // From failed status + auto fut = Future<>::Make(); + auto innerFut = Future::Make(); + auto fut2 = fut.Then([&innerFut](const Status& result) { return innerFut; }); + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut2); + } + { + // Recover a failed status + auto fut = Future<>::Make(); + auto innerFut = Future::Make(); + Status status_seen = Status::OK(); + auto fut2 = fut.Then( + [&status_seen, &innerFut](const Status& result) { + status_seen = result; + return innerFut; + }, + true); + ASSERT_TRUE(status_seen.ok()); + fut.MarkFinished(Status::IOError("xxx")); + ASSERT_TRUE(status_seen.IsIOError()); + AssertNotFinished(fut2); + innerFut.MarkFinished("hello"); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, "hello"); + } +} + +TEST(FutureCompletionTest, Status) { + { + // Simple callback + auto fut = Future::Make(); + int passed_in_result = 0; + Future<> fut2 = fut.Then([&passed_in_result](const Result& result) { + passed_in_result = *result; + return Status::OK(); + }); + fut.MarkFinished(42); + ASSERT_EQ(passed_in_result, 42); + AssertSuccessful(fut2); + } + { + // Propagate failure + auto fut = Future::Make(); + auto innerFut = Future::Make(); + auto fut2 = fut.Then([innerFut](const Result& result) { return innerFut; }); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertFailed(fut2); + ASSERT_TRUE(fut2.status().IsIOError()); + } + { + // Swallow failure + auto fut = Future::Make(); + auto innerFut = Future::Make(); + bool was_io_error = false; + auto fut2 = fut.Then( + [&was_io_error, innerFut](const Result& result) { + was_io_error = result.status().IsIOError(); + return innerFut; + }, + true); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertNotFinished(fut2); + innerFut.MarkFinished("hello"); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, "hello"); + 
ASSERT_TRUE(was_io_error); + } + { + // From void + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) { return Status::OK(); }); + fut.MarkFinished(); + AssertSuccessful(fut2); + } + { + // From failed status + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) { return Status::OK(); }); + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut2); + } + { + // Recover a failed status + auto fut = Future<>::Make(); + Status status_seen = Status::OK(); + auto fut2 = fut.Then( + [&status_seen](const Status& result) { + status_seen = result; + return Status::OK(); + }, + true); + ASSERT_TRUE(status_seen.ok()); + fut.MarkFinished(Status::IOError("xxx")); + ASSERT_TRUE(status_seen.IsIOError()); + AssertSuccessful(fut2); + } +} + +TEST(FutureCompletionTest, FutureStatus) { + { + // Simple callback + auto fut = Future::Make(); + auto innerFut = Future<>::Make(); + int passed_in_result = 0; + Future<> fut2 = fut.Then([&passed_in_result, innerFut](const Result& result) { + passed_in_result = *result; + return innerFut; + }); + fut.MarkFinished(42); + ASSERT_EQ(passed_in_result, 42); + AssertNotFinished(fut2); + innerFut.MarkFinished(Status::OK()); + AssertSuccessful(fut2); + } + { + // Propagate failure + auto fut = Future::Make(); + auto innerFut = Future<>::Make(); + auto fut2 = fut.Then([innerFut](const Result& result) { return innerFut; }); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertFailed(fut2); + ASSERT_TRUE(fut2.status().IsIOError()); + } + { + // Swallow failure + auto fut = Future::Make(); + auto innerFut = Future<>::Make(); + bool was_io_error = false; + auto fut2 = fut.Then( + [&was_io_error, innerFut](const Result& result) { + was_io_error = result.status().IsIOError(); + return innerFut; + }, + true); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertNotFinished(fut2); + innerFut.MarkFinished(Status::OK()); + AssertSuccessful(fut2); + } + { + // From void + auto fut = Future<>::Make(); + auto innerFut = Future<>::Make(); + auto fut2 = fut.Then([&innerFut](const Status& result) { return innerFut; }); + fut.MarkFinished(); + AssertNotFinished(fut2); + innerFut.MarkFinished(Status::OK()); + AssertSuccessful(fut2); + } + { + // From failed status + auto fut = Future<>::Make(); + auto innerFut = Future<>::Make(); + auto fut2 = fut.Then([&innerFut](const Status& result) { return innerFut; }); + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut2); + } + { + // Recover a failed status + auto fut = Future<>::Make(); + auto innerFut = Future<>::Make(); + Status status_seen = Status::OK(); + auto fut2 = fut.Then( + [&status_seen, &innerFut](const Status& result) { + status_seen = result; + return innerFut; + }, + true); + ASSERT_TRUE(status_seen.ok()); + fut.MarkFinished(Status::IOError("xxx")); + ASSERT_TRUE(status_seen.IsIOError()); + AssertNotFinished(fut2); + innerFut.MarkFinished(Status::OK()); + AssertSuccessful(fut2); + } +} + +TEST(FutureCompletionTest, Result) { + { + // Simple callback + auto fut = Future::Make(); + Future fut2 = fut.Then([](const Result& result) { + auto passed_in_result = *result; + return Result(passed_in_result * passed_in_result); + }); + fut.MarkFinished(42); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 42 * 42); + } + { + // Propagate failure + auto fut = Future::Make(); + auto fut2 = fut.Then([](const Result& result) { + auto passed_in_result = *result; + return Result(passed_in_result * passed_in_result); + }); + 
fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertFailed(fut2); + ASSERT_TRUE(fut2.status().IsIOError()); + } + { + // Swallow failure + auto fut = Future::Make(); + bool was_io_error = false; + auto fut2 = fut.Then( + [&was_io_error](const Result& result) { + was_io_error = result.status().IsIOError(); + return Result(100); + }, + true); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 100); + ASSERT_TRUE(was_io_error); + } + { + // From void + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) { return Result(42); }); + fut.MarkFinished(); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 42); + } + { + // From failed status + auto fut = Future<>::Make(); + auto fut2 = fut.Then([](const Status& result) { return Result(42); }); + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut2); + } + { + // Recover a failed status + auto fut = Future<>::Make(); + Status status_seen = Status::OK(); + auto fut2 = fut.Then( + [&status_seen](const Status& result) { + status_seen = result; + return Result(42); + }, + true); + ASSERT_TRUE(status_seen.ok()); + fut.MarkFinished(Status::IOError("xxx")); + ASSERT_TRUE(status_seen.IsIOError()); + AssertSuccessful(fut2); + auto result = *fut2.result(); + ASSERT_EQ(result, 42); + } +} + +TEST(FutureCompletionTest, FutureVoid) { + { + // Simple callback + auto fut = Future::Make(); + auto innerFut = Future<>::Make(); + int passed_in_result = 0; + auto fut2 = fut.Then([&passed_in_result, innerFut](const Result& result) { + passed_in_result = *result; + return innerFut; + }); + fut.MarkFinished(42); + AssertNotFinished(fut2); + innerFut.MarkFinished(); + AssertSuccessful(fut2); + auto res = fut2.status(); + ASSERT_OK(res); + ASSERT_EQ(passed_in_result, 42); + } + { + // Precompleted future + auto fut = Future::Make(); + auto innerFut = Future<>::Make(); + innerFut.MarkFinished(); + int passed_in_result = 0; + auto fut2 = fut.Then([&passed_in_result, innerFut](const Result& result) { + passed_in_result = *result; + return innerFut; + }); + AssertNotFinished(fut2); + fut.MarkFinished(42); + AssertSuccessful(fut2); + ASSERT_EQ(passed_in_result, 42); + } + { + // Propagate failure + auto fut = Future::Make(); + auto innerFut = Future<>::Make(); + auto fut2 = fut.Then([innerFut](const Result& result) { return innerFut; }); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertFailed(fut2); + if (IsFutureFinished(fut2.state())) { + ASSERT_TRUE(fut2.status().IsIOError()); + } + } + { + // Swallow failure + auto fut = Future::Make(); + auto innerFut = Future<>::Make(); + auto fut2 = + fut.Then([innerFut](const Result& result) { return innerFut; }, true); + fut.MarkFinished(Result(Status::IOError("xxx"))); + AssertNotFinished(fut2); + innerFut.MarkFinished(); + AssertSuccessful(fut2); + } + { + // From void + auto fut = Future<>::Make(); + auto innerFut = Future<>::Make(); + auto fut2 = fut.Then([&innerFut](const Status& result) { return innerFut; }); + fut.MarkFinished(); + AssertNotFinished(fut2); + innerFut.MarkFinished(); + AssertSuccessful(fut2); + } + { + // From failed status + auto fut = Future<>::Make(); + auto innerFut = Future<>::Make(); + auto fut2 = fut.Then([&innerFut](const Status& result) { return innerFut; }); + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut2); + } + { + // Recover a failed status + auto fut = Future<>::Make(); + auto innerFut = Future<>::Make(); + Status status_seen = 
Status::OK(); + auto fut2 = fut.Then( + [&status_seen, &innerFut](const Status& result) { + status_seen = result; + return innerFut; + }, + true); + ASSERT_TRUE(status_seen.ok()); + fut.MarkFinished(Status::IOError("xxx")); + ASSERT_TRUE(status_seen.IsIOError()); + AssertNotFinished(fut2); + innerFut.MarkFinished(); + AssertSuccessful(fut2); + } +} + TEST(FutureSyncTest, Foo) { { // MarkFinished(Foo) @@ -330,27 +939,24 @@ TEST(FutureSyncTest, MoveOnlyDataType) { } } -TEST(FutureSyncTest, void) { +TEST(FutureSyncTest, Empty) { { // MarkFinished() - auto fut = Future::Make(); + auto fut = Future<>::Make(); AssertNotFinished(fut); fut.MarkFinished(); AssertSuccessful(fut); } -} - -TEST(FutureSyncTest, Status) { { // MarkFinished(Status) - auto fut = Future::Make(); + auto fut = Future<>::Make(); AssertNotFinished(fut); fut.MarkFinished(Status::OK()); AssertSuccessful(fut); } { // MarkFinished(Status) - auto fut = Future::Make(); + auto fut = Future<>::Make(); AssertNotFinished(fut); fut.MarkFinished(Status::IOError("xxx")); AssertFailed(fut); @@ -358,6 +964,33 @@ TEST(FutureSyncTest, Status) { } } +TEST(FutureSyncTest, GetStatusFuture) { + { + auto fut = Future::Make(); + Future<> status_future(fut); + + AssertNotFinished(fut); + AssertNotFinished(status_future); + + fut.MarkFinished(MoveOnlyDataType(42)); + AssertSuccessful(fut); + AssertSuccessful(status_future); + ASSERT_EQ(&fut.status(), &status_future.status()); + } + { + auto fut = Future::Make(); + Future<> status_future(fut); + + AssertNotFinished(fut); + AssertNotFinished(status_future); + + fut.MarkFinished(Status::IOError("xxx")); + AssertFailed(fut); + AssertFailed(status_future); + ASSERT_EQ(&fut.status(), &status_future.status()); + } +} + // -------------------------------------------------------------------- // Tests with an executor @@ -676,7 +1309,7 @@ class FutureTestBase : public ::testing::Test { template class FutureTest : public FutureTestBase {}; -typedef ::testing::Types FutureTestTypes; +using FutureTestTypes = ::testing::Types; TYPED_TEST_SUITE(FutureTest, FutureTestTypes); @@ -701,7 +1334,7 @@ TYPED_TEST(FutureTest, StressWaitForAll) { this->TestStressWaitForAll(); } template class FutureIteratorTest : public FutureTestBase {}; -typedef ::testing::Types FutureIteratorTestTypes; +using FutureIteratorTestTypes = ::testing::Types; TYPED_TEST_SUITE(FutureIteratorTest, FutureIteratorTestTypes); diff --git a/cpp/src/arrow/util/iterator.h b/cpp/src/arrow/util/iterator.h index 58dda5df2a7..e58c2baa58f 100644 --- a/cpp/src/arrow/util/iterator.h +++ b/cpp/src/arrow/util/iterator.h @@ -18,8 +18,12 @@ #pragma once #include +#include +#include #include #include +#include +#include #include #include #include @@ -29,8 +33,13 @@ #include "arrow/status.h" #include "arrow/util/compare.h" #include "arrow/util/functional.h" +#include "arrow/util/future.h" +#include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/mutex.h" #include "arrow/util/optional.h" +#include "arrow/util/task_group.h" +#include "arrow/util/thread_pool.h" #include "arrow/util/visibility.h" namespace arrow { @@ -527,4 +536,288 @@ Result> MakeReadaheadIterator(Iterator it, int readahead_queue_si return ReadaheadIterator::Make(std::move(it), readahead_queue_size); } +namespace detail { + +/// This is the AsyncReadaheadIterator's equivalent of ReadaheadQueue. I'm using the term +/// worker here so it is a bit more explicit that there is an actual thread running and it +/// is something that starts (Start) and stops (EnsureShutdownOrDie). 
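+///
+/// A rough usage sketch (the worker is normally an implementation detail of
+/// AsyncReadaheadIterator below; TestInt and VectorIt are helpers from the tests in
+/// this change, used here purely for illustration):
+///
+///   auto worker = std::make_shared<AsyncReadaheadWorker<TestInt>>(/*readahead_queue_size=*/2);
+///   worker->Start(std::make_shared<Iterator<TestInt>>(VectorIt({1, 2, 3})));
+///   Future<TestInt> next = worker->AddRequest();  // completes when the worker produces a value
+///   worker->EnsureShutdownOrDie();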
+/// +/// Unlike it's inspiration this class is not type-erased. It probably could be but I'll +/// leave that as an exercise for future development. +template +class ARROW_EXPORT AsyncReadaheadWorker + : public std::enable_shared_from_this> { + public: + explicit AsyncReadaheadWorker(int readahead_queue_size) + : max_readahead_(readahead_queue_size) {} + + ~AsyncReadaheadWorker() { EnsureShutdownOrDie(false); } + + void Start(std::shared_ptr> iterator) { + DCHECK(!thread_.joinable()); + auto self = this->shared_from_this(); + thread_ = std::thread([self, iterator]() { self->DoWork(iterator); }); + DCHECK(thread_.joinable()); + } + + Future AddRequest() { + std::unique_lock lock(mutex_); + // If we have results in our readahead queue then we can immediately return a + // completed future + if (unconsumed_results_.size() > 0) { + auto wake_up_after = unconsumed_results_.size() == max_readahead_; + auto result = Future::MakeFinished(unconsumed_results_.front()); + unconsumed_results_.pop_front(); + lock.unlock(); + if (wake_up_after) { + worker_wakeup_.notify_one(); + } + return result; + } + // Otherwise, we don't have a stored result, so we need to make a request for a new + // result. Unless we're finished (hit the end of the iterator) in which case we can + // just return End() + if (please_shutdown_) { + return Future::MakeFinished(IterationTraits::End()); + } + auto result = Future::Make(); + waiting_futures_.push_back(result); + return result; + } + + Status Shutdown(bool wait = true) { + return ShutdownUnlocked(std::unique_lock(mutex_), wait); + } + + void EnsureShutdownOrDie(bool wait = true) { + std::unique_lock lock(mutex_); + if (thread_.joinable()) { + ARROW_CHECK_OK(ShutdownUnlocked(std::move(lock), wait)); + DCHECK(!thread_.joinable()); + } + } + + Status ShutdownUnlocked(std::unique_lock lock, bool wait = true) { + if (!please_shutdown_) { + FinishUnlocked(); + } + lock.unlock(); + worker_wakeup_.notify_one(); + if (wait) { + thread_.join(); + } else { + thread_.detach(); + } + return Status::OK(); + } + + void FinishUnlocked() { + please_shutdown_ = true; + for (auto&& future : waiting_futures_) { + future.MarkFinished(IterationTraits::End()); + } + } + + void Finish() { + std::unique_lock lock(mutex_); + FinishUnlocked(); + } + + void DoWork(std::shared_ptr> iterator) { + std::unique_lock lock(mutex_); + while (!please_shutdown_) { + while (unconsumed_results_.size() < max_readahead_) { + // Grab the next item, might be expensive (I/O) so we don't want to keep the lock + lock.unlock(); + auto next_item = iterator->Next(); + lock.lock(); + // If we're done then we should deliver End() to all outstanding requests + if (next_item == IterationTraits::End()) { + FinishUnlocked(); + } else { + // If there are any outstanding requests then pop one off and deliver this item + if (waiting_futures_.size() > 0) { + auto next_future = waiting_futures_.front(); + waiting_futures_.pop_front(); + // Marking the future finished may trigger expensive callbacks so we don't + // want to hold the lock while we do that. 
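+            // (Note that a consumer's `Then` continuation will run inline on this
+            // worker thread unless the consumer transfers it to an executor.)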
+ lock.unlock(); + next_future.MarkFinished(std::move(next_item)); + lock.lock(); + // Otherwise, if there are no oustanding requests, add the item to our + // readahead queue + } else { + unconsumed_results_.push_back(std::move(next_item)); + } + } + // Exit eagerly + if (please_shutdown_) { + return; + } + } + // Wait for more work to do + worker_wakeup_.wait(lock); + } + } + + protected: + std::deque> unconsumed_results_; + std::deque> waiting_futures_; + std::size_t max_readahead_; + bool please_shutdown_ = false; + + std::thread thread_; + std::mutex mutex_; + std::condition_variable worker_wakeup_; +}; + +} // namespace detail + +/// \brief This iterates on the underlying iterator in a separate thread, getting up to +/// N values in advance. Unlike ReadaheadIterator this is not an iterator in the +/// traditional sense. Every call to NextFuture will generate a new future whether data +/// is ready or not. +/// +/// This means, without some form of back pressure, you are likely to create an endless +/// supply of useless futures (which will eventually be fulfilled with +/// IterationTraits::End()) if the underlying iterator completes before you explode the +/// heap/stack. +/// +/// This iterator is instead meant to be passed to some kind of continuation aware +/// iteration method such as async::ForEach +template +class AsyncReadaheadIterator { + public: + // Public default constructor creates an empty iterator + AsyncReadaheadIterator() : done_(true) {} + + ~AsyncReadaheadIterator() { + if (worker_ != NULLPTR) { + worker_->EnsureShutdownOrDie(); + } + } + + ARROW_DEFAULT_MOVE_AND_ASSIGN(AsyncReadaheadIterator); + ARROW_DISALLOW_COPY_AND_ASSIGN(AsyncReadaheadIterator); + + Future NextFuture() { + if (done_) { + return Future::MakeFinished(IterationTraits::End()); + } + return worker_->AddRequest(); + } + + static Result> Make(Iterator it, + int readahead_queue_size) { + return AsyncReadaheadIterator(std::move(it), readahead_queue_size); + } + + private: + explicit AsyncReadaheadIterator(Iterator it, int readahead_queue_size) + : worker_(std::make_shared>(readahead_queue_size)) { + worker_->Start(std::make_shared>(std::move(it))); + } + + std::shared_ptr> worker_; + bool done_ = false; +}; + +template +Result> MakeAsyncReadaheadIterator(Iterator it, + int readahead_queue_size) { + return AsyncReadaheadIterator::Make(std::move(it), readahead_queue_size); +} + +namespace async { + +namespace internal { + +// This is a recursive function. The base cases are when the iterator returns a failed +// future (e.g. an I/O error) or when the underlying iterator is completed. +// +// Whenever recursion is performed on an iterator care should be taken to avoid a stack +// overflow. This picture is a bit more complicated however because `Then` won't +// neccesarily execute immediately. 
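+//
+// In particular, if the future returned by NextFuture is already finished, `Then` runs
+// the callback synchronously and the next AsyncForEachHelper call happens on the same
+// stack, so the depth can grow with the number of results already buffered by the
+// readahead worker.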
+template <typename T>
+Future<> AsyncForEachHelper(std::shared_ptr<AsyncReadaheadIterator<T>> it,
+                            std::function<Status(T)> func,
+                            arrow::internal::Executor* executor = NULLPTR) {
+  auto future = it->NextFuture();
+  if (executor != NULLPTR) {
+    auto transferred_future = executor->Transfer(future);
+    if (!transferred_future.ok()) {
+      return Future<>::MakeFinished(transferred_future.status());
+    }
+    future = transferred_future.ValueUnsafe();
+  }
+  return future.Then([it, func, executor](const Result<T>& t) {
+    if (!t.ok()) {
+      return Future<>::MakeFinished(t.status());
+    }
+    auto value = t.ValueUnsafe();
+    if (value == IterationTraits<T>::End()) {
+      return Future<>::MakeFinished();
+    }
+    auto cb_result = func(std::move(value));
+    if (!cb_result.ok()) {
+      return Future<>::MakeFinished(cb_result);
+    }
+    // Forward the executor so every iteration, not just the first, is transferred
+    return AsyncForEachHelper(std::move(it), func, executor);
+  });
+}
+
+}  // namespace internal
+
+/// \brief Provides a safe and convenient method to visit an async iterator
+///
+/// This will take ownership of `it`.
+///
+/// The num_workers argument can be used to allow for multiple processing tasks to run in
+/// parallel. This will only have an effect if an executor is specified.
+///
+/// If `executor` is supplied then `func` will be submitted on the executor. If it is not
+/// specified (the default) then `func` will be run on the same thread the async iterator
+/// uses to complete the future (for AsyncReadaheadIterator this will be the created
+/// readahead thread).
+///
+/// The caller is responsible for guaranteeing that the `executor` pointer is valid for
+/// the duration of this iteration.
+///
+/// If any errors are encountered by the iterator, this iteration will stop immediately and
+/// the future returned by this method will be marked failed with the error. Otherwise,
+/// the future returned by this method will be marked complete with an OK status when
+/// `func` has been applied to every item the underlying iterator produces.
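+///
+/// A minimal usage sketch (TestInt and VectorIt are helpers from the tests in this
+/// change; any element type with IterationTraits works the same way):
+///
+///   ASSERT_OK_AND_ASSIGN(auto it, MakeAsyncReadaheadIterator(VectorIt({1, 2, 3}), 2));
+///   Future<> done = async::AsyncForEach<TestInt>(
+///       std::move(it), [](TestInt v) { /* consume v */ return Status::OK(); });
+///   done.Wait();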
+template +Future<> AsyncForEach(AsyncReadaheadIterator it, + std::function func, int num_workers = 1, + arrow::internal::Executor* executor = NULLPTR) { + if (num_workers <= 0) { + return Future<>::MakeFinished(Status::Invalid( + "The num_workers argument to AsyncForEach should be a positive integer")); + } + auto task_group = arrow::internal::TaskGroup::MakeThreaded(NULLPTR); + std::shared_ptr> it_ptr( + new AsyncReadaheadIterator(std::move(it))); + + auto n_remaining = std::make_shared>(); + n_remaining->store(num_workers); + + std::vector> workers(num_workers); + for (int i = 0; i < num_workers; ++i) { + workers[i] = internal::AsyncForEachHelper(it_ptr, func, executor); + + task_group->Append(workers[i]); + + workers[i].Then([n_remaining, task_group](const Status&) { + if (n_remaining->fetch_sub(1) == 1) { + return task_group->Finish(); + } + return Status::OK(); + }); + } + + return task_group->FinishAsync(); +} + +} // namespace async } // namespace arrow diff --git a/cpp/src/arrow/util/iterator_test.cc b/cpp/src/arrow/util/iterator_test.cc index 7295627b7c8..969be178840 100644 --- a/cpp/src/arrow/util/iterator_test.cc +++ b/cpp/src/arrow/util/iterator_test.cc @@ -49,6 +49,104 @@ struct IterationTraits { static TestInt End() { return TestInt(); } }; +template +class ManualIteratorBase { + public: + Result Next(std::size_t index) { + std::unique_lock lock(mutex_); + last_asked_for_index_ = index; + waiting_count_++; + if (cv_.wait_for(lock, std::chrono::milliseconds(3000), + [this, index] { return finished_ || results_.size() > index; })) { + waiting_count_--; + if (finished_ && index >= results_.size()) { + return Result(IterationTraits::End()); + } + return Result(results_[index]); + } else { + waiting_count_--; + return Result( + Status::Invalid("Timed out waiting for someone to deliver a value")); + } + } + + void Deliver(const T& value) { + { + std::lock_guard lock(mutex_); + results_.push_back(value); + } + cv_.notify_all(); + } + + void MarkFinished() { + { + std::lock_guard lock(mutex_); + finished_ = true; + } + cv_.notify_all(); + } + + std::size_t LastAskedForIndex() const { return last_asked_for_index_; } + + Status WaitForWaiters(unsigned int num_waiters, unsigned int last_asked_for_index) { + std::unique_lock lock(mutex_); + if (cv_.wait_for(lock, std::chrono::milliseconds(100), + [this, num_waiters, last_asked_for_index] { + return waiting_count_ >= num_waiters && + last_asked_for_index == last_asked_for_index_; + })) { + return Status::OK(); + } else { + return Status::Invalid("Timed out waiting for waiters to show up at iterator"); + } + } + + Status WaitForGteWaiters(unsigned int num_waiters) { + std::unique_lock lock(mutex_); + if (cv_.wait_for(lock, std::chrono::milliseconds(100), + [this, num_waiters] { return waiting_count_ >= num_waiters; })) { + return Status::OK(); + } else { + return Status::Invalid("Timed out waiting for waiters to show up at iterator"); + } + } + + Status WaitForLteWaiters(unsigned int num_waiters) { + std::unique_lock lock(mutex_); + if (cv_.wait_for(lock, std::chrono::milliseconds(100), + [this, num_waiters] { return waiting_count_ <= num_waiters; })) { + return Status::OK(); + } else { + return Status::Invalid("Timed out waiting for waiters to show up at iterator"); + } + } + + private: + std::mutex mutex_; + std::condition_variable cv_; + std::vector results_; + bool finished_; + unsigned int waiting_count_; + std::size_t last_asked_for_index_ = -1; +}; + +template +class ManualIterator { + public: + explicit 
ManualIterator(std::shared_ptr> base) + : base_(base), i(0) {} + Result Next() { return base_->Next(i++); } + + private: + std::shared_ptr> base_; + std::size_t i; +}; + +template +Iterator MakeManual(std::shared_ptr> base) { + return Iterator(ManualIterator(base)); +} + template class TracingIterator { public: @@ -392,4 +490,91 @@ TEST(ReadaheadIterator, NextError) { AssertIteratorExhausted(it); } +TEST(AsyncReadaheadIterator, BasicIteration) { + auto base_it = VectorIt({1, 2}); + ASSERT_OK_AND_ASSIGN(auto readahead_it, + MakeAsyncReadaheadIterator(std::move(base_it), 1)); + auto future_one = readahead_it.NextFuture(); + auto future_two = readahead_it.NextFuture(); + auto future_three = readahead_it.NextFuture(); + future_three.Wait(); + ASSERT_EQ(TestInt(1), *future_one.result()); + ASSERT_EQ(TestInt(2), *future_two.result()); + ASSERT_EQ(TestInt(), *future_three.result()); +} + +TEST(AsyncReadaheadIterator, FastConsumer) { + auto base = std::make_shared>(); + auto base_it = MakeManual(base); + ASSERT_OK_AND_ASSIGN(auto readahead_it, + MakeAsyncReadaheadIterator(std::move(base_it), 2)); + auto future_one = readahead_it.NextFuture(); + auto future_two = readahead_it.NextFuture(); + auto future_three = readahead_it.NextFuture(); + // The readahead should ask for item 0 and wait + ASSERT_OK(base->WaitForWaiters(1, 0)); + // Even though readahead is 2 there should only be one thread so one waiter. TODO: + // Optimize 100ms wait + ASSERT_TRUE(base->WaitForGteWaiters(2).IsInvalid()); + ASSERT_EQ(FutureState::PENDING, future_one.state()); + + base->Deliver(1); + ASSERT_OK(base->WaitForWaiters(1, 1)); + ASSERT_TRUE(future_one.is_valid()); + if (future_one.is_valid()) { + ASSERT_EQ(future_one.result(), 1); + } + ASSERT_EQ(FutureState::PENDING, future_two.state()); + + base->MarkFinished(); + ASSERT_TRUE(future_two.Wait(0.1)); + if (future_two.is_valid()) { + ASSERT_EQ(IterationTraits::End(), future_two.result().ValueOrDie()); + } + + ASSERT_TRUE(future_three.Wait(0.1)); + ASSERT_EQ(IterationTraits::End(), future_three.result().ValueOrDie()); +} + +TEST(AsyncReadaheadIterator, FastProducer) { + auto base = std::make_shared>(); + auto base_it = MakeManual(base); + ASSERT_OK_AND_ASSIGN(auto readahead_it, + MakeAsyncReadaheadIterator(std::move(base_it), 2)); + base->Deliver(1); + base->Deliver(2); + base->MarkFinished(); + // There should be no one waiting for values. 
+ ASSERT_OK(base->WaitForLteWaiters( + 0)); // First the thread finishes reading and blocks on its own queue + ASSERT_TRUE(base->WaitForGteWaiters(1) + .IsInvalid()); // Then the thread remains blocked and doesn't call Next + + auto future_one = readahead_it.NextFuture(); + // Future should be returned immediately completed + ASSERT_EQ(TestInt(1), future_one.result().ValueOrDie()); + + auto future_two = readahead_it.NextFuture(); + ASSERT_EQ(TestInt(2), future_two.result().ValueOrDie()); + + auto future_three = readahead_it.NextFuture(); + ASSERT_EQ(IterationTraits::End(), future_three.result().ValueOrDie()); +} + +TEST(AsyncReadaheadIterator, ForEach) { + auto it = VectorIt({1, 2, 3, 4}); + ASSERT_OK_AND_ASSIGN(auto readahead_it, MakeAsyncReadaheadIterator(std::move(it), 1)); + std::atomic sum(0); + auto for_each_future = async::AsyncForEach( + std::move(readahead_it), + [&sum](const TestInt& r) { + sum.fetch_add(r.value); + return Status::OK(); + }, + 1); + for_each_future.Wait(); + ASSERT_OK(for_each_future.status()); + ASSERT_EQ(10, sum.load()); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/parallel.h b/cpp/src/arrow/util/parallel.h index 89a5cd091d5..e2c87a534a6 100644 --- a/cpp/src/arrow/util/parallel.h +++ b/cpp/src/arrow/util/parallel.h @@ -32,7 +32,7 @@ namespace internal { template Status ParallelFor(int num_tasks, FUNCTION&& func) { auto pool = internal::GetCpuThreadPool(); - std::vector> futures(num_tasks); + std::vector> futures(num_tasks); for (int i = 0; i < num_tasks; ++i) { ARROW_ASSIGN_OR_RAISE(futures[i], pool->Submit(func, i)); diff --git a/cpp/src/arrow/util/task_group.cc b/cpp/src/arrow/util/task_group.cc index 79854d36168..5747fabd754 100644 --- a/cpp/src/arrow/util/task_group.cc +++ b/cpp/src/arrow/util/task_group.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -35,13 +36,6 @@ namespace internal { class SerialTaskGroup : public TaskGroup { public: - void AppendReal(std::function task) override { - DCHECK(!finished_); - if (status_.ok()) { - status_ &= task(); - } - } - Status current_status() override { return status_; } bool ok() override { return status_.ok(); } @@ -49,25 +43,36 @@ class SerialTaskGroup : public TaskGroup { Status Finish() override { if (!finished_) { finished_ = true; - if (parent_) { - parent_->status_ &= status_; - } } return status_; } + Future<> FinishAsync() override { + if (!finished_) { + finished_ = true; + } + return Future<>::MakeFinished(status_); + } + int parallelism() override { return 1; } - std::shared_ptr MakeSubGroup() override { - auto child = new SerialTaskGroup(); - child->parent_ = this; - return std::shared_ptr(child); + protected: + void AppendReal(std::function task) override { + DCHECK(!finished_); + if (status_.ok()) { + status_ &= task(); + } + } + + void AppendReal(Future<> future) override { + DCHECK(!finished_); + if (status_.ok()) { + status_ &= future.status(); + } } - protected: Status status_; bool finished_ = false; - SerialTaskGroup* parent_ = nullptr; }; //////////////////////////////////////////////////////////////////////// @@ -91,18 +96,42 @@ class ThreadedTaskGroup : public TaskGroup { nremaining_.fetch_add(1, std::memory_order_acquire); auto self = checked_pointer_cast(shared_from_this()); - Status st = executor_->Spawn([self, task]() { - if (self->ok_.load(std::memory_order_acquire)) { - // XXX what about exceptions? 
- Status st = task(); - self->UpdateStatus(std::move(st)); - } - self->OneTaskDone(); - }); + Status st = executor_->Spawn(std::bind( + [](const std::shared_ptr& self, + const std::function& task) { + if (self->ok_.load(std::memory_order_acquire)) { + // XXX what about exceptions? + Status st = task(); + self->UpdateStatus(std::move(st)); + } + self->OneTaskDone(); + }, + std::move(self), std::move(task))); UpdateStatus(std::move(st)); } } + void AppendReal(Future<> future) override { + DCHECK(!finished_); + if (ok_.load(std::memory_order_acquire)) { + nremaining_.fetch_add(1, std::memory_order_acquire); + AddFutureHelper(future); + } + } + + void AddFutureHelper(Future<> task_future) { + auto self = checked_pointer_cast(shared_from_this()); + auto callback = std::bind( + [](const std::shared_ptr& self, const Status& status) { + if (self->ok_.load(std::memory_order_acquire)) { + self->UpdateStatus(std::move(status)); + } + self->OneTaskDone(); + }, + std::move(self), std::placeholders::_1); + ARROW_UNUSED(task_future.Then(std::move(callback), /*run_on_failure=*/true)); + } + Status current_status() override { std::lock_guard lock(mutex_); return status_; @@ -116,25 +145,17 @@ class ThreadedTaskGroup : public TaskGroup { cv_.wait(lock, [&]() { return nremaining_.load() == 0; }); // Current tasks may start other tasks, so only set this when done finished_ = true; - if (parent_) { - parent_->OneTaskDone(); - } + completion_future_.MarkFinished(status_); } return status_; } - int parallelism() override { return executor_->GetCapacity(); } + Future<> FinishAsync() override { return completion_future_; } - std::shared_ptr MakeSubGroup() override { - std::lock_guard lock(mutex_); - auto child = new ThreadedTaskGroup(executor_); - child->parent_ = this; - nremaining_.fetch_add(1, std::memory_order_acquire); - return std::shared_ptr(child); - } + int parallelism() override { return executor_->GetCapacity(); } protected: - void UpdateStatus(Status&& st) { + void UpdateStatus(const Status& st) { // Must be called unlocked, only locks on error if (ARROW_PREDICT_FALSE(!st.ok())) { std::lock_guard lock(mutex_); @@ -159,13 +180,13 @@ class ThreadedTaskGroup : public TaskGroup { Executor* executor_; std::atomic nremaining_; std::atomic ok_; + Future<> completion_future_ = Future<>::Make(); // These members use locking std::mutex mutex_; std::condition_variable cv_; Status status_; bool finished_ = false; - ThreadedTaskGroup* parent_ = nullptr; }; std::shared_ptr TaskGroup::MakeSerial() { diff --git a/cpp/src/arrow/util/task_group.h b/cpp/src/arrow/util/task_group.h index d10a7a4032c..624e123daba 100644 --- a/cpp/src/arrow/util/task_group.h +++ b/cpp/src/arrow/util/task_group.h @@ -22,6 +22,7 @@ #include #include "arrow/status.h" +#include "arrow/util/future.h" #include "arrow/util/macros.h" #include "arrow/util/type_fwd.h" #include "arrow/util/visibility.h" @@ -29,8 +30,6 @@ namespace arrow { namespace internal { -// TODO Simplify this. Subgroups don't seem necessary. - /// \brief A group of related tasks /// /// A TaskGroup executes tasks with the signature `Status()`. @@ -38,6 +37,23 @@ namespace internal { /// implementation. When Finish() returns, it is guaranteed that all /// tasks have finished, or at least one has errored. /// +/// Once an error has occurred any tasks that are submitted to the task group +/// will not run. The call to Append will simply return without scheduling the +/// task. 
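+///
+/// For example (a sketch; the error message is illustrative only):
+///
+///   auto group = TaskGroup::MakeThreaded(GetCpuThreadPool());
+///   group->Append([] { return Status::Invalid("boom"); });
+///   // Once that failure has been recorded, later Append calls become no-ops:
+///   group->Append([] { return Status::OK(); });
+///   Status st = group->Finish();  // returns the Invalid status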
+/// +/// If the task group is parallel it is possible that multiple tasks could be +/// running at the same time and one of those tasks fails. This will put the +/// task group in a failure state (so additional tasks cannot be run) however +/// it will not interrupt running tasks. Finish and FinishAsync will not complete +/// until all running tasks have finished, even if one task fails. +/// +/// If you wish to nest task groups then you can do so by obtaining a Future +/// from your child task groups (using FinishAsync) and adding this future +/// as a task to your parent task group. +/// +/// Once a task group has finished new tasks should not be added to it. This +/// will lead to a runtime error. If you need to start a new batch of work then +/// you should create a new task group. Keep in mind that you can nest task groups. class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this { public: /// Add a Status-returning function to execute. Execution order is @@ -47,12 +63,28 @@ class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this { return AppendReal(std::forward(func)); } + /// Adds a future to the list of tasks to wait for. + template + void Append(const Future& future) { + return AppendReal(Future<>(future)); + } + /// Wait for execution of all tasks (and subgroups) to be finished, /// or for at least one task (or subgroup) to error out. /// The returned Status propagates the error status of the first failing /// task (or subgroup). virtual Status Finish() = 0; + /// Returns a future that will complete when all tasks in the task group + /// are finished. + /// + /// If any task fails then this future will be marked failed with the failing + /// status. However, the future will not be marked complete until all other + /// running tasks have been completed. Tasks that have not started running will + /// not start once the task group enters an error state so this possibility of + /// "other tasks" only exists with a threaded task group. + virtual Future<> FinishAsync() = 0; + /// The current aggregate error Status. Non-blocking, useful for stopping early. virtual Status current_status() = 0; @@ -63,14 +95,6 @@ class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this { /// This is only a hint, useful for testing or debugging. virtual int parallelism() = 0; - /// Create a subgroup of this group. This group can only finish - /// when all subgroups have finished (this means you must be - /// be careful to call Finish() on subgroups before calling it - /// on the main group). - // XXX if a subgroup errors out, should it propagate immediately to the parent - // and to children? 
- virtual std::shared_ptr MakeSubGroup() = 0; - static std::shared_ptr MakeSerial(); static std::shared_ptr MakeThreaded(internal::Executor*); @@ -81,6 +105,7 @@ class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this { ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup); virtual void AppendReal(std::function task) = 0; + virtual void AppendReal(Future<> task) = 0; }; } // namespace internal diff --git a/cpp/src/arrow/util/task_group_test.cc b/cpp/src/arrow/util/task_group_test.cc index 58170df9fb4..6dbbc2ff248 100644 --- a/cpp/src/arrow/util/task_group_test.cc +++ b/cpp/src/arrow/util/task_group_test.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -76,20 +77,27 @@ void TestTaskGroupErrors(std::shared_ptr task_group) { std::atomic count(0); - for (int i = 0; i < NSUCCESSES; ++i) { - task_group->Append([&]() { - count++; - return Status::OK(); - }); - } - ASSERT_TRUE(task_group->ok()); - for (int i = 0; i < NERRORS; ++i) { - task_group->Append([&]() { - SleepFor(1e-2); - count++; - return Status::Invalid("some message"); - }); - } + auto task_group_was_ok = true; + task_group->Append([&]() -> Status { + for (int i = 0; i < NSUCCESSES; ++i) { + task_group->Append([&]() { + count++; + return Status::OK(); + }); + } + task_group_was_ok = task_group->ok(); + for (int i = 0; i < NERRORS; ++i) { + task_group->Append([&]() { + SleepFor(1e-2); + count++; + return Status::Invalid("some message"); + }); + } + + return Status::OK(); + }); + + ASSERT_TRUE(task_group_was_ok); // Task error is propagated ASSERT_RAISES(Invalid, task_group->Finish()); @@ -108,78 +116,105 @@ void TestTaskGroupErrors(std::shared_ptr task_group) { // Check TaskGroup behaviour with a bunch of all-successful tasks and task groups void TestTaskSubGroupsSuccess(std::shared_ptr task_group) { - const int NTASKS = 50; - const int NGROUPS = 7; + // const int NTASKS = 50; + // const int NGROUPS = 7; + + // auto sleeps = RandomSleepDurations(NTASKS, 1e-4, 1e-3); + // std::vector> groups = {task_group}; + + // // Create some subgroups + // for (int i = 0; i < NGROUPS - 1; ++i) { + // groups.push_back(task_group->MakeSubGroup()); + // } + + // // Add NTASKS sleeps amongst all groups + // std::atomic count(0); + // for (int i = 0; i < NTASKS; ++i) { + // groups[i % NGROUPS]->Append([&, i]() { + // SleepFor(sleeps[i]); + // count += i; + // return Status::OK(); + // }); + // } + // ASSERT_TRUE(task_group->ok()); + + // // Finish all subgroups first, then main group + // for (int i = NGROUPS - 1; i >= 0; --i) { + // ASSERT_OK(groups[i]->Finish()); + // } + // ASSERT_TRUE(task_group->ok()); + // ASSERT_EQ(count.load(), NTASKS * (NTASKS - 1) / 2); + // // Finish() is idempotent + // ASSERT_OK(task_group->Finish()); +} - auto sleeps = RandomSleepDurations(NTASKS, 1e-4, 1e-3); - std::vector> groups = {task_group}; +// Check TaskGroup behaviour with both successful and failing tasks and task groups +void TestTaskSubGroupsErrors(std::shared_ptr task_group) { + // const int NTASKS = 50; + // const int NGROUPS = 7; + // const int FAIL_EVERY = 17; + // std::vector> groups = {task_group}; + + // // Create some subgroups + // for (int i = 0; i < NGROUPS - 1; ++i) { + // groups.push_back(task_group->MakeSubGroup()); + // } + + // // Add NTASKS sleeps amongst all groups + // for (int i = 0; i < NTASKS; ++i) { + // groups[i % NGROUPS]->Append([&, i]() { + // SleepFor(1e-3); + // // As NGROUPS > NTASKS / FAIL_EVERY, some subgroups are successful + // if (i % FAIL_EVERY == 0) { + // return Status::Invalid("some message"); + 
// } else { + // return Status::OK(); + // } + // }); + // } + + // // Finish all subgroups first, then main group + // int nsuccessful = 0; + // for (int i = NGROUPS - 1; i > 0; --i) { + // Status st = groups[i]->Finish(); + // if (st.ok()) { + // ++nsuccessful; + // } else { + // ASSERT_RAISES(Invalid, st); + // } + // } + // ASSERT_RAISES(Invalid, task_group->Finish()); + // ASSERT_FALSE(task_group->ok()); + // // Finish() is idempotent + // ASSERT_RAISES(Invalid, task_group->Finish()); +} - // Create some subgroups - for (int i = 0; i < NGROUPS - 1; ++i) { - groups.push_back(task_group->MakeSubGroup()); - } +class CopyCountingTask { + public: + explicit CopyCountingTask(std::shared_ptr target) + : counter(0), target(std::move(target)) {} - // Add NTASKS sleeps amongst all groups - std::atomic count(0); - for (int i = 0; i < NTASKS; ++i) { - groups[i % NGROUPS]->Append([&, i]() { - SleepFor(sleeps[i]); - count += i; - return Status::OK(); - }); - } - ASSERT_TRUE(task_group->ok()); + CopyCountingTask(const CopyCountingTask& other) + : counter(other.counter + 1), target(other.target) {} - // Finish all subgroups first, then main group - for (int i = NGROUPS - 1; i >= 0; --i) { - ASSERT_OK(groups[i]->Finish()); + CopyCountingTask& operator=(const CopyCountingTask& other) { + counter = other.counter + 1; + target = other.target; + return *this; } - ASSERT_TRUE(task_group->ok()); - ASSERT_EQ(count.load(), NTASKS * (NTASKS - 1) / 2); - // Finish() is idempotent - ASSERT_OK(task_group->Finish()); -} -// Check TaskGroup behaviour with both successful and failing tasks and task groups -void TestTaskSubGroupsErrors(std::shared_ptr task_group) { - const int NTASKS = 50; - const int NGROUPS = 7; - const int FAIL_EVERY = 17; - std::vector> groups = {task_group}; - - // Create some subgroups - for (int i = 0; i < NGROUPS - 1; ++i) { - groups.push_back(task_group->MakeSubGroup()); - } + CopyCountingTask(CopyCountingTask&& other) = default; + CopyCountingTask& operator=(CopyCountingTask&& other) = default; - // Add NTASKS sleeps amongst all groups - for (int i = 0; i < NTASKS; ++i) { - groups[i % NGROUPS]->Append([&, i]() { - SleepFor(1e-3); - // As NGROUPS > NTASKS / FAIL_EVERY, some subgroups are successful - if (i % FAIL_EVERY == 0) { - return Status::Invalid("some message"); - } else { - return Status::OK(); - } - }); + Status operator()() { + *target = counter; + return Status::OK(); } - // Finish all subgroups first, then main group - int nsuccessful = 0; - for (int i = NGROUPS - 1; i > 0; --i) { - Status st = groups[i]->Finish(); - if (st.ok()) { - ++nsuccessful; - } else { - ASSERT_RAISES(Invalid, st); - } - } - ASSERT_RAISES(Invalid, task_group->Finish()); - ASSERT_FALSE(task_group->ok()); - // Finish() is idempotent - ASSERT_RAISES(Invalid, task_group->Finish()); -} + private: + uint8_t counter; + std::shared_ptr target; +}; // Check TaskGroup behaviour with tasks spawning other tasks void TestTasksSpawnTasks(std::shared_ptr task_group) { @@ -276,6 +311,14 @@ void StressFailingTaskGroupLifetime(std::function()> } } +void TestNoCopyTask(std::shared_ptr task_group) { + auto counter = std::make_shared(0); + CopyCountingTask task(counter); + task_group->Append(std::move(task)); + ASSERT_OK(task_group->Finish()); + ASSERT_EQ(0, *counter); +} + TEST(SerialTaskGroup, Success) { TestTaskGroupSuccess(TaskGroup::MakeSerial()); } TEST(SerialTaskGroup, Errors) { TestTaskGroupErrors(TaskGroup::MakeSerial()); } @@ -290,6 +333,8 @@ TEST(SerialTaskGroup, SubGroupsErrors) { 
TestTaskSubGroupsErrors(TaskGroup::MakeSerial()); } +TEST(SerialTaskGroup, NoCopyTask) { TestNoCopyTask(TaskGroup::MakeSerial()); } + TEST(ThreadedTaskGroup, Success) { auto task_group = TaskGroup::MakeThreaded(GetCpuThreadPool()); TestTaskGroupSuccess(task_group); @@ -316,6 +361,12 @@ TEST(ThreadedTaskGroup, SubGroupsSuccess) { TestTaskSubGroupsSuccess(TaskGroup::MakeThreaded(thread_pool.get())); } +TEST(ThreadedTaskGroup, NoCopyTask) { + std::shared_ptr thread_pool; + ASSERT_OK_AND_ASSIGN(thread_pool, ThreadPool::Make(4)); + TestNoCopyTask(TaskGroup::MakeThreaded(thread_pool.get())); +} + TEST(ThreadedTaskGroup, SubGroupsErrors) { std::shared_ptr thread_pool; ASSERT_OK_AND_ASSIGN(thread_pool, ThreadPool::Make(4)); diff --git a/cpp/src/arrow/util/thread_pool.h b/cpp/src/arrow/util/thread_pool.h index 45dc1ca7145..c6edcb1c692 100644 --- a/cpp/src/arrow/util/thread_pool.h +++ b/cpp/src/arrow/util/thread_pool.h @@ -64,15 +64,40 @@ namespace detail { // Make sure that both functions returning T and Result can be called // through Executor::Submit(), and that a Future is returned for both. template -struct ExecutorResultTraits { - using ValueType = T; +struct ExecutorSubmitTraits { + using FutureType = Future; }; - template -struct ExecutorResultTraits> { - using ValueType = T; +struct ExecutorSubmitTraits> { + using FutureType = Future; +}; + +// Make sure that both functions returning Status and void can be called +// through Executor::Submit(), and that a Future<> is returned for both. +template <> +struct ExecutorSubmitTraits { + using FutureType = Future<>; +}; +template <> +struct ExecutorSubmitTraits { + using FutureType = Future<>; }; +template +typename std::enable_if< + !std::is_same::type, void>::value>::type +ExecuteAndMarkFinished(Future* fut, F&& f) { + fut->MarkFinished(std::forward(f)()); +} + +template +typename std::enable_if< + std::is_same::type, void>::value>::type +ExecuteAndMarkFinished(Future<>* fut, F&& f) { + std::forward(f)(); + fut->MarkFinished(); +} + } // namespace detail // Hints about a task that may be used by an Executor. @@ -100,7 +125,23 @@ class ARROW_EXPORT Executor { template Status Spawn(TaskHints hints, Function&& func) { - return SpawnReal(std::move(hints), std::forward(func)); + return SpawnReal(hints, std::forward(func)); + } + + /// \brief Returns a future that will be completed when the base future completes but + /// continuations will run on this executor + template + Result> Transfer(Future& future) { + auto transferred_future = Future::Make(); + future.Then([this, transferred_future](const Result& result) mutable { + auto submit_status = Submit([transferred_future, result]() mutable { + transferred_future.MarkFinished(result); + }); + if (!submit_status.ok()) { + transferred_future.MarkFinished(submit_status.status()); + } + }); + return transferred_future; } // Submit a callable and arguments for execution. Return a future that @@ -108,43 +149,32 @@ class ARROW_EXPORT Executor { // The callable's arguments are copied before execution. template < typename Function, typename... Args, - typename FunctionRetType = typename std::result_of::type, - typename RT = typename detail::ExecutorResultTraits, - typename ValueType = typename RT::ValueType> - Result> Submit(Function&& func, Args&&... args) { - return Submit(TaskHints{}, std::forward(func), std::forward(args)...); - } - - template < - typename Function, typename... 
Args, - typename FunctionRetType = typename std::result_of::type, - typename RT = typename detail::ExecutorResultTraits, - typename ValueType = typename RT::ValueType> - Result> Submit(TaskHints hints, Function&& func, Args&&... args) { + typename ReturnType = typename std::result_of::type, + typename FutureType = typename detail::ExecutorSubmitTraits::FutureType> + Result Submit(TaskHints hints, Function&& func, Args&&... args) { auto bound_func = std::bind(std::forward(func), std::forward(args)...); using BoundFuncType = decltype(bound_func); struct Task { BoundFuncType bound_func; - Future future; + FutureType future; - void operator()() { future.ExecuteAndMarkFinished(std::move(bound_func)); } + void operator()() { + detail::ExecuteAndMarkFinished(&future, std::move(bound_func)); + } }; - auto future = Future::Make(); - ARROW_RETURN_NOT_OK(SpawnReal(std::move(hints), Task{std::move(bound_func), future})); + auto future = FutureType::Make(); + ARROW_RETURN_NOT_OK(SpawnReal(hints, Task{std::move(bound_func), future})); return future; } - // Like Submit(), but also returns a (failed) Future when submission fails template < typename Function, typename... Args, - typename FunctionRetType = typename std::result_of::type, - typename RT = typename detail::ExecutorResultTraits, - typename ValueType = typename RT::ValueType> - Future SubmitAsFuture(Function&& func, Args&&... args) { - return Future::DeferNotOk( - Submit(std::forward(func), std::forward(args)...)); + typename ReturnType = typename std::result_of::type, + typename FutureType = typename detail::ExecutorSubmitTraits::FutureType> + Result Submit(Function&& func, Args&&... args) { + return Submit(TaskHints{}, std::forward(func), std::forward(args)...); } // Return the level of parallelism (the number of tasks that may be executed @@ -172,7 +202,7 @@ class ARROW_EXPORT ThreadPool : public Executor { static Result> MakeEternal(int threads); // Destroy thread pool; the pool will first be shut down - ~ThreadPool(); + ~ThreadPool() override; // Return the desired number of worker threads. 
// The actual number of workers may lag a bit before being adjusted to diff --git a/cpp/src/arrow/util/thread_pool_benchmark.cc b/cpp/src/arrow/util/thread_pool_benchmark.cc index 15197235cf3..2b431e5b5c2 100644 --- a/cpp/src/arrow/util/thread_pool_benchmark.cc +++ b/cpp/src/arrow/util/thread_pool_benchmark.cc @@ -136,10 +136,13 @@ static void ThreadedTaskGroup(benchmark::State& state) { for (auto _ : state) { auto task_group = TaskGroup::MakeThreaded(pool.get()); - for (int32_t i = 0; i < nspawns; ++i) { - // Pass the task by reference to avoid copying it around - task_group->Append(std::ref(task)); - } + task_group->Append([&task, nspawns, task_group] { + for (int32_t i = 0; i < nspawns; ++i) { + // Pass the task by reference to avoid copying it around + task_group->Append(std::ref(task)); + } + return Status::OK(); + }); ABORT_NOT_OK(task_group->Finish()); } ABORT_NOT_OK(pool->Shutdown(true /* wait */)); @@ -147,6 +150,38 @@ static void ThreadedTaskGroup(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * nspawns); } +// Benchmark threaded TaskGroup using futures +static void FutureTaskGroup(benchmark::State& state) { + const auto nthreads = static_cast(state.range(0)); + const auto workload_size = static_cast(state.range(1)); + + std::shared_ptr pool; + pool = *ThreadPool::Make(nthreads); + + Task task(workload_size); + + const int32_t nspawns = 10000000 / workload_size + 1; + + for (auto _ : state) { + auto task_group = TaskGroup::MakeThreaded(pool.get()); + task_group->Append([&task, pool, nspawns, task_group] { + for (int32_t i = 0; i < nspawns; ++i) { + // Pass the task by reference to avoid copying it around + auto future_result = pool->Submit(std::ref(task)); + ABORT_NOT_OK(future_result); + task_group->Append(*future_result); + } + return Status::OK(); + }); + auto final_future = task_group->FinishAsync(); + final_future.Wait(); + ABORT_NOT_OK(final_future.status()); + } + ABORT_NOT_OK(pool->Shutdown(true /* wait */)); + + state.SetItemsProcessed(state.iterations() * nspawns); +} + static const int32_t kWorkloadSizes[] = {1000, 10000, 100000}; static void WorkloadCost_Customize(benchmark::internal::Benchmark* b) { @@ -190,6 +225,7 @@ BENCHMARK(ReferenceWorkloadCost)->Apply(WorkloadCost_Customize); BENCHMARK(SerialTaskGroup)->Apply(WorkloadCost_Customize); BENCHMARK(ThreadPoolSpawn)->Apply(ThreadPoolSpawn_Customize); BENCHMARK(ThreadedTaskGroup)->Apply(ThreadPoolSpawn_Customize); +BENCHMARK(FutureTaskGroup)->Apply(ThreadPoolSpawn_Customize); } // namespace internal } // namespace arrow diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 5072d29334c..260999922f5 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -796,7 +796,7 @@ bool PlasmaClient::Impl::ComputeObjectHashParallel(XXH64_state_t* hash_state, // | num_threads * chunk_size | suffix |, where chunk_size = k * block_size. // Each thread gets a "chunk" of k blocks, except the suffix thread. - std::vector> futures; + std::vector> futures; for (int i = 0; i < num_threads; i++) { futures.push_back(*pool->Submit( ComputeBlockHash, reinterpret_cast(data_address) + i * chunk_size,