diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0367e3fedc2..0e5365a508b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -267,7 +267,7 @@ if((ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION) AND NOT ARROW_JSON) message(FATAL_ERROR "JSON parsing of arrays is required for Arrow tests") endif() -if(ARROW_FLIGHT OR ARROW_BUILD_TESTS) +if(ARROW_FLIGHT OR ARROW_PARQUET OR ARROW_BUILD_TESTS) set(ARROW_IPC ON) endif() diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 6c0e651efcb..d5f1857516c 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -202,6 +202,9 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { template enable_if_memoize InsertValues(const DType&, const ArrayType& array) { + if (array.null_count() > 0) { + return Status::Invalid("Cannot insert dictionary values containing nulls"); + } for (int64_t i = 0; i < array.length(); ++i) { ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i))); } diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index e70827ed380..3b1fece7127 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -24,6 +24,7 @@ #include #include +#include "arrow/ipc/dictionary.h" // IWYU pragma: export #include "arrow/ipc/message.h" #include "arrow/ipc/options.h" #include "arrow/result.h" @@ -49,8 +50,6 @@ class OutputStream; namespace ipc { -class DictionaryMemo; - /// \class RecordBatchWriter /// \brief Abstract interface for writing a stream of record batches class ARROW_EXPORT RecordBatchWriter { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 88bd5470a80..1daaa91de38 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -536,6 +536,21 @@ Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) { return Status::OK(); } +Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options, + std::ostream* sink) { + for (int i = 0; i < batch.num_columns(); ++i) { + const std::string& name = batch.column_name(i); + PrettyPrintOptions column_options = options; + column_options.indent += 2; + + (*sink) << name << ": "; + RETURN_NOT_OK(PrettyPrint(*batch.column(i), column_options, sink)); + (*sink) << "\n"; + } + (*sink) << std::flush; + return Status::OK(); +} + Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, std::ostream* sink) { RETURN_NOT_OK(PrettyPrint(*table.schema(), options, sink)); diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 5740341a67d..70caa8cfa87 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -61,6 +61,10 @@ struct PrettyPrintOptions { ARROW_EXPORT Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink); +ARROW_EXPORT +Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options, + std::ostream* sink); + /// \brief Print human-readable representation of Table ARROW_EXPORT Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 49447642bec..3928b07aa6f 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,7 +49,8 @@ static void PrintChunkedArray(const ChunkedArray& carr, std::stringstream* ss) { for (int i = 0; i < carr.num_chunks(); ++i) { auto c1 = carr.chunk(i); *ss << "Chunk " << i << std::endl; - 
ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, 0, ss)); + ::arrow::PrettyPrintOptions options(/*indent=*/2); + ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, options, ss)); *ss << std::endl; } } @@ -59,15 +60,25 @@ void AssertTsEqual(const T& expected, const T& actual) { if (!expected.Equals(actual)) { std::stringstream pp_expected; std::stringstream pp_actual; - ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected)); - ARROW_EXPECT_OK(PrettyPrint(actual, 0, &pp_actual)); + ::arrow::PrettyPrintOptions options(/*indent=*/2); + options.window = 50; + ARROW_EXPECT_OK(PrettyPrint(expected, options, &pp_expected)); + ARROW_EXPECT_OK(PrettyPrint(actual, options, &pp_actual)); FAIL() << "Got: \n" << pp_actual.str() << "\nExpected: \n" << pp_expected.str(); } } -void AssertArraysEqual(const Array& expected, const Array& actual) { +void AssertArraysEqual(const Array& expected, const Array& actual, bool verbose) { std::stringstream diff; if (!expected.Equals(actual, EqualOptions().diff_sink(&diff))) { + if (verbose) { + ::arrow::PrettyPrintOptions options(/*indent=*/2); + options.window = 50; + diff << "Expected:\n"; + ARROW_EXPECT_OK(PrettyPrint(expected, options, &diff)); + diff << "\nActual:\n"; + ARROW_EXPECT_OK(PrettyPrint(actual, options, &diff)); + } FAIL() << diff.str(); } } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index f378b808c54..e070dc7d612 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -140,7 +140,9 @@ using ArrayVector = std::vector>; #define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs)) #define ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs)) -ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual); +// If verbose is true, then the arrays will be pretty printed +ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual, + bool verbose = false); ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual); ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& expected, diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 1d50bcb9220..b4b6d5ec6a9 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -343,9 +343,11 @@ void WriteTableToBuffer(const std::shared_ptr& table, int64_t row_group_s const std::shared_ptr& arrow_properties, std::shared_ptr* out) { auto sink = CreateOutputStream(); + + auto write_props = WriterProperties::Builder().write_batch_size(100)->build(); + ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, default_writer_properties(), - arrow_properties)); + row_group_size, write_props, arrow_properties)); ASSERT_OK_NO_THROW(sink->Finish(out)); } @@ -368,37 +370,39 @@ void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual } } -void DoConfiguredRoundtrip( - const std::shared_ptr
<Table>& table, int64_t row_group_size, - std::shared_ptr<Table>
* out, - const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = - ::parquet::default_writer_properties(), - const std::shared_ptr<ArrowWriterProperties>& arrow_properties = - default_arrow_writer_properties()) { +void DoRoundtrip(const std::shared_ptr<Table>
& table, int64_t row_group_size, + std::shared_ptr<Table>
* out, + const std::shared_ptr<::parquet::WriterProperties>& writer_properties = + ::parquet::default_writer_properties(), + const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties = + default_arrow_writer_properties(), + const ArrowReaderProperties& arrow_reader_properties = + default_arrow_reader_properties()) { std::shared_ptr<Buffer> buffer; auto sink = CreateOutputStream(); ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, parquet_properties, arrow_properties)); + row_group_size, writer_properties, + arrow_writer_properties)); ASSERT_OK_NO_THROW(sink->Finish(&buffer)); std::unique_ptr<FileReader> reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer), - ::arrow::default_memory_pool(), &reader)); + FileReaderBuilder builder; + ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer))); + ASSERT_OK(builder.properties(arrow_reader_properties)->Build(&reader)); ASSERT_OK_NO_THROW(reader->ReadTable(out)); } void CheckConfiguredRoundtrip( const std::shared_ptr<Table>
& input_table, const std::shared_ptr<Table>
& expected_table = nullptr, - const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = + const std::shared_ptr<::parquet::WriterProperties>& writer_properties = ::parquet::default_writer_properties(), - const std::shared_ptr<ArrowWriterProperties>& arrow_properties = + const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties = default_arrow_writer_properties()) { std::shared_ptr<Table>
actual_table; - ASSERT_NO_FATAL_FAILURE(DoConfiguredRoundtrip(input_table, input_table->num_rows(), - &actual_table, parquet_properties, - arrow_properties)); + ASSERT_NO_FATAL_FAILURE(DoRoundtrip(input_table, input_table->num_rows(), &actual_table, + writer_properties, arrow_writer_properties)); if (expected_table) { ASSERT_NO_FATAL_FAILURE( ::arrow::AssertSchemaEqual(*actual_table->schema(), *expected_table->schema())); @@ -439,9 +443,8 @@ void CheckSimpleRoundtrip(const std::shared_ptr<Table>
& table, int64_t row_group std::shared_ptr<Table>
result; DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result, arrow_properties); - ASSERT_NO_FATAL_FAILURE( - ::arrow::AssertSchemaEqual(*table->schema(), *result->schema())); - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result, false)); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema()); + ::arrow::AssertTablesEqual(*table, *result, false); } static std::shared_ptr<GroupNode> MakeSimpleSchema(const DataType& type, @@ -751,8 +754,8 @@ TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) { TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) { std::shared_ptr<Table>
table; - ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table)); - ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); + this->PrepareListTable(SMALL_SIZE, true, true, 10, &table); + this->CheckRoundTrip(table); } TYPED_TEST(TestParquetIO, SingleRequiredListNullableColumnReadWrite) { @@ -1169,8 +1172,12 @@ TEST_F(TestNullParquetIO, NullListColumn) { } TEST_F(TestNullParquetIO, NullDictionaryColumn) { + std::shared_ptr null_bitmap; + ASSERT_OK(::arrow::AllocateEmptyBitmap(::arrow::default_memory_pool(), SMALL_SIZE, + &null_bitmap)); + std::shared_ptr indices = - std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, nullptr, SMALL_SIZE); + std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, null_bitmap, SMALL_SIZE); std::shared_ptr<::arrow::DictionaryType> dict_type = std::make_shared<::arrow::DictionaryType>(::arrow::int8(), ::arrow::null()); @@ -2803,7 +2810,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { ::arrow::AssertTablesEqual(expected, *actual, /*same_chunk_layout=*/false); } - static std::vector null_probabilites() { return {0.0, 0.5, 1}; } + static std::vector null_probabilities() { return {0.0, 0.5, 1}; } protected: std::shared_ptr dense_values_; @@ -2813,7 +2820,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { ArrowReaderProperties properties_; }; -void AsDictionaryEncoded(const Array& arr, std::shared_ptr* out) { +void AsDictionary32Encoded(const Array& arr, std::shared_ptr* out) { ::arrow::StringDictionary32Builder builder(default_memory_pool()); const auto& string_array = static_cast(arr); ASSERT_OK(builder.AppendArray(string_array)); @@ -2826,7 +2833,7 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDict) { std::vector> chunks(kNumRowGroups); const int64_t chunk_size = expected_dense_->num_rows() / kNumRowGroups; for (int i = 0; i < kNumRowGroups; ++i) { - AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]); + AsDictionary32Encoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]); } auto ex_table = MakeSimpleTable(std::make_shared(chunks), /*nullable=*/true); @@ -2840,8 +2847,88 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDense) { INSTANTIATE_TEST_CASE_P( ReadDictionary, TestArrowReadDictionary, - ::testing::ValuesIn(TestArrowReadDictionary::null_probabilites())); + ::testing::ValuesIn(TestArrowReadDictionary::null_probabilities())); + +TEST(TestArrowWriteDictionaries, ChangingDictionaries) { + constexpr int num_unique = 50; + constexpr int repeat = 10000; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 20; + ::arrow::random::RandomArrayGenerator rag(0); + auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, /*null_probability=*/0.1); + auto expected = MakeSimpleTable(values, /*nullable=*/true); + + const int num_chunks = 10; + std::vector> chunks(num_chunks); + const int64_t chunk_size = values->length() / num_chunks; + for (int i = 0; i < num_chunks; ++i) { + AsDictionary32Encoded(*values->Slice(chunk_size * i, chunk_size), &chunks[i]); + } + + auto dict_table = MakeSimpleTable(std::make_shared(chunks), + /*nullable=*/true); + + std::shared_ptr
<Table> actual; + DoRoundtrip(dict_table, /*row_group_size=*/values->length() / 2, &actual); + ::arrow::AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false); +} + +TEST(TestArrowWriteDictionaries, AutoReadAsDictionary) { + constexpr int num_unique = 50; + constexpr int repeat = 100; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 20; + ::arrow::random::RandomArrayGenerator rag(0); + auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, /*null_probability=*/0.1); + std::shared_ptr<Array> dict_values; + AsDictionary32Encoded(*values, &dict_values); -} // namespace arrow + auto expected = MakeSimpleTable(dict_values, /*nullable=*/true); + auto expected_dense = MakeSimpleTable(values, /*nullable=*/true); + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + std::shared_ptr<Table>
actual, actual_dense; + + DoRoundtrip(expected, values->length(), &actual, default_writer_properties(), + props_store_schema); + ::arrow::AssertTablesEqual(*expected, *actual); + + auto props_no_store_schema = ArrowWriterProperties::Builder().build(); + DoRoundtrip(expected, values->length(), &actual_dense, default_writer_properties(), + props_no_store_schema); + ::arrow::AssertTablesEqual(*expected_dense, *actual_dense); +} + +TEST(TestArrowWriteDictionaries, NestedSubfield) { + // ARROW-3246: Automatic decoding of dictionary subfields left as followup + // work + auto offsets = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 2, 3]"); + auto indices = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 0]"); + auto dict = ::arrow::ArrayFromJSON(::arrow::utf8(), "[\"foo\"]"); + + std::shared_ptr dict_values, values; + auto dict_ty = ::arrow::dictionary(::arrow::int32(), ::arrow::utf8()); + ASSERT_OK(::arrow::DictionaryArray::FromArrays(dict_ty, indices, dict, &dict_values)); + ASSERT_OK(::arrow::ListArray::FromArrays(*offsets, *dict_values, + ::arrow::default_memory_pool(), &values)); + + auto dense_ty = ::arrow::list(::arrow::utf8()); + auto dense_values = + ::arrow::ArrayFromJSON(dense_ty, "[[], [\"foo\", \"foo\"], [\"foo\"]]"); + + auto table = MakeSimpleTable(values, /*nullable=*/true); + auto expected_table = MakeSimpleTable(dense_values, /*nullable=*/true); + + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + std::shared_ptr
actual; + DoRoundtrip(table, values->length(), &actual, default_writer_properties(), + props_store_schema); + + // The nested subfield is not automatically decoded to dictionary + ::arrow::AssertTablesEqual(*expected_table, *actual); +} + +} // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 3d1ad76c8f0..4451276d6b3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -106,8 +106,9 @@ class FileReaderImpl : public FileReader { : pool_(pool), reader_(std::move(reader)), reader_properties_(properties) {} Status Init() { - return BuildSchemaManifest(reader_->metadata()->schema(), reader_properties_, - &manifest_); + return BuildSchemaManifest(reader_->metadata()->schema(), + reader_->metadata()->key_value_metadata(), + reader_properties_, &manifest_); } std::vector AllRowGroups() { @@ -777,7 +778,7 @@ Status FileReaderImpl::ReadRowGroups(const std::vector& row_groups, } } - auto result_schema = ::arrow::schema(fields, reader_->metadata()->key_value_metadata()); + auto result_schema = ::arrow::schema(fields, manifest_.schema_metadata); *out = Table::Make(result_schema, columns); return (*out)->Validate(); END_PARQUET_CATCH_EXCEPTIONS diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index bfc40940e3b..649f73f76be 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,8 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/compute/kernel.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" @@ -491,11 +494,13 @@ Status GroupToSchemaField(const GroupNode& node, int16_t max_def_level, Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_rep_level, SchemaTreeContext* ctx, const SchemaField* parent, SchemaField* out) { + /// Workhorse function for converting a Parquet schema node to an Arrow + /// type. Handles different conventions for nested data if (node.is_optional()) { ++max_def_level; } else if (node.is_repeated()) { - // Repeated fields add a definition level. This is used to distinguish - // between an empty list and a list with an item in it. + // Repeated fields add both a repetition and definition level. This is used + // to distinguish between an empty list and a list with an item in it. ++max_rep_level; ++max_def_level; } @@ -504,9 +509,19 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re // Now, walk the schema and create a ColumnDescriptor for each leaf node if (node.is_group()) { + // A nested field, but we don't know what kind yet return GroupToSchemaField(static_cast(node), max_def_level, max_rep_level, ctx, parent, out); } else { + // Either a normal flat primitive type, or a list type encoded with 1-level + // list encoding. 
Note that the 3-level encoding is the form recommended by + // the parquet specification, but technically we can have either + // + // required/optional $TYPE $FIELD_NAME + // + // or + // + // repeated $TYPE $FIELD_NAME const auto& primitive_node = static_cast(node); int column_index = ctx->schema->GetColumnIndex(primitive_node); std::shared_ptr type; @@ -526,6 +541,7 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re out->max_repetition_level = max_rep_level; return Status::OK(); } else { + // A normal (required/optional) primitive node return PopulateLeaf(column_index, ::arrow::field(node.name(), type, node.is_optional()), max_def_level, max_rep_level, ctx, parent, out); @@ -533,9 +549,56 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re } } +Status GetOriginSchema(const std::shared_ptr& metadata, + std::shared_ptr* clean_metadata, + std::shared_ptr<::arrow::Schema>* out) { + if (metadata == nullptr) { + *out = nullptr; + *clean_metadata = nullptr; + return Status::OK(); + } + + static const std::string kArrowSchemaKey = "ARROW:schema"; + int schema_index = metadata->FindKey(kArrowSchemaKey); + if (schema_index == -1) { + *out = nullptr; + *clean_metadata = metadata; + return Status::OK(); + } + + // The original Arrow schema was serialized using the store_schema option. We + // deserialize it here and use it to inform read options such as + // dictionary-encoded fields + auto schema_buf = std::make_shared(metadata->value(schema_index)); + + ::arrow::ipc::DictionaryMemo dict_memo; + ::arrow::io::BufferReader input(schema_buf); + RETURN_NOT_OK(::arrow::ipc::ReadSchema(&input, &dict_memo, out)); + + if (metadata->size() > 1) { + // Copy the metadata without the schema key + auto new_metadata = ::arrow::key_value_metadata({}, {}); + new_metadata->reserve(metadata->size() - 1); + for (int64_t i = 0; i < metadata->size(); ++i) { + if (i == schema_index) continue; + new_metadata->Append(metadata->key(i), metadata->value(i)); + } + *clean_metadata = new_metadata; + } else { + // No other keys, let metadata be null + *clean_metadata = nullptr; + } + return Status::OK(); +} + Status BuildSchemaManifest(const SchemaDescriptor* schema, + const std::shared_ptr& metadata, const ArrowReaderProperties& properties, SchemaManifest* manifest) { + std::shared_ptr<::arrow::Schema> origin_schema; + RETURN_NOT_OK( + GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema)); + SchemaTreeContext ctx; ctx.manifest = manifest; ctx.properties = properties; @@ -544,8 +607,26 @@ Status BuildSchemaManifest(const SchemaDescriptor* schema, manifest->descr = schema; manifest->schema_fields.resize(schema_node.field_count()); for (int i = 0; i < static_cast(schema_node.field_count()); ++i) { + SchemaField* out_field = &manifest->schema_fields[i]; RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), 0, 0, &ctx, - /*parent=*/nullptr, &manifest->schema_fields[i])); + /*parent=*/nullptr, out_field)); + + // TODO(wesm): as follow up to ARROW-3246, we should really pass the origin + // schema (if any) through all functions in the schema reconstruction, but + // I'm being lazy and just setting dictionary fields at the top level for + // now + if (manifest->origin_schema == nullptr) { + continue; + } + auto origin_field = manifest->origin_schema->field(i); + auto current_type = out_field->field->type(); + if (origin_field->type()->id() != ::arrow::Type::DICTIONARY) { + continue; + } + if (current_type->id() != ::arrow::Type::DICTIONARY) { 
+ out_field->field = + out_field->field->WithType(::arrow::dictionary(::arrow::int32(), current_type)); + } } return Status::OK(); } @@ -555,7 +636,7 @@ Status FromParquetSchema( const std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out) { SchemaManifest manifest; - RETURN_NOT_OK(BuildSchemaManifest(schema, properties, &manifest)); + RETURN_NOT_OK(BuildSchemaManifest(schema, key_value_metadata, properties, &manifest)); std::vector> fields(manifest.schema_fields.size()); for (int i = 0; i < static_cast(fields.size()); i++) { fields[i] = manifest.schema_fields[i].field; diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index 4568e421474..d8f08524681 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -38,6 +38,8 @@ class Array; class ChunkedArray; class DataType; class Field; +class KeyValueMetadata; +class Schema; } // namespace arrow @@ -138,6 +140,8 @@ struct PARQUET_EXPORT SchemaField { struct SchemaManifest { const SchemaDescriptor* descr; + std::shared_ptr<::arrow::Schema> origin_schema; + std::shared_ptr schema_metadata; std::vector schema_fields; std::unordered_map column_index_to_field; @@ -185,6 +189,7 @@ struct SchemaManifest { PARQUET_EXPORT Status BuildSchemaManifest(const SchemaDescriptor* schema, + const std::shared_ptr& metadata, const ArrowReaderProperties& properties, SchemaManifest* manifest); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index fb437f14320..0d13528d5f9 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -19,13 +19,14 @@ #include #include +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer-builder.h" -#include "arrow/compute/api.h" +#include "arrow/ipc/writer.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/visitor_inline.h" @@ -43,6 +44,7 @@ using arrow::BinaryArray; using arrow::BooleanArray; using arrow::ChunkedArray; using arrow::Decimal128Array; +using arrow::DictionaryArray; using arrow::Field; using arrow::FixedSizeBinaryArray; using Int16BufferBuilder = arrow::TypedBufferBuilder; @@ -55,10 +57,6 @@ using arrow::Status; using arrow::Table; using arrow::TimeUnit; -using arrow::compute::Cast; -using arrow::compute::CastOptions; -using arrow::compute::FunctionContext; - using parquet::ParquetFileWriter; using parquet::ParquetVersion; using parquet::schema::GroupNode; @@ -89,6 +87,21 @@ class LevelBuilder { return Status::OK(); } + Status Visit(const DictionaryArray& array) { + // Only currently handle DictionaryArray where the dictionary is a + // primitive type + if (array.dict_type()->value_type()->num_children() > 0) { + return Status::NotImplemented( + "Writing DictionaryArray with nested dictionary " + "type not yet supported"); + } + array_offsets_.push_back(static_cast(array.offset())); + valid_bitmaps_.push_back(array.null_bitmap_data()); + null_counts_.push_back(array.null_count()); + values_array_ = std::make_shared(array.data()); + return Status::OK(); + } + Status Visit(const ListArray& array) { array_offsets_.push_back(static_cast(array.offset())); valid_bitmaps_.push_back(array.null_bitmap_data()); @@ -113,7 +126,6 @@ class LevelBuilder { NOT_IMPLEMENTED_VISIT(FixedSizeList) NOT_IMPLEMENTED_VISIT(Struct) NOT_IMPLEMENTED_VISIT(Union) - NOT_IMPLEMENTED_VISIT(Dictionary) NOT_IMPLEMENTED_VISIT(Extension) #undef NOT_IMPLEMENTED_VISIT @@ -411,8 +423,8 @@ class FileWriterImpl : public FileWriter { closed_(false) 
{} Status Init() { - return BuildSchemaManifest(writer_->schema(), default_arrow_reader_properties(), - &schema_manifest_); + return BuildSchemaManifest(writer_->schema(), /*schema_metadata=*/nullptr, + default_arrow_reader_properties(), &schema_manifest_); } Status NewRowGroup(int64_t chunk_size) override { @@ -444,28 +456,6 @@ class FileWriterImpl : public FileWriter { Status WriteColumnChunk(const std::shared_ptr& data, int64_t offset, int64_t size) override { - // DictionaryArrays are not yet handled with a fast path. To still support - // writing them as a workaround, we convert them back to their non-dictionary - // representation. - if (data->type()->id() == ::arrow::Type::DICTIONARY) { - const ::arrow::DictionaryType& dict_type = - static_cast(*data->type()); - - // TODO(ARROW-1648): Remove this special handling once we require an Arrow - // version that has this fixed. - if (dict_type.value_type()->id() == ::arrow::Type::NA) { - auto null_array = std::make_shared<::arrow::NullArray>(data->length()); - return WriteColumnChunk(*null_array); - } - - FunctionContext ctx(this->memory_pool()); - ::arrow::compute::Datum cast_input(data); - ::arrow::compute::Datum cast_output; - RETURN_NOT_OK( - Cast(&ctx, cast_input, dict_type.value_type(), CastOptions(), &cast_output)); - return WriteColumnChunk(cast_output.chunked_array(), offset, size); - } - ColumnWriter* column_writer; PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn()); @@ -563,6 +553,30 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool return Open(schema, pool, sink, properties, default_arrow_writer_properties(), writer); } +Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, + const ArrowWriterProperties& properties, + std::shared_ptr* out) { + if (!properties.store_schema()) { + *out = nullptr; + return Status::OK(); + } + + static const std::string kArrowSchemaKey = "ARROW:schema"; + std::shared_ptr result; + if (schema.metadata()) { + result = schema.metadata()->Copy(); + } else { + result = ::arrow::key_value_metadata({}, {}); + } + + ::arrow::ipc::DictionaryMemo dict_memo; + std::shared_ptr serialized; + RETURN_NOT_OK(::arrow::ipc::SerializeSchema(schema, &dict_memo, pool, &serialized)); + result->Append(kArrowSchemaKey, serialized->ToString()); + *out = result; + return Status::OK(); +} + Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, const std::shared_ptr<::arrow::io::OutputStream>& sink, const std::shared_ptr& properties, @@ -574,8 +588,11 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); + std::shared_ptr metadata; + RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata)); + std::unique_ptr base_writer = - ParquetFileWriter::Open(sink, schema_node, properties, schema.metadata()); + ParquetFileWriter::Open(sink, schema_node, properties, metadata); auto schema_ptr = std::make_shared<::arrow::Schema>(schema); return Make(pool, std::move(base_writer), schema_ptr, arrow_properties, writer); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 925a4ff80ab..0fd3a4c28dd 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1234,7 +1234,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, int64_t num_decoded = this->current_decoder_->DecodeArrow( static_cast(values_to_read), static_cast(null_count), 
valid_bits_->mutable_data(), values_written_, builder_.get()); - DCHECK_EQ(num_decoded, values_to_read); + DCHECK_EQ(num_decoded, values_to_read - null_count); ResetValues(); } @@ -1310,7 +1310,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, /// Flush values since they have been copied into the builder ResetValues(); } - DCHECK_EQ(num_decoded, values_to_read); + DCHECK_EQ(num_decoded, values_to_read - null_count); } private: diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index dd0d65aa5cd..fcc8344ac06 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -218,7 +218,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { void ReadAndCompare(Compression::type compression, int64_t num_rows) { this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - auto comparator = TypedComparator::Make(this->descr_); + auto comparator = MakeComparator(this->descr_); for (size_t i = 0; i < this->values_.size(); i++) { if (comparator->Compare(this->values_[i], this->values_out_[i]) || comparator->Compare(this->values_out_[i], this->values_[i])) { @@ -310,7 +310,7 @@ void TestPrimitiveWriter::ReadAndCompare(Compression::type compressio this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::SIGNED); for (size_t i = 0; i < this->values_.size(); i++) { if (comparator->Compare(this->values_[i], this->values_out_[i]) || comparator->Compare(this->values_out_[i], this->values_[i])) { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fa16234e6ec..052ca14967a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/buffer-builder.h" +#include "arrow/compute/api.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-stream-utils.h" @@ -46,11 +47,12 @@ namespace parquet { -using ::arrow::Status; -using ::arrow::internal::checked_cast; +using arrow::Status; +using arrow::compute::Datum; +using arrow::internal::checked_cast; -using BitWriter = ::arrow::BitUtil::BitWriter; -using RleEncoder = ::arrow::util::RleEncoder; +using BitWriter = arrow::BitUtil::BitWriter; +using RleEncoder = arrow::util::RleEncoder; LevelEncoder::LevelEncoder() {} LevelEncoder::~LevelEncoder() {} @@ -135,7 +137,7 @@ class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), pool_(pool), @@ -282,7 +284,7 @@ class SerializedPageWriter : public PageWriter { std::unique_ptr thrift_serializer_; // Compression codec to use. - std::unique_ptr<::arrow::util::Codec> compressor_; + std::unique_ptr compressor_; }; // This implementation of the PageWriter writes to the final sink on Close . 
@@ -290,7 +292,7 @@ class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr( @@ -332,7 +334,7 @@ class BufferedPageWriter : public PageWriter { private: std::shared_ptr final_sink_; ColumnChunkMetaDataBuilder* metadata_; - std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_; + std::shared_ptr in_memory_sink_; std::unique_ptr pager_; }; @@ -479,8 +481,8 @@ class ColumnWriterImpl { // Flag to infer if dictionary encoding has fallen back to PLAIN bool fallback_; - ::arrow::BufferBuilder definition_levels_sink_; - ::arrow::BufferBuilder repetition_levels_sink_; + arrow::BufferBuilder definition_levels_sink_; + arrow::BufferBuilder repetition_levels_sink_; std::shared_ptr definition_levels_rle_; std::shared_ptr repetition_levels_rle_; @@ -630,6 +632,50 @@ void ColumnWriterImpl::FlushBufferedDataPages() { // ---------------------------------------------------------------------- // TypedColumnWriter +template +inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) { + int64_t num_batches = static_cast(total / batch_size); + for (int round = 0; round < num_batches; round++) { + action(round * batch_size, batch_size); + } + // Write the remaining values + if (total % batch_size > 0) { + action(num_batches * batch_size, total % batch_size); + } +} + +bool DictionaryDirectWriteSupported(const arrow::Array& array) { + DCHECK_EQ(array.type_id(), arrow::Type::DICTIONARY); + const arrow::DictionaryType& dict_type = + static_cast(*array.type()); + auto id = dict_type.value_type()->id(); + return id == arrow::Type::BINARY || id == arrow::Type::STRING; +} + +Status ConvertDictionaryToDense(const arrow::Array& array, MemoryPool* pool, + std::shared_ptr* out) { + const arrow::DictionaryType& dict_type = + static_cast(*array.type()); + + // TODO(ARROW-1648): Remove this special handling once we require an Arrow + // version that has this fixed. 
+ if (dict_type.value_type()->id() == arrow::Type::NA) { + *out = std::make_shared(array.length()); + return Status::OK(); + } + + arrow::compute::FunctionContext ctx(pool); + Datum cast_output; + RETURN_NOT_OK(arrow::compute::Cast(&ctx, Datum(array.data()), dict_type.value_type(), + arrow::compute::CastOptions(), &cast_output)); + *out = cast_output.make_array(); + return Status::OK(); +} + +static inline bool IsDictionaryEncoding(Encoding::type encoding) { + return encoding == Encoding::PLAIN_DICTIONARY; +} + template class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter { public: @@ -645,23 +691,70 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< if (properties->statistics_enabled(descr_->path()) && (SortOrder::UNKNOWN != descr_->sort_order())) { - page_statistics_ = TypedStats::Make(descr_, allocator_); - chunk_statistics_ = TypedStats::Make(descr_, allocator_); + page_statistics_ = MakeStatistics(descr_, allocator_); + chunk_statistics_ = MakeStatistics(descr_, allocator_); } } int64_t Close() override { return ColumnWriterImpl::Close(); } void WriteBatch(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, const T* values) override; + const int16_t* rep_levels, const T* values) override { + // We check for DataPage limits only after we have inserted the values. If a user + // writes a large number of values, the DataPage size can be much above the limit. + // The purpose of this chunking is to bound this. Even if a user writes large number + // of values, the chunking will ensure the AddDataPage() is called at a reasonable + // pagesize limit + int64_t value_offset = 0; + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { + int64_t values_to_write = + WriteLevels(batch_size, def_levels + offset, rep_levels + offset); + // PARQUET-780 + if (values_to_write > 0) { + DCHECK_NE(nullptr, values); + } + WriteValues(values + value_offset, values_to_write, batch_size - values_to_write); + CommitWriteAndCheckPageLimit(batch_size, values_to_write); + value_offset += values_to_write; + + // Dictionary size checked separately from data page size since we + // circumvent this check when writing arrow::DictionaryArray directly + CheckDictionarySizeLimit(); + }; + DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); + } void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, - int64_t valid_bits_offset, const T* values) override; + int64_t valid_bits_offset, const T* values) override { + // Like WriteBatch, but for spaced values + int64_t value_offset = 0; + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + WriteValuesSpaced(values + value_offset, batch_num_values, batch_num_spaced_values, + valid_bits, valid_bits_offset + value_offset); + CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values); + value_offset += batch_num_spaced_values; + + // Dictionary size checked separately from data page size since we + // circumvent this check when writing arrow::DictionaryArray directly + CheckDictionarySizeLimit(); + }; + DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); + } Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, - int64_t num_levels, const ::arrow::Array& array, - 
ArrowWriteContext* context) override; + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* ctx) override { + if (array.type()->id() == arrow::Type::DICTIONARY) { + return WriteArrowDictionary(def_levels, rep_levels, num_levels, array, ctx); + } else { + return WriteArrowDense(def_levels, rep_levels, num_levels, array, ctx); + } + } int64_t EstimatedBufferedValueBytes() const override { return current_encoder_->EstimatedDataEncodedSize(); @@ -672,6 +765,17 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< return current_encoder_->FlushValues(); } + // Internal function to handle direct writing of arrow::DictionaryArray, + // since the standard logic concerning dictionary size limits and fallback to + // plain encoding is circumvented + Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* context); + + Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* context); + void WriteDictionaryPage() override { // We have to dynamic cast here because of TypedEncoder as // some compilers don't want to cast through virtual inheritance @@ -686,11 +790,6 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< total_bytes_written_ += pager_->WriteDictionaryPage(page); } - // Checks if the Dictionary Page size limit is reached - // If the limit is reached, the Dictionary and Data Pages are serialized - // The encoding is switched to PLAIN - void CheckDictionarySizeLimit(); - EncodedStatistics GetPageStatistics() override { EncodedStatistics result; if (page_statistics_) result = page_statistics_->Encode(); @@ -729,233 +828,239 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< std::shared_ptr page_statistics_; std::shared_ptr chunk_statistics_; - inline int64_t WriteMiniBatch(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, const T* values); - - inline int64_t WriteMiniBatchSpaced(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, - const uint8_t* valid_bits, - int64_t valid_bits_offset, const T* values, - int64_t* num_spaced_written); + // If writing a sequence of arrow::DictionaryArray to the writer, we keep the + // dictionary passed to DictEncoder::PutDictionary so we can check + // subsequent array chunks to see either if materialization is required (in + // which case we call back to the dense write path) + std::shared_ptr preserved_dictionary_; + + int64_t WriteLevels(int64_t num_values, const int16_t* def_levels, + const int16_t* rep_levels) { + int64_t values_to_write = 0; + // If the field is required and non-repeated, there are no definition levels + if (descr_->max_definition_level() > 0) { + for (int64_t i = 0; i < num_values; ++i) { + if (def_levels[i] == descr_->max_definition_level()) { + ++values_to_write; + } + } - // Write values to a temporary buffer before they are encoded into pages - void WriteValues(int64_t num_values, const T* values) { - dynamic_cast(current_encoder_.get()) - ->Put(values, static_cast(num_values)); - } + WriteDefinitionLevels(num_values, def_levels); + } else { + // Required field, write all values + values_to_write = num_values; + } - void WriteValuesSpaced(int64_t num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset, const T* values) { - dynamic_cast(current_encoder_.get()) - ->PutSpaced(values, 
static_cast(num_values), valid_bits, valid_bits_offset); - } -}; + // Not present for non-repeated fields + if (descr_->max_repetition_level() > 0) { + // A row could include more than one value + // Count the occasions where we start a new row + for (int64_t i = 0; i < num_values; ++i) { + if (rep_levels[i] == 0) { + rows_written_++; + } + } -// Only one Dictionary Page is written. -// Fallback to PLAIN if dictionary page limit is reached. -template -void TypedColumnWriterImpl::CheckDictionarySizeLimit() { - // We have to dynamic cast here because TypedEncoder as some compilers - // don't want to cast through virtual inheritance - auto dict_encoder = dynamic_cast*>(current_encoder_.get()); - if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { - WriteDictionaryPage(); - // Serialize the buffered Dictionary Indicies - FlushBufferedDataPages(); - fallback_ = true; - // Only PLAIN encoding is supported for fallback in V1 - current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, - properties_->memory_pool()); - encoding_ = Encoding::PLAIN; - } -} + WriteRepetitionLevels(num_values, rep_levels); + } else { + // Each value is exactly one row + rows_written_ += static_cast(num_values); + } + return values_to_write; + } + + void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels, + const int16_t* rep_levels, int64_t* out_values_to_write, + int64_t* out_spaced_values_to_write) { + int64_t values_to_write = 0; + int64_t spaced_values_to_write = 0; + // If the field is required and non-repeated, there are no definition levels + if (descr_->max_definition_level() > 0) { + // Minimal definition level for which spaced values are written + int16_t min_spaced_def_level = descr_->max_definition_level(); + if (descr_->schema_node()->is_optional()) { + min_spaced_def_level--; + } + for (int64_t i = 0; i < num_levels; ++i) { + if (def_levels[i] == descr_->max_definition_level()) { + ++values_to_write; + } + if (def_levels[i] >= min_spaced_def_level) { + ++spaced_values_to_write; + } + } -// ---------------------------------------------------------------------- -// Instantiate templated classes + WriteDefinitionLevels(num_levels, def_levels); + } else { + // Required field, write all values + values_to_write = num_levels; + spaced_values_to_write = num_levels; + } -template -int64_t TypedColumnWriterImpl::WriteMiniBatch(int64_t num_values, - const int16_t* def_levels, - const int16_t* rep_levels, - const T* values) { - int64_t values_to_write = 0; - // If the field is required and non-repeated, there are no definition levels - if (descr_->max_definition_level() > 0) { - for (int64_t i = 0; i < num_values; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_write; + // Not present for non-repeated fields + if (descr_->max_repetition_level() > 0) { + // A row could include more than one value + // Count the occasions where we start a new row + for (int64_t i = 0; i < num_levels; ++i) { + if (rep_levels[i] == 0) { + rows_written_++; + } } + + WriteRepetitionLevels(num_levels, rep_levels); + } else { + // Each value is exactly one row + rows_written_ += static_cast(num_levels); } - WriteDefinitionLevels(num_values, def_levels); - } else { - // Required field, write all values - values_to_write = num_values; + *out_values_to_write = values_to_write; + *out_spaced_values_to_write = spaced_values_to_write; } - // Not present for non-repeated fields - if (descr_->max_repetition_level() > 0) { - // A row could include more 
than one value - // Count the occasions where we start a new row - for (int64_t i = 0; i < num_values; ++i) { - if (rep_levels[i] == 0) { - rows_written_++; - } - } + void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) { + num_buffered_values_ += num_levels; + num_buffered_encoded_values_ += num_values; - WriteRepetitionLevels(num_values, rep_levels); - } else { - // Each value is exactly one row - rows_written_ += static_cast(num_values); + if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { + AddDataPage(); + } } - // PARQUET-780 - if (values_to_write > 0) { - DCHECK(nullptr != values) << "Values ptr cannot be NULL"; + void FallbackToPlainEncoding() { + if (IsDictionaryEncoding(current_encoder_->encoding())) { + WriteDictionaryPage(); + // Serialize the buffered Dictionary Indicies + FlushBufferedDataPages(); + fallback_ = true; + // Only PLAIN encoding is supported for fallback in V1 + current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, + properties_->memory_pool()); + encoding_ = Encoding::PLAIN; + } } - WriteValues(values_to_write, values); + // Checks if the Dictionary Page size limit is reached + // If the limit is reached, the Dictionary and Data Pages are serialized + // The encoding is switched to PLAIN + // + // Only one Dictionary Page is written. + // Fallback to PLAIN if dictionary page limit is reached. + void CheckDictionarySizeLimit() { + if (!has_dictionary_ || fallback_) { + // Either not using dictionary encoding, or we have already fallen back + // to PLAIN encoding because the size threshold was reached + return; + } - if (page_statistics_ != nullptr) { - page_statistics_->Update(values, values_to_write, num_values - values_to_write); + // We have to dynamic cast here because TypedEncoder as some compilers + // don't want to cast through virtual inheritance + auto dict_encoder = dynamic_cast*>(current_encoder_.get()); + if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { + FallbackToPlainEncoding(); + } } - num_buffered_values_ += num_values; - num_buffered_encoded_values_ += values_to_write; - - if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { - AddDataPage(); - } - if (has_dictionary_ && !fallback_) { - CheckDictionarySizeLimit(); + void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) { + dynamic_cast(current_encoder_.get()) + ->Put(values, static_cast(num_values)); + if (page_statistics_ != nullptr) { + page_statistics_->Update(values, num_values, num_nulls); + } } - return values_to_write; -} - -template -int64_t TypedColumnWriterImpl::WriteMiniBatchSpaced( - int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, - const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values, - int64_t* num_spaced_written) { - int64_t values_to_write = 0; - int64_t spaced_values_to_write = 0; - // If the field is required and non-repeated, there are no definition levels - if (descr_->max_definition_level() > 0) { - // Minimal definition level for which spaced values are written - int16_t min_spaced_def_level = descr_->max_definition_level(); + void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values, + const uint8_t* valid_bits, int64_t valid_bits_offset) { if (descr_->schema_node()->is_optional()) { - min_spaced_def_level--; - } - for (int64_t i = 0; i < num_levels; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_write; - } - 
if (def_levels[i] >= min_spaced_def_level) { - ++spaced_values_to_write; - } + dynamic_cast(current_encoder_.get()) + ->PutSpaced(values, static_cast(num_spaced_values), valid_bits, + valid_bits_offset); + } else { + dynamic_cast(current_encoder_.get()) + ->Put(values, static_cast(num_values)); } - - WriteDefinitionLevels(num_levels, def_levels); - } else { - // Required field, write all values - values_to_write = num_levels; - spaced_values_to_write = num_levels; - } - - // Not present for non-repeated fields - if (descr_->max_repetition_level() > 0) { - // A row could include more than one value - // Count the occasions where we start a new row - for (int64_t i = 0; i < num_levels; ++i) { - if (rep_levels[i] == 0) { - rows_written_++; - } + if (page_statistics_ != nullptr) { + const int64_t num_nulls = num_spaced_values - num_values; + page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values, + num_nulls); } - - WriteRepetitionLevels(num_levels, rep_levels); - } else { - // Each value is exactly one row - rows_written_ += static_cast(num_levels); } +}; - if (descr_->schema_node()->is_optional()) { - WriteValuesSpaced(spaced_values_to_write, valid_bits, valid_bits_offset, values); - } else { - WriteValues(values_to_write, values); - } - *num_spaced_written = spaced_values_to_write; +template +Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + // If this is the first time writing a DictionaryArray, then there's + // a few possible paths to take: + // + // - If dictionary encoding is not enabled, convert to densely + // encoded and call WriteArrow + // - Dictionary encoding enabled + // - If this is the first time this is called, then we call + // PutDictionary into the encoder and then PutIndices on each + // chunk. We store the dictionary that was written in + // preserved_dictionary_ so that subsequent calls to this method + // can make sure the dictionary has not changed + // - On subsequent calls, we have to check whether the dictionary + // has changed. If it has, then we trigger the varying + // dictionary path and materialize each chunk and then call + // WriteArrow with that + auto WriteDense = [&] { + std::shared_ptr dense_array; + RETURN_NOT_OK( + ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array)); + return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx); + }; - if (page_statistics_ != nullptr) { - page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, values_to_write, - spaced_values_to_write - values_to_write); + if (!IsDictionaryEncoding(current_encoder_->encoding()) || + !DictionaryDirectWriteSupported(array)) { + // No longer dictionary-encoding for whatever reason, maybe we never were + // or we decided to stop. Note that WriteArrow can be invoked multiple + // times with both dense and dictionary-encoded versions of the same data + // without a problem. 
Any dense data will be hashed to indices until the + // dictionary page limit is reached, at which everything (dictionary and + // dense) will fall back to plain encoding + return WriteDense(); } - num_buffered_values_ += num_levels; - num_buffered_encoded_values_ += values_to_write; + auto dict_encoder = dynamic_cast*>(current_encoder_.get()); + const auto& data = checked_cast(array); + std::shared_ptr dictionary = data.dictionary(); + std::shared_ptr indices = data.indices(); - if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { - AddDataPage(); - } - if (has_dictionary_ && !fallback_) { - CheckDictionarySizeLimit(); - } + int64_t value_offset = 0; + auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + dict_encoder->PutIndices(*indices->Slice(value_offset, batch_num_spaced_values)); + CommitWriteAndCheckPageLimit(batch_size, batch_num_values); + value_offset += batch_num_spaced_values; + }; - return values_to_write; -} + // Handle seeing dictionary for the first time + if (!preserved_dictionary_) { + // It's a new dictionary. Call PutDictionary and keep track of it + PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary)); -template -void TypedColumnWriterImpl::WriteBatch(int64_t num_values, - const int16_t* def_levels, - const int16_t* rep_levels, - const T* values) { - // We check for DataPage limits only after we have inserted the values. If a user - // writes a large number of values, the DataPage size can be much above the limit. - // The purpose of this chunking is to bound this. Even if a user writes large number - // of values, the chunking will ensure the AddDataPage() is called at a reasonable - // pagesize limit - int64_t write_batch_size = properties_->write_batch_size(); - int num_batches = static_cast(num_values / write_batch_size); - int64_t num_remaining = num_values % write_batch_size; - int64_t value_offset = 0; - for (int round = 0; round < num_batches; round++) { - int64_t offset = round * write_batch_size; - int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset], - &rep_levels[offset], &values[value_offset]); - value_offset += num_values; + // TODO(wesm): If some dictionary values are unobserved, then the + // statistics will be inaccurate. Do we care enough to fix it? + if (page_statistics_ != nullptr) { + PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary)); + } + preserved_dictionary_ = dictionary; + } else if (!dictionary->Equals(*preserved_dictionary_)) { + // Dictionary has changed + PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding()); + return WriteDense(); } - // Write the remaining values - int64_t offset = num_batches * write_batch_size; - WriteMiniBatch(num_remaining, &def_levels[offset], &rep_levels[offset], - &values[value_offset]); -} -template -void TypedColumnWriterImpl::WriteBatchSpaced( - int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, - const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values) { - // We check for DataPage limits only after we have inserted the values. If a user - // writes a large number of values, the DataPage size can be much above the limit. - // The purpose of this chunking is to bound this. 
Even if a user writes large number - // of values, the chunking will ensure the AddDataPage() is called at a reasonable - // pagesize limit - int64_t write_batch_size = properties_->write_batch_size(); - int num_batches = static_cast(num_values / write_batch_size); - int64_t num_remaining = num_values % write_batch_size; - int64_t num_spaced_written = 0; - int64_t values_offset = 0; - for (int round = 0; round < num_batches; round++) { - int64_t offset = round * write_batch_size; - WriteMiniBatchSpaced(write_batch_size, &def_levels[offset], &rep_levels[offset], - valid_bits, valid_bits_offset + values_offset, - values + values_offset, &num_spaced_written); - values_offset += num_spaced_written; - } - // Write the remaining values - int64_t offset = num_batches * write_batch_size; - WriteMiniBatchSpaced(num_remaining, &def_levels[offset], &rep_levels[offset], - valid_bits, valid_bits_offset + values_offset, - values + values_offset, &num_spaced_written); + PARQUET_CATCH_NOT_OK( + DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk)); + return Status::OK(); } // ---------------------------------------------------------------------- @@ -964,7 +1069,7 @@ void TypedColumnWriterImpl::WriteBatchSpaced( template struct SerializeFunctor { using ArrowCType = typename ArrowType::c_type; - using ArrayType = typename ::arrow::TypeTraits::ArrayType; + using ArrayType = typename arrow::TypeTraits::ArrayType; using ParquetCType = typename ParquetType::c_type; Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) { const ArrowCType* input = array.raw_values(); @@ -980,15 +1085,15 @@ struct SerializeFunctor { }; template -inline Status SerializeData(const ::arrow::Array& array, ArrowWriteContext* ctx, +inline Status SerializeData(const arrow::Array& array, ArrowWriteContext* ctx, typename ParquetType::c_type* out) { - using ArrayType = typename ::arrow::TypeTraits::ArrayType; + using ArrayType = typename arrow::TypeTraits::ArrayType; SerializeFunctor functor; return functor.Serialize(checked_cast(array), ctx, out); } template -Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels, +Status WriteArrowSerialize(const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { @@ -1013,12 +1118,12 @@ Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels, } template -Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, +Status WriteArrowZeroCopy(const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { using T = typename ParquetType::c_type; - const auto& data = static_cast(array); + const auto& data = static_cast(array); const T* values = nullptr; // The values buffer may be null if the array is empty (ARROW-2744) if (data.values() != nullptr) { @@ -1036,13 +1141,13 @@ Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, return Status::OK(); } -#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \ - case ::arrow::Type::ArrowEnum: \ - return WriteArrowSerialize( \ +#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \ + case arrow::Type::ArrowEnum: \ + return WriteArrowSerialize( \ array, num_levels, def_levels, rep_levels, ctx, this); #define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \ - case ::arrow::Type::ArrowEnum: \ + case arrow::Type::ArrowEnum: \ return 
WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, \ ctx, this); @@ -1056,43 +1161,34 @@ Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, // Write Arrow to BooleanType template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::BOOL) { - ARROW_UNSUPPORTED(); - } - bool* buffer = nullptr; - RETURN_NOT_OK(ctx->GetScratchData(array.length(), &buffer)); - - const auto& data = static_cast(array); - const uint8_t* values = nullptr; - // The values buffer may be null if the array is empty (ARROW-2744) - if (data.values() != nullptr) { - values = reinterpret_cast(data.values()->data()); - } else { - DCHECK_EQ(data.length(), 0); +struct SerializeFunctor { + Status Serialize(const arrow::BooleanArray& data, ArrowWriteContext*, bool* out) { + for (int i = 0; i < data.length(); i++) { + *out++ = data.Value(i); + } + return Status::OK(); } +}; - int buffer_idx = 0; - int64_t offset = array.offset(); - for (int i = 0; i < data.length(); i++) { - if (data.IsValid(i)) { - buffer[buffer_idx++] = BitUtil::GetBit(values, offset + i); - } +template <> +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::BOOL) { + ARROW_UNSUPPORTED(); } - PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, buffer)); - return Status::OK(); + return WriteArrowSerialize( + array, num_levels, def_levels, rep_levels, ctx, this); } // ---------------------------------------------------------------------- // Write Arrow types to INT32 template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) { +struct SerializeFunctor { + Status Serialize(const arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) { const int64_t* input = array.raw_values(); for (int i = 0; i < array.length(); i++) { *out++ = static_cast(*input++ / 86400000); @@ -1102,11 +1198,11 @@ struct SerializeFunctor { }; template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) { +struct SerializeFunctor { + Status Serialize(const arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) { const int32_t* input = array.raw_values(); - const auto& type = static_cast(*array.type()); - if (type.unit() == ::arrow::TimeUnit::SECOND) { + const auto& type = static_cast(*array.type()); + if (type.unit() == arrow::TimeUnit::SECOND) { for (int i = 0; i < array.length(); i++) { out[i] = input[i] * 1000; } @@ -1118,13 +1214,13 @@ struct SerializeFunctor { }; template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { - case ::arrow::Type::NA: { + case arrow::Type::NA: { PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr)); } break; WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type) @@ -1149,21 +1245,21 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, for (int64_t i = 0; i < 
array.length(); i++) ConversionFunction(input[i], &out[i]); template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) { +struct SerializeFunctor { + Status Serialize(const arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) { const int64_t* input = array.raw_values(); - const auto& type = static_cast(*array.type()); + const auto& type = static_cast(*array.type()); switch (type.unit()) { - case ::arrow::TimeUnit::NANO: + case arrow::TimeUnit::NANO: INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::MICRO: + case arrow::TimeUnit::MICRO: INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::MILLI: + case arrow::TimeUnit::MILLI: INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::SECOND: + case arrow::TimeUnit::SECOND: INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp); break; } @@ -1198,15 +1294,15 @@ static std::pair kTimestampCoercionFactors[4][4] = { {COERCE_MULTIPLY, 1}}}; template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx, +struct SerializeFunctor { + Status Serialize(const arrow::TimestampArray& array, ArrowWriteContext* ctx, int64_t* out) { - const auto& source_type = static_cast(*array.type()); + const auto& source_type = static_cast(*array.type()); auto source_unit = source_type.unit(); const int64_t* values = array.raw_values(); - ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit(); - auto target_type = ::arrow::timestamp(target_unit); + arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit(); + auto target_type = arrow::timestamp(target_unit); bool truncation_allowed = ctx->properties->truncated_timestamps_allowed(); auto DivideBy = [&](const int64_t factor) { @@ -1242,15 +1338,15 @@ struct SerializeFunctor { #undef COERCE_INVALID #undef COERCE_MULTIPLY -Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, +Status WriteTimestamps(const arrow::Array& values, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { - const auto& source_type = static_cast(*values.type()); + const auto& source_type = static_cast(*values.type()); auto WriteCoerce = [&](const ArrowWriterProperties* properties) { ArrowWriteContext temp_ctx = *ctx; temp_ctx.properties = properties; - return WriteArrowSerialize( + return WriteArrowSerialize( values, num_levels, def_levels, rep_levels, &temp_ctx, writer); }; @@ -1264,21 +1360,21 @@ Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, return WriteCoerce(ctx->properties); } } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 && - source_type.unit() == ::arrow::TimeUnit::NANO) { + source_type.unit() == arrow::TimeUnit::NANO) { // Absent superseding user instructions, when writing Parquet version 1.0 files, // timestamps in nanoseconds are coerced to microseconds std::shared_ptr properties = (ArrowWriterProperties::Builder()) - .coerce_timestamps(::arrow::TimeUnit::MICRO) + .coerce_timestamps(arrow::TimeUnit::MICRO) ->disallow_truncated_timestamps() ->build(); return WriteCoerce(properties.get()); - } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) { + } else if (source_type.unit() == arrow::TimeUnit::SECOND) { // Absent superseding user instructions, timestamps in seconds are coerced to // milliseconds 
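Stripped of the builder plumbing, the factor-table-driven coercion above is just a unit-factor multiply or divide, with an optional refusal to truncate when dividing; an illustrative reduction (names and error type are placeholders, not the patch's DivideBy lambda):

#include <cstdint>
#include <sstream>
#include <stdexcept>

// Illustration only: coarser target units divide (e.g. NANO -> MICRO uses
// factor 1000) and may reject values that would truncate; finer units multiply.
int64_t CoerceTimestampValue(int64_t value, int64_t factor, bool divide,
                             bool truncation_allowed) {
  if (divide) {
    if (!truncation_allowed && value % factor != 0) {
      std::ostringstream msg;
      msg << "Casting would lose data: " << value;
      throw std::runtime_error(msg.str());
    }
    return value / factor;
  }
  return value * factor;
}
// CoerceTimestampValue(1600000000, 1000, true, false) == 1600000, while
// CoerceTimestampValue(1600000001, 1000, true, false) throws.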
std::shared_ptr properties = (ArrowWriterProperties::Builder()) - .coerce_timestamps(::arrow::TimeUnit::MILLI) + .coerce_timestamps(arrow::TimeUnit::MILLI) ->build(); return WriteCoerce(properties.get()); } else { @@ -1289,13 +1385,13 @@ Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { - case ::arrow::Type::TIMESTAMP: + case arrow::Type::TIMESTAMP: return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this); WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type) WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type) @@ -1307,15 +1403,15 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::TIMESTAMP) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::TIMESTAMP) { ARROW_UNSUPPORTED(); } - return WriteArrowSerialize( + return WriteArrowSerialize( array, num_levels, def_levels, rep_levels, ctx, this); } @@ -1323,12 +1419,12 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // Floating point types template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::FLOAT) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::FLOAT) { ARROW_UNSUPPORTED(); } return WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, ctx, @@ -1336,12 +1432,12 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::DOUBLE) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::DOUBLE) { ARROW_UNSUPPORTED(); } return WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, ctx, @@ -1351,51 +1447,37 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // ---------------------------------------------------------------------- // Write Arrow to BYTE_ARRAY -template -struct SerializeFunctor> { - Status Serialize(const ::arrow::BinaryArray& array, ArrowWriteContext*, - ByteArray* out) { - // In the case of an array consisting of only empty strings or all null, - // array.data() points already to a nullptr, thus array.data()->data() will - // segfault. 
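On the BYTE_ARRAY side that this section now turns to, the offset/raw-pointer bookkeeping (and the null-buffer guard described in that comment) can go away because each element is handed to the encoder as a (pointer, length) view; schematically:

#include <cstdint>
#include "arrow/array.h"             // arrow::BinaryArray
#include "arrow/util/string_view.h"  // arrow::util::string_view
#include "parquet/types.h"           // parquet::ByteArray

// Schematic only: how a BinaryArray element maps onto the ByteArray value type
// the encoders consume; GetView() already accounts for the slice offset, so no
// raw value_data()/offsets handling is needed.
parquet::ByteArray ViewAsByteArray(const arrow::BinaryArray& array, int64_t i) {
  arrow::util::string_view v = array.GetView(i);
  return parquet::ByteArray(static_cast<uint32_t>(v.size()),
                            reinterpret_cast<const uint8_t*>(v.data()));
}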
- const uint8_t* values = nullptr; - if (array.value_data()) { - values = reinterpret_cast(array.value_data()->data()); - DCHECK(values != nullptr); - } +template <> +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type()->id() != arrow::Type::BINARY && + array.type()->id() != arrow::Type::STRING) { + ARROW_UNSUPPORTED(); + } - // Slice offset is accounted for in raw_value_offsets - const int32_t* value_offset = array.raw_value_offsets(); - if (array.null_count() == 0) { - // no nulls, just dump the data - for (int64_t i = 0; i < array.length(); i++) { - out[i] = - ByteArray(value_offset[i + 1] - value_offset[i], values + value_offset[i]); - } - } else { - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsValid(i)) { - out[i] = - ByteArray(value_offset[i + 1] - value_offset[i], values + value_offset[i]); - } - } + int64_t value_offset = 0; + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + std::shared_ptr data_slice = + array.Slice(value_offset, batch_num_spaced_values); + current_encoder_->Put(*data_slice); + if (page_statistics_ != nullptr) { + page_statistics_->Update(*data_slice); } - return Status::OK(); - } -}; + CommitWriteAndCheckPageLimit(batch_size, batch_num_values); + CheckDictionarySizeLimit(); + value_offset += batch_num_spaced_values; + }; -template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - switch (array.type()->id()) { - WRITE_SERIALIZE_CASE(BINARY, BinaryType, ByteArrayType) - WRITE_SERIALIZE_CASE(STRING, BinaryType, ByteArrayType) - default: - ARROW_UNSUPPORTED(); - } + PARQUET_CATCH_NOT_OK( + DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk)); + return Status::OK(); } // ---------------------------------------------------------------------- @@ -1403,8 +1485,8 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level template struct SerializeFunctor> { - Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*, + arrow::enable_if_fixed_size_binary> { + Status Serialize(const arrow::FixedSizeBinaryArray& array, ArrowWriteContext*, FLBA* out) { if (array.null_count() == 0) { // no nulls, just dump the data @@ -1424,17 +1506,17 @@ struct SerializeFunctor -Status WriteArrowSerialize( - const ::arrow::Array& array, int64_t num_levels, const int16_t* def_levels, +Status WriteArrowSerialize( + const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { - const auto& data = static_cast(array); + const auto& data = static_cast(array); const int64_t length = data.length(); FLBA* buffer; RETURN_NOT_OK(ctx->GetScratchData(num_levels, &buffer)); - const auto& decimal_type = static_cast(*data.type()); + const auto& decimal_type = static_cast(*data.type()); const int32_t offset = decimal_type.byte_width() - internal::DecimalSize(decimal_type.precision()); @@ -1452,8 +1534,8 @@ Status WriteArrowSerialize( // todo(advancedxy): use a writeBatch to avoid this step for (int64_t i = 0, j = 0; i < length; ++i, j += 2) { auto unsigned_64_bit = 
reinterpret_cast(data.GetValue(i)); - big_endian_values[j] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); - big_endian_values[j + 1] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); + big_endian_values[j] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); + big_endian_values[j + 1] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); buffer[i] = FixedLenByteArray( reinterpret_cast(&big_endian_values[j]) + offset); } @@ -1461,8 +1543,8 @@ Status WriteArrowSerialize( for (int64_t i = 0, buffer_idx = 0, j = 0; i < length; ++i) { if (data.IsValid(i)) { auto unsigned_64_bit = reinterpret_cast(data.GetValue(i)); - big_endian_values[j] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); - big_endian_values[j + 1] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); + big_endian_values[j] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); + big_endian_values[j + 1] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); buffer[buffer_idx++] = FixedLenByteArray( reinterpret_cast(&big_endian_values[j]) + offset); j += 2; @@ -1474,11 +1556,11 @@ Status WriteArrowSerialize( } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType) WRITE_SERIALIZE_CASE(DECIMAL, Decimal128Type, FLBAType) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 9497534c823..ccd456afce2 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -54,7 +54,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) { int nbytes = static_cast(BitUtil::BytesForBits(nvalues)); std::vector draws; - ::arrow::random_is_valid(nvalues, 0.5 /* null prob */, &draws, 0 /* seed */); + arrow::random_is_valid(nvalues, 0.5 /* null prob */, &draws, 0 /* seed */); std::unique_ptr encoder = MakeTypedEncoder(Encoding::PLAIN); @@ -75,7 +75,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) { ASSERT_EQ(nvalues, values_decoded); for (int i = 0; i < nvalues; ++i) { - ASSERT_EQ(draws[i], ::arrow::BitUtil::GetBit(decode_data, i)) << i; + ASSERT_EQ(draws[i], arrow::BitUtil::GetBit(decode_data, i)) << i; } } @@ -260,7 +260,7 @@ class TestDictionaryEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void CheckRoundtrip() { - std::vector valid_bits(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + std::vector valid_bits(arrow::BitUtil::BytesForBits(num_values_) + 1, 255); auto base_encoder = MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get()); auto encoder = @@ -327,8 +327,8 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { class TestArrowBuilderDecoding : public ::testing::Test { public: - using DenseBuilder = ::arrow::internal::ChunkedBinaryBuilder; - using DictBuilder = ::arrow::BinaryDictionary32Builder; + using DenseBuilder = arrow::internal::ChunkedBinaryBuilder; + using DictBuilder = arrow::BinaryDictionary32Builder; void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; } void TearDown() override {} @@ -343,7 +343,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { constexpr int repeat = 100; constexpr int64_t min_length = 2; constexpr int64_t max_length = 10; - ::arrow::random::RandomArrayGenerator rag(0); + 
arrow::random::RandomArrayGenerator rag(0); expected_dense_ = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, max_length, null_probability); @@ -356,7 +356,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { ASSERT_OK(builder->Finish(&expected_dict_)); // Initialize input_data_ for the encoder from the expected_array_ values - const auto& binary_array = static_cast(*expected_dense_); + const auto& binary_array = static_cast(*expected_dense_); input_data_.resize(binary_array.length()); for (int64_t i = 0; i < binary_array.length(); ++i) { @@ -382,8 +382,8 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDense(int actual_num_values, Builder& builder) { - ASSERT_EQ(actual_num_values, num_values_); - ::arrow::ArrayVector actual_vec; + ASSERT_EQ(actual_num_values, num_values_ - null_count_); + arrow::ArrayVector actual_vec; ASSERT_OK(builder.Finish(&actual_vec)); ASSERT_EQ(actual_vec.size(), 1); ASSERT_ARRAYS_EQUAL(*actual_vec[0], *expected_dense_); @@ -391,8 +391,8 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDict(int actual_num_values, Builder& builder) { - ASSERT_EQ(actual_num_values, num_values_); - std::shared_ptr<::arrow::Array> actual; + ASSERT_EQ(actual_num_values, num_values_ - null_count_); + std::shared_ptr actual; ASSERT_OK(builder.Finish(&actual)); ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); } @@ -439,8 +439,8 @@ class TestArrowBuilderDecoding : public ::testing::Test { protected: std::vector null_probabilities_; - std::shared_ptr<::arrow::Array> expected_dict_; - std::shared_ptr<::arrow::Array> expected_dense_; + std::shared_ptr expected_dict_; + std::shared_ptr expected_dense_; int num_values_; int null_count_; std::vector input_data_; @@ -480,6 +480,143 @@ TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } +TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 + + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const double null_probability = 0.25; + + auto CheckSeed = [&](int seed) { + arrow::random::RandomArrayGenerator rag(seed); + auto values = rag.String(size, min_length, max_length, null_probability); + + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + + arrow::StringBuilder builder; + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), + values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + ASSERT_EQ(50, result->length()); + arrow::AssertArraysEqual(*values, *result); + + // Type checked + auto i32_values = rag.Int32(size, 0, 10, null_probability); + ASSERT_THROW(encoder->Put(*i32_values), ParquetException); + }; + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + CheckSeed(seed); + } +} + +void GetBinaryDictDecoder(DictEncoder* encoder, int64_t num_values, + std::shared_ptr* out_values, + std::shared_ptr* out_dict, + std::unique_ptr* out_decoder) { + auto decoder = MakeDictDecoder(); + auto buf = encoder->FlushValues(); + auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); + 
encoder->WriteDict(dict_buf->mutable_data()); + + auto dict_decoder = MakeTypedDecoder(Encoding::PLAIN); + dict_decoder->SetData(encoder->num_entries(), dict_buf->data(), + static_cast(dict_buf->size())); + + decoder->SetData(static_cast(num_values), buf->data(), + static_cast(buf->size())); + decoder->SetDict(dict_decoder.get()); + + *out_values = buf; + *out_dict = dict_buf; + *out_decoder = std::unique_ptr( + dynamic_cast(decoder.release())); +} + +TEST(DictEncodingAdHoc, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 + const int64_t size = 50; + const int64_t min_length = 0; + const int64_t max_length = 10; + const double null_probability = 0.1; + arrow::random::RandomArrayGenerator rag(0); + auto values = rag.String(size, min_length, max_length, null_probability); + + auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, + /*use_dictionary=*/true); + + auto encoder = dynamic_cast*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->Put(*values)); + + std::unique_ptr decoder; + std::shared_ptr buf, dict_buf; + int num_values = static_cast(values->length() - values->null_count()); + GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); + + arrow::StringBuilder builder; + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + arrow::AssertArraysEqual(*values, *result); +} + +TEST(DictEncodingAdHoc, PutDictionaryPutIndices) { + // Part of ARROW-3246 + auto dict_values = arrow::ArrayFromJSON(arrow::binary(), "[\"foo\", \"bar\", \"baz\"]"); + auto indices = arrow::ArrayFromJSON(arrow::int32(), "[0, 1, 2]"); + auto indices_nulls = arrow::ArrayFromJSON(arrow::int32(), "[null, 0, 1, null, 2]"); + + auto expected = arrow::ArrayFromJSON(arrow::binary(), + "[\"foo\", \"bar\", \"baz\", null, " + "\"foo\", \"bar\", null, \"baz\"]"); + + auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, + /*use_dictionary=*/true); + auto owned_decoder = MakeDictDecoder(); + + auto encoder = dynamic_cast*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->PutDictionary(*dict_values)); + + // Trying to call PutDictionary again throws + ASSERT_THROW(encoder->PutDictionary(*dict_values), ParquetException); + + ASSERT_NO_THROW(encoder->PutIndices(*indices)); + ASSERT_NO_THROW(encoder->PutIndices(*indices_nulls)); + + std::unique_ptr decoder; + std::shared_ptr buf, dict_buf; + int num_values = static_cast(expected->length() - expected->null_count()); + GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); + + arrow::BinaryBuilder builder; + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(expected->length()), + static_cast(expected->null_count()), + expected->null_bitmap_data(), + expected->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + arrow::AssertArraysEqual(*expected, *result); +} + class DictEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 57d3dd73869..cd4518ebf3f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -46,8 +46,7 @@ constexpr int64_t kInMemoryDefaultCapacity = 1024; class EncoderImpl : virtual public Encoder { public: - EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, - ::arrow::MemoryPool* pool) + EncoderImpl(const ColumnDescriptor* descr, Encoding::type 
encoding, MemoryPool* pool) : descr_(descr), encoding_(encoding), pool_(pool), @@ -55,13 +54,13 @@ class EncoderImpl : virtual public Encoder { Encoding::type encoding() const override { return encoding_; } - ::arrow::MemoryPool* memory_pool() const override { return pool_; } + MemoryPool* memory_pool() const override { return pool_; } protected: // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY const ColumnDescriptor* descr_; const Encoding::type encoding_; - ::arrow::MemoryPool* pool_; + MemoryPool* pool_; /// Type length from descr int type_length_; @@ -75,38 +74,60 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { public: using T = typename DType::c_type; - explicit PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool) + : EncoderImpl(descr, Encoding::PLAIN, pool) { + values_sink_ = CreateOutputStream(pool); + } - int64_t EstimatedDataEncodedSize() override; - std::shared_ptr FlushValues() override; + int64_t EstimatedDataEncodedSize() override { + int64_t position = -1; + PARQUET_THROW_NOT_OK(values_sink_->Tell(&position)); + return position; + } + + std::shared_ptr FlushValues() override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(values_sink_->Finish(&buffer)); + values_sink_ = CreateOutputStream(this->pool_); + return buffer; + } void Put(const T* buffer, int num_values) override; - protected: - std::shared_ptr<::arrow::io::BufferOutputStream> values_sink_; -}; + void Put(const arrow::Array& values) override; -template -PlainEncoder::PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) - : EncoderImpl(descr, Encoding::PLAIN, pool) { - values_sink_ = CreateOutputStream(pool); -} -template -int64_t PlainEncoder::EstimatedDataEncodedSize() { - int64_t position = -1; - PARQUET_THROW_NOT_OK(values_sink_->Tell(&position)); - return position; -} + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); + } -template -std::shared_ptr PlainEncoder::FlushValues() { - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(values_sink_->Finish(&buffer)); - values_sink_ = CreateOutputStream(this->pool_); - return buffer; -} + void Put(const ByteArray& val) { + // Write the result to the output stream + PARQUET_THROW_NOT_OK(values_sink_->Write(reinterpret_cast(&val.len), + sizeof(uint32_t))); + if (val.len > 0) { + DCHECK(nullptr != val.ptr) << "Value ptr cannot be NULL"; + } + PARQUET_THROW_NOT_OK( + values_sink_->Write(reinterpret_cast(val.ptr), val.len)); + } + + protected: + std::shared_ptr values_sink_; +}; template void PlainEncoder::Put(const T* buffer, int num_values) { @@ -117,17 +138,45 @@ void PlainEncoder::Put(const T* buffer, int num_values) { template <> inline void PlainEncoder::Put(const ByteArray* src, int num_values) { for (int i = 0; i < num_values; ++i) { - // Write the result to the output stream - PARQUET_THROW_NOT_OK(values_sink_->Write( - 
reinterpret_cast(&src[i].len), sizeof(uint32_t))); - if (src[i].len > 0) { - DCHECK(nullptr != src[i].ptr) << "Value ptr cannot be NULL"; + Put(src[i]); + } +} + +template +void PlainEncoder::Put(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} + +void AssertBinary(const arrow::Array& values) { + if (values.type_id() != arrow::Type::BINARY && + values.type_id() != arrow::Type::STRING) { + throw ParquetException("Only BinaryArray and subclasses supported"); + } +} + +template +void PutBinaryArray(const arrow::Array& values, EncoderType* encoder) { + AssertBinary(values); + const auto& data = checked_cast(values); + if (data.null_count() == 0) { + // no nulls, just dump the data + for (int64_t i = 0; i < data.length(); i++) { + encoder->Put(ByteArray(data.GetView(i))); + } + } else { + for (int64_t i = 0; i < data.length(); i++) { + if (data.IsValid(i)) { + encoder->Put(ByteArray(data.GetView(i))); + } } - PARQUET_THROW_NOT_OK( - values_sink_->Write(reinterpret_cast(src[i].ptr), src[i].len)); } } +template <> +void PlainEncoder::Put(const arrow::Array& values) { + PutBinaryArray(values, this); +} + template <> inline void PlainEncoder::Put(const FixedLenByteArray* src, int num_values) { for (int i = 0; i < num_values; ++i) { @@ -140,13 +189,6 @@ inline void PlainEncoder::Put(const FixedLenByteArray* src, int num_va } } -class PlainByteArrayEncoder : public PlainEncoder, - virtual public ByteArrayEncoder { - public: - using BASE = PlainEncoder; - using BASE::PlainEncoder; -}; - class PlainFLBAEncoder : public PlainEncoder, virtual public FLBAEncoder { public: using BASE = PlainEncoder; @@ -157,9 +199,8 @@ class PlainBooleanEncoder : public EncoderImpl, virtual public TypedEncoder, virtual public BooleanEncoder { public: - explicit PlainBooleanEncoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + explicit PlainBooleanEncoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()); int64_t EstimatedDataEncodedSize() override; std::shared_ptr FlushValues() override; @@ -167,11 +208,33 @@ class PlainBooleanEncoder : public EncoderImpl, void Put(const bool* src, int num_values) override; void Put(const std::vector& src, int num_values) override; + void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); + } + + void Put(const arrow::Array& values) override { + ParquetException::NYI("Direct Arrow to Boolean writes not implemented"); + } + private: int bits_available_; - std::unique_ptr<::arrow::BitUtil::BitWriter> bit_writer_; + std::unique_ptr bit_writer_; std::shared_ptr bits_buffer_; - std::shared_ptr<::arrow::io::BufferOutputStream> values_sink_; + std::shared_ptr values_sink_; template void PutImpl(const SequenceType& src, int num_values); @@ -217,8 +280,7 @@ void PlainBooleanEncoder::PutImpl(const SequenceType& src, int num_values) { } } -PlainBooleanEncoder::PlainBooleanEncoder(const ColumnDescriptor* descr, - 
::arrow::MemoryPool* pool) +PlainBooleanEncoder::PlainBooleanEncoder(const ColumnDescriptor* descr, MemoryPool* pool) : EncoderImpl(descr, Encoding::PLAIN, pool), bits_available_(kInMemoryDefaultCapacity * 8), bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)) { @@ -262,24 +324,29 @@ void PlainBooleanEncoder::Put(const std::vector& src, int num_values) { template struct DictEncoderTraits { using c_type = typename DType::c_type; - using MemoTableType = ::arrow::internal::ScalarMemoTable; + using MemoTableType = arrow::internal::ScalarMemoTable; }; template <> struct DictEncoderTraits { - using MemoTableType = ::arrow::internal::BinaryMemoTable; + using MemoTableType = arrow::internal::BinaryMemoTable; }; template <> struct DictEncoderTraits { - using MemoTableType = ::arrow::internal::BinaryMemoTable; + using MemoTableType = arrow::internal::BinaryMemoTable; }; -/// See the dictionary encoding section of https://github.com/Parquet/parquet-format. -/// The encoding supports streaming encoding. Values are encoded as they are added while -/// the dictionary is being constructed. At any time, the buffered values can be -/// written out with the current dictionary size. More values can then be added to -/// the encoder, including new dictionary entries. +// Initially 1024 elements +static constexpr int32_t kInitialHashTableSize = 1 << 10; + +/// See the dictionary encoding section of +/// https://github.com/Parquet/parquet-format. The encoding supports +/// streaming encoding. Values are encoded as they are added while the +/// dictionary is being constructed. At any time, the buffered values +/// can be written out with the current dictionary size. More values +/// can then be added to the encoder, including new dictionary +/// entries. template class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { using MemoTableType = typename DictEncoderTraits::MemoTableType; @@ -287,9 +354,10 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { public: typedef typename DType::c_type T; - explicit DictEncoderImpl( - const ColumnDescriptor* desc, - ::arrow::MemoryPool* allocator = ::arrow::default_memory_pool()); + explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool) + : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), + dict_encoded_size_(0), + memo_table_(pool, kInitialHashTableSize) {} ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); } @@ -301,7 +369,7 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { ++buffer; --buffer_len; - ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width()); + arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width()); for (int index : buffered_indices_) { if (!encoder.Put(index)) return -1; } @@ -315,20 +383,96 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { /// Returns a conservative estimate of the number of bytes needed to encode the buffered /// indices. Used to size the buffer passed to WriteIndices(). - int64_t EstimatedDataEncodedSize() override; + int64_t EstimatedDataEncodedSize() override { + // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to + // reserve + // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used + // but not reserving them would cause the encoder to fail. 
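A worked instance of the sizing described in that comment (using the RleEncoder helpers from arrow/util/rle_encoding.h that the code above calls):

#include <cstdint>
#include "arrow/util/rle_encoding.h"

// Worked example, not part of the patch: a 1000-entry dictionary needs a
// bit_width() of 10, so with 4096 buffered indices the encoder reserves
// 1 byte (bit-width header) + MaxBufferSize(10, 4096) + MinBufferSize(10)
// bytes before WriteIndices(); FlushValues() then trims the buffer to the
// bytes actually written.
int64_t EstimatedIndexPageBytes(int bit_width, int num_buffered_indices) {
  return 1 +
         arrow::util::RleEncoder::MaxBufferSize(bit_width, num_buffered_indices) +
         arrow::util::RleEncoder::MinBufferSize(bit_width);
}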
+ return 1 + + arrow::util::RleEncoder::MaxBufferSize( + bit_width(), static_cast(buffered_indices_.size())) + + arrow::util::RleEncoder::MinBufferSize(bit_width()); + } /// The minimum bit width required to encode the currently buffered indices. - int bit_width() const override; + int bit_width() const override { + if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0; + if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1; + return BitUtil::Log2(num_entries()); + } /// Encode value. Note that this does not actually write any data, just /// buffers the value's index to be written later. inline void Put(const T& value); - void Put(const T* values, int num_values) override; - std::shared_ptr FlushValues() override; + void Put(const T* src, int num_values) override { + for (int32_t i = 0; i < num_values; i++) { + Put(src[i]); + } + } void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset) override; + int64_t valid_bits_offset) override { + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + Put(src[i]); + } + valid_bits_reader.Next(); + } + } + + void Put(const arrow::Array& values) override; + void PutDictionary(const arrow::Array& values) override; + + template + void PutIndicesTyped(const arrow::Array& data) { + using ArrayType = typename arrow::TypeTraits::ArrayType; + const auto& indices = checked_cast(data); + auto values = indices.raw_values(); + buffered_indices_.reserve( + buffered_indices_.size() + + static_cast(indices.length() - indices.null_count())); + if (indices.null_count() > 0) { + arrow::internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), + indices.offset(), indices.length()); + for (int64_t i = 0; i < indices.length(); ++i) { + if (valid_bits_reader.IsSet()) { + buffered_indices_.push_back(static_cast(values[i])); + } + valid_bits_reader.Next(); + } + } else { + for (int64_t i = 0; i < indices.length(); ++i) { + buffered_indices_.push_back(static_cast(values[i])); + } + } + } + + void PutIndices(const arrow::Array& data) override { + switch (data.type()->id()) { + case arrow::Type::INT8: + return PutIndicesTyped(data); + case arrow::Type::INT16: + return PutIndicesTyped(data); + case arrow::Type::INT32: + return PutIndicesTyped(data); + case arrow::Type::INT64: + return PutIndicesTyped(data); + default: + throw ParquetException("Dictionary indices were not signed integer"); + } + } + + std::shared_ptr FlushValues() override { + std::shared_ptr buffer = + AllocateBuffer(this->pool_, EstimatedDataEncodedSize()); + int result_size = WriteIndices(buffer->mutable_data(), + static_cast(EstimatedDataEncodedSize())); + PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false)); + return std::move(buffer); + } /// Writes out the encoded dictionary to buffer. buffer must be preallocated to /// dict_encoded_size() bytes. 
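Putting the pieces of the new experimental dictionary interface together, a condensed caller-side sketch (it mirrors the DictEncodingAdHoc.PutDictionaryPutIndices test earlier in this diff; ArrayFromJSON is the Arrow test helper used there):

#include <memory>
#include "arrow/api.h"
#include "arrow/testing/gtest_util.h"  // arrow::ArrayFromJSON (test helper)
#include "parquet/encoding.h"

void PutDictionaryThenIndices() {
  auto dict_values =
      arrow::ArrayFromJSON(arrow::binary(), R"(["foo", "bar", "baz"])");
  auto indices = arrow::ArrayFromJSON(arrow::int32(), "[0, 2, null, 1]");

  auto owned_encoder = parquet::MakeTypedEncoder<parquet::ByteArrayType>(
      parquet::Encoding::PLAIN, /*use_dictionary=*/true);
  auto encoder = dynamic_cast<parquet::DictEncoder<parquet::ByteArrayType>*>(
      owned_encoder.get());

  encoder->PutDictionary(*dict_values);  // only legal while the memo table is empty
  encoder->PutIndices(*indices);         // null slots skipped; valid indices buffered

  // Dictionary page payload:
  auto dict_buffer = parquet::AllocateBuffer(arrow::default_memory_pool(),
                                             encoder->dict_encoded_size());
  encoder->WriteDict(dict_buffer->mutable_data());
  // RLE/bit-packed index payload for the data page:
  std::shared_ptr<arrow::Buffer> index_buffer = encoder->FlushValues();
}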
@@ -350,66 +494,6 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { MemoTableType memo_table_; }; -// Initially 1024 elements -static constexpr int32_t INITIAL_HASH_TABLE_SIZE = 1 << 10; - -template -DictEncoderImpl::DictEncoderImpl(const ColumnDescriptor* desc, - ::arrow::MemoryPool* pool) - : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), - dict_encoded_size_(0), - memo_table_(pool, INITIAL_HASH_TABLE_SIZE) {} - -template -int64_t DictEncoderImpl::EstimatedDataEncodedSize() { - // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to - // reserve - // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used - // but not reserving them would cause the encoder to fail. - return 1 + - ::arrow::util::RleEncoder::MaxBufferSize( - bit_width(), static_cast(buffered_indices_.size())) + - ::arrow::util::RleEncoder::MinBufferSize(bit_width()); -} - -template -int DictEncoderImpl::bit_width() const { - if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0; - if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1; - return BitUtil::Log2(num_entries()); -} - -template -std::shared_ptr DictEncoderImpl::FlushValues() { - std::shared_ptr buffer = - AllocateBuffer(this->pool_, EstimatedDataEncodedSize()); - int result_size = - WriteIndices(buffer->mutable_data(), static_cast(EstimatedDataEncodedSize())); - PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false)); - return std::move(buffer); -} - -template -void DictEncoderImpl::Put(const T* src, int num_values) { - for (int32_t i = 0; i < num_values; i++) { - Put(src[i]); - } -} - -template -void DictEncoderImpl::PutSpaced(const T* src, int num_values, - const uint8_t* valid_bits, - int64_t valid_bits_offset) { - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); - for (int32_t i = 0; i < num_values; i++) { - if (valid_bits_reader.IsSet()) { - Put(src[i]); - } - valid_bits_reader.Next(); - } -} - template void DictEncoderImpl::WriteDict(uint8_t* buffer) { // For primitive types, only a memcpy @@ -420,7 +504,7 @@ void DictEncoderImpl::WriteDict(uint8_t* buffer) { // ByteArray and FLBA already have the dictionary encoded in their data heaps template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) { - memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) { + memo_table_.VisitValues(0, [&buffer](const arrow::util::string_view& v) { uint32_t len = static_cast(v.length()); memcpy(buffer, &len, sizeof(len)); buffer += sizeof(len); @@ -431,7 +515,7 @@ void DictEncoderImpl::WriteDict(uint8_t* buffer) { template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) { - memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) { + memo_table_.VisitValues(0, [&](const arrow::util::string_view& v) { DCHECK_EQ(v.length(), static_cast(type_length_)); memcpy(buffer, v.data(), type_length_); buffer += type_length_; @@ -479,25 +563,48 @@ inline void DictEncoderImpl::Put(const FixedLenByteArray& v) { buffered_indices_.push_back(memo_index); } -class DictByteArrayEncoder : public DictEncoderImpl, - virtual public ByteArrayEncoder { - public: - using BASE = DictEncoderImpl; - using BASE::DictEncoderImpl; -}; +template +void DictEncoderImpl::Put(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} -class DictFLBAEncoder : public DictEncoderImpl, virtual public FLBAEncoder { - public: - using BASE = DictEncoderImpl; - using BASE::DictEncoderImpl; -}; +template <> +void DictEncoderImpl::Put(const 
arrow::Array& values) { + PutBinaryArray(values, this); +} + +template +void DictEncoderImpl::PutDictionary(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} + +template <> +void DictEncoderImpl::PutDictionary(const arrow::Array& values) { + AssertBinary(values); + if (this->num_entries() > 0) { + throw ParquetException("Can only call PutDictionary on an empty DictEncoder"); + } + + const auto& data = checked_cast(values); + if (data.null_count() > 0) { + throw ParquetException("Inserted binary dictionary cannot contain nulls"); + } + for (int64_t i = 0; i < data.length(); i++) { + auto v = data.GetView(i); + dict_encoded_size_ += static_cast(v.size() + sizeof(uint32_t)); + ARROW_IGNORE_EXPR( + memo_table_.GetOrInsert(v.data(), static_cast(v.size()), + /*on_found=*/[](int32_t memo_index) {}, + /*on_not_found=*/[](int32_t memo_index) {})); + } +} // ---------------------------------------------------------------------- // Encoder and decoder factory functions std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encoding, bool use_dictionary, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + MemoryPool* pool) { if (use_dictionary) { switch (type_num) { case Type::INT32: @@ -511,9 +618,9 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::unique_ptr(new DictEncoderImpl(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new DictByteArrayEncoder(descr, pool)); + return std::unique_ptr(new DictEncoderImpl(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: - return std::unique_ptr(new DictFLBAEncoder(descr, pool)); + return std::unique_ptr(new DictEncoderImpl(descr, pool)); default: DCHECK(false) << "Encoder not implemented"; break; @@ -533,9 +640,9 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::unique_ptr(new PlainEncoder(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new PlainByteArrayEncoder(descr, pool)); + return std::unique_ptr(new PlainEncoder(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: - return std::unique_ptr(new PlainFLBAEncoder(descr, pool)); + return std::unique_ptr(new PlainEncoder(descr, pool)); default: DCHECK(false) << "Encoder not implemented"; break; @@ -665,7 +772,7 @@ class PlainBooleanDecoder : public DecoderImpl, int Decode(bool* buffer, int max_values) override; private: - std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_; + std::unique_ptr bit_reader_; }; PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr) @@ -679,7 +786,7 @@ void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) { max_values = std::min(max_values, num_values_); bool val; - ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); + arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); for (int i = 0; i < max_values; ++i) { if (!bit_reader_->GetValue(1, &val)) { ParquetException::EofException(); @@ -712,7 +819,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -721,7 +828,15 @@ class PlainByteArrayDecoder : public
PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, arrow::BinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -729,14 +844,14 @@ class PlainByteArrayDecoder : public PlainDecoder, } int DecodeArrowNonNull(int num_values, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; } int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; @@ -744,17 +859,17 @@ class PlainByteArrayDecoder : public PlainDecoder, private: template - ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* values_decoded) { - num_values = std::min(num_values, num_values_); + arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_values_decoded) { RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); int increment; int i = 0; const uint8_t* data = data_; int64_t data_size = len_; int bytes_decoded = 0; + int values_decoded = 0; while (i < num_values) { if (bit_reader.IsSet()) { uint32_t len = arrow::util::SafeLoadAs(data); @@ -766,6 +881,7 @@ class PlainByteArrayDecoder : public PlainDecoder, data += increment; data_size -= increment; bytes_decoded += increment; + ++values_decoded; } else { RETURN_NOT_OK(builder->AppendNull()); } @@ -775,14 +891,14 @@ class PlainByteArrayDecoder : public PlainDecoder, data_ += bytes_decoded; len_ -= bytes_decoded; - num_values_ -= num_values; - *values_decoded = num_values; - return ::arrow::Status::OK(); + num_values_ -= values_decoded; + *out_values_decoded = values_decoded; + return arrow::Status::OK(); } template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, - int* values_decoded) { + arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* values_decoded) { num_values = std::min(num_values, num_values_); RETURN_NOT_OK(builder->Reserve(num_values)); int i = 0; @@ -805,7 +921,7 @@ class PlainByteArrayDecoder : public PlainDecoder, len_ -= bytes_decoded; num_values_ -= num_values; *values_decoded = num_values; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } }; @@ -827,7 +943,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // dictionary is not guaranteed to persist in memory after this call so the // dictionary decoder needs to copy the data out if necessary. 
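The net effect of the counting changes in DecodeArrow above is a contract worth stating from the caller's side: num_values and null_count describe the slots to consume from the validity bitmap, while the return value counts only the non-null values appended. A sketch in the spirit of the ArrowBinaryDirectPut tests (the decoder and the array it was fed are assumed from that context):

#include <cassert>
#include <memory>
#include "arrow/array.h"
#include "arrow/builder.h"
#include "parquet/encoding.h"
#include "parquet/exception.h"

void DecodeAllValues(parquet::ByteArrayDecoder* decoder,
                     const std::shared_ptr<arrow::Array>& values) {
  arrow::BinaryBuilder builder;
  int decoded = decoder->DecodeArrow(static_cast<int>(values->length()),
                                     static_cast<int>(values->null_count()),
                                     values->null_bitmap_data(), values->offset(),
                                     &builder);
  // The return value counts only the encoded (non-null) values appended.
  assert(decoded == static_cast<int>(values->length() - values->null_count()));

  std::shared_ptr<arrow::Array> result;
  PARQUET_THROW_NOT_OK(builder.Finish(&result));
  (void)result;
}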
explicit DictDecoderImpl(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::RLE_DICTIONARY), dictionary_(AllocateBuffer(pool, 0)), dictionary_length_(0), @@ -844,7 +960,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { uint8_t bit_width = *data; ++data; --len; - idx_decoder_ = ::arrow::util::RleDecoder(data, len, bit_width); + idx_decoder_ = arrow::util::RleDecoder(data, len, bit_width); } int Decode(T* buffer, int num_values) override { @@ -870,12 +986,11 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { return num_values; } - void InsertDictionary(::arrow::ArrayBuilder* builder) override; + void InsertDictionary(arrow::ArrayBuilder* builder) override; int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::ArrayBuilder* builder) override { - num_values = std::min(num_values, num_values_); + arrow::ArrayBuilder* builder) override { if (num_values > 0) { // TODO(wesm): Refactor to batch reads for improved memory use. It is not // trivial because the null_count is relative to the entire bitmap @@ -893,20 +1008,20 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { /// XXX(wesm): Cannot append "valid bits" directly to the builder std::vector valid_bytes(num_values); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); for (int64_t i = 0; i < num_values; ++i) { valid_bytes[i] = static_cast(bit_reader.IsSet()); bit_reader.Next(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); + auto binary_builder = checked_cast(builder); PARQUET_THROW_NOT_OK( binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); - num_values_ -= num_values; - return num_values; + num_values_ -= num_values - null_count; + return num_values - null_count; } - int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override { + int DecodeIndices(int num_values, arrow::ArrayBuilder* builder) override { num_values = std::min(num_values, num_values_); num_values = std::min(num_values, num_values_); if (num_values > 0) { @@ -921,7 +1036,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) { ParquetException::EofException(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); + auto binary_builder = checked_cast(builder); PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); num_values_ -= num_values; return num_values; @@ -956,7 +1071,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // BinaryDictionary32Builder std::shared_ptr indices_scratch_space_; - ::arrow::util::RleDecoder idx_decoder_; + arrow::util::RleDecoder idx_decoder_; }; template @@ -1019,17 +1134,17 @@ inline void DictDecoderImpl::SetDict(TypedDecoder* dictionar } template -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { +void DictDecoderImpl::InsertDictionary(arrow::ArrayBuilder* builder) { ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types"); } template <> -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { - auto binary_builder = 
checked_cast<::arrow::BinaryDictionary32Builder*>(builder); +void DictDecoderImpl::InsertDictionary(arrow::ArrayBuilder* builder) { + auto binary_builder = checked_cast(builder); // Make an BinaryArray referencing the internal dictionary data - auto arr = std::make_shared<::arrow::BinaryArray>( - dictionary_length_, byte_array_offsets_, byte_array_data_); + auto arr = std::make_shared(dictionary_length_, byte_array_offsets_, + byte_array_data_); PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } @@ -1041,7 +1156,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1050,7 +1165,15 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, arrow::BinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1058,14 +1181,14 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } int DecodeArrowNonNull(int num_values, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; } int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; @@ -1073,24 +1196,26 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, private: template - ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { + arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { constexpr int32_t buffer_size = 1024; int32_t indices_buffer[buffer_size]; + RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; - while (values_decoded < num_values) { + int num_appended = 0; + while (num_appended < num_values) { bool is_valid = bit_reader.IsSet(); bit_reader.Next(); if (is_valid) { int32_t batch_size = - std::min(buffer_size, num_values - values_decoded - null_count); + std::min(buffer_size, num_values - num_appended - null_count); int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); int i = 0; @@ -1100,11 +1225,12 @@ class DictByteArrayDecoderImpl : public 
DictDecoderImpl, const auto& val = dict_values[indices_buffer[i]]; RETURN_NOT_OK(builder->Append(val.ptr, val.len)); ++i; + ++values_decoded; } else { RETURN_NOT_OK(builder->AppendNull()); --null_count; } - ++values_decoded; + ++num_appended; if (i == num_indices) { // Do not advance the bit_reader if we have fulfilled the decode // request @@ -1116,20 +1242,20 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } else { RETURN_NOT_OK(builder->AppendNull()); --null_count; - ++values_decoded; + ++num_appended; } } - if (values_decoded != num_values) { - return ::arrow::Status::IOError("Expected to dictionary-decode ", num_values, - " but only able to decode ", values_decoded); + if (num_values != num_appended) { + return arrow::Status::IOError("Expected to dictionary-decode ", num_values, + " but only able to decode ", num_appended); } *out_num_values = values_decoded; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, - int* out_num_values) { + arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* out_num_values) { constexpr int32_t buffer_size = 2048; int32_t indices_buffer[buffer_size]; int values_decoded = 0; @@ -1151,7 +1277,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, ParquetException::EofException(); } *out_num_values = values_decoded; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } }; @@ -1170,7 +1296,7 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecodernum_values_ = num_values; - decoder_ = ::arrow::BitUtil::BitReader(data, len); + decoder_ = arrow::BitUtil::BitReader(data, len); values_current_block_ = 0; values_current_mini_block_ = 0; } @@ -1242,8 +1368,8 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder { public: - explicit DeltaLengthByteArrayDecoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY), len_decoder_(nullptr, pool) {} @@ -1303,9 +1428,8 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder { public: - explicit DeltaByteArrayDecoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY), prefix_len_decoder_(nullptr, pool), suffix_decoder_(nullptr, pool), @@ -1387,7 +1511,7 @@ namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + MemoryPool* pool) { switch (type_num) { case Type::BOOLEAN: ParquetException::NYI("Dictionary encoding not implemented for boolean type"); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 5aa1fed74b6..618fd1a4c0c 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -28,7 +28,10 @@ namespace arrow { +class Array; class ArrayBuilder; +class BinaryArray; +class BinaryBuilder; class BinaryDictionary32Builder; namespace internal { @@ -51,7 +54,9 @@ class Encoder { virtual std::shared_ptr FlushValues() = 0; virtual Encoding::type encoding() const = 0; - virtual ::arrow::MemoryPool* memory_pool() const = 
0; + virtual void Put(const ::arrow::Array& values) = 0; + + virtual MemoryPool* memory_pool() const = 0; }; // Base class for value encoders. Since encoders may or not have state (e.g., @@ -63,25 +68,12 @@ class TypedEncoder : virtual public Encoder { public: typedef typename DType::c_type T; + using Encoder::Put; + virtual void Put(const T* src, int num_values) = 0; virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset) { - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(::arrow::AllocateResizableBuffer( - this->memory_pool(), num_values * sizeof(T), &buffer)); - int32_t num_valid_values = 0; - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); - T* data = reinterpret_cast(buffer->mutable_data()); - for (int32_t i = 0; i < num_values; i++) { - if (valid_bits_reader.IsSet()) { - data[num_valid_values++] = src[i]; - } - valid_bits_reader.Next(); - } - Put(data, num_valid_values); - } + int64_t valid_bits_offset) = 0; }; // Base class for dictionary encoders @@ -105,6 +97,20 @@ class DictEncoder : virtual public TypedEncoder { virtual void WriteDict(uint8_t* buffer) = 0; virtual int num_entries() const = 0; + + /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is + /// assumed (without any boundschecking) that the indices reference + /// pre-existing dictionary values + /// \param[in] indices the dictionary index values. Only Int32Array currently + /// supported + virtual void PutIndices(const ::arrow::Array& indices) = 0; + + /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices + /// separately. Currently throws exception if the current dictionary memo is + /// non-empty + /// \param[in] values the dictionary values. 
Only valid for certain + /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray + virtual void PutDictionary(const ::arrow::Array& values) = 0; }; // ---------------------------------------------------------------------- @@ -204,8 +210,8 @@ using Int64Encoder = TypedEncoder; using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; -class ByteArrayEncoder : virtual public TypedEncoder {}; -class FLBAEncoder : virtual public TypedEncoder {}; +using ByteArrayEncoder = TypedEncoder; +using FLBAEncoder = TypedEncoder; class BooleanDecoder : virtual public TypedDecoder { public: @@ -223,6 +229,7 @@ class ByteArrayDecoder : virtual public TypedDecoder { public: using TypedDecoder::DecodeSpaced; + /// \brief Returns number of encoded values decoded virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::BinaryDictionary32Builder* builder) = 0; @@ -230,6 +237,11 @@ class ByteArrayDecoder : virtual public TypedDecoder { virtual int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionary32Builder* builder) = 0; + /// \brief Returns number of encoded values decoded + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, ::arrow::BinaryBuilder* builder) = 0; + + /// \brief Returns number of encoded values decoded virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; @@ -331,7 +343,7 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, template std::unique_ptr> MakeDictDecoder( - const ColumnDescriptor* descr, + const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = DictDecoder; auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 5410dc8367c..c8718f07d62 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -81,14 +81,14 @@ static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - return TypedStatistics::Make( + return MakeStatistics( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); } // Default behavior - return TypedStatistics::Make( + return MakeStatistics( descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index b7e55f0cc96..209969a0054 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -482,7 +482,8 @@ class PARQUET_EXPORT ArrowWriterProperties { : write_timestamps_as_int96_(false), coerce_timestamps_enabled_(false), coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), - truncated_timestamps_allowed_(false) {} + truncated_timestamps_allowed_(false), + store_schema_(false) {} virtual ~Builder() {} Builder* disable_deprecated_int96_timestamps() { @@ -511,10 +512,18 @@ class PARQUET_EXPORT 
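A rough usage sketch of the two experimental DictEncoder hooks above. The MakeTypedEncoder construction and the ArrayFromJSON test helper are assumptions made here for illustration only; they are not part of this patch.

    #include "arrow/testing/gtest_util.h"  // ArrayFromJSON (test-only convenience, assumed available)
    #include "parquet/encoding.h"

    void PutDictionaryThenIndices(const parquet::ColumnDescriptor* descr) {
      // Hypothetical construction; in practice a dictionary encoder is created
      // internally by the column writer.
      auto base = parquet::MakeTypedEncoder<parquet::ByteArrayType>(
          parquet::Encoding::PLAIN, /*use_dictionary=*/true, descr);
      auto* encoder =
          dynamic_cast<parquet::DictEncoder<parquet::ByteArrayType>*>(base.get());

      // PutDictionary must be called while the dictionary memo is still empty,
      // and the dictionary values are expected to contain no nulls.
      auto dictionary =
          ::arrow::ArrayFromJSON(::arrow::binary(), R"(["a", "b", "c"])");
      encoder->PutDictionary(*dictionary);

      // PutIndices appends the index values as-is; they are not bounds-checked
      // against the dictionary.
      auto indices = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 2, 2, 1]");
      encoder->PutIndices(*indices);
    }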
ArrowWriterProperties { return this; } + /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file, + /// to enable certain read options (like "read_dictionary") to be set + /// automatically + Builder* store_schema() { + store_schema_ = true; + return this; + } + std::shared_ptr build() { return std::shared_ptr(new ArrowWriterProperties( write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, - truncated_timestamps_allowed_)); + truncated_timestamps_allowed_, store_schema_)); } private: @@ -523,6 +532,8 @@ class PARQUET_EXPORT ArrowWriterProperties { bool coerce_timestamps_enabled_; ::arrow::TimeUnit::type coerce_timestamps_unit_; bool truncated_timestamps_allowed_; + + bool store_schema_; }; bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } @@ -534,20 +545,24 @@ class PARQUET_EXPORT ArrowWriterProperties { bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; } + bool store_schema() const { return store_schema_; } + private: explicit ArrowWriterProperties(bool write_nanos_as_int96, bool coerce_timestamps_enabled, ::arrow::TimeUnit::type coerce_timestamps_unit, - bool truncated_timestamps_allowed) + bool truncated_timestamps_allowed, bool store_schema) : write_timestamps_as_int96_(write_nanos_as_int96), coerce_timestamps_enabled_(coerce_timestamps_enabled), coerce_timestamps_unit_(coerce_timestamps_unit), - truncated_timestamps_allowed_(truncated_timestamps_allowed) {} + truncated_timestamps_allowed_(truncated_timestamps_allowed), + store_schema_(store_schema) {} const bool write_timestamps_as_int96_; const bool coerce_timestamps_enabled_; const ::arrow::TimeUnit::type coerce_timestamps_unit_; const bool truncated_timestamps_allowed_; + const bool store_schema_; }; /// \brief State object used for writing Arrow data directly to a Parquet diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc index fa1caa96d31..84150d1a8bf 100644 --- a/cpp/src/parquet/statistics-test.cc +++ b/cpp/src/parquet/statistics-test.cc @@ -62,8 +62,7 @@ static FLBA FLBAFromString(const std::string& s) { } TEST(Comparison, SignedByteArray) { - auto comparator = - TypedComparator::Make(Type::BYTE_ARRAY, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::BYTE_ARRAY, SortOrder::SIGNED); std::string s1 = "12345"; std::string s2 = "12345678"; @@ -82,8 +81,7 @@ TEST(Comparison, SignedByteArray) { TEST(Comparison, UnsignedByteArray) { // Check if UTF-8 is compared using unsigned correctly - auto comparator = - TypedComparator::Make(Type::BYTE_ARRAY, SortOrder::UNSIGNED); + auto comparator = MakeComparator(Type::BYTE_ARRAY, SortOrder::UNSIGNED); std::string s1 = "arrange"; std::string s2 = "arrangement"; @@ -107,8 +105,8 @@ TEST(Comparison, UnsignedByteArray) { TEST(Comparison, SignedFLBA) { int size = 10; - auto comparator = TypedComparator::Make(Type::FIXED_LEN_BYTE_ARRAY, - SortOrder::SIGNED, size); + auto comparator = + MakeComparator(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED, size); std::string s1 = "Anti123456"; std::string s2 = "Bunkd123456"; @@ -125,8 +123,8 @@ TEST(Comparison, SignedFLBA) { TEST(Comparison, UnsignedFLBA) { int size = 10; - auto comparator = TypedComparator::Make(Type::FIXED_LEN_BYTE_ARRAY, - SortOrder::UNSIGNED, size); + auto comparator = + MakeComparator(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::UNSIGNED, size); std::string s1 = "Anti123456"; std::string s2 = "Bunkd123456"; @@ -146,7 +144,7 @@ TEST(Comparison, SignedInt96) { parquet::Int96 aa{{1, 41, 
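A minimal sketch of opting in to the new store_schema option from C++. Only the builder call below is defined by this patch; the surrounding usage is assumed.

    // Embed the serialized Arrow schema in the Parquet file metadata so that
    // readers can reconstruct Arrow types (e.g. dictionary columns) automatically.
    std::shared_ptr<parquet::ArrowWriterProperties> arrow_props =
        parquet::ArrowWriterProperties::Builder().store_schema()->build();
    // arrow_props would then be passed as the ArrowWriterProperties argument of
    // parquet::arrow::WriteTable(...).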
14}}, bb{{1, 41, 14}}; parquet::Int96 aaa{{1, 41, static_cast(-14)}}, bbb{{1, 41, 42}}; - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::SIGNED); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -158,7 +156,7 @@ TEST(Comparison, UnsignedInt96) { parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, static_cast(-14)}}; parquet::Int96 aaa, bbb; - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::UNSIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::UNSIGNED); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(comparator->Compare(aa, bb)); @@ -197,7 +195,7 @@ TEST(Comparison, SignedInt64) { NodePtr node = PrimitiveNode::Make("SignedInt64", Repetition::REQUIRED, Type::INT64); ColumnDescriptor descr(node, 0, 0); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -214,7 +212,7 @@ TEST(Comparison, UnsignedInt64) { ColumnDescriptor descr(node, 0, 0); ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order()); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -231,7 +229,7 @@ TEST(Comparison, UnsignedInt32) { ColumnDescriptor descr(node, 0, 0); ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order()); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -253,7 +251,6 @@ template class TestStatistics : public PrimitiveTypedTest { public: using T = typename TestType::c_type; - using TypedStats = TypedStatistics; std::vector GetDeepCopy( const std::vector&); // allocates new memory for FLBA/ByteArray @@ -264,15 +261,16 @@ class TestStatistics : public PrimitiveTypedTest { void TestMinMaxEncode() { this->GenerateData(1000); - auto statistics1 = TypedStats::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); statistics1->Update(this->values_ptr_, this->values_.size(), 0); std::string encoded_min = statistics1->EncodeMin(); std::string encoded_max = statistics1->EncodeMax(); - auto statistics2 = TypedStats::Make(this->schema_.Column(0), encoded_min, encoded_max, - this->values_.size(), 0, 0, true); + auto statistics2 = + MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); - auto statistics3 = TypedStats::Make(this->schema_.Column(0)); + auto statistics3 = MakeStatistics(this->schema_.Column(0)); std::vector valid_bits( BitUtil::BytesForBits(static_cast(this->values_.size())) + 1, 255); statistics3->UpdateSpaced(this->values_ptr_, valid_bits.data(), 0, @@ -293,7 +291,7 @@ class TestStatistics : public PrimitiveTypedTest { void TestReset() { this->GenerateData(1000); - auto statistics = TypedStats::Make(this->schema_.Column(0)); + auto statistics = MakeStatistics(this->schema_.Column(0)); statistics->Update(this->values_ptr_, this->values_.size(), 0); ASSERT_EQ(this->values_.size(), statistics->num_values()); @@ -308,17 +306,17 @@ class TestStatistics : public PrimitiveTypedTest { int num_null[2]; random_numbers(2, 42, 0, 100, num_null); - auto statistics1 = 
TypedStats::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); this->GenerateData(1000); statistics1->Update(this->values_ptr_, this->values_.size() - num_null[0], num_null[0]); - auto statistics2 = TypedStats::Make(this->schema_.Column(0)); + auto statistics2 = MakeStatistics(this->schema_.Column(0)); this->GenerateData(1000); statistics2->Update(this->values_ptr_, this->values_.size() - num_null[1], num_null[1]); - auto total = TypedStats::Make(this->schema_.Column(0)); + auto total = MakeStatistics(this->schema_.Column(0)); total->Merge(*statistics1); total->Merge(*statistics2); @@ -332,7 +330,7 @@ class TestStatistics : public PrimitiveTypedTest { this->GenerateData(num_values); // compute statistics for the whole batch - auto expected_stats = TypedStats::Make(this->schema_.Column(0)); + auto expected_stats = MakeStatistics(this->schema_.Column(0)); expected_stats->Update(this->values_ptr_, num_values - null_count, null_count); auto sink = CreateOutputStream(); @@ -456,7 +454,7 @@ template <> void TestStatistics::TestMinMaxEncode() { this->GenerateData(1000); // Test that we encode min max strings correctly - auto statistics1 = TypedStatistics::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); statistics1->Update(this->values_ptr_, this->values_.size(), 0); std::string encoded_min = statistics1->EncodeMin(); std::string encoded_max = statistics1->EncodeMax(); @@ -470,8 +468,8 @@ void TestStatistics::TestMinMaxEncode() { statistics1->max().len)); auto statistics2 = - TypedStatistics::Make(this->schema_.Column(0), encoded_min, - encoded_max, this->values_.size(), 0, 0, true); + MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); @@ -833,6 +831,25 @@ TYPED_TEST(TestStatisticsSortOrder, MinMax) { ASSERT_NO_FATAL_FAILURE(this->VerifyParquetStats()); } +TEST(TestByteArrayStatisticsFromArrow, Basics) { + // Part of ARROW-3246. 
Replicating TestStatisticsSortOrder test but via Arrow + + auto values = ArrayFromJSON(::arrow::utf8(), + u8"[\"c123\", \"b123\", \"a123\", null, " + "null, \"f123\", \"g123\", \"h123\", \"i123\", \"ΓΌ123\"]"); + + const auto& typed_values = static_cast(*values); + + NodePtr node = PrimitiveNode::Make("field", Repetition::REQUIRED, Type::BYTE_ARRAY, + ConvertedType::UTF8); + ColumnDescriptor descr(node, 0, 0); + auto stats = MakeStatistics(&descr); + ASSERT_NO_FATAL_FAILURE(stats->Update(*values)); + + ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min()); + ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max()); +} + // Ensure UNKNOWN sort order is handled properly using TestStatisticsSortOrderFLBA = TestStatisticsSortOrder; @@ -873,7 +890,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValues) { } // Test values - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); nan_stats->Update(&values[0], NUM_VALUES, 0); float min = nan_stats->min(); float max = nan_stats->max(); @@ -881,7 +898,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValues) { ASSERT_EQ(max, 3.0f); // Test all NaNs - auto all_nan_stats = TypedStatistics::Make(&descr); + auto all_nan_stats = MakeStatistics(&descr); all_nan_stats->Update(&nan_values[0], NUM_VALUES, 0); min = all_nan_stats->min(); max = all_nan_stats->max(); @@ -925,7 +942,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValuesSpaced) { std::vector valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255); // Test values - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); nan_stats->UpdateSpaced(&values[0], valid_bits.data(), 0, NUM_VALUES, 0); float min = nan_stats->min(); float max = nan_stats->max(); @@ -933,7 +950,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValuesSpaced) { ASSERT_EQ(max, 3.0f); // Test all NaNs - auto all_nan_stats = TypedStatistics::Make(&descr); + auto all_nan_stats = MakeStatistics(&descr); all_nan_stats->UpdateSpaced(&nan_values[0], valid_bits.data(), 0, NUM_VALUES, 0); min = all_nan_stats->min(); max = all_nan_stats->max(); @@ -968,7 +985,7 @@ TEST(TestStatisticsSortOrderDoubleNaN, NaNValues) { NodePtr node = PrimitiveNode::Make("nan_double", Repetition::OPTIONAL, Type::DOUBLE); ColumnDescriptor descr(node, 1, 1); - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); double values[NUM_VALUES] = {std::nan(""), std::nan(""), -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; double* values_ptr = &values[0]; diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 16abc152cf4..4bada835741 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -20,6 +20,9 @@ #include #include +#include "arrow/array.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "parquet/encoding.h" @@ -30,6 +33,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::internal::checked_cast; namespace parquet { @@ -126,14 +130,14 @@ struct CompareHelper { } }; -template -class TypedComparatorImpl : public TypedComparator { +template +class TypedComparatorImpl : virtual public TypedComparator { public: typedef typename DType::c_type T; explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {} - bool CompareInline(const T& a, const T& b) { + bool CompareInline(const T& a, const T& b) const { return CompareHelper::Compare(type_length_, a, b); } @@ -157,9 +161,18 @@ class TypedComparatorImpl : public 
TypedComparator { int64_t valid_bits_offset, T* out_min, T* out_max) override { ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, length); - T min = values[0]; - T max = values[0]; - for (int64_t i = 0; i < length; i++) { + + // Find the first non-null value + int64_t first_non_null = 0; + while (!valid_bits_reader.IsSet()) { + ++first_non_null; + valid_bits_reader.Next(); + } + + T min = values[first_non_null]; + T max = values[first_non_null]; + valid_bits_reader.Next(); + for (int64_t i = first_non_null + 1; i < length; i++) { if (valid_bits_reader.IsSet()) { if (CompareInline(values[i], min)) { min = values[i]; @@ -173,46 +186,114 @@ class TypedComparatorImpl : public TypedComparator { *out_max = max; } + void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) override; + private: int type_length_; }; +template +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + typename DType::c_type* out_min, + typename DType::c_type* out_max) { + ParquetException::NYI(values.type()->ToString()); +} + +template +void GetMinMaxBinaryHelper( + const TypedComparatorImpl& comparator, + const ::arrow::Array& values, ByteArray* out_min, ByteArray* out_max) { + const auto& data = checked_cast(values); + + ByteArray min, max; + if (data.null_count() > 0) { + ::arrow::internal::BitmapReader valid_bits_reader(data.null_bitmap_data(), + data.offset(), data.length()); + + int64_t first_non_null = 0; + while (!valid_bits_reader.IsSet()) { + ++first_non_null; + valid_bits_reader.Next(); + } + min = data.GetView(first_non_null); + max = data.GetView(first_non_null); + for (int64_t i = first_non_null; i < data.length(); i++) { + ByteArray val = data.GetView(i); + if (valid_bits_reader.IsSet()) { + if (comparator.CompareInline(val, min)) { + min = val; + } else if (comparator.CompareInline(max, val)) { + max = val; + } + } + valid_bits_reader.Next(); + } + } else { + min = data.GetView(0); + max = data.GetView(0); + for (int64_t i = 0; i < data.length(); i++) { + ByteArray val = data.GetView(i); + if (comparator.CompareInline(val, min)) { + min = val; + } else if (comparator.CompareInline(max, val)) { + max = val; + } + } + } + *out_min = min; + *out_max = max; +} + +template <> +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + ByteArray* out_min, + ByteArray* out_max) { + GetMinMaxBinaryHelper(*this, values, out_min, out_max); +} + +template <> +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + ByteArray* out_min, + ByteArray* out_max) { + GetMinMaxBinaryHelper(*this, values, out_min, out_max); +} + std::shared_ptr Comparator::Make(Type::type physical_type, SortOrder::type sort_order, int type_length) { if (SortOrder::SIGNED == sort_order) { switch (physical_type) { case Type::BOOLEAN: - return std::make_shared>(); + return std::make_shared>(); case Type::INT32: - return std::make_shared>(); + return std::make_shared>(); case Type::INT64: - return std::make_shared>(); + return std::make_shared>(); case Type::INT96: - return std::make_shared>(); + return std::make_shared>(); case Type::FLOAT: - return std::make_shared>(); + return std::make_shared>(); case Type::DOUBLE: - return std::make_shared>(); + return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared>(); + return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared>(type_length); + return std::make_shared>(type_length); default: ParquetException::NYI("Signed Compare not implemented"); } } else if (SortOrder::UNSIGNED 
== sort_order) { switch (physical_type) { case Type::INT32: - return std::make_shared>(); + return std::make_shared>(); case Type::INT64: - return std::make_shared>(); + return std::make_shared>(); case Type::INT96: - return std::make_shared>(); + return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared>(); + return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared>(type_length); + return std::make_shared>(type_length); default: ParquetException::NYI("Unsigned Compare not implemented"); } @@ -228,6 +309,59 @@ std::shared_ptr Comparator::Make(const ColumnDescriptor* descr) { // ---------------------------------------------------------------------- +template +struct StatsHelper { + bool CanHaveNaN() { return false; } + + inline int64_t GetValueBeginOffset(const T* values, int64_t count) { return 0; } + + inline int64_t GetValueEndOffset(const T* values, int64_t count) { return count; } + + inline bool IsNaN(const T value) { return false; } +}; + +template +struct StatsHelper::value>::type> { + bool CanHaveNaN() { return true; } + + inline int64_t GetValueBeginOffset(const T* values, int64_t count) { + // Skip NaNs + for (int64_t i = 0; i < count; i++) { + if (!std::isnan(values[i])) { + return i; + } + } + return count; + } + + inline int64_t GetValueEndOffset(const T* values, int64_t count) { + // Skip NaNs + for (int64_t i = (count - 1); i >= 0; i--) { + if (!std::isnan(values[i])) { + return (i + 1); + } + } + return 0; + } + + inline bool IsNaN(const T value) { return std::isnan(value); } +}; + +template +void SetNaN(T* value) { + // no-op +} + +template <> +void SetNaN(float* value) { + *value = std::nanf(""); +} + +template <> +void SetNaN(double* value) { + *value = std::nan(""); +} + template class TypedStatisticsImpl : public TypedStatistics { public: @@ -305,6 +439,25 @@ class TypedStatisticsImpl : public TypedStatistics { void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_spaced, int64_t num_not_null, int64_t num_null) override; + void Update(const ::arrow::Array& values) override { + IncrementNullCount(values.null_count()); + IncrementNumValues(values.length() - values.null_count()); + + // TODO: support distinct count? 
+ if (values.null_count() == values.length()) { + return; + } + + StatsHelper helper; + if (helper.CanHaveNaN()) { + ParquetException::NYI("No NaN handling for Arrow arrays yet"); + } + + T batch_min, batch_max; + comparator_->GetMinMax(values, &batch_min, &batch_max); + SetMinMax(batch_min, batch_max); + } + const T& min() const override { return min_; } const T& max() const override { return max_; } @@ -393,55 +546,6 @@ inline void TypedStatisticsImpl::Copy(const ByteArray& src, ByteA *dst = ByteArray(src.len, buffer->data()); } -template -struct StatsHelper { - inline int64_t GetValueBeginOffset(const T* values, int64_t count) { return 0; } - - inline int64_t GetValueEndOffset(const T* values, int64_t count) { return count; } - - inline bool IsNaN(const T value) { return false; } -}; - -template -struct StatsHelper::value>::type> { - inline int64_t GetValueBeginOffset(const T* values, int64_t count) { - // Skip NaNs - for (int64_t i = 0; i < count; i++) { - if (!std::isnan(values[i])) { - return i; - } - } - return count; - } - - inline int64_t GetValueEndOffset(const T* values, int64_t count) { - // Skip NaNs - for (int64_t i = (count - 1); i >= 0; i--) { - if (!std::isnan(values[i])) { - return (i + 1); - } - } - return 0; - } - - inline bool IsNaN(const T value) { return std::isnan(value); } -}; - -template -void SetNaN(T* value) { - // no-op -} - -template <> -void SetNaN(float* value) { - *value = std::nanf(""); -} - -template <> -void SetNaN(double* value) { - *value = std::nan(""); -} - template void TypedStatisticsImpl::Update(const T* values, int64_t num_not_null, int64_t num_null) { @@ -461,7 +565,7 @@ void TypedStatisticsImpl::Update(const T* values, int64_t num_not_null, int64_t end_offset = helper.GetValueEndOffset(values, num_not_null); // All values are NaN - if (end_offset < begin_offset) { + if (helper.CanHaveNaN() && end_offset < begin_offset) { // Set min/max to NaNs in this case. 
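For reference, a small sketch of driving the new Arrow-array statistics update path directly. It mirrors the new statistics-test case and borrows the ArrayFromJSON test helper as an assumed convenience.

    parquet::schema::NodePtr node = parquet::schema::PrimitiveNode::Make(
        "field", parquet::Repetition::REQUIRED, parquet::Type::BYTE_ARRAY);
    parquet::ColumnDescriptor descr(node, 0, 0);
    auto stats = parquet::MakeStatistics<parquet::ByteArrayType>(&descr);

    auto values = ::arrow::ArrayFromJSON(::arrow::utf8(), R"(["b", "a", null, "c"])");
    stats->Update(*values);  // nulls only increment the null count
    // stats->min() == ByteArray("a"), stats->max() == ByteArray("c");
    // stats->EncodeMin() / stats->EncodeMax() then serialize those values.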
// Don't set has_min_max flag since // these values must be over-written by valid stats later @@ -494,26 +598,28 @@ void TypedStatisticsImpl::UpdateSpaced(const T* values, const uint8_t* va // As (num_not_null != 0) there must be one int64_t length = num_null + num_not_null; int64_t i = 0; - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - length); StatsHelper helper; - for (; i < length; i++) { - // PARQUET-1225: Handle NaNs - if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) { - break; + if (helper.CanHaveNaN()) { + ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + length); + for (; i < length; i++) { + // PARQUET-1225: Handle NaNs + if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) { + break; + } + valid_bits_reader.Next(); } - valid_bits_reader.Next(); - } - // All are NaNs and stats are not set yet - if ((i == length) && helper.IsNaN(values[i - 1])) { - // Don't set has_min_max flag since - // these values must be over-written by valid stats later - if (!has_min_max_) { - SetNaN(&min_); - SetNaN(&max_); + // All are NaNs and stats are not set yet + if ((i == length) && helper.IsNaN(values[i - 1])) { + // Don't set has_min_max flag since + // these values must be over-written by valid stats later + if (!has_min_max_) { + SetNaN(&min_); + SetNaN(&max_); + } + return; } - return; } // Find min and max values from remaining non-NaN values diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 402b3c38923..30d58aafd8d 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -26,6 +26,13 @@ #include "parquet/platform.h" #include "parquet/types.h" +namespace arrow { + +class Array; +class BinaryArray; + +} // namespace arrow + namespace parquet { class ColumnDescriptor; @@ -63,19 +70,6 @@ class TypedComparator : public Comparator { public: using T = typename DType::c_type; - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(Type::type physical_type, - SortOrder::type sort_order, - int type_length = -1) { - return std::static_pointer_cast>( - Comparator::Make(physical_type, sort_order, type_length)); - } - - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(const ColumnDescriptor* descr) { - return std::static_pointer_cast>(Comparator::Make(descr)); - } - /// \brief Scalar comparison of two elements, return true if first /// is strictly less than the second virtual bool Compare(const T& a, const T& b) = 0; @@ -84,6 +78,11 @@ class TypedComparator : public Comparator { /// elements without any nulls virtual void GetMinMax(const T* values, int64_t length, T* out_min, T* out_max) = 0; + /// \brief Compute minimum and maximum elements from an Arrow array. 
Only + /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY + /// / arrow::BinaryArray + virtual void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) = 0; + /// \brief Compute maximum and minimum elements in a batch of /// elements with accompanying bitmap indicating which elements are /// included (bit set) and excluded (bit not set) @@ -100,6 +99,21 @@ class TypedComparator : public Comparator { int64_t valid_bits_offset, T* out_min, T* out_max) = 0; }; +/// \brief Typed version of Comparator::Make +template +std::shared_ptr> MakeComparator(Type::type physical_type, + SortOrder::type sort_order, + int type_length = -1) { + return std::static_pointer_cast>( + Comparator::Make(physical_type, sort_order, type_length)); +} + +/// \brief Typed version of Comparator::Make +template +std::shared_ptr> MakeComparator(const ColumnDescriptor* descr) { + return std::static_pointer_cast>(Comparator::Make(descr)); +} + // ---------------------------------------------------------------------- /// \brief Structure represented encoded statistics to be written to @@ -137,33 +151,33 @@ class PARQUET_EXPORT EncodedStatistics { } } - inline bool is_set() const { + bool is_set() const { return has_min || has_max || has_null_count || has_distinct_count; } - inline bool is_signed() const { return is_signed_; } + bool is_signed() const { return is_signed_; } - inline void set_is_signed(bool is_signed) { is_signed_ = is_signed; } + void set_is_signed(bool is_signed) { is_signed_ = is_signed; } - inline EncodedStatistics& set_max(const std::string& value) { + EncodedStatistics& set_max(const std::string& value) { *max_ = value; has_max = true; return *this; } - inline EncodedStatistics& set_min(const std::string& value) { + EncodedStatistics& set_min(const std::string& value) { *min_ = value; has_min = true; return *this; } - inline EncodedStatistics& set_null_count(int64_t value) { + EncodedStatistics& set_null_count(int64_t value) { null_count = value; has_null_count = true; return *this; } - inline EncodedStatistics& set_distinct_count(int64_t value) { + EncodedStatistics& set_distinct_count(int64_t value) { distinct_count = value; has_distinct_count = true; return *this; @@ -242,39 +256,6 @@ class TypedStatistics : public Statistics { public: using T = typename DType::c_type; - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, pool)); - } - - /// \brief Create Statistics initialized to a particular state - /// \param[in] min the minimum value - /// \param[in] max the minimum value - /// \param[in] num_values number of values - /// \param[in] null_count number of null values - /// \param[in] distinct_count number of distinct values - static std::shared_ptr> Make(const T& min, const T& max, - int64_t num_values, - int64_t null_count, - int64_t distinct_count) { - return std::static_pointer_cast>(Statistics::Make( - DType::type_num, &min, &max, num_values, null_count, distinct_count)); - } - - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, encoded_min, 
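A short sketch of the relocated comparator factory together with the new Arrow-array GetMinMax overload (again leaning on the ArrayFromJSON test helper as an assumed convenience):

    auto comparator = parquet::MakeComparator<parquet::ByteArrayType>(
        parquet::Type::BYTE_ARRAY, parquet::SortOrder::UNSIGNED);

    auto values = ::arrow::ArrayFromJSON(::arrow::utf8(), R"(["b", "a", "c"])");
    parquet::ByteArray min, max;
    comparator->GetMinMax(*values, &min, &max);
    // min == ByteArray("a"), max == ByteArray("c"); for physical types other
    // than BYTE_ARRAY this overload currently raises ParquetException::NYI.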
encoded_max, num_values, null_count, - distinct_count, has_min_max, pool)); - } - /// \brief The current minimum value virtual const T& min() const = 0; @@ -289,18 +270,19 @@ class TypedStatistics : public Statistics { /// \brief Batch statistics update with supplied validity bitmap virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits, - int64_t valid_bits_spaced, int64_t num_not_null, + int64_t valid_bits_offset, int64_t num_not_null, int64_t num_null) = 0; + /// \brief EXPERIMENTAL: Update statistics with an Arrow array without + /// conversion to a primitive Parquet C type. Only implemented for certain + /// Parquet type / Arrow type combinations like BYTE_ARRAY / + /// arrow::BinaryArray + virtual void Update(const ::arrow::Array& values) = 0; + /// \brief Set min and max values to particular values virtual void SetMinMax(const T& min, const T& max) = 0; }; -#ifndef ARROW_NO_DEPRECATED_API -// TODO(wesm): Remove after Arrow 0.14.0 -using RowGroupStatistics = Statistics; -#endif - using BoolStatistics = TypedStatistics; using Int32Statistics = TypedStatistics; using Int64Statistics = TypedStatistics; @@ -309,4 +291,40 @@ using DoubleStatistics = TypedStatistics; using ByteArrayStatistics = TypedStatistics; using FLBAStatistics = TypedStatistics; +/// \brief Typed version of Statistics::Make +template +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>(Statistics::Make(descr, pool)); +} + +/// \brief Create Statistics initialized to a particular state +/// \param[in] min the minimum value +/// \param[in] max the minimum value +/// \param[in] num_values number of values +/// \param[in] null_count number of null values +/// \param[in] distinct_count number of distinct values +template +std::shared_ptr> MakeStatistics(const typename DType::c_type& min, + const typename DType::c_type& max, + int64_t num_values, + int64_t null_count, + int64_t distinct_count) { + return std::static_pointer_cast>(Statistics::Make( + DType::type_num, &min, &max, num_values, null_count, distinct_count)); +} + +/// \brief Typed version of Statistics::Make +template +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>( + Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, has_min_max, pool)); +} + } // namespace parquet diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index bc456ea24a8..30395f37ec4 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -493,6 +493,10 @@ class ColumnOrder { struct ByteArray { ByteArray() : len(0), ptr(NULLPTR) {} ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + + ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion + : ByteArray(static_cast(view.size()), + reinterpret_cast(view.data())) {} uint32_t len; const uint8_t* ptr; }; diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 82ca9fbb33e..97e73cb6468 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -365,6 +365,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* coerce_timestamps(TimeUnit unit) Builder* allow_truncated_timestamps() Builder* 
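The implicit ByteArray constructor added in types.h lets string_view-producing APIs such as BinaryArray::GetView interoperate with statistics and comparators without manual len/ptr bookkeeping; a tiny sketch (binary_array and stats are hypothetical names):

    ::arrow::util::string_view view = "a123";
    parquet::ByteArray ba(view);  // len and ptr are taken from the view
    // e.g. in tests: ASSERT_EQ(ByteArray(binary_array.GetView(i)), stats->min());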
disallow_truncated_timestamps() + Builder* store_schema() shared_ptr[ArrowWriterProperties] build() c_bool support_deprecated_int96_timestamps() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index d51e7fb3c6c..82da6b572d8 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1226,6 +1226,11 @@ cdef class ParquetWriter: properties = properties_builder.build() cdef ArrowWriterProperties.Builder arrow_properties_builder + + # Store the original Arrow schema so things like dictionary types can + # be automatically reconstructed + arrow_properties_builder.store_schema() + self._set_int96_support(&arrow_properties_builder) self._set_coerce_timestamps(&arrow_properties_builder) self._set_allow_truncated_timestamps(&arrow_properties_builder) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b143dd4bada..fc620c1eea7 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -159,10 +159,10 @@ def alltypes_sample(size=10000, seed=0, categorical=False): @pytest.mark.pandas @pytest.mark.parametrize('chunk_size', [None, 1000]) -def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size): +def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): df = alltypes_sample(size=10000, categorical=True) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert arrow_table.schema.pandas_metadata is not None @@ -173,7 +173,7 @@ def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size): assert arrow_table.schema.metadata == table_read.schema.metadata - df_read = table_read.to_pandas(categories=['str_category']) + df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read, check_categorical=False) @@ -297,7 +297,7 @@ def test_datetime_timezone_tzinfo(): def test_pandas_parquet_custom_metadata(tempdir): df = alltypes_sample(size=10000) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata @@ -321,7 +321,7 @@ def test_pandas_parquet_column_multiindex(tempdir): names=['level_1', 'level_2'] ) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert arrow_table.schema.pandas_metadata is not None @@ -333,10 +333,10 @@ def test_pandas_parquet_column_multiindex(tempdir): @pytest.mark.pandas -def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir): +def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): df = alltypes_sample(size=10000) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, preserve_index=False) js = arrow_table.schema.pandas_metadata assert not js['index_columns'] @@ -357,7 +357,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir): @pytest.mark.pandas -def test_pandas_parquet_1_0_rountrip(tempdir): +def test_pandas_parquet_1_0_roundtrip(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -376,7 +376,7 @@ def test_pandas_parquet_1_0_rountrip(tempdir): 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], 'empty_str': [''] * size }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, 
filename, version='1.0') table_read = _read_table(filename) @@ -415,7 +415,7 @@ def test_pandas_column_selection(tempdir): 'uint8': np.arange(size, dtype=np.uint8), 'uint16': np.arange(size, dtype=np.uint16) }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename) table_read = _read_table(filename, columns=['uint8']) @@ -567,7 +567,7 @@ def test_pandas_parquet_configuration_options(tempdir): 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0 }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) for use_dictionary in [True, False]: @@ -883,7 +883,7 @@ def test_validate_schema_write_table(tempdir): def test_column_of_arrays(tempdir): df, schema = dataframe_with_arrays() - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms') table_read = _read_table(filename) @@ -914,7 +914,7 @@ def test_coerce_timestamps(tempdir): df = pd.DataFrame(arrays) schema = pa.schema(fields) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version="2.0", coerce_timestamps='us') @@ -967,7 +967,7 @@ def test_coerce_timestamps_truncated(tempdir): def test_column_of_lists(tempdir): df, schema = dataframe_with_lists(parquet_compatible=True) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version='2.0') table_read = _read_table(filename) @@ -1888,8 +1888,12 @@ def test_read_schema(tempdir): table = pa.Table.from_pandas(df) _write_table(table, data_path) - assert table.schema.equals(pq.read_schema(data_path)) - assert table.schema.equals(pq.read_schema(data_path, memory_map=True)) + read1 = pq.read_schema(data_path) + read2 = pq.read_schema(data_path, memory_map=True) + assert table.schema.equals(read1, check_metadata=False) + assert table.schema.equals(read2, check_metadata=False) + + assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas'] def _filter_partition(df, part_keys): @@ -2981,6 +2985,36 @@ def test_parquet_file_too_small(tempdir): pq.read_table(path) +def test_dictionary_array_automatically_read(): + # ARROW-3246 + + # Make a large dictionary, a little over 4MB of data + dict_length = 4000 + dict_values = pa.array([('x' * 1000 + '_{}'.format(i)) + for i in range(dict_length)]) + + num_chunks = 10 + chunk_size = 100 + chunks = [] + for i in range(num_chunks): + indices = np.random.randint(0, dict_length, + size=chunk_size).astype(np.int32) + chunks.append(pa.DictionaryArray.from_arrays(pa.array(indices), + dict_values)) + + table = pa.table([pa.chunked_array(chunks)], names=['f0']) + + bio = pa.BufferOutputStream() + pq.write_table(table, bio) + contents = bio.getvalue() + result = pq.read_table(pa.BufferReader(contents)) + + assert result.equals(table) + + # The only key in the metadata was the Arrow schema key + assert result.schema.metadata is None + + @pytest.mark.pandas def test_multi_dataset_metadata(tempdir): filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]