From fefe7944bfce2cdc59e49551b62fb77b323747cc Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Tue, 4 Jun 2019 17:48:20 -0700 Subject: [PATCH 01/22] First cut impl of reading Parquet FileIterator into ArrowRecordBatch based reader --- .../iceberg/arrow/reader/ArrowReader.java | 199 ++++++++++++++++++ build.gradle | 19 ++ .../iceberg/data/TableScanIterable.java | 3 +- .../org/apache/iceberg/parquet/Parquet.java | 10 +- .../apache/iceberg/parquet/ParquetAvro.java | 6 +- .../iceberg/parquet/ParquetFilters.java | 4 +- .../org/apache/iceberg/parquet/ParquetIO.java | 6 +- .../iceberg/parquet/ParquetIterable.java | 2 +- .../iceberg/parquet/ParquetReadSupport.java | 4 +- .../apache/iceberg/parquet/ParquetReader.java | 32 ++- .../iceberg/parquet/ParquetWriteSupport.java | 4 +- .../apache/iceberg/parquet/ParquetWriter.java | 4 +- settings.gradle | 2 + .../iceberg/spark/source/IcebergSource.java | 8 +- .../apache/iceberg/spark/source/Reader.java | 2 +- 15 files changed, 281 insertions(+), 24 deletions(-) create mode 100644 arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java new file mode 100644 index 000000000000..0bfe927a2200 --- /dev/null +++ b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java @@ -0,0 +1,199 @@ +package org.apache.iceberg.arrow.reader; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.spark.TaskContext; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.arrow.ArrowUtils; +import org.apache.spark.sql.execution.arrow.ArrowWriter; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ArrowColumnVector; +import org.apache.spark.sql.vectorized.ColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.util.TaskCompletionListener; + +public class ArrowReader { + + public static InternalRowOverArrowBatchIterator fromBatchIterator( + Iterator arrowBatchIter, + StructType sparkSchema, + String timeZoneId) { + + // StructType sparkSchema = SparkSchemaUtil.convert(icebergSchema); + + Schema arrowSchema = ArrowUtils.toArrowSchema(sparkSchema, timeZoneId); + BufferAllocator allocator = + ArrowUtils.rootAllocator().newChildAllocator("fromBatchIterator", 0, Long.MAX_VALUE); + + VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator); + + return new InternalRowOverArrowBatchIterator(arrowBatchIter, allocator, root); + } + + public static class InternalRowOverArrowBatchIterator implements Iterator, Closeable { + + private Iterator arrowBatchIter; + private Iterator rowIter; + private BufferAllocator allocator; + private VectorSchemaRoot root; + + InternalRowOverArrowBatchIterator(Iterator arrowBatchIter, + BufferAllocator allocator, + VectorSchemaRoot root) { + + this.arrowBatchIter = arrowBatchIter; + this.allocator = allocator; + this.root = root; + + // if (arrowBatchIter.hasNext()) { + // rowIter = nextBatch(); + // } else { + // 
rowIter = Collections.emptyIterator();
+      // }
+    }
+
+    @Override
+    public boolean hasNext() {
+      if (rowIter != null && rowIter.hasNext()) {
+        return true;
+      }
+      if (arrowBatchIter.hasNext()) {
+        rowIter = nextBatch();
+        return true;
+      } else {
+        root.close();
+        allocator.close();
+        return false;
+      }
+    }
+
+    @Override
+    public InternalRow next() {
+      return rowIter.next();
+    }
+
+    private Iterator<InternalRow> nextBatch() {
+      ArrowRecordBatch arrowRecordBatch = arrowBatchIter.next();
+      VectorLoader vectorLoader = new VectorLoader(root);
+      vectorLoader.load(arrowRecordBatch);
+      arrowRecordBatch.close();
+
+      List<FieldVector> fieldVectors = root.getFieldVectors();
+      ColumnVector[] columns = new ColumnVector[fieldVectors.size()];
+      for (int i = 0; i < fieldVectors.size(); i++) {
+        columns[i] = new ArrowColumnVector(fieldVectors.get(i));
+      }
+
+      ColumnarBatch batch = new ColumnarBatch(columns);
+      batch.setNumRows(root.getRowCount());
+      return batch.rowIterator();
+    }
+
+    @Override
+    public void close() throws IOException {
+      root.close();
+      allocator.close();
+    }
+  }
+
+  public static ArrowRecordBatchIterator toBatchIterator(
+      Iterator<InternalRow> rowIter,
+      StructType sparkSchema, int maxRecordsPerBatch,
+      String timezonId) {
+
+    // StructType sparkSchema = SparkSchemaUtil.convert(icebergSchema);
+    TaskContext context = TaskContext.get();
+
+    Schema arrowSchema = ArrowUtils.toArrowSchema(sparkSchema, timezonId);
+    BufferAllocator allocator = ArrowUtils.rootAllocator().newChildAllocator(
+        "toBatchIterator",
+        0,
+        Long.MAX_VALUE);
+    VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator);
+
+    context.addTaskCompletionListener(new TaskCompletionListener() {
+      @Override
+      public void onTaskCompletion(TaskContext context) {
+        root.close();
+        allocator.close();
+      }
+    });
+
+    return new ArrowRecordBatchIterator(rowIter, root, allocator, maxRecordsPerBatch);
+  }
+
+
+  public static class ArrowRecordBatchIterator implements Iterator<ArrowRecordBatch>, Closeable {
+
+    Iterator<InternalRow> rowIterator;
+    VectorSchemaRoot root;
+    BufferAllocator allocator;
+    int maxRecordsPerBatch;
+    ArrowWriter arrowWriter;
+    VectorUnloader unloader;
+
+    ArrowRecordBatchIterator(Iterator<InternalRow> rowIterator,
+                             VectorSchemaRoot root,
+                             BufferAllocator allocator,
+                             int maxRecordsPerBatch) {
+
+      this.unloader = new VectorUnloader(root);
+      this.arrowWriter = ArrowWriter.create(root);
+      this.rowIterator = rowIterator;
+      this.root = root;
+      this.allocator = allocator;
+      this.maxRecordsPerBatch = maxRecordsPerBatch;
+    }
+
+    @Override
+    public boolean hasNext() {
+
+      if (!rowIterator.hasNext()) {
+
+        root.close();
+        allocator.close();
+        return false;
+      }
+
+      return true;
+    }
+
+    @Override
+    public ArrowRecordBatch next() {
+
+      int rowCount = 0;
+
+      while (rowIterator.hasNext() && (maxRecordsPerBatch <= 0 || rowCount < maxRecordsPerBatch)) {
+        InternalRow row = rowIterator.next();
+        arrowWriter.write(row);
+        rowCount += 1;
+      }
+      arrowWriter.finish();
+      ArrowRecordBatch batch = unloader.getRecordBatch();
+      return batch;
+    }
+
+    @Override
+    public void close() throws IOException {
+      // arrowWriter.finish();
+      root.close();
+      allocator.close();
+    }
+  }
+}
diff --git a/build.gradle b/build.gradle
index 51d438234d69..042a6a5e2b46 100644
--- a/build.gradle
+++ b/build.gradle
@@ -190,7 +190,11 @@ project(':iceberg-data') {
   dependencies {
     compile project(':iceberg-api')
     compile project(':iceberg-core')
+    compileOnly project(':iceberg-spark')
     compileOnly project(':iceberg-parquet')
+    compileOnly("org.apache.spark:spark-hive_$scalaVersion:$sparkVersion") {
+      exclude group: 'org.apache.avro', module: 'avro'
+    }
 
     testCompile("org.apache.hadoop:hadoop-client:$hadoopVersion") {
       exclude group: 'org.apache.avro', module: 'avro'
@@ -249,7 +253,11 @@ project(':iceberg-parquet') {
   dependencies {
     compile project(':iceberg-api')
     compile project(':iceberg-core')
+    compile project(':iceberg-arrow')
 
+    compileOnly("org.apache.spark:spark-hive_$scalaVersion:$sparkVersion") {
+      exclude group: 
'org.apache.avro', module: 'avro' + } compile "org.apache.parquet:parquet-avro:$parquetVersion" compileOnly "org.apache.avro:avro:$avroVersion" @@ -282,6 +290,17 @@ project(':iceberg-spark') { } } +project(':iceberg-arrow') { + dependencies { +// compile project(':iceberg-spark') + compile project(':iceberg-api') + + compileOnly("org.apache.spark:spark-hive_$scalaVersion:$sparkVersion") { + exclude group: 'org.apache.avro', module: 'avro' + } + } +} + project(':iceberg-pig') { dependencies { compile project(':iceberg-api') diff --git a/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java b/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java index 4f749357b7d9..343c6be742b8 100644 --- a/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java +++ b/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java @@ -41,6 +41,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.spark.SparkSchemaUtil; import static com.google.common.collect.Iterables.filter; import static java.util.Collections.emptyIterator; @@ -91,7 +92,7 @@ private CloseableIterable open(FileScanTask task) { case PARQUET: Parquet.ReadBuilder parquet = Parquet.read(input) - .project(projection) + .project(projection, SparkSchemaUtil.convert(projection)) .createReaderFunc(fileSchema -> buildReader(projection, fileSchema)) .split(task.start(), task.length()); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 7ffc818e1de7..839093056b52 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -52,6 +52,7 @@ import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.schema.MessageType; +import org.apache.spark.sql.types.StructType; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT; @@ -258,6 +259,7 @@ public static class ReadBuilder { private Long start = null; private Long length = null; private Schema schema = null; + private StructType sparkSchema = null; private Expression filter = null; private ReadSupport readSupport = null; private Function> readerFunc = null; @@ -284,6 +286,12 @@ public ReadBuilder split(long start, long length) { return this; } + public ReadBuilder project(Schema schema, StructType sparkSchema) { + this.schema = schema; + this.sparkSchema = sparkSchema; + return this; + } + public ReadBuilder project(Schema schema) { this.schema = schema; return this; @@ -354,7 +362,7 @@ public CloseableIterable build() { ParquetReadOptions options = optionsBuilder.build(); return new org.apache.iceberg.parquet.ParquetReader<>( - file, schema, options, readerFunc, filter, reuseContainers, caseSensitive); + file, schema, options, readerFunc, filter, reuseContainers, caseSensitive, sparkSchema); } ParquetReadBuilder builder = new ParquetReadBuilder<>(ParquetIO.file(file)); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java index 4c315c323400..afcfb7828f45 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java @@ -36,8 +36,8 @@ import 
org.apache.iceberg.avro.UUIDConversion; import org.apache.iceberg.types.TypeUtil; -class ParquetAvro { - static Schema parquetAvroSchema(Schema avroSchema) { +public class ParquetAvro { + public static Schema parquetAvroSchema(Schema avroSchema) { return AvroSchemaVisitor.visit(avroSchema, new ParquetDecimalSchemaConverter()); } @@ -173,7 +173,7 @@ public GenericFixed toFixed(BigDecimal value, Schema schema, LogicalType type) { } } - static GenericData DEFAULT_MODEL = new SpecificData() { + public static GenericData DEFAULT_MODEL = new SpecificData() { private final Conversion fixedDecimalConversion = new FixedDecimalConversion(); private final Conversion intDecimalConversion = new IntDecimalConversion(); private final Conversion longDecimalConversion = new LongDecimalConversion(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java index b4a675e04a3d..cbf2d1c8ae31 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java @@ -37,9 +37,9 @@ import static org.apache.iceberg.expressions.ExpressionVisitors.visit; -class ParquetFilters { +public class ParquetFilters { - static FilterCompat.Filter convert(Schema schema, Expression expr, boolean caseSensitive) { + public static FilterCompat.Filter convert(Schema schema, Expression expr, boolean caseSensitive) { FilterPredicate pred = visit(expr, new ConvertFilterToParquet(schema, caseSensitive)); // TODO: handle AlwaysFalse.INSTANCE if (pred != null && pred != AlwaysTrue.INSTANCE) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java index 360a05503ce6..6432483fab31 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java @@ -44,11 +44,11 @@ /** * Methods in this class translate from the IO API to Parquet's IO API. 
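 * Hadoop-backed Iceberg files are unwrapped and their underlying paths handed
 * to Parquet directly (see the instanceof checks below); other files are
 * wrapped in Iceberg's adapter classes.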
*/ -class ParquetIO { +public class ParquetIO { private ParquetIO() { } - static InputFile file(org.apache.iceberg.io.InputFile file) { + public static InputFile file(org.apache.iceberg.io.InputFile file) { // TODO: use reflection to avoid depending on classes from iceberg-hadoop // TODO: use reflection to avoid depending on classes from hadoop if (file instanceof HadoopInputFile) { @@ -62,7 +62,7 @@ static InputFile file(org.apache.iceberg.io.InputFile file) { return new ParquetInputFile(file); } - static OutputFile file(org.apache.iceberg.io.OutputFile file) { + public static OutputFile file(org.apache.iceberg.io.OutputFile file) { if (file instanceof HadoopOutputFile) { HadoopOutputFile hfile = (HadoopOutputFile) file; try { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java index bc4344872430..7d6d72c4f11b 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java @@ -31,7 +31,7 @@ public class ParquetIterable extends CloseableGroup implements CloseableIterable { private final ParquetReader.Builder builder; - ParquetIterable(ParquetReader.Builder builder) { + public ParquetIterable(ParquetReader.Builder builder) { this.builder = builder; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java index 8a0b44c720b4..b0c8a31be283 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java @@ -41,12 +41,12 @@ * * @param Java type produced by this read support instance */ -class ParquetReadSupport extends ReadSupport { +public class ParquetReadSupport extends ReadSupport { private final Schema expectedSchema; private final ReadSupport wrapped; private final boolean callInit; - ParquetReadSupport(Schema expectedSchema, ReadSupport readSupport, boolean callInit) { + public ParquetReadSupport(Schema expectedSchema, ReadSupport readSupport, boolean callInit) { this.expectedSchema = expectedSchema; this.wrapped = readSupport; this.callInit = callInit; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index 653bb490dd61..e00f5d2e6054 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -23,8 +23,10 @@ import java.io.IOException; import java.util.Iterator; import java.util.List; +import java.util.TimeZone; import java.util.function.Function; import org.apache.iceberg.Schema; +import org.apache.iceberg.arrow.reader.ArrowReader; import org.apache.iceberg.exceptions.RuntimeIOException; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; @@ -36,6 +38,8 @@ import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.schema.MessageType; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; import static org.apache.iceberg.parquet.ParquetSchemaUtil.addFallbackIds; import static org.apache.iceberg.parquet.ParquetSchemaUtil.hasIds; @@ -50,12 +54,15 @@ public class ParquetReader extends CloseableGroup implements CloseableIterabl 
private final Expression filter; private final boolean reuseContainers; private final boolean caseSensitive; + private final StructType sparkSchema; public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options, Function> readerFunc, - Expression filter, boolean reuseContainers, boolean caseSensitive) { + Expression filter, boolean reuseContainers, boolean caseSensitive, + StructType sparkSchema) { this.input = input; this.expectedSchema = expectedSchema; + this.sparkSchema = sparkSchema; this.options = options; this.readerFunc = readerFunc; // replace alwaysTrue with null to avoid extra work evaluating a trivial filter @@ -185,9 +192,30 @@ private ReadConf init() { @Override public Iterator iterator() { + // create iterator over file FileIterator iter = new FileIterator<>(init()); addCloseable(iter); - return iter; + + // return iter; + return arrowBatchAsInternalRow((Iterator) iter); + } + + private Iterator arrowBatchAsInternalRow(Iterator iter) { + // Convert InterRow iterator to ArrowRecordBatch Iterator + Iterator rowIterator = iter; + ArrowReader.ArrowRecordBatchIterator arrowBatchIter = ArrowReader.toBatchIterator(rowIterator, + sparkSchema, 1000, + TimeZone.getDefault().getID()); + addCloseable(arrowBatchIter); + + // Overlay InternalRow iterator over ArrowRecordbatches + ArrowReader.InternalRowOverArrowBatchIterator + rowOverbatchIter = ArrowReader.fromBatchIterator(arrowBatchIter, + sparkSchema, TimeZone.getDefault().getID()); + + addCloseable(rowOverbatchIter); + + return (Iterator)rowOverbatchIter; } private static class FileIterator implements Iterator, Closeable { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java index 633f9f80cecc..d097c67d9ab6 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java @@ -26,12 +26,12 @@ import org.apache.parquet.io.api.RecordConsumer; import org.apache.parquet.schema.MessageType; -class ParquetWriteSupport extends WriteSupport { +public class ParquetWriteSupport extends WriteSupport { private final MessageType type; private final Map keyValueMetadata; private final WriteSupport wrapped; - ParquetWriteSupport(MessageType type, Map keyValueMetadata, WriteSupport writeSupport) { + public ParquetWriteSupport(MessageType type, Map keyValueMetadata, WriteSupport writeSupport) { this.type = type; this.keyValueMetadata = keyValueMetadata; this.wrapped = writeSupport; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java index c7bd6e216f7b..f2c7a3ac86b2 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java @@ -46,7 +46,7 @@ import static java.lang.Math.min; import static org.apache.iceberg.parquet.ParquetSchemaUtil.convert; -class ParquetWriter implements FileAppender, Closeable { +public class ParquetWriter implements FileAppender, Closeable { private static final DynConstructors.Ctor pageStoreCtor = DynConstructors .builder(PageWriteStore.class) .hiddenImpl("org.apache.parquet.hadoop.ColumnChunkPageWriteStore", @@ -76,7 +76,7 @@ class ParquetWriter implements FileAppender, Closeable { private long nextCheckRecordCount = 10; @SuppressWarnings("unchecked") - ParquetWriter(Configuration conf, OutputFile output, 
Schema schema, long rowGroupSize, + public ParquetWriter(Configuration conf, OutputFile output, Schema schema, long rowGroupSize, Map metadata, Function> createWriterFunc, CompressionCodecName codec, diff --git a/settings.gradle b/settings.gradle index e4a2fe953efd..988c1579a079 100644 --- a/settings.gradle +++ b/settings.gradle @@ -25,6 +25,7 @@ include 'data' include 'orc' include 'parquet' include 'spark' +include 'arrow' include 'pig' include 'runtime' include 'hive' @@ -37,6 +38,7 @@ project(':data').name = 'iceberg-data' project(':orc').name = 'iceberg-orc' project(':parquet').name = 'iceberg-parquet' project(':spark').name = 'iceberg-spark' +project(':arrow').name = 'iceberg-arrow' project(':pig').name = 'iceberg-pig' project(':runtime').name = 'iceberg-runtime' project(':hive').name = 'iceberg-hive' diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 097184abd678..995e9265841f 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -106,21 +106,21 @@ protected Table findTable(DataSourceOptions options, Configuration conf) { return tables.load(location.get()); } - private SparkSession lazySparkSession() { + protected SparkSession lazySparkSession() { if (lazySpark == null) { this.lazySpark = SparkSession.builder().getOrCreate(); } return lazySpark; } - private Configuration lazyBaseConf() { + protected Configuration lazyBaseConf() { if (lazyConf == null) { this.lazyConf = lazySparkSession().sparkContext().hadoopConfiguration(); } return lazyConf; } - private Table getTableAndResolveHadoopConfiguration( + protected Table getTableAndResolveHadoopConfiguration( DataSourceOptions options, Configuration conf) { // Overwrite configurations from the Spark Context with configurations from the options. 
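    // Only options whose keys start with "iceberg.hadoop" are merged in
    // (see mergeIcebergHadoopConfs below).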
mergeIcebergHadoopConfs(conf, options.asMap()); @@ -132,7 +132,7 @@ private Table getTableAndResolveHadoopConfiguration( return table; } - private static void mergeIcebergHadoopConfs( + protected static void mergeIcebergHadoopConfs( Configuration baseConf, Map options) { options.keySet().stream() .filter(key -> key.startsWith("iceberg.hadoop")) diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java index 63a33f958000..ddc43e251d48 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java @@ -464,7 +464,7 @@ private CloseableIterable newParquetIterable(InputFile location, FileScanTask task, Schema readSchema) { return Parquet.read(location) - .project(readSchema) + .project(readSchema, SparkSchemaUtil.convert(readSchema)) .split(task.start(), task.length()) .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema)) .filter(task.residual()) From 5b907807e8847d5e3532f35baa4e1fc639c4c05a Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Wed, 5 Jun 2019 15:38:52 -0700 Subject: [PATCH 02/22] made num records per arrow batch configurable --- .../org/apache/iceberg/parquet/Parquet.java | 9 ++++++++- .../apache/iceberg/parquet/ParquetReader.java | 6 ++++-- .../iceberg/spark/source/IcebergSource.java | 9 ++++++++- .../apache/iceberg/spark/source/Reader.java | 18 +++++++++++++----- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 839093056b52..d1c89755d5ed 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -268,6 +268,7 @@ public static class ReadBuilder { private Map properties = Maps.newHashMap(); private boolean callInit = false; private boolean reuseContainers = false; + private int maxRecordsPerBatch = 1000; private ReadBuilder(InputFile file) { this.file = file; @@ -341,6 +342,12 @@ public ReadBuilder reuseContainers() { return this; } + public ReadBuilder recordsPerBatch(int numRowsPerBatch) { + + this.maxRecordsPerBatch = numRowsPerBatch; + return this; + } + @SuppressWarnings("unchecked") public CloseableIterable build() { if (readerFunc != null) { @@ -362,7 +369,7 @@ public CloseableIterable build() { ParquetReadOptions options = optionsBuilder.build(); return new org.apache.iceberg.parquet.ParquetReader<>( - file, schema, options, readerFunc, filter, reuseContainers, caseSensitive, sparkSchema); + file, schema, options, readerFunc, filter, reuseContainers, caseSensitive, sparkSchema, maxRecordsPerBatch); } ParquetReadBuilder builder = new ParquetReadBuilder<>(ParquetIO.file(file)); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index e00f5d2e6054..02302208e92c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -55,11 +55,12 @@ public class ParquetReader extends CloseableGroup implements CloseableIterabl private final boolean reuseContainers; private final boolean caseSensitive; private final StructType sparkSchema; + private final int maxRecordsPerBatch; public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options, 
Function> readerFunc, Expression filter, boolean reuseContainers, boolean caseSensitive, - StructType sparkSchema) { + StructType sparkSchema, int maxRecordsPerBatch) { this.input = input; this.expectedSchema = expectedSchema; this.sparkSchema = sparkSchema; @@ -69,6 +70,7 @@ public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions this.filter = filter == Expressions.alwaysTrue() ? null : filter; this.reuseContainers = reuseContainers; this.caseSensitive = caseSensitive; + this.maxRecordsPerBatch = maxRecordsPerBatch; } private static class ReadConf { @@ -204,7 +206,7 @@ private Iterator arrowBatchAsInternalRow(Iterator iter) { // Convert InterRow iterator to ArrowRecordBatch Iterator Iterator rowIterator = iter; ArrowReader.ArrowRecordBatchIterator arrowBatchIter = ArrowReader.toBatchIterator(rowIterator, - sparkSchema, 1000, + sparkSchema, maxRecordsPerBatch, TimeZone.getDefault().getID()); addCloseable(arrowBatchIter); diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 995e9265841f..7bb2e6a03b8f 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -49,6 +49,7 @@ public class IcebergSource implements DataSourceV2, ReadSupport, WriteSupport, D private SparkSession lazySpark = null; private Configuration lazyConf = null; + private static final int DEFAULT_NUM_RECORDS_PER_BATCH = 1000; @Override public String shortName() { @@ -61,7 +62,13 @@ public DataSourceReader createReader(DataSourceOptions options) { Table table = getTableAndResolveHadoopConfiguration(options, conf); String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive", "true"); - return new Reader(table, Boolean.valueOf(caseSensitive)); + Optional numRecordsPerBatchOpt = options.get("iceberg.read.numrecordsperbatch"); + int numRecordsPerBatch = DEFAULT_NUM_RECORDS_PER_BATCH; + if(numRecordsPerBatchOpt.isPresent()) { + numRecordsPerBatch = Integer.parseInt(numRecordsPerBatchOpt.get()); + } + + return new Reader(table, Boolean.valueOf(caseSensitive), numRecordsPerBatch); } @Override diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java index ddc43e251d48..acd60c7ce681 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java @@ -102,6 +102,7 @@ class Reader implements DataSourceReader, SupportsPushDownFilters, SupportsPushD private final FileIO fileIo; private final EncryptionManager encryptionManager; private final boolean caseSensitive; + private final int numRecordsPerBatch; private StructType requestedSchema = null; private List filterExpressions = null; private Filter[] pushedFilters = NO_FILTERS; @@ -111,12 +112,13 @@ class Reader implements DataSourceReader, SupportsPushDownFilters, SupportsPushD private StructType type = null; // cached because Spark accesses it multiple times private List tasks = null; // lazy cache of tasks - Reader(Table table, boolean caseSensitive) { + Reader(Table table, boolean caseSensitive, int numRecordsPerBatch) { this.table = table; this.schema = table.schema(); this.fileIo = table.io(); this.encryptionManager = table.encryption(); this.caseSensitive = caseSensitive; + this.numRecordsPerBatch = numRecordsPerBatch; } private Schema lazySchema() { @@ -150,7 
+152,8 @@ public List> planInputPartitions() { List> readTasks = Lists.newArrayList(); for (CombinedScanTask task : tasks()) { readTasks.add( - new ReadTask(task, tableSchemaString, expectedSchemaString, fileIo, encryptionManager, caseSensitive)); + new ReadTask(task, tableSchemaString, expectedSchemaString, fileIo, encryptionManager, + caseSensitive, numRecordsPerBatch)); } return readTasks; @@ -249,25 +252,27 @@ private static class ReadTask implements InputPartition, Serializab private final FileIO fileIo; private final EncryptionManager encryptionManager; private final boolean caseSensitive; + private final int numRecordsPerBatch; private transient Schema tableSchema = null; private transient Schema expectedSchema = null; private ReadTask( CombinedScanTask task, String tableSchemaString, String expectedSchemaString, FileIO fileIo, - EncryptionManager encryptionManager, boolean caseSensitive) { + EncryptionManager encryptionManager, boolean caseSensitive, int numRecordsPerBatch) { this.task = task; this.tableSchemaString = tableSchemaString; this.expectedSchemaString = expectedSchemaString; this.fileIo = fileIo; this.encryptionManager = encryptionManager; this.caseSensitive = caseSensitive; + this.numRecordsPerBatch = numRecordsPerBatch; } @Override public InputPartitionReader createPartitionReader() { return new TaskDataReader(task, lazyTableSchema(), lazyExpectedSchema(), fileIo, - encryptionManager, caseSensitive); + encryptionManager, caseSensitive, numRecordsPerBatch); } private Schema lazyTableSchema() { @@ -297,13 +302,14 @@ private static class TaskDataReader implements InputPartitionReader private final FileIO fileIo; private final Map inputFiles; private final boolean caseSensitive; + private final int numRecordsPerBatch; private Iterator currentIterator = null; private Closeable currentCloseable = null; private InternalRow current = null; public TaskDataReader(CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo, - EncryptionManager encryptionManager, boolean caseSensitive) { + EncryptionManager encryptionManager, boolean caseSensitive, int numRecordsPerBatch) { this.fileIo = fileIo; this.tasks = task.files().iterator(); this.tableSchema = tableSchema; @@ -319,6 +325,7 @@ public TaskDataReader(CombinedScanTask task, Schema tableSchema, Schema expected // open last because the schemas and fileIo must be set this.currentIterator = open(tasks.next()); this.caseSensitive = caseSensitive; + this.numRecordsPerBatch = numRecordsPerBatch; } @Override @@ -469,6 +476,7 @@ private CloseableIterable newParquetIterable(InputFile location, .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema)) .filter(task.residual()) .caseSensitive(caseSensitive) + .recordsPerBatch(numRecordsPerBatch) .build(); } From f58e545a6da746c448aa3b066e57b96c6c7c8f19 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Wed, 5 Jun 2019 16:04:48 -0700 Subject: [PATCH 03/22] addressed comments --- .../iceberg/arrow/reader/ArrowReader.java | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java index 0bfe927a2200..40b0ec829977 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java @@ -5,6 +5,7 @@ import java.util.Collections; import java.util.Iterator; import java.util.List; +import 
javax.annotation.concurrent.NotThreadSafe; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.VectorLoader; @@ -29,8 +30,7 @@ public static InternalRowOverArrowBatchIterator fromBatchIterator( StructType sparkSchema, String timeZoneId) { - // StructType sparkSchema = SparkSchemaUtil.convert(icebergSchema); - + // timeZoneId required for TimestampType in StructType Schema arrowSchema = ArrowUtils.toArrowSchema(sparkSchema, timeZoneId); BufferAllocator allocator = ArrowUtils.rootAllocator().newChildAllocator("fromBatchIterator", 0, Long.MAX_VALUE); @@ -40,12 +40,14 @@ public static InternalRowOverArrowBatchIterator fromBatchIterator( return new InternalRowOverArrowBatchIterator(arrowBatchIter, allocator, root); } + @NotThreadSafe public static class InternalRowOverArrowBatchIterator implements Iterator, Closeable { - private Iterator arrowBatchIter; + private final Iterator arrowBatchIter; + private final BufferAllocator allocator; + private final VectorSchemaRoot root; + private Iterator rowIter; - private BufferAllocator allocator; - private VectorSchemaRoot root; InternalRowOverArrowBatchIterator(Iterator arrowBatchIter, BufferAllocator allocator, @@ -55,11 +57,6 @@ public static class InternalRowOverArrowBatchIterator implements Iterator, Closeable { - Iterator rowIterator; - VectorSchemaRoot root; - BufferAllocator allocator; - int maxRecordsPerBatch; - ArrowWriter arrowWriter; - VectorUnloader unloader; + final Iterator rowIterator; + final VectorSchemaRoot root; + final BufferAllocator allocator; + final int maxRecordsPerBatch; + final ArrowWriter arrowWriter; + final VectorUnloader unloader; ArrowRecordBatchIterator(Iterator rowIterator, VectorSchemaRoot root, @@ -166,8 +166,11 @@ public boolean hasNext() { if (!rowIterator.hasNext()) { - root.close(); - allocator.close(); + try { + close(); + } catch (IOException ioe) { + throw new RuntimeException("Encountered an error while closing iterator. "+ioe.getMessage(), ioe); + } return false; } From 47e436a65870c30b66296291eaa40081878e7431 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Thu, 6 Jun 2019 10:20:04 -0700 Subject: [PATCH 04/22] Added docs for public methods and ArrowReader class --- .../iceberg/arrow/reader/ArrowReader.java | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java index 40b0ec829977..1896f27f6298 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java @@ -23,8 +23,24 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.util.TaskCompletionListener; +/*** + * This is a helper class for Arrow reading. It provides two main converter methods. + * These converter methods are currently used to first convert a Parquet FileIterator + * into Iterator. Second, the ArrowRecordBatch is made + * into Columnar Batch and exposed as an Iterator. The second step is to + * done to conform to Spark's current interface. When Spark adds Arrow support we will + * take the second iterator out and just return the first one. + */ public class ArrowReader { + /*** + * Accepts an iterator over ArrowRecordBatches and copies into ColumnarBatches. + * Since Spark uses Iterator over InternalRow we return this over ColumarBatch. 
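+   * <p>A minimal usage sketch (hypothetical caller; the row iterator, schema
+   * and zone id are assumed to come from the surrounding Parquet read):
+   * <pre>
+   *   Iterator&lt;ArrowRecordBatch&gt; batches =
+   *       ArrowReader.toBatchIterator(rows, sparkSchema, 1000, "UTC");
+   *   Iterator&lt;InternalRow&gt; rowIter =
+   *       ArrowReader.fromBatchIterator(batches, sparkSchema, "UTC");
+   * </pre>
+   * The backing VectorSchemaRoot and allocator are released once the source
+   * batch iterator is exhausted.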
+   * @param arrowBatchIter source iterator of ArrowRecordBatches
+   * @param sparkSchema Spark schema of the rows
+   * @param timeZoneId time zone id used when converting timestamp columns
+   * @return an iterator of InternalRow backed by the copied ColumnarBatches
+   */
   public static InternalRowOverArrowBatchIterator fromBatchIterator(
       Iterator<ArrowRecordBatch> arrowBatchIter,
       StructType sparkSchema,
@@ -112,6 +128,17 @@ public void close() throws IOException {
   }
 
+  /**
+   * Accepts an Iterator over InternalRow coming in from ParquetReader's FileIterator
+   * and returns an Iterator over ArrowRecordBatches built from it. Each next() call
+   * collects up to maxRecordsPerBatch rows from the input iterator and returns them
+   * as a single Arrow batch.
+   * @param rowIter source iterator of InternalRow
+   * @param sparkSchema Spark schema of the rows
+   * @param maxRecordsPerBatch max rows per Arrow batch; a value &lt;= 0 removes the cap
+   * @param timezonId time zone id used when converting timestamp columns
+   * @return an iterator of ArrowRecordBatch
+   */
   public static ArrowRecordBatchIterator toBatchIterator(
       Iterator<InternalRow> rowIter,
       StructType sparkSchema, int maxRecordsPerBatch,

From 19c7cb90f9997abfa78e058884796bc240d98afb Mon Sep 17 00:00:00 2001
From: Gautam Kowshik
Date: Thu, 6 Jun 2019 14:44:49 -0700
Subject: [PATCH 05/22] Fixed javadoc

---
 .../java/org/apache/iceberg/arrow/reader/ArrowReader.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java
index 1896f27f6298..d4a1ea6f7d50 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java
@@ -26,8 +26,8 @@
 /***
  * This is a helper class for Arrow reading. It provides two main converter methods.
  * These converter methods are currently used to first convert a Parquet FileIterator
- * into Iterator<ArrowRecordBatch>. Second, the ArrowRecordBatch is made
- * into Columnar Batch and exposed as an Iterator<InternalRow>. The second step is to
+ * into an Iterator over ArrowRecordBatches. Second, each ArrowRecordBatch is made
+ * into a ColumnarBatch and exposed as an Iterator over InternalRow. The second step is
  * done to conform to Spark's current interface. When Spark adds Arrow support we will
  * take the second iterator out and just return the first one.
  */

From 15c7cd800c936e7450798d8d6c2b0a42a2fea8a3 Mon Sep 17 00:00:00 2001
From: Gautam Kowshik
Date: Fri, 14 Jun 2019 00:03:07 -0700
Subject: [PATCH 06/22] WIP first stab at reading into Arrow and returning as
 InternalRow iterator

---
 .../iceberg/arrow/reader/ArrowReader.java     | 27 ++++++++++++++-----
 .../org/apache/iceberg/BaseTableScan.java     |  2 +-
 .../apache/iceberg/parquet/ParquetReader.java | 14 ++++++++--
 .../iceberg/spark/source/IcebergSource.java   |  8 +++++-
 .../apache/iceberg/spark/source/Reader.java   | 14 +++++++++-
 5 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java
index d4a1ea6f7d50..f7e1f6d79e8c 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/reader/ArrowReader.java
@@ -22,6 +22,8 @@
 import org.apache.spark.sql.vectorized.ColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.apache.spark.util.TaskCompletionListener;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /***
  * This is a helper class for Arrow reading. It provides two main converter methods. 
@@ -33,6 +35,8 @@ */ public class ArrowReader { + private static final Logger LOG = LoggerFactory.getLogger(ArrowReader.class); + /*** * Accepts an iterator over ArrowRecordBatches and copies into ColumnarBatches. * Since Spark uses Iterator over InternalRow we return this over ColumarBatch. @@ -102,6 +106,8 @@ public InternalRow next() { private Iterator nextBatch() { ArrowRecordBatch arrowRecordBatch = arrowBatchIter.next(); + long start = System.currentTimeMillis(); + root.setRowCount(0); VectorLoader vectorLoader = new VectorLoader(root); vectorLoader.load(arrowRecordBatch); arrowRecordBatch.close(); @@ -115,6 +121,8 @@ private Iterator nextBatch() { ColumnarBatch batch = new ColumnarBatch(columns); batch.setNumRows(root.getRowCount()); + LOG.info("[InternalRowOverArrowIterator] => Created Columnar Batch with "+root.getRowCount()+ " rows" + + ". Took " + (System.currentTimeMillis() - start) + " milliseconds."); return batch.rowIterator(); } @@ -154,13 +162,15 @@ public static ArrowRecordBatchIterator toBatchIterator( Long.MAX_VALUE); VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator); - context.addTaskCompletionListener(new TaskCompletionListener() { - @Override - public void onTaskCompletion(TaskContext context) { - root.close(); - allocator.close(); - } - }); + if (context!=null) { + context.addTaskCompletionListener(new TaskCompletionListener() { + @Override + public void onTaskCompletion(TaskContext context) { + root.close(); + allocator.close(); + } + }); + } return new ArrowRecordBatchIterator(rowIter, root, allocator, maxRecordsPerBatch); } @@ -209,12 +219,15 @@ public ArrowRecordBatch next() { int rowCount = 0; + long start = System.currentTimeMillis(); while (rowIterator.hasNext() && (maxRecordsPerBatch <= 0 || rowCount < maxRecordsPerBatch)) { InternalRow row = rowIterator.next(); arrowWriter.write(row); rowCount += 1; } arrowWriter.finish(); + LOG.info("[ArrowRecordBatchIterator] => Created batch with "+rowCount+ " rows. 
" + + "Took "+(System.currentTimeMillis() - start) + " milliseconds."); ArrowRecordBatch batch = unloader.getRecordBatch(); return batch; } diff --git a/core/src/main/java/org/apache/iceberg/BaseTableScan.java b/core/src/main/java/org/apache/iceberg/BaseTableScan.java index fadde6330dae..30e403fe8176 100644 --- a/core/src/main/java/org/apache/iceberg/BaseTableScan.java +++ b/core/src/main/java/org/apache/iceberg/BaseTableScan.java @@ -186,7 +186,7 @@ public CloseableIterable planFiles() { ); }); - if (PLAN_SCANS_WITH_WORKER_POOL && snapshot.manifests().size() > 1) { + if (PLAN_SCANS_WITH_WORKER_POOL && snapshot.manifests().size() > 30) { return new ParallelIterable<>(readers, ThreadPools.getWorkerPool()); } else { return CloseableIterable.concat(readers); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index 02302208e92c..4817896d7f72 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -40,6 +40,8 @@ import org.apache.parquet.schema.MessageType; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import static org.apache.iceberg.parquet.ParquetSchemaUtil.addFallbackIds; import static org.apache.iceberg.parquet.ParquetSchemaUtil.hasIds; @@ -56,6 +58,7 @@ public class ParquetReader extends CloseableGroup implements CloseableIterabl private final boolean caseSensitive; private final StructType sparkSchema; private final int maxRecordsPerBatch; + private static final Logger LOG = LoggerFactory.getLogger(ParquetReader.class); public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options, Function> readerFunc, @@ -198,8 +201,15 @@ public Iterator iterator() { FileIterator iter = new FileIterator<>(init()); addCloseable(iter); - // return iter; - return arrowBatchAsInternalRow((Iterator) iter); + if(maxRecordsPerBatch == 0) { + LOG.info("[ParquetReader] => Return regular iterator. No batching."); + System.out.println("[ParquetReader] => Return regular iterator. 
No batching."); + return iter; + } else { + LOG.info("[ParquetReader] => Read into Arrow batches of " + maxRecordsPerBatch + " rows."); + // System.out.println("[ParquetReader] => Read into Arrow batches of " + maxRecordsPerBatch + " rows."); + return arrowBatchAsInternalRow((Iterator) iter); + } } private Iterator arrowBatchAsInternalRow(Iterator iter) { diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 7bb2e6a03b8f..ed1372ce7f53 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -39,17 +39,22 @@ import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.WriteSupport; import org.apache.spark.sql.sources.v2.reader.DataSourceReader; +import org.apache.spark.sql.sources.v2.reader.SupportsScanColumnarBatch; import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -public class IcebergSource implements DataSourceV2, ReadSupport, WriteSupport, DataSourceRegister { +public class IcebergSource implements DataSourceV2, ReadSupport, WriteSupport, + DataSourceRegister { private SparkSession lazySpark = null; private Configuration lazyConf = null; private static final int DEFAULT_NUM_RECORDS_PER_BATCH = 1000; + private static final Logger LOG = LoggerFactory.getLogger(IcebergSource.class); @Override public String shortName() { @@ -67,6 +72,7 @@ public DataSourceReader createReader(DataSourceOptions options) { if(numRecordsPerBatchOpt.isPresent()) { numRecordsPerBatch = Integer.parseInt(numRecordsPerBatchOpt.get()); } + LOG.info("[IcebergSource] => Reading numRecordsPerBatch = "+numRecordsPerBatch); return new Reader(table, Boolean.valueOf(caseSensitive), numRecordsPerBatch); } diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java index acd60c7ce681..ab8ad0aafe77 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java @@ -75,6 +75,7 @@ import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters; import org.apache.spark.sql.sources.v2.reader.SupportsPushDownRequiredColumns; import org.apache.spark.sql.sources.v2.reader.SupportsReportStatistics; +import org.apache.spark.sql.sources.v2.reader.SupportsScanColumnarBatch; import org.apache.spark.sql.types.BinaryType; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; @@ -82,6 +83,7 @@ import org.apache.spark.sql.types.StringType; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.unsafe.types.UTF8String; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,7 +94,10 @@ import static scala.collection.JavaConverters.asScalaBufferConverter; import static scala.collection.JavaConverters.seqAsJavaListConverter; -class Reader implements DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns, +class Reader implements DataSourceReader, + SupportsScanColumnarBatch, + 
SupportsPushDownFilters, + SupportsPushDownRequiredColumns, SupportsReportStatistics { private static final Logger LOG = LoggerFactory.getLogger(Reader.class); @@ -144,6 +149,11 @@ public StructType readSchema() { return lazyType(); } + @Override + public List> planBatchInputPartitions() { + + } + @Override public List> planInputPartitions() { String tableSchemaString = SchemaParser.toJson(table.schema()); @@ -433,8 +443,10 @@ private Iterator open(FileScanTask task, Schema readSchema) { InputFile location = inputFiles.get(task.file().path().toString()); Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); CloseableIterable iter; + LOG.info("[Reader] File format "+task.file().format()); switch (task.file().format()) { case PARQUET: + LOG.info("[Reader] Returning Parquet Iterable .."); iter = newParquetIterable(location, task, readSchema); break; From 4a9efd61efe3e2921559a55ad86ade700394a681 Mon Sep 17 00:00:00 2001 From: fbocse Date: Mon, 4 Mar 2019 22:57:15 +0200 Subject: [PATCH 07/22] Add publish to snapshot repository by replacing version to `1.0-adobe-2.0-SNAPSHOT` (snapshot prefix is required by snapshot repo) --- build.gradle | 13 ++++++++++++- gradle.properties | 4 ++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index fd0e1e374048..667081947242 100644 --- a/build.gradle +++ b/build.gradle @@ -47,7 +47,8 @@ if (JavaVersion.current() != JavaVersion.VERSION_1_8) { allprojects { group = "org.apache.iceberg" apply plugin: 'com.palantir.baseline-idea' - version = gitVersion() + /* TODO - this assumes that the upstream apache version is 1.0 so we need to be consistent w/ upstream changes */ + version = "1.0-adobe-2.0-SNAPSHOT" } apply plugin: 'nebula-aggregate-javadocs' @@ -108,6 +109,16 @@ subprojects { from components.java } } + repositories { + maven { + name 'Experience platform snapshot artifactory' + url 'https://artifactory.corp.adobe.com/artifactory/maven-experienceplatform-snapshot/' + credentials { + username = "${artifactory_user_p}" == "" ? System.getenv("ARTIFACTORY_USER") : "${artifactory_user_p}" + password = "${artifactory_key_p}" == "" ? 
System.getenv("ARTIFACTORY_API_TOKEN") : "${artifactory_key_p}" + } + } + } } } diff --git a/gradle.properties b/gradle.properties index f2ff982d9caf..90a909a081f7 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,2 +1,6 @@ jmhOutputPath=build/reports/jmh/human-readable-output.txt jmhIncludeRegex=.* + +artifactory_contextUrl=https://artifactory.corp.adobe.com/artifactory +artifactory_user_p= +artifactory_key_p= From 5a453204242d5480f60204c455bf20dfd7a73ac3 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Wed, 17 Jul 2019 09:46:41 -0700 Subject: [PATCH 08/22] Adding arrow schema conversion utility --- .../apache/iceberg/arrow/ArrowSchemaUtil.java | 148 ++++++++++++++++++ .../iceberg/arrow/ArrowSchemaUtilTest.java | 141 +++++++++++++++++ 2 files changed, 289 insertions(+) create mode 100644 core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java create mode 100644 core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java diff --git a/core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java b/core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java new file mode 100644 index 000000000000..be312f6f06da --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.arrow; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import java.util.List; +import java.util.Map; +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + + +public class ArrowSchemaUtil { + static final String ORIGINAL_TYPE = "originalType"; + static final String MAP_TYPE = "mapType"; + static final String MAP_KEY = "key"; + static final String MAP_VALUE = "value"; + + private ArrowSchemaUtil() { } + + /** + * Convert Iceberg schema to Arrow Schema. 
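+   * Nullability follows the Iceberg field's optional flag, and an Iceberg map
+   * is emitted as a list of key/value structs tagged through the
+   * "originalType" field metadata (see the MAP case below).
+   * <p>For example, {@code convert(new Schema(required(1, "id", Types.LongType.get())))}
+   * (hypothetical field id and name) yields one non-nullable 64-bit integer field.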
+ * + * @param schema iceberg schema + * @return arrow schema + */ + public static Schema convert(final org.apache.iceberg.Schema schema) { + final ImmutableList.Builder fields = ImmutableList.builder(); + + for (NestedField f : schema.columns()) { + fields.add(convert(f)); + } + + return new Schema(fields.build()); + } + + private static Field convert(final NestedField field) { + final ArrowType arrowType; + + final List children = Lists.newArrayList(); + Map metadata = null; + + switch (field.type().typeId()) { + case BINARY: + arrowType = ArrowType.Binary.INSTANCE; + break; + case FIXED: + arrowType = new ArrowType.FixedSizeBinary(((Types.FixedType) field.type()).length()); + break; + case BOOLEAN: + arrowType = ArrowType.Bool.INSTANCE; + break; + case INTEGER: + arrowType = new ArrowType.Int(Integer.SIZE, true); + break; + case LONG: + arrowType = new ArrowType.Int(Long.SIZE, true); + break; + case FLOAT: + arrowType = new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE); + break; + case DOUBLE: + arrowType = new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE); + break; + case DECIMAL: + final Types.DecimalType decimalType = (Types.DecimalType) field.type(); + arrowType = new ArrowType.Decimal(decimalType.precision(), decimalType.scale()); + break; + case STRING: + arrowType = ArrowType.Utf8.INSTANCE; + break; + case TIME: + arrowType = new ArrowType.Time(TimeUnit.MICROSECOND, Long.SIZE); + break; + case TIMESTAMP: + arrowType = new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"); + break; + case DATE: + arrowType = new ArrowType.Date(DateUnit.DAY); + break; + case STRUCT: + final StructType struct = field.type().asStructType(); + arrowType = ArrowType.Struct.INSTANCE; + + for (NestedField nested : struct.fields()) { + children.add(convert(nested)); + } + break; + case LIST: + final ListType listType = field.type().asListType(); + arrowType = ArrowType.List.INSTANCE; + + for (NestedField nested : listType.fields()) { + children.add(convert(nested)); + } + break; + case MAP: + //Maps are represented as List> + metadata = ImmutableMap.of(ORIGINAL_TYPE, MAP_TYPE); + final MapType mapType = field.type().asMapType(); + arrowType = ArrowType.List.INSTANCE; + + final List entryFields = Lists.newArrayList( + convert(required(0, MAP_KEY, mapType.keyType())), + convert(optional(0, MAP_VALUE, mapType.valueType())) + ); + + final Field entry = new Field("", + new FieldType(true, new ArrowType.Struct(), null), entryFields); + children.add(entry); + break; + default: throw new UnsupportedOperationException("Unsupported field type: " + field); + } + + return new Field(field.name(), new FieldType(field.isOptional(), arrowType, null, metadata), children); + } +} diff --git a/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java new file mode 100644 index 000000000000..0ac5cc8e7323 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.arrow; + + +import java.util.List; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.BooleanType; +import org.apache.iceberg.types.Types.DateType; +import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.LongType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.types.Types.TimestampType; +import org.apache.spark.sql.execution.arrow.ArrowUtils; +import org.apache.spark.sql.execution.arrow.ArrowWriter; +import org.apache.spark.sql.execution.arrow.BooleanWriter; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ArrowColumnVector; +import org.apache.spark.sql.vectorized.ColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.junit.Test; + +import static org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID.Bool; +import static org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID.Date; +import static org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID.FloatingPoint; +import static org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID.Int; +import static org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID.List; +import static org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID.Timestamp; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + + +public class ArrowSchemaUtilTest { + + @Test + public void convertPrimitive() { + Schema iceberg = new Schema( + optional(0, "i", Types.IntegerType.get()), + optional(1, "b", BooleanType.get()), + required(2, "d", DoubleType.get()), + required(3, "s", StringType.get()), + optional(4, "d2", DateType.get()), + optional(5, "ts", TimestampType.withoutZone()) + ); + + org.apache.arrow.vector.types.pojo.Schema arrow = ArrowSchemaUtil.convert(iceberg); + + System.out.println(iceberg); + System.out.println(arrow); + + validate(iceberg, arrow); + } + + @Test + public void convertComplex() { + Schema iceberg = new Schema( + optional(0, "m", MapType.ofOptional( + 1, 2, StringType.get(), + LongType.get()) + ), + required(3, "m2", MapType.ofOptional( + 4, 5, StringType.get(), + ListType.ofOptional(6, TimestampType.withoutZone())) + ) + ); + + org.apache.arrow.vector.types.pojo.Schema arrow = 
ArrowSchemaUtil.convert(iceberg);
+
+    System.out.println(iceberg);
+    System.out.println(arrow);
+
+    assertEquals(iceberg.columns().size(), arrow.getFields().size());
+  }
+
+  private void validate(Schema iceberg, org.apache.arrow.vector.types.pojo.Schema arrow) {
+    assertEquals(iceberg.columns().size(), arrow.getFields().size());
+
+    for (Types.NestedField nf : iceberg.columns()) {
+      Field field = arrow.findField(nf.name());
+      assertNotNull("Missing field: " + nf, field);
+
+      validate(nf.type(), field.getType());
+    }
+  }
+
+  private void validate(Type iceberg, ArrowType arrow) {
+    switch (iceberg.typeId()) {
+      case BOOLEAN: assertEquals(Bool, arrow.getTypeID());
+        break;
+      case INTEGER: assertEquals(Int, arrow.getTypeID());
+        break;
+      case LONG: assertEquals(Int, arrow.getTypeID());
+        break;
+      case DOUBLE: assertEquals(FloatingPoint, arrow.getTypeID());
+        break;
+      case STRING: assertEquals(ArrowType.Utf8.INSTANCE.getTypeID(), arrow.getTypeID());
+        break;
+      case DATE: assertEquals(Date, arrow.getTypeID());
+        break;
+      case TIMESTAMP: assertEquals(Timestamp, arrow.getTypeID());
+        break;
+      case MAP: assertEquals(List, arrow.getTypeID());
+        break;
+      default: throw new UnsupportedOperationException("Check not implemented for type: " + iceberg);
+    }
+  }
+}

From bc19e0b6feafa7f7e799cca1a14a828458ea80eb Mon Sep 17 00:00:00 2001
From: Gautam Kowshik
Date: Wed, 17 Jul 2019 09:56:50 -0700
Subject: [PATCH 09/22] adding arrow-vector dep to tests

---
 build.gradle | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/build.gradle b/build.gradle
index 4be4c3e3e755..0304dc787e96 100644
--- a/build.gradle
+++ b/build.gradle
@@ -222,6 +222,14 @@ project(':iceberg-core') {
     compile("org.apache.avro:avro") {
       exclude group: 'org.tukaani' // xz compression is not supported
     }
+    compile("org.apache.arrow:arrow-vector") {
+      exclude group: 'io.netty', module: 'netty-buffer'
+      exclude group: 'io.netty', module: 'netty-common'
+    }
+    compileOnly("org.apache.spark:spark-hive_2.11") {
+      exclude group: 'org.apache.avro', module: 'avro'
+    }
+
     compile "com.fasterxml.jackson.core:jackson-databind"
     compile "com.fasterxml.jackson.core:jackson-core"

From af5aa5ebd3e1c096a4a6c7ef6c7e83eaefcdf210 Mon Sep 17 00:00:00 2001
From: Gautam Kowshik
Date: Mon, 22 Jul 2019 16:23:07 -0700
Subject: [PATCH 10/22] [WIP] Working vectorization for primitive types. Added
 test for VectorizedSparkParquetReaders.
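
A rough sketch of how the new read path is meant to be used, mirroring the
test added in this patch (illustrative only; `schema` is an Iceberg Schema,
`testFile` is a local Parquet file, and `process` is a hypothetical consumer):

    try (CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
        .project(schema)
        .createReaderFunc(fileType -> VectorizedSparkParquetReaders.buildReader(schema, schema, fileType))
        .build()) {
      for (ColumnarBatch batch : batchReader) {
        // each ColumnarBatch wraps one Arrow FieldVector per projected column
        process(batch);
      }
    }

The per-column vector readers drain a whole row group in one pass, so each
returned batch has as many rows as the underlying Parquet row group.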
--- .../apache/iceberg/arrow/ArrowSchemaUtil.java | 2 +- .../iceberg/arrow/ArrowSchemaUtilTest.java | 142 +++++++++ .../apache/iceberg/parquet/ParquetReader.java | 23 +- .../iceberg/parquet/ParquetValueReaders.java | 84 +++++ .../spark/data/SparkParquetReaders.java | 5 +- .../vector/VectorizedParquetValueReaders.java | 297 ++++++++++++++++++ .../vector/VectorizedSparkParquetReaders.java | 219 +++++++++++++ .../iceberg/spark/source/IcebergSource.java | 16 +- .../apache/iceberg/spark/source/Reader.java | 139 ++++---- .../iceberg/spark/data/AvroDataTest.java | 8 +- .../TestSparkParquetVectorizedReader.java | 101 ++++++ versions.props | 1 + 12 files changed, 949 insertions(+), 88 deletions(-) create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java create mode 100644 spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java create mode 100644 spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java diff --git a/core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java b/core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java index be312f6f06da..492af6180912 100644 --- a/core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java +++ b/core/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java @@ -65,7 +65,7 @@ public static Schema convert(final org.apache.iceberg.Schema schema) { return new Schema(fields.build()); } - private static Field convert(final NestedField field) { + public static Field convert(final NestedField field) { final ArrowType arrowType; final List children = Lists.newArrayList(); diff --git a/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java index 0ac5cc8e7323..8d205c4d4b5c 100644 --- a/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java +++ b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java @@ -66,6 +66,148 @@ public class ArrowSchemaUtilTest { + @Test + public void testArrowWriting() { + + Schema iceberg = new Schema( + optional(0, "i", Types.IntegerType.get()), + optional(1, "b", BooleanType.get()), + required(2, "d", DoubleType.get()), + required(3, "s", StringType.get()), + optional(4, "d2", DateType.get()), + optional(5, "ts", TimestampType.withoutZone()) + ); + + org.apache.arrow.vector.types.pojo.Schema arrowSchema = ArrowSchemaUtil.convert(iceberg); + RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE); + + VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(arrowSchema, rootAllocator); + + // java.util.List fieldVectors = schemaRoot.getFieldVectors(); + + ColumnVector[] columns = new ColumnVector[arrowSchema.getFields().size()]; + int i=0; + for(Field field: arrowSchema.getFields()) { + + // create + FieldVector vec = field.createVector(rootAllocator); + ArrowFieldWriter fieldWriter = selectFieldWriter(vec); + + // int batchSize = 100; + // for (int i=0; i selectFieldWriter(ValueVector vec) { + + Field field = vec.getField(); + DataType dt = ArrowUtils.fromArrowField(field); + + if(dt instanceof IntegerType) { + + return new IntegerWriter(vec); + + } else if (dt instanceof org.apache.spark.sql.types.BooleanType) { + + return new BooleanWriter(vec); + } else { + + throw new UnsupportedOperationException("Unsupported data type: "+dt.catalogString()); + } + + } + + + private static abstract class ArrowFieldWriter { + + ValueVector vec; + String fieldName; + DataType dataType; + boolean 
isNullable; + int count = 0; + + ArrowFieldWriter(ValueVector vec) { + + this.vec = vec; + this.vec.allocateNew(); + } + + public void write(int ordinal, T data) { + + if(data == null) { + + setNull(ordinal); + } else { + + setValue(ordinal, data); + } + count++; + } + + public void finish() { + + vec.setValueCount(count); + } + + public void reset() { + + vec.setValueCount(0); + count = 0; + } + + abstract public void setNull(int ordinal); + + abstract public void setValue(int ordinal, T data); + } + + private static final class IntegerWriter extends ArrowFieldWriter{ + + + IntegerWriter(ValueVector vec) { + super(vec); + } + + @Override + public void setNull(int ordinal) { + ((IntVector)vec).setNull(ordinal); + } + + @Override + public void setValue(int ordinal, Integer data) { + ((IntVector)vec).setSafe(ordinal, data); + } + } + + + private static final class BooleanWriter extends ArrowFieldWriter{ + + + BooleanWriter(ValueVector vec) { + super(vec); + } + + @Override + public void setNull(int ordinal) { + ((BitVector)vec).setNull(ordinal); + } + + @Override + public void setValue(int ordinal, Boolean data) { + ((BitVector)vec).set(ordinal, data ? 1 : 0); + } + } + + + @Test public void convertPrimitive() { Schema iceberg = new Schema( diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index 0603bd30cddd..9928a4da0beb 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -40,6 +40,7 @@ import org.apache.parquet.schema.MessageType; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -201,15 +202,15 @@ public Iterator iterator() { FileIterator iter = new FileIterator<>(init()); addCloseable(iter); - if(maxRecordsPerBatch == 0) { - LOG.info("[ParquetReader] => Return regular iterator. No batching."); - System.out.println("[ParquetReader] => Return regular iterator. No batching."); - return iter; - } else { - LOG.info("[ParquetReader] => Read into Arrow batches of " + maxRecordsPerBatch + " rows."); + // if(maxRecordsPerBatch == 0) { + LOG.info("[ParquetReader] => Return regular iterator. No batching."); + System.out.println("[ParquetReader] => Return regular iterator. 
No batching."); + return iter; + // } else { + // LOG.info("[ParquetReader] => Read into Arrow batches of " + maxRecordsPerBatch + " rows."); // System.out.println("[ParquetReader] => Read into Arrow batches of " + maxRecordsPerBatch + " rows."); - return arrowBatchAsInternalRow((Iterator) iter); - } + // return arrowBatchAsInternalRow((Iterator) iter); + // } } private Iterator arrowBatchAsInternalRow(Iterator iter) { @@ -266,7 +267,11 @@ public T next() { } else { this.last = model.read(null); } - valuesRead += 1; + if (last instanceof ColumnarBatch) { + valuesRead += ((ColumnarBatch)last).numRows(); + } else { + valuesRead += 1; + } return last; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index ac61983b2c29..45293674f20b 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -26,13 +26,18 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; +import org.apache.arrow.vector.FieldVector; +import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.page.PageReadStore; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.Type; +import org.apache.spark.sql.vectorized.ArrowColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; import static java.util.Collections.emptyIterator; @@ -576,6 +581,85 @@ public V setValue(V value) { } } + + public static class ColumnarBatchReader implements ParquetValueReader { + + private final int numFields; + private final Types.StructType iceExpectedFields; + private final ParquetValueReader[] readers; + private final TripleIterator column; + private final TripleIterator[] columns; + private final List> children; + + @SuppressWarnings("unchecked") + public ColumnarBatchReader(List types, + Types.StructType icebergExpectedFields, + List> readers) { + + this.numFields = readers.size(); + this.iceExpectedFields = icebergExpectedFields; + this.readers = (ParquetValueReader[]) Array.newInstance( + ParquetValueReader.class, readers.size()); + this.columns = (TripleIterator[]) Array.newInstance(TripleIterator.class, readers.size()); + + + ImmutableList.Builder> columnsBuilder = ImmutableList.builder(); + for (int i = 0; i < readers.size(); i += 1) { + ParquetValueReader reader = readers.get(i); + this.readers[i] = readers.get(i); + this.columns[i] = reader.column(); + columnsBuilder.addAll(reader.columns()); + } + + this.children = columnsBuilder.build(); + if (children.size() > 0) { + this.column = children.get(0); + } else { + this.column = NullReader.NULL_COLUMN; + } + + } + + @Override + public final void setPageSource(PageReadStore pageStore) { + for (int i = 0; i < readers.length; i += 1) { + readers[i].setPageSource(pageStore); + } + } + + @Override + public final TripleIterator column() { + return column; + } + + @Override + public List> columns() { + return children; + } + + + @Override + public final ColumnarBatch read(ColumnarBatch ignore) { + + ArrowColumnVector[] arrowVectorArr = (ArrowColumnVector[])Array.newInstance(ArrowColumnVector.class, + readers.length); + + int numRows=0; + for (int i = 0; i < readers.length; i += 1) { + + FieldVector vec = readers[i].read(null); + arrowVectorArr[i] = 
new ArrowColumnVector(vec); + numRows = vec.getValueCount(); + } + + ColumnarBatch batch = new ColumnarBatch(arrowVectorArr); + batch.setNumRows(numRows); + + return batch; + } + + } + public abstract static class StructReader implements ParquetValueReader { private interface Setter { void set(R record, int pos, Object reuse); diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 9a36266ffdf2..bb9330577935 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -59,6 +59,7 @@ import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; @@ -112,7 +113,7 @@ public ParquetValueReader struct(Types.StructType ignored, GroupType struct, } private static class ReadBuilder extends TypeWithSchemaVisitor> { - private final MessageType type; + protected final MessageType type; ReadBuilder(MessageType type) { this.type = type; @@ -360,7 +361,7 @@ public long readLong() { } } - private static class StringReader extends PrimitiveReader { + protected static class StringReader extends PrimitiveReader { StringReader(ColumnDescriptor desc) { super(desc); } diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java new file mode 100644 index 000000000000..39b072a3411a --- /dev/null +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java @@ -0,0 +1,297 @@ +package org.apache.iceberg.spark.data.vector; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.iceberg.arrow.ArrowSchemaUtil; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.types.Types; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; + +/*** + * Parquet Value Reader implementations for Vectorization. + * Contains type-wise readers to read parquet data as vectors. + * - Returns Arrow's Field Vector for each type + * - Null values are explicitly handled. 
+ * - Type serialization is done based on types in Arrow + * - Mapping of Iceberg type to Arrow type is done in ArrowSchemaUtil.convert() + * + * icebergType : LONG -> Field Vector Type : org.apache.arrow.vector.BigIntVector + * icebergType : STRING -> Field Vector Type : org.apache.arrow.vector.VarCharVector + * icebergType : BOOLEAN -> Field Vector Type : org.apache.arrow.vector.BitVector + * icebergType : INTEGER -> Field Vector Type : org.apache.arrow.vector.IntVector + * icebergType : FLOAT -> Field Vector Type : org.apache.arrow.vector.Float4Vector + * icebergType : DOUBLE -> Field Vector Type : org.apache.arrow.vector.Float8Vector + * icebergType : DATE -> Field Vector Type : org.apache.arrow.vector.DateDayVector + * icebergType : TIMESTAMP -> Field Vector Type : org.apache.arrow.vector.TimeStampMicroTZVector + * icebergType : STRING -> Field Vector Type : org.apache.arrow.vector.VarCharVector + * icebergType : BINARY -> Field Vector Type : org.apache.arrow.vector.VarBinaryVector + */ +public class VectorizedParquetValueReaders { + + public abstract static class VectorReader extends ParquetValueReaders.PrimitiveReader { + + protected FieldVector vec; + + VectorReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + + super(desc); + + this.vec = ArrowSchemaUtil.convert(icebergField).createVector(rootAlloc); + System.out.println("=> icebergField : "+icebergField.type().typeId().name()+" , Field Vector Type : "+vec.getClass().getName()); + } + + @Override + public FieldVector read(FieldVector ignore) { + + vec.reset(); + int i=0; + + while(column.hasNext()) { + // Todo: this check works for flat schemas only + // need to get max definition level to do proper check + if(column.currentDefinitionLevel() == 0) { + // handle null + column.nextNull(); + nextNullAt(i); + } else { + nextValueAt(i); + } + i++; + } + vec.setValueCount(i); + return vec; + } + + + public int getRowCount() { + return vec.getValueCount(); + } + + protected abstract void nextNullAt(int i); + + protected abstract void nextValueAt(int i); + } + + protected static class StringReader extends VectorReader { + + StringReader(ColumnDescriptor desc, Types.NestedField icebergField, RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + @Override + protected void nextNullAt(int i) { + ((VarCharVector) vec).setNull(i); + } + + @Override + protected void nextValueAt(int i) { + + Binary binary = column.nextBinary(); + if (binary == null) { + + ((VarCharVector) vec).setNull(i); + + } else { + String utf8Str = binary.toStringUsingUTF8(); + ((VarCharVector) vec).setSafe(i, utf8Str.getBytes()); + } + } + + } + + protected static class IntegerReader extends VectorReader { + + IntegerReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + + super(desc, icebergField, rootAlloc); + } + + @Override + protected void nextNullAt(int i) { + ((IntVector) vec).setNull(i); + } + + protected void nextValueAt(int i) { + + int intValue = column.nextInteger(); + ((IntVector)vec).setSafe(i, intValue); + + } + } + + protected static class LongReader extends VectorReader { + + LongReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + + super(desc, icebergField, rootAlloc); + } + + protected void nextNullAt(int i) { + ((BigIntVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + long longValue = column.nextLong(); + ((BigIntVector)vec).setSafe(i, longValue); + + } + } + + protected static class 
TimestampMillisReader extends LongReader { + + TimestampMillisReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + protected void nextValueAt(int i) { + + long longValue = column.nextLong(); + ((BigIntVector)vec).setSafe(i, 1000 * longValue); + + } + } + + protected static class TimestampMicroReader extends VectorReader { + + TimestampMicroReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + protected void nextNullAt(int i) { + ((TimeStampMicroTZVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + long longValue = column.nextLong(); + ((TimeStampMicroTZVector)vec).setSafe(i, longValue); + + } + } + + protected static class BooleanReader extends VectorReader { + + BooleanReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + protected void nextNullAt(int i) { + ((BitVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + boolean bool = column.nextBoolean(); + ((BitVector)vec).setSafe(i, bool ? 1 : 0); + + } + } + + + + protected static class FloatReader extends VectorReader { + + FloatReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + protected void nextNullAt(int i) { + ((Float4Vector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + float floatValue = column.nextFloat(); + ((Float4Vector)vec).setSafe(i, floatValue); + + } + } + + protected static class DoubleReader extends VectorReader { + + DoubleReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + protected void nextNullAt(int i) { + ((Float8Vector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + double doubleValue = column.nextDouble(); + ((Float8Vector)vec).setSafe(i, doubleValue); + + } + } + + + protected static class BinaryReader extends VectorReader { + + BinaryReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + protected void nextNullAt(int i) { + ((VarBinaryVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + Binary binaryValue = column.nextBinary(); + ((VarBinaryVector)vec).setSafe(i, binaryValue.getBytes()); + + } + } + + + + protected static class DateReader extends VectorReader { + + DateReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc) { + super(desc, icebergField, rootAlloc); + } + + protected void nextNullAt(int i) { + ((DateDayVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + int dateValue = column.nextInteger(); + ((DateDayVector)vec).setSafe(i, dateValue); + + } + } +} diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java new file mode 100644 index 000000000000..22b5a09c036e --- /dev/null +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java @@ -0,0 +1,219 @@ +package org.apache.iceberg.spark.data.vector; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import java.util.Iterator; +import 
java.util.List;
+import java.util.Map;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.arrow.ArrowSchemaUtil;
+import org.apache.iceberg.parquet.ParquetValueReader;
+import org.apache.iceberg.parquet.ParquetValueReaders;
+import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
+import org.apache.iceberg.types.Types;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
+
+public class VectorizedSparkParquetReaders {
+
+  @SuppressWarnings("unchecked")
+  public static ParquetValueReader<ColumnarBatch> buildReader(
+      Schema tableSchema,
+      Schema expectedSchema,
+      MessageType fileSchema) {
+
+    return (ParquetValueReader<ColumnarBatch>)
+        TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
+            new ReadBuilder(tableSchema, expectedSchema, fileSchema));
+  }
+
+  private static class ReadBuilder extends TypeWithSchemaVisitor<ParquetValueReader<?>> {
+    protected final MessageType parquetSchema;
+    protected final Schema projectedIcebergSchema;
+    protected final Schema tableIcebergSchema;
+    protected final org.apache.arrow.vector.types.pojo.Schema arrowSchema;
+    protected final RootAllocator rootAllocator;
+
+    ReadBuilder(Schema tableSchema, Schema projectedIcebergSchema, MessageType parquetSchema) {
+      this.parquetSchema = parquetSchema;
+      this.tableIcebergSchema = tableSchema;
+      this.projectedIcebergSchema = projectedIcebergSchema;
+      this.arrowSchema = ArrowSchemaUtil.convert(projectedIcebergSchema);
+      this.rootAllocator = new RootAllocator(Long.MAX_VALUE);
+    }
+
+    @Override
+    public ParquetValueReader<?> message(Types.StructType expected, MessageType message,
+                                         List<ParquetValueReader<?>> fieldReaders) {
+      return struct(expected, message.asGroupType(), fieldReaders);
+    }
+
+    @Override
+    public ParquetValueReader<?> struct(Types.StructType expected, GroupType struct,
+                                        List<ParquetValueReader<?>> fieldReaders) {
+
+      // this works on struct fields and the root iceberg schema, which is itself a struct.
+
+      // match the expected struct's order
+      Map<Integer, ParquetValueReader<?>> readersById = Maps.newHashMap();
+      Map<Integer, Type> typesById = Maps.newHashMap();
+      List<Type> fields = struct.getFields();
+
+      for (int i = 0; i < fields.size(); i += 1) {
+        Type fieldType = fields.get(i);
+        int fieldD = parquetSchema.getMaxDefinitionLevel(path(fieldType.getName())) - 1;
+        int id = fieldType.getId().intValue();
+        // Todo: figure out optional field reading for vectorized reading
+        // readersById.put(id, (ParquetValueReader)ParquetValueReaders.
+        //     option(fieldType, fieldD, fieldReaders.get(i)));
+
+        readersById.put(id, fieldReaders.get(i));
+        typesById.put(id, fieldType);
+      }
+
+      List<Types.NestedField> icebergFields = expected != null ?
+ expected.fields() : ImmutableList.of(); + + List> reorderedFields = Lists.newArrayListWithExpectedSize( + icebergFields.size()); + + List types = Lists.newArrayListWithExpectedSize(icebergFields.size()); + + for (Types.NestedField field : icebergFields) { + int id = field.fieldId(); + ParquetValueReader reader = readersById.get(id); + if (reader != null) { + reorderedFields.add(reader); + types.add(typesById.get(id)); + } else { + reorderedFields.add(ParquetValueReaders.nulls()); + types.add(null); + } + } + + return new ParquetValueReaders.ColumnarBatchReader(types, expected, reorderedFields); + } + + + @Override + public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, + PrimitiveType primitive) { + + // Create arrow vector for this field + int parquetFieldId = primitive.getId().intValue(); + ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath()); + Types.NestedField icebergField = tableIcebergSchema.findField(parquetFieldId); + // Field field = ArrowSchemaUtil.convert(projectedIcebergSchema.findField(parquetFieldId)); + // FieldVector vec = field.createVector(rootAllocator); + + if (primitive.getOriginalType() != null) { + switch (primitive.getOriginalType()) { + case ENUM: + case JSON: + case UTF8: + return new VectorizedParquetValueReaders.StringReader(desc, icebergField, rootAllocator); + case INT_8: + case INT_16: + case INT_32: + return new VectorizedParquetValueReaders.IntegerReader(desc, icebergField, rootAllocator); + // if (expected != null && expected.typeId() == Types.LongType.get().typeId()) { + // return new ParquetValueReaders.IntAsLongReader(desc); + // } else { + // return new ParquetValueReaders.UnboxedReader(desc); + // } + case DATE: + return new VectorizedParquetValueReaders.DateReader(desc, icebergField, rootAllocator); + case INT_64: + return new VectorizedParquetValueReaders.LongReader(desc, icebergField, rootAllocator); + case TIMESTAMP_MICROS: + return new VectorizedParquetValueReaders.TimestampMicroReader(desc, icebergField, rootAllocator); + case TIMESTAMP_MILLIS: + return new VectorizedParquetValueReaders.TimestampMillisReader(desc, icebergField, rootAllocator); + // case DECIMAL: + // DecimalMetadata decimal = primitive.getDecimalMetadata(); + // switch (primitive.getPrimitiveTypeName()) { + // case BINARY: + // case FIXED_LEN_BYTE_ARRAY: + // return new SparkParquetReaders.BinaryDecimalReader(desc, decimal.getScale()); + // case INT64: + // return new SparkParquetReaders.LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + // case INT32: + // return new SparkParquetReaders.IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + // default: + // throw new UnsupportedOperationException( + // "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); + // } + // case BSON: + // return new SparkParquetReaders.BytesReader(desc); + default: + throw new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getOriginalType()); + } + } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return new VectorizedParquetValueReaders.BinaryReader(desc, icebergField, rootAllocator); + case INT32: + return new VectorizedParquetValueReaders.IntegerReader(desc, icebergField, rootAllocator); + case FLOAT: + return new VectorizedParquetValueReaders.FloatReader(desc, icebergField, rootAllocator); + // if (expected != null && expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { + // return new 
ParquetValueReaders.FloatAsDoubleReader(desc); + // } else { + // return new ParquetValueReaders.UnboxedReader<>(desc); + // } + case BOOLEAN: + return new VectorizedParquetValueReaders.BooleanReader(desc, icebergField, rootAllocator); + case INT64: + return new VectorizedParquetValueReaders.LongReader(desc, icebergField, rootAllocator); + case DOUBLE: + return new VectorizedParquetValueReaders.DoubleReader(desc, icebergField, rootAllocator); + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + + private String[] currentPath() { + String[] path = new String[fieldNames.size()]; + if (!fieldNames.isEmpty()) { + Iterator iter = fieldNames.descendingIterator(); + for (int i = 0; iter.hasNext(); i += 1) { + path[i] = iter.next(); + } + } + + return path; + } + + protected MessageType type() { + return parquetSchema; + } + + protected String[] path(String name) { + String[] path = new String[fieldNames.size() + 1]; + path[fieldNames.size()] = name; + + if (!fieldNames.isEmpty()) { + Iterator iter = fieldNames.descendingIterator(); + for (int i = 0; iter.hasNext(); i += 1) { + path[i] = iter.next(); + } + } + + return path; + } + } + + + + +} diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 45df910b905c..b3edfc2827eb 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -101,14 +101,14 @@ protected Table findTable(DataSourceOptions options, Configuration conf) { Optional path = options.get("path"); Preconditions.checkArgument(path.isPresent(), "Cannot open table: path is not set"); - if (path.get().contains("/")) { - HadoopTables tables = new HadoopTables(conf); - return tables.load(path.get()); - } else { - HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf); - TableIdentifier tableIdentifier = TableIdentifier.parse(path.get()); - return hiveCatalog.loadTable(tableIdentifier); - } + // if (path.get().contains("/")) { + HadoopTables tables = new HadoopTables(conf); + return tables.load(path.get()); + // } else { + // HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf); + // TableIdentifier tableIdentifier = TableIdentifier.parse(path.get()); + // return hiveCatalog.loadTable(tableIdentifier); + // } } protected SparkSession lazySparkSession() { diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java index d4bd88120dcf..75d72f1c465e 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java @@ -22,7 +22,6 @@ import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import java.io.Closeable; import java.io.IOException; @@ -60,15 +59,13 @@ import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.data.SparkAvroReader; import org.apache.iceberg.spark.data.SparkOrcReader; -import org.apache.iceberg.spark.data.SparkParquetReaders; -import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.spark.data.vector.VectorizedSparkParquetReaders; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; import 
org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.Attribute; import org.apache.spark.sql.catalyst.expressions.AttributeReference; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.catalyst.expressions.JoinedRow; import org.apache.spark.sql.catalyst.expressions.UnsafeProjection; import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.sources.v2.DataSourceOptions; @@ -163,15 +160,10 @@ public StructType readSchema() { @Override public List> planBatchInputPartitions() { - return null; - } - - @Override - public List> planInputPartitions() { String tableSchemaString = SchemaParser.toJson(table.schema()); String expectedSchemaString = SchemaParser.toJson(lazySchema()); - List> readTasks = Lists.newArrayList(); + List> readTasks = Lists.newArrayList(); for (CombinedScanTask task : tasks()) { readTasks.add( new ReadTask(task, tableSchemaString, expectedSchemaString, fileIo, encryptionManager, @@ -181,6 +173,22 @@ public List> planInputPartitions() { return readTasks; } + @Override + public List> planInputPartitions() { + return null; + // String tableSchemaString = SchemaParser.toJson(table.schema()); + // String expectedSchemaString = SchemaParser.toJson(lazySchema()); + // + // List> readTasks = Lists.newArrayList(); + // for (CombinedScanTask task : tasks()) { + // readTasks.add( + // new ReadTask(task, tableSchemaString, expectedSchemaString, fileIo, encryptionManager, + // caseSensitive, numRecordsPerBatch)); + // } + // + // return readTasks; + } + @Override public Filter[] pushFilters(Filter[] filters) { this.tasks = null; // invalidate cached tasks, if present @@ -275,7 +283,7 @@ public String toString() { table, lazySchema().asStruct(), filterExpressions, caseSensitive); } - private static class ReadTask implements InputPartition, Serializable { + private static class ReadTask implements InputPartition, Serializable { private final CombinedScanTask task; private final String tableSchemaString; private final String expectedSchemaString; @@ -300,7 +308,7 @@ private ReadTask( } @Override - public InputPartitionReader createPartitionReader() { + public InputPartitionReader createPartitionReader() { return new TaskDataReader(task, lazyTableSchema(), lazyExpectedSchema(), fileIo, encryptionManager, caseSensitive, numRecordsPerBatch); } @@ -320,7 +328,7 @@ private Schema lazyExpectedSchema() { } } - private static class TaskDataReader implements InputPartitionReader { + private static class TaskDataReader implements InputPartitionReader { // for some reason, the apply method can't be called from Java without reflection private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") .impl(UnsafeProjection.class, InternalRow.class) @@ -334,9 +342,9 @@ private static class TaskDataReader implements InputPartitionReader private final boolean caseSensitive; private final int numRecordsPerBatch; - private Iterator currentIterator; + private Iterator currentIterator; private Closeable currentCloseable = null; - private InternalRow current = null; + private ColumnarBatch current = null; TaskDataReader(CombinedScanTask task, Schema tableSchema, Schema expectedSchema, FileIO fileIo, EncryptionManager encryptionManager, boolean caseSensitive, int numRecordsPerBatch) { @@ -376,7 +384,7 @@ public boolean next() throws IOException { } @Override - public InternalRow get() { + public ColumnarBatch get() { return current; } @@ -391,7 +399,7 @@ public void close() throws 
IOException { } } - private Iterator open(FileScanTask task) { + private Iterator open(FileScanTask task) { DataFile file = task.file(); // schema or rows returned by readers @@ -406,23 +414,24 @@ private Iterator open(FileScanTask task) { boolean hasExtraFilterColumns = requiredSchema.columns().size() != finalSchema.columns().size(); Schema iterSchema; - Iterator iter; - - if (hasJoinedPartitionColumns) { - // schema used to read data files - Schema readSchema = TypeUtil.selectNot(requiredSchema, idColumns); - Schema partitionSchema = TypeUtil.select(requiredSchema, idColumns); - PartitionRowConverter convertToRow = new PartitionRowConverter(partitionSchema, spec); - JoinedRow joined = new JoinedRow(); - - InternalRow partition = convertToRow.apply(file.partition()); - joined.withRight(partition); - - // create joined rows and project from the joined schema to the final schema - iterSchema = TypeUtil.join(readSchema, partitionSchema); - iter = Iterators.transform(open(task, readSchema), joined::withLeft); - - } else if (hasExtraFilterColumns) { + Iterator iter; + + // if (hasJoinedPartitionColumns) { + // // schema used to read data files + // Schema readSchema = TypeUtil.selectNot(requiredSchema, idColumns); + // Schema partitionSchema = TypeUtil.select(requiredSchema, idColumns); + // PartitionRowConverter convertToRow = new PartitionRowConverter(partitionSchema, spec); + // JoinedRow joined = new JoinedRow(); + // + // InternalRow partition = convertToRow.apply(file.partition()); + // joined.withRight(partition); + // + // // create joined rows and project from the joined schema to the final schema + // iterSchema = TypeUtil.join(readSchema, partitionSchema); + // iter = Iterators.transform(open(task, readSchema), joined::withLeft); + // + // } else if (hasExtraFilterColumns) { + if (hasExtraFilterColumns) { // add projection to the final schema iterSchema = requiredSchema; iter = open(task, requiredSchema); @@ -434,36 +443,37 @@ private Iterator open(FileScanTask task) { } // TODO: remove the projection by reporting the iterator's schema back to Spark - return Iterators.transform(iter, - APPLY_PROJECTION.bind(projection(finalSchema, iterSchema))::invoke); + // return Iterators.transform(iter, + // APPLY_PROJECTION.bind(projection(finalSchema, iterSchema))::invoke); + return iter; } - private Iterator open(FileScanTask task, Schema readSchema) { - CloseableIterable iter; - if (task.isDataTask()) { - iter = newDataIterable(task.asDataTask(), readSchema); - - } else { - InputFile location = inputFiles.get(task.file().path().toString()); - Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); - - switch (task.file().format()) { - case PARQUET: - iter = newParquetIterable(location, task, readSchema); - break; - - case AVRO: - iter = newAvroIterable(location, task, readSchema); - break; - - case ORC: - iter = newOrcIterable(location, task, readSchema); - break; - - default: - throw new UnsupportedOperationException( - "Cannot read unknown format: " + task.file().format()); - } + private Iterator open(FileScanTask task, Schema readSchema) { + CloseableIterable iter; + // if (task.isDataTask()) { + // iter = newDataIterable(task.asDataTask(), readSchema); + // + // } else { + InputFile location = inputFiles.get(task.file().path().toString()); + Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); + + switch (task.file().format()) { + case PARQUET: + iter = newParquetIterable(location, task, readSchema); + 
break; + // + // case AVRO: + // iter = newAvroIterable(location, task, readSchema); + // break; + // + // case ORC: + // iter = newOrcIterable(location, task, readSchema); + // break; + // + default: + throw new UnsupportedOperationException( + "Cannot read unknown format: " + task.file().format()); + // } } this.currentCloseable = iter; @@ -504,13 +514,14 @@ private CloseableIterable newAvroIterable(InputFile location, .build(); } - private CloseableIterable newParquetIterable(InputFile location, + private CloseableIterable newParquetIterable(InputFile location, FileScanTask task, Schema readSchema) { return Parquet.read(location) .project(readSchema, SparkSchemaUtil.convert(readSchema)) .split(task.start(), task.length()) - .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema)) + .createReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(tableSchema, readSchema, + fileSchema)) .filter(task.residual()) .caseSensitive(caseSensitive) .recordsPerBatch(numRecordsPerBatch) diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index 94ad20bc33f7..ef4b0f72c47b 100644 --- a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -50,10 +50,10 @@ public abstract class AvroDataTest { required(110, "s", Types.StringType.get()), //required(111, "uuid", Types.UUIDType.get()), //required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + optional(113, "bytes", Types.BinaryType.get()) + // required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + // required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + // required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision ); @Rule diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java new file mode 100644 index 000000000000..b56faa756c71 --- /dev/null +++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java @@ -0,0 +1,101 @@ +package org.apache.iceberg.spark.data; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.spark.data.vector.VectorizedSparkParquetReaders; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Test; + + +public class TestSparkParquetVectorizedReader extends AvroDataTest { + + @Override + protected void writeAndValidate(Schema schema) throws IOException { + + // Write test data + Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + + List expected = 
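+        // generate 100 random records deterministically (seed 0L); these are the
+        // expected rows to compare against the batches read back below: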
RandomData.generateList(schema, 100, 0L); + + // write a test parquet file using iceberg writer + File testFile = temp.newFile(); + Assert.assertTrue("Delete should succeed", testFile.delete()); + + try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .named("test") + .build()) { + writer.addAll(expected); + } + + + try(CloseableIterable batchReader = Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, schema, type)) + .build()) { + + Iterator batches = batchReader.iterator(); + int numRowsRead = 0; + while(batches.hasNext()) { + + ColumnarBatch batch = batches.next(); + numRowsRead += batch.numRows(); + System.out.println("Batch read with "+batch.numRows()+" rows. Read "+numRowsRead+" till now."); + } + + Assert.assertEquals(expected.size(), numRowsRead); + + } + } + + + + + @Test + public void testArray() throws IOException { + System.out.println("Not Supported"); + } + + @Test + public void testArrayOfStructs() throws IOException { + System.out.println("Not Supported"); + } + + @Test + public void testMap() throws IOException { + System.out.println("Not Supported"); + } + + @Test + public void testNumericMapKey() throws IOException { + System.out.println("Not Supported"); + } + + @Test + public void testComplexMapKey() throws IOException { + System.out.println("Not Supported"); + } + + @Test + public void testMapOfStructs() throws IOException { + System.out.println("Not Supported"); + } + + @Test + public void testMixedTypes() throws IOException { + System.out.println("Not Supported"); + } +} diff --git a/versions.props b/versions.props index 7f718424c56a..80c334e695f8 100644 --- a/versions.props +++ b/versions.props @@ -1,6 +1,7 @@ org.slf4j:slf4j-api = 1.7.5 com.google.guava:guava = 28.0-jre org.apache.avro:avro = 1.8.2 +org.apache.arrow:arrow-vector = 0.12.0 org.apache.hadoop:* = 2.7.3 org.apache.hive:hive-standalone-metastore = 1.2.1 org.apache.orc:orc-core = 1.5.5 From 18084e2bb75f4127da039eefcbfcbbee43470cef Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Mon, 22 Jul 2019 17:05:40 -0700 Subject: [PATCH 11/22] [WIP] Added Decimal types to vectorization --- .../vector/VectorizedParquetValueReaders.java | 99 ++++++++++++++++++- .../vector/VectorizedSparkParquetReaders.java | 39 +++++--- .../iceberg/spark/data/AvroDataTest.java | 8 +- 3 files changed, 123 insertions(+), 23 deletions(-) diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java index 39b072a3411a..c1d4a75fdc60 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java @@ -1,9 +1,12 @@ package org.apache.iceberg.spark.data.vector; +import java.math.BigDecimal; +import java.math.BigInteger; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -16,15 +19,17 @@ import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.io.api.Binary; +import 
org.apache.spark.sql.types.Decimal; /*** * Parquet Value Reader implementations for Vectorization. * Contains type-wise readers to read parquet data as vectors. - * - Returns Arrow's Field Vector for each type + * - Returns Arrow's Field Vector for each type. * - Null values are explicitly handled. - * - Type serialization is done based on types in Arrow + * - Type serialization is done based on types in Arrow. + * - Creates One Vector per RowGroup. So a Batch would have as many rows as there are in the underlying RowGroup. * - Mapping of Iceberg type to Arrow type is done in ArrowSchemaUtil.convert() - * + * - Iceberg to Arrow Type mapping : * icebergType : LONG -> Field Vector Type : org.apache.arrow.vector.BigIntVector * icebergType : STRING -> Field Vector Type : org.apache.arrow.vector.VarCharVector * icebergType : BOOLEAN -> Field Vector Type : org.apache.arrow.vector.BitVector @@ -35,6 +40,7 @@ * icebergType : TIMESTAMP -> Field Vector Type : org.apache.arrow.vector.TimeStampMicroTZVector * icebergType : STRING -> Field Vector Type : org.apache.arrow.vector.VarCharVector * icebergType : BINARY -> Field Vector Type : org.apache.arrow.vector.VarBinaryVector + * icebergField : DECIMAL -> Field Vector Type : org.apache.arrow.vector.DecimalVector */ public class VectorizedParquetValueReaders { @@ -294,4 +300,91 @@ protected void nextValueAt(int i) { } } + + + protected static class IntegerDecimalReader extends VectorReader { + private final int precision; + private final int scale; + + IntegerDecimalReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc, + int precision, int scale) { + + super(desc, icebergField, rootAlloc); + this.precision = precision; + this.scale = scale; + } + + protected void nextNullAt(int i) { + ((DecimalVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + int decimalIntValue = column.nextInteger(); + Decimal decimalValue = Decimal.apply(decimalIntValue, precision, scale); + + ((DecimalVector)vec).setSafe(i, decimalValue.toJavaBigDecimal()); + + } + } + + protected static class LongDecimalReader extends VectorReader { + private final int precision; + private final int scale; + + LongDecimalReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc, + int precision, int scale) { + + super(desc, icebergField, rootAlloc); + this.precision = precision; + this.scale = scale; + } + + protected void nextNullAt(int i) { + ((DecimalVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + long decimalLongValue = column.nextLong(); + Decimal decimalValue = Decimal.apply(decimalLongValue, precision, scale); + + ((DecimalVector)vec).setSafe(i, decimalValue.toJavaBigDecimal()); + + } + } + + + + protected static class BinaryDecimalReader extends VectorReader { + private final int precision; + private final int scale; + + BinaryDecimalReader(ColumnDescriptor desc, + Types.NestedField icebergField, + RootAllocator rootAlloc, + int precision, int scale) { + + super(desc, icebergField, rootAlloc); + this.precision = precision; + this.scale = scale; + } + + protected void nextNullAt(int i) { + ((DecimalVector)vec).setNull(i); + } + + protected void nextValueAt(int i) { + + Binary binaryValue = column.nextBinary(); + Decimal decimalValue = Decimal.fromDecimal(new BigDecimal(new BigInteger(binaryValue.getBytes()), scale)); + + ((DecimalVector)vec).setSafe(i, decimalValue.toJavaBigDecimal()); + + } + } } diff --git 
a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java index 22b5a09c036e..c1fce1a3fe3f 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java @@ -15,6 +15,7 @@ import org.apache.iceberg.parquet.TypeWithSchemaVisitor; import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; @@ -136,22 +137,28 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy return new VectorizedParquetValueReaders.TimestampMicroReader(desc, icebergField, rootAllocator); case TIMESTAMP_MILLIS: return new VectorizedParquetValueReaders.TimestampMillisReader(desc, icebergField, rootAllocator); - // case DECIMAL: - // DecimalMetadata decimal = primitive.getDecimalMetadata(); - // switch (primitive.getPrimitiveTypeName()) { - // case BINARY: - // case FIXED_LEN_BYTE_ARRAY: - // return new SparkParquetReaders.BinaryDecimalReader(desc, decimal.getScale()); - // case INT64: - // return new SparkParquetReaders.LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - // case INT32: - // return new SparkParquetReaders.IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - // default: - // throw new UnsupportedOperationException( - // "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - // } - // case BSON: - // return new SparkParquetReaders.BytesReader(desc); + case DECIMAL: + DecimalMetadata decimal = primitive.getDecimalMetadata(); + switch (primitive.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return new VectorizedParquetValueReaders.BinaryDecimalReader(desc, icebergField, rootAllocator, + decimal.getPrecision(), + decimal.getScale()); + case INT64: + return new VectorizedParquetValueReaders.LongDecimalReader(desc, icebergField, rootAllocator, + decimal.getPrecision(), + decimal.getScale()); + case INT32: + return new VectorizedParquetValueReaders.IntegerDecimalReader(desc, icebergField, rootAllocator, + decimal.getPrecision(), + decimal.getScale()); + default: + throw new UnsupportedOperationException( + "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); + } + case BSON: + return new VectorizedParquetValueReaders.BinaryReader(desc, icebergField, rootAllocator); default: throw new UnsupportedOperationException( "Unsupported logical type: " + primitive.getOriginalType()); diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index ef4b0f72c47b..94ad20bc33f7 100644 --- a/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -50,10 +50,10 @@ public abstract class AvroDataTest { required(110, "s", Types.StringType.get()), //required(111, "uuid", Types.UUIDType.get()), //required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()) - // required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - // required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - // required(116, "dec_38_10", 
Types.DecimalType.of(38, 10)) // spark's maximum precision + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision ); @Rule From 36da977b4a5152914595003ae5233b594f2d33d6 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Tue, 23 Jul 2019 14:48:36 -0700 Subject: [PATCH 12/22] [WIP] added remaining primitive type vectorization and tests --- .../iceberg/arrow/ArrowSchemaUtilTest.java | 142 ------------------ .../apache/iceberg/parquet/ParquetReader.java | 8 - .../vector/VectorizedParquetValueReaders.java | 8 +- .../vector/VectorizedSparkParquetReaders.java | 1 + .../apache/iceberg/spark/data/RandomData.java | 4 +- .../iceberg/spark/data/TestHelpers.java | 28 ++++ .../TestSparkParquetVectorizedReader.java | 17 ++- 7 files changed, 53 insertions(+), 155 deletions(-) diff --git a/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java index 8d205c4d4b5c..0ac5cc8e7323 100644 --- a/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java +++ b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java @@ -66,148 +66,6 @@ public class ArrowSchemaUtilTest { - @Test - public void testArrowWriting() { - - Schema iceberg = new Schema( - optional(0, "i", Types.IntegerType.get()), - optional(1, "b", BooleanType.get()), - required(2, "d", DoubleType.get()), - required(3, "s", StringType.get()), - optional(4, "d2", DateType.get()), - optional(5, "ts", TimestampType.withoutZone()) - ); - - org.apache.arrow.vector.types.pojo.Schema arrowSchema = ArrowSchemaUtil.convert(iceberg); - RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE); - - VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(arrowSchema, rootAllocator); - - // java.util.List fieldVectors = schemaRoot.getFieldVectors(); - - ColumnVector[] columns = new ColumnVector[arrowSchema.getFields().size()]; - int i=0; - for(Field field: arrowSchema.getFields()) { - - // create - FieldVector vec = field.createVector(rootAllocator); - ArrowFieldWriter fieldWriter = selectFieldWriter(vec); - - // int batchSize = 100; - // for (int i=0; i selectFieldWriter(ValueVector vec) { - - Field field = vec.getField(); - DataType dt = ArrowUtils.fromArrowField(field); - - if(dt instanceof IntegerType) { - - return new IntegerWriter(vec); - - } else if (dt instanceof org.apache.spark.sql.types.BooleanType) { - - return new BooleanWriter(vec); - } else { - - throw new UnsupportedOperationException("Unsupported data type: "+dt.catalogString()); - } - - } - - - private static abstract class ArrowFieldWriter { - - ValueVector vec; - String fieldName; - DataType dataType; - boolean isNullable; - int count = 0; - - ArrowFieldWriter(ValueVector vec) { - - this.vec = vec; - this.vec.allocateNew(); - } - - public void write(int ordinal, T data) { - - if(data == null) { - - setNull(ordinal); - } else { - - setValue(ordinal, data); - } - count++; - } - - public void finish() { - - vec.setValueCount(count); - } - - public void reset() { - - vec.setValueCount(0); - count = 0; - } - - abstract public void setNull(int ordinal); - - abstract public void setValue(int ordinal, T data); - } - - private static final class IntegerWriter extends ArrowFieldWriter{ - - - IntegerWriter(ValueVector vec) { - super(vec); - } - - @Override - public void setNull(int ordinal) { - 
((IntVector)vec).setNull(ordinal); - } - - @Override - public void setValue(int ordinal, Integer data) { - ((IntVector)vec).setSafe(ordinal, data); - } - } - - - private static final class BooleanWriter extends ArrowFieldWriter{ - - - BooleanWriter(ValueVector vec) { - super(vec); - } - - @Override - public void setNull(int ordinal) { - ((BitVector)vec).setNull(ordinal); - } - - @Override - public void setValue(int ordinal, Boolean data) { - ((BitVector)vec).set(ordinal, data ? 1 : 0); - } - } - - - @Test public void convertPrimitive() { Schema iceberg = new Schema( diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index 9928a4da0beb..9f1e5fbdc648 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -202,15 +202,7 @@ public Iterator iterator() { FileIterator iter = new FileIterator<>(init()); addCloseable(iter); - // if(maxRecordsPerBatch == 0) { - LOG.info("[ParquetReader] => Return regular iterator. No batching."); - System.out.println("[ParquetReader] => Return regular iterator. No batching."); return iter; - // } else { - // LOG.info("[ParquetReader] => Read into Arrow batches of " + maxRecordsPerBatch + " rows."); - // System.out.println("[ParquetReader] => Read into Arrow batches of " + maxRecordsPerBatch + " rows."); - // return arrowBatchAsInternalRow((Iterator) iter); - // } } private Iterator arrowBatchAsInternalRow(Iterator iter) { diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java index c1d4a75fdc60..93ebc4508fd6 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java @@ -19,6 +19,7 @@ import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.Type; import org.apache.spark.sql.types.Decimal; /*** @@ -47,15 +48,16 @@ public class VectorizedParquetValueReaders { public abstract static class VectorReader extends ParquetValueReaders.PrimitiveReader { protected FieldVector vec; + protected boolean isOptional; VectorReader(ColumnDescriptor desc, Types.NestedField icebergField, RootAllocator rootAlloc) { super(desc); - this.vec = ArrowSchemaUtil.convert(icebergField).createVector(rootAlloc); - System.out.println("=> icebergField : "+icebergField.type().typeId().name()+" , Field Vector Type : "+vec.getClass().getName()); + this.isOptional = desc.getPrimitiveType().isRepetition(Type.Repetition.OPTIONAL); + // System.out.println("=> icebergField : "+icebergField.type().typeId().name()+" , Field Vector Type : "+vec.getClass().getName()); } @Override @@ -67,7 +69,7 @@ public FieldVector read(FieldVector ignore) { while(column.hasNext()) { // Todo: this check works for flat schemas only // need to get max definition level to do proper check - if(column.currentDefinitionLevel() == 0) { + if(isOptional && column.currentDefinitionLevel() == 0) { // handle null column.nextNull(); nextNullAt(i); diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java index 
c1fce1a3fe3f..e1a5a1f97e4e 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java
@@ -111,6 +111,7 @@ public ParquetValueReader<?> primitive(org.apache.iceberg.types.Type.PrimitiveType
     int parquetFieldId = primitive.getId().intValue();
     ColumnDescriptor desc = parquetSchema.getColumnDescription(currentPath());
     Types.NestedField icebergField = tableIcebergSchema.findField(parquetFieldId);
+    // int fieldD = parquetSchema.getMaxDefinitionLevel(path(primitive.getName())) - 1;
     // Field field = ArrowSchemaUtil.convert(projectedIcebergSchema.findField(parquetFieldId));
     // FieldVector vec = field.createVector(rootAllocator);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
index 4b8952698bf8..080c0d1066d9 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java
@@ -55,7 +55,9 @@ public static List<Record> generateList(Schema schema, int numRecords, long seed) {
     RandomDataGenerator generator = new RandomDataGenerator(schema, seed);
     List<Record> records = Lists.newArrayListWithExpectedSize(numRecords);
     for (int i = 0; i < numRecords; i += 1) {
-      records.add((Record) TypeUtil.visit(schema, generator));
+      Record rec = (Record) TypeUtil.visit(schema, generator);
+      // System.out.println("Add record "+rec);
+      records.add(rec);
     }
     return records;
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
index c20bc67a69e1..335f8a840790 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
@@ -53,6 +53,7 @@
 import org.apache.spark.sql.types.MapType;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
 import org.apache.spark.unsafe.types.UTF8String;
 import org.junit.Assert;
 import scala.collection.Seq;
@@ -205,6 +206,33 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record rec, InternalRow row)
     }
   }

+  public static void assertEqualsUnsafe(Types.StructType struct, List<Record> expected, ColumnarBatch batch) {
+    List<Types.NestedField> fields = struct.fields();
+    for (int r=0; r<expected.size(); r++) {
+      InternalRow actualRow = batch.getRow(r);
+      Record rec = expected.get(r);
+      for (int i = 0; i < fields.size(); i++) {
+        Type fieldType = fields.get(i).type();
+        Object expectedValue = rec.get(i);
+        // System.out.println("=> Checking Row "+r+", field #"+i
+        //     + " , Field:"+ fields.get(i).name()
+        //     + " , optional:"+fields.get(i).isOptional()
+        //     + " , type:"+fieldType.typeId()
+        //     + " , expected:"+expectedValue);
+        if (actualRow.isNullAt(i)) {
+
+          Assert.assertTrue("Expect null", (expectedValue == null));
+        } else {
+          Object actualValue = actualRow.get(i, convert(fieldType));
+          assertEqualsUnsafe(fieldType, expectedValue, actualValue);
+        }
+      }
+    }
+  }
+
   private static void assertEqualsUnsafe(Types.ListType list, Collection<?> expected, ArrayData actual) {
     Type elementType = list.elementType();
     List<?> expectedElements = Lists.newArrayList(expected);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
index b56faa756c71..3f5536b882e8 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
@@ -2,6 +2,7 @@ import
java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.avro.generic.GenericData; @@ -18,6 +19,7 @@ import org.junit.Assume; import org.junit.Test; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; public class TestSparkParquetVectorizedReader extends AvroDataTest { @@ -49,11 +51,24 @@ protected void writeAndValidate(Schema schema) throws IOException { Iterator batches = batchReader.iterator(); int numRowsRead = 0; + int numExpectedRead = 0; while(batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - System.out.println("Batch read with "+batch.numRows()+" rows. Read "+numRowsRead+" till now."); + + List expectedBatch = new ArrayList<>(batch.numRows()); + for(int i = numExpectedRead; i < numExpectedRead+batch.numRows(); i++) { + expectedBatch.add(expected.get(i)); + } + + System.out.println("-> Check "+numExpectedRead+" - "+ (numExpectedRead+batch.numRows())); + assertEqualsUnsafe(schema.asStruct(), expectedBatch, batch); + + System.out.println("Batch read with "+batch.numRows()+" rows. Read "+numRowsRead+" till now. " + + "Expected batch "+expectedBatch.size()); + + numExpectedRead += batch.numRows(); } Assert.assertEquals(expected.size(), numRowsRead); From 7121ecdb83589c45f07d1bbe6512e897fd709139 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Tue, 23 Jul 2019 14:58:11 -0700 Subject: [PATCH 13/22] [WIP] unused imports fixed --- .../iceberg/arrow/ArrowSchemaUtilTest.java | 18 ------------------ .../vector/VectorizedParquetValueReaders.java | 19 +++++++++++++++++++ .../vector/VectorizedSparkParquetReaders.java | 19 +++++++++++++++++++ .../TestSparkParquetVectorizedReader.java | 2 +- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java index 0ac5cc8e7323..f42ebde64dfa 100644 --- a/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java +++ b/core/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java @@ -19,16 +19,6 @@ package org.apache.iceberg.arrow; - -import java.util.List; -import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.Float8Vector; -import org.apache.arrow.vector.IntVector; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VectorLoader; -import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.iceberg.Schema; @@ -42,14 +32,6 @@ import org.apache.iceberg.types.Types.MapType; import org.apache.iceberg.types.Types.StringType; import org.apache.iceberg.types.Types.TimestampType; -import org.apache.spark.sql.execution.arrow.ArrowUtils; -import org.apache.spark.sql.execution.arrow.ArrowWriter; -import org.apache.spark.sql.execution.arrow.BooleanWriter; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.*; -import org.apache.spark.sql.vectorized.ArrowColumnVector; -import org.apache.spark.sql.vectorized.ColumnVector; -import org.apache.spark.sql.vectorized.ColumnarBatch; import org.junit.Test; import static org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID.Bool; diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java 
b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java index 93ebc4508fd6..ad08b98cffe7 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.iceberg.spark.data.vector; import java.math.BigDecimal; diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java index e1a5a1f97e4e..91f69116baa7 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.iceberg.spark.data.vector; import com.google.common.collect.ImmutableList; diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java index 3f5536b882e8..fed41b499f59 100644 --- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java +++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java @@ -62,7 +62,7 @@ protected void writeAndValidate(Schema schema) throws IOException { expectedBatch.add(expected.get(i)); } - System.out.println("-> Check "+numExpectedRead+" - "+ (numExpectedRead+batch.numRows())); + // System.out.println("-> Check "+numExpectedRead+" - "+ (numExpectedRead+batch.numRows())); assertEqualsUnsafe(schema.asStruct(), expectedBatch, batch); System.out.println("Batch read with "+batch.numRows()+" rows. Read "+numRowsRead+" till now. 
" + From e2bdf33c947a5982a99dad4736105a4298345544 Mon Sep 17 00:00:00 2001 From: "Chen, Junjie" Date: Fri, 26 Jul 2019 05:25:52 +0800 Subject: [PATCH 14/22] Add argument validation to HadoopTables#create (#298) --- .../java/org/apache/iceberg/hadoop/HadoopTables.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java index 553faef93e94..e0de97dca262 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java @@ -19,6 +19,8 @@ package org.apache.iceberg.hadoop; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; import java.util.Map; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; @@ -69,19 +71,24 @@ public Table load(String location) { * location. * * @param schema iceberg schema used to create the table - * @param spec partition specification + * @param spec partitioning spec, if null the table will be unpartitioned + * @param properties a string map of table properties, initialized to empty if null * @param location a path URI (e.g. hdfs:///warehouse/my_table) * @return newly created table implementation */ @Override public Table create(Schema schema, PartitionSpec spec, Map properties, String location) { + Preconditions.checkNotNull(schema, "A table schema is required"); + TableOperations ops = newTableOps(location); if (ops.current() != null) { throw new AlreadyExistsException("Table already exists at location: " + location); } - TableMetadata metadata = TableMetadata.newTableMetadata(ops, schema, spec, location, properties); + Map tableProps = properties == null ? ImmutableMap.of() : properties; + PartitionSpec partitionSpec = spec == null ? 
PartitionSpec.unpartitioned() : spec; + TableMetadata metadata = TableMetadata.newTableMetadata(ops, schema, partitionSpec, location, tableProps); ops.commit(null, metadata); return new BaseTable(ops, location); From 1596d61d5aee12cb089affa7b9113ece74025e8d Mon Sep 17 00:00:00 2001 From: David Phillips Date: Thu, 25 Jul 2019 14:30:47 -0700 Subject: [PATCH 15/22] Install source JAR when running install target (#310) --- build.gradle | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build.gradle b/build.gradle index 282ce61d735f..fe08286d0494 100644 --- a/build.gradle +++ b/build.gradle @@ -64,6 +64,10 @@ subprojects { apply plugin: 'maven' // make pom files for deployment apply plugin: 'nebula.maven-base-publish' + artifacts { + archives sourceJar + } + compileJava { options.encoding = "UTF-8" } From ceae2fd7c79ce1eaaaa2eed265481cfce6fdce16 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Thu, 25 Jul 2019 18:41:54 -0700 Subject: [PATCH 16/22] Bump version to 1.0-adobe-3.0-vectorized-SNAPSHOT --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index fb09611f628d..3cf0822db1af 100644 --- a/build.gradle +++ b/build.gradle @@ -48,7 +48,7 @@ allprojects { group = "org.apache.iceberg" apply plugin: 'com.palantir.baseline-idea' /* TODO - this assumes that the upstream apache version is 1.0 so we need to be consistent w/ upstream changes */ - version = "1.0-adobe-3.0-SNAPSHOT" + version = "1.0-adobe-3.0-vectorized-SNAPSHOT" repositories { maven { url "http://palantir.bintray.com/releases" } mavenCentral() From 83b94e6202b71ed766c338f0b514911fbd89da14 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Thu, 25 Jul 2019 19:05:16 -0700 Subject: [PATCH 17/22] Temporarily ignore applying style check --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 3cf0822db1af..8ff5497753b7 100644 --- a/build.gradle +++ b/build.gradle @@ -178,7 +178,7 @@ configure(baselineProjects) { // Thus we concede to applying all of the Baseline plugins individually on all the projects we are // ready to enforce linting on. apply plugin: 'org.inferred.processors' - apply plugin: 'com.palantir.baseline-checkstyle' + // apply plugin: 'com.palantir.baseline-checkstyle' apply plugin: 'com.palantir.baseline-error-prone' apply plugin: 'com.palantir.baseline-scalastyle' apply plugin: 'com.palantir.baseline-class-uniqueness' From 3b8c43a3d007358ed76a87baf07c15015e8b9b55 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Thu, 25 Jul 2019 19:20:23 -0700 Subject: [PATCH 18/22] Fixing javadoc error --- .../vector/VectorizedParquetValueReaders.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java index ad08b98cffe7..b01d862a48c4 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java @@ -50,17 +50,17 @@ * - Creates One Vector per RowGroup. So a Batch would have as many rows as there are in the underlying RowGroup. 
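* - Illustrative usage sketch (not part of this patch; "longReader" is a hypothetical
*   LongReader instance): a LONG column surfaces as an Arrow BigIntVector, and a single
*   read() call materializes the current row group:
*
*     BigIntVector vec = (BigIntVector) longReader.read(null);  // the argument is ignored
*     long first = vec.get(0);                                  // getValueCount() == rows in the row group
*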
* - Mapping of Iceberg type to Arrow type is done in ArrowSchemaUtil.convert() * - Iceberg to Arrow Type mapping : - * icebergType : LONG -> Field Vector Type : org.apache.arrow.vector.BigIntVector - * icebergType : STRING -> Field Vector Type : org.apache.arrow.vector.VarCharVector - * icebergType : BOOLEAN -> Field Vector Type : org.apache.arrow.vector.BitVector - * icebergType : INTEGER -> Field Vector Type : org.apache.arrow.vector.IntVector - * icebergType : FLOAT -> Field Vector Type : org.apache.arrow.vector.Float4Vector - * icebergType : DOUBLE -> Field Vector Type : org.apache.arrow.vector.Float8Vector - * icebergType : DATE -> Field Vector Type : org.apache.arrow.vector.DateDayVector - * icebergType : TIMESTAMP -> Field Vector Type : org.apache.arrow.vector.TimeStampMicroTZVector - * icebergType : STRING -> Field Vector Type : org.apache.arrow.vector.VarCharVector - * icebergType : BINARY -> Field Vector Type : org.apache.arrow.vector.VarBinaryVector - * icebergField : DECIMAL -> Field Vector Type : org.apache.arrow.vector.DecimalVector + * icebergType : LONG - Field Vector Type : org.apache.arrow.vector.BigIntVector + * icebergType : STRING - Field Vector Type : org.apache.arrow.vector.VarCharVector + * icebergType : BOOLEAN - Field Vector Type : org.apache.arrow.vector.BitVector + * icebergType : INTEGER - Field Vector Type : org.apache.arrow.vector.IntVector + * icebergType : FLOAT - Field Vector Type : org.apache.arrow.vector.Float4Vector + * icebergType : DOUBLE - Field Vector Type : org.apache.arrow.vector.Float8Vector + * icebergType : DATE - Field Vector Type : org.apache.arrow.vector.DateDayVector + * icebergType : TIMESTAMP - Field Vector Type : org.apache.arrow.vector.TimeStampMicroTZVector + * icebergType : STRING - Field Vector Type : org.apache.arrow.vector.VarCharVector + * icebergType : BINARY - Field Vector Type : org.apache.arrow.vector.VarBinaryVector + * icebergField : DECIMAL - Field Vector Type : org.apache.arrow.vector.DecimalVector */ public class VectorizedParquetValueReaders { From c01cb7153bb8f3223d996488919f6390acbd0570 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Fri, 26 Jul 2019 09:55:45 -0700 Subject: [PATCH 19/22] Updating versions.lock --- versions.lock | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/versions.lock b/versions.lock index b7e574d1bc60..2f314a21729b 100644 --- a/versions.lock +++ b/versions.lock @@ -7,16 +7,17 @@ com.carrotsearch:hppc:0.7.2 (1 constraints: f70cda14) com.clearspring.analytics:stream:2.7.0 (1 constraints: 1a0dd136) com.esotericsoftware:kryo-shaded:4.0.2 (2 constraints: b71345a6) com.esotericsoftware:minlog:1.3.0 (1 constraints: 670e7c4f) -com.fasterxml.jackson.core:jackson-annotations:2.7.9 (4 constraints: f24786bf) +com.fasterxml.jackson.core:jackson-annotations:2.7.9 (5 constraints: f154e19f) com.fasterxml.jackson.core:jackson-core:2.7.9 (5 constraints: d748db55) -com.fasterxml.jackson.core:jackson-databind:2.7.9 (8 constraints: a77bca51) +com.fasterxml.jackson.core:jackson-databind:2.7.9 (9 constraints: a688bc53) com.fasterxml.jackson.module:jackson-module-paranamer:2.7.9 (1 constraints: e0154200) com.fasterxml.jackson.module:jackson-module-scala_2.11:2.7.9 (1 constraints: 7f0da251) com.github.ben-manes.caffeine:caffeine:2.7.0 (1 constraints: 0b050a36) com.github.luben:zstd-jni:1.3.2-2 (1 constraints: 760d7c51) -com.google.code.findbugs:jsr305:3.0.2 (10 constraints: c483db75) +com.google.code.findbugs:jsr305:3.0.2 (9 constraints: d276cf3c) com.google.code.gson:gson:2.2.4 (1 
constraints: 8c0d3f2f) com.google.errorprone:error_prone_annotations:2.3.3 (2 constraints: 161a2544) +com.google.flatbuffers:flatbuffers-java:1.9.0 (2 constraints: e5199714) com.google.guava:failureaccess:1.0.1 (1 constraints: 140ae1b4) com.google.guava:guava:28.0-jre (23 constraints: cc5c2ea0) com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava (1 constraints: bd17c918) @@ -40,7 +41,6 @@ com.twitter:chill-java:0.9.3 (2 constraints: a716716f) com.twitter:chill_2.11:0.9.3 (2 constraints: 121b92c3) com.twitter:parquet-hadoop-bundle:1.6.0 (3 constraints: 7c262424) com.univocity:univocity-parsers:2.7.3 (1 constraints: c40ccb27) -com.vlkan:flatbuffers:1.2.0-3f79e055 (2 constraints: 411e1dee) commons-beanutils:commons-beanutils:1.7.0 (1 constraints: da0e635f) commons-beanutils:commons-beanutils-core:1.8.0 (1 constraints: 1d134124) commons-cli:commons-cli:1.2 (8 constraints: 9467c282) @@ -66,6 +66,7 @@ io.dropwizard.metrics:metrics-json:3.1.5 (1 constraints: 1a0dc936) io.dropwizard.metrics:metrics-jvm:3.1.5 (1 constraints: 1a0dc936) io.netty:netty:3.9.9.Final (9 constraints: 9eb0396d) io.netty:netty-all:4.1.17.Final (3 constraints: d2312526) +it.unimi.dsi:fastutil:7.0.13 (1 constraints: fc0d4043) javax.activation:activation:1.1.1 (1 constraints: 140dbb36) javax.annotation:javax.annotation-api:1.2 (2 constraints: 2d21193d) javax.inject:javax.inject:1 (4 constraints: 852d0c1a) @@ -94,10 +95,10 @@ org.antlr:antlr4-runtime:4.7 (1 constraints: 7a0e125f) org.antlr:stringtemplate:3.2.1 (1 constraints: c10a3bc6) org.apache.ant:ant:1.9.1 (3 constraints: a721ed14) org.apache.ant:ant-launcher:1.9.1 (1 constraints: 69082485) -org.apache.arrow:arrow-format:0.10.0 (1 constraints: 1f0de721) -org.apache.arrow:arrow-memory:0.10.0 (1 constraints: 1f0de721) -org.apache.arrow:arrow-vector:0.10.0 (1 constraints: e90c9734) -org.apache.avro:avro:1.8.2 (4 constraints: 3d2eebf3) +org.apache.arrow:arrow-format:0.12.0 (1 constraints: 210ded21) +org.apache.arrow:arrow-memory:0.12.0 (1 constraints: 210ded21) +org.apache.arrow:arrow-vector:0.12.0 (2 constraints: 1d122345) +org.apache.avro:avro:1.8.2 (5 constraints: 083cf387) org.apache.avro:avro-ipc:1.8.2 (1 constraints: f90b5bf4) org.apache.avro:avro-mapred:1.8.2 (2 constraints: 3a1a4787) org.apache.calcite:calcite-avatica:1.2.0-incubating (4 constraints: a044b922) From 5f63918166209e515df85dcb7820d29460a11969 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Fri, 26 Jul 2019 10:35:18 -0700 Subject: [PATCH 20/22] fixed checkstyle errors --- build.gradle | 2 +- .../spark/data/SparkParquetReaders.java | 7 +- .../vector/VectorizedParquetValueReaders.java | 159 +++++++++--------- .../vector/VectorizedSparkParquetReaders.java | 19 +-- .../iceberg/spark/source/IcebergSource.java | 3 - .../iceberg/spark/data/TestHelpers.java | 7 +- .../TestSparkParquetVectorizedReader.java | 38 +++-- 7 files changed, 127 insertions(+), 108 deletions(-) diff --git a/build.gradle b/build.gradle index 8ff5497753b7..3cf0822db1af 100644 --- a/build.gradle +++ b/build.gradle @@ -178,7 +178,7 @@ configure(baselineProjects) { // Thus we concede to applying all of the Baseline plugins individually on all the projects we are // ready to enforce linting on. 
apply plugin: 'org.inferred.processors' - // apply plugin: 'com.palantir.baseline-checkstyle' + apply plugin: 'com.palantir.baseline-checkstyle' apply plugin: 'com.palantir.baseline-error-prone' apply plugin: 'com.palantir.baseline-scalastyle' apply plugin: 'com.palantir.baseline-class-uniqueness' diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index bb9330577935..e8cc08359021 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -59,7 +59,6 @@ import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; @@ -113,12 +112,16 @@ public ParquetValueReader struct(Types.StructType ignored, GroupType struct, } private static class ReadBuilder extends TypeWithSchemaVisitor> { - protected final MessageType type; + private final MessageType type; ReadBuilder(MessageType type) { this.type = type; } + protected MessageType getType() { + return type; + } + @Override public ParquetValueReader message(Types.StructType expected, MessageType message, List> fieldReaders) { diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java index b01d862a48c4..3b67c96a85d6 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java @@ -50,24 +50,24 @@ * - Creates One Vector per RowGroup. So a Batch would have as many rows as there are in the underlying RowGroup. 
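* - Consumption sketch (assumed usage, mirroring TestSparkParquetVectorizedReader): a vector
*   produced by these readers can be handed to Spark by wrapping it in an ArrowColumnVector:
*
*     ColumnVector col = new ArrowColumnVector(fieldVector);
*     ColumnarBatch batch = new ColumnarBatch(new ColumnVector[] { col });
*     batch.setNumRows(fieldVector.getValueCount());
*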
* - Mapping of Iceberg type to Arrow type is done in ArrowSchemaUtil.convert() * - Iceberg to Arrow Type mapping : - * icebergType : LONG - Field Vector Type : org.apache.arrow.vector.BigIntVector - * icebergType : STRING - Field Vector Type : org.apache.arrow.vector.VarCharVector - * icebergType : BOOLEAN - Field Vector Type : org.apache.arrow.vector.BitVector - * icebergType : INTEGER - Field Vector Type : org.apache.arrow.vector.IntVector - * icebergType : FLOAT - Field Vector Type : org.apache.arrow.vector.Float4Vector - * icebergType : DOUBLE - Field Vector Type : org.apache.arrow.vector.Float8Vector - * icebergType : DATE - Field Vector Type : org.apache.arrow.vector.DateDayVector - * icebergType : TIMESTAMP - Field Vector Type : org.apache.arrow.vector.TimeStampMicroTZVector - * icebergType : STRING - Field Vector Type : org.apache.arrow.vector.VarCharVector - * icebergType : BINARY - Field Vector Type : org.apache.arrow.vector.VarBinaryVector - * icebergField : DECIMAL - Field Vector Type : org.apache.arrow.vector.DecimalVector + * icebergType : LONG - Field Vector Type : org.apache.arrow.vector.BigIntVector + * icebergType : STRING - Field Vector Type : org.apache.arrow.vector.VarCharVector + * icebergType : BOOLEAN - Field Vector Type : org.apache.arrow.vector.BitVector + * icebergType : INTEGER - Field Vector Type : org.apache.arrow.vector.IntVector + * icebergType : FLOAT - Field Vector Type : org.apache.arrow.vector.Float4Vector + * icebergType : DOUBLE - Field Vector Type : org.apache.arrow.vector.Float8Vector + * icebergType : DATE - Field Vector Type : org.apache.arrow.vector.DateDayVector + * icebergType : TIMESTAMP - Field Vector Type : org.apache.arrow.vector.TimeStampMicroTZVector + * icebergType : STRING - Field Vector Type : org.apache.arrow.vector.VarCharVector + * icebergType : BINARY - Field Vector Type : org.apache.arrow.vector.VarBinaryVector + * icebergField : DECIMAL - Field Vector Type : org.apache.arrow.vector.DecimalVector */ public class VectorizedParquetValueReaders { public abstract static class VectorReader extends ParquetValueReaders.PrimitiveReader { - protected FieldVector vec; - protected boolean isOptional; + private FieldVector vec; + private boolean isOptional; VectorReader(ColumnDescriptor desc, Types.NestedField icebergField, @@ -76,28 +76,35 @@ public abstract static class VectorReader extends ParquetValueReaders.PrimitiveR super(desc); this.vec = ArrowSchemaUtil.convert(icebergField).createVector(rootAlloc); this.isOptional = desc.getPrimitiveType().isRepetition(Type.Repetition.OPTIONAL); - // System.out.println("=> icebergField : "+icebergField.type().typeId().name()+" , Field Vector Type : "+vec.getClass().getName()); + } + + protected FieldVector getVector() { + return vec; + } + + protected boolean isOptional() { + return isOptional; } @Override public FieldVector read(FieldVector ignore) { vec.reset(); - int i=0; + int ordinal = 0; - while(column.hasNext()) { + while (column.hasNext()) { // Todo: this check works for flat schemas only // need to get max definition level to do proper check - if(isOptional && column.currentDefinitionLevel() == 0) { + if (isOptional && column.currentDefinitionLevel() == 0) { // handle null column.nextNull(); - nextNullAt(i); + nextNullAt(ordinal); } else { - nextValueAt(i); + nextValueAt(ordinal); } - i++; + ordinal++; } - vec.setValueCount(i); + vec.setValueCount(ordinal); return vec; } @@ -106,9 +113,9 @@ public int getRowCount() { return vec.getValueCount(); } - protected abstract void nextNullAt(int i); 
+ protected abstract void nextNullAt(int ordinal); - protected abstract void nextValueAt(int i); + protected abstract void nextValueAt(int ordinal); } protected static class StringReader extends VectorReader { @@ -118,21 +125,21 @@ protected static class StringReader extends VectorReader { } @Override - protected void nextNullAt(int i) { - ((VarCharVector) vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((VarCharVector) getVector()).setNull(ordinal); } @Override - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { Binary binary = column.nextBinary(); if (binary == null) { - ((VarCharVector) vec).setNull(i); + ((VarCharVector) getVector()).setNull(ordinal); } else { String utf8Str = binary.toStringUsingUTF8(); - ((VarCharVector) vec).setSafe(i, utf8Str.getBytes()); + ((VarCharVector) getVector()).setSafe(ordinal, utf8Str.getBytes()); } } @@ -148,14 +155,14 @@ protected static class IntegerReader extends VectorReader { } @Override - protected void nextNullAt(int i) { - ((IntVector) vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((IntVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { int intValue = column.nextInteger(); - ((IntVector)vec).setSafe(i, intValue); + ((IntVector) getVector()).setSafe(ordinal, intValue); } } @@ -169,14 +176,14 @@ protected static class LongReader extends VectorReader { super(desc, icebergField, rootAlloc); } - protected void nextNullAt(int i) { - ((BigIntVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((BigIntVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { long longValue = column.nextLong(); - ((BigIntVector)vec).setSafe(i, longValue); + ((BigIntVector) getVector()).setSafe(ordinal, longValue); } } @@ -189,10 +196,10 @@ protected static class TimestampMillisReader extends LongReader { super(desc, icebergField, rootAlloc); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { long longValue = column.nextLong(); - ((BigIntVector)vec).setSafe(i, 1000 * longValue); + ((BigIntVector) getVector()).setSafe(ordinal, 1000 * longValue); } } @@ -205,14 +212,14 @@ protected static class TimestampMicroReader extends VectorReader { super(desc, icebergField, rootAlloc); } - protected void nextNullAt(int i) { - ((TimeStampMicroTZVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((TimeStampMicroTZVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { long longValue = column.nextLong(); - ((TimeStampMicroTZVector)vec).setSafe(i, longValue); + ((TimeStampMicroTZVector) getVector()).setSafe(ordinal, longValue); } } @@ -225,20 +232,19 @@ protected static class BooleanReader extends VectorReader { super(desc, icebergField, rootAlloc); } - protected void nextNullAt(int i) { - ((BitVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((BitVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { boolean bool = column.nextBoolean(); - ((BitVector)vec).setSafe(i, bool ? 1 : 0); + ((BitVector) getVector()).setSafe(ordinal, bool ? 
1 : 0); } } - protected static class FloatReader extends VectorReader { FloatReader(ColumnDescriptor desc, @@ -247,14 +253,14 @@ protected static class FloatReader extends VectorReader { super(desc, icebergField, rootAlloc); } - protected void nextNullAt(int i) { - ((Float4Vector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((Float4Vector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { float floatValue = column.nextFloat(); - ((Float4Vector)vec).setSafe(i, floatValue); + ((Float4Vector) getVector()).setSafe(ordinal, floatValue); } } @@ -267,14 +273,14 @@ protected static class DoubleReader extends VectorReader { super(desc, icebergField, rootAlloc); } - protected void nextNullAt(int i) { - ((Float8Vector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((Float8Vector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { double doubleValue = column.nextDouble(); - ((Float8Vector)vec).setSafe(i, doubleValue); + ((Float8Vector) getVector()).setSafe(ordinal, doubleValue); } } @@ -288,20 +294,19 @@ protected static class BinaryReader extends VectorReader { super(desc, icebergField, rootAlloc); } - protected void nextNullAt(int i) { - ((VarBinaryVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((VarBinaryVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { Binary binaryValue = column.nextBinary(); - ((VarBinaryVector)vec).setSafe(i, binaryValue.getBytes()); + ((VarBinaryVector) getVector()).setSafe(ordinal, binaryValue.getBytes()); } } - protected static class DateReader extends VectorReader { DateReader(ColumnDescriptor desc, @@ -310,14 +315,14 @@ protected static class DateReader extends VectorReader { super(desc, icebergField, rootAlloc); } - protected void nextNullAt(int i) { - ((DateDayVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((DateDayVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { int dateValue = column.nextInteger(); - ((DateDayVector)vec).setSafe(i, dateValue); + ((DateDayVector) getVector()).setSafe(ordinal, dateValue); } } @@ -337,16 +342,16 @@ protected static class IntegerDecimalReader extends VectorReader { this.scale = scale; } - protected void nextNullAt(int i) { - ((DecimalVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((DecimalVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { int decimalIntValue = column.nextInteger(); Decimal decimalValue = Decimal.apply(decimalIntValue, precision, scale); - ((DecimalVector)vec).setSafe(i, decimalValue.toJavaBigDecimal()); + ((DecimalVector) getVector()).setSafe(ordinal, decimalValue.toJavaBigDecimal()); } } @@ -365,22 +370,20 @@ protected static class LongDecimalReader extends VectorReader { this.scale = scale; } - protected void nextNullAt(int i) { - ((DecimalVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((DecimalVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { long decimalLongValue = column.nextLong(); Decimal decimalValue = Decimal.apply(decimalLongValue, precision, scale); - ((DecimalVector)vec).setSafe(i, decimalValue.toJavaBigDecimal()); + ((DecimalVector) getVector()).setSafe(ordinal, 
decimalValue.toJavaBigDecimal()); } } - - protected static class BinaryDecimalReader extends VectorReader { private final int precision; private final int scale; @@ -395,16 +398,16 @@ protected static class BinaryDecimalReader extends VectorReader { this.scale = scale; } - protected void nextNullAt(int i) { - ((DecimalVector)vec).setNull(i); + protected void nextNullAt(int ordinal) { + ((DecimalVector) getVector()).setNull(ordinal); } - protected void nextValueAt(int i) { + protected void nextValueAt(int ordinal) { Binary binaryValue = column.nextBinary(); Decimal decimalValue = Decimal.fromDecimal(new BigDecimal(new BigInteger(binaryValue.getBytes()), scale)); - ((DecimalVector)vec).setSafe(i, decimalValue.toJavaBigDecimal()); + ((DecimalVector) getVector()).setSafe(ordinal, decimalValue.toJavaBigDecimal()); } } diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java index 91f69116baa7..f283cdabe083 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java @@ -43,6 +43,9 @@ public class VectorizedSparkParquetReaders { + private VectorizedSparkParquetReaders() { + } + @SuppressWarnings("unchecked") public static ParquetValueReader buildReader( Schema tableSchema, @@ -55,11 +58,11 @@ public static ParquetValueReader buildReader( } private static class ReadBuilder extends TypeWithSchemaVisitor> { - protected final MessageType parquetSchema; - protected final Schema projectedIcebergSchema; - protected final Schema tableIcebergSchema; - protected final org.apache.arrow.vector.types.pojo.Schema arrowSchema; - protected final RootAllocator rootAllocator; + private final MessageType parquetSchema; + private final Schema projectedIcebergSchema; + private final Schema tableIcebergSchema; + private final org.apache.arrow.vector.types.pojo.Schema arrowSchema; + private final RootAllocator rootAllocator; ReadBuilder(Schema tableSchema, Schema projectedIcebergSchema, MessageType parquetSchema) { this.parquetSchema = parquetSchema; @@ -94,7 +97,7 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, // readersById.put(id, (ParquetValueReader)ParquetValueReaders. 
// option(fieldType, fieldD, fieldReaders.get(i)));
-      readersById.put(id, (ParquetValueReader)fieldReaders.get(i));
+      readersById.put(id, (ParquetValueReader) fieldReaders.get(i));
       typesById.put(id, fieldType);
     }
@@ -239,8 +242,4 @@ protected String[] path(String name) {
       return path;
     }
   }
-
-
-
-
 }
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java
index b3edfc2827eb..8229b12e7ba8 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java
@@ -26,10 +26,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Table;
-import org.apache.iceberg.catalog.TableIdentifier;
 import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.hive.HiveCatalog;
-import org.apache.iceberg.hive.HiveCatalogs;
 import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.iceberg.types.CheckCompatibility;
 import org.apache.spark.sql.SaveMode;
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
index 335f8a840790..f5ee12489655 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java
@@ -65,7 +65,8 @@
 @SuppressWarnings("checkstyle:OverloadMethodsDeclarationOrder")
 public class TestHelpers {

-  private TestHelpers() {}
+  private TestHelpers() {
+  }

   public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row) {
     List<Types.NestedField> fields = struct.fields();
@@ -208,7 +209,7 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record rec, InternalRow row)
   public static void assertEqualsUnsafe(Types.StructType struct, List<Record> expected, ColumnarBatch batch) {
     List<Types.NestedField> fields = struct.fields();
-    for (int r=0; r<expected.size(); r++) {
+    for (int r = 0; r < expected.size(); r++) {
       InternalRow actualRow = batch.getRow(r);
       Record rec = expected.get(r);
       for (int i = 0; i < fields.size(); i++) {
@@ -222,7 +223,7 @@
         //     + " , expected:"+expectedValue);
         if (actualRow.isNullAt(i)) {

-          Assert.assertTrue("Expect null", (expectedValue == null));
+          Assert.assertTrue("Expect null", expectedValue == null);
         } else {
           Object actualValue = actualRow.get(i, convert(fieldType));
           assertEqualsUnsafe(fieldType, expectedValue, actualValue);
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
index fed41b499f59..da9a46681d6d 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 package org.apache.iceberg.spark.data;

 import java.io.File;
@@ -44,29 +63,29 @@ protected void writeAndValidate(Schema schema) throws IOException {
     }

-    try(CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
-        .project(schema)
-        .createReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, schema, type))
-        .build()) {
+    try (CloseableIterable<ColumnarBatch> batchReader = Parquet.read(Files.localInput(testFile))
+        .project(schema)
+        .createReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, schema, type))
+        .build()) {

       Iterator<ColumnarBatch> batches = batchReader.iterator();
       int numRowsRead = 0;
       int numExpectedRead = 0;
-      while(batches.hasNext()) {
+      while (batches.hasNext()) {
         ColumnarBatch batch = batches.next();
         numRowsRead += batch.numRows();

         List<Record> expectedBatch = new ArrayList<>(batch.numRows());
-        for(int i = numExpectedRead; i < numExpectedRead+batch.numRows(); i++) {
+        for (int i = numExpectedRead; i < numExpectedRead + batch.numRows(); i++) {
           expectedBatch.add(expected.get(i));
         }

         // System.out.println("-> Check "+numExpectedRead+" - "+ (numExpectedRead+batch.numRows()));
         assertEqualsUnsafe(schema.asStruct(), expectedBatch, batch);

-        System.out.println("Batch read with "+batch.numRows()+" rows. Read "+numRowsRead+" till now. " +
-            "Expected batch "+expectedBatch.size());
+        System.out.println("Batch read with " + batch.numRows() + " rows. Read " + numRowsRead + " till now. " +
+            "Expected batch " + expectedBatch.size());

         numExpectedRead += batch.numRows();
       }
@@ -76,9 +95,6 @@ protected void writeAndValidate(Schema schema) throws IOException {
     }
   }

-
-
-
   @Test
   public void testArray() throws IOException {
     System.out.println("Not Supported");

From bcf2e2d9bfe36e8b73d6ab4050aeaa112f45ad7d Mon Sep 17 00:00:00 2001
From: Gautam Kowshik
Date: Fri, 26 Jul 2019 10:43:43 -0700
Subject: [PATCH 21/22] Revert "Bump version to 1.0-adobe-3.0-vectorized-SNAPSHOT"

This reverts commit ceae2fd7c79ce1eaaaa2eed265481cfce6fdce16.
--- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 3cf0822db1af..fb09611f628d 100644 --- a/build.gradle +++ b/build.gradle @@ -48,7 +48,7 @@ allprojects { group = "org.apache.iceberg" apply plugin: 'com.palantir.baseline-idea' /* TODO - this assumes that the upstream apache version is 1.0 so we need to be consistent w/ upstream changes */ - version = "1.0-adobe-3.0-vectorized-SNAPSHOT" + version = "1.0-adobe-3.0-SNAPSHOT" repositories { maven { url "http://palantir.bintray.com/releases" } mavenCentral() From 515ef8516f14e989bb6d0efda851751655a974cf Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Fri, 26 Jul 2019 10:49:20 -0700 Subject: [PATCH 22/22] cleanup --- build.gradle | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/build.gradle b/build.gradle index fb09611f628d..71e8065095b2 100644 --- a/build.gradle +++ b/build.gradle @@ -47,8 +47,7 @@ if (JavaVersion.current() != JavaVersion.VERSION_1_8) { allprojects { group = "org.apache.iceberg" apply plugin: 'com.palantir.baseline-idea' - /* TODO - this assumes that the upstream apache version is 1.0 so we need to be consistent w/ upstream changes */ - version = "1.0-adobe-3.0-SNAPSHOT" + version = gitVersion() repositories { maven { url "http://palantir.bintray.com/releases" } mavenCentral() @@ -111,16 +110,6 @@ subprojects { } } } - repositories { - maven { - name 'Experience platform snapshot artifactory' - url 'https://artifactory.corp.adobe.com/artifactory/maven-experienceplatform-snapshot/' - credentials { - username = "${artifactory_user_p}" == "" ? System.getenv("ARTIFACTORY_USER") : "${artifactory_user_p}" - password = "${artifactory_key_p}" == "" ? System.getenv("ARTIFACTORY_API_TOKEN") : "${artifactory_key_p}" - } - } - } } }
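Note on the final cleanup: version = gitVersion() only resolves if a plugin that contributes a gitVersion() helper is applied to the root build. A minimal sketch, assuming the Palantir git-version plugin; the coordinates and version number below are assumptions for illustration, not taken from these patches:

    // root build.gradle -- hedged sketch; plugin version is illustrative
    buildscript {
      repositories { gradlePluginPortal() }
      dependencies {
        // assumed coordinates of the plugin that provides gitVersion()
        classpath 'com.palantir.gradle.gitversion:gradle-git-version:0.11.0'
      }
    }
    apply plugin: 'com.palantir.git-version'

    allprojects {
      // derives e.g. "1.0-adobe-3.0-12-gabc1234" from the most recent reachable git tag
      version = gitVersion()
    }

With this wiring, the hard-coded "1.0-adobe-3.0-SNAPSHOT" string no longer needs manual bumps; tagging a release in git is enough.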