From 36582921994da107fdd5f74d5d7798a082caf3bc Mon Sep 17 00:00:00 2001
From: "igor.suhorukov" <igor.suhorukov@gmail.com>
Date: Thu, 25 Aug 2022 18:03:17 +0300
Subject: [PATCH 1/2] ARROW-17525: [Java] Read ORC files using
 org.apache.arrow.dataset.jni.NativeDatasetFactory

---
 java/dataset/pom.xml                          | 32 ++++++++++++++
 java/dataset/src/main/cpp/jni_wrapper.cc      |  2 +
 .../apache/arrow/dataset/file/FileFormat.java |  1 +
 .../dataset/file/TestFileSystemDataset.java   | 42 +++++++++++++++++++
 4 files changed, 77 insertions(+)

diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml
index 9eadf896888..4d5df9c45e7 100644
--- a/java/dataset/pom.xml
+++ b/java/dataset/pom.xml
@@ -109,6 +109,38 @@
       <artifactId>jackson-databind</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.arrow.orc</groupId>
+      <artifactId>arrow-orc</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-core</artifactId>
+      <version>1.7.6</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hive</groupId>
+      <artifactId>hive-storage-api</artifactId>
+      <version>2.8.1</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc
index d0881639034..ef9178b1b5d 100644
--- a/java/dataset/src/main/cpp/jni_wrapper.cc
+++ b/java/dataset/src/main/cpp/jni_wrapper.cc
@@ -91,6 +91,8 @@ arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>> GetFileFormat(
       return std::make_shared<arrow::dataset::ParquetFileFormat>();
     case 1:
       return std::make_shared<arrow::dataset::IpcFileFormat>();
+    case 2:
+      return std::make_shared<arrow::dataset::OrcFileFormat>();
     default:
       std::string error_message =
           "illegal file format id: " + std::to_string(file_format_id);
diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
index 343e458ce23..b428b254b10 100644
--- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
@@ -23,6 +23,7 @@
 public enum FileFormat {
   PARQUET(0),
   ARROW_IPC(1),
+  ORC(2),
   NONE(-1);
 
   private final int id;
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
index 2fd8a19bac1..daa159953ed 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
@@ -59,6 +59,13 @@
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.GenericRecordBuilder;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
 import org.junit.Assert;
 import org.junit.ClassRule;
 import org.junit.Test;
@@ -357,6 +364,41 @@ public void testBaseArrowIpcRead() throws Exception {
     AutoCloseables.close(factory);
   }
 
+  @Test
+  public void testBaseOrcRead() throws Exception {
+    String dataName = "test-orc";
+    String basePath = TMP.getRoot().getAbsolutePath();
+
+    TypeDescription orcSchema = TypeDescription.fromString("struct<ints:int>");
+    Writer writer = OrcFile.createWriter(new Path(basePath, dataName),
+        OrcFile.writerOptions(new Configuration()).setSchema(orcSchema));
+    VectorizedRowBatch batch = orcSchema.createRowBatch();
+    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[0];
+    longColumnVector.vector[0] = Integer.MIN_VALUE;
+    longColumnVector.vector[1] = Integer.MAX_VALUE;
+    batch.size = 2;
+    writer.addRowBatch(batch);
+    writer.close();
+
+    String orcDatasetUri = new File(basePath, dataName).toURI().toString();
+    FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
+        FileFormat.ORC, orcDatasetUri);
+    ScanOptions options = new ScanOptions(100);
+    Schema schema = inferResultSchemaFromFactory(factory, options);
+    List<ArrowRecordBatch> datum = collectResultFromFactory(factory, options);
+
+    assertSingleTaskProduced(factory, options);
+    assertEquals(1, datum.size());
+    assertEquals(1, schema.getFields().size());
+    assertEquals("ints", schema.getFields().get(0).getName());
+
+    String expectedJsonUnordered = "[[2147483647], [-2147483648]]";
+    checkParquetReadResult(schema, expectedJsonUnordered, datum);
+
+    AutoCloseables.close(datum);
+    AutoCloseables.close(factory);
+  }
+
   private void checkParquetReadResult(Schema schema, String expectedJson, List<ArrowRecordBatch> actual)
       throws IOException {
     final ObjectMapper json = new ObjectMapper();

From 25ac8233bc063c244ae3b6bc47d945bd8a6f90fc Mon Sep 17 00:00:00 2001
From: "igor.suhorukov" <igor.suhorukov@gmail.com>
Date: Wed, 7 Sep 2022 18:23:58 +0300
Subject: [PATCH 2/2] ARROW-17525: [Java] Read ORC files using
 org.apache.arrow.dataset.jni.NativeDatasetFactory

---
 .../apache/arrow/dataset/OrcWriteSupport.java | 42 +++++++++++++++++++
 .../dataset/file/TestFileSystemDataset.java   | 17 ++------
 2 files changed, 45 insertions(+), 14 deletions(-)
 create mode 100644 java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java

diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java b/java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java
new file mode 100644
index 00000000000..c49612995ee
--- /dev/null
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+
+public class OrcWriteSupport {
+  public static void writeTempFile(TypeDescription orcSchema, Path path, Integer[] values) throws IOException {
+    Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(new Configuration()).setSchema(orcSchema));
+    VectorizedRowBatch batch = orcSchema.createRowBatch();
+    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[0];
+    for (int idx = 0; idx < values.length; idx++) {
+      longColumnVector.vector[idx] = values[idx];
+    }
+    batch.size = values.length;
+    writer.addRowBatch(batch);
+    writer.close();
+  }
+}
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
index daa159953ed..b8d51a3edb1 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
@@ -37,6 +37,7 @@
 import java.util.concurrent.Executors;
 import java.util.stream.Collectors;
 
+import org.apache.arrow.dataset.OrcWriteSupport;
 import org.apache.arrow.dataset.ParquetWriteSupport;
 import org.apache.arrow.dataset.jni.NativeDataset;
 import org.apache.arrow.dataset.jni.NativeInstanceReleasedException;
@@ -59,13 +60,8 @@
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.GenericRecordBuilder;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
 import org.junit.Assert;
 import org.junit.ClassRule;
 import org.junit.Test;
@@ -370,15 +366,8 @@ public void testBaseOrcRead() throws Exception {
     String basePath = TMP.getRoot().getAbsolutePath();
 
     TypeDescription orcSchema = TypeDescription.fromString("struct<ints:int>");
-    Writer writer = OrcFile.createWriter(new Path(basePath, dataName),
-        OrcFile.writerOptions(new Configuration()).setSchema(orcSchema));
-    VectorizedRowBatch batch = orcSchema.createRowBatch();
-    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[0];
-    longColumnVector.vector[0] = Integer.MIN_VALUE;
-    longColumnVector.vector[1] = Integer.MAX_VALUE;
-    batch.size = 2;
-    writer.addRowBatch(batch);
-    writer.close();
+    Path path = new Path(basePath, dataName);
+    OrcWriteSupport.writeTempFile(orcSchema, path, new Integer[]{Integer.MIN_VALUE, Integer.MAX_VALUE});
 
     String orcDatasetUri = new File(basePath, dataName).toURI().toString();
     FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
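
For reference, reading an ORC file through the dataset module end to end mirrors the new test: construct a FileSystemDatasetFactory with FileFormat.ORC (which the JNI wrapper now maps, via file format id 2, to arrow::dataset::OrcFileFormat) and scan batches. Below is a minimal sketch, assuming the Scanner.scanBatches() API of the Arrow Java release this change targets; the URI and the OrcReadExample class name are illustrative and not part of the patch.

import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.dataset.scanner.ScanOptions;
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowReader;

public class OrcReadExample {
  public static void main(String[] args) throws Exception {
    String uri = "file:///tmp/example.orc"; // hypothetical input file
    ScanOptions options = new ScanOptions(/*batchSize=*/ 32768);
    try (BufferAllocator allocator = new RootAllocator();
         // FileFormat.ORC is forwarded to the JNI layer, where id 2 now
         // selects the native ORC file format.
         FileSystemDatasetFactory factory = new FileSystemDatasetFactory(
             allocator, NativeMemoryPool.getDefault(), FileFormat.ORC, uri);
         Dataset dataset = factory.finish();
         Scanner scanner = dataset.newScan(options);
         ArrowReader reader = scanner.scanBatches()) {
      // The reader owns a single VectorSchemaRoot that is refilled on each
      // loadNextBatch() call.
      VectorSchemaRoot root = reader.getVectorSchemaRoot();
      while (reader.loadNextBatch()) {
        System.out.print(root.contentToTSVString());
      }
    }
  }
}

The batch-size-only ScanOptions constructor is the same one the test exercises with new ScanOptions(100); 32768 is simply a more typical production batch size.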