From eb2fecc2599d5bcc5b86593a40923b381d3e0312 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Tue, 20 Sep 2022 14:41:44 -0500 Subject: [PATCH 1/4] Read CSV files using org.apache.arrow.dataset.jni.NativeDatasetFactory --- java/dataset/src/main/cpp/jni_wrapper.cc | 2 + .../apache/arrow/dataset/file/FileFormat.java | 1 + .../apache/arrow/dataset/CsvWriteSupport.java | 52 +++++++++++++++++++ .../dataset/file/TestFileSystemDataset.java | 25 +++++++++ 4 files changed, 80 insertions(+) create mode 100644 java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index ef9178b1b5d..950ed986155 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -93,6 +93,8 @@ arrow::Result> GetFileFormat( return std::make_shared(); case 2: return std::make_shared(); + case 3: + return std::make_shared(); default: std::string error_message = "illegal file format id: " + std::to_string(file_format_id); diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java index b428b254b10..aad4fa5f2af 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java @@ -24,6 +24,7 @@ public enum FileFormat { PARQUET(0), ARROW_IPC(1), ORC(2), + CSV(3), NONE(-1); private final int id; diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java b/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java new file mode 100644 index 00000000000..6473c0de673 --- /dev/null +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.dataset; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Random; + +public class CsvWriteSupport { + private final String path; + private final String uri; + private final Random random = new Random(); + + public CsvWriteSupport(File outputFolder) { + path = outputFolder.getPath() + File.separator + "generated-" + random.nextLong() + ".csv"; + uri = "file://" + path; + } + + public static CsvWriteSupport writeTempFile(File outputFolder, String... values) + throws IOException, URISyntaxException { + CsvWriteSupport writer = new CsvWriteSupport(outputFolder); + File csvOutputFile = new File(new URI(writer.uri)); + try (FileWriter addValues = new FileWriter(csvOutputFile, true)) { + for (Object value : values) { + addValues.write(value + "\n"); + } + } + return writer; + } + + public String getOutputURI() { + return uri; + } +} diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java index b8d51a3edb1..3375bada13d 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java @@ -37,6 +37,7 @@ import java.util.concurrent.Executors; import java.util.stream.Collectors; +import org.apache.arrow.dataset.CsvWriteSupport; import org.apache.arrow.dataset.OrcWriteSupport; import org.apache.arrow.dataset.ParquetWriteSupport; import org.apache.arrow.dataset.jni.NativeDataset; @@ -388,6 +389,30 @@ public void testBaseOrcRead() throws Exception { AutoCloseables.close(factory); } + @Test + public void testBaseCsvRead() throws Exception { + CsvWriteSupport writeSupport = CsvWriteSupport.writeTempFile( + TMP.newFolder(), "Name,Language", "Juno,Java", "Peter,Python", "Celin,C++"); + String expectedJsonUnordered = "[[\"Juno\", \"Java\"], [\"Peter\", \"Python\"], [\"Celin\", \"C++\"]]"; + FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(), + FileFormat.CSV, writeSupport.getOutputURI()); + + ScanOptions options = new ScanOptions(100); + Schema schema = inferResultSchemaFromFactory(factory, options); + List datum = collectResultFromFactory(factory, options); + + System.out.println(schema); + assertSingleTaskProduced(factory, options); + assertEquals(1, datum.size()); + assertEquals(2, schema.getFields().size()); + assertEquals("Name", schema.getFields().get(0).getName()); + + checkParquetReadResult(schema, expectedJsonUnordered, datum); + + AutoCloseables.close(datum); + AutoCloseables.close(factory); + } + private void checkParquetReadResult(Schema schema, String expectedJson, List actual) throws IOException { final ObjectMapper json = new ObjectMapper(); From 19ca9e5b382644d8543210f54c336130ad98822d Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Thu, 22 Sep 2022 13:27:31 -0500 Subject: [PATCH 2/4] Create URI with scheme and ssp --- .../org/apache/arrow/dataset/CsvWriteSupport.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java b/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java index 6473c0de673..954408ce25e 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java @@ -25,20 +25,17 @@ import java.util.Random; public class CsvWriteSupport { - private final String path; - private final String uri; + private final URI uri; private final Random random = new Random(); - public CsvWriteSupport(File outputFolder) { - path = outputFolder.getPath() + File.separator + "generated-" + random.nextLong() + ".csv"; - uri = "file://" + path; + public CsvWriteSupport(File outputFolder) throws URISyntaxException { + uri = new URI("file", outputFolder.getPath() + File.separator + "generated-" + random.nextLong() + ".csv", null); } public static CsvWriteSupport writeTempFile(File outputFolder, String... values) - throws IOException, URISyntaxException { + throws URISyntaxException, IOException { CsvWriteSupport writer = new CsvWriteSupport(outputFolder); - File csvOutputFile = new File(new URI(writer.uri)); - try (FileWriter addValues = new FileWriter(csvOutputFile, true)) { + try (FileWriter addValues = new FileWriter(new File(writer.uri), true)) { for (Object value : values) { addValues.write(value + "\n"); } @@ -47,6 +44,6 @@ public static CsvWriteSupport writeTempFile(File outputFolder, String... values) } public String getOutputURI() { - return uri; + return uri.toString(); } } From 8b57022228c25bdc5ee6c6f7ef513b06ed60084f Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Tue, 4 Oct 2022 11:48:14 -0500 Subject: [PATCH 3/4] Solving error for JNI compilation failure --- .../org/apache/arrow/dataset/file/TestFileSystemDataset.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java index efd2b604ebd..16d0f96ca4d 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java @@ -375,7 +375,7 @@ public void testBaseCsvRead() throws Exception { List datum = collectResultFromFactory(factory, options); System.out.println(schema); - assertSingleTaskProduced(factory, options); + assertScanBatchesProduced(factory, options); assertEquals(1, datum.size()); assertEquals(2, schema.getFields().size()); assertEquals("Name", schema.getFields().get(0).getName()); From 98c6d1d2e94ec7bb29a2d7b224c18bf20b5eb5db Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Thu, 6 Oct 2022 17:39:43 -0500 Subject: [PATCH 4/4] Close resources try-with + Disable CSV when needed + Update Dataset build by maven --- java/dataset/src/main/cpp/jni_wrapper.cc | 4 +++ .../dataset/file/TestFileSystemDataset.java | 26 +++++++-------- java/pom.xml | 32 +++++++++++++------ 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 950ed986155..aa7d7670232 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -91,10 +91,14 @@ arrow::Result> GetFileFormat( return std::make_shared(); case 1: return std::make_shared(); +#ifdef ARROW_ORC case 2: return std::make_shared(); +#endif +#ifdef ARROW_CSV case 3: return std::make_shared(); +#endif default: std::string error_message = "illegal file format id: " + std::to_string(file_format_id); diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java index 16d0f96ca4d..b8a13937a8a 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java @@ -367,23 +367,23 @@ public void testBaseCsvRead() throws Exception { CsvWriteSupport writeSupport = CsvWriteSupport.writeTempFile( TMP.newFolder(), "Name,Language", "Juno,Java", "Peter,Python", "Celin,C++"); String expectedJsonUnordered = "[[\"Juno\", \"Java\"], [\"Peter\", \"Python\"], [\"Celin\", \"C++\"]]"; - FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(), - FileFormat.CSV, writeSupport.getOutputURI()); - ScanOptions options = new ScanOptions(100); - Schema schema = inferResultSchemaFromFactory(factory, options); - List datum = collectResultFromFactory(factory, options); + try ( + FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(), + FileFormat.CSV, writeSupport.getOutputURI()) + ) { + List datum = collectResultFromFactory(factory, options); + Schema schema = inferResultSchemaFromFactory(factory, options); - System.out.println(schema); - assertScanBatchesProduced(factory, options); - assertEquals(1, datum.size()); - assertEquals(2, schema.getFields().size()); - assertEquals("Name", schema.getFields().get(0).getName()); + assertScanBatchesProduced(factory, options); + assertEquals(1, datum.size()); + assertEquals(2, schema.getFields().size()); + assertEquals("Name", schema.getFields().get(0).getName()); - checkParquetReadResult(schema, expectedJsonUnordered, datum); + checkParquetReadResult(schema, expectedJsonUnordered, datum); - AutoCloseables.close(datum); - AutoCloseables.close(factory); + AutoCloseables.close(datum); + } } private void checkParquetReadResult(Schema schema, String expectedJson, List actual) diff --git a/java/pom.xml b/java/pom.xml index cdd8fa181de..15b06a02a38 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -916,7 +916,7 @@ generate-cdata-dylib_so - java-dist/lib + java-dist @@ -981,8 +981,18 @@ generate-jni-dylib_so - java-dist/lib + java-dist false + ON + ON + ON + OFF + OFF + OFF + ON + OFF + ON + OFF @@ -1015,16 +1025,16 @@ -S cpp -B cpp-jni -DARROW_BUILD_SHARED=OFF - -DARROW_CSV=ON + -DARROW_CSV=${ARROW_CSV} -DARROW_DATASET=ON -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON - -DARROW_GANDIVA=ON + -DARROW_GANDIVA=${ARROW_GANDIVA} -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON - -DARROW_ORC=ON - -DARROW_PARQUET=ON - -DARROW_PLASMA=ON + -DARROW_ORC=${ARROW_ORC} + -DARROW_PARQUET=${ARROW_PARQUET} + -DARROW_PLASMA=${ARROW_PLASMA} -DARROW_S3=ON -DARROW_USE_CCACHE=ON -DCMAKE_BUILD_TYPE=Release @@ -1062,12 +1072,16 @@ -S java -B java-jni - -DARROW_JAVA_JNI_ENABLE_C=OFF + -DARROW_JAVA_JNI_ENABLE_C=${ARROW_JAVA_JNI_ENABLE_C} + -DARROW_JAVA_JNI_ENABLE_DATASET=${ARROW_JAVA_JNI_ENABLE_DATASET} + -DARROW_JAVA_JNI_ENABLE_GANDIVA=${ARROW_JAVA_JNI_ENABLE_GANDIVA} + -DARROW_JAVA_JNI_ENABLE_ORC=${ARROW_JAVA_JNI_ENABLE_ORC} + -DARROW_JAVA_JNI_ENABLE_PLASMA=${ARROW_JAVA_JNI_ENABLE_PLASMA} -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_LIBDIR=lib - -DCMAKE_INSTALL_PREFIX=${arrow.c.jni.dist.dir} + -DCMAKE_INSTALL_PREFIX=${arrow.dataset.jni.dist.dir} -DCMAKE_PREFIX_PATH=${project.basedir}/../java-dist ../