From 77e52779beccfe4dcaa0e893429ae8e86f05ebf0 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Fri, 2 Aug 2024 11:09:01 +0000 Subject: [PATCH 1/4] fix TestFragmentScanOptions --- .../dataset/TestFragmentScanOptions.java | 66 ++++++++++++------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java index d5981905288..c48320593e8 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java @@ -51,6 +51,16 @@ public class TestFragmentScanOptions { + private CsvFragmentScanOptions create( + ArrowSchema cSchema, + Map convertOptionsMap, + Map readOptions, + Map parseOptions) { + CsvConvertOptions convertOptions = new CsvConvertOptions(convertOptionsMap); + convertOptions.setArrowSchema(cSchema); + return new CsvFragmentScanOptions(convertOptions, readOptions, parseOptions); + } + @Test public void testCsvConvertOptions() throws Exception { final Schema schema = @@ -63,24 +73,29 @@ public void testCsvConvertOptions() throws Exception { String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator); + ArrowSchema cSchema2 = ArrowSchema.allocateNew(allocator); CDataDictionaryProvider provider = new CDataDictionaryProvider()) { Data.exportSchema(allocator, schema, provider, cSchema); - CsvConvertOptions convertOptions = new CsvConvertOptions(ImmutableMap.of("delimiter", ";")); - convertOptions.setArrowSchema(cSchema); - CsvFragmentScanOptions fragmentScanOptions = - new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), ImmutableMap.of()); + Data.exportSchema(allocator, schema, provider, cSchema2); + CsvFragmentScanOptions fragmentScanOptions1 = + create(cSchema, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of("delimiter", ";")); + CsvFragmentScanOptions fragmentScanOptions2 = + create(cSchema2, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of("delimiter", ";")); ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) .columns(Optional.empty()) - .fragmentScanOptions(fragmentScanOptions) + .fragmentScanOptions(fragmentScanOptions1) .build(); try (DatasetFactory datasetFactory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + allocator, + NativeMemoryPool.getDefault(), + FileFormat.CSV, + path, + Optional.of(fragmentScanOptions2)); Dataset dataset = datasetFactory.finish(); Scanner scanner = dataset.newScan(options); ArrowReader reader = scanner.scanBatches()) { - assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { @@ -106,30 +121,38 @@ public void testCsvConvertOptionsDelimiterNotSet() throws Exception { String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator); + ArrowSchema cSchema2 = ArrowSchema.allocateNew(allocator); CDataDictionaryProvider provider = new CDataDictionaryProvider()) { Data.exportSchema(allocator, schema, provider, cSchema); - CsvConvertOptions convertOptions = new CsvConvertOptions(ImmutableMap.of()); - convertOptions.setArrowSchema(cSchema); - CsvFragmentScanOptions fragmentScanOptions = - new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), ImmutableMap.of()); + Data.exportSchema(allocator, schema, provider, cSchema2); + CsvFragmentScanOptions fragmentScanOptions1 = + create(cSchema, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of()); + CsvFragmentScanOptions fragmentScanOptions2 = + create(cSchema2, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of()); ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) .columns(Optional.empty()) - .fragmentScanOptions(fragmentScanOptions) + .fragmentScanOptions(fragmentScanOptions1) .build(); try (DatasetFactory datasetFactory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + allocator, + NativeMemoryPool.getDefault(), + FileFormat.CSV, + path, + Optional.of(fragmentScanOptions2)); Dataset dataset = datasetFactory.finish(); Scanner scanner = dataset.newScan(options); ArrowReader reader = scanner.scanBatches()) { - - assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { - final ValueIterableVector idVector = - (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id"); - assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(1, 2, 3)); + final ValueIterableVector idVector = + (ValueIterableVector) + reader.getVectorSchemaRoot().getVector("Id;Name;Language"); + assertThat( + idVector.getValueIterable(), + IsIterableContainingInOrder.contains( + new Text("1;Juno;Java"), new Text("2;Peter;Python"), new Text("3;Celin;C++"))); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(3, rowCount); @@ -157,13 +180,12 @@ public void testCsvConvertOptionsNoOption() throws Exception { assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { - final ValueIterableVector idVector = - (ValueIterableVector) - reader.getVectorSchemaRoot().getVector("Id;Name;Language"); + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id;Name;Language"); assertThat( idVector.getValueIterable(), IsIterableContainingInOrder.contains( - "1;Juno;Java\n" + "2;Peter;Python\n" + "3;Celin;C++")); + new Text("1;Juno;Java"), new Text("2;Peter;Python"), new Text("3;Celin;C++"))); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(3, rowCount); From 52871fd35932604659b7ca239e8901718cb1e590 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Tue, 13 Aug 2024 08:54:57 +0000 Subject: [PATCH 2/4] Fix parseChar return type --- java/dataset/src/main/cpp/jni_wrapper.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 63b8dd73f47..49cc85251c8 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -368,7 +368,7 @@ std::shared_ptr LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec inline bool ParseBool(const std::string& value) { return value == "true" ? true : false; } -inline bool ParseChar(const std::string& key, const std::string& value) { +inline char ParseChar(const std::string& key, const std::string& value) { if (value.size() != 1) { JniThrow("Option " + key + " should be a char, but is " + value); } From a1be096d98145f4efe314ce47cdbe8c04213b5ba Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Tue, 13 Aug 2024 10:13:08 +0000 Subject: [PATCH 3/4] fix test --- .../arrow/dataset/TestFragmentScanOptions.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java index c48320593e8..850e4f358e3 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java @@ -196,7 +196,10 @@ public void testCsvConvertOptionsNoOption() throws Exception { public void testCsvReadParseAndReadOptions() throws Exception { final Schema schema = new Schema( - Collections.singletonList(Field.nullable("Id;Name;Language", new ArrowType.Utf8())), + Arrays.asList( + Field.nullable("Id", new ArrowType.Int(64, true)), + Field.nullable("Name", new ArrowType.Utf8()), + Field.nullable("Language", new ArrowType.Utf8())), null); String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); @@ -224,12 +227,9 @@ public void testCsvReadParseAndReadOptions() throws Exception { assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { - final ValueIterableVector idVector = - (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id;Name;Language"); - assertThat( - idVector.getValueIterable(), - IsIterableContainingInOrder.contains( - new Text("2;Peter;Python"), new Text("3;Celin;C++"))); + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id"); + assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(2, 3)); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(2, rowCount); From 855a3afff529876d7425f50ad42ae01299e1ccf9 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Tue, 13 Aug 2024 11:06:15 +0000 Subject: [PATCH 4/4] fix --- .../java/org/apache/arrow/dataset/TestFragmentScanOptions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java index 850e4f358e3..ed6344f0f9c 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java @@ -229,7 +229,7 @@ public void testCsvReadParseAndReadOptions() throws Exception { while (reader.loadNextBatch()) { final ValueIterableVector idVector = (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id"); - assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(2, 3)); + assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(2L, 3L)); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(2, rowCount);