-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-11776: [C++][Java] Support parquet write from ArrowReader to file #14151
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
58ebf06
9abbc15
e526daa
07345a2
f824d0e
759c9c2
88893e0
092baa2
434ad8a
4619774
951462f
fbc99b5
5f86c71
50abe7f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.arrow.dataset.file; | ||
|
|
||
| import org.apache.arrow.c.ArrowArrayStream; | ||
| import org.apache.arrow.c.Data; | ||
| import org.apache.arrow.memory.BufferAllocator; | ||
| import org.apache.arrow.vector.ipc.ArrowReader; | ||
|
|
||
| /** | ||
| * JNI-based utility to write datasets into files. It internally depends on C++ static method | ||
| * FileSystemDataset::Write. | ||
| */ | ||
| public class DatasetFileWriter { | ||
|
||
|
|
||
| /** | ||
| * Write the contents of an ArrowReader as a dataset. | ||
| * | ||
| * @param reader the datasource for writing | ||
| * @param format target file format | ||
| * @param uri target file uri | ||
| * @param maxPartitions maximum partitions to be included in written files | ||
| * @param partitionColumns columns used to partition output files. Empty to disable partitioning | ||
| * @param baseNameTemplate file name template used to make partitions. E.g. "dat_{i}", i is current partition | ||
| * ID around all written files. | ||
| */ | ||
| public static void write(BufferAllocator allocator, ArrowReader reader, FileFormat format, String uri, | ||
| String[] partitionColumns, int maxPartitions, String baseNameTemplate) { | ||
| try (final ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator)) { | ||
| Data.exportArrayStream(allocator, reader, stream); | ||
| JniWrapper.get().writeFromScannerToFile(stream.memoryAddress(), | ||
| format.id(), uri, partitionColumns, maxPartitions, baseNameTemplate); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Write the contents of an ArrowReader as a dataset, with default partitioning settings. | ||
| * | ||
| * @param reader the datasource for writing | ||
| * @param format target file format | ||
| * @param uri target file uri | ||
| */ | ||
| public static void write(BufferAllocator allocator, ArrowReader reader, FileFormat format, String uri) { | ||
| write(allocator, reader, format, uri, new String[0], 1024, "data_{i}"); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,124 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.arrow.dataset.scanner; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.Iterator; | ||
|
|
||
| import org.apache.arrow.memory.BufferAllocator; | ||
| import org.apache.arrow.vector.VectorLoader; | ||
| import org.apache.arrow.vector.VectorUnloader; | ||
| import org.apache.arrow.vector.ipc.ArrowReader; | ||
| import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; | ||
| import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; | ||
| import org.apache.arrow.vector.types.pojo.Schema; | ||
|
|
||
| /** | ||
| * An implementation of {@link ArrowReader} that reads | ||
| * the dataset from {@link Scanner}. | ||
| */ | ||
| public class ArrowScannerReader extends ArrowReader { | ||
| private final Scanner scanner; | ||
|
|
||
| private Iterator<? extends ScanTask> taskIterator; | ||
|
|
||
| private ScanTask currentTask = null; | ||
| private ArrowReader currentReader = null; | ||
|
|
||
| /** | ||
| * Constructs a scanner reader using a Scanner. | ||
| * | ||
| * @param scanner scanning data over dataset | ||
| * @param allocator to allocate new buffers | ||
| */ | ||
| public ArrowScannerReader(Scanner scanner, BufferAllocator allocator) { | ||
| super(allocator); | ||
| this.scanner = scanner; | ||
| this.taskIterator = scanner.scan().iterator(); | ||
| if (taskIterator.hasNext()) { | ||
| currentTask = taskIterator.next(); | ||
| currentReader = currentTask.execute(); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| protected void loadRecordBatch(ArrowRecordBatch batch) { | ||
| throw new UnsupportedOperationException(); | ||
| } | ||
|
|
||
| @Override | ||
| protected void loadDictionary(ArrowDictionaryBatch dictionaryBatch) { | ||
| throw new UnsupportedOperationException(); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean loadNextBatch() throws IOException { | ||
| if (currentReader == null) { | ||
| return false; | ||
| } | ||
| boolean result = currentReader.loadNextBatch(); | ||
|
|
||
| if (!result) { | ||
| try { | ||
| currentTask.close(); | ||
| currentReader.close(); | ||
| } catch (Exception e) { | ||
| throw new IOException(e); | ||
| } | ||
|
|
||
| while (!result) { | ||
| if (!taskIterator.hasNext()) { | ||
| return false; | ||
| } else { | ||
| currentTask = taskIterator.next(); | ||
| currentReader = currentTask.execute(); | ||
| result = currentReader.loadNextBatch(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| VectorLoader loader = new VectorLoader(this.getVectorSchemaRoot()); | ||
| VectorUnloader unloader = | ||
| new VectorUnloader(currentReader.getVectorSchemaRoot()); | ||
| try (ArrowRecordBatch recordBatch = unloader.getRecordBatch()) { | ||
| loader.load(recordBatch); | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| @Override | ||
| public long bytesRead() { | ||
| return 0L; | ||
| } | ||
|
|
||
| @Override | ||
| protected void closeReadSource() throws IOException { | ||
| try { | ||
| currentTask.close(); | ||
|
||
| currentReader.close(); | ||
| scanner.close(); | ||
JkSelf marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } catch (Exception e) { | ||
| throw new IOException(e); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| protected Schema readSchema() throws IOException { | ||
| return scanner.schema(); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.