diff --git a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java index feb3bc2460d5..1c7bb742f529 100644 --- a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java +++ b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java @@ -19,6 +19,7 @@ package org.apache.paimon.arrow.writer; import org.apache.paimon.arrow.ArrowUtils; +import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.DataGetters; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -39,6 +40,7 @@ import org.apache.paimon.data.columnar.ShortColumnVector; import org.apache.paimon.data.columnar.TimestampColumnVector; import org.apache.paimon.data.columnar.VectorizedColumnBatch; +import org.apache.paimon.memory.MemorySegment; import org.apache.paimon.utils.IntArrayList; import org.apache.arrow.vector.BigIntVector; @@ -94,7 +96,25 @@ protected void doWrite( @Override protected void doWrite(int rowIndex, DataGetters getters, int pos) { - ((VarCharVector) fieldVector).setSafe(rowIndex, getters.getString(pos).toBytes()); + BinaryString binaryString = getters.getString(pos); + MemorySegment[] segments = binaryString.getSegments(); + + // Very important performance optimization, which can avoid copying out new byte array + if (segments.length == 1) { + byte[] heapMemory = segments[0].getHeapMemory(); + if (heapMemory != null) { + ((VarCharVector) fieldVector) + .setSafe( + rowIndex, + heapMemory, + binaryString.getOffset(), + binaryString.getSizeInBytes()); + return; + } + } + + // Else copy new byte array + ((VarCharVector) fieldVector).setSafe(rowIndex, binaryString.toBytes()); } } diff --git a/paimon-benchmark/paimon-micro-benchmarks/pom.xml b/paimon-benchmark/paimon-micro-benchmarks/pom.xml index 319433f41ffe..ddac785a1e41 100644 --- a/paimon-benchmark/paimon-micro-benchmarks/pom.xml +++ b/paimon-benchmark/paimon-micro-benchmarks/pom.xml @@ -146,6 +146,12 @@ under the License. ${project.version} + + org.apache.paimon + paimon-arrow + ${project.version} + + diff --git a/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/arrow/ArrowWriteBenchmark.java b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/arrow/ArrowWriteBenchmark.java new file mode 100644 index 000000000000..8bdedbc0fdd8 --- /dev/null +++ b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/arrow/ArrowWriteBenchmark.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.benchmark.arrow; + +import org.apache.paimon.arrow.vector.ArrowFormatWriter; +import org.apache.paimon.benchmark.Benchmark; +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.Test; + +import java.util.concurrent.ThreadLocalRandom; + +/** Benchmark for arrow write. */ +public class ArrowWriteBenchmark { + + @Test + public void testWrite() { + int batch = 1024 * 20; + Benchmark benchmark = + new Benchmark("read", batch * batch) + .setNumWarmupIters(1) + .setOutputPerIteration(true); + RowType rowType = + RowType.of( + DataTypes.STRING(), + DataTypes.STRING(), + DataTypes.STRING(), + DataTypes.STRING()); + ThreadLocalRandom rnd = ThreadLocalRandom.current(); + byte[] bytes = new byte[100]; + rnd.nextBytes(bytes); + BinaryString binaryString = BinaryString.fromBytes(bytes, 1, 99); + GenericRow row = GenericRow.of(binaryString, binaryString, binaryString, binaryString); + benchmark.addCase( + "write", + 1, + () -> { + try (ArrowFormatWriter writer = new ArrowFormatWriter(rowType, batch, true)) { + for (int i = 0; i < batch; i++) { + writer.reset(); + for (int j = 0; j < batch; j++) { + writer.write(row); + } + } + } + }); + benchmark.run(); + } +}