apache · emkornfield · Mar 27, 2021 · Mar 27, 2021 · Mar 27, 2021 · Mar 27, 2021
diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py
@@ -137,10 +137,13 @@ def _gold_tests(self, gold_dir):
                 skip.add("Go")
                 skip.add("JS")
                 skip.add("Rust")
-            if name == 'zstd':
-                skip.add("Java")
+
+            # See https://github.com/apache/arrow/pull/9822 for how to
+            # disable specific compression type tests.
+
             if prefix == '4.0.0-shareddict':
                 skip.add("Go")
+
             yield datagen.File(name, None, None, skip=skip, path=out_path)
 
     def _run_test_cases(self, producer, consumer, case_runner,

diff --git a/docs/source/status.rst b/docs/source/status.rst
@@ -126,7 +126,7 @@ IPC Format
 +-----------------------------+-------+-------+-------+------------+-------+-------+-------+
 | Sparse tensors              | ✓     |       |       |            |       |       |       |
 +-----------------------------+-------+-------+-------+------------+-------+-------+-------+
-| Buffer compression          | ✓     |       |       |            |       |       | ✓     |
+| Buffer compression          | ✓     | ✓ (3) |       |            |       |       | ✓     |
 +-----------------------------+-------+-------+-------+------------+-------+-------+-------+
 | Endianness conversion       | ✓ (2) |       |       |            |       |       |       |
 +-----------------------------+-------+-------+-------+------------+-------+-------+-------+
@@ -139,6 +139,8 @@ Notes:
 
 * \(2) Data with non-native endianness can be byte-swapped automatically when reading.
 
+* \(3) LZ4 Codec currently is quite inefficient. ARROW-11901 tracks improving performance.
+
 .. seealso::
    The :ref:`format-ipc` specification.
 
@@ -221,7 +223,7 @@ Third-Party Data Formats
 +-----------------------------+---------+---------+-------+------------+-------+---------+-------+
 | ORC                         | R       |         |       |            |       |         |       |
 +-----------------------------+---------+---------+-------+------------+-------+---------+-------+
-| Parquet                     | R/W     |         |       |            |       | R/W (1) |       |
+| Parquet                     | R/W     | R (2)   |       |            |       | R/W (1) |       |
 +-----------------------------+---------+---------+-------+------------+-------+---------+-------+
 
 Notes:
@@ -231,3 +233,5 @@ Notes:
 * *W* = Write supported
 
 * \(1) Nested read/write not supported
+
+* \(2) Through JNI bindings to datasets.
diff --git a/java/compression/pom.xml b/java/compression/pom.xml
@@ -44,8 +44,9 @@
       <version>1.20</version>
     </dependency>
     <dependency>
-      <groupId>io.netty</groupId>
-      <artifactId>netty-common</artifactId>
-    </dependency>
+    <groupId>com.github.luben</groupId>
+    <artifactId>zstd-jni</artifactId>
+    <version>1.4.9-1</version>
+</dependency>
   </dependencies>
 </project>
diff --git a/java/compression/src/main/java/org/apache/arrow/compression/CommonsCompressionFactory.java b/java/compression/src/main/java/org/apache/arrow/compression/CommonsCompressionFactory.java
@@ -21,7 +21,9 @@
 import org.apache.arrow.vector.compression.CompressionUtil;
 
 /**
- * A factory implementation based on Apache Commons library.
+ * Default implementation of factory supported LZ4 and ZSTD compression.
+ *
+ * // TODO(ARROW-12115): Rename this class.
  */
 public class CommonsCompressionFactory implements CompressionCodec.Factory {
 
@@ -32,6 +34,8 @@ public CompressionCodec createCodec(CompressionUtil.CodecType codecType) {
     switch (codecType) {
       case LZ4_FRAME:
         return new Lz4CompressionCodec();
+      case ZSTD:
+        return new ZstdCompressionCodec();
       default:
         throw new IllegalArgumentException("Compression type not supported: " + codecType);
     }

diff --git a/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java b/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java
@@ -32,8 +32,6 @@
 import org.apache.commons.compress.compressors.lz4.FramedLZ4CompressorOutputStream;
 import org.apache.commons.compress.utils.IOUtils;
 
-import io.netty.util.internal.PlatformDependent;
-
 /**
  * Compression codec for the LZ4 algorithm.
  */
@@ -45,7 +43,7 @@ protected ArrowBuf doCompress(BufferAllocator allocator, ArrowBuf uncompressedBu
         "The uncompressed buffer size exceeds the integer limit %s.", Integer.MAX_VALUE);
 
     byte[] inBytes = new byte[(int) uncompressedBuffer.writerIndex()];
-    PlatformDependent.copyMemory(uncompressedBuffer.memoryAddress(), inBytes, 0, uncompressedBuffer.writerIndex());
+    uncompressedBuffer.getBytes(/*index=*/0, inBytes);
     ByteArrayOutputStream baos = new ByteArrayOutputStream();
     try (InputStream in = new ByteArrayInputStream(inBytes);
          OutputStream out = new FramedLZ4CompressorOutputStream(baos)) {
@@ -57,8 +55,7 @@ protected ArrowBuf doCompress(BufferAllocator allocator, ArrowBuf uncompressedBu
     byte[] outBytes = baos.toByteArray();
 
     ArrowBuf compressedBuffer = allocator.buffer(CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH + outBytes.length);
-    PlatformDependent.copyMemory(
-        outBytes, 0, compressedBuffer.memoryAddress() + CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH, outBytes.length);
+    compressedBuffer.setBytes(CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH, outBytes);
     compressedBuffer.writerIndex(CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH + outBytes.length);
     return compressedBuffer;
   }
@@ -71,8 +68,7 @@ protected ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBu
     long decompressedLength = readUncompressedLength(compressedBuffer);
 
     byte[] inBytes = new byte[(int) (compressedBuffer.writerIndex() - CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH)];
-    PlatformDependent.copyMemory(
-        compressedBuffer.memoryAddress() + CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH, inBytes, 0, inBytes.length);
+    compressedBuffer.getBytes(CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH, inBytes);
     ByteArrayOutputStream out = new ByteArrayOutputStream((int) decompressedLength);
     try (InputStream in = new FramedLZ4CompressorInputStream(new ByteArrayInputStream(inBytes))) {
       IOUtils.copy(in, out);
@@ -82,8 +78,7 @@ protected ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBu
 
     byte[] outBytes = out.toByteArray();
     ArrowBuf decompressedBuffer = allocator.buffer(outBytes.length);
-    PlatformDependent.copyMemory(outBytes, 0, decompressedBuffer.memoryAddress(), outBytes.length);
-    decompressedBuffer.writerIndex(decompressedLength);
+    decompressedBuffer.setBytes(/*index=*/0, outBytes);
     return decompressedBuffer;
   }
 

diff --git a/java/compression/src/main/java/org/apache/arrow/compression/ZstdCompressionCodec.java b/java/compression/src/main/java/org/apache/arrow/compression/ZstdCompressionCodec.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.compression;
+
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.compression.AbstractCompressionCodec;
+import org.apache.arrow.vector.compression.CompressionUtil;
+
+import com.github.luben.zstd.Zstd;
+
+/**
+ * Compression codec for the ZSTD algorithm.
+ */
+public class ZstdCompressionCodec extends AbstractCompressionCodec {
+
+  @Override
+  protected ArrowBuf doCompress(BufferAllocator allocator, ArrowBuf uncompressedBuffer) {
+    long maxSize = Zstd.compressBound(uncompressedBuffer.writerIndex());
+    long dstSize = CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH + maxSize;
+    ArrowBuf compressedBuffer = allocator.buffer(dstSize);
+    long bytesWritten = Zstd.compressUnsafe(
+                          compressedBuffer.memoryAddress() + CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH, dstSize,
+                          /*src*/uncompressedBuffer.memoryAddress(), /*srcSize=*/uncompressedBuffer.writerIndex(),
+                          /*level=*/3);
+    if (Zstd.isError(bytesWritten)) {
+      compressedBuffer.close();
+      throw new RuntimeException("Error compressing: " + Zstd.getErrorName(bytesWritten));
+    }
+    compressedBuffer.writerIndex(CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH + bytesWritten);
+    return compressedBuffer;
+  }
+
+  @Override
+  protected ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBuffer) {
+    long decompressedLength = readUncompressedLength(compressedBuffer);
+    ArrowBuf uncompressedBuffer = allocator.buffer(decompressedLength);
+    long decompressedSize = Zstd.decompressUnsafe(uncompressedBuffer.memoryAddress(), decompressedLength,
+          /*src=*/compressedBuffer.memoryAddress() + CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH,
+          compressedBuffer.writerIndex() - CompressionUtil.SIZE_OF_UNCOMPRESSED_LENGTH);
+    if (Zstd.isError(decompressedSize)) {
+      uncompressedBuffer.close();
+      throw new RuntimeException("Error decompressing: " + Zstd.getErrorName(decompressedLength));
+    }
+    if (decompressedLength != decompressedSize) {
+      uncompressedBuffer.close();
+      throw new RuntimeException("Expected != actual decompressed length: " + 
+                                 decompressedLength + " != " + decompressedSize);
+    }
+    uncompressedBuffer.writerIndex(decompressedLength);
+    return uncompressedBuffer;
+  }
+
+  @Override
+  public CompressionUtil.CodecType getCodecType() {
+    return CompressionUtil.CodecType.ZSTD;
+  }
+}
diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java
@@ -80,6 +80,10 @@ public static Collection<Object[]> getCodecs() {
 
       CompressionCodec lz4Codec = new Lz4CompressionCodec();
       params.add(new Object[]{lz4Codec.getCodecType(), len, lz4Codec});
+
+      CompressionCodec zstdCodec = new ZstdCompressionCodec();
+      params.add(new Object[]{zstdCodec.getCodecType(), len, zstdCodec});
+
     }
     return params;
   }