From 4a9e2c404b39ae11fb4932e3558cba4f1a2dccb9 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Tue, 26 Aug 2025 11:03:36 +0200 Subject: [PATCH 1/7] feat: add reusable Huffman decoder Introduce a generic Huffman decoder that supports canonical Huffman codes. This utility can be shared across algorithms such as DEFLATE, DEFLATE64, and BZip2. The decoder simplifies and centralizes Huffman code handling, reducing duplication and making the BZip2 implementation (and future codecs) easier to maintain. --- .../compressors/alg/HuffmanDecoder.java | 235 ++++++++++++++++++ .../compressors/alg/package-info.java | 23 ++ .../bzip2/BZip2CompressorInputStream.java | 190 +++----------- .../compressors/alg/HuffmanDecoderTest.java | 43 ++++ .../bzip2/BZip2CompressorInputStreamTest.java | 21 +- 5 files changed, 335 insertions(+), 177 deletions(-) create mode 100644 src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java create mode 100644 src/main/java/org/apache/commons/compress/compressors/alg/package-info.java create mode 100644 src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java diff --git a/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java new file mode 100644 index 00000000000..1c984c6c2b3 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.alg; + +import java.io.EOFException; +import java.io.IOException; +import java.util.Objects; + +import org.apache.commons.compress.utils.BitInputStream; + +/** + * Canonical Huffman decoder. + *

+ * This class builds decoding tables from an array of code lengths (one entry per symbol) + * and then decodes symbols from a {@link BitInputStream}. The code set is expected to be a + * complete prefix code; i.e., the code lengths must satisfy Kraft's equality. + *

+ * + *

Usage

+ *
{@code
+ * int[] codeLengths = ...; // length per symbol (0 => unused)
+ * int symbolCount = codeLengths.length;
+ * int maxLen = 15; // maximum non-zero code length in codeLengths
+ * HuffmanDecoder dec = new HuffmanDecoder(codeLengths, symbolCount, maxLen);
+ * int sym = dec.decodeSymbol(bitIn);
+ * }
+ * + *

Thread-safety

+ * Instances are immutable after construction and may be safely shared between threads. + */ +public final class HuffmanDecoder { + + /** + * Maximum code length supported by this implementation. + */ + private static final int MAX_SUPPORTED_CODE_LENGTH = 30; + + /** Minimum non-zero code length */ + private final int minLength; + + /** Maximum non-zero code length */ + private final int maxLength; + + /** + * Symbols in canonical order (by length, then by symbol). + */ + private final int[] sorted; + /** + * For each code length, the bias between code values and indices into the sorted symbol table. + */ + private final int[] bias; + /** + * For each code length, the largest left-justified code of that length. + */ + private final int[] limit; + + /** + * Constructs a decoder from canonical code lengths. + *

+ * The {@code codeLengths} array provides, for each symbol index {@code i} in + * {@code [0, codeLengthSize)}, the length (in bits) of that symbol's code. A value of + * {@code 0} marks an unused symbol. All non-zero lengths must be {@code <= maxCodeLength} and + * the set of lengths must satisfy Kraft's equality (i.e., define a complete prefix code). + *

+ * + * @param codeLengths code length per symbol; {@code 0} means the symbol is not used; not {@code null} + * @param codeLengthSize number of symbols to read from {@code codeLengths} (must be {@code > 0} + * and {@code <= codeLengths.length}) + * @param minCodeLength minimum allowed code length present in {@code codeLengths} + * @param maxCodeLength maximum allowed code length present in {@code codeLengths} + * @throws IllegalArgumentException if any argument is invalid + */ + public HuffmanDecoder( + final int[] codeLengths, final int codeLengthSize, final int minCodeLength, final int maxCodeLength) + throws IllegalArgumentException { + Objects.requireNonNull(codeLengths, "codeLengths"); + if (maxCodeLength > MAX_SUPPORTED_CODE_LENGTH) { + throw new IllegalArgumentException(String.format( + "maxCodeLength (%d) exceeds supported limit (%d)", maxCodeLength, MAX_SUPPORTED_CODE_LENGTH)); + } + if (codeLengthSize <= 0) { + throw new IllegalArgumentException(String.format("codeLengthSize must be > 0; was %d", codeLengthSize)); + } + if (codeLengths.length < codeLengthSize) { + throw new IllegalArgumentException(String.format( + "codeLengthSize (%d) exceeds codeLengths.length (%d)", codeLengthSize, codeLengths.length)); + } + // Validate and find min/max lengths + int min = maxCodeLength; + int max = minCodeLength; + + for (int i = 0; i < codeLengthSize; i++) { + final int len = codeLengths[i]; + if (len < minCodeLength || len > maxCodeLength) { + throw new IllegalArgumentException(String.format( + "invalid code length at symbol %d: %d (expected in [%d, %d])", + i, len, minCodeLength, maxCodeLength)); + } + if (len == 0) { + continue; // unused symbol + } + if (len < min) { + min = len; + } + if (len > max) { + max = len; + } + } + + this.minLength = min; + this.maxLength = max; + // Allocate outputs; we reuse them as scratch inside fillCodeTable + this.bias = new int[max + 1]; + this.limit = new int[max + 1]; + this.sorted = new int[codeLengthSize]; + + // Arrays are zero-initialized; no additional temps needed. + fillCodeTable(codeLengths, minLength, max, codeLengthSize, bias, limit, sorted); + } + + /** + * Returns the minimum code length (in bits) for this code set. + * + * @return minimum code length (in bits) + */ + public int getMinLength() { + return minLength; + } + + /** + * Returns the maximum code length (in bits) for this code set. + * + * @return maximum code length (in bits) + */ + public int getMaxLength() { + return maxLength; + } + + /** + * Build canonical decode tables. + */ + private static void fillCodeTable( + final int[] codeLengths, + final int minLen, + final int maxLen, + final int codeLengthSize, + final int[] bias, + final int[] limit, + final int[] sorted) { + + // 1) Histogram of code lengths + final int[] count = new int[maxLen + 1]; + for (int symbol = 0; symbol < codeLengthSize; symbol++) { + final int len = codeLengths[symbol]; + if (len == 0) { + continue; + } + count[codeLengths[symbol]]++; + } + + // 2) Generate starting offsets into sorted symbol table + // The offsets are biased by -1 to simplify code in the next step + final int[] offset = new int[maxLen + 1]; + offset[0] = -1; + for (int len = 1; len <= maxLen; len++) { + offset[len] = offset[len - 1] + count[len - 1]; + } + + // 3) Build table of symbols sorted by length, then by symbol + // Adjust offsets to point to the last element of each length + for (int symbol = 0; symbol < codeLengthSize; symbol++) { + final int len = codeLengths[symbol]; + sorted[++offset[len]] = symbol; + } + + // 4) Compute the largest left-justified code for each length + int firstCode = 0; + for (int len = minLen; len <= maxLen; len++) { + firstCode += count[len]; + limit[len] = firstCode - 1; + firstCode <<= 1; // prepare for next length + } + + // 5) Compute the bias for each length + for (int len = minLen; len <= maxLen; len++) { + bias[len] = limit[len] - offset[len]; + } + } + + /** + * Decodes one symbol from the input bitstream. + * + * @param in the source of bits (MSB-first) to read from + * @return the decoded symbol index + * @throws EOFException if the input ends in the middle of a Huffman codeword + * @throws IOException if an I/O error occurs while reading from {@code in} + */ + public int decodeSymbol(final BitInputStream in) throws IOException { + int len = minLength; + int code = readBitsFully(in, len); + while (len <= maxLength && code > limit[len]) { + final int b = readBitsFully(in, 1); + code = (code << 1) | b; + len++; + } + if (len > maxLength) { + throw new IOException("Invalid Huffman code: " + code); + } + return sorted[code - bias[len]]; + } + + private static int readBitsFully(final BitInputStream in, final int numBits) throws IOException { + final int code = (int) in.readBits(numBits); + if (code < 0) { + throw new EOFException("Truncated Huffman bitstream"); + } + return code; + } +} diff --git a/src/main/java/org/apache/commons/compress/compressors/alg/package-info.java b/src/main/java/org/apache/commons/compress/compressors/alg/package-info.java new file mode 100644 index 00000000000..4cf75057d69 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/alg/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Algorithms reused by various compressors. + * @since 1.29.0 + */ +package org.apache.commons.compress.compressors.alg; diff --git a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java index 5b65ffbee53..f4156441db4 100644 --- a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java +++ b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java @@ -31,6 +31,7 @@ import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.alg.HuffmanDecoder; import org.apache.commons.compress.utils.BitInputStream; import org.apache.commons.compress.utils.InputStreamStatistics; import org.apache.commons.io.input.CloseShieldInputStream; @@ -59,16 +60,16 @@ static final class Data { */ final int[] unzftab = new int[256]; // 1024 byte - // Needs indexes from 0 to MAX_CODE_LEN inclusive. - final int[][] limit = new int[N_GROUPS][MAX_CODE_LEN + 1]; - // Needs indexes from 0 to MAX_CODE_LEN + 1 inclusive. - final int[][] base = new int[N_GROUPS][MAX_CODE_LEN + 2]; - final int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte - final int[] minLens = new int[N_GROUPS]; // 24 byte - + /** + * Huffman decoding tables. + */ + final HuffmanDecoder[] huffmanDecoders = new HuffmanDecoder[N_GROUPS]; + /** + * Number of non-null {@link #huffmanDecoders}. + */ + int huffmanDecodersCount; final int[] cftab = new int[257]; // 1028 byte final char[] getAndMoveToFrontDecode_yy = new char[256]; // 512 byte - final char[][] temp_charArray2d = new char[N_GROUPS][MAX_ALPHA_SIZE]; // 3096 // byte final byte[] recvDecodingTables_pos = new byte[N_GROUPS]; // 6 byte // --------------- @@ -158,85 +159,6 @@ private static void checkBounds(final int checkVal, final int limitExclusive, fi } } - /** - * Builds the Huffman decoding tables for use by {@code recvDecodingTables()}. - * - * @param alphaSize the alphabet size, guaranteed by the caller to be in the range [2, 258] - * (RUNA, RUNB, 255 byte values, and EOB). - * @param nGroups the number of Huffman coding groups, guaranteed by the caller to be in the range [0, 6]. - * @param dataShadow the data structure into which the tables are built; requires - * {@code temp_charArray2d} to be initialized. - */ - static void createHuffmanDecodingTables(final int alphaSize, final int nGroups, final Data dataShadow) { - final char[][] len = dataShadow.temp_charArray2d; - final int[] minLens = dataShadow.minLens; - final int[][] limit = dataShadow.limit; - final int[][] base = dataShadow.base; - final int[][] perm = dataShadow.perm; - - for (int t = 0; t < nGroups; t++) { - final char[] len_t = len[t]; - int minLen = len_t[0]; - int maxLen = len_t[0]; - for (int i = 1; i < alphaSize; i++) { - final char lent = len_t[i]; - if (lent > maxLen) { - maxLen = lent; - } - if (lent < minLen) { - minLen = lent; - } - } - hbCreateDecodeTables(limit[t], base[t], perm[t], len[t], minLen, maxLen, alphaSize); - minLens[t] = minLen; - } - } - - /** - * Called by createHuffmanDecodingTables() exclusively. - * - * @param minLen minimum code length in the range [1, {@value MAX_CODE_LEN}] guaranteed by the caller. - * @param maxLen maximum code length in the range [1, {@value MAX_CODE_LEN}] guaranteed by the caller. - */ - private static void hbCreateDecodeTables(final int[] limit, final int[] base, final int[] perm, final char[] length, final int minLen, final int maxLen, - final int alphaSize) { - for (int i = minLen, pp = 0; i <= maxLen; i++) { - for (int j = 0; j < alphaSize; j++) { - if (length[j] == i) { - perm[pp++] = j; - } - } - } - // Ensure the arrays were not reused. - Arrays.fill(base, 0); - Arrays.fill(limit, minLen, maxLen + 1, 0); - // Compute histogram of code lengths, shifted by 1. - for (int i = 0; i < alphaSize; i++) { - final int len = length[i] + 1; - base[len]++; - } - // Compute cumulative counts: base[len] = # of codes with length < len. - // In other terms: base[len] = index of the first code in the `perm` table. - for (int len = 1; len < base.length; len++) { - base[len] += base[len - 1]; - } - // Compute the last code for each length. - int vec = 0; - for (int len = minLen; len <= maxLen; len++) { - // increment by the number of length `len` codes - vec += base[len + 1] - base[len]; - // vec is now the last code of length `len` + 1 - limit[len] = vec - 1; - vec <<= 1; - } - // Compute the bias between code value and table index. - // base[minLen] cannot be computed using this rule, since limit[minLen - 1] does not exist, - // but has already the correct value 0. - for (int len = minLen + 1; len <= maxLen; len++) { - base[len] = (limit[len - 1] + 1 << 1) - base[len]; - } - } - private static void makeMaps(final Data data) throws IOException { final boolean[] inUse = data.inUse; final byte[] seqToUnseq = data.seqToUnseq; @@ -334,27 +256,24 @@ static void recvDecodingTables(final BitInputStream bin, final Data dataShadow) selector[i] = tmp; } - final char[][] len = dataShadow.temp_charArray2d; - - /* Now the coding tables */ + /* Now the Huffman coding tables */ for (int t = 0; t < nGroups; t++) { + final int[] codeLengths = new int[alphaSize]; int curr = bsR(bin, 5); - final char[] len_t = len[t]; for (int i = 0; i < alphaSize; i++) { while (bsGetBit(bin)) { curr += bsGetBit(bin) ? -1 : 1; } - // Same condition as in bzip2 - if (curr < 1 || curr > MAX_CODE_LEN) { - throw new CompressorException( - "Corrupted input, code length value out of range [%d, %d]: %d", 1, MAX_CODE_LEN, curr); - } - len_t[i] = (char) curr; + codeLengths[i] = (char) curr; + } + try { + // Same limits as in the reference C implementation of bzip2 + dataShadow.huffmanDecoders[t] = new HuffmanDecoder(codeLengths, alphaSize, 1, MAX_CODE_LEN); + } catch (final IllegalArgumentException e) { + throw new CompressorException("Invalid Huffman data: " + e.getMessage(), e); } } - - // finally create the Huffman tables - createHuffmanDecodingTables(alphaSize, nGroups, dataShadow); + dataShadow.huffmanDecodersCount = nGroups; } // Variables used by setup* methods exclusively @@ -474,10 +393,6 @@ private void getAndMoveToFrontDecode() throws IOException { final byte[] selector = dataShadow.selector; final byte[] seqToUnseq = dataShadow.seqToUnseq; final char[] yy = dataShadow.getAndMoveToFrontDecode_yy; - final int[] minLens = dataShadow.minLens; - final int[][] limit = dataShadow.limit; - final int[][] base = dataShadow.base; - final int[][] perm = dataShadow.perm; final int limitLast = this.blockSize100k * 100000; /* * Setting up the unzftab entries here is not strictly necessary, but it does save having to do it later in a separate pass, and so saves a block's @@ -487,18 +402,15 @@ private void getAndMoveToFrontDecode() throws IOException { yy[i] = (char) i; unzftab[i] = 0; } - int groupNo = 0; int groupPos = G_SIZE - 1; final int eob = dataShadow.inUseCount + 1; - int nextSym = getAndMoveToFrontDecode0(); int lastShadow = -1; + + // Initialize group, selector and huffmanDecoder + int groupNo = 0; int zt = selector[groupNo] & 0xff; - // All arrays have the same length - checkBounds(zt, base.length, "zt"); - int[] base_zt = base[zt]; - int[] limit_zt = limit[zt]; - int[] perm_zt = perm[zt]; - int minLens_zt = minLens[zt]; + HuffmanDecoder currentDecoder = getHuffmanDecoder(dataShadow, zt); + int nextSym = currentDecoder.decodeSymbol(bin); while (nextSym != eob) { if (nextSym == RUNA || nextSym == RUNB) { int s = -1; @@ -514,25 +426,11 @@ private void getAndMoveToFrontDecode() throws IOException { groupPos = G_SIZE - 1; checkBounds(++groupNo, selector.length, "groupNo"); zt = selector[groupNo] & 0xff; - // All arrays have the same length - checkBounds(zt, base.length, "zt"); - base_zt = base[zt]; - limit_zt = limit[zt]; - perm_zt = perm[zt]; - minLens_zt = minLens[zt]; + currentDecoder = getHuffmanDecoder(dataShadow, zt); } else { groupPos--; } - int zn = minLens_zt; - checkBounds(zn, limit_zt.length, "zn"); - int zvec = bsR(bin, zn); - while (zvec > limit_zt[zn]) { - checkBounds(++zn, limit_zt.length, "zn"); - zvec = zvec << 1 | bsR(bin, 1); - } - final int tmp = zvec - base_zt[zn]; - checkBounds(tmp, perm_zt.length, "zvec"); - nextSym = perm_zt[tmp]; + nextSym = currentDecoder.decodeSymbol(bin); } checkBounds(s, this.data.ll8.length, "s"); final int yy0 = yy[0]; @@ -570,45 +468,19 @@ private void getAndMoveToFrontDecode() throws IOException { groupPos = G_SIZE - 1; checkBounds(++groupNo, selector.length, "groupNo"); zt = selector[groupNo] & 0xff; - // All arrays have the same length - checkBounds(zt, base.length, "zt"); - base_zt = base[zt]; - limit_zt = limit[zt]; - perm_zt = perm[zt]; - minLens_zt = minLens[zt]; + currentDecoder = getHuffmanDecoder(dataShadow, zt); } else { groupPos--; } - int zn = minLens_zt; - checkBounds(zn, limit_zt.length, "zn"); - int zvec = bsR(bin, zn); - while (zvec > limit_zt[zn]) { - checkBounds(++zn, limit_zt.length, "zn"); - zvec = zvec << 1 | bsR(bin, 1); - } - final int idx = zvec - base_zt[zn]; - checkBounds(idx, perm_zt.length, "zvec"); - nextSym = perm_zt[idx]; + nextSym = currentDecoder.decodeSymbol(bin); } } this.last = lastShadow; } - private int getAndMoveToFrontDecode0() throws IOException { - final Data dataShadow = this.data; - final int zt = dataShadow.selector[0] & 0xff; - checkBounds(zt, dataShadow.limit.length, "zt"); - final int[] limit_zt = dataShadow.limit[zt]; - int zn = dataShadow.minLens[zt]; - checkBounds(zn, limit_zt.length, "zn"); - int zvec = bsR(bin, zn); - while (zvec > limit_zt[zn]) { - checkBounds(++zn, limit_zt.length, "zn"); - zvec = zvec << 1 | bsR(bin, 1); - } - final int tmp = zvec - dataShadow.base[zt][zn]; - checkBounds(tmp, dataShadow.perm[zt].length, "zvec"); - return dataShadow.perm[zt][tmp]; + private static HuffmanDecoder getHuffmanDecoder(final Data dataShadow, final int zt) throws IOException { + checkBounds(zt, dataShadow.huffmanDecodersCount, "zt"); + return dataShadow.huffmanDecoders[zt]; } /** diff --git a/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java b/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java new file mode 100644 index 00000000000..2e7acff8d24 --- /dev/null +++ b/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.alg; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class HuffmanDecoderTest { + private static final int MIN_CODE_LEN = 1; + private static final int MAX_CODE_LEN = 20; + + @Test + void testCreateHuffmanDecodingTablesWithLargeAlphaSize() { + // Use a codeLengths array with length equal to MAX_ALPHA_SIZE (258) to test array bounds. + final int[] codeLengths = new int[258]; + for (int i = 0; i < codeLengths.length; i++) { + // Use all code lengths within valid range [1, 20] + codeLengths[i] = (char) ((i % MAX_CODE_LEN) + 1); + } + final HuffmanDecoder decoder = assertDoesNotThrow( + () -> new HuffmanDecoder(codeLengths, codeLengths.length, MIN_CODE_LEN, MAX_CODE_LEN), + "HuffmanDecoder constructor should not throw for valid codeLengths array of MAX_ALPHA_SIZE"); + assertEquals(decoder.getMinLength(), 1, "Minimum code length should be 1"); + } +} diff --git a/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java b/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java index 2c84fe18e19..a38e22d5174 100644 --- a/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java +++ b/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java @@ -103,22 +103,6 @@ private BitInputStream prepareDecodingTables(final int codeLength) { return new BitInputStream(new ByteArrayInputStream(stream.toByteArray()), ByteOrder.BIG_ENDIAN); } - @Test - void testCreateHuffmanDecodingTablesWithLargeAlphaSize() { - final Data data = new Data(1); - // Use a codeLengths array with length equal to MAX_ALPHA_SIZE (258) to test array bounds. - final char[] codeLengths = new char[258]; - for (int i = 0; i < codeLengths.length; i++) { - // Use all code lengths within valid range [1, 20] - codeLengths[i] = (char) ((i % MAX_CODE_LEN) + 1); - } - data.temp_charArray2d[0] = codeLengths; - assertDoesNotThrow( - () -> BZip2CompressorInputStream.createHuffmanDecodingTables(codeLengths.length, 1, data), - "createHuffmanDecodingTables should not throw for valid codeLengths array of MAX_ALPHA_SIZE"); - assertEquals(data.minLens[0], 1, "Minimum code length should be 1"); - } - @Test void testHbCreateDecodeTables() throws IOException { assertThrows(CompressorException.class, () -> new BZip2CompressorInputStream( @@ -202,8 +186,9 @@ void testRecvDecodingTablesWithValidCodeLength(final int codeLength) throws IOEx "Should accept code length " + codeLength + " within [" + MIN_CODE_LEN + ", " + MAX_CODE_LEN + "]"); // We encoded 2 Huffman groups; both minLens should equal the encoded codeLength - assertEquals(codeLength, data.minLens[0], "Group 0 min code length mismatch"); - assertEquals(codeLength, data.minLens[1], "Group 1 min code length mismatch"); + assertEquals(2, data.huffmanDecodersCount, "Expected 2 Huffman groups"); + assertEquals(codeLength, data.huffmanDecoders[0].getMinLength(), "Group 0 min code length mismatch"); + assertEquals(codeLength, data.huffmanDecoders[1].getMinLength(), "Group 1 min code length mismatch"); } } From b12da143a004f7b3ef8f6a67879e01f40878b28e Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Fri, 29 Aug 2025 21:58:06 +0200 Subject: [PATCH 2/7] feat(deflate64): use shared `HuffmanDecoder` and add bit-order support * Switch DEFLATE64 to the common `HuffmanDecoder` (remove custom tree). * Expose bit order via `BitInputStream#getByteOrder()`. * Make `HuffmanDecoder` handle MSB (BZip2) and LSB (Deflate64). * Add tests for zero-length codes and cross-byte decoding in both orders. --- .../compressors/alg/HuffmanDecoder.java | 54 +++++-- .../Deflate64CompressorInputStream.java | 6 +- ...fmanDecoder.java => Deflate64Decoder.java} | 138 ++++-------------- ...{HuffmanState.java => Deflate64State.java} | 2 +- .../compress/utils/BitInputStream.java | 44 +++++- .../compressors/alg/HuffmanDecoderTest.java | 72 ++++++++- .../Deflate64CompressorInputStreamTest.java | 6 +- ...derTest.java => Deflate64DecoderTest.java} | 16 +- 8 files changed, 193 insertions(+), 145 deletions(-) rename src/main/java/org/apache/commons/compress/compressors/deflate64/{HuffmanDecoder.java => Deflate64Decoder.java} (80%) rename src/main/java/org/apache/commons/compress/compressors/deflate64/{HuffmanState.java => Deflate64State.java} (97%) rename src/test/java/org/apache/commons/compress/compressors/deflate64/{HuffmanDecoderTest.java => Deflate64DecoderTest.java} (93%) diff --git a/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java index 1c984c6c2b3..3105e077d63 100644 --- a/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java +++ b/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java @@ -20,8 +20,10 @@ import java.io.EOFException; import java.io.IOException; +import java.nio.ByteOrder; import java.util.Objects; +import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.utils.BitInputStream; /** @@ -73,18 +75,38 @@ public final class HuffmanDecoder { /** * Constructs a decoder from canonical code lengths. *

- * The {@code codeLengths} array provides, for each symbol index {@code i} in - * {@code [0, codeLengthSize)}, the length (in bits) of that symbol's code. A value of - * {@code 0} marks an unused symbol. All non-zero lengths must be {@code <= maxCodeLength} and - * the set of lengths must satisfy Kraft's equality (i.e., define a complete prefix code). + * The {@code codeLengths} array provides, for each symbol index {@code i} in {@code [0, codeLengthSize)}, + * the length (in bits) of that symbol's code. + * A value of {@code 0} marks an unused symbol. + * All non-zero lengths must be {@code <= maxCodeLength}. *

* * @param codeLengths code length per symbol; {@code 0} means the symbol is not used; not {@code null} - * @param codeLengthSize number of symbols to read from {@code codeLengths} (must be {@code > 0} - * and {@code <= codeLengths.length}) + * @throws NullPointerException if {@code codeLengths} is {@code null} + * @throws IllegalArgumentException if any code length is out of range [0, {@value #MAX_SUPPORTED_CODE_LENGTH}] + */ + public HuffmanDecoder(final int[] codeLengths) { + this(codeLengths, codeLengths.length, 0, MAX_SUPPORTED_CODE_LENGTH); + } + + /** + * Constructs a decoder from canonical code lengths. + *

+ * The {@code codeLengths} array provides, for each symbol index {@code i} in {@code [0, codeLengthSize)}, + * the length (in bits) of that symbol's code. + * A value of {@code 0} marks an unused symbol. + * All non-zero lengths must be {@code <= maxCodeLength}. + *

+ * + * @param codeLengths code length per symbol; {@code 0} means the symbol is not used; not {@code null} + * @param codeLengthSize number of symbols to read from {@code codeLengths} + * (must be {@code > 0} and {@code <= codeLengths.length}) * @param minCodeLength minimum allowed code length present in {@code codeLengths} * @param maxCodeLength maximum allowed code length present in {@code codeLengths} - * @throws IllegalArgumentException if any argument is invalid + * @throws NullPointerException if {@code codeLengths} is {@code null} + * @throws IllegalArgumentException if {@code codeLengthSize} is out of range, if any code length is out of range + * or if {@code maxCodeLength} exceeds the implementation limit + * ({@value #MAX_SUPPORTED_CODE_LENGTH}) */ public HuffmanDecoder( final int[] codeLengths, final int codeLengthSize, final int minCodeLength, final int maxCodeLength) @@ -186,6 +208,9 @@ private static void fillCodeTable( // Adjust offsets to point to the last element of each length for (int symbol = 0; symbol < codeLengthSize; symbol++) { final int len = codeLengths[symbol]; + if (len == 0) { + continue; + } sorted[++offset[len]] = symbol; } @@ -215,21 +240,30 @@ public int decodeSymbol(final BitInputStream in) throws IOException { int len = minLength; int code = readBitsFully(in, len); while (len <= maxLength && code > limit[len]) { - final int b = readBitsFully(in, 1); + final int b = readBit(in); code = (code << 1) | b; len++; } if (len > maxLength) { - throw new IOException("Invalid Huffman code: " + code); + throw new CompressorException("Invalid Huffman code: " + code); } return sorted[code - bias[len]]; } + private static int readBit(final BitInputStream in) throws IOException { + final int bit = in.readBit(); + if (bit < 0) { + throw new EOFException("Truncated Huffman bitstream"); + } + return bit; + } + private static int readBitsFully(final BitInputStream in, final int numBits) throws IOException { final int code = (int) in.readBits(numBits); if (code < 0) { throw new EOFException("Truncated Huffman bitstream"); } - return code; + // Adjust for bit order + return in.getByteOrder() == ByteOrder.BIG_ENDIAN ? code : Integer.reverse(code) >>> (32 - numBits); } } diff --git a/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java index 0b5623b79d6..925bc6d5fd7 100644 --- a/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java +++ b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java @@ -35,11 +35,11 @@ */ public class Deflate64CompressorInputStream extends CompressorInputStream implements InputStreamStatistics { private InputStream originalStream; - private HuffmanDecoder decoder; + private Deflate64Decoder decoder; private long compressedBytesRead; private final byte[] oneByte = new byte[1]; - Deflate64CompressorInputStream(final HuffmanDecoder decoder) { + Deflate64CompressorInputStream(final Deflate64Decoder decoder) { this.decoder = decoder; } @@ -49,7 +49,7 @@ public class Deflate64CompressorInputStream extends CompressorInputStream implem * @param in the stream to read from */ public Deflate64CompressorInputStream(final InputStream in) { - this(new HuffmanDecoder(in)); + this(new Deflate64Decoder(in)); originalStream = in; } diff --git a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java similarity index 80% rename from src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoder.java rename to src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java index 586be380b8b..1383ba40b9f 100644 --- a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoder.java +++ b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java @@ -18,10 +18,10 @@ */ package org.apache.commons.compress.compressors.deflate64; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.DYNAMIC_CODES; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.FIXED_CODES; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.INITIAL; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.STORED; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.DYNAMIC_CODES; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.FIXED_CODES; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.INITIAL; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.STORED; import java.io.Closeable; import java.io.EOFException; @@ -30,6 +30,7 @@ import java.nio.ByteOrder; import java.util.Arrays; +import org.apache.commons.compress.compressors.alg.HuffmanDecoder; import org.apache.commons.compress.utils.BitInputStream; import org.apache.commons.compress.utils.ByteUtils; import org.apache.commons.compress.utils.ExactMath; @@ -38,38 +39,7 @@ /** * TODO This class can't be final because it is mocked by Mockito. */ -class HuffmanDecoder implements Closeable { - - private static final class BinaryTreeNode { - private final int bits; - int literal = -1; - BinaryTreeNode leftNode; - BinaryTreeNode rightNode; - - private BinaryTreeNode(final int bits) { - this.bits = bits; - } - - void leaf(final int symbol) { - literal = symbol; - leftNode = null; - rightNode = null; - } - - BinaryTreeNode left() { - if (leftNode == null && literal == -1) { - leftNode = new BinaryTreeNode(bits + 1); - } - return leftNode; - } - - BinaryTreeNode right() { - if (rightNode == null && literal == -1) { - rightNode = new BinaryTreeNode(bits + 1); - } - return rightNode; - } - } +class Deflate64Decoder implements Closeable { private abstract static class DecoderState { abstract int available() throws IOException; @@ -78,7 +48,7 @@ private abstract static class DecoderState { abstract int read(byte[] b, int off, int len) throws IOException; - abstract HuffmanState state(); + abstract Deflate64State state(); } private static final class DecodingMemory { @@ -132,18 +102,18 @@ void recordToBuffer(final int distance, final int length, final byte[] buff) { private final class HuffmanCodes extends DecoderState { private boolean endOfBlock; - private final HuffmanState state; - private final BinaryTreeNode lengthTree; - private final BinaryTreeNode distanceTree; + private final Deflate64State state; + private final HuffmanDecoder symbolDecoder; + private final HuffmanDecoder distanceDecoder; private int runBufferPos; private byte[] runBuffer = ByteUtils.EMPTY_BYTE_ARRAY; private int runBufferLength; - HuffmanCodes(final HuffmanState state, final int[] lengths, final int[] distance) { + HuffmanCodes(final Deflate64State state, final int[] lengths, final int[] distance) { this.state = state; - lengthTree = buildTree(lengths); - distanceTree = buildTree(distance); + symbolDecoder = new HuffmanDecoder(lengths); + distanceDecoder = new HuffmanDecoder(distance); } @Override @@ -169,7 +139,7 @@ private int decodeNext(final byte[] b, final int off, final int len) throws IOEx int result = copyFromRunBuffer(b, off, len); while (result < len) { - final int symbol = nextSymbol(reader, lengthTree); + final int symbol = symbolDecoder.decodeSymbol(reader); if (symbol < 256) { b[off + result++] = memory.add((byte) symbol); } else if (symbol > 256) { @@ -178,7 +148,7 @@ private int decodeNext(final byte[] b, final int off, final int len) throws IOEx final int runXtra = runMask & 0x1F; run = ExactMath.add(run, readBits(runXtra)); - final int distSym = nextSymbol(reader, distanceTree); + final int distSym = distanceDecoder.decodeSymbol(reader); final int distMask = DISTANCE_TABLE[distSym]; int dist = distMask >>> 4; @@ -209,14 +179,11 @@ boolean hasData() { @Override int read(final byte[] b, final int off, final int len) throws IOException { - if (len == 0) { - return 0; - } - return decodeNext(b, off, len); + return len == 0 ? 0 : decodeNext(b, off, len); } @Override - HuffmanState state() { + Deflate64State state() { return endOfBlock ? INITIAL : state; } } @@ -233,7 +200,7 @@ boolean hasData() { } @Override - int read(final byte[] b, final int off, final int len) throws IOException { + int read(final byte[] b, final int off, final int len) { if (len == 0) { return 0; } @@ -241,7 +208,7 @@ int read(final byte[] b, final int off, final int len) throws IOException { } @Override - HuffmanState state() { + Deflate64State state() { return INITIAL; } } @@ -292,7 +259,7 @@ int read(final byte[] b, final int off, final int len) throws IOException { } @Override - HuffmanState state() { + Deflate64State state() { return read < blockLength ? STORED : INITIAL; } } @@ -367,71 +334,16 @@ HuffmanState state() { FIXED_DISTANCE = ArrayFill.fill(new int[32], 5); } - private static BinaryTreeNode buildTree(final int[] litTable) { - final int[] literalCodes = getCodes(litTable); - - final BinaryTreeNode root = new BinaryTreeNode(0); - - for (int i = 0; i < litTable.length; i++) { - final int len = litTable[i]; - if (len != 0) { - BinaryTreeNode node = root; - final int lit = literalCodes[len - 1]; - for (int p = len - 1; p >= 0; p--) { - final int bit = lit & 1 << p; - node = bit == 0 ? node.left() : node.right(); - if (node == null) { - throw new IllegalStateException("node doesn't exist in Huffman tree"); - } - } - node.leaf(i); - literalCodes[len - 1]++; - } - } - return root; - } - - private static int[] getCodes(final int[] litTable) { - int max = 0; - int[] blCount = new int[65]; - - for (final int aLitTable : litTable) { - if (aLitTable < 0 || aLitTable > 64) { - throw new IllegalArgumentException("Invalid code " + aLitTable + " in literal table"); - } - max = Math.max(max, aLitTable); - blCount[aLitTable]++; - } - blCount = Arrays.copyOf(blCount, max + 1); - - int code = 0; - final int[] nextCode = new int[max + 1]; - for (int i = 0; i <= max; i++) { - code = code + blCount[i] << 1; - nextCode[i] = code; - } - - return nextCode; - } - - private static int nextSymbol(final BitInputStream reader, final BinaryTreeNode tree) throws IOException { - BinaryTreeNode node = tree; - while (node != null && node.literal == -1) { - final long bit = readBits(reader, 1); - node = bit == 0 ? node.leftNode : node.rightNode; - } - return node != null ? node.literal : -1; - } - private static void populateDynamicTables(final BitInputStream reader, final int[] literals, final int[] distances) throws IOException { final int codeLengths = (int) (readBits(reader, 4) + 4); - final int[] codeLengthValues = new int[19]; + final int[] codeLengthValues = new int[CODE_LENGTHS_ORDER.length]; for (int cLen = 0; cLen < codeLengths; cLen++) { codeLengthValues[CODE_LENGTHS_ORDER[cLen]] = (int) readBits(reader, 3); } - final BinaryTreeNode codeLengthTree = buildTree(codeLengthValues); + // Decoder to decode the code lengths for the literal and distance tables + final HuffmanDecoder codeLengthDecoder = new HuffmanDecoder(codeLengthValues); final int[] auxBuffer = new int[literals.length + distances.length]; @@ -443,7 +355,7 @@ private static void populateDynamicTables(final BitInputStream reader, final int auxBuffer[off++] = value; length--; } else { - final int symbol = nextSymbol(reader, codeLengthTree); + final int symbol = codeLengthDecoder.decodeSymbol(reader); if (symbol < 16) { value = symbol; auxBuffer[off++] = value; @@ -489,7 +401,7 @@ private static long readBits(final BitInputStream reader, final int numBits) thr private final DecodingMemory memory = new DecodingMemory(); - HuffmanDecoder(final InputStream in) { + Deflate64Decoder(final InputStream in) { this.reader = new BitInputStream(in, ByteOrder.LITTLE_ENDIAN); this.in = in; state = new InitialState(); diff --git a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanState.java b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64State.java similarity index 97% rename from src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanState.java rename to src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64State.java index 20f3979a91f..434b5d90ae1 100644 --- a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanState.java +++ b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64State.java @@ -18,6 +18,6 @@ */ package org.apache.commons.compress.compressors.deflate64; -enum HuffmanState { +enum Deflate64State { INITIAL, STORED, DYNAMIC_CODES, FIXED_CODES } diff --git a/src/main/java/org/apache/commons/compress/utils/BitInputStream.java b/src/main/java/org/apache/commons/compress/utils/BitInputStream.java index 5086db56791..f554a4db673 100644 --- a/src/main/java/org/apache/commons/compress/utils/BitInputStream.java +++ b/src/main/java/org/apache/commons/compress/utils/BitInputStream.java @@ -45,10 +45,39 @@ public class BitInputStream implements Closeable { private int bitsCachedSize; /** - * Constructor taking an InputStream and its bit arrangement. + * Constructs a {@code BitInputStream} that reads individual bits from the + * given {@link InputStream}, interpreting them according to the specified + * bit ordering. * - * @param in the InputStream. - * @param byteOrder the bit arrangement across byte boundaries, either BIG_ENDIAN (aaaaabbb bb000000) or LITTLE_ENDIAN (bbbaaaaa 000000bb). + *

The bit ordering determines how consecutive bits are packed into bytes:

+ * + * + * + * @param in the underlying input stream providing the bytes + * @param byteOrder determines whether bits are read MSB-first ({@link ByteOrder#BIG_ENDIAN}) + * or LSB-first ({@link ByteOrder#LITTLE_ENDIAN}) */ public BitInputStream(final InputStream in, final ByteOrder byteOrder) { this.in = org.apache.commons.io.input.BoundedInputStream.builder().setInputStream(in).asSupplier().get(); @@ -210,4 +239,13 @@ private long readCachedBits(final int count) { return bitsOut; } + /** + * Returns the bit order used by this stream. + * + * @return the bit ordering: {@link ByteOrder#BIG_ENDIAN} (MSB-first) or {@link ByteOrder#LITTLE_ENDIAN} (LSB-first). + * @see #BitInputStream(InputStream, ByteOrder) + */ + public ByteOrder getByteOrder() { + return byteOrder; + } } diff --git a/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java b/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java index 2e7acff8d24..f7cef643721 100644 --- a/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java +++ b/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java @@ -21,11 +21,22 @@ import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.stream.Stream; + +import org.apache.commons.compress.utils.BitInputStream; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; class HuffmanDecoderTest { - private static final int MIN_CODE_LEN = 1; - private static final int MAX_CODE_LEN = 20; @Test void testCreateHuffmanDecodingTablesWithLargeAlphaSize() { @@ -33,11 +44,64 @@ void testCreateHuffmanDecodingTablesWithLargeAlphaSize() { final int[] codeLengths = new int[258]; for (int i = 0; i < codeLengths.length; i++) { // Use all code lengths within valid range [1, 20] - codeLengths[i] = (char) ((i % MAX_CODE_LEN) + 1); + codeLengths[i] = (char) ((i % 20) + 1); } final HuffmanDecoder decoder = assertDoesNotThrow( - () -> new HuffmanDecoder(codeLengths, codeLengths.length, MIN_CODE_LEN, MAX_CODE_LEN), + () -> new HuffmanDecoder(codeLengths, codeLengths.length, 1, 20), "HuffmanDecoder constructor should not throw for valid codeLengths array of MAX_ALPHA_SIZE"); assertEquals(decoder.getMinLength(), 1, "Minimum code length should be 1"); + assertEquals(decoder.getMaxLength(), 20, "Maximum code length should be 20"); + } + + static Stream testDecodeSymbols() { + return Stream.of( + // Simple case + // Symbol 0: 10 + // Symbol 1: 11 + // Symbol 3: 0 + Arguments.of( + new int[] {2, 2, 0, 1}, + new byte[] {(byte) 0b10_11_0_0_0_0}, + Arrays.asList(0, 1, 3, 3, 3, 3), + ByteOrder.BIG_ENDIAN), + Arguments.of( + new int[] {2, 2, 0, 1}, + new byte[] {(byte) 0b01_11_0_0_0_0}, + Arrays.asList(3, 3, 3, 3, 1, 0), + ByteOrder.LITTLE_ENDIAN), + + // Across byte boundary + // Symbol 0: 10 + // Symbol 1: 11 + // Symbol 2: 0 + Arguments.of( + new int[] {2, 2, 1, 0}, + new byte[] {(byte) 0b0_11_10_11_1, (byte) 0b0_11_10_11_0}, + Arrays.asList(2, 1, 0, 1, 0, 1, 0, 1, 2), + ByteOrder.BIG_ENDIAN), + Arguments.of( + new int[] {2, 2, 1, 0}, + new byte[] {(byte) 0b1_11_01_11_0, (byte) 0b0_11_01_11_0}, + Arrays.asList(2, 1, 0, 1, 0, 1, 0, 1, 2), + ByteOrder.LITTLE_ENDIAN)); + } + + @ParameterizedTest + @MethodSource + void testDecodeSymbols( + final int[] codeLengths, + final byte[] inputData, + final List expectedSymbols, + final ByteOrder byteOrder) + throws IOException { + final HuffmanDecoder decoder = new HuffmanDecoder(codeLengths); + + final Collection actualSymbols = new ArrayList<>(); + try (BitInputStream in = new BitInputStream(new ByteArrayInputStream(inputData), byteOrder)) { + for (int i = 0; i < expectedSymbols.size(); i++) { + actualSymbols.add(decoder.decodeSymbol(in)); + } + } + assertEquals(expectedSymbols, actualSymbols, "Decoded symbols do not match expected symbols"); } } diff --git a/src/test/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStreamTest.java b/src/test/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStreamTest.java index ffd95cda289..809dcdd638d 100644 --- a/src/test/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStreamTest.java +++ b/src/test/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStreamTest.java @@ -44,10 +44,10 @@ @ExtendWith(MockitoExtension.class) class Deflate64CompressorInputStreamTest { - private final HuffmanDecoder nullDecoder = null; + private final Deflate64Decoder nullDecoder = null; @Mock - private HuffmanDecoder decoder; + private Deflate64Decoder decoder; private void fuzzingTest(final int[] bytes) throws IOException, ArchiveException { final int len = bytes.length; @@ -173,7 +173,7 @@ void testShouldThrowIOExceptionInsteadOfRuntimeExceptionCOMPRESS526() { */ @Test void testShouldThrowIOExceptionInsteadOfRuntimeExceptionCOMPRESS527() { - assertThrows(CompressorException.class, + assertThrows(EOFException.class, () -> fuzzingTest(new int[] { 0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00, 0x09, 0x00, 0x84, 0xb6, 0xba, 0x46, 0x72, 0xb6, 0xfe, 0x77, 0x4a, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x1c, 0x00, 0x62, 0x62, 0x62, 0x55, 0x54, 0x09, 0x00, 0x03, 0xe7, 0xce, 0x64, 0x55, 0xf3, 0xce, 0x64, 0x55, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0x5c, 0xf9, 0x01, 0x00, 0x04, 0x88, 0x13, 0x00, 0x00, 0x1d, 0x8b, diff --git a/src/test/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoderTest.java b/src/test/java/org/apache/commons/compress/compressors/deflate64/Deflate64DecoderTest.java similarity index 93% rename from src/test/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoderTest.java rename to src/test/java/org/apache/commons/compress/compressors/deflate64/Deflate64DecoderTest.java index bb20912d762..8027ecf58a2 100644 --- a/src/test/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoderTest.java +++ b/src/test/java/org/apache/commons/compress/compressors/deflate64/Deflate64DecoderTest.java @@ -27,7 +27,7 @@ import org.junit.jupiter.api.Test; -class HuffmanDecoderTest { +class Deflate64DecoderTest { @Test void testDecodeFixedHuffmanBlockWithMemoryLookup() throws Exception { @@ -50,7 +50,7 @@ void testDecodeFixedHuffmanBlockWithMemoryLookup() throws Exception { 0b00000000000000000000000000001101, // dist6 + offset <11> + end of block (000000) 0b11111111111111111111111111111000 // end of block (0000) + garbage }; - try (HuffmanDecoder decoder = new HuffmanDecoder(new ByteArrayInputStream(data))) { + try (Deflate64Decoder decoder = new Deflate64Decoder(new ByteArrayInputStream(data))) { final byte[] result = new byte[100]; final int len = decoder.decode(result); assertEquals(48, len); @@ -79,7 +79,7 @@ void testDecodeFixedHuffmanBlockWithMemoryLookupInExactBuffer() throws Exception 0b00000000000000000000000000001101, // dist6 + offset <11> + end of block (000000) 0b11111111111111111111111111111000 // end of block (0000) + garbage }; - try (HuffmanDecoder decoder = new HuffmanDecoder(new ByteArrayInputStream(data))) { + try (Deflate64Decoder decoder = new Deflate64Decoder(new ByteArrayInputStream(data))) { final byte[] result = new byte[48]; int len; len = decoder.decode(result); @@ -112,7 +112,7 @@ void testDecodeFixedHuffmanBlockWithMemoryLookupInSmallBuffer() throws Exception 0b11111111111111111111111111111000 // end of block (0000) + garbage }; - try (HuffmanDecoder decoder = new HuffmanDecoder(new ByteArrayInputStream(data))) { + try (Deflate64Decoder decoder = new Deflate64Decoder(new ByteArrayInputStream(data))) { final byte[] result = new byte[30]; int len; len = decoder.decode(result); @@ -142,7 +142,7 @@ void testDecodeSimpleFixedHuffmanBlock() throws Exception { 0b00000000000000000000000000000001, // d + end of block 0b11111111111111111111111111111100 // end of block (00) + garbage }; - try (HuffmanDecoder decoder = new HuffmanDecoder(new ByteArrayInputStream(data))) { + try (Deflate64Decoder decoder = new Deflate64Decoder(new ByteArrayInputStream(data))) { final byte[] result = new byte[100]; final int len = decoder.decode(result); assertEquals(11, len); @@ -168,7 +168,7 @@ void testDecodeSimpleFixedHuffmanBlockToSmallBuffer() throws Exception { 0b00000000000000000000000000000001, // d + end of block 0b11111111111111111111111111111100 // end of block (00) + garbage }; - try (HuffmanDecoder decoder = new HuffmanDecoder(new ByteArrayInputStream(data))) { + try (Deflate64Decoder decoder = new Deflate64Decoder(new ByteArrayInputStream(data))) { final byte[] result = new byte[10]; int len; len = decoder.decode(result); @@ -185,7 +185,7 @@ void testDecodeUncompressedBlock() throws Exception { final byte[] data = { 0b1, // end of block + no compression mode 11, 0, -12, -1, // len & ~len 'H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd' }; - try (HuffmanDecoder decoder = new HuffmanDecoder(new ByteArrayInputStream(data))) { + try (Deflate64Decoder decoder = new Deflate64Decoder(new ByteArrayInputStream(data))) { final byte[] result = new byte[100]; final int len = decoder.decode(result); assertEquals(11, len); @@ -198,7 +198,7 @@ void testDecodeUncompressedBlockWithInvalidLenNLenValue() throws Exception { final byte[] data = { 0b1, // end of block + no compression mode 11, 0, -12, -2, // len & ~len 'H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd' }; - try (HuffmanDecoder decoder = new HuffmanDecoder(new ByteArrayInputStream(data))) { + try (Deflate64Decoder decoder = new Deflate64Decoder(new ByteArrayInputStream(data))) { final byte[] result = new byte[100]; final IllegalStateException e = assertThrows(IllegalStateException.class, () -> { final int len = decoder.decode(result); From a17a6542027c6363d2dec2904ba36fe21e4679fa Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Fri, 29 Aug 2025 21:59:58 +0200 Subject: [PATCH 3/7] fix: move `HuffmanDecoder` to `compressors.support` --- .../compress/compressors/bzip2/BZip2CompressorInputStream.java | 2 +- .../compress/compressors/deflate64/Deflate64Decoder.java | 2 +- .../compress/compressors/{alg => support}/HuffmanDecoder.java | 2 +- .../compress/compressors/{alg => support}/package-info.java | 2 +- .../compressors/{alg => support}/HuffmanDecoderTest.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename src/main/java/org/apache/commons/compress/compressors/{alg => support}/HuffmanDecoder.java (99%) rename src/main/java/org/apache/commons/compress/compressors/{alg => support}/package-info.java (93%) rename src/test/java/org/apache/commons/compress/compressors/{alg => support}/HuffmanDecoderTest.java (98%) diff --git a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java index f4156441db4..6ea8b8e34ae 100644 --- a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java +++ b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java @@ -31,7 +31,7 @@ import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; -import org.apache.commons.compress.compressors.alg.HuffmanDecoder; +import org.apache.commons.compress.compressors.support.HuffmanDecoder; import org.apache.commons.compress.utils.BitInputStream; import org.apache.commons.compress.utils.InputStreamStatistics; import org.apache.commons.io.input.CloseShieldInputStream; diff --git a/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java index 1383ba40b9f..67ca8077692 100644 --- a/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java +++ b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java @@ -30,7 +30,7 @@ import java.nio.ByteOrder; import java.util.Arrays; -import org.apache.commons.compress.compressors.alg.HuffmanDecoder; +import org.apache.commons.compress.compressors.support.HuffmanDecoder; import org.apache.commons.compress.utils.BitInputStream; import org.apache.commons.compress.utils.ByteUtils; import org.apache.commons.compress.utils.ExactMath; diff --git a/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java similarity index 99% rename from src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java rename to src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java index 3105e077d63..14cccb2ff78 100644 --- a/src/main/java/org/apache/commons/compress/compressors/alg/HuffmanDecoder.java +++ b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.commons.compress.compressors.alg; +package org.apache.commons.compress.compressors.support; import java.io.EOFException; import java.io.IOException; diff --git a/src/main/java/org/apache/commons/compress/compressors/alg/package-info.java b/src/main/java/org/apache/commons/compress/compressors/support/package-info.java similarity index 93% rename from src/main/java/org/apache/commons/compress/compressors/alg/package-info.java rename to src/main/java/org/apache/commons/compress/compressors/support/package-info.java index 4cf75057d69..542424d0748 100644 --- a/src/main/java/org/apache/commons/compress/compressors/alg/package-info.java +++ b/src/main/java/org/apache/commons/compress/compressors/support/package-info.java @@ -20,4 +20,4 @@ * Algorithms reused by various compressors. * @since 1.29.0 */ -package org.apache.commons.compress.compressors.alg; +package org.apache.commons.compress.compressors.support; diff --git a/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java b/src/test/java/org/apache/commons/compress/compressors/support/HuffmanDecoderTest.java similarity index 98% rename from src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java rename to src/test/java/org/apache/commons/compress/compressors/support/HuffmanDecoderTest.java index f7cef643721..a60428e1005 100644 --- a/src/test/java/org/apache/commons/compress/compressors/alg/HuffmanDecoderTest.java +++ b/src/test/java/org/apache/commons/compress/compressors/support/HuffmanDecoderTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.commons.compress.compressors.alg; +package org.apache.commons.compress.compressors.support; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; From 6cb2d474818a33f38714b10c017336d3e6031fe1 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Fri, 29 Aug 2025 23:10:20 +0200 Subject: [PATCH 4/7] Fix Javadoc on JDK 8 --- .../commons/compress/compressors/support/HuffmanDecoder.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java index 14cccb2ff78..7072abc8765 100644 --- a/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java +++ b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java @@ -83,7 +83,7 @@ public final class HuffmanDecoder { * * @param codeLengths code length per symbol; {@code 0} means the symbol is not used; not {@code null} * @throws NullPointerException if {@code codeLengths} is {@code null} - * @throws IllegalArgumentException if any code length is out of range [0, {@value #MAX_SUPPORTED_CODE_LENGTH}] + * @throws IllegalArgumentException if any code length is out of range [0, 30] */ public HuffmanDecoder(final int[] codeLengths) { this(codeLengths, codeLengths.length, 0, MAX_SUPPORTED_CODE_LENGTH); @@ -105,8 +105,7 @@ public HuffmanDecoder(final int[] codeLengths) { * @param maxCodeLength maximum allowed code length present in {@code codeLengths} * @throws NullPointerException if {@code codeLengths} is {@code null} * @throws IllegalArgumentException if {@code codeLengthSize} is out of range, if any code length is out of range - * or if {@code maxCodeLength} exceeds the implementation limit - * ({@value #MAX_SUPPORTED_CODE_LENGTH}) + * or if {@code maxCodeLength} exceeds the implementation limit (30) */ public HuffmanDecoder( final int[] codeLengths, final int codeLengthSize, final int minCodeLength, final int maxCodeLength) From fe3d66c8f7c599026640a9efcc999b0174cf64c1 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Fri, 29 Aug 2025 23:49:41 +0200 Subject: [PATCH 5/7] fix: remove cast to `char` Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../compress/compressors/bzip2/BZip2CompressorInputStream.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java index 6ea8b8e34ae..5103e53f1dc 100644 --- a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java +++ b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java @@ -264,7 +264,7 @@ static void recvDecodingTables(final BitInputStream bin, final Data dataShadow) while (bsGetBit(bin)) { curr += bsGetBit(bin) ? -1 : 1; } - codeLengths[i] = (char) curr; + codeLengths[i] = curr; } try { // Same limits as in the reference C implementation of bzip2 From 3e63b0ce091ebfeebccf98c5c845792a3386ee02 Mon Sep 17 00:00:00 2001 From: Gary Gregory Date: Fri, 26 Sep 2025 09:54:02 -0400 Subject: [PATCH 6/7] Add missing Javadoc since tag 1.29.0 --- .../commons/compress/compressors/support/HuffmanDecoder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java index 7072abc8765..411b2043741 100644 --- a/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java +++ b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java @@ -45,6 +45,8 @@ * *

Thread-safety

* Instances are immutable after construction and may be safely shared between threads. + * + * @since 1.29.0 */ public final class HuffmanDecoder { From f699713eaae9776357629528244b5846c71fa6f6 Mon Sep 17 00:00:00 2001 From: Gary Gregory Date: Fri, 26 Sep 2025 09:55:13 -0400 Subject: [PATCH 7/7] Add Javadoc @since tag --- .../java/org/apache/commons/compress/utils/BitInputStream.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/apache/commons/compress/utils/BitInputStream.java b/src/main/java/org/apache/commons/compress/utils/BitInputStream.java index f554a4db673..f2766b03114 100644 --- a/src/main/java/org/apache/commons/compress/utils/BitInputStream.java +++ b/src/main/java/org/apache/commons/compress/utils/BitInputStream.java @@ -244,6 +244,7 @@ private long readCachedBits(final int count) { * * @return the bit ordering: {@link ByteOrder#BIG_ENDIAN} (MSB-first) or {@link ByteOrder#LITTLE_ENDIAN} (LSB-first). * @see #BitInputStream(InputStream, ByteOrder) + * @since 1.29.0 */ public ByteOrder getByteOrder() { return byteOrder;