Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter implements Req
protected boolean dictionaryTooBig;

/* current size in bytes the dictionary will take once serialized */
protected int dictionaryByteSize;
protected long dictionaryByteSize;

/* size in bytes of the dictionary at the end of last dictionary encoded page (in case the current page falls back to PLAIN) */
protected int lastUsedDictionaryByteSize;
Expand Down Expand Up @@ -173,7 +173,7 @@ public BytesInput getBytes() {
BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
// remember size of dictionary when we last wrote a page
lastUsedDictionarySize = getDictionarySize();
lastUsedDictionaryByteSize = dictionaryByteSize;
lastUsedDictionaryByteSize = Math.toIntExact(dictionaryByteSize);
return bytes;
} catch (IOException e) {
throw new ParquetEncodingException("could not encode the values", e);
Expand Down Expand Up @@ -249,7 +249,7 @@ public void writeBytes(Binary v) {
id = binaryDictionaryContent.size();
binaryDictionaryContent.put(v.copy(), id);
// length as int (4 bytes) + actual bytes
dictionaryByteSize += 4 + v.length();
dictionaryByteSize += 4L + v.length();
}
encodedValues.add(id);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.apache.parquet.column.values.plain.PlainValuesWriter;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.mockito.Mockito;

public class TestDictionary {

Expand Down Expand Up @@ -171,6 +172,20 @@ public void testBinaryDictionaryFallBack() throws IOException {
assertEquals(0, cw.getBufferedSize());
}

@Test
public void testBinaryDictionaryIntegerOverflow() {
  // A mocked Binary that reports a length just below Integer.MAX_VALUE, so the
  // size bookkeeping is exercised without allocating a huge buffer.
  Binary oversizedValue = Mockito.mock(Binary.class);
  Mockito.when(oversizedValue.length()).thenReturn(Integer.MAX_VALUE - 1);
  // The writer copies the value it is given, so the mock must hand back a real Binary.
  Mockito.when(oversizedValue.copy()).thenReturn(Binary.fromString(" world"));

  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(100, 100);
  final Binary smallValue = Binary.fromString("hello");
  writer.writeBytes(smallValue);
  writer.writeBytes(oversizedValue);

  // After both values are written, the writer is expected to report PLAIN encoding.
  assertEquals(PLAIN, writer.getEncoding());
}

@Test
public void testBinaryDictionaryChangedValues() throws IOException {
int COUNT = 100;
Expand Down
4 changes: 4 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,10 @@
</excludeModules>
<excludes>
<exclude>${shade.prefix}</exclude>
<!-- In PARQUET-2052 this field is changed from int to long which is a minor API
change to fix an integer overflow issue.
TODO: remove this after Parquet 1.13 release -->
<exclude>org.apache.parquet.column.values.dictionary.DictionaryValuesWriter#dictionaryByteSize</exclude>
</excludes>
</parameter>
</configuration>
Expand Down