Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.vector;

/**
* Interface for all int type vectors.
*/
public interface BaseIntVector extends ValueVector {

/**
 * Sets the encoded value from a {@link org.apache.arrow.vector.dictionary.Dictionary}
 * at the given position of this vector.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store
 */
void setEncodedValue(int index, int value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should technically be long (https://github.com/apache/arrow/blob/master/format/Schema.fbs#L260). Note "Int" is a type which can be 8, 16, 32 or 64 bit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method sets the indices (which are called dictionary encoded values in DictionaryEncoder), which are explicitly of int type; see https://github.com/apache/arrow/blob/master/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java#L55

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, an implementation detail. That seems like it might cause us problems at some point, but it is not necessary to fix now.

}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* integer values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class BigIntVector extends BaseFixedWidthVector {
public class BigIntVector extends BaseFixedWidthVector implements BaseIntVector {
public static final byte TYPE_WIDTH = 8;
private final FieldReader reader;

Expand Down Expand Up @@ -339,6 +339,11 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((BigIntVector) to);
}

/**
 * Stores the given dictionary-encoded value at {@code index} by delegating to
 * {@code setSafe}. No range check is performed on {@code value} here.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store
 */
@Override
public void setEncodedValue(int index, int value) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not do this as long and we can have just a generic setWithPossibleTruncate(int index, long value)? Could be generally useful as this naming/sizing is very dictionary specific.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that is a good suggestion and I suggested a long as well. @tianchen92 do you want to open a JIRA/PR to refactor the name/size?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure.

this.setSafe(index, value);
}

private class TransferImpl implements TransferPair {
BigIntVector to;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* integer values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class IntVector extends BaseFixedWidthVector {
public class IntVector extends BaseFixedWidthVector implements BaseIntVector {
public static final byte TYPE_WIDTH = 4;
private final FieldReader reader;

Expand Down Expand Up @@ -343,6 +343,11 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((IntVector) to);
}

/**
 * Writes a dictionary-encoded value into this vector.
 *
 * <p>The write is handed straight to {@code setSafe}; no range check is applied
 * to {@code value} in this method.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store
 */
@Override
public void setEncodedValue(int index, int value) {
  setSafe(index, value);
}

private class TransferImpl implements TransferPair {
IntVector to;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.complex.impl.SmallIntReaderImpl;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.NullableSmallIntHolder;
Expand All @@ -35,7 +36,7 @@
* short values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class SmallIntVector extends BaseFixedWidthVector {
public class SmallIntVector extends BaseFixedWidthVector implements BaseIntVector {
public static final byte TYPE_WIDTH = 2;
private final FieldReader reader;

Expand Down Expand Up @@ -370,6 +371,12 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((SmallIntVector) to);
}

/**
 * Writes a dictionary-encoded value into this vector after verifying that it fits.
 *
 * <p>The value arrives as an {@code int}, but elements of this vector are
 * {@code TYPE_WIDTH} (2) bytes wide. It is therefore range-checked against the
 * {@code short} limits in both directions before being handed to {@code setSafe},
 * so that neither too-large nor too-small values are silently truncated.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store; must lie within
 *     [{@link Short#MIN_VALUE}, {@link Short#MAX_VALUE}]
 * @throws IllegalArgumentException if {@code value} is outside that range
 *     (raised by {@code Preconditions.checkArgument} -- TODO confirm exception type)
 */
@Override
public void setEncodedValue(int index, int value) {
  // Check both bounds: a value below Short.MIN_VALUE overflows just as one above MAX_VALUE does.
  Preconditions.checkArgument(
      value >= Short.MIN_VALUE && value <= Short.MAX_VALUE, "value is overflow: %s", value);
  this.setSafe(index, value);
}

private class TransferImpl implements TransferPair {
SmallIntVector to;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.complex.impl.TinyIntReaderImpl;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.NullableTinyIntHolder;
Expand All @@ -35,7 +36,7 @@
* byte values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class TinyIntVector extends BaseFixedWidthVector {
public class TinyIntVector extends BaseFixedWidthVector implements BaseIntVector {
public static final byte TYPE_WIDTH = 1;
private final FieldReader reader;

Expand Down Expand Up @@ -370,6 +371,12 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((TinyIntVector) to);
}

/**
 * Writes a dictionary-encoded value into this vector after verifying that it fits.
 *
 * <p>The value arrives as an {@code int}, but elements of this vector are
 * {@code TYPE_WIDTH} (1) byte wide. It is therefore range-checked against the
 * {@code byte} limits in both directions before being handed to {@code setSafe},
 * so that neither too-large nor too-small values are silently truncated.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store; must lie within
 *     [{@link Byte#MIN_VALUE}, {@link Byte#MAX_VALUE}]
 * @throws IllegalArgumentException if {@code value} is outside that range
 *     (raised by {@code Preconditions.checkArgument} -- TODO confirm exception type)
 */
@Override
public void setEncodedValue(int index, int value) {
  // Check both bounds: a value below Byte.MIN_VALUE overflows just as one above MAX_VALUE does.
  Preconditions.checkArgument(
      value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE, "value is overflow: %s", value);
  this.setSafe(index, value);
}

private class TransferImpl implements TransferPair {
TinyIntVector to;

Expand Down
13 changes: 11 additions & 2 deletions java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.complex.impl.UInt1ReaderImpl;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.NullableUInt1Holder;
Expand All @@ -35,7 +36,7 @@
* integer values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class UInt1Vector extends BaseFixedWidthVector {
public class UInt1Vector extends BaseFixedWidthVector implements BaseIntVector {
private static final byte TYPE_WIDTH = 1;
private final FieldReader reader;

Expand Down Expand Up @@ -150,7 +151,7 @@ public void copyFrom(int fromIndex, int thisIndex, UInt1Vector from) {
}

/**
* Identical to {@link #copyFrom()} but reallocates buffer if index is larger
* Identical to {@link #copyFrom(int, int, UInt1Vector)} but reallocates buffer if index is larger
* than capacity.
*/
public void copyFromSafe(int fromIndex, int thisIndex, UInt1Vector from) {
Expand Down Expand Up @@ -329,6 +330,14 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((UInt1Vector) to);
}

/**
 * Writes a dictionary-encoded value into this vector after verifying that it fits.
 *
 * <p>The value arrives as an {@code int}, but elements of this vector are
 * {@code TYPE_WIDTH} (1) byte wide. It must therefore lie in {@code [0, 0xFF]};
 * a negative input would otherwise pass an upper-bound-only check and be stored
 * as an unrelated bit pattern.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store; must lie within [0, 0xFF]
 * @throws IllegalArgumentException if {@code value} is outside that range
 *     (raised by {@code Preconditions.checkArgument} -- TODO confirm exception type)
 */
@Override
public void setEncodedValue(int index, int value) {
  // Reject negatives as well as values above the one-byte maximum.
  Preconditions.checkArgument(value >= 0 && value <= 0xFF, "value is overflow: %s", value);
  this.setSafe(index, value);
}



private class TransferImpl implements TransferPair {
UInt1Vector to;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.complex.impl.UInt2ReaderImpl;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.holders.NullableUInt2Holder;
Expand All @@ -35,7 +36,7 @@
* integer values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class UInt2Vector extends BaseFixedWidthVector {
public class UInt2Vector extends BaseFixedWidthVector implements BaseIntVector {
private static final byte TYPE_WIDTH = 2;
private final FieldReader reader;

Expand Down Expand Up @@ -308,6 +309,12 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((UInt2Vector) to);
}

/**
 * Stores the given dictionary-encoded value at {@code index} after checking that it
 * does not exceed {@link Character#MAX_VALUE}, then delegates to {@code setSafe}.
 *
 * <p>NOTE(review): only the upper bound is checked here; a negative {@code value}
 * is not rejected by this method -- confirm whether that is intended.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store
 */
@Override
public void setEncodedValue(int index, int value) {
Preconditions.checkArgument(value <= Character.MAX_VALUE, "value is overflow: %s", value);
this.setSafe(index, value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add asserts or precondition.checkArguments to ensure the value doesn't overflow for all types with smaller bitwidths

}

private class TransferImpl implements TransferPair {
UInt2Vector to;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* integer values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class UInt4Vector extends BaseFixedWidthVector {
public class UInt4Vector extends BaseFixedWidthVector implements BaseIntVector {
private static final byte TYPE_WIDTH = 4;
private final FieldReader reader;

Expand Down Expand Up @@ -301,6 +301,11 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((UInt4Vector) to);
}

/**
 * Writes a dictionary-encoded value into this vector.
 *
 * <p>The write is handed straight to {@code setSafe}; no range check is applied
 * to {@code value} in this method.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store
 */
@Override
public void setEncodedValue(int index, int value) {
  setSafe(index, value);
}

private class TransferImpl implements TransferPair {
UInt4Vector to;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
* integer values which could be null. A validity buffer (bit vector) is
* maintained to track which elements in the vector are null.
*/
public class UInt8Vector extends BaseFixedWidthVector {
public class UInt8Vector extends BaseFixedWidthVector implements BaseIntVector {
private static final byte TYPE_WIDTH = 8;
private final FieldReader reader;

Expand Down Expand Up @@ -302,6 +302,11 @@ public TransferPair makeTransferPair(ValueVector to) {
return new TransferImpl((UInt8Vector) to);
}

/**
 * Writes a dictionary-encoded value into this vector.
 *
 * <p>The write is handed straight to {@code setSafe}; no range check is applied
 * to {@code value} in this method.
 *
 * @param index position in this vector at which the value is written
 * @param value the dictionary-encoded value to store
 */
@Override
public void setEncodedValue(int index, int value) {
  setSafe(index, value);
}

private class TransferImpl implements TransferPair {
UInt8Vector to;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@

package org.apache.arrow.vector.dictionary;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.arrow.vector.BaseIntVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.types.Types.MinorType;
Expand Down Expand Up @@ -61,43 +59,27 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) {
Field indexField = new Field(valueField.getName(), indexFieldType, null);

// vector to hold our indices (dictionary encoded values)
FieldVector indices = indexField.createVector(vector.getAllocator());

// use reflection to pull out the set method
// TODO implement a common interface for int vectors
Method setter = null;
for (Class<?> c : Arrays.asList(int.class, long.class)) {
try {
setter = indices.getClass().getMethod("setSafe", int.class, c);
break;
} catch (NoSuchMethodException e) {
// ignore
}
FieldVector createdVector = indexField.createVector(vector.getAllocator());
if (! (createdVector instanceof BaseIntVector)) {
throw new IllegalArgumentException("Dictionary encoding does not have a valid int type:" +
createdVector.getClass());
}
if (setter == null) {
throw new IllegalArgumentException("Dictionary encoding does not have a valid int type:" + indices.getClass());
}

int count = vector.getValueCount();

BaseIntVector indices = (BaseIntVector) createdVector;
indices.allocateNew();

try {
for (int i = 0; i < count; i++) {
Object value = vector.getObject(i);
if (value != null) { // if it's null leave it null
// note: this may fail if value was not included in the dictionary
Object encoded = lookUps.get(value);
if (encoded == null) {
throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value);
}
setter.invoke(indices, i, encoded);
int count = vector.getValueCount();

for (int i = 0; i < count; i++) {
Object value = vector.getObject(i);
if (value != null) { // if it's null leave it null
// note: this may fail if value was not included in the dictionary
Integer encoded = lookUps.get(value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as a follow-up to this PR you could implement a Map<Object, int> to avoid the unboxing costs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, good suggestion!

if (encoded == null) {
throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value);
}
indices.setEncodedValue(i, encoded);
}
} catch (IllegalAccessException e) {
throw new RuntimeException("IllegalAccessException invoking vector mutator set():", e);
} catch (InvocationTargetException e) {
throw new RuntimeException("InvocationTargetException invoking vector mutator set():", e.getCause());
}

indices.setValueCount(count);
Expand Down