-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-5726: [Java] Implement a common interface for int vectors #4698
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.arrow.vector; | ||
|
|
||
| /** | ||
| * Interface for all int type vectors. | ||
| */ | ||
| public interface BaseIntVector extends ValueVector { | ||
|
|
||
| /** | ||
| * set the encoded value from a {@link org.apache.arrow.vector.dictionary.Dictionary}. | ||
| */ | ||
| void setEncodedValue(int index, int value); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,7 +35,7 @@ | |
| * integer values which could be null. A validity buffer (bit vector) is | ||
| * maintained to track which elements in the vector are null. | ||
| */ | ||
| public class BigIntVector extends BaseFixedWidthVector { | ||
| public class BigIntVector extends BaseFixedWidthVector implements BaseIntVector { | ||
| public static final byte TYPE_WIDTH = 8; | ||
| private final FieldReader reader; | ||
|
|
||
|
|
@@ -339,6 +339,11 @@ public TransferPair makeTransferPair(ValueVector to) { | |
| return new TransferImpl((BigIntVector) to); | ||
| } | ||
|
|
||
| @Override | ||
| public void setEncodedValue(int index, int value) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not do this as long and we can have just a generic setWithPossibleTruncate(int index, long value)? Could be generally useful as this naming/sizing is very dictionary specific.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that is a good suggestion and I suggested a long as well. @tianchen92 do you want to open a JIRA/PR to refactor the name/size?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. |
||
| this.setSafe(index, value); | ||
| } | ||
|
|
||
| private class TransferImpl implements TransferPair { | ||
| BigIntVector to; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; | ||
|
|
||
| import org.apache.arrow.memory.BufferAllocator; | ||
| import org.apache.arrow.util.Preconditions; | ||
| import org.apache.arrow.vector.complex.impl.UInt2ReaderImpl; | ||
| import org.apache.arrow.vector.complex.reader.FieldReader; | ||
| import org.apache.arrow.vector.holders.NullableUInt2Holder; | ||
|
|
@@ -35,7 +36,7 @@ | |
| * integer values which could be null. A validity buffer (bit vector) is | ||
| * maintained to track which elements in the vector are null. | ||
| */ | ||
| public class UInt2Vector extends BaseFixedWidthVector { | ||
| public class UInt2Vector extends BaseFixedWidthVector implements BaseIntVector { | ||
| private static final byte TYPE_WIDTH = 2; | ||
| private final FieldReader reader; | ||
|
|
||
|
|
@@ -308,6 +309,12 @@ public TransferPair makeTransferPair(ValueVector to) { | |
| return new TransferImpl((UInt2Vector) to); | ||
| } | ||
|
|
||
| @Override | ||
| public void setEncodedValue(int index, int value) { | ||
| Preconditions.checkArgument(value <= Character.MAX_VALUE, "value is overflow: %s", value); | ||
| this.setSafe(index, value); | ||
|
||
| } | ||
|
|
||
| private class TransferImpl implements TransferPair { | ||
| UInt2Vector to; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,12 +17,10 @@ | |
|
|
||
| package org.apache.arrow.vector.dictionary; | ||
|
|
||
| import java.lang.reflect.InvocationTargetException; | ||
| import java.lang.reflect.Method; | ||
| import java.util.Arrays; | ||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
|
|
||
| import org.apache.arrow.vector.BaseIntVector; | ||
| import org.apache.arrow.vector.FieldVector; | ||
| import org.apache.arrow.vector.ValueVector; | ||
| import org.apache.arrow.vector.types.Types.MinorType; | ||
|
|
@@ -61,43 +59,27 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { | |
| Field indexField = new Field(valueField.getName(), indexFieldType, null); | ||
|
|
||
| // vector to hold our indices (dictionary encoded values) | ||
| FieldVector indices = indexField.createVector(vector.getAllocator()); | ||
|
|
||
| // use reflection to pull out the set method | ||
| // TODO implement a common interface for int vectors | ||
| Method setter = null; | ||
| for (Class<?> c : Arrays.asList(int.class, long.class)) { | ||
| try { | ||
| setter = indices.getClass().getMethod("setSafe", int.class, c); | ||
| break; | ||
| } catch (NoSuchMethodException e) { | ||
| // ignore | ||
| } | ||
| FieldVector createdVector = indexField.createVector(vector.getAllocator()); | ||
| if (! (createdVector instanceof BaseIntVector)) { | ||
| throw new IllegalArgumentException("Dictionary encoding does not have a valid int type:" + | ||
| createdVector.getClass()); | ||
| } | ||
| if (setter == null) { | ||
| throw new IllegalArgumentException("Dictionary encoding does not have a valid int type:" + indices.getClass()); | ||
| } | ||
|
|
||
| int count = vector.getValueCount(); | ||
|
|
||
| BaseIntVector indices = (BaseIntVector) createdVector; | ||
| indices.allocateNew(); | ||
|
|
||
| try { | ||
| for (int i = 0; i < count; i++) { | ||
| Object value = vector.getObject(i); | ||
| if (value != null) { // if it's null leave it null | ||
| // note: this may fail if value was not included in the dictionary | ||
| Object encoded = lookUps.get(value); | ||
| if (encoded == null) { | ||
| throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value); | ||
| } | ||
| setter.invoke(indices, i, encoded); | ||
| int count = vector.getValueCount(); | ||
|
|
||
| for (int i = 0; i < count; i++) { | ||
| Object value = vector.getObject(i); | ||
| if (value != null) { // if it's null leave it null | ||
| // note: this may fail if value was not included in the dictionary | ||
| Integer encoded = lookUps.get(value); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as a follow-up to this PR you could implement a Map<Object, int> to avoid the unboxing costs.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, good suggestion! |
||
| if (encoded == null) { | ||
| throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value); | ||
| } | ||
| indices.setEncodedValue(i, encoded); | ||
| } | ||
| } catch (IllegalAccessException e) { | ||
| throw new RuntimeException("IllegalAccessException invoking vector mutator set():", e); | ||
| } catch (InvocationTargetException e) { | ||
| throw new RuntimeException("InvocationTargetException invoking vector mutator set():", e.getCause()); | ||
| } | ||
|
|
||
| indices.setValueCount(count); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be technically be long (https://github.com/apache/arrow/blob/master/format/Schema.fbs#L260). Note "Int" is a type which can be 8, 16, 32 or 64 bit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This method sets the indices(which called dictionary encoded values in DictionaryEncoder) which is explicit int type, see https://github.com/apache/arrow/blob/master/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java#L55
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah, implementation detail, that seems like it might cause us problems at some point but not ncessary to fix now.