diff --git a/pom.xml b/pom.xml index 6d6872554129..6a7a02ea7a0b 100644 --- a/pom.xml +++ b/pom.xml @@ -609,6 +609,17 @@ calcite-druid ${calcite.version} + + org.apache.datasketches + datasketches-hive + ${datasketches.version} + + + org.slf4j + slf4j-simple + + + org.apache.orc orc-core diff --git a/ql/src/gen/vectorization/UDAFTemplates/VectorUDAFComputeDsKllSketch.txt b/ql/src/gen/vectorization/UDAFTemplates/VectorUDAFComputeDsKllSketch.txt new file mode 100644 index 000000000000..7ab07d5bd4f3 --- /dev/null +++ b/ql/src/gen/vectorization/UDAFTemplates/VectorUDAFComputeDsKllSketch.txt @@ -0,0 +1,314 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates;
+
+import org.apache.hadoop.hive.ql.udf.datasketches.kll.KllHistogramEstimator;
+import org.apache.hadoop.hive.ql.udf.datasketches.kll.KllHistogramEstimatorFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+#IF COMPLETE
+import org.apache.hadoop.hive.ql.exec.vector.;
+#ENDIF COMPLETE
+import org.apache.hadoop.hive.ql.exec.vector.VectorAggregationBufferRow;
+import org.apache.hadoop.hive.ql.exec.vector.VectorAggregationDesc;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+
+/**
+ * Generated from template VectorUDAFComputeDsKllSketch.txt.
+ *
+ * Vectorized evaluator for "ds_kll_sketch". The COMPLETE variant of the template feeds raw column
+ * values into a KLL estimator (modes PARTIAL1/COMPLETE); the MERGING variant deserializes partial
+ * sketches from a bytes column and merges them (modes PARTIAL2/FINAL).
+ */
+@Description(name = "ds_kll_sketch", value = "_FUNC_(x) "
+    + "Returns a KllFloatsSketch in a serialized form as a binary blob."
+    + " Values must be of type float.")
+public class extends VectorAggregateExpression {
+
+  // k parameter used when this expression has to create an empty sketch itself
+  private transient int k;
+
+  public () {
+    super();
+  }
+
+  /**
+   * Creates the expression with a sketch parameter k of 200.
+   */
+  public (VectorAggregationDesc vecAggrDesc) {
+    this(vecAggrDesc, 200);
+  }
+
+  /**
+   * Creates the expression with an explicit sketch parameter k.
+   */
+  public (VectorAggregationDesc vecAggrDesc, int k) {
+    super(vecAggrDesc);
+    this.k = k;
+  }
+
+  @Override
+  public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+    return new Aggregation();
+  }
+
+  /**
+   * Aggregates a whole batch into a single buffer (no grouping keys).
+   */
+  @Override
+  public void aggregateInput(AggregationBuffer agg, VectorizedRowBatch batch) throws HiveException {
+    inputExpression.evaluate(batch);
+
+#IF COMPLETE
+    inputColumn = () batch.cols[this.inputExpression.getOutputColumnNum()];
+#ENDIF COMPLETE
+#IF MERGING
+    BytesColumnVector inputColumn = (BytesColumnVector) batch.cols[this.inputExpression.getOutputColumnNum()];
+#ENDIF MERGING
+
+    int batchSize = batch.size;
+
+    if (batchSize == 0) {
+      return;
+    }
+
+    Aggregation myagg = (Aggregation) agg;
+
+#IF COMPLETE
+    myagg.prepare(k);
+    if (inputColumn.noNulls) {
+      if (inputColumn.isRepeating) {
+        for (int i = 0; i < batchSize; i++) {
+          myagg.estimator.addToEstimator(inputColumn.vector[0]);
+        }
+      } else {
+        if (batch.selectedInUse) {
+          for (int s = 0; s < batchSize; s++) {
+            int i = batch.selected[s];
+            myagg.estimator.addToEstimator(inputColumn.vector[i]);
+          }
+        } else {
+          for (int i = 0; i < batchSize; i++) {
+            myagg.estimator.addToEstimator(inputColumn.vector[i]);
+          }
+        }
+      }
+    } else {
+      if (inputColumn.isRepeating) {
+        if (!inputColumn.isNull[0]) {
+          for (int i = 0; i < batchSize; i++) {
+            myagg.estimator.addToEstimator(inputColumn.vector[0]);
+          }
+        }
+      } else {
+        if (batch.selectedInUse) {
+          for (int j = 0; j < batchSize; ++j) {
+            int i = batch.selected[j];
+            if (!inputColumn.isNull[i]) {
+              myagg.estimator.addToEstimator(inputColumn.vector[i]);
+            }
+          }
+        } else {
+          for (int i = 0; i < batchSize; i++) {
+            if (!inputColumn.isNull[i]) {
+              myagg.estimator.addToEstimator(inputColumn.vector[i]);
+            }
+          }
+        }
+      }
+    }
+#ENDIF COMPLETE
+#IF MERGING
+    if (inputColumn.isRepeating) {
+      // NOTE(review): a repeating batch merges the sketch once, not once per row, whereas the
+      // COMPLETE branch adds a repeating value batchSize times - confirm this is intended.
+      // isNull[] is only consulted when the vector reports nulls, as in the COMPLETE branch.
+      if ((inputColumn.noNulls || !inputColumn.isNull[0]) && inputColumn.length[0] > 0) {
+        KllHistogramEstimator mergingKLL = KllHistogramEstimatorFactory.getKllHistogramEstimator(
+            inputColumn.vector[0], inputColumn.start[0], inputColumn.length[0]);
+        myagg.prepare(mergingKLL.getK());
+        myagg.estimator.mergeEstimators(mergingKLL);
+      }
+    } else {
+      for (int i = 0; i < batchSize; i++) {
+        int s = i;
+        if (batch.selectedInUse) {
+          s = batch.selected[i];
+        }
+        if ((inputColumn.noNulls || !inputColumn.isNull[s]) && inputColumn.length[s] > 0) {
+          KllHistogramEstimator mergingKLL = KllHistogramEstimatorFactory.getKllHistogramEstimator(
+              inputColumn.vector[s], inputColumn.start[s], inputColumn.length[s]);
+          myagg.prepare(mergingKLL.getK());
+          myagg.estimator.mergeEstimators(mergingKLL);
+        }
+      }
+    }
+#ENDIF MERGING
+  }
+
+  /**
+   * Fetches the aggregation buffer of a row and, if it has no estimator yet, creates an empty one
+   * with this expression's k.
+   */
+  private Aggregation getAggregation(VectorAggregationBufferRow[] sets, int rowid, int bufferIndex) {
+    return getAggregation(sets, rowid, bufferIndex, k);
+  }
+
+  /**
+   * Fetches the aggregation buffer of a row and, if it has no estimator yet, creates an empty one
+   * with the given k.
+   */
+  private Aggregation getAggregation(VectorAggregationBufferRow[] sets, int rowid, int bufferIndex,
+      int sketchK) {
+    VectorAggregationBufferRow bufferRow = sets[rowid];
+    Aggregation myagg = (Aggregation) bufferRow.getAggregationBuffer(bufferIndex);
+    myagg.prepare(sketchK);
+    return myagg;
+  }
+
+  /**
+   * Aggregates a batch where every row has its own buffer, found at the row's position in the
+   * batch (not at its index in the selected array).
+   */
+  @Override
+  public void aggregateInputSelection(VectorAggregationBufferRow[] aggregationBufferSets, int aggregateIndex,
+      VectorizedRowBatch batch) throws HiveException {
+    inputExpression.evaluate(batch);
+
+#IF COMPLETE
+    inputColumn = () batch.cols[this.inputExpression.getOutputColumnNum()];
+#ENDIF COMPLETE
+#IF MERGING
+    BytesColumnVector inputColumn = (BytesColumnVector) batch.cols[this.inputExpression.getOutputColumnNum()];
+#ENDIF MERGING
+
+    int batchSize = batch.size;
+
+    if (batchSize == 0) {
+      return;
+    }
+
+#IF COMPLETE
+    if (inputColumn.noNulls) {
+      if (inputColumn.isRepeating) {
+        for (int i = 0; i < batchSize; i++) {
+          Aggregation myagg = getAggregation(aggregationBufferSets, i, aggregateIndex);
+          myagg.estimator.addToEstimator(inputColumn.vector[0]);
+        }
+      } else {
+        if (batch.selectedInUse) {
+          for (int s = 0; s < batchSize; s++) {
+            int i = batch.selected[s];
+            Aggregation myagg = getAggregation(aggregationBufferSets, s, aggregateIndex);
+            myagg.estimator.addToEstimator(inputColumn.vector[i]);
+          }
+        } else {
+          for (int i = 0; i < batchSize; i++) {
+            Aggregation myagg = getAggregation(aggregationBufferSets, i, aggregateIndex);
+            myagg.estimator.addToEstimator(inputColumn.vector[i]);
+          }
+        }
+      }
+    } else {
+      if (inputColumn.isRepeating) {
+        if (!inputColumn.isNull[0]) {
+          for (int i = 0; i < batchSize; i++) {
+            Aggregation myagg = getAggregation(aggregationBufferSets, i, aggregateIndex);
+            myagg.estimator.addToEstimator(inputColumn.vector[0]);
+          }
+        }
+      } else {
+        if (batch.selectedInUse) {
+          for (int s = 0; s < batchSize; s++) {
+            int i = batch.selected[s];
+            if (!inputColumn.isNull[i]) {
+              Aggregation myagg = getAggregation(aggregationBufferSets, s, aggregateIndex);
+              myagg.estimator.addToEstimator(inputColumn.vector[i]);
+            }
+          }
+        } else {
+          for (int i = 0; i < batchSize; i++) {
+            if (!inputColumn.isNull[i]) {
+              Aggregation myagg = getAggregation(aggregationBufferSets, i, aggregateIndex);
+              myagg.estimator.addToEstimator(inputColumn.vector[i]);
+            }
+          }
+        }
+      }
+    }
+#ENDIF COMPLETE
+#IF MERGING
+    // An empty buffer is created with the k of the sketch being merged, exactly as
+    // aggregateInput does, so grouped and ungrouped merges yield sketches with the same k.
+    if (inputColumn.isRepeating) {
+      if ((inputColumn.noNulls || !inputColumn.isNull[0]) && inputColumn.length[0] > 0) {
+        for (int i = 0; i < batchSize; i++) {
+          KllHistogramEstimator mergingKLL = KllHistogramEstimatorFactory.getKllHistogramEstimator(
+              inputColumn.vector[0], inputColumn.start[0], inputColumn.length[0]);
+          Aggregation myagg = getAggregation(aggregationBufferSets, i, aggregateIndex, mergingKLL.getK());
+          myagg.estimator.mergeEstimators(mergingKLL);
+        }
+      }
+    } else {
+      for (int i = 0; i < batchSize; i++) {
+        int s = i;
+        if (batch.selectedInUse) {
+          s = batch.selected[i];
+        }
+        if ((inputColumn.noNulls || !inputColumn.isNull[s]) && inputColumn.length[s] > 0) {
+          KllHistogramEstimator mergingKLL = KllHistogramEstimatorFactory.getKllHistogramEstimator(
+              inputColumn.vector[s], inputColumn.start[s], inputColumn.length[s]);
+          Aggregation myagg = getAggregation(aggregationBufferSets, i, aggregateIndex, mergingKLL.getK());
+          myagg.estimator.mergeEstimators(mergingKLL);
+        }
+      }
+    }
+#ENDIF MERGING
+  }
+
+  @Override
+  public void reset(AggregationBuffer agg) throws HiveException {
+    agg.reset();
+  }
+
+  @Override
+  public long getAggregationBufferFixedSize() {
+    return 0;
+  }
+
+  /**
+   * Accepts "ds_kll_sketch" with a BYTES output; the accepted input type and modes depend on the
+   * template variant.
+   */
+  @Override
+  public boolean matches(String name, ColumnVector.Type inputColVectorType, ColumnVector.Type outputColVectorType,
+      GenericUDAFEvaluator.Mode mode) {
+    return name.equals("ds_kll_sketch") &&
+        outputColVectorType == ColumnVector.Type.BYTES &&
+#IF MERGING
+        inputColVectorType == ColumnVector.Type.BYTES &&
+        (mode == GenericUDAFEvaluator.Mode.PARTIAL2 || mode == GenericUDAFEvaluator.Mode.FINAL);
+#ENDIF MERGING
+#IF COMPLETE
+        inputColVectorType == ColumnVector.Type. &&
+        (mode == GenericUDAFEvaluator.Mode.PARTIAL1 || mode == GenericUDAFEvaluator.Mode.COMPLETE);
+#ENDIF COMPLETE
+  }
+
+  /**
+   * Writes the serialized sketch of the buffer into the output column, or NULL when the buffer
+   * never received an estimator.
+   */
+  @Override
+  public void assignRowColumn(
+      VectorizedRowBatch batch, int batchIndex, int columnNum, AggregationBuffer agg) throws HiveException {
+    Aggregation myagg = (Aggregation) agg;
+    BytesColumnVector outputCol = (BytesColumnVector) batch.cols[columnNum];
+    if (myagg.estimator == null) {
+      outputCol.isNull[batchIndex] = true;
+      outputCol.noNulls = false;
+    } else {
+      outputCol.isNull[batchIndex] = false;
+      outputCol.isRepeating = false;
+      byte[] outputbuf = myagg.estimator.serialize();
+      outputCol.setRef(batchIndex, outputbuf, 0, outputbuf.length);
+    }
+  }
+
+  /**
+   * Aggregation buffer holding one lazily created KLL estimator.
+   */
+  static class Aggregation implements AggregationBuffer {
+
+    // null until prepare() is called, and again after reset()
+    KllHistogramEstimator estimator;
+
+    @Override
+    public int getVariableSize() {
+      // a buffer without an estimator holds no variable-size state
+      return estimator == null ? 0 : estimator.lengthFor(JavaDataModel.get());
+    }
+
+    @Override
+    public void reset() {
+      estimator = null;
+    }
+
+    /**
+     * Creates an empty estimator with the given k unless the buffer already has one.
+     */
+    public void prepare(int k) {
+      if (estimator == null) {
+        estimator = KllHistogramEstimatorFactory.getEmptyHistogramEstimator(k);
+      }
+    }
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java index acfaef7a354d..e922ce477964 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java @@ -464,7 +464,7 @@ public static boolean isFullOuterMapEnabled(HiveConf hiveConf, JoinOperator join HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_TEST_MAPJOINFULLOUTER_OVERRIDE); EnabledOverride mapJoinFullOuterOverride = - EnabledOverride.nameMap.get(testMapJoinFullOuterOverrideString); + EnabledOverride.NAME_MAP.get(testMapJoinFullOuterOverrideString); final boolean isEnabled = HiveConf.getBoolVar( diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 8e3408316fba..46ea0bcfc0cb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -39,12 +39,15 @@ import java.util.Stack; import java.util.TreeSet; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface; import org.apache.hadoop.hive.ql.exec.vector.expressions.ConstantVectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorCoalesce; +import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFComputeDsKllSketchDouble; +import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFComputeDsKllSketchFinal; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.DecimalColDivideDecimalScalar; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinAntiJoinLongOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinAntiJoinMultiKeyOperator; @@ -247,7 +250,7 
@@ public class Vectorizer implements PhysicalPlanResolver { - protected static transient final Logger LOG = LoggerFactory.getLogger(Vectorizer.class); + protected static final Logger LOG = LoggerFactory.getLogger(Vectorizer.class); private static final Pattern supportedDataTypesPattern; @@ -286,9 +289,11 @@ public class Vectorizer implements PhysicalPlanResolver { supportedDataTypesPattern = Pattern.compile(patternBuilder.toString()); } - private Set> supportedGenericUDFs = new HashSet<>(); + private final Set> supportedGenericUDFs = new HashSet<>(); - private Set supportedAggregationUdfs = new HashSet<>(); + private final Set supportedAggregationUdfs = Arrays.stream(VECTORIZABLE_UDAF.values()) + .map(e -> e.name().toLowerCase()) + .collect(Collectors.toSet()); // The set of virtual columns that vectorized readers *MAY* support. public static final ImmutableSet vectorizableVirtualColumns = @@ -296,18 +301,46 @@ public class Vectorizer implements PhysicalPlanResolver { private HiveConf hiveConf; + private enum VECTORIZABLE_UDAF { + MIN, + MAX, + COUNT, + SUM, + AVG, + VARIANCE, + VAR_POP, + VAR_SAMP, + STD, + STDDEV, + STDDEV_POP, + STDDEV_SAMP, + BLOOM_FILTER, + COMPUTE_BIT_VECTOR_HLL, + DS_KLL_SKETCH; + + @Override + public String toString() { + return name().toLowerCase(); + } + } + public enum EnabledOverride { NONE, DISABLE, ENABLE; - public static final Map nameMap = new HashMap<>(); + public static final Map NAME_MAP = new HashMap<>(); static { for (EnabledOverride vectorizationEnabledOverride : values()) { - nameMap.put( - vectorizationEnabledOverride.name().toLowerCase(), vectorizationEnabledOverride); + NAME_MAP.put( + vectorizationEnabledOverride.toString(), vectorizationEnabledOverride); } - }; + } + + @Override + public String toString() { + return name().toLowerCase(); + } } private boolean isVectorizationEnabled; @@ -510,21 +543,6 @@ public Vectorizer() { // For conditional expressions supportedGenericUDFs.add(GenericUDFIf.class); - - 
supportedAggregationUdfs.add("min"); - supportedAggregationUdfs.add("max"); - supportedAggregationUdfs.add("count"); - supportedAggregationUdfs.add("sum"); - supportedAggregationUdfs.add("avg"); - supportedAggregationUdfs.add("variance"); - supportedAggregationUdfs.add("var_pop"); - supportedAggregationUdfs.add("var_samp"); - supportedAggregationUdfs.add("std"); - supportedAggregationUdfs.add("stddev"); - supportedAggregationUdfs.add("stddev_pop"); - supportedAggregationUdfs.add("stddev_samp"); - supportedAggregationUdfs.add(BLOOM_FILTER_FUNCTION); - supportedAggregationUdfs.add("compute_bit_vector_hll"); } private class VectorTaskColumnInfo { @@ -2395,7 +2413,7 @@ public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticE HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_TEST_VECTORIZATION_ENABLED_OVERRIDE); vectorizationEnabledOverride = - EnabledOverride.nameMap.get(vectorizationEnabledOverrideString); + EnabledOverride.NAME_MAP.get(vectorizationEnabledOverrideString); isVectorizationEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED); @@ -4470,17 +4488,23 @@ public static ImmutablePair getVectorAggregationDe VectorizationContext vContext) throws HiveException { - VectorizedUDAFs annotation = - AnnotationUtils.getAnnotation(evaluator.getClass(), VectorizedUDAFs.class); - if (annotation == null) { - String issue = - "Evaluator " + evaluator.getClass().getSimpleName() + " does not have a " + - "vectorized UDAF annotation (aggregation: \"" + aggregationName + "\"). 
" + - "Vectorization not supported"; - return new ImmutablePair(null, issue); + Class[] vecAggrClasses; + // "ds_kll_sketch" needs special treatment because the UDAF is coming from data + // sketches library, we cannot add annotations there + if (aggregationName.equals(VECTORIZABLE_UDAF.DS_KLL_SKETCH.toString())) { + vecAggrClasses = new Class[] { + VectorUDAFComputeDsKllSketchDouble.class, VectorUDAFComputeDsKllSketchFinal.class + }; + } else { + VectorizedUDAFs annotation = + AnnotationUtils.getAnnotation(evaluator.getClass(), VectorizedUDAFs.class); + if (annotation == null) { + String issue = "Evaluator " + evaluator.getClass().getSimpleName() + " does not have a " + + "vectorized UDAF annotation (aggregation: \"" + aggregationName + "\"). " + "Vectorization not supported"; + return new ImmutablePair<>(null, issue); + } + vecAggrClasses = annotation.value(); } - final Class[] vecAggrClasses = annotation.value(); - // Not final since it may change later due to DECIMAL_64. ColumnVector.Type outputColVectorType = diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllHistogramEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllHistogramEstimator.java new file mode 100644 index 000000000000..4d0777c3ff98 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllHistogramEstimator.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.datasketches.kll;
+
+import org.apache.datasketches.kll.KllFloatsSketch;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+/**
+ * Histogram estimator backed by a {@link KllFloatsSketch}. Every value is converted to
+ * {@code float} before it is fed to the sketch.
+ */
+public class KllHistogramEstimator {
+
+  private final KllFloatsSketch kll;
+
+  /**
+   * Creates an estimator around a new sketch built with parameter {@code k}.
+   */
+  KllHistogramEstimator(int k) {
+    this.kll = new KllFloatsSketch(k);
+  }
+
+  /**
+   * Creates an estimator around an existing sketch; the sketch is used as-is, not copied.
+   */
+  KllHistogramEstimator(KllFloatsSketch kll) {
+    this.kll = kll;
+  }
+
+  /**
+   * Serializes the underlying sketch through {@link KllUtils#serializeKll}.
+   * @return the serialized sketch
+   * @throws RuntimeException wrapping any {@link IOException} raised while writing
+   */
+  public byte[] serialize() {
+    // try-with-resources closes the stream on the failure path as well
+    try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
+      KllUtils.serializeKll(bos, kll);
+      return bos.toByteArray();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Adds a long value to the sketch.
+   * @param v the value to add; it is converted to float, which can lose precision
+   */
+  public void addToEstimator(long v) {
+    kll.update((float) v);
+  }
+
+  /**
+   * Adds a double value to the sketch.
+   * @param d the value to add; it is narrowed to float
+   */
+  public void addToEstimator(double d) {
+    kll.update((float) d);
+  }
+
+  /**
+   * Adds a decimal value to the sketch.
+   * @param decimal the value to add, taken through {@code floatValue()}; must not be null
+   */
+  public void addToEstimator(HiveDecimal decimal) {
+    kll.update(decimal.floatValue());
+  }
+
+  /**
+   * Merges the sketch of another estimator into this one.
+   * @param o the estimator whose sketch is merged in
+   */
+  public void mergeEstimators(KllHistogramEstimator o) {
+    kll.merge(o.kll);
+  }
+
+  /**
+   * Returns the length of the underlying sketch as computed by {@link KllUtils#lengthFor}.
+   * @param model the java data model to compute the length for
+   * @return the length of the underlying sketch
+   */
+  public int lengthFor(JavaDataModel model) {
+    return KllUtils.lengthFor(model, kll);
+  }
+
+  /**
+   * @return the underlying sketch itself, not a copy
+   */
+  public KllFloatsSketch getSketch() {
+    return kll;
+  }
+
+  /**
+   * @return the k parameter reported by the underlying sketch
+   */
+  public int getK() {
+    return kll.getK();
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllHistogramEstimatorFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllHistogramEstimatorFactory.java
new file mode 100644
index 000000000000..72dc7d3b400c
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllHistogramEstimatorFactory.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.datasketches.kll;
+
+public class KllHistogramEstimatorFactory {
+
+  private KllHistogramEstimatorFactory() {
+    throw new AssertionError("Suppress default constructor for non instantiation");
+  }
+
+  /**
+   * This function deserializes the serialized KLL histogram estimator from a byte array.
+   * @param buf array holding the serialized estimator
+   * @param start index of the first byte to deserialize
+   * @param len number of bytes to deserialize, starting at {@code start}
+   * @return KLL histogram estimator
+   */
+  public static KllHistogramEstimator getKllHistogramEstimator(byte[] buf, int start, int len) {
+    return new KllHistogramEstimator(KllUtils.deserializeKll(buf, start, len));
+  }
+
+  /**
+   * This method creates an empty histogram estimator with a KLL sketch of a given k parameter.
+   * @param k the KLL parameter k for initializing the sketch
+   * @return an empty histogram estimator with a KLL sketch of a given k parameter
+   */
+  public static KllHistogramEstimator getEmptyHistogramEstimator(int k) {
+    return new KllHistogramEstimator(k);
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllUtils.java
new file mode 100644
index 000000000000..2d9c08b586de
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/datasketches/kll/KllUtils.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.datasketches.kll;
+
+import org.apache.datasketches.kll.KllFloatsSketch;
+import org.apache.datasketches.memory.Memory;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * KLL serialization utilities.
+ */
+public class KllUtils {
+
+  /** Size in bytes of the chunks used when draining an input stream. */
+  private static final int READ_BUFFER_SIZE = 4096;
+
+  private KllUtils() {
+    throw new AssertionError("Suppress default constructor for non instantiation");
+  }
+
+  /**
+   * Writes the sketch to the stream in the binary form produced by
+   * {@link KllFloatsSketch#toByteArray()}.
+   * @param out output stream to write to
+   * @param kll KLL sketch that needs to be serialized
+   * @throws IOException if an error occurs during serialization
+   */
+  public static void serializeKll(OutputStream out, KllFloatsSketch kll) throws IOException {
+    out.write(kll.toByteArray());
+  }
+
+  /**
+   * Deserializes a KLL sketch from a stream. The stream is read until its end, so it must
+   * contain nothing but the serialized sketch; the stream is not closed by this method.
+   * @param in input stream to be deserialized
+   * @return KLL sketch
+   * @throws IOException if errors occur while reading the stream
+   */
+  public static KllFloatsSketch deserializeKll(InputStream in) throws IOException {
+    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    final byte[] data = new byte[READ_BUFFER_SIZE];
+    int nRead;
+
+    while ((nRead = in.read(data, 0, data.length)) != -1) {
+      buffer.write(data, 0, nRead);
+    }
+
+    return KllFloatsSketch.heapify(Memory.wrap(buffer.toByteArray()));
+  }
+
+  /**
+   * Deserializes a KLL sketch from a region of a byte array.
+   * @param buf array holding the serialized sketch
+   * @param start index of the first byte of the serialized sketch
+   * @param len number of bytes to read, starting at {@code start}
+   * @return KLL sketch
+   * @throws RuntimeException wrapping any {@link IOException} raised while reading
+   */
+  public static KllFloatsSketch deserializeKll(byte[] buf, int start, int len) {
+    // try-with-resources closes the stream on the failure path as well
+    try (InputStream is = new ByteArrayInputStream(buf, start, len)) {
+      return deserializeKll(is);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Returns the length of the given KLL sketch according to the given java data model.
+   * @param model the java data model to compute the length; when {@code null}, the maximum
+   *              serialized size for the sketch's k and n is returned instead
+   * @param kll the KLL sketch to compute the length for
+   * @return the length of the given KLL sketch according to the given java data model
+   */
+  public static int lengthFor(JavaDataModel model, KllFloatsSketch kll) {
+    return model == null ? KllFloatsSketch.getMaxSerializedSizeBytes(kll.getK(), kll.getN())
+        : (int) model.lengthForByteArrayOfSize(kll.getSerializedSizeBytes());
+  }
+}
diff --git a/ql/src/test/queries/clientpositive/compute_kll_sketch.q b/ql/src/test/queries/clientpositive/compute_kll_sketch.q
new file mode 100644
index 000000000000..c7ff5d64363d
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/compute_kll_sketch.q
@@ -0,0 +1,67 @@
+--! qt:dataset:src
+--! qt:dataset:alltypesorc
+
+-- check that both call (aggregation column alone, aggregation column + sketch size)
+-- work with vectorization and that the computed values coincide
+set hive.vectorized.execution.enabled=true;
+select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 200)) from alltypesorc;
+select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float))) from alltypesorc;
+-- compare it against the non-vectorized execution
+set hive.vectorized.execution.enabled=false;
+select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 200)) from alltypesorc;
+select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float))) from alltypesorc;
+
+-- change the k parameter (data sketch size) for KLL to see if it's actually used
+set hive.vectorized.execution.enabled=true;
+select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 300)) from alltypesorc;
+set hive.vectorized.execution.enabled=false;
+select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 300)) from alltypesorc;
+
+-- START: series of tests covering different data types
+set hive.vectorized.execution.enabled=true;
+select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float), 200)) from alltypesorc;
+select 
ds_kll_stringify(ds_kll_sketch(cast (csmallint as float))) from alltypesorc; +set hive.vectorized.execution.enabled=false; +select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float), 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float))) from alltypesorc; + +set hive.vectorized.execution.enabled=true; +select ds_kll_stringify(ds_kll_sketch(cast (cint as float), 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cast (cint as float))) from alltypesorc; +set hive.vectorized.execution.enabled=false; +select ds_kll_stringify(ds_kll_sketch(cast (cint as float), 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cast (cint as float))) from alltypesorc; + +set hive.vectorized.execution.enabled=true; +select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float), 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float))) from alltypesorc; +set hive.vectorized.execution.enabled=false; +select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float), 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float))) from alltypesorc; + +set hive.vectorized.execution.enabled=true; +select ds_kll_stringify(ds_kll_sketch(cfloat, 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cfloat)) from alltypesorc; +set hive.vectorized.execution.enabled=false; +select ds_kll_stringify(ds_kll_sketch(cfloat, 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cfloat)) from alltypesorc; + +set hive.vectorized.execution.enabled=true; +select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float), 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float))) from alltypesorc; +set hive.vectorized.execution.enabled=false; +select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float), 200)) from alltypesorc; +select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float))) from alltypesorc; +-- END: series of tests covering 
different data types + +-- testing that the KLL sketch of two identical columns is equal +create table test_compute_kll (key1 int, key2 int); +insert overwrite table test_compute_kll select a.key, b.key from src as a, src as b; + +set hive.vectorized.execution.enabled=true; +select ds_kll_stringify(ds_kll_sketch(cast (key1 as float))) from test_compute_kll; +select ds_kll_stringify(ds_kll_sketch(cast (key2 as float))) from test_compute_kll; + +set hive.vectorized.execution.enabled=false; +select ds_kll_stringify(ds_kll_sketch(cast (key1 as float))) from test_compute_kll; +select ds_kll_stringify(ds_kll_sketch(cast (key2 as float))) from test_compute_kll; diff --git a/ql/src/test/results/clientpositive/llap/compute_kll_sketch.q.out b/ql/src/test/results/clientpositive/llap/compute_kll_sketch.q.out new file mode 100644 index 000000000000..b463c5ed5c8b --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/compute_kll_sketch.q.out @@ -0,0 +1,829 @@ +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 62.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float))) from alltypesorc +POSTHOOK: 
type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 62.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 62.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 62.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 300)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: 
default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 300)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 300 + min K : 300 + M : 8 + N : 9173 + Epsilon : 0.896% + Epsison PMF : 1.127% + Empty : false + Estimation Mode : true + Levels : 5 + Sorted : false + Buffer Capacity Items: 781 + Retained Items : 769 + Storage Bytes : 3124 + Min Value : -64.0 + Max Value : 62.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 300)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (ctinyint as float), 300)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 300 + min K : 300 + M : 8 + N : 9173 + Epsilon : 0.896% + Epsison PMF : 1.127% + Empty : false + Estimation Mode : true + Levels : 5 + Sorted : false + Buffer Capacity Items: 781 + Retained Items : 769 + Storage Bytes : 3124 + Min Value : -64.0 + Max Value : 62.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + 
Max Value : 16376.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + Max Value : 16376.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + Max Value : 16376.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (csmallint as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison 
PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + Max Value : 16376.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -1.07327936E9 + Max Value : 1.07368058E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -1.07327936E9 + Max Value : 1.07368058E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float), 200)) from alltypesorc +POSTHOOK: 
type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -1.07327936E9 + Max Value : 1.07368058E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cint as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -1.07327936E9 + Max Value : 1.07368058E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -2.14731162E9 + Max Value : 2.14549837E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float))) from alltypesorc +PREHOOK: 
type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -2.14731162E9 + Max Value : 2.14549837E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -2.14731162E9 + Max Value : 2.14549837E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cbigint as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage 
Bytes : 2192 + Min Value : -2.14731162E9 + Max Value : 2.14549837E9 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat, 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat, 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 79.553 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 79.553 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat, 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat, 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 
6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 79.553 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cfloat)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9173 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 535 + Storage Bytes : 2192 + Min Value : -64.0 + Max Value : 79.553 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + Max Value : 9763216.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 
200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + Max Value : 9763216.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float), 200)) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float), 200)) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + Max Value : 9763216.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float))) from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (cdouble as float))) from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 9174 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 6 + Sorted : false + Buffer Capacity Items: 547 + Retained Items : 536 + Storage Bytes : 2196 + Min Value : -16379.0 + Max Value : 9763216.0 +### End sketch summary + +PREHOOK: query: create table test_compute_kll (key1 int, key2 int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_compute_kll +POSTHOOK: query: create table test_compute_kll (key1 int, key2 int) 
+POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_compute_kll +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: insert overwrite table test_compute_kll select a.key, b.key from src as a, src as b +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_compute_kll +POSTHOOK: query: insert overwrite table test_compute_kll select a.key, b.key from src as a, src as b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_compute_kll +POSTHOOK: Lineage: test_compute_kll.key1 EXPRESSION [(src)a.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_compute_kll.key2 EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key1 as float))) from test_compute_kll +PREHOOK: type: QUERY +PREHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key1 as float))) from test_compute_kll +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 250000 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 11 + Sorted : false + Buffer Capacity Items: 601 + Retained Items : 598 + Storage Bytes : 2464 + Min Value : 0.0 + Max Value : 498.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key2 as float))) from test_compute_kll +PREHOOK: type: QUERY +PREHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key2 as float))) from test_compute_kll +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### + +### KLL sketch 
summary: + K : 200 + min K : 200 + M : 8 + N : 250000 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 11 + Sorted : false + Buffer Capacity Items: 601 + Retained Items : 598 + Storage Bytes : 2464 + Min Value : 0.0 + Max Value : 498.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key1 as float))) from test_compute_kll +PREHOOK: type: QUERY +PREHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key1 as float))) from test_compute_kll +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 250000 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 11 + Sorted : false + Buffer Capacity Items: 601 + Retained Items : 598 + Storage Bytes : 2464 + Min Value : 0.0 + Max Value : 498.0 +### End sketch summary + +PREHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key2 as float))) from test_compute_kll +PREHOOK: type: QUERY +PREHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### +POSTHOOK: query: select ds_kll_stringify(ds_kll_sketch(cast (key2 as float))) from test_compute_kll +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_compute_kll +#### A masked pattern was here #### + +### KLL sketch summary: + K : 200 + min K : 200 + M : 8 + N : 250000 + Epsilon : 1.329% + Epsison PMF : 1.652% + Empty : false + Estimation Mode : true + Levels : 11 + Sorted : false + Buffer Capacity Items: 601 + Retained Items : 598 + Storage Bytes : 2464 + Min Value : 0.0 + Max Value : 498.0 +### End sketch summary + diff --git a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out index 86d9bd2cd8a3..723901d0d25f 100644 --- 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out @@ -395,7 +395,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -433,7 +433,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -567,7 +567,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Map 6 Map Operator Tree: diff --git a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_ntile.q.out b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_ntile.q.out index 0cc4f720b749..57807528e429 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_ntile.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_ntile.q.out @@ -398,7 +398,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -436,7 +436,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -571,7 +571,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Map 6 Map Operator Tree: diff --git a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_percentile_disc.q.out b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_percentile_disc.q.out index aff6f0761365..e9452e24cfc7 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_percentile_disc.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_percentile_disc.q.out @@ -239,10 +239,10 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -335,7 +335,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Map 6 Map Operator Tree: diff --git a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rank.q.out b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rank.q.out index 7a815182a9c8..26a229278826 100644 --- 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rank.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rank.q.out @@ -398,7 +398,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -436,7 +436,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -571,7 +571,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Map 6 Map Operator Tree: diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out index 050e08384737..288359372fcb 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out @@ -148,7 +148,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -186,7 +186,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 
- Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -299,7 +299,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -371,7 +371,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 5 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -493,7 +493,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -531,7 +531,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -645,7 +645,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -683,7 +683,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce 
Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist_partition_by.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist_partition_by.q.out index 64235ba13f56..ac90339391f6 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist_partition_by.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist_partition_by.q.out @@ -120,7 +120,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 3 Data size: 687 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -159,7 +159,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile.q.out index 096e7e22e6e5..fc06a82e53f1 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile.q.out @@ -123,7 +123,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -161,7 +161,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: 
llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -285,7 +285,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -323,7 +323,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -439,7 +439,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -477,7 +477,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile_partition_by.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile_partition_by.q.out index bfad651dbee2..22b0bca67c0d 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile_partition_by.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_ntile_partition_by.q.out @@ -159,7 +159,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 3 Data size: 687 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 
(type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -198,7 +198,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_percentile_disc.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_percentile_disc.q.out index d6c691157eb9..8866944ced78 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_rewrite_percentile_disc.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_percentile_disc.q.out @@ -75,10 +75,10 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank.q.out index 2404321a261d..8357951d893e 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank.q.out @@ -123,7 +123,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -161,7 +161,7 @@ STAGE PLANS: output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -276,7 +276,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -348,7 +348,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 5 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -472,7 +472,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -510,7 +510,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) @@ -626,7 +626,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: boolean) Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -664,7 +664,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank_partition_by.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank_partition_by.q.out index e0052a4cba12..90dbb585e334 100644 --- a/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank_partition_by.q.out +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_rank_partition_by.q.out @@ -125,7 +125,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: char(1)) Statistics: Num rows: 3 Data size: 687 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: binary) - Execution mode: llap + Execution mode: vectorized, llap LLAP IO: may be used (ACID table) Reducer 2 Execution mode: llap @@ -164,7 +164,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Reducer 4 - Execution mode: llap + Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: ds_kll_sketch(VALUE._col0) diff --git a/vector-code-gen/src/org/apache/hadoop/hive/tools/GenVectorCode.java b/vector-code-gen/src/org/apache/hadoop/hive/tools/GenVectorCode.java index 358680161aef..998b19cb0e2f 100644 --- a/vector-code-gen/src/org/apache/hadoop/hive/tools/GenVectorCode.java +++ b/vector-code-gen/src/org/apache/hadoop/hive/tools/GenVectorCode.java @@ -1214,10 +1214,15 @@ public class GenVectorCode extends Task { {"VectorUDAFSum", "VectorUDAFSumLong", "long"}, {"VectorUDAFSum", "VectorUDAFSumDouble", "double"}, + // "long" as for "MERGING" is ignored {"VectorUDAFComputeBitVector", "VectorUDAFComputeBitVectorFinal", "long", "MERGING"}, {"VectorUDAFComputeBitVector", "VectorUDAFComputeBitVectorLong", "long", "COMPLETE"}, {"VectorUDAFComputeBitVector", 
"VectorUDAFComputeBitVectorDouble", "double", "COMPLETE"}, + // "double" as for "MERGING" is ignored + {"VectorUDAFComputeDsKllSketch", "VectorUDAFComputeDsKllSketchFinal", "double", "MERGING"}, + {"VectorUDAFComputeDsKllSketch", "VectorUDAFComputeDsKllSketchDouble", "double", "COMPLETE"}, + // Template, , , {"VectorUDAFAvg", "VectorUDAFAvgLong", "long", "PARTIAL1"}, @@ -1471,7 +1476,9 @@ private void generate() throws Exception { } else if (tdesc[0].equals("VectorUDAFAvgDecimalMerge")) { generateVectorUDAFAvgMerge(tdesc); } else if (tdesc[0].equals("VectorUDAFComputeBitVector")) { - generateVectorUDAFComputeBitVector(tdesc); + generateVectorUDAFDataSummary(tdesc); + } else if (tdesc[0].equals("VectorUDAFComputeDsKllSketch")) { + generateVectorUDAFDataSummary(tdesc); } else if (tdesc[0].equals("VectorUDAFVar")) { generateVectorUDAFVar(tdesc); } else if (tdesc[0].equals("VectorUDAFVarDecimal")) { @@ -1940,7 +1947,7 @@ private void generateVectorUDAFAvg(String[] tdesc) throws Exception { className, templateString); } - private void generateVectorUDAFComputeBitVector(String[] tdesc) throws Exception { + private void generateVectorUDAFDataSummary(String[] tdesc) throws Exception { String className = tdesc[1]; String valueType = tdesc[2]; String columnType = getColumnVectorType(valueType);