From e2bdf33c947a5982a99dad4736105a4298345544 Mon Sep 17 00:00:00 2001 From: "Chen, Junjie" Date: Fri, 26 Jul 2019 05:25:52 +0800 Subject: [PATCH 01/19] Add argument validation to HadoopTables#create (#298) --- .../java/org/apache/iceberg/hadoop/HadoopTables.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java index 553faef93e94..e0de97dca262 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java @@ -19,6 +19,8 @@ package org.apache.iceberg.hadoop; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; import java.util.Map; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; @@ -69,19 +71,24 @@ public Table load(String location) { * location. * * @param schema iceberg schema used to create the table - * @param spec partition specification + * @param spec partitioning spec, if null the table will be unpartitioned + * @param properties a string map of table properties, initialized to empty if null * @param location a path URI (e.g. hdfs:///warehouse/my_table) * @return newly created table implementation */ @Override public Table create(Schema schema, PartitionSpec spec, Map properties, String location) { + Preconditions.checkNotNull(schema, "A table schema is required"); + TableOperations ops = newTableOps(location); if (ops.current() != null) { throw new AlreadyExistsException("Table already exists at location: " + location); } - TableMetadata metadata = TableMetadata.newTableMetadata(ops, schema, spec, location, properties); + Map tableProps = properties == null ? ImmutableMap.of() : properties; + PartitionSpec partitionSpec = spec == null ? 
PartitionSpec.unpartitioned() : spec; + TableMetadata metadata = TableMetadata.newTableMetadata(ops, schema, partitionSpec, location, tableProps); ops.commit(null, metadata); return new BaseTable(ops, location); From 1596d61d5aee12cb089affa7b9113ece74025e8d Mon Sep 17 00:00:00 2001 From: David Phillips Date: Thu, 25 Jul 2019 14:30:47 -0700 Subject: [PATCH 02/19] Install source JAR when running install target (#310) --- build.gradle | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build.gradle b/build.gradle index 282ce61d735f..fe08286d0494 100644 --- a/build.gradle +++ b/build.gradle @@ -64,6 +64,10 @@ subprojects { apply plugin: 'maven' // make pom files for deployment apply plugin: 'nebula.maven-base-publish' + artifacts { + archives sourceJar + } + compileJava { options.encoding = "UTF-8" } From daf062053a87b2333bf2c0d2cd3c3676ab264989 Mon Sep 17 00:00:00 2001 From: moulimukherjee Date: Fri, 26 Jul 2019 14:06:30 -0700 Subject: [PATCH 03/19] Add projectStrict for Dates and Timestamps (#283) --- .../iceberg/expressions/Projections.java | 14 +- .../expressions/ResidualEvaluator.java | 71 +++-- .../org/apache/iceberg/transforms/Dates.java | 7 +- .../iceberg/transforms/ProjectionUtil.java | 84 +++++ .../apache/iceberg/transforms/Timestamps.java | 7 +- .../transforms/TestDatesProjection.java | 217 +++++++++++++ .../transforms/TestTimestampsProjection.java | 293 ++++++++++++++++++ 7 files changed, 658 insertions(+), 35 deletions(-) create mode 100644 api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java create mode 100644 api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java diff --git a/api/src/main/java/org/apache/iceberg/expressions/Projections.java b/api/src/main/java/org/apache/iceberg/expressions/Projections.java index 50d0693d4590..f800b350858e 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Projections.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Projections.java @@ -221,9 +221,10 @@ public Expression predicate(BoundPredicate pred) { // similarly, if partitioning by day(ts) and hour(ts), the more restrictive // projection should be used. ts = 2019-01-01T01:00:00 produces day=2019-01-01 and // hour=2019-01-01-01. the value will be in 2019-01-01-01 and not in 2019-01-01-02. - result = Expressions.and( - result, - ((Transform) part.transform()).project(part.name(), pred)); + UnboundPredicate inclusiveProjection = ((Transform) part.transform()).project(part.name(), pred); + if (inclusiveProjection != null) { + result = Expressions.and(result, inclusiveProjection); + } } return result; @@ -251,9 +252,10 @@ public Expression predicate(BoundPredicate pred) { // any timestamp where either projection predicate is true must match the original // predicate. For example, ts = 2019-01-01T03:00:00 matches the hour projection but not // the day, but does match the original predicate. 
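      // a transform may not define a strict projection for a predicate; a null projection
      // carries no information and is simply skipped when combining results below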
-      result = Expressions.or(
-          result,
-          ((Transform<T, ?>) part.transform()).projectStrict(part.name(), pred));
+      UnboundPredicate<?> strictProjection =
+          ((Transform<T, ?>) part.transform()).projectStrict(part.name(), pred);
+      if (strictProjection != null) {
+        result = Expressions.or(result, strictProjection);
+      }
     }
 
     return result;

diff --git a/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java
index 0fae40d3decb..dd3a0b8da8a0 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java
@@ -19,12 +19,9 @@
 package org.apache.iceberg.expressions;
 
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
 import java.io.Serializable;
 import java.util.Comparator;
 import java.util.List;
-import java.util.Objects;
 import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.StructLike;
@@ -200,10 +197,15 @@ public <T> Expression notEq(BoundReference<T> ref, Literal<T> lit) {
   @Override
   @SuppressWarnings("unchecked")
   public <T> Expression predicate(BoundPredicate<T> pred) {
-    // Get the strict projection of this predicate in partition data, then use it to determine
-    // whether to return the original predicate. The strict projection returns true iff the
-    // original predicate would have returned true, so the predicate can be eliminated if the
-    // strict projection evaluates to true.
+    // Get the strict projection and the inclusive projection of this predicate in partition
+    // data, then use them to determine whether to return the original predicate. The strict
+    // projection returns true iff the original predicate would have returned true, so the
+    // predicate can be eliminated if the strict projection evaluates to true. Similarly, the
+    // inclusive projection returns false iff the original predicate would have returned false,
+    // so the predicate can also be eliminated if the inclusive projection evaluates to false.
+    //
+    // If neither projection is conclusive, then return the original predicate.
     List<PartitionField> parts = spec.getFieldsBySourceId(pred.ref().fieldId());
@@ -211,31 +213,50 @@ public <T> Expression predicate(BoundPredicate<T> pred) {
       return pred; // not associated with a partition field, can't be evaluated
     }
 
-    List<UnboundPredicate<?>> strictProjections = Lists.transform(parts,
-        part -> ((Transform<T, ?>) part.transform()).projectStrict(part.name(), pred));
+    for (PartitionField part : parts) {
 
-    if (Iterables.all(strictProjections, Objects::isNull)) {
-      // if there are no strict projections, the predicate must be in the residual
-      return pred;
-    }
+      // check the strict projection first
+      UnboundPredicate<?> strictProjection =
+          ((Transform<T, ?>) part.transform()).projectStrict(part.name(), pred);
+      Expression strictResult = null;
+
+      if (strictProjection != null) {
+        Expression bound = strictProjection.bind(spec.partitionType(), caseSensitive);
+        if (bound instanceof BoundPredicate) {
+          strictResult = super.predicate((BoundPredicate) bound);
+        } else {
+          // if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse
+          strictResult = bound;
+        }
+      }
 
-    Expression result = Expressions.alwaysFalse();
-    for (UnboundPredicate<?> strictProjection : strictProjections) {
-      if (strictProjection == null) {
-        continue;
+      if (strictResult != null && strictResult.op() == Expression.Operation.TRUE) {
+        // the strict projection is always true, so the original predicate is guaranteed
+        return Expressions.alwaysTrue();
       }
 
-      Expression bound = strictProjection.bind(spec.partitionType(), caseSensitive);
-      if (bound instanceof BoundPredicate) {
-        // evaluate the bound predicate, which will return alwaysTrue or alwaysFalse
-        result = Expressions.or(result, super.predicate((BoundPredicate) bound));
-      } else {
-        // update the result expression with the non-predicate residual (e.g. alwaysTrue)
-        result = Expressions.or(result, bound);
+      // check the inclusive projection
+      UnboundPredicate<?> inclusiveProjection =
+          ((Transform<T, ?>) part.transform()).project(part.name(), pred);
+      Expression inclusiveResult = null;
+      if (inclusiveProjection != null) {
+        Expression boundInclusive = inclusiveProjection.bind(spec.partitionType(), caseSensitive);
+        if (boundInclusive instanceof BoundPredicate) {
+          // evaluate the bound inclusive projection against the partition data
+          inclusiveResult = super.predicate((BoundPredicate) boundInclusive);
+        } else {
+          // if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse
+          inclusiveResult = boundInclusive;
+        }
       }
+
+      if (inclusiveResult != null && inclusiveResult.op() == Expression.Operation.FALSE) {
+        // the inclusive projection is always false, so the original predicate can never be true
+        return Expressions.alwaysFalse();
+      }
     }
 
-    return result;
+    // neither the strict nor the inclusive projection was conclusive, so return the original predicate
+    return pred;
   }
 
   @Override
diff --git a/api/src/main/java/org/apache/iceberg/transforms/Dates.java b/api/src/main/java/org/apache/iceberg/transforms/Dates.java
index a57d6d95eaa4..94714d2e7069 100644
--- a/api/src/main/java/org/apache/iceberg/transforms/Dates.java
+++ b/api/src/main/java/org/apache/iceberg/transforms/Dates.java
@@ -73,8 +73,11 @@ public UnboundPredicate<Integer> project(String fieldName, BoundPredicate<Integer> pred) {
   }
 
   @Override
-  public UnboundPredicate<Integer> projectStrict(String fieldName, BoundPredicate<Integer> predicate) {
-    return null;
+  public UnboundPredicate<Integer> projectStrict(String fieldName, BoundPredicate<Integer> pred) {
+    if (pred.op() == NOT_NULL || pred.op() == IS_NULL) {
+      return Expressions.predicate(pred.op(), fieldName);
+    }
+    return ProjectionUtil.truncateIntegerStrict(fieldName, pred, this);
   }
 
   @Override
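A minimal sketch of the strict/inclusive asymmetry these changes implement, using only APIs
exercised by the new tests below. The ProjectionDemo class name is invented for the sketch,
and the printed expression forms are indicative rather than exact.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Literal;
import org.apache.iceberg.expressions.Projections;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.expressions.Expressions.lessThan;
import static org.apache.iceberg.types.Types.NestedField.optional;

public class ProjectionDemo {
  public static void main(String[] args) {
    Schema schema = new Schema(optional(1, "date", Types.DateType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).month("date").build();

    // 2017-01-15 sits in the middle of the 2017-01 partition
    Integer date = (Integer) Literal.of("2017-01-15").to(Types.DateType.get()).value();

    // inclusive: keeps every partition that may contain rows where date < 2017-01-15,
    // so the 2017-01 partition itself is kept (month <= 2017-01)
    Expression inclusive = Projections.inclusive(spec).project(lessThan("date", date));

    // strict: keeps only partitions where every row satisfies date < 2017-01-15;
    // 2017-01 also contains later dates, so the bound falls back to month <= 2016-12
    Expression strict = Projections.strict(spec).project(lessThan("date", date));

    System.out.println(inclusive);
    System.out.println(strict);
  }
}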
diff --git a/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java b/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java
index 04da036e1637..ef1e0c3591ed 100644
--- a/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java
+++ b/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java
@@ -52,6 +52,90 @@ static UnboundPredicate<T> truncateInteger(
     }
   }
 
+  static UnboundPredicate<Integer> truncateIntegerStrict(
+      String name, BoundPredicate<Integer> pred, Transform<Integer, Integer> transform) {
+    int boundary = pred.literal().value();
+    switch (pred.op()) {
+      case LT:
+        // the strict predicate is <= the previous partition
+        return predicate(Expression.Operation.LT_EQ, name, transform.apply(boundary) - 1);
+      case LT_EQ:
+        // check whether the literal is at the upper partition boundary
+        if (transform.apply(boundary + 1).equals(transform.apply(boundary))) {
+          // the literal is not at the upper boundary, e.g. 2019-07-02T02:12:34.000000,
+          // so the predicate must end at the previous partition: <= 2019-07-01
+          return predicate(Expression.Operation.LT_EQ, name, transform.apply(boundary) - 1);
+        } else {
+          // the literal is at the upper boundary, e.g. 2019-07-02T23:59:59.999999,
+          // so the predicate can include the whole partition: <= 2019-07-02
+          return predicate(Expression.Operation.LT_EQ, name, transform.apply(boundary));
+        }
+      case GT:
+        // the strict predicate is >= the next partition
+        return predicate(Expression.Operation.GT_EQ, name, transform.apply(boundary) + 1);
+      case GT_EQ:
+        // check whether the literal is at the lower partition boundary
+        if (transform.apply(boundary - 1).equals(transform.apply(boundary))) {
+          // the literal is not at the lower boundary, e.g. 2019-07-02T02:12:34.000000,
+          // so the predicate must start at the next partition: >= 2019-07-03
+          return predicate(Expression.Operation.GT_EQ, name, transform.apply(boundary) + 1);
+        } else {
+          // the literal is at the lower boundary, e.g. 2019-07-02T00:00:00.000000,
+          // so the predicate can be >= 2019-07-02
+          return predicate(Expression.Operation.GT_EQ, name, transform.apply(boundary));
+        }
+      case NOT_EQ:
+        return predicate(Expression.Operation.NOT_EQ, name, transform.apply(boundary));
+      case EQ:
+        // there is no predicate that guarantees equality because adjacent ints transform to the same value
+        return null;
+      default:
+        return null;
+    }
+  }
+
+  static UnboundPredicate<Integer> truncateLongStrict(
+      String name, BoundPredicate<Long> pred, Transform<Long, Integer> transform) {
+    long boundary = pred.literal().value();
+    switch (pred.op()) {
+      case LT:
+        // the strict predicate is <= the previous partition
+        return predicate(Expression.Operation.LT_EQ, name, transform.apply(boundary) - 1);
+      case LT_EQ:
+        // check whether the literal is at the upper partition boundary
+        if (transform.apply(boundary + 1L).equals(transform.apply(boundary))) {
+          // the literal is not at the upper boundary, e.g. 2019-07-02T02:12:34.000000,
+          // so the predicate must end at the previous partition: <= 2019-07-01
+          return predicate(Expression.Operation.LT_EQ, name, transform.apply(boundary) - 1);
+        } else {
+          // the literal is at the upper boundary, e.g. 2019-07-02T23:59:59.999999,
+          // so the predicate can include the whole partition: <= 2019-07-02
+          return predicate(Expression.Operation.LT_EQ, name, transform.apply(boundary));
+        }
+      case GT:
+        // the strict predicate is >= the next partition
+        return predicate(Expression.Operation.GT_EQ, name, transform.apply(boundary) + 1);
+      case GT_EQ:
+        // check whether the literal is at the lower partition boundary
+        if (transform.apply(boundary - 1L).equals(transform.apply(boundary))) {
+          // the literal is not at the lower boundary, e.g. 2019-07-02T02:12:34.000000,
+          // so the predicate must start at the next partition: >= 2019-07-03
+          return predicate(Expression.Operation.GT_EQ, name, transform.apply(boundary) + 1);
+        } else {
+          // Literal was at
the lower boundary, for eg: 2019-07-02T00:00:00.0000 + // the predicate can be >= 2019-07-02 + return predicate(Expression.Operation.GT_EQ, name, transform.apply(boundary)); + } + case NOT_EQ: + return predicate(Expression.Operation.NOT_EQ, name, transform.apply(boundary)); + case EQ: + // there is no predicate that guarantees equality because adjacent longs transform to the same value + return null; + default: + return null; + } + } + static UnboundPredicate truncateLong( String name, BoundPredicate pred, Transform transform) { long boundary = pred.literal().value(); diff --git a/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java b/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java index f01ea050229c..7259defd85f3 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java @@ -76,8 +76,11 @@ public UnboundPredicate project(String fieldName, BoundPredicate } @Override - public UnboundPredicate projectStrict(String fieldName, BoundPredicate predicate) { - return null; + public UnboundPredicate projectStrict(String fieldName, BoundPredicate pred) { + if (pred.op() == NOT_NULL || pred.op() == IS_NULL) { + return Expressions.predicate(pred.op(), fieldName); + } + return ProjectionUtil.truncateLongStrict(fieldName, pred, this); } @Override diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java b/api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java new file mode 100644 index 000000000000..7b4ee7844b5b --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.transforms; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.expressions.Projections; +import org.apache.iceberg.expressions.UnboundPredicate; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.types.Types.NestedField.optional; + +public class TestDatesProjection { + private static final Types.DateType TYPE = Types.DateType.get(); + private static final Schema SCHEMA = new Schema(optional(1, "date", TYPE)); + + public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp, String expectedLiteral) { + + Expression projection = Projections.strict(spec).project(filter); + UnboundPredicate predicate = assertAndUnwrapUnbound(projection); + + Assert.assertEquals(expectedOp, predicate.op()); + + Literal literal = predicate.literal(); + Dates transform = (Dates) spec.getFieldsBySourceId(1).get(0).transform(); + String output = transform.toHumanString((int) literal.value()); + Assert.assertEquals(expectedLiteral, output); + } + + public void assertProjectionStrictValue(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp) { + + Expression projection = Projections.strict(spec).project(filter); + Assert.assertEquals(projection.op(), expectedOp); + } + + public void assertProjectionInclusiveValue(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp) { + + Expression projection = Projections.inclusive(spec).project(filter); + Assert.assertEquals(projection.op(), expectedOp); + } + + public void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp, String expectedLiteral) { + Expression projection = Projections.inclusive(spec).project(filter); + UnboundPredicate predicate = assertAndUnwrapUnbound(projection); + + Assert.assertEquals(predicate.op(), expectedOp); + + Literal literal = predicate.literal(); + Dates transform = (Dates) spec.getFieldsBySourceId(1).get(0).transform(); + String output = transform.toHumanString((int) literal.value()); + Assert.assertEquals(expectedLiteral, output); + } + + @Test + public void testMonthStrictLowerBound() { + Integer date = (Integer) Literal.of("2017-01-01").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); + + assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016-12"); + assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2016-12"); + assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-02"); + assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-01"); + assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017-01"); + 
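+    // strict projection of equal is alwaysFalse: other dates in the same month transform
+    // to the same partition value, so equality can never be guaranteed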
assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); + } + + @Test + public void testMonthStrictUpperBound() { + Integer date = (Integer) Literal.of("2017-12-31").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); + + assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2017-11"); + assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018-01"); + assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2018-01"); + assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017-12"); + assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); + } + + @Test + public void testMonthInclusiveLowerBound() { + Integer date = (Integer) Literal.of("2017-12-01").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); + + assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2017-11"); + assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017-12"); + assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); + } + + @Test + public void testMonthInclusiveUpperBound() { + Integer date = (Integer) Literal.of("2017-12-31").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); + + assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018-01"); + assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017-12"); + assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); + } + + @Test + public void testDayStrict() { + Integer date = (Integer) Literal.of("2017-01-01").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("date").build(); + + assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016-12-31"); + // should be the same date for <= + assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-01-01"); + assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-01-02"); + // should be the same date for >= + assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-01-01"); + assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017-01-01"); + assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); + } + + @Test + public void testDayInclusive() { + Integer date = (Integer) Literal.of("2017-01-01").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("date").build(); + + 
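+    // each date value is its own day partition, so the comparison bounds below line up
+    // exactly with the strict bounds in testDayStrict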
assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016-12-31"); + assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-01-01"); + assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-01-02"); + assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-01-01"); + assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017-01-01"); + assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); + } + + @Test + public void testYearStrictLowerBound() { + Integer date = (Integer) Literal.of("2017-01-01").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); + + assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017"); + assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); + } + + @Test + public void testYearStrictUpperBound() { + Integer date = (Integer) Literal.of("2017-12-31").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); + + assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017"); + assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); + } + + @Test + public void testYearInclusiveLowerBound() { + Integer date = (Integer) Literal.of("2017-01-01").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); + + assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017"); + assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); + } + + @Test + public void testYearInclusiveUpperBound() { + Integer date = (Integer) Literal.of("2017-12-31").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); + + assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, 
"2017"); + assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017"); + assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); + } +} diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java b/api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java new file mode 100644 index 000000000000..5d3c5b09085f --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.transforms; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.expressions.Projections; +import org.apache.iceberg.expressions.UnboundPredicate; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.types.Types.NestedField.optional; + +public class TestTimestampsProjection { + private static final Types.TimestampType TYPE = Types.TimestampType.withoutZone(); + private static final Schema SCHEMA = new Schema(optional(1, "timestamp", TYPE)); + + public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp, String expectedLiteral) { + + Expression projection = Projections.strict(spec).project(filter); + UnboundPredicate predicate = assertAndUnwrapUnbound(projection); + + Assert.assertEquals(expectedOp, predicate.op()); + + Literal literal = predicate.literal(); + Timestamps transform = (Timestamps) spec.getFieldsBySourceId(1).get(0).transform(); + String output = transform.toHumanString((int) literal.value()); + Assert.assertEquals(expectedLiteral, output); + } + + public void assertProjectionStrictValue(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp) { + + Expression projection = Projections.strict(spec).project(filter); + Assert.assertEquals(projection.op(), expectedOp); + } + + public void assertProjectionInclusiveValue(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp) { + + Expression projection = 
Projections.inclusive(spec).project(filter); + Assert.assertEquals(projection.op(), expectedOp); + } + + public void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, + Expression.Operation expectedOp, String expectedLiteral) { + Expression projection = Projections.inclusive(spec).project(filter); + UnboundPredicate predicate = assertAndUnwrapUnbound(projection); + + Assert.assertEquals(predicate.op(), expectedOp); + + Literal literal = predicate.literal(); + Timestamps transform = (Timestamps) spec.getFieldsBySourceId(1).get(0).transform(); + String output = transform.toHumanString((int) literal.value()); + Assert.assertEquals(expectedLiteral, output); + } + + @Test + public void testMonthStrictLowerBound() { + Long date = (long) Literal.of("2017-12-01T00:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-11"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2018-01"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testMonthStrictUpperBound() { + Long date = (long) Literal.of("2017-12-31T23:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2018-01"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2018-01"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testMonthInclusiveLowerBound() { + Long date = (long) Literal.of("2017-12-01T00:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11"); + assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12"); + assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } + + @Test + public void testMonthInclusiveUpperBound() { + Long date = (long) Literal.of("2017-12-01T23:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); + 
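+    // timestamps earlier than the cutoff can still fall inside 2017-12, so the inclusive
+    // bounds stay in the same month instead of moving back to 2017-11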
assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12"); + assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } + + @Test + public void testDayStrictLowerBound() { + Long date = (long) Literal.of("2017-12-01T00:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11-30"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-11-30"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-02"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testDayStrictUpperBound() { + Long date = (long) Literal.of("2017-12-01T23:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11-30"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-02"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-02"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testDayInclusiveLowerBound() { + Long date = (long) Literal.of("2017-12-01T00:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11-30"); + assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); + assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); + assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01"); + assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } + + @Test + public void testDayInclusiveUpperBound() { + Long date = (long) Literal.of("2017-12-01T23:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); + assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); + assertProjectionInclusive(spec, 
greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-02"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); + assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01"); + assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } + + @Test + public void testYearStrictLowerBound() { + Long date = (long) Literal.of("2017-01-01T00:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testYearStrictUpperBound() { + Long date = (long) Literal.of("2017-12-31T23:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testYearInclusiveLowerBound() { + Long date = (long) Literal.of("2017-01-01T00:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017"); + assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } + + @Test + public void testYearInclusiveUpperBound() { + Long date = (long) Literal.of("2017-12-31T23:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive(spec, 
equal("timestamp", date), Expression.Operation.EQ, "2017"); + assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } + + @Test + public void testHourStrictLowerBound() { + Long date = (long) Literal.of("2017-12-01T10:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-09"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-09"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-11"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01-10"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testHourStrictUpperBound() { + Long date = (long) Literal.of("2017-12-01T10:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); + + assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-09"); + assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); + assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-11"); + assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-11"); + assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01-10"); + assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); + } + + @Test + public void testHourInclusiveLowerBound() { + Long date = (long) Literal.of("2017-12-01T10:00:00.00000").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-09"); + assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); + assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); + assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01-10"); + assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } + + @Test + public void testHourInclusiveUpperBound() { + Long date = (long) Literal.of("2017-12-01T10:59:59.999999").to(TYPE).value(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); + + assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); + assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); + assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-11"); + assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); + assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01-10"); + 
assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); + } +} From 77a456a50da6c8ba0d77d7dc858b1041625fc158 Mon Sep 17 00:00:00 2001 From: David Phillips Date: Fri, 26 Jul 2019 14:51:00 -0700 Subject: [PATCH 04/19] Correctly publish artifacts on JitPack (#321) The Gradle install target produces invalid POM files that are missing the dependencyManagement section and versions for some dependencies. Instead, we directly tell JitPack to run the correct Gradle target. --- build.gradle | 4 ---- jitpack.yml | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) create mode 100644 jitpack.yml diff --git a/build.gradle b/build.gradle index fe08286d0494..282ce61d735f 100644 --- a/build.gradle +++ b/build.gradle @@ -64,10 +64,6 @@ subprojects { apply plugin: 'maven' // make pom files for deployment apply plugin: 'nebula.maven-base-publish' - artifacts { - archives sourceJar - } - compileJava { options.encoding = "UTF-8" } diff --git a/jitpack.yml b/jitpack.yml new file mode 100644 index 000000000000..abeb8eecceb3 --- /dev/null +++ b/jitpack.yml @@ -0,0 +1,2 @@ +install: + - ./gradlew publishToMavenLocal From 5366efedd7cee903da7db1624901484c3c6c6f9d Mon Sep 17 00:00:00 2001 From: Li Cheng Date: Tue, 30 Jul 2019 01:01:09 +0800 Subject: [PATCH 05/19] Add build info to README.md (#304) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 38bb70947cd1..76e9ac2925cc 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,9 @@ Community discussions happen primarily on the [dev mailing list][dev-list] or on Iceberg is built using Gradle 5.2.1. +* To invoke a build and run tests: `./gradlew build` +* To skip tests: `./gradlew build -x test` + Iceberg table support is organized in library modules: * `iceberg-common` contains utility classes used in other modules From c36740be1eed8f1773b9f48bcf498f56dc784d84 Mon Sep 17 00:00:00 2001 From: David Phillips Date: Mon, 29 Jul 2019 12:09:21 -0700 Subject: [PATCH 06/19] Convert Iceberg time type to Hive string type (#325) --- .../main/java/org/apache/iceberg/hive/HiveTypeConverter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hive/src/main/java/org/apache/iceberg/hive/HiveTypeConverter.java b/hive/src/main/java/org/apache/iceberg/hive/HiveTypeConverter.java index 31f3b73bd927..dc50c7a6e7fc 100644 --- a/hive/src/main/java/org/apache/iceberg/hive/HiveTypeConverter.java +++ b/hive/src/main/java/org/apache/iceberg/hive/HiveTypeConverter.java @@ -45,7 +45,7 @@ public static String convert(Type type) { case DATE: return "date"; case TIME: - throw new UnsupportedOperationException("Hive does not support time fields"); + return "string"; case TIMESTAMP: return "timestamp"; case STRING: From f4fc8ff7ef3495880ef6b360966f9c67544a4386 Mon Sep 17 00:00:00 2001 From: Arina Ielchiieva Date: Tue, 30 Jul 2019 19:31:58 +0300 Subject: [PATCH 07/19] Add overwrite option to write builders (#318) --- .../org/apache/iceberg/ManifestListWriter.java | 1 + .../java/org/apache/iceberg/ManifestWriter.java | 1 + .../main/java/org/apache/iceberg/avro/Avro.java | 12 +++++++++++- .../org/apache/iceberg/avro/AvroFileAppender.java | 6 +++--- orc/src/main/java/org/apache/iceberg/orc/ORC.java | 9 +++++++++ .../java/org/apache/iceberg/parquet/Parquet.java | 14 ++++++++++++-- .../org/apache/iceberg/parquet/ParquetWriter.java | 5 +++-- 7 files changed, 40 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/ManifestListWriter.java 
b/core/src/main/java/org/apache/iceberg/ManifestListWriter.java index 2d578e7420d9..0271695d32b9 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestListWriter.java +++ b/core/src/main/java/org/apache/iceberg/ManifestListWriter.java @@ -73,6 +73,7 @@ private static FileAppender newAppender(OutputFile file, Map FileAppender newAppender(FileFormat format, PartitionSpec .meta("schema", SchemaParser.toJson(spec.schema())) .meta("partition-spec", PartitionSpecParser.toJsonFields(spec)) .meta("partition-spec-id", String.valueOf(spec.specId())) + .overwrite() .build(); default: throw new IllegalArgumentException("Unsupported format: " + format); diff --git a/core/src/main/java/org/apache/iceberg/avro/Avro.java b/core/src/main/java/org/apache/iceberg/avro/Avro.java index 65ba9d833cef..720baec6d4d6 100644 --- a/core/src/main/java/org/apache/iceberg/avro/Avro.java +++ b/core/src/main/java/org/apache/iceberg/avro/Avro.java @@ -84,6 +84,7 @@ public static class WriteBuilder { private Map config = Maps.newHashMap(); private Map metadata = Maps.newLinkedHashMap(); private Function> createWriterFunc = GenericAvroWriter::new; + private boolean overwrite; private WriteBuilder(OutputFile file) { this.file = file; @@ -124,6 +125,15 @@ public WriteBuilder meta(Map properties) { return this; } + public WriteBuilder overwrite() { + return overwrite(true); + } + + public WriteBuilder overwrite(boolean enabled) { + this.overwrite = enabled; + return this; + } + private CodecFactory codec() { String codec = config.getOrDefault(AVRO_COMPRESSION, AVRO_COMPRESSION_DEFAULT); try { @@ -141,7 +151,7 @@ public FileAppender build() throws IOException { meta("iceberg.schema", SchemaParser.toJson(schema)); return new AvroFileAppender<>( - AvroSchemaUtil.convert(schema, name), file, createWriterFunc, codec(), metadata); + AvroSchemaUtil.convert(schema, name), file, createWriterFunc, codec(), metadata, overwrite); } } diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java b/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java index 8cc0712da9b1..a77b07d0b8e0 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java @@ -39,8 +39,9 @@ class AvroFileAppender implements FileAppender { AvroFileAppender(Schema schema, OutputFile file, Function> createWriterFunc, - CodecFactory codec, Map metadata) throws IOException { - this.stream = file.create(); + CodecFactory codec, Map metadata, + boolean overwrite) throws IOException { + this.stream = overwrite ? 
file.createOrOverwrite() : file.create(); this.writer = newAvroWriter(schema, stream, createWriterFunc, codec, metadata); } @@ -92,7 +93,6 @@ private static DataFileWriter newAvroWriter( writer.setMeta(entry.getKey(), entry.getValue()); } - // TODO: support overwrite return writer.create(schema, stream); } } diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java index a014d2a042f5..0f7ee46cff96 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java @@ -89,6 +89,15 @@ public WriteBuilder schema(Schema newSchema) { return this; } + public WriteBuilder overwrite() { + return overwrite(true); + } + + public WriteBuilder overwrite(boolean enabled) { + OrcConf.OVERWRITE_OUTPUT_FILE.setBoolean(conf, enabled); + return this; + } + public FileAppender build() { Preconditions.checkNotNull(schema, "Schema is required"); return new OrcFileAppender<>(TypeConversion.toOrc(schema, new ColumnIdMap()), diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 494f2c57885e..b7a31d0e7600 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -85,6 +85,7 @@ public static class WriteBuilder { private Map config = Maps.newLinkedHashMap(); private Function> createWriterFunc = null; private MetricsConfig metricsConfig = MetricsConfig.getDefault(); + private ParquetFileWriter.Mode writeMode = ParquetFileWriter.Mode.CREATE; private WriteBuilder(OutputFile file) { this.file = file; @@ -138,6 +139,15 @@ public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) { return this; } + public WriteBuilder overwrite() { + return overwrite(true); + } + + public WriteBuilder overwrite(boolean enabled) { + this.writeMode = enabled ? 
ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE; + return this; + } + @SuppressWarnings("unchecked") private WriteSupport getWriteSupport(MessageType type) { if (writeSupport != null) { @@ -201,7 +211,7 @@ public FileAppender build() throws IOException { return new org.apache.iceberg.parquet.ParquetWriter<>( conf, file, schema, rowGroupSize, metadata, createWriterFunc, codec(), - parquetProperties, metricsConfig); + parquetProperties, metricsConfig, writeMode); } else { return new ParquetWriteAdapter<>(new ParquetWriteBuilder(ParquetIO.file(file)) .withWriterVersion(writerVersion) @@ -210,7 +220,7 @@ conf, file, schema, rowGroupSize, metadata, createWriterFunc, codec(), .setKeyValueMetadata(metadata) .setWriteSupport(getWriteSupport(type)) .withCompressionCodec(codec()) - .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) // TODO: support modes + .withWriteMode(writeMode) .withRowGroupSize(rowGroupSize) .withPageSize(pageSize) .withDictionaryPageSize(dictionaryPageSize) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java index c0956a11684e..593de9b9c778 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java @@ -83,7 +83,8 @@ class ParquetWriter implements FileAppender, Closeable { Function> createWriterFunc, CompressionCodecName codec, ParquetProperties properties, - MetricsConfig metricsConfig) { + MetricsConfig metricsConfig, + ParquetFileWriter.Mode writeMode) { this.output = output; this.targetRowGroupSize = rowGroupSize; this.props = properties; @@ -95,7 +96,7 @@ class ParquetWriter implements FileAppender, Closeable { try { this.writer = new ParquetFileWriter(ParquetIO.file(output, conf), parquetSchema, - ParquetFileWriter.Mode.OVERWRITE, rowGroupSize, 0); + writeMode, rowGroupSize, 0); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to create Parquet file"); } From 08e087335203864b031fa85bd5296a9f5f535b50 Mon Sep 17 00:00:00 2001 From: Daniel Weeks Date: Tue, 30 Jul 2019 09:35:01 -0700 Subject: [PATCH 08/19] Fix out of order Pig partition fields (#326) --- .../org/apache/iceberg/pig/IcebergPigInputFormat.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java b/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java index ca2529b03ccb..e9b41ec8ef66 100644 --- a/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java +++ b/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java @@ -197,12 +197,16 @@ private boolean advance() throws IOException { if (hasJoinedPartitionColumns) { Schema readSchema = TypeUtil.selectNot(projectedSchema, idColumns); - Schema partitionSchema = TypeUtil.select(tableSchema, idColumns); Schema projectedPartitionSchema = TypeUtil.select(projectedSchema, idColumns); + Map partitionSpecFieldIndexMap = Maps.newHashMap(); + for(int i=0; i Date: Wed, 31 Jul 2019 14:44:05 -0700 Subject: [PATCH 09/19] Add mapping to Iceberg for external name-based schemas (#338) --- .../java/org/apache/iceberg/SchemaUpdate.java | 32 ++- .../org/apache/iceberg/TableProperties.java | 2 + .../apache/iceberg/mapping/MappedField.java | 96 +++++++ .../apache/iceberg/mapping/MappedFields.java | 110 ++++++++ .../apache/iceberg/mapping/MappingUtil.java | 261 ++++++++++++++++++ .../apache/iceberg/mapping/NameMapping.java | 66 +++++ 
.../iceberg/mapping/NameMappingParser.java | 144 ++++++++++ .../org/apache/iceberg/util/JsonUtil.java | 10 + .../org/apache/iceberg/TableTestBase.java | 6 +- .../java/org/apache/iceberg/TestTables.java | 8 +- .../iceberg/mapping/TestMappingUpdates.java | 249 +++++++++++++++++ .../iceberg/mapping/TestNameMapping.java | 250 +++++++++++++++++ 12 files changed, 1226 insertions(+), 8 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/mapping/MappedField.java create mode 100644 core/src/main/java/org/apache/iceberg/mapping/MappedFields.java create mode 100644 core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java create mode 100644 core/src/main/java/org/apache/iceberg/mapping/NameMapping.java create mode 100644 core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java create mode 100644 core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java create mode 100644 core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java diff --git a/core/src/main/java/org/apache/iceberg/SchemaUpdate.java b/core/src/main/java/org/apache/iceberg/SchemaUpdate.java index 48cbb049f31a..d94f449b4bb4 100644 --- a/core/src/main/java/org/apache/iceberg/SchemaUpdate.java +++ b/core/src/main/java/org/apache/iceberg/SchemaUpdate.java @@ -28,9 +28,14 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import org.apache.iceberg.mapping.MappingUtil; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -39,6 +44,7 @@ * Schema evolution API implementation. 
*/ class SchemaUpdate implements UpdateSchema { + private static final Logger LOG = LoggerFactory.getLogger(SchemaUpdate.class); private static final int TABLE_ROOT_ID = -1; private final TableOperations ops; @@ -200,7 +206,7 @@ public Schema apply() { @Override public void commit() { - TableMetadata update = base.updateSchema(apply(), lastColumnId); + TableMetadata update = applyChangesToMapping(base.updateSchema(apply(), lastColumnId)); ops.commit(base, update); } @@ -210,6 +216,30 @@ private int assignNewColumnId() { return next; } + private TableMetadata applyChangesToMapping(TableMetadata metadata) { + String mappingJson = metadata.property(TableProperties.DEFAULT_NAME_MAPPING, null); + if (mappingJson != null) { + try { + // parse and update the mapping + NameMapping mapping = NameMappingParser.fromJson(mappingJson); + NameMapping updated = MappingUtil.update(mapping, updates, adds); + + // replace the table property + Map updatedProperties = Maps.newHashMap(); + updatedProperties.putAll(metadata.properties()); + updatedProperties.put(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(updated)); + + return metadata.replaceProperties(updatedProperties); + + } catch (RuntimeException e) { + // log the error, but do not fail the update + LOG.warn("Failed to update external schema mapping: {}", mappingJson, e); + } + } + + return metadata; + } + private static Schema applyChanges(Schema schema, List deletes, Map updates, Multimap adds) { diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index d426dd3ba878..4dd778d165b8 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -91,4 +91,6 @@ private TableProperties() {} public static final String DEFAULT_WRITE_METRICS_MODE = "write.metadata.metrics.default"; public static final String DEFAULT_WRITE_METRICS_MODE_DEFAULT = "truncate(16)"; + + public static final String DEFAULT_NAME_MAPPING = "schema.name-mapping.default"; } diff --git a/core/src/main/java/org/apache/iceberg/mapping/MappedField.java b/core/src/main/java/org/apache/iceberg/mapping/MappedField.java new file mode 100644 index 000000000000..2a56afac4563 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/mapping/MappedField.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mapping; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableSet; +import java.util.Objects; +import java.util.Set; + +/** + * An immutable mapping between a field ID and a set of names. 
+ */ +public class MappedField { + + static MappedField of(Integer id, String name) { + return new MappedField(id, ImmutableSet.of(name), null); + } + + static MappedField of(Integer id, Iterable names) { + return new MappedField(id, names, null); + } + + static MappedField of(Integer id, String name, MappedFields nestedMapping) { + return new MappedField(id, ImmutableSet.of(name), nestedMapping); + } + + static MappedField of(Integer id, Iterable names, MappedFields nestedMapping) { + return new MappedField(id, names, nestedMapping); + } + + private final Set names; + private Integer id; + private MappedFields nestedMapping; + + private MappedField(Integer id, Iterable names, MappedFields nested) { + this.id = id; + this.names = ImmutableSet.copyOf(names); + this.nestedMapping = nested; + } + + public Integer id() { + return id; + } + + public Set names() { + return names; + } + + public MappedFields nestedMapping() { + return nestedMapping; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + MappedField that = (MappedField) other; + return names.equals(that.names) && + Objects.equals(id, that.id) && + Objects.equals(nestedMapping, that.nestedMapping); + } + + @Override + public int hashCode() { + return Objects.hash(names, id, nestedMapping); + } + + @Override + public String toString() { + return "([" + Joiner.on(", ").join(names) + "] -> " + (id != null ? id : "?") + + (nestedMapping != null ? ", " + nestedMapping + ")" : ")"); + } +} diff --git a/core/src/main/java/org/apache/iceberg/mapping/MappedFields.java b/core/src/main/java/org/apache/iceberg/mapping/MappedFields.java new file mode 100644 index 000000000000..824257b88761 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/mapping/MappedFields.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mapping; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public class MappedFields { + + static MappedFields of(MappedField... 
fields) { + return new MappedFields(ImmutableList.copyOf(fields)); + } + + static MappedFields of(List fields) { + return new MappedFields(fields); + } + + private final List fields; + private final Map nameToId; + private final Map idToField; + + private MappedFields(List fields) { + this.fields = ImmutableList.copyOf(fields); + this.nameToId = indexIds(fields); + this.idToField = indexFields(fields); + } + + public MappedField field(int id) { + return idToField.get(id); + } + + public Integer id(String name) { + return nameToId.get(name); + } + + public int size() { + return fields.size(); + } + + private static Map indexIds(List fields) { + ImmutableMap.Builder builder = ImmutableMap.builder(); + fields.forEach(field -> + field.names().forEach(name -> { + Integer id = field.id(); + if (id != null) { + builder.put(name, id); + } + })); + return builder.build(); + } + + private static Map indexFields(List fields) { + ImmutableMap.Builder builder = ImmutableMap.builder(); + fields.forEach(field -> { + Integer id = field.id(); + if (id != null) { + builder.put(id, field); + } + }); + return builder.build(); + } + + public List fields() { + return fields; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + return fields.equals(((MappedFields) other).fields); + } + + @Override + public int hashCode() { + return Objects.hash(fields); + } + + @Override + public String toString() { + return "[ " + Joiner.on(", ").join(fields) + " ]"; + } +} diff --git a/core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java b/core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java new file mode 100644 index 000000000000..95d8b4e047a5 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mapping; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +public class MappingUtil { + private static final Joiner DOT = Joiner.on('.'); + + private MappingUtil() { + } + + /** + * Create a name-based mapping for a schema. + *
+ * The mapping returned by this method will use the schema's name for each field. + * + * @param schema a {@link Schema} + * @return a {@link NameMapping} initialized with the schema's fields and names + */ + public static NameMapping create(Schema schema) { + return new NameMapping(TypeUtil.visit(schema, CreateMapping.INSTANCE)); + } + + /** + * Update a name-based mapping using changes to a schema. + * + * @param mapping a name-based mapping + * @param updates a map from field ID to updated field definitions + * @param adds a map from parent field ID to nested fields to be added + * @return an updated mapping with names added to renamed fields and the mapping extended for new fields + */ + public static NameMapping update(NameMapping mapping, + Map updates, + Multimap adds) { + return new NameMapping(visit(mapping, new UpdateMapping(updates, adds))); + } + + static Map indexById(MappedFields mapping) { + return visit(mapping, new IndexById()); + } + + static Map indexByName(MappedFields mapping) { + return visit(mapping, IndexByName.INSTANCE); + } + + private static class UpdateMapping implements Visitor { + private final Map updates; + private final Multimap adds; + + private UpdateMapping(Map updates, Multimap adds) { + this.updates = updates; + this.adds = adds; + } + + @Override + public MappedFields mapping(NameMapping mapping, MappedFields result) { + return addNewFields(result, -1 /* parent ID used to add top-level fields */); + } + + @Override + public MappedFields fields(MappedFields fields, List fieldResults) { + return MappedFields.of(fieldResults); + } + + @Override + public MappedField field(MappedField field, MappedFields fieldResult) { + // update this field's names + Set fieldNames = Sets.newHashSet(field.names()); + Types.NestedField update = updates.get(field.id()); + if (update != null) { + fieldNames.add(update.name()); + } + + // add a new mapping for any new nested fields + MappedFields nestedMapping = addNewFields(fieldResult, field.id()); + return MappedField.of(field.id(), fieldNames, nestedMapping); + } + + private MappedFields addNewFields(MappedFields mapping, int parentId) { + Collection fieldsToAdd = adds.get(parentId); + if (fieldsToAdd == null || fieldsToAdd.isEmpty()) { + return mapping; + } + + List fields = Lists.newArrayList(); + if (mapping != null) { + fields.addAll(mapping.fields()); + } + + for (Types.NestedField add : fieldsToAdd) { + MappedFields nestedMapping = TypeUtil.visit(add.type(), CreateMapping.INSTANCE); + fields.add(MappedField.of(add.fieldId(), add.name(), nestedMapping)); + } + + return MappedFields.of(fields); + } + } + + private static class IndexByName implements Visitor, Map> { + static final IndexByName INSTANCE = new IndexByName(); + + @Override + public Map mapping(NameMapping mapping, Map result) { + return result; + } + + @Override + public Map fields(MappedFields fields, List> fieldResults) { + // merge the results of each field + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (Map results : fieldResults) { + builder.putAll(results); + } + return builder.build(); + } + + @Override + public Map field(MappedField field, Map fieldResult) { + ImmutableMap.Builder builder = ImmutableMap.builder(); + + if (fieldResult != null) { + for (String name : field.names()) { + for (Map.Entry entry : fieldResult.entrySet()) { + String fullName = DOT.join(name, entry.getKey()); + builder.put(fullName, entry.getValue()); + } + } + } + + for (String name : field.names()) { + builder.put(name, field); + } + + return 
builder.build(); + } + } + + private static class IndexById implements Visitor, Map> { + private final Map result = Maps.newHashMap(); + + @Override + public Map mapping(NameMapping mapping, Map fieldsResult) { + return fieldsResult; + } + + @Override + public Map fields(MappedFields fields, List> fieldResults) { + return result; + } + + @Override + public Map field(MappedField field, Map fieldResult) { + Preconditions.checkState(!result.containsKey(field.id()), "Invalid mapping: ID %s is not unique", field.id()); + result.put(field.id(), field); + return result; + } + } + + private interface Visitor { + S mapping(NameMapping mapping, S result); + S fields(MappedFields fields, List fieldResults); + T field(MappedField field, S fieldResult); + } + + private static S visit(NameMapping mapping, Visitor visitor) { + return visitor.mapping(mapping, visit(mapping.asMappedFields(), visitor)); + } + + private static S visit(MappedFields mapping, Visitor visitor) { + if (mapping == null) { + return null; + } + + List fieldResults = Lists.newArrayList(); + for (MappedField field : mapping.fields()) { + fieldResults.add(visitor.field(field, visit(field.nestedMapping(), visitor))); + } + + return visitor.fields(mapping, fieldResults); + } + + private static class CreateMapping extends TypeUtil.SchemaVisitor { + private static final CreateMapping INSTANCE = new CreateMapping(); + + private CreateMapping() { + } + + @Override + public MappedFields schema(Schema schema, MappedFields structResult) { + return structResult; + } + + @Override + public MappedFields struct(Types.StructType struct, List fieldResults) { + List fields = Lists.newArrayListWithExpectedSize(fieldResults.size()); + + for (int i = 0; i < fieldResults.size(); i += 1) { + Types.NestedField field = struct.fields().get(i); + MappedFields result = fieldResults.get(i); + fields.add(MappedField.of(field.fieldId(), field.name(), result)); + } + + return MappedFields.of(fields); + } + + @Override + public MappedFields field(Types.NestedField field, MappedFields fieldResult) { + return fieldResult; + } + + @Override + public MappedFields list(Types.ListType list, MappedFields elementResult) { + return MappedFields.of(MappedField.of(list.elementId(), "element", elementResult)); + } + + @Override + public MappedFields map(Types.MapType map, MappedFields keyResult, MappedFields valueResult) { + return MappedFields.of( + MappedField.of(map.keyId(), "key", keyResult), + MappedField.of(map.valueId(), "value", valueResult) + ); + } + + @Override + public MappedFields primitive(Type.PrimitiveType primitive) { + return null; // no mapping because primitives have no nested fields + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/mapping/NameMapping.java b/core/src/main/java/org/apache/iceberg/mapping/NameMapping.java new file mode 100644 index 000000000000..2d8d4d80d90f --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/mapping/NameMapping.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mapping; + +import com.google.common.base.Joiner; +import java.util.List; +import java.util.Map; + +/** + * Represents a mapping from external schema names to Iceberg type IDs. + */ +public class NameMapping { + private static final Joiner DOT = Joiner.on('.'); + + private final MappedFields mapping; + private final Map fieldsById; + private final Map fieldsByName; + + NameMapping(MappedFields mapping) { + this.mapping = mapping; + this.fieldsById = MappingUtil.indexById(mapping); + this.fieldsByName = MappingUtil.indexByName(mapping); + } + + public MappedField find(int id) { + return fieldsById.get(id); + } + + public MappedField find(String... names) { + return fieldsByName.get(DOT.join(names)); + } + + public MappedField find(List names) { + return fieldsByName.get(DOT.join(names)); + } + + public MappedFields asMappedFields() { + return mapping; + } + + @Override + public String toString() { + if (mapping.fields().isEmpty()) { + return "[]"; + } else { + return "[\n " + Joiner.on("\n ").join(mapping.fields()) + "\n]"; + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java b/core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java new file mode 100644 index 000000000000..453165c3d4ff --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mapping; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import java.io.IOException; +import java.io.StringWriter; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.util.JsonUtil; + +/** + * Parses external name mappings from a JSON representation. + *
+ * [ { "field-id": 1, "names": ["id", "record_id"] },
+ *   { "field-id": 2, "names": ["data"] },
+ *   { "field-id": 3, "names": ["location"], "fields": [
+ *       { "field-id": 4, "names": ["latitude", "lat"] },
+ *       { "field-id": 5, "names": ["longitude", "long"] }
+ *     ] } ]
+ * </pre>
+ */ +public class NameMappingParser { + + private NameMappingParser() { + } + + private static final String FIELD_ID = "field-id"; + private static final String NAMES = "names"; + private static final String FIELDS = "fields"; + + public static String toJson(NameMapping mapping) { + try { + StringWriter writer = new StringWriter(); + JsonGenerator generator = JsonUtil.factory().createGenerator(writer); + generator.useDefaultPrettyPrinter(); + toJson(mapping, generator); + generator.flush(); + return writer.toString(); + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to write json for: %s", mapping); + } + } + + static void toJson(NameMapping nameMapping, JsonGenerator generator) throws IOException { + toJson(nameMapping.asMappedFields(), generator); + } + + private static void toJson(MappedFields mapping, JsonGenerator generator) throws IOException { + generator.writeStartArray(); + + for (MappedField field : mapping.fields()) { + toJson(field, generator); + } + + generator.writeEndArray(); + } + + private static void toJson(MappedField field, JsonGenerator generator) throws IOException { + generator.writeStartObject(); + + generator.writeNumberField(FIELD_ID, field.id()); + + generator.writeArrayFieldStart(NAMES); + for (String name : field.names()) { + generator.writeString(name); + } + generator.writeEndArray(); + + MappedFields nested = field.nestedMapping(); + if (nested != null) { + generator.writeFieldName(FIELDS); + toJson(nested, generator); + } + + generator.writeEndObject(); + } + + public static NameMapping fromJson(String json) { + try { + return fromJson(JsonUtil.mapper().readValue(json, JsonNode.class)); + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to convert version from json: %s", json); + } + } + + static NameMapping fromJson(JsonNode node) { + return new NameMapping(fieldsFromJson(node)); + } + + private static MappedFields fieldsFromJson(JsonNode node) { + Preconditions.checkArgument(node.isArray(), "Cannot parse non-array mapping fields: %s", node); + + List fields = Lists.newArrayList(); + node.elements().forEachRemaining(fieldNode -> fields.add(fieldFromJson(fieldNode))); + + return MappedFields.of(fields); + } + + private static MappedField fieldFromJson(JsonNode node) { + Preconditions.checkArgument(node != null && !node.isNull() && node.isObject(), + "Cannot parse non-object mapping field: %s", node); + + Integer id = JsonUtil.getIntOrNull(FIELD_ID, node); + + Set names; + if (node.has(NAMES)) { + names = ImmutableSet.copyOf(JsonUtil.getStringList(NAMES, node)); + } else { + names = ImmutableSet.of(); + } + + MappedFields nested; + if (node.has(FIELDS)) { + nested = fieldsFromJson(node.get(FIELDS)); + } else { + nested = null; + } + + return MappedField.of(id, names, nested); + } +} diff --git a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java index 09a546643a10..976a1cc3933c 100644 --- a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java @@ -52,6 +52,16 @@ public static int getInt(String property, JsonNode node) { return pNode.asInt(); } + public static Integer getIntOrNull(String property, JsonNode node) { + if (!node.has(property)) { + return null; + } + JsonNode pNode = node.get(property); + Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isIntegralNumber() && pNode.canConvertToInt(), + "Cannot parse %s from non-string value: %s", property, pNode); + return 
pNode.asInt(); + } + public static long getLong(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing int %s", property); JsonNode pNode = node.get(property); diff --git a/core/src/test/java/org/apache/iceberg/TableTestBase.java b/core/src/test/java/org/apache/iceberg/TableTestBase.java index 6c4f6820631f..c55c07d2c70d 100644 --- a/core/src/test/java/org/apache/iceberg/TableTestBase.java +++ b/core/src/test/java/org/apache/iceberg/TableTestBase.java @@ -40,7 +40,7 @@ public class TableTestBase { // Schema passed to create tables - static final Schema SCHEMA = new Schema( + public static final Schema SCHEMA = new Schema( required(3, "id", Types.IntegerType.get()), required(4, "data", Types.StringType.get()) ); @@ -80,7 +80,7 @@ public class TableTestBase { File tableDir = null; File metadataDir = null; - TestTables.TestTable table = null; + public TestTables.TestTable table = null; @Before public void setupTable() throws Exception { @@ -117,7 +117,7 @@ Integer version() { return TestTables.metadataVersion("test"); } - TableMetadata readMetadata() { + public TableMetadata readMetadata() { return TestTables.readMetadata("test"); } diff --git a/core/src/test/java/org/apache/iceberg/TestTables.java b/core/src/test/java/org/apache/iceberg/TestTables.java index 42752058f63c..13e336ea9eed 100644 --- a/core/src/test/java/org/apache/iceberg/TestTables.java +++ b/core/src/test/java/org/apache/iceberg/TestTables.java @@ -38,7 +38,7 @@ public class TestTables { private TestTables() {} - static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { + public static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { TestTableOperations ops = new TestTableOperations(name, temp); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); @@ -47,7 +47,7 @@ static TestTable create(File temp, String name, Schema schema, PartitionSpec spe return new TestTable(ops, name); } - static Transaction beginCreate(File temp, String name, Schema schema, PartitionSpec spec) { + public static Transaction beginCreate(File temp, String name, Schema schema, PartitionSpec spec) { TableOperations ops = new TestTableOperations(name, temp); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); @@ -77,12 +77,12 @@ public static Transaction beginReplace(File temp, String name, Schema schema, Pa } } - static TestTable load(File temp, String name) { + public static TestTable load(File temp, String name) { TestTableOperations ops = new TestTableOperations(name, temp); return new TestTable(ops, name); } - static class TestTable extends BaseTable { + public static class TestTable extends BaseTable { private final TestTableOperations ops; private TestTable(TestTableOperations ops, String name) { diff --git a/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java new file mode 100644 index 000000000000..bd39cb0e389f --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mapping; + +import com.google.common.collect.ImmutableList; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TableTestBase; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.types.Types.NestedField.required; + +public class TestMappingUpdates extends TableTestBase { + @Test + public void testAddColumnMappingUpdate() { + NameMapping mapping = MappingUtil.create(table.schema()); + table.updateProperties() + .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) + .commit(); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data")), + mapping.asMappedFields()); + + table.updateSchema() + .addColumn("ts", Types.TimestampType.withZone()) + .commit(); + + NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "ts")), + updated.asMappedFields()); + } + + @Test + public void testAddNestedColumnMappingUpdate() { + NameMapping mapping = MappingUtil.create(table.schema()); + table.updateProperties() + .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) + .commit(); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data")), + mapping.asMappedFields()); + + table.updateSchema() + .addColumn("point", Types.StructType.of( + required(1, "x", Types.DoubleType.get()), + required(2, "y", Types.DoubleType.get()))) + .commit(); + + NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "point", MappedFields.of( + MappedField.of(4, "x"), + MappedField.of(5, "y") + ))), + updated.asMappedFields()); + + table.updateSchema() + .addColumn("point", "z", Types.DoubleType.get()) + .commit(); + + NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "point", MappedFields.of( + MappedField.of(4, "x"), + MappedField.of(5, "y"), + MappedField.of(6, "z") + ))), + pointUpdated.asMappedFields()); + } + + @Test + public void testRenameMappingUpdate() { + NameMapping mapping = MappingUtil.create(table.schema()); + table.updateProperties() + .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) + .commit(); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data")), + mapping.asMappedFields()); + + table.updateSchema() + .renameColumn("id", "object_id") + .commit(); + + NameMapping updated = 
NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, ImmutableList.of("id", "object_id")), + MappedField.of(2, "data")), + updated.asMappedFields()); + } + + @Test + public void testRenameNestedFieldMappingUpdate() { + NameMapping mapping = MappingUtil.create(table.schema()); + table.updateProperties() + .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) + .commit(); + + table.updateSchema() + .addColumn("point", Types.StructType.of( + required(1, "x", Types.DoubleType.get()), + required(2, "y", Types.DoubleType.get()))) + .commit(); + + NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "point", MappedFields.of( + MappedField.of(4, "x"), + MappedField.of(5, "y") + ))), + updated.asMappedFields()); + + table.updateSchema() + .renameColumn("point.x", "X") + .renameColumn("point.y", "Y") + .commit(); + + NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "point", MappedFields.of( + MappedField.of(4, ImmutableList.of("x", "X")), + MappedField.of(5, ImmutableList.of("y", "Y")) + ))), + pointUpdated.asMappedFields()); + } + + + @Test + public void testRenameComplexFieldMappingUpdate() { + NameMapping mapping = MappingUtil.create(table.schema()); + table.updateProperties() + .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) + .commit(); + + table.updateSchema() + .addColumn("point", Types.StructType.of( + required(1, "x", Types.DoubleType.get()), + required(2, "y", Types.DoubleType.get()))) + .commit(); + + NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "point", MappedFields.of( + MappedField.of(4, "x"), + MappedField.of(5, "y") + ))), + updated.asMappedFields()); + + table.updateSchema() + .renameColumn("point", "p2") + .commit(); + + NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, ImmutableList.of("point", "p2"), MappedFields.of( + MappedField.of(4, "x"), + MappedField.of(5, "y") + ))), + pointUpdated.asMappedFields()); + } + + @Test + public void testMappingUpdateFailureSkipsMappingUpdate() { + NameMapping mapping = MappingUtil.create(table.schema()); + table.updateProperties() + .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) + .commit(); + + table.updateSchema() + .renameColumn("id", "object_id") + .commit(); + + String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); + NameMapping updated = NameMappingParser.fromJson(updatedJson); + + Assert.assertEquals( + MappedFields.of( + MappedField.of(1, ImmutableList.of("id", "object_id")), + MappedField.of(2, "data")), + updated.asMappedFields()); + + // rename data to id, which conflicts in the mapping above + // this update should succeed, even though the mapping update fails + table.updateSchema() + 
.renameColumn("data", "id") + .commit(); + + Assert.assertEquals("Mapping JSON should not change", + updatedJson, table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + } +} diff --git a/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java b/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java new file mode 100644 index 000000000000..7a4976455562 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mapping; + +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.types.Types.NestedField.required; + +public class TestNameMapping { + @Test + public void testFlatSchemaToMapping() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get())); + + MappedFields expected = MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data")); + + NameMapping mapping = MappingUtil.create(schema); + Assert.assertEquals(expected, mapping.asMappedFields()); + } + + @Test + public void testNestedStructSchemaToMapping() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "location", Types.StructType.of( + required(4, "latitude", Types.FloatType.get()), + required(5, "longitude", Types.FloatType.get()) + ))); + + MappedFields expected = MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "location", MappedFields.of( + MappedField.of(4, "latitude"), + MappedField.of(5, "longitude") + ))); + + NameMapping mapping = MappingUtil.create(schema); + Assert.assertEquals(expected, mapping.asMappedFields()); + } + + @Test + public void testMapSchemaToMapping() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "map", Types.MapType.ofRequired(4, 5, + Types.StringType.get(), + Types.DoubleType.get()))); + + MappedFields expected = MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "map", MappedFields.of( + MappedField.of(4, "key"), + MappedField.of(5, "value") + ))); + + NameMapping mapping = MappingUtil.create(schema); + Assert.assertEquals(expected, mapping.asMappedFields()); + } + + @Test + public void testComplexKeyMapSchemaToMapping() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "map", Types.MapType.ofRequired(4, 5, + Types.StructType.of( + 
required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get())), + Types.DoubleType.get()))); + + MappedFields expected = MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "map", MappedFields.of( + MappedField.of(4, "key", MappedFields.of( + MappedField.of(6, "x"), + MappedField.of(7, "y") + )), + MappedField.of(5, "value") + ))); + + NameMapping mapping = MappingUtil.create(schema); + Assert.assertEquals(expected, mapping.asMappedFields()); + } + + @Test + public void testComplexValueMapSchemaToMapping() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "map", Types.MapType.ofRequired(4, 5, + Types.DoubleType.get(), + Types.StructType.of( + required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get())) + ))); + + MappedFields expected = MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "map", MappedFields.of( + MappedField.of(4, "key"), + MappedField.of(5, "value", MappedFields.of( + MappedField.of(6, "x"), + MappedField.of(7, "y") + )) + ))); + + NameMapping mapping = MappingUtil.create(schema); + Assert.assertEquals(expected, mapping.asMappedFields()); + } + + @Test + public void testListSchemaToMapping() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "list", Types.ListType.ofRequired(4, Types.StringType.get()))); + + MappedFields expected = MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "list", MappedFields.of( + MappedField.of(4, "element") + ))); + + NameMapping mapping = MappingUtil.create(schema); + Assert.assertEquals(expected, mapping.asMappedFields()); + } + + @Test + public void testFailsDuplicateId() { + // the schema can be created because ID indexing is lazy + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(1, "data", Types.StringType.get())); + + AssertHelpers.assertThrows("Should fail if IDs are reused", + IllegalArgumentException.class, "Multiple entries with same key", + () -> MappingUtil.create(schema)); + } + + @Test + public void testFailsDuplicateName() { + AssertHelpers.assertThrows("Should fail if names are reused", + IllegalArgumentException.class, "Multiple entries with same key", + () -> new NameMapping(MappedFields.of(MappedField.of(1, "x"), MappedField.of(2, "x")))); + } + + @Test + public void testAllowsDuplicateNamesInSeparateContexts() { + new NameMapping(MappedFields.of( + MappedField.of(1, "x", MappedFields.of(MappedField.of(3, "x"))), + MappedField.of(2, "y", MappedFields.of(MappedField.of(4, "x"))) + )); + } + + @Test + public void testMappingFindById() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "map", Types.MapType.ofRequired(4, 5, + Types.DoubleType.get(), + Types.StructType.of( + required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get())))), + required(8, "list", Types.ListType.ofRequired(9, + Types.StringType.get())), + required(10, "location", Types.StructType.of( + required(11, "latitude", Types.FloatType.get()), + required(12, "longitude", Types.FloatType.get()) + ))); + + NameMapping mapping = MappingUtil.create(schema); + + Assert.assertNull("Should not return a field mapping for a missing ID", mapping.find(100)); + 
Assert.assertEquals(MappedField.of(2, "data"), mapping.find(2)); + Assert.assertEquals(MappedField.of(6, "x"), mapping.find(6)); + Assert.assertEquals(MappedField.of(9, "element"), mapping.find(9)); + Assert.assertEquals(MappedField.of(11, "latitude"), mapping.find(11)); + Assert.assertEquals( + MappedField.of(10, "location", MappedFields.of( + MappedField.of(11, "latitude"), + MappedField.of(12, "longitude"))), + mapping.find(10)); + } + + @Test + public void testMappingFindByName() { + Schema schema = new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "map", Types.MapType.ofRequired(4, 5, + Types.DoubleType.get(), + Types.StructType.of( + required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get())))), + required(8, "list", Types.ListType.ofRequired(9, + Types.StringType.get())), + required(10, "location", Types.StructType.of( + required(11, "latitude", Types.FloatType.get()), + required(12, "longitude", Types.FloatType.get()) + ))); + + NameMapping mapping = MappingUtil.create(schema); + + Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("element")); + Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("x")); + Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("key")); + Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("value")); + Assert.assertEquals(MappedField.of(2, "data"), mapping.find("data")); + Assert.assertEquals(MappedField.of(6, "x"), mapping.find("map", "value", "x")); + Assert.assertEquals(MappedField.of(9, "element"), mapping.find("list", "element")); + Assert.assertEquals(MappedField.of(11, "latitude"), mapping.find("location", "latitude")); + Assert.assertEquals( + MappedField.of(10, "location", MappedFields.of( + MappedField.of(11, "latitude"), + MappedField.of(12, "longitude"))), + mapping.find("location")); + } +} From c56c0b8dea642d1970c014c2e3c91d092f4dccbb Mon Sep 17 00:00:00 2001 From: Edgar Rodriguez <266670+edgarRd@users.noreply.github.com> Date: Wed, 31 Jul 2019 15:00:11 -0700 Subject: [PATCH 10/19] Site: Fix broken link to Iceberg API (#333) --- site/docs/spark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/docs/spark.md b/site/docs/spark.md index 9489f37b0878..223046639a66 100644 --- a/site/docs/spark.md +++ b/site/docs/spark.md @@ -25,7 +25,7 @@ Iceberg uses Spark's DataSourceV2 API for data source and catalog implementation To use Iceberg in Spark 2.4, add the `iceberg-spark-runtime` Jar to Spark's `jars` folder. -Spark 2.4 is limited to reading and writing existing Iceberg tables. Use the [Iceberg API](api) to create Iceberg tables. +Spark 2.4 is limited to reading and writing existing Iceberg tables. Use the [Iceberg API](../api) to create Iceberg tables. 
### Reading an Iceberg table From b1799908e86e03903379b8505d8a444752a0d64d Mon Sep 17 00:00:00 2001 From: Arina Ielchiieva Date: Thu, 1 Aug 2019 01:02:01 +0300 Subject: [PATCH 11/19] Add forTable method for Avro WriteBuilder (#322) --- core/src/main/java/org/apache/iceberg/avro/Avro.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/src/main/java/org/apache/iceberg/avro/Avro.java b/core/src/main/java/org/apache/iceberg/avro/Avro.java index 720baec6d4d6..1276e23f7818 100644 --- a/core/src/main/java/org/apache/iceberg/avro/Avro.java +++ b/core/src/main/java/org/apache/iceberg/avro/Avro.java @@ -34,6 +34,7 @@ import org.apache.avro.io.DatumWriter; import org.apache.avro.specific.SpecificData; import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.Table; import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; @@ -90,6 +91,12 @@ private WriteBuilder(OutputFile file) { this.file = file; } + public WriteBuilder forTable(Table table) { + schema(table.schema()); + setAll(table.properties()); + return this; + } + public WriteBuilder schema(org.apache.iceberg.Schema newSchema) { this.schema = newSchema; return this; From c7bf199c2ee1e5aac174e9eed51ff263bc4683d6 Mon Sep 17 00:00:00 2001 From: "Chen, Junjie" Date: Thu, 1 Aug 2019 06:02:25 +0800 Subject: [PATCH 12/19] Remove multiple literal strings check rule for scala (#335) --- project/scalastyle_config.xml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/project/scalastyle_config.xml b/project/scalastyle_config.xml index ee3cac3669ab..04333bb75e68 100644 --- a/project/scalastyle_config.xml +++ b/project/scalastyle_config.xml @@ -136,12 +136,6 @@ - - - - - - all From a7c2d0185ea3aa9593d06c0f15e29c5cff9b4ce2 Mon Sep 17 00:00:00 2001 From: lys0716 Date: Wed, 31 Jul 2019 15:16:26 -0700 Subject: [PATCH 13/19] Fix invalid javadoc url in README.md (#336) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76e9ac2925cc..7551daf8c21d 100644 --- a/README.md +++ b/README.md @@ -35,9 +35,9 @@ The core Java library that tracks table snapshots and metadata is complete, but The [Iceberg format specification][iceberg-spec] is being actively updated and is open for comment. Until the specification is complete and released, it carries no compatibility guarantees. The spec is currently evolving as the Java reference implementation changes. -[Java API javadocs][iceberg-javadocs] are available for the 0.6.0 tag. +[Java API javadocs][iceberg-javadocs] are available for the master branch. -[iceberg-javadocs]: https://iceberg.apache.org/javadoc/0.6.0/index.html?com/netflix/iceberg/package-summary.html +[iceberg-javadocs]: https://iceberg.apache.org/javadoc/master [iceberg-spec]: https://iceberg.apache.org/spec From fde0dcc58e795281632c219415581f854a58be99 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 31 Jul 2019 15:36:26 -0700 Subject: [PATCH 14/19] Use UnicodeUtil.truncateString for Truncate transform. (#340) This truncates by unicode codepoint instead of Java chars.
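
To make the difference concrete, here is a minimal JDK-only sketch (not part of this patch; truncateByCodePoint is a hypothetical stand-in for what UnicodeUtil.truncateString does): truncating by Java chars can split a surrogate pair and leave a malformed string, while truncating by codepoint always keeps whole characters.

    public class TruncateExample {
      // Hypothetical helper: truncate to `length` codepoints, not UTF-16 chars.
      static String truncateByCodePoint(String value, int length) {
        if (value.codePointCount(0, value.length()) <= length) {
          return value;
        }
        return value.substring(0, value.offsetByCodePoints(0, length));
      }

      public static void main(String[] args) {
        String s = "a\uD83D\uDE00b"; // "a😀b": the emoji is one codepoint but two Java chars

        // Char-based truncation to 2 chars splits the surrogate pair,
        // leaving "a" followed by an unpaired high surrogate
        String byChars = s.substring(0, Math.min(s.length(), 2));

        // Codepoint-based truncation to 2 codepoints keeps the emoji intact: "a😀"
        String byCodePoints = truncateByCodePoint(s, 2);

        System.out.println(byChars);      // malformed: ends with an unpaired surrogate
        System.out.println(byCodePoints); // a😀
      }
    }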
--- api/src/main/java/org/apache/iceberg/transforms/Truncate.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java index d65d6462c918..2eacaa235e86 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java @@ -27,6 +27,7 @@ import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.UnboundPredicate; import org.apache.iceberg.types.Type; +import org.apache.iceberg.util.UnicodeUtil; import static org.apache.iceberg.expressions.Expression.Operation.IS_NULL; import static org.apache.iceberg.expressions.Expression.Operation.LT; @@ -233,7 +234,7 @@ public Integer width() { @Override public CharSequence apply(CharSequence value) { - return value.subSequence(0, Math.min(value.length(), length)); + return UnicodeUtil.truncateString(value, length); } @Override From 26765c8f0f6986c7aa8b7a2e456cb7e85d2e23f5 Mon Sep 17 00:00:00 2001 From: Edgar Rodriguez <266670+edgarRd@users.noreply.github.com> Date: Wed, 31 Jul 2019 17:20:03 -0700 Subject: [PATCH 15/19] Refactor metrics tests for reuse (#331) --- build.gradle | 26 +++++----- .../java/org/apache/iceberg/TestMetrics.java | 36 ++++++++------ .../apache/iceberg/TestMetricsTruncation.java | 21 ++++---- ...Test.java => ParquetWritingTestUtils.java} | 15 +++--- .../apache/iceberg/parquet/TestParquet.java | 10 +++- .../iceberg/parquet/TestParquetMetrics.java | 49 +++++++++++++++++++ 6 files changed, 109 insertions(+), 48 deletions(-) rename parquet/src/test/java/org/apache/iceberg/parquet/TestParquetUtil.java => core/src/test/java/org/apache/iceberg/TestMetrics.java (92%) rename parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java => core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java (96%) rename parquet/src/test/java/org/apache/iceberg/parquet/{BaseParquetWritingTest.java => ParquetWritingTestUtils.java} (83%) create mode 100644 parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java diff --git a/build.gradle b/build.gradle index 282ce61d735f..4f9c40387e0e 100644 --- a/build.gradle +++ b/build.gradle @@ -77,6 +77,8 @@ subprojects { all { exclude group: 'org.slf4j', module: 'slf4j-log4j12' } + + testArtifacts } ext { @@ -95,6 +97,15 @@ subprojects { testCompile 'org.mockito:mockito-core' } + task testJar(type: Jar){ + archiveClassifier = 'tests' + from sourceSets.test.output + } + + artifacts { + testArtifacts testJar + } + publishing { publications { nebula(MavenPublication) { @@ -284,19 +295,6 @@ project(':iceberg-hive') { exclude group: 'org.slf4j', module: 'slf4j-log4j12' } } - - task testJar(type: Jar){ - archiveClassifier = 'tests' - from sourceSets.test.output - } - - configurations { - testArtifacts - } - - artifacts { - testArtifacts testJar - } } project(':iceberg-orc') { @@ -335,6 +333,8 @@ project(':iceberg-parquet') { compileOnly("org.apache.hadoop:hadoop-client") { exclude group: 'org.apache.avro', module: 'avro' } + + testCompile project(path: ':iceberg-core', configuration: 'testArtifacts') } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetUtil.java b/core/src/test/java/org/apache/iceberg/TestMetrics.java similarity index 92% rename from parquet/src/test/java/org/apache/iceberg/parquet/TestParquetUtil.java rename to core/src/test/java/org/apache/iceberg/TestMetrics.java index 
e75c2477592b..379d297c3bdc 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestMetrics.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.iceberg.parquet; +package org.apache.iceberg; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -32,9 +32,8 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; import org.apache.avro.generic.GenericFixed; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.Schema; import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.io.InputFile; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types.BinaryType; import org.apache.iceberg.types.Types.BooleanType; @@ -55,17 +54,24 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.Files.localInput; import static org.apache.iceberg.types.Conversions.fromByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; -public class TestParquetUtil extends BaseParquetWritingTest { +/** + * Tests for Metrics. + */ +public abstract class TestMetrics { + private final UUID uuid = UUID.randomUUID(); private final GenericFixed fixed = new GenericData.Fixed( org.apache.avro.Schema.createFixed("fixedCol", null, null, 4), "abcd".getBytes(StandardCharsets.UTF_8)); + public abstract Metrics getMetrics(InputFile file); + + public abstract File writeRecords(Schema schema, Record... records) throws IOException; + @Test public void testMetricsForTopLevelFields() throws IOException { Schema schema = new Schema( @@ -113,9 +119,9 @@ public void testMetricsForTopLevelFields() throws IOException { secondRecord.put("fixedCol", fixed); secondRecord.put("binaryCol", "W".getBytes()); - File parquetFile = writeRecords(schema, firstRecord, secondRecord); + File recordsFile = writeRecords(schema, firstRecord, secondRecord); - Metrics metrics = ParquetUtil.fileMetrics(localInput(parquetFile)); + Metrics metrics = getMetrics(Files.localInput(recordsFile)); Assert.assertEquals(2L, (long) metrics.recordCount()); assertCounts(1, 2L, 0L, metrics); assertBounds(1, BooleanType.get(), false, true, metrics); @@ -160,9 +166,9 @@ public void testMetricsForDecimals() throws IOException { record.put("decimalAsInt64", new BigDecimal("4.75")); record.put("decimalAsFixed", new BigDecimal("5.80")); - File parquetFile = writeRecords(schema, record); + File recordsFile = writeRecords(schema, record); - Metrics metrics = ParquetUtil.fileMetrics(localInput(parquetFile)); + Metrics metrics = getMetrics(Files.localInput(recordsFile)); Assert.assertEquals(1L, (long) metrics.recordCount()); assertCounts(1, 1L, 0L, metrics); assertBounds(1, DecimalType.of(4, 2), new BigDecimal("2.55"), new BigDecimal("2.55"), metrics); @@ -197,9 +203,9 @@ public void testMetricsForNestedStructFields() throws IOException { record.put("intCol", Integer.MAX_VALUE); record.put("nestedStructCol", nestedStruct); - File parquetFile = writeRecords(schema, record); + File recordsFile = writeRecords(schema, record); - Metrics metrics = ParquetUtil.fileMetrics(localInput(parquetFile)); + Metrics metrics = getMetrics(Files.localInput(recordsFile)); Assert.assertEquals(1L, (long) metrics.recordCount()); assertCounts(1, 1L, 0L, metrics); assertBounds(1, IntegerType.get(), Integer.MAX_VALUE, Integer.MAX_VALUE, metrics); @@ -232,9 +238,9 @@ public void 
testMetricsForListAndMapElements() throws IOException { map.put("4", struct); record.put(1, map); - File parquetFile = writeRecords(schema, record); + File recordsFile = writeRecords(schema, record); - Metrics metrics = ParquetUtil.fileMetrics(localInput(parquetFile)); + Metrics metrics = getMetrics(Files.localInput(recordsFile)); Assert.assertEquals(1L, (long) metrics.recordCount()); assertCounts(1, 1, 0, metrics); assertBounds(1, IntegerType.get(), null, null, metrics); @@ -256,9 +262,9 @@ public void testMetricsForNullColumns() throws IOException { Record secondRecord = new Record(AvroSchemaUtil.convert(schema.asStruct())); secondRecord.put("intCol", null); - File parquetFile = writeRecords(schema, firstRecord, secondRecord); + File recordsFile = writeRecords(schema, firstRecord, secondRecord); - Metrics metrics = ParquetUtil.fileMetrics(localInput(parquetFile)); + Metrics metrics = getMetrics(Files.localInput(recordsFile)); Assert.assertEquals(2L, (long) metrics.recordCount()); assertCounts(1, 2, 2, metrics); assertBounds(1, IntegerType.get(), null, null, metrics); diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java similarity index 96% rename from parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java rename to core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java index a887e19cff53..7a999048ba30 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java +++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java @@ -17,23 +17,23 @@ * under the License. */ -package org.apache.iceberg.parquet; +package org.apache.iceberg; +import java.nio.ByteBuffer; +import java.util.Comparator; import org.apache.iceberg.expressions.Literal; import org.junit.Assert; import org.junit.Test; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.Comparator; import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMax; import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMin; import static org.apache.iceberg.util.UnicodeUtil.truncateStringMax; import static org.apache.iceberg.util.UnicodeUtil.truncateStringMin; -public class TestParquetMetricsTruncation { +@SuppressWarnings("checkstyle:LocalVariableName") +public class TestMetricsTruncation { @Test - public void testTruncateBinaryMin() throws IOException { + public void testTruncateBinaryMin() { ByteBuffer test1 = ByteBuffer.wrap(new byte[] {1, 1, (byte) 0xFF, 2}); // Output of test1 when truncated to 2 bytes ByteBuffer test1_2_expected = ByteBuffer.wrap(new byte[] {1, 1}); @@ -55,7 +55,7 @@ public void testTruncateBinaryMin() throws IOException { } @Test - public void testTruncateBinaryMax() throws IOException { + public void testTruncateBinaryMax() { ByteBuffer test1 = ByteBuffer.wrap(new byte[] {1, 1, 2}); ByteBuffer test2 = ByteBuffer.wrap(new byte[] {1, 1, (byte) 0xFF, 2}); ByteBuffer test3 = ByteBuffer.wrap(new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, 2}); @@ -83,8 +83,9 @@ public void testTruncateBinaryMax() throws IOException { cmp.compare(truncateBinaryMax(Literal.of(test4), 2).value(), expectedOutput) == 0); } + @SuppressWarnings("checkstyle:AvoidEscapedUnicodeCharacters") @Test - public void testTruncateStringMin() throws IOException { + public void testTruncateStringMin() { String test1 = "イロハニホヘト"; // Output of test1 when truncated to 2 unicode characters String test1_2_expected = "イロ"; @@ -96,7 +97,6 @@ 
public void testTruncateStringMin() throws IOException { // test4 consists of 2 4 byte UTF-8 characters String test4 = "\uD800\uDC00\uD800\uDC00"; String test4_1_expected = "\uD800\uDC00"; - Comparator cmp = Literal.of(test1).comparator(); Assert.assertTrue("Truncated lower bound should be lower than or equal to the actual lower bound", cmp.compare(truncateStringMin(Literal.of(test1), 3).value(), test1) <= 0); @@ -120,8 +120,9 @@ public void testTruncateStringMin() throws IOException { cmp.compare(truncateStringMin(Literal.of(test4), 1).value(), test4_1_expected) == 0); } + @SuppressWarnings("checkstyle:AvoidEscapedUnicodeCharacters") @Test - public void testTruncateStringMax() throws IOException { + public void testTruncateStringMax() { String test1 = "イロハニホヘト"; // Output of test1 when truncated to 2 unicode characters String test1_2_expected = "イヮ"; diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/BaseParquetWritingTest.java b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java similarity index 83% rename from parquet/src/test/java/org/apache/iceberg/parquet/BaseParquetWritingTest.java rename to parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java index d587b6442706..f42bdbdc6406 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/BaseParquetWritingTest.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java @@ -31,24 +31,23 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.io.FileAppender; import org.apache.parquet.schema.MessageType; -import org.junit.Rule; import org.junit.rules.TemporaryFolder; import static org.apache.iceberg.Files.localOutput; /** - * Base utility test class for tests that need to write Parquet files + * Utilities for tests that need to write Parquet files. */ -public abstract class BaseParquetWritingTest { +class ParquetWritingTestUtils { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private ParquetWritingTestUtils() {} - File writeRecords(Schema schema, GenericData.Record... records) throws IOException { - return writeRecords(schema, Collections.emptyMap(), null, records); + static File writeRecords(TemporaryFolder temp, Schema schema, GenericData.Record... records) throws IOException { + return writeRecords(temp, schema, Collections.emptyMap(), null, records); } - File writeRecords( + static File writeRecords( + TemporaryFolder temp, Schema schema, Map properties, Function> createWriterFunc, GenericData.Record... 
records) throws IOException { diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java index 9429e45cdd30..f5433eaa160c 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java @@ -32,13 +32,19 @@ import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.schema.MessageType; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; import static org.apache.iceberg.Files.localInput; import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.parquet.ParquetWritingTestUtils.writeRecords; import static org.apache.iceberg.types.Types.NestedField.optional; -public class TestParquet extends BaseParquetWritingTest { +public class TestParquet { + + @Rule + public TemporaryFolder temp = new TemporaryFolder(); @Test public void testRowGroupSizeConfigurable() throws IOException { @@ -79,7 +85,7 @@ private File generateFileWithTwoRowGroups(Function Date: Thu, 1 Aug 2019 13:05:56 -0700 Subject: [PATCH 16/19] Spark: Add support for write-audit-publish workflows (#342) --- .../org/apache/iceberg/SnapshotUpdate.java | 7 +++++++ .../org/apache/iceberg/SnapshotProducer.java | 15 +++++++++++++- .../org/apache/iceberg/TableMetadata.java | 10 ++++++++++ .../org/apache/iceberg/TableProperties.java | 3 +++ .../iceberg/spark/source/IcebergSource.java | 3 ++- .../apache/iceberg/spark/source/Writer.java | 20 +++++++++++++++++++ 6 files changed, 56 insertions(+), 2 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java b/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java index 5cabc023d1b0..fdd1a63510b9 100644 --- a/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java +++ b/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java @@ -45,4 +45,11 @@ public interface SnapshotUpdate extends PendingUpdate { */ ThisT deleteWith(Consumer deleteFunc); + /** + * Called to stage a snapshot in table metadata, but not update the current snapshot id. 
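
Editor's note: a hedged usage sketch for the stageOnly() method documented here, using the standard Table append API; publishing the staged snapshot later is outside the scope of this patch:

    // the snapshot is written into table metadata, but the table's
    // current snapshot id is left untouched
    table.newAppend()
        .appendFile(dataFile)
        .stageOnly()
        .commit();
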
+ * + * @return this for method chaining + */ + ThisT stageOnly(); + } diff --git a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java index e8cc29baf7ea..83f15b5814a2 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java @@ -79,6 +79,7 @@ public void accept(String file) { private final List manifestLists = Lists.newArrayList(); private Long snapshotId = null; private TableMetadata base = null; + private boolean stageOnly = false; private Consumer deleteFunc = defaultDelete; protected SnapshotProducer(TableOperations ops) { @@ -96,6 +97,12 @@ protected SnapshotProducer(TableOperations ops) { protected abstract ThisT self(); + @Override + public ThisT stageOnly() { + this.stageOnly = true; + return self(); + } + @Override public ThisT deleteWith(Consumer deleteCallback) { Preconditions.checkArgument(this.deleteFunc == defaultDelete, "Cannot set delete callback more than once"); @@ -230,7 +237,13 @@ public void commit() { .run(taskOps -> { Snapshot newSnapshot = apply(); newSnapshotId.set(newSnapshot.snapshotId()); - TableMetadata updated = base.replaceCurrentSnapshot(newSnapshot); + TableMetadata updated; + if (stageOnly) { + updated = base.addStagedSnapshot(newSnapshot); + } else { + updated = base.replaceCurrentSnapshot(newSnapshot); + } + // if the table UUID is missing, add it here. the UUID will be re-created each time this operation retries // to ensure that if a concurrent operation assigns the UUID, this operation will not fail. taskOps.commit(base, updated.withUUID()); diff --git a/core/src/main/java/org/apache/iceberg/TableMetadata.java b/core/src/main/java/org/apache/iceberg/TableMetadata.java index ef289b4ec727..ea015ba201c3 100644 --- a/core/src/main/java/org/apache/iceberg/TableMetadata.java +++ b/core/src/main/java/org/apache/iceberg/TableMetadata.java @@ -321,6 +321,16 @@ public TableMetadata updatePartitionSpec(PartitionSpec newPartitionSpec) { currentSnapshotId, snapshots, snapshotLog); } + public TableMetadata addStagedSnapshot(Snapshot snapshot) { + List newSnapshots = ImmutableList.builder() + .addAll(snapshots) + .add(snapshot) + .build(); + return new TableMetadata(ops, null, uuid, location, + snapshot.timestampMillis(), lastColumnId, schema, defaultSpecId, specs, properties, + currentSnapshotId, newSnapshots, snapshotLog); + } + public TableMetadata replaceCurrentSnapshot(Snapshot snapshot) { List newSnapshots = ImmutableList.builder() .addAll(snapshots) diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 4dd778d165b8..2ca8a25e7d9c 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -93,4 +93,7 @@ private TableProperties() {} public static final String DEFAULT_WRITE_METRICS_MODE_DEFAULT = "truncate(16)"; public static final String DEFAULT_NAME_MAPPING = "schema.name-mapping.default"; + + public static final String WRITE_AUDIT_PUBLISH_ENABLED = "write.wap.enabled"; + public static final String WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT = "false"; } diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index b6dfade9a09d..3656787d2911 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ 
b/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -75,7 +75,8 @@ public Optional createWriter(String jobId, StructType dsStruct Table table = getTableAndResolveHadoopConfiguration(options, conf); validateWriteSchema(table.schema(), dsStruct); String appId = lazySparkSession().sparkContext().applicationId(); - return Optional.of(new Writer(table, options, mode == SaveMode.Overwrite, appId)); + String wapId = lazySparkSession().conf().get("spark.wap.id", null); + return Optional.of(new Writer(table, options, mode == SaveMode.Overwrite, appId, wapId)); } @Override diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java b/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java index aa9becc12ea8..f5205057e584 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java @@ -45,6 +45,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotUpdate; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.avro.Avro; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.encryption.EncryptionManager; @@ -87,14 +88,20 @@ class Writer implements DataSourceWriter { private final EncryptionManager encryptionManager; private final boolean replacePartitions; private final String applicationId; + private final String wapId; Writer(Table table, DataSourceOptions options, boolean replacePartitions, String applicationId) { + this(table, options, replacePartitions, applicationId, null); + } + + Writer(Table table, DataSourceOptions options, boolean replacePartitions, String applicationId, String wapId) { this.table = table; this.format = getFileFormat(table.properties(), options); this.fileIo = table.io(); this.encryptionManager = table.encryption(); this.replacePartitions = replacePartitions; this.applicationId = applicationId; + this.wapId = wapId; } private FileFormat getFileFormat(Map tableProperties, DataSourceOptions options) { @@ -104,6 +111,11 @@ private FileFormat getFileFormat(Map tableProperties, DataSource return FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); } + private boolean isWapTable() { + return Boolean.parseBoolean(table.properties().getOrDefault( + TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); + } + @Override public DataWriterFactory createWriterFactory() { return new WriterFactory( @@ -124,6 +136,14 @@ protected void commitOperation(SnapshotUpdate operation, int numFiles, String if (applicationId != null) { operation.set("spark.app.id", applicationId); } + + if (isWapTable() && wapId != null) { + // write-audit-publish is enabled for this table and job + // stage the changes without changing the current snapshot + operation.set("wap.id", wapId); + operation.stageOnly(); + } + long start = System.currentTimeMillis(); operation.commit(); // abort is automatically called if this fails long duration = System.currentTimeMillis() - start; From 62d09d7fcd9bc50f2a691fb4ad3dc73bc3491807 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Thu, 1 Aug 2019 13:48:02 -0700 Subject: [PATCH 17/19] Avoid write failures if metrics mode is invalid (#301) --- .../org/apache/iceberg/MetricsConfig.java | 29 +++++++++++++++---- .../org/apache/iceberg/TableProperties.java | 1 + .../org/apache/iceberg/TestMetricsModes.java | 24 +++++++++++++++ 3 files changed, 48 insertions(+), 6 deletions(-) diff --git 
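
Editor's note: pulling the write-audit-publish pieces above together, a Spark job would opt in roughly as follows. The property names are taken verbatim from the patch (write.wap.enabled, spark.wap.id, wap.id); the surrounding code is a sketch:

    // one-time: enable WAP on the table
    table.updateProperties()
        .set("write.wap.enabled", "true")
        .commit();

    // per job: tag the session; writes are then staged instead of published
    spark.conf().set("spark.wap.id", "audit-job-42");

    df.write()
        .format("iceberg")
        .mode("append")
        .save(tableLocation);
    // the staged snapshot carries summary property "wap.id" = "audit-job-42"
    // and the table's current snapshot is unchanged until it is published
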
a/core/src/main/java/org/apache/iceberg/MetricsConfig.java b/core/src/main/java/org/apache/iceberg/MetricsConfig.java index 864ec060495e..816f23e7ea99 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsConfig.java +++ b/core/src/main/java/org/apache/iceberg/MetricsConfig.java @@ -22,13 +22,15 @@ import com.google.common.collect.Maps; import java.util.Map; import org.apache.iceberg.MetricsModes.MetricsMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE; import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT; public class MetricsConfig { - private static final String COLUMN_CONF_PREFIX = "write.metadata.metrics.column."; + private static final Logger LOG = LoggerFactory.getLogger(MetricsConfig.class); private Map columnModes = Maps.newHashMap(); private MetricsMode defaultMode; @@ -43,15 +45,30 @@ public static MetricsConfig getDefault() { public static MetricsConfig fromProperties(Map props) { MetricsConfig spec = new MetricsConfig(); + String defaultModeAsString = props.getOrDefault(DEFAULT_WRITE_METRICS_MODE, DEFAULT_WRITE_METRICS_MODE_DEFAULT); + try { + spec.defaultMode = MetricsModes.fromString(defaultModeAsString); + } catch (IllegalArgumentException err) { + // Mode was invalid, log the error and use the default + LOG.warn("Ignoring invalid default metrics mode: {}", defaultModeAsString, err); + spec.defaultMode = MetricsModes.fromString(DEFAULT_WRITE_METRICS_MODE_DEFAULT); + } + props.keySet().stream() - .filter(key -> key.startsWith(COLUMN_CONF_PREFIX)) + .filter(key -> key.startsWith(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX)) .forEach(key -> { - MetricsMode mode = MetricsModes.fromString(props.get(key)); - String columnAlias = key.replaceFirst(COLUMN_CONF_PREFIX, ""); + String columnAlias = key.replaceFirst(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX, ""); + MetricsMode mode; + try { + mode = MetricsModes.fromString(props.get(key)); + } catch (IllegalArgumentException err) { + // Mode was invalid, log the error and use the default + LOG.warn("Ignoring invalid metrics mode for column {}: {}", columnAlias, props.get(key), err); + mode = spec.defaultMode; + } spec.columnModes.put(columnAlias, mode); }); - String defaultModeAsString = props.getOrDefault(DEFAULT_WRITE_METRICS_MODE, DEFAULT_WRITE_METRICS_MODE_DEFAULT); - spec.defaultMode = MetricsModes.fromString(defaultModeAsString); + return spec; } diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 2ca8a25e7d9c..b1e4ab09cd47 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -89,6 +89,7 @@ private TableProperties() {} public static final String METADATA_COMPRESSION = "write.metadata.compression-codec"; public static final String METADATA_COMPRESSION_DEFAULT = "none"; + public static final String METRICS_MODE_COLUMN_CONF_PREFIX = "write.metadata.metrics.column."; public static final String DEFAULT_WRITE_METRICS_MODE = "write.metadata.metrics.default"; public static final String DEFAULT_WRITE_METRICS_MODE_DEFAULT = "truncate(16)"; diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java index 03b6bb792202..698f5c3a193f 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java +++ 
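
Editor's note: the net effect of the MetricsConfig change above is a two-level fallback instead of a hard write failure; the tests that follow assert exactly this. A condensed sketch:

    Map<String, String> props = ImmutableMap.of(
        "write.metadata.metrics.default", "counts",
        "write.metadata.metrics.column.id", "troncate(5)");  // typo: invalid mode

    MetricsConfig config = MetricsConfig.fromProperties(props);
    config.columnMode("id");  // falls back to the table default: counts
    // if the default itself were invalid, it would fall back to the
    // library default, truncate(16)
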
b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java @@ -19,6 +19,8 @@ package org.apache.iceberg; +import com.google.common.collect.ImmutableMap; +import java.util.Map; import org.apache.iceberg.MetricsModes.Counts; import org.apache.iceberg.MetricsModes.Full; import org.apache.iceberg.MetricsModes.None; @@ -51,4 +53,26 @@ public void testInvalidTruncationLength() { exceptionRule.expectMessage("length should be positive"); MetricsModes.fromString("truncate(0)"); } + + @Test + public void testInvalidColumnModeValue() { + Map properties = ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, "full", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col", "troncate(5)"); + + MetricsConfig config = MetricsConfig.fromProperties(properties); + Assert.assertEquals("Invalid mode should be defaulted to table default (full)", + MetricsModes.Full.get(), config.columnMode("col")); + } + + @Test + public void testInvalidDefaultColumnModeValue() { + Map properties = ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, "fuull", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col", "troncate(5)"); + + MetricsConfig config = MetricsConfig.fromProperties(properties); + Assert.assertEquals("Invalid mode should be defaulted to library default (truncate(16))", + MetricsModes.Truncate.withLength(16), config.columnMode("col")); + } } From 30d45f88d1f10afdef0884b89860e6fd1f25365d Mon Sep 17 00:00:00 2001 From: Vinitha Gankidi Date: Tue, 30 Jul 2019 19:18:59 -0700 Subject: [PATCH 18/19] Fix truncateStringMax in UnicodeUtil (#334) Fixes #328, fixes #329. Index to codePointAt should be the offset calculated by code points --- .../java/org/apache/iceberg/util/UnicodeUtil.java | 6 +++--- .../org/apache/iceberg/TestMetricsTruncation.java | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java index 1eaed21df6d2..f76ec73429d7 100644 --- a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java @@ -79,11 +79,11 @@ public static Literal truncateStringMax(Literal inpu // Try incrementing the code points from the end for (int i = length - 1; i >= 0; i--) { - int nextCodePoint = truncatedStringBuffer.codePointAt(i) + 1; + // Get the offset in the truncated string buffer where the number of unicode characters = i + int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i); + int nextCodePoint = truncatedStringBuffer.codePointAt(offsetByCodePoint) + 1; // No overflow if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) { - // Get the offset in the truncated string buffer where the number of unicode characters = i - int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i); truncatedStringBuffer.setLength(offsetByCodePoint); // Append next code point to the truncated substring truncatedStringBuffer.appendCodePoint(nextCodePoint); diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java index 7a999048ba30..af304dac3580 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java +++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java @@ -139,6 +139,9 @@ public void testTruncateStringMax() { String test6 = "\uD800\uDFFF\uD800\uDFFF"; // Increment the previous character String test6_2_expected = "\uD801\uDC00"; + String test7 = 
"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02"; + String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03"; + String test7_1_expected = "\uD83D\uDE03"; Comparator cmp = Literal.of(test1).comparator(); Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", @@ -176,5 +179,13 @@ public void testTruncateStringMax() { Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " + "the first character incremented", cmp.compare( truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0); + Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0); + Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " + + "character should be incremented", cmp.compare( + truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0); + Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " + + "character should be incremented", cmp.compare( + truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0); } } From 67f4371a076ba2902c4cfa57f98224b0ab9261e7 Mon Sep 17 00:00:00 2001 From: Gautam Kowshik Date: Thu, 1 Aug 2019 15:24:34 -0700 Subject: [PATCH 19/19] [Vectorization] Added batch sizing, switched to BufferAllocator, other minor style fixes. --- .../org/apache/iceberg/parquet/Parquet.java | 2 +- .../iceberg/parquet/ParquetValueReaders.java | 2 +- ...gSourceFlatParquetDataFilterBenchmark.java | 47 +++++++- ...ergSourceFlatParquetDataReadBenchmark.java | 86 +++++++++++++- .../vector/VectorizedParquetValueReaders.java | 110 +++++++++++------- .../vector/VectorizedSparkParquetReaders.java | 67 +++++++---- .../apache/iceberg/spark/source/Reader.java | 24 ++-- .../TestSparkParquetVectorizedReader.java | 2 +- 8 files changed, 262 insertions(+), 78 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index aebd2c214527..4c34fab555cd 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -293,7 +293,7 @@ public static class ReadBuilder { private Map properties = Maps.newHashMap(); private boolean callInit = false; private boolean reuseContainers = false; - private int maxRecordsPerBatch = 1000; + private int maxRecordsPerBatch = 10000; private ReadBuilder(InputFile file) { this.file = file; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index 45293674f20b..0e6e686def3f 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -26,7 +26,6 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; -import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -590,6 +589,7 @@ public static class ColumnarBatchReader implements ParquetValueReader column; private final TripleIterator[] columns; private final List> children; + private ColumnarBatch columnarBatch; @SuppressWarnings("unchecked") public ColumnarBatchReader(List types, diff --git 
a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java index 1c9164593eaa..273d6b1de341 100644 --- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java +++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java @@ -76,7 +76,52 @@ public void readWithFilterIceberg() { tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); withTableProperties(tableProperties, () -> { String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "10000") + .load(tableLocation).filter(FILTER_COND); + materialize(df); + }); + } + + + @Benchmark + @Threads(1) + public void readWithFilterIceberg100k() { + Map tableProperties = Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "100000") + .load(tableLocation).filter(FILTER_COND); + materialize(df); + }); + } + + @Benchmark + @Threads(1) + public void readWithFilterIceberg10k() { + Map tableProperties = Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "10000") + .load(tableLocation).filter(FILTER_COND); + materialize(df); + }); + } + + @Benchmark + @Threads(1) + public void readWithFilterIceberg5k() { + Map tableProperties = Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "5000") + .load(tableLocation).filter(FILTER_COND); materialize(df); }); } diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java index 37a63369981c..9e510810cb6f 100644 --- a/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java +++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java @@ -66,12 +66,43 @@ public void tearDownBenchmark() throws IOException { @Benchmark @Threads(1) - public void readIceberg() { + public void readIcebergVectorized100k() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); withTableProperties(tableProperties, () -> { String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "100000") + .load(tableLocation); + materialize(df); + }); + } + + @Benchmark + @Threads(1) + public void readIcebergVectorized10k() { + Map tableProperties 
= Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "10000") + .load(tableLocation); + materialize(df); + }); + } + + + @Benchmark + @Threads(1) + public void readIcebergVectorized5k() { + Map tableProperties = Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "5000") + .load(tableLocation); materialize(df); }); } @@ -102,12 +133,43 @@ public void readFileSourceNonVectorized() { @Benchmark @Threads(1) - public void readWithProjectionIceberg() { + public void readWithProjectionIcebergVectorized100k() { + Map tableProperties = Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "100000") + .load(tableLocation).select("longCol"); + materialize(df); + }); + } + + @Benchmark + @Threads(1) + public void readWithProjectionIcebergVectorized10k() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); withTableProperties(tableProperties, () -> { String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "10000") + .load(tableLocation).select("longCol"); + materialize(df); + }); + } + + + @Benchmark + @Threads(1) + public void readWithProjectionIcebergVectorized5k() { + Map tableProperties = Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "5000") + .load(tableLocation).select("longCol"); materialize(df); }); } @@ -136,6 +198,22 @@ public void readWithProjectionFileSourceNonVectorized() { }); } + + @Benchmark + @Threads(1) + public void readIcebergVectorized1k() { + Map tableProperties = Maps.newHashMap(); + tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); + withTableProperties(tableProperties, () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg") + .option("iceberg.read.numrecordsperbatch", "1000") + .load(tableLocation); + materialize(df); + }); + } + + private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { Dataset df = spark().range(NUM_ROWS) diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java index 3b67c96a85d6..e030fa44955b 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedParquetValueReaders.java @@ -21,7 +21,7 @@ import java.math.BigDecimal; import 
java.math.BigInteger; -import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; @@ -66,16 +66,32 @@ public class VectorizedParquetValueReaders { public abstract static class VectorReader extends ParquetValueReaders.PrimitiveReader { + public static final int DEFAULT_NUM_ROWS_IN_BATCH = 10000; + // private static final Logger LOG = LoggerFactory.getLogger(VectorReader.class); + private FieldVector vec; private boolean isOptional; + private int rowsInBatch = DEFAULT_NUM_ROWS_IN_BATCH; + private ColumnDescriptor desc; VectorReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { + BufferAllocator rootAlloc) { super(desc); this.vec = ArrowSchemaUtil.convert(icebergField).createVector(rootAlloc); this.isOptional = desc.getPrimitiveType().isRepetition(Type.Repetition.OPTIONAL); + this.desc = desc; + } + + VectorReader(ColumnDescriptor desc, + Types.NestedField icebergField, + BufferAllocator rootAlloc, + int rowsInBatch) { + + this(desc, icebergField, rootAlloc); + this.rowsInBatch = (rowsInBatch == 0) ? DEFAULT_NUM_ROWS_IN_BATCH : rowsInBatch; + // LOG.info("=> [VectorReader] rowsInBatch = " + this.rowsInBatch); } protected FieldVector getVector() { @@ -92,19 +108,28 @@ public FieldVector read(FieldVector ignore) { vec.reset(); int ordinal = 0; - while (column.hasNext()) { - // Todo: this check works for flat schemas only - // need to get max definition level to do proper check - if (isOptional && column.currentDefinitionLevel() == 0) { - // handle null - column.nextNull(); - nextNullAt(ordinal); + for (; ordinal < rowsInBatch; ordinal++) { + if (column.hasNext()) { + // while (column.hasNext()) { + // Todo: this check works for flat schemas only + // need to get max definition level to do proper check + if (isOptional && column.currentDefinitionLevel() == 0) { + // handle null + column.nextNull(); + nextNullAt(ordinal); + } else { + nextValueAt(ordinal); + } } else { - nextValueAt(ordinal); + // proceed to next rowgroup Or exit. + // LOG.info("**** No more in RowGroup. 
Exiting!"); + break; } - ordinal++; + // } } vec.setValueCount(ordinal); + // LOG.info("=> Vector col:" + desc.getPrimitiveType().getPrimitiveTypeName() + + // ", for setting batch size :" + rowsInBatch + ", with " + ordinal + " values"); return vec; } @@ -120,8 +145,9 @@ public int getRowCount() { protected static class StringReader extends VectorReader { - StringReader(ColumnDescriptor desc, Types.NestedField icebergField, RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + StringReader(ColumnDescriptor desc, Types.NestedField icebergField, + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, icebergField, rootAlloc, recordsPerBatch); } @Override @@ -138,8 +164,8 @@ protected void nextValueAt(int ordinal) { ((VarCharVector) getVector()).setNull(ordinal); } else { - String utf8Str = binary.toStringUsingUTF8(); - ((VarCharVector) getVector()).setSafe(ordinal, utf8Str.getBytes()); + // String utf8Str = binary.toStringUsingUTF8(); + ((VarCharVector) getVector()).setSafe(ordinal, binary.getBytesUnsafe()); } } @@ -149,9 +175,9 @@ protected static class IntegerReader extends VectorReader { IntegerReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { + BufferAllocator rootAlloc, int recordsPerBatch) { - super(desc, icebergField, rootAlloc); + super(desc, icebergField, rootAlloc, recordsPerBatch); } @Override @@ -171,9 +197,9 @@ protected static class LongReader extends VectorReader { LongReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { + BufferAllocator rootAlloc, int recordsPerBatch) { - super(desc, icebergField, rootAlloc); + super(desc, icebergField, rootAlloc, recordsPerBatch); } protected void nextNullAt(int ordinal) { @@ -192,8 +218,8 @@ protected static class TimestampMillisReader extends LongReader { TimestampMillisReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, icebergField, rootAlloc, recordsPerBatch); } protected void nextValueAt(int ordinal) { @@ -208,8 +234,8 @@ protected static class TimestampMicroReader extends VectorReader { TimestampMicroReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, icebergField, rootAlloc, recordsPerBatch); } protected void nextNullAt(int ordinal) { @@ -228,8 +254,8 @@ protected static class BooleanReader extends VectorReader { BooleanReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, icebergField, rootAlloc, recordsPerBatch); } protected void nextNullAt(int ordinal) { @@ -249,8 +275,8 @@ protected static class FloatReader extends VectorReader { FloatReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, icebergField, rootAlloc, recordsPerBatch); } protected void nextNullAt(int ordinal) { @@ -269,8 +295,8 @@ protected static class DoubleReader extends VectorReader { DoubleReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, 
icebergField, rootAlloc, recordsPerBatch); } protected void nextNullAt(int ordinal) { @@ -290,8 +316,8 @@ protected static class BinaryReader extends VectorReader { BinaryReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, icebergField, rootAlloc, recordsPerBatch); } protected void nextNullAt(int ordinal) { @@ -311,8 +337,8 @@ protected static class DateReader extends VectorReader { DateReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc) { - super(desc, icebergField, rootAlloc); + BufferAllocator rootAlloc, int recordsPerBatch) { + super(desc, icebergField, rootAlloc, recordsPerBatch); } protected void nextNullAt(int ordinal) { @@ -334,10 +360,10 @@ protected static class IntegerDecimalReader extends VectorReader { IntegerDecimalReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc, - int precision, int scale) { + BufferAllocator rootAlloc, + int precision, int scale, int recordsPerBatch) { - super(desc, icebergField, rootAlloc); + super(desc, icebergField, rootAlloc, recordsPerBatch); this.precision = precision; this.scale = scale; } @@ -362,10 +388,10 @@ protected static class LongDecimalReader extends VectorReader { LongDecimalReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc, - int precision, int scale) { + BufferAllocator rootAlloc, + int precision, int scale, int recordsPerBatch) { - super(desc, icebergField, rootAlloc); + super(desc, icebergField, rootAlloc, recordsPerBatch); this.precision = precision; this.scale = scale; } @@ -390,10 +416,10 @@ protected static class BinaryDecimalReader extends VectorReader { BinaryDecimalReader(ColumnDescriptor desc, Types.NestedField icebergField, - RootAllocator rootAlloc, - int precision, int scale) { + BufferAllocator rootAlloc, + int precision, int scale, int recordsPerBatch) { - super(desc, icebergField, rootAlloc); + super(desc, icebergField, rootAlloc, recordsPerBatch); this.precision = precision; this.scale = scale; } diff --git a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java index f283cdabe083..7a86903ad369 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java +++ b/spark/src/main/java/org/apache/iceberg/spark/data/vector/VectorizedSparkParquetReaders.java @@ -25,7 +25,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; import org.apache.iceberg.Schema; import org.apache.iceberg.arrow.ArrowSchemaUtil; @@ -39,10 +39,15 @@ import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.spark.sql.execution.arrow.ArrowUtils; import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class VectorizedSparkParquetReaders { + private static final Logger LOG = LoggerFactory.getLogger(VectorizedSparkParquetReaders.class); + private VectorizedSparkParquetReaders() { } @@ -52,9 +57,21 @@ public static ParquetValueReader buildReader( Schema expectedSchema, MessageType fileSchema) { + return 
buildReader(tableSchema, expectedSchema, fileSchema, + VectorizedParquetValueReaders.VectorReader.DEFAULT_NUM_ROWS_IN_BATCH); + } + + @SuppressWarnings("unchecked") + public static ParquetValueReader buildReader( + Schema tableSchema, + Schema expectedSchema, + MessageType fileSchema, + Integer recordsPerBatch) { + + LOG.info("=> [VectorizedSparkParquetReaders] recordsPerBatch = {}", recordsPerBatch); return (ParquetValueReader) TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(tableSchema, expectedSchema, fileSchema)); + new ReadBuilder(tableSchema, expectedSchema, fileSchema, recordsPerBatch)); } private static class ReadBuilder extends TypeWithSchemaVisitor> { @@ -62,14 +79,19 @@ private static class ReadBuilder extends TypeWithSchemaVisitor [ReadBuilder] recordsPerBatch = {}", this.recordsPerBatch); } @Override @@ -142,24 +164,27 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy case ENUM: case JSON: case UTF8: - return new VectorizedParquetValueReaders.StringReader(desc, icebergField, rootAllocator); + //return new VectorizedParquetValueReaders.BinaryReader(desc, icebergField, rootAllocator, recordsPerBatch); + return new VectorizedParquetValueReaders.StringReader(desc, icebergField, rootAllocator, recordsPerBatch); case INT_8: case INT_16: case INT_32: - return new VectorizedParquetValueReaders.IntegerReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.IntegerReader(desc, icebergField, rootAllocator, recordsPerBatch); // if (expected != null && expected.typeId() == Types.LongType.get().typeId()) { // return new ParquetValueReaders.IntAsLongReader(desc); // } else { // return new ParquetValueReaders.UnboxedReader(desc); // } case DATE: - return new VectorizedParquetValueReaders.DateReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.DateReader(desc, icebergField, rootAllocator, recordsPerBatch); case INT_64: - return new VectorizedParquetValueReaders.LongReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.LongReader(desc, icebergField, rootAllocator, recordsPerBatch); case TIMESTAMP_MICROS: - return new VectorizedParquetValueReaders.TimestampMicroReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.TimestampMicroReader(desc, icebergField, + rootAllocator, recordsPerBatch); case TIMESTAMP_MILLIS: - return new VectorizedParquetValueReaders.TimestampMillisReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.TimestampMillisReader(desc, icebergField, + rootAllocator, recordsPerBatch); case DECIMAL: DecimalMetadata decimal = primitive.getDecimalMetadata(); switch (primitive.getPrimitiveTypeName()) { @@ -167,21 +192,21 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy case FIXED_LEN_BYTE_ARRAY: return new VectorizedParquetValueReaders.BinaryDecimalReader(desc, icebergField, rootAllocator, decimal.getPrecision(), - decimal.getScale()); + decimal.getScale(), recordsPerBatch); case INT64: return new VectorizedParquetValueReaders.LongDecimalReader(desc, icebergField, rootAllocator, decimal.getPrecision(), - decimal.getScale()); + decimal.getScale(), recordsPerBatch); case INT32: return new VectorizedParquetValueReaders.IntegerDecimalReader(desc, icebergField, rootAllocator, decimal.getPrecision(), - decimal.getScale()); + decimal.getScale(), recordsPerBatch); default: throw new UnsupportedOperationException( "Unsupported base type for 
decimal: " + primitive.getPrimitiveTypeName()); } case BSON: - return new VectorizedParquetValueReaders.BinaryReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.BinaryReader(desc, icebergField, rootAllocator, recordsPerBatch); default: throw new UnsupportedOperationException( "Unsupported logical type: " + primitive.getOriginalType()); @@ -191,22 +216,22 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy switch (primitive.getPrimitiveTypeName()) { case FIXED_LEN_BYTE_ARRAY: case BINARY: - return new VectorizedParquetValueReaders.BinaryReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.BinaryReader(desc, icebergField, rootAllocator, recordsPerBatch); case INT32: - return new VectorizedParquetValueReaders.IntegerReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.IntegerReader(desc, icebergField, rootAllocator, recordsPerBatch); case FLOAT: - return new VectorizedParquetValueReaders.FloatReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.FloatReader(desc, icebergField, rootAllocator, recordsPerBatch); // if (expected != null && expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { // return new ParquetValueReaders.FloatAsDoubleReader(desc); // } else { // return new ParquetValueReaders.UnboxedReader<>(desc); // } case BOOLEAN: - return new VectorizedParquetValueReaders.BooleanReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.BooleanReader(desc, icebergField, rootAllocator, recordsPerBatch); case INT64: - return new VectorizedParquetValueReaders.LongReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.LongReader(desc, icebergField, rootAllocator, recordsPerBatch); case DOUBLE: - return new VectorizedParquetValueReaders.DoubleReader(desc, icebergField, rootAllocator); + return new VectorizedParquetValueReaders.DoubleReader(desc, icebergField, rootAllocator, recordsPerBatch); default: throw new UnsupportedOperationException("Unsupported type: " + primitive); } diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java index 75d72f1c465e..40a5a6d3ff3a 100644 --- a/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java +++ b/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java @@ -59,6 +59,7 @@ import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.data.SparkAvroReader; import org.apache.iceberg.spark.data.SparkOrcReader; +import org.apache.iceberg.spark.data.vector.VectorizedParquetValueReaders; import org.apache.iceberg.spark.data.vector.VectorizedSparkParquetReaders; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -86,6 +87,8 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.unsafe.types.UTF8String; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.collection.JavaConverters; class Reader implements DataSourceReader, @@ -96,13 +99,14 @@ class Reader implements DataSourceReader, private static final Filter[] NO_FILTERS = new Filter[0]; + private static final Logger LOG = LoggerFactory.getLogger(Reader.class); private final Table table; private final Long snapshotId; private final Long asOfTimestamp; private final FileIO fileIo; private final EncryptionManager encryptionManager; private 
final boolean caseSensitive; - private int numRecordsPerBatch; + private final int numRecordsPerBatch; private StructType requestedSchema = null; private List filterExpressions = null; private Filter[] pushedFilters = NO_FILTERS; @@ -111,18 +115,21 @@ class Reader implements DataSourceReader, private Schema schema; private StructType type = null; // cached because Spark accesses it multiple times private List tasks = null; // lazy cache of tasks - private static final int DEFAULT_NUM_RECORDS_PER_BATCH = 1000; Reader(Table table, boolean caseSensitive, DataSourceOptions options) { this.table = table; this.snapshotId = options.get("snapshot-id").map(Long::parseLong).orElse(null); this.asOfTimestamp = options.get("as-of-timestamp").map(Long::parseLong).orElse(null); Optional numRecordsPerBatchOpt = options.get("iceberg.read.numrecordsperbatch"); - this.numRecordsPerBatch = DEFAULT_NUM_RECORDS_PER_BATCH; if (numRecordsPerBatchOpt.isPresent()) { + this.numRecordsPerBatch = Integer.parseInt(numRecordsPerBatchOpt.get()); + + } else { + + this.numRecordsPerBatch = VectorizedParquetValueReaders.VectorReader.DEFAULT_NUM_ROWS_IN_BATCH; } - // LOG.info("[IcebergSource] => Reading numRecordsPerBatch = "+numRecordsPerBatch); + LOG.info("=> Set Config numRecordsPerBatch = {}", numRecordsPerBatch); if (snapshotId != null && asOfTimestamp != null) { throw new IllegalArgumentException( @@ -169,6 +176,7 @@ public List> planBatchInputPartitions() { new ReadTask(task, tableSchemaString, expectedSchemaString, fileIo, encryptionManager, caseSensitive, numRecordsPerBatch)); } + LOG.info("=> Batching input partitions with {} tasks.", readTasks.size()); return readTasks; } @@ -305,6 +313,7 @@ private ReadTask( this.encryptionManager = encryptionManager; this.caseSensitive = caseSensitive; this.numRecordsPerBatch = numRecordsPerBatch; + LOG.info("=> [ReadTask] numRecordsPerBatch = {}", numRecordsPerBatch); } @Override @@ -340,7 +349,7 @@ private static class TaskDataReader implements InputPartitionReader inputFiles; private final boolean caseSensitive; - private final int numRecordsPerBatch; + private final Integer numRecordsPerBatch; private Iterator currentIterator; private Closeable currentCloseable = null; @@ -361,9 +370,10 @@ private static class TaskDataReader implements InputPartitionReader inputFileBuilder.put(decrypted.location(), decrypted)); this.inputFiles = inputFileBuilder.build(); // open last because the schemas and fileIo must be set + this.numRecordsPerBatch = numRecordsPerBatch; this.currentIterator = open(tasks.next()); this.caseSensitive = caseSensitive; - this.numRecordsPerBatch = numRecordsPerBatch; + LOG.info("=> [TaskDataReader] numRecordsPerBatch = {}", numRecordsPerBatch); } @Override @@ -521,7 +531,7 @@ private CloseableIterable newParquetIterable(InputFile location, .project(readSchema, SparkSchemaUtil.convert(readSchema)) .split(task.start(), task.length()) .createReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(tableSchema, readSchema, - fileSchema)) + fileSchema, numRecordsPerBatch)) .filter(task.residual()) .caseSensitive(caseSensitive) .recordsPerBatch(numRecordsPerBatch) diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java index da9a46681d6d..96933d454348 100644 --- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java +++ 
b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetVectorizedReader.java @@ -65,7 +65,7 @@ protected void writeAndValidate(Schema schema) throws IOException { try (CloseableIterable batchReader = Parquet.read(Files.localInput(testFile)) .project(schema) - .createReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, schema, type)) + .createReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(schema, schema, type, 10000)) .build()) { Iterator batches = batchReader.iterator();
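
Editor's note: outside of Spark, the same batch sizing can be threaded directly through the Parquet read builder, as the updated test above does. A condensed sketch combining the builder calls shown in this patch (schema and testFile are assumed; assertions are elided):

    try (CloseableIterable<ColumnarBatch> batchReader =
             Parquet.read(Files.localInput(testFile))
                 .project(schema)
                 .recordsPerBatch(10000)
                 .createReaderFunc(type ->
                     VectorizedSparkParquetReaders.buildReader(schema, schema, type, 10000))
                 .build()) {
      Iterator<ColumnarBatch> batches = batchReader.iterator();
      while (batches.hasNext()) {
        ColumnarBatch batch = batches.next();  // at most 10000 rows per batch
        // ... validate batch.numRows() and column contents ...
      }
    }
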