-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Remove unnecessary row filtering in deleted manifest file #4316
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,9 @@ | |
| import org.apache.iceberg.PartitionSpec; | ||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.StructLike; | ||
| import org.apache.iceberg.flink.FlinkSchemaUtil; | ||
| import org.apache.iceberg.flink.RowDataWrapper; | ||
| import org.apache.iceberg.flink.data.RowDataProjection; | ||
| import org.apache.iceberg.io.BaseTaskWriter; | ||
| import org.apache.iceberg.io.FileAppenderFactory; | ||
| import org.apache.iceberg.io.FileIO; | ||
|
|
@@ -41,6 +43,7 @@ abstract class BaseDeltaTaskWriter extends BaseTaskWriter<RowData> { | |
| private final Schema schema; | ||
| private final Schema deleteSchema; | ||
| private final RowDataWrapper wrapper; | ||
| private final RowDataWrapper wrapperDelete; | ||
| private final boolean upsert; | ||
|
|
||
| BaseDeltaTaskWriter(PartitionSpec spec, | ||
|
|
@@ -57,6 +60,7 @@ abstract class BaseDeltaTaskWriter extends BaseTaskWriter<RowData> { | |
| this.schema = schema; | ||
| this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); | ||
| this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); | ||
| this.wrapperDelete = new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about |
||
| this.upsert = upsert; | ||
| } | ||
|
|
||
|
|
@@ -66,6 +70,10 @@ RowDataWrapper wrapper() { | |
| return wrapper; | ||
| } | ||
|
|
||
| RowDataWrapper wrapperDelete() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is there an accessor for this? |
||
| return wrapperDelete; | ||
| } | ||
|
|
||
| @Override | ||
| public void write(RowData row) throws IOException { | ||
| RowDataDeltaWriter writer = route(row); | ||
|
|
@@ -74,7 +82,8 @@ public void write(RowData row) throws IOException { | |
| case INSERT: | ||
| case UPDATE_AFTER: | ||
| if (upsert) { | ||
| writer.delete(row); | ||
| RowData wrap = RowDataProjection.create(schema, deleteSchema).wrap(row); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't create a new projection every time. Instead it should create one and reuse it by calling |
||
| writer.deleteKey(wrap); | ||
| } | ||
| writer.write(row); | ||
| break; | ||
|
|
@@ -83,10 +92,21 @@ public void write(RowData row) throws IOException { | |
| if (upsert) { | ||
| break; // UPDATE_BEFORE is not necessary for UPDATE, we do nothing to prevent delete one row twice | ||
| } | ||
| writer.delete(row); | ||
| if (deleteSchema != null) { | ||
| RowData wrap = RowDataProjection.create(schema, deleteSchema).wrap(row); | ||
| writer.deleteKey(wrap); | ||
| } else { | ||
| writer.delete(row); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does this need to change? I think you just want to fix the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Additionally, in the We've been working on the PR in my fork but I'll run some tests. |
||
| break; | ||
|
|
||
| case DELETE: | ||
| writer.delete(row); | ||
| if (deleteSchema != null) { | ||
| RowData wrap = RowDataProjection.create(schema, deleteSchema).wrap(row); | ||
| writer.deleteKey(wrap); | ||
| } else { | ||
| writer.delete(row); | ||
| } | ||
| break; | ||
|
|
||
| default: | ||
|
|
@@ -103,5 +123,10 @@ protected class RowDataDeltaWriter extends BaseEqualityDeltaWriter { | |
| protected StructLike asStructLike(RowData data) { | ||
| return wrapper.wrap(data); | ||
| } | ||
|
|
||
| @Override | ||
| protected StructLike asStructLikeKey(RowData data) { | ||
| return wrapperDelete.wrap(data); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,6 +35,8 @@ | |
| import org.apache.iceberg.io.TaskWriter; | ||
| import org.apache.iceberg.io.UnpartitionedWriter; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Sets; | ||
| import org.apache.iceberg.types.TypeUtil; | ||
| import org.apache.iceberg.util.ArrayUtil; | ||
|
|
||
| public class RowDataTaskWriterFactory implements TaskWriterFactory<RowData> { | ||
|
|
@@ -70,9 +72,8 @@ public RowDataTaskWriterFactory(Table table, | |
| if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { | ||
| this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); | ||
| } else { | ||
| // TODO provide the ability to customize the equality-delete row schema. | ||
| this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec, | ||
| ArrayUtil.toIntArray(equalityFieldIds), schema, null); | ||
| ArrayUtil.toIntArray(equalityFieldIds), TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), null); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the correct schema for |
||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -450,7 +450,10 @@ private static class RowDataWriter extends ParquetValueWriters.StructWriter<RowD | |
|
|
||
| @Override | ||
| protected Object get(RowData struct, int index) { | ||
| return fieldGetter[index].getFieldOrNull(struct); | ||
| if (struct.isNullAt(index)) { | ||
| return null; | ||
| } | ||
| return this.fieldGetter[index].getFieldOrNull(struct); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's keep the work to just Flink 1.14 for now, so comments don't get duplicated on many things. |
||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,9 @@ | |
| import org.apache.iceberg.PartitionSpec; | ||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.StructLike; | ||
| import org.apache.iceberg.flink.FlinkSchemaUtil; | ||
| import org.apache.iceberg.flink.RowDataWrapper; | ||
| import org.apache.iceberg.flink.data.RowDataProjection; | ||
| import org.apache.iceberg.io.BaseTaskWriter; | ||
| import org.apache.iceberg.io.FileAppenderFactory; | ||
| import org.apache.iceberg.io.FileIO; | ||
|
|
@@ -41,6 +43,7 @@ abstract class BaseDeltaTaskWriter extends BaseTaskWriter<RowData> { | |
| private final Schema schema; | ||
| private final Schema deleteSchema; | ||
| private final RowDataWrapper wrapper; | ||
| private final RowDataWrapper wrapperDelete; | ||
| private final boolean upsert; | ||
|
|
||
| BaseDeltaTaskWriter(PartitionSpec spec, | ||
|
|
@@ -57,6 +60,7 @@ abstract class BaseDeltaTaskWriter extends BaseTaskWriter<RowData> { | |
| this.schema = schema; | ||
| this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); | ||
| this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); | ||
| this.wrapperDelete = new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); | ||
| this.upsert = upsert; | ||
| } | ||
|
|
||
|
|
@@ -66,6 +70,10 @@ RowDataWrapper wrapper() { | |
| return wrapper; | ||
| } | ||
|
|
||
| RowDataWrapper wrapperDelete() { | ||
| return wrapperDelete; | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you revert the changes in all modules other than 1.14? That makes reviewing and updating for review comments much easier. Once this goes in we can backport to 1.12 and 1.13.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah agreed. I left a similar comment on another file because the changes were a bit much. It keeps the discussion of a specific change all in one place.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, I believe that @hililiwei made the changes as existing tests might not pass in earlier versions of Flink. But for something important, we should still keep the changes in one PR while reviewing. Otherwise it's difficult for others to review. |
||
|
|
||
| @Override | ||
| public void write(RowData row) throws IOException { | ||
| RowDataDeltaWriter writer = route(row); | ||
|
|
@@ -74,7 +82,8 @@ public void write(RowData row) throws IOException { | |
| case INSERT: | ||
| case UPDATE_AFTER: | ||
| if (upsert) { | ||
| writer.delete(row); | ||
| RowData wrap = RowDataProjection.create(schema, deleteSchema).wrap(row); | ||
| writer.deleteKey(wrap); | ||
| } | ||
| writer.write(row); | ||
| break; | ||
|
|
@@ -83,10 +92,21 @@ public void write(RowData row) throws IOException { | |
| if (upsert) { | ||
| break; // UPDATE_BEFORE is not necessary for UPDATE, we do nothing to prevent delete one row twice | ||
| } | ||
| writer.delete(row); | ||
| if (deleteSchema != null) { | ||
| RowData wrap = RowDataProjection.create(schema, deleteSchema).wrap(row); | ||
| writer.deleteKey(wrap); | ||
| } else { | ||
| writer.delete(row); | ||
| } | ||
| break; | ||
|
|
||
| case DELETE: | ||
| writer.delete(row); | ||
| if (deleteSchema != null) { | ||
| RowData wrap = RowDataProjection.create(schema, deleteSchema).wrap(row); | ||
| writer.deleteKey(wrap); | ||
| } else { | ||
| writer.delete(row); | ||
| } | ||
| break; | ||
|
|
||
| default: | ||
|
|
@@ -103,5 +123,10 @@ protected class RowDataDeltaWriter extends BaseEqualityDeltaWriter { | |
| protected StructLike asStructLike(RowData data) { | ||
| return wrapper.wrap(data); | ||
| } | ||
|
|
||
| @Override | ||
| protected StructLike asStructLikeKey(RowData data) { | ||
| return wrapperDelete.wrap(data); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did this need to change? NPE in a test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is an NPE in some test cases, yes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
However, I'm going to investigate a bit further as I do think it might be indicative of a bug.
I think if we use the correct deletion schema in all cases, the NPEs will go away. I am testing now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or actually, I don't think this change is needed. If there's no
fieldGetter for a given index, that's likely indicative of a bug. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah this didn't need to be changed if we use the full schema as the deletion schema outside of
upsert case like here https://github.com/apache/iceberg/pull/4364/files#diff-bbdfbcbd83d2e6f53d402e804dcd0a6fd28ab39d1cc3f74a88641ae8763fde3bR75-R87