From 964634ea495c533f0229a476a0e2b3f7dde678ff Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Mon, 14 Nov 2022 05:46:26 +0530 Subject: [PATCH 1/5] HIVE-26734: Iceberg: Add an option to allow positional delete files without actual row data. --- .../hive/writer/HiveIcebergDeleteWriter.java | 14 +- .../iceberg/mr/hive/writer/WriterBuilder.java | 7 +- .../writer/HiveIcebergWriterTestBase.java | 24 +- .../queries/positive/iceberg_v2_deletes.q | 83 +++++ .../positive/llap/iceberg_v2_deletes.q.out | 315 ++++++++++++++++++ 5 files changed, 432 insertions(+), 11 deletions(-) create mode 100644 iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q create mode 100644 iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_v2_deletes.q.out diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java index b95ba910b63c..04382c67f899 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java @@ -41,13 +41,15 @@ class HiveIcebergDeleteWriter extends HiveIcebergWriterBase { private final GenericRecord rowDataTemplate; + private final boolean isWriteDeleteRow; HiveIcebergDeleteWriter(Schema schema, Map specs, FileWriterFactory writerFactory, OutputFileFactory fileFactory, FileIO io, - long targetFileSize) { + long targetFileSize, boolean isWriteDeleteRow) { super(schema, specs, io, new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, io, targetFileSize)); rowDataTemplate = GenericRecord.create(schema); + this.isWriteDeleteRow = isWriteDeleteRow; } @Override @@ -55,7 +57,15 @@ public void write(Writable row) throws IOException { Record rec = ((Container) row).get(); PositionDelete positionDelete = IcebergAcidUtil.getPositionDelete(rec, rowDataTemplate); int specId = IcebergAcidUtil.parseSpecId(rec); - writer.write(positionDelete, specs.get(specId), partition(positionDelete.row(), specId)); + if (isWriteDeleteRow) { + writer.write(positionDelete, specs.get(specId), partition(positionDelete.row(), specId)); + } else { + // Extract the row data to get the partition detail, and remove it from the positional delete to avoid writing + // the row data in the delete file + Record rowData = positionDelete.row(); + positionDelete.set(positionDelete.path(), positionDelete.pos(), null); + writer.write(positionDelete, specs.get(specId), partition(rowData, specId)); + } } @Override diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java index 64b4cdd32f20..9bb296dd7814 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java @@ -92,7 +92,10 @@ public HiveIcebergWriter build() { long targetFileSize = PropertyUtil.propertyAsLong(table.properties(), TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); + boolean isWriteDeleteRow = Boolean.parseBoolean(properties.getOrDefault("iceberg.write.deleterow", "false")); + Schema dataSchema = table.schema(); + Schema positionalDeleteSchema = isWriteDeleteRow ? 
dataSchema : null; FileIO io = table.io(); Map specs = table.specs(); int currentSpecId = table.spec().specId(); @@ -110,13 +113,13 @@ public HiveIcebergWriter build() { .build(); HiveFileWriterFactory writerFactory = new HiveFileWriterFactory(table, dataFileFormat, dataSchema, null, - deleteFileFormat, null, null, null, dataSchema); + deleteFileFormat, null, null, null, positionalDeleteSchema); HiveIcebergWriter writer; switch (operation) { case DELETE: writer = new HiveIcebergDeleteWriter(dataSchema, specs, writerFactory, deleteOutputFileFactory, - io, targetFileSize); + io, targetFileSize, isWriteDeleteRow); break; case OTHER: writer = new HiveIcebergRecordWriter(dataSchema, specs, currentSpecId, writerFactory, outputFileFactory, diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java index 1a818a131a04..57b4ffd9e2db 100644 --- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java +++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java @@ -22,6 +22,7 @@ import java.io.File; import java.io.IOException; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Set; import org.apache.hadoop.hive.conf.HiveConf; @@ -83,16 +84,24 @@ public class HiveIcebergWriterTestBase { @Parameterized.Parameter(1) public boolean partitioned; - @Parameterized.Parameters(name = "fileFormat={0}, partitioned={1}") + @Parameterized.Parameter(2) + public boolean isWriteDeleteRowData; + + @Parameterized.Parameters(name = "fileFormat={0}, partitioned={1}, isWriteDeleteRowData={2}") public static Collection parameters() { return Lists.newArrayList(new Object[][] { - { FileFormat.PARQUET, true }, - { FileFormat.ORC, true }, - { FileFormat.AVRO, true }, - { FileFormat.PARQUET, false }, + { FileFormat.PARQUET, true, true }, + { FileFormat.ORC, true, true }, + { FileFormat.AVRO, true, true }, + { FileFormat.PARQUET, false, true }, + { FileFormat.PARQUET, true, false }, + { FileFormat.ORC, true, false }, + { FileFormat.AVRO, true, false }, + { FileFormat.PARQUET, false, false }, // Skip this until the ORC reader is fixed - test only issue // { FileFormat.ORC, false }, - { FileFormat.AVRO, false } + { FileFormat.AVRO, false, true }, + { FileFormat.AVRO, false, false } }); } @@ -105,7 +114,8 @@ public void init() throws IOException { PartitionSpec.builderFor(SCHEMA) .bucket("data", 3) .build(); - this.helper = new TestHelper(new HiveConf(), tables, location.toString(), SCHEMA, spec, fileFormat, temp); + this.helper = new TestHelper(new HiveConf(), tables, location.toString(), SCHEMA, spec, fileFormat, + Collections.singletonMap("iceberg.write.deleterow", String.valueOf(isWriteDeleteRowData)), temp); this.table = helper.createTable(); helper.appendToTable(RECORDS); diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q new file mode 100644 index 000000000000..68b9d3164d3a --- /dev/null +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q @@ -0,0 +1,83 @@ +-- Mask random uuid +--! 
qt:replace:/(\s+'uuid'=')\S+('\s*)/$1#Masked#$2/ + +-- create an unpartitioned table with delete data set to true + create table ice01 (id int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true'); + +-- check the property value +show create table ice01; + +-- insert some values +insert into ice01 values (1),(2),(3),(4); + +-- check the inserted values +select * from ice01; + +-- delete some values +delete from ice01 where id>2; + +-- check the values, the delete value should be there +select * from ice01 order by id; + +-- insert some more data + insert into ice01 values (5),(6),(7),(8); + +-- check the values, only the delete value shouldn't be there +select * from ice01 order by id; + +-- delete one value +delete from ice01 where id=7; + +-- change the delete row type +Alter table ice01 set TBLPROPERTIES('iceberg.write.deleterow'='false'); + +-- check the property value +show create table ice01; + +-- delete some more rows now +delete from ice01 where id=5; + +-- check the entries, the deleted entries shouldn't be there. +select * from ice01 order by id; + +-- create a partitioned table with delete data set to true + create table icepart01 (id int) partitioned by (part int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true'); + +-- insert some values +insert into icepart01 values (1,1),(2,1),(3,2),(4,2); + +-- check the inserted values +select * from icepart01 order by id;; + +-- delete some values +delete from icepart01 where id>=2 AND id<4; + +-- check the values, the delete value should be there +select * from icepart01; + +-- insert some more data + insert into icepart01 values (5,1),(6,2),(7,1),(8,2); + +-- check the values, only the delete value shouldn't be there +select * from icepart01 order by id; + +-- delete one value +delete from icepart01 where id=7; + +-- change the delete row type +Alter table icepart01 set TBLPROPERTIES('iceberg.write.deleterow'='false'); + +-- check the property value +show create table icepart01; + +-- delete some more rows now +delete from icepart01 where id=5; + +-- check the entries, the deleted entries shouldn't be there. 
+select * from icepart01 order by id;; + +-- clean up +drop table ice01; +drop table icepart01; \ No newline at end of file diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_v2_deletes.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_v2_deletes.q.out new file mode 100644 index 000000000000..19c633fc1c5a --- /dev/null +++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_v2_deletes.q.out @@ -0,0 +1,315 @@ +PREHOOK: query: create table ice01 (id int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@ice01 +POSTHOOK: query: create table ice01 (id int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ice01 +PREHOOK: query: show create table ice01 +PREHOOK: type: SHOW_CREATETABLE +PREHOOK: Input: default@ice01 +POSTHOOK: query: show create table ice01 +POSTHOOK: type: SHOW_CREATETABLE +POSTHOOK: Input: default@ice01 +CREATE TABLE `ice01`( + `id` int) +ROW FORMAT SERDE + 'org.apache.iceberg.mr.hive.HiveIcebergSerDe' +STORED BY + 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' + +LOCATION +#### A masked pattern was here #### +TBLPROPERTIES ( + 'bucketing_version'='2', + 'engine.hive.enabled'='true', + 'format-version'='2', + 'iceberg.orc.files.only'='true', + 'iceberg.write.deleterow'='true', +#### A masked pattern was here #### + 'serialization.format'='1', + 'table_type'='ICEBERG', +#### A masked pattern was here #### + 'uuid'='#Masked#', + 'write.delete.mode'='merge-on-read', + 'write.format.default'='orc') +PREHOOK: query: insert into ice01 values (1),(2),(3),(4) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@ice01 +POSTHOOK: query: insert into ice01 values (1),(2),(3),(4) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@ice01 +PREHOOK: query: select * from ice01 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice01 +#### A masked pattern was here #### +POSTHOOK: query: select * from ice01 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice01 +#### A masked pattern was here #### +1 +2 +3 +4 +PREHOOK: query: delete from ice01 where id>2 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice01 +PREHOOK: Output: default@ice01 +POSTHOOK: query: delete from ice01 where id>2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice01 +POSTHOOK: Output: default@ice01 +PREHOOK: query: select * from ice01 order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@ice01 +#### A masked pattern was here #### +POSTHOOK: query: select * from ice01 order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice01 +#### A masked pattern was here #### +1 +2 +PREHOOK: query: insert into ice01 values (5),(6),(7),(8) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@ice01 +POSTHOOK: query: insert into ice01 values (5),(6),(7),(8) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@ice01 +PREHOOK: query: select * from ice01 order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@ice01 +#### A masked pattern was here #### +POSTHOOK: query: select * from ice01 order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice01 +#### A masked pattern was here #### +1 +2 +5 +6 +7 +8 
+PREHOOK: query: delete from ice01 where id=7 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice01 +PREHOOK: Output: default@ice01 +POSTHOOK: query: delete from ice01 where id=7 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice01 +POSTHOOK: Output: default@ice01 +PREHOOK: query: Alter table ice01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +PREHOOK: type: ALTERTABLE_PROPERTIES +PREHOOK: Input: default@ice01 +PREHOOK: Output: default@ice01 +POSTHOOK: query: Alter table ice01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +POSTHOOK: type: ALTERTABLE_PROPERTIES +POSTHOOK: Input: default@ice01 +POSTHOOK: Output: default@ice01 +PREHOOK: query: show create table ice01 +PREHOOK: type: SHOW_CREATETABLE +PREHOOK: Input: default@ice01 +POSTHOOK: query: show create table ice01 +POSTHOOK: type: SHOW_CREATETABLE +POSTHOOK: Input: default@ice01 +CREATE TABLE `ice01`( + `id` int) +ROW FORMAT SERDE + 'org.apache.iceberg.mr.hive.HiveIcebergSerDe' +STORED BY + 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' + +LOCATION +#### A masked pattern was here #### +TBLPROPERTIES ( + 'bucketing_version'='2', + 'engine.hive.enabled'='true', + 'format-version'='2', + 'iceberg.orc.files.only'='true', + 'iceberg.write.deleterow'='false', +#### A masked pattern was here #### + 'serialization.format'='1', + 'table_type'='ICEBERG', +#### A masked pattern was here #### + 'uuid'='#Masked#', + 'write.delete.mode'='merge-on-read', + 'write.format.default'='orc') +PREHOOK: query: delete from ice01 where id=5 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice01 +PREHOOK: Output: default@ice01 +POSTHOOK: query: delete from ice01 where id=5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice01 +POSTHOOK: Output: default@ice01 +PREHOOK: query: select * from ice01 order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@ice01 +#### A masked pattern was here #### +POSTHOOK: query: select * from ice01 order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice01 +#### A masked pattern was here #### +1 +2 +6 +8 +PREHOOK: query: create table icepart01 (id int) partitioned by (part int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@icepart01 +POSTHOOK: query: create table icepart01 (id int) partitioned by (part int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@icepart01 +PREHOOK: query: insert into icepart01 values (1,1),(2,1),(3,2),(4,2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@icepart01 +POSTHOOK: query: insert into icepart01 values (1,1),(2,1),(3,2),(4,2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@icepart01 +PREHOOK: query: select * from icepart01 order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@icepart01 +#### A masked pattern was here #### +POSTHOOK: query: select * from icepart01 order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@icepart01 +#### A masked pattern was here #### +1 1 +2 1 +3 2 +4 2 +PREHOOK: query: delete from icepart01 where id>=2 AND id<4 +PREHOOK: type: QUERY +PREHOOK: Input: default@icepart01 +PREHOOK: Output: default@icepart01 +POSTHOOK: query: delete from icepart01 where id>=2 AND id<4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@icepart01 +POSTHOOK: Output: 
default@icepart01 +PREHOOK: query: select * from icepart01 +PREHOOK: type: QUERY +PREHOOK: Input: default@icepart01 +#### A masked pattern was here #### +POSTHOOK: query: select * from icepart01 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@icepart01 +#### A masked pattern was here #### +1 1 +4 2 +PREHOOK: query: insert into icepart01 values (5,1),(6,2),(7,1),(8,2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@icepart01 +POSTHOOK: query: insert into icepart01 values (5,1),(6,2),(7,1),(8,2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@icepart01 +PREHOOK: query: select * from icepart01 order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@icepart01 +#### A masked pattern was here #### +POSTHOOK: query: select * from icepart01 order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@icepart01 +#### A masked pattern was here #### +1 1 +4 2 +5 1 +6 2 +7 1 +8 2 +PREHOOK: query: delete from icepart01 where id=7 +PREHOOK: type: QUERY +PREHOOK: Input: default@icepart01 +PREHOOK: Output: default@icepart01 +POSTHOOK: query: delete from icepart01 where id=7 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@icepart01 +POSTHOOK: Output: default@icepart01 +PREHOOK: query: Alter table icepart01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +PREHOOK: type: ALTERTABLE_PROPERTIES +PREHOOK: Input: default@icepart01 +PREHOOK: Output: default@icepart01 +POSTHOOK: query: Alter table icepart01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +POSTHOOK: type: ALTERTABLE_PROPERTIES +POSTHOOK: Input: default@icepart01 +POSTHOOK: Output: default@icepart01 +PREHOOK: query: show create table icepart01 +PREHOOK: type: SHOW_CREATETABLE +PREHOOK: Input: default@icepart01 +POSTHOOK: query: show create table icepart01 +POSTHOOK: type: SHOW_CREATETABLE +POSTHOOK: Input: default@icepart01 +CREATE TABLE `icepart01`( + `id` int, + `part` int) +PARTITIONED BY SPEC ( +part) +ROW FORMAT SERDE + 'org.apache.iceberg.mr.hive.HiveIcebergSerDe' +STORED BY + 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' + +LOCATION +#### A masked pattern was here #### +TBLPROPERTIES ( + 'bucketing_version'='2', + 'engine.hive.enabled'='true', + 'format-version'='2', + 'iceberg.orc.files.only'='true', + 'iceberg.write.deleterow'='false', +#### A masked pattern was here #### + 'serialization.format'='1', + 'table_type'='ICEBERG', +#### A masked pattern was here #### + 'uuid'='#Masked#', + 'write.delete.mode'='merge-on-read', + 'write.format.default'='orc') +PREHOOK: query: delete from icepart01 where id=5 +PREHOOK: type: QUERY +PREHOOK: Input: default@icepart01 +PREHOOK: Output: default@icepart01 +POSTHOOK: query: delete from icepart01 where id=5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@icepart01 +POSTHOOK: Output: default@icepart01 +PREHOOK: query: select * from icepart01 order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@icepart01 +#### A masked pattern was here #### +POSTHOOK: query: select * from icepart01 order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@icepart01 +#### A masked pattern was here #### +1 1 +4 2 +6 2 +8 2 +PREHOOK: query: drop table ice01 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@ice01 +PREHOOK: Output: default@ice01 +POSTHOOK: query: drop table ice01 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@ice01 +POSTHOOK: Output: default@ice01 +PREHOOK: query: drop table icepart01 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@icepart01 +PREHOOK: Output: 
default@icepart01 +POSTHOOK: query: drop table icepart01 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@icepart01 +POSTHOOK: Output: default@icepart01 From ddf8f5033ad087560edada6af4eaa870cd6499a6 Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Mon, 14 Nov 2022 08:36:30 +0530 Subject: [PATCH 2/5] Fix Test File Output Location. --- .../{llap => }/iceberg_v2_deletes.q.out | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) rename iceberg/iceberg-handler/src/test/results/positive/{llap => }/iceberg_v2_deletes.q.out (90%) diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_v2_deletes.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out similarity index 90% rename from iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_v2_deletes.q.out rename to iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out index 19c633fc1c5a..e4cdb9214fd0 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_v2_deletes.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out @@ -22,14 +22,14 @@ STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' LOCATION -#### A masked pattern was here #### + 'hdfs://### HDFS PATH ###' TBLPROPERTIES ( 'bucketing_version'='2', 'engine.hive.enabled'='true', 'format-version'='2', 'iceberg.orc.files.only'='true', 'iceberg.write.deleterow'='true', -#### A masked pattern was here #### + 'metadata_location'='hdfs://### HDFS PATH ###', 'serialization.format'='1', 'table_type'='ICEBERG', #### A masked pattern was here #### @@ -47,11 +47,11 @@ POSTHOOK: Output: default@ice01 PREHOOK: query: select * from ice01 PREHOOK: type: QUERY PREHOOK: Input: default@ice01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from ice01 POSTHOOK: type: QUERY POSTHOOK: Input: default@ice01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 2 3 @@ -67,11 +67,11 @@ POSTHOOK: Output: default@ice01 PREHOOK: query: select * from ice01 order by id PREHOOK: type: QUERY PREHOOK: Input: default@ice01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from ice01 order by id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 2 PREHOOK: query: insert into ice01 values (5),(6),(7),(8) @@ -85,11 +85,11 @@ POSTHOOK: Output: default@ice01 PREHOOK: query: select * from ice01 order by id PREHOOK: type: QUERY PREHOOK: Input: default@ice01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from ice01 order by id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 2 5 @@ -126,7 +126,7 @@ STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' LOCATION -#### A masked pattern was here #### + 'hdfs://### HDFS PATH ###' TBLPROPERTIES ( 'bucketing_version'='2', 'engine.hive.enabled'='true', @@ -134,6 +134,8 @@ TBLPROPERTIES ( 'iceberg.orc.files.only'='true', 'iceberg.write.deleterow'='false', #### A masked pattern was here #### + 'metadata_location'='hdfs://### HDFS PATH ###', + 'previous_metadata_location'='hdfs://### HDFS PATH ###', 'serialization.format'='1', 'table_type'='ICEBERG', #### A masked pattern was here #### @@ -151,11 +153,11 @@ POSTHOOK: Output: default@ice01 PREHOOK: query: 
select * from ice01 order by id PREHOOK: type: QUERY PREHOOK: Input: default@ice01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from ice01 order by id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 2 6 @@ -181,11 +183,11 @@ POSTHOOK: Output: default@icepart01 PREHOOK: query: select * from icepart01 order by id PREHOOK: type: QUERY PREHOOK: Input: default@icepart01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from icepart01 order by id POSTHOOK: type: QUERY POSTHOOK: Input: default@icepart01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 1 2 1 3 2 @@ -201,11 +203,11 @@ POSTHOOK: Output: default@icepart01 PREHOOK: query: select * from icepart01 PREHOOK: type: QUERY PREHOOK: Input: default@icepart01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from icepart01 POSTHOOK: type: QUERY POSTHOOK: Input: default@icepart01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 1 4 2 PREHOOK: query: insert into icepart01 values (5,1),(6,2),(7,1),(8,2) @@ -219,11 +221,11 @@ POSTHOOK: Output: default@icepart01 PREHOOK: query: select * from icepart01 order by id PREHOOK: type: QUERY PREHOOK: Input: default@icepart01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from icepart01 order by id POSTHOOK: type: QUERY POSTHOOK: Input: default@icepart01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 1 4 2 5 1 @@ -263,7 +265,7 @@ STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' LOCATION -#### A masked pattern was here #### + 'hdfs://### HDFS PATH ###' TBLPROPERTIES ( 'bucketing_version'='2', 'engine.hive.enabled'='true', @@ -271,6 +273,8 @@ TBLPROPERTIES ( 'iceberg.orc.files.only'='true', 'iceberg.write.deleterow'='false', #### A masked pattern was here #### + 'metadata_location'='hdfs://### HDFS PATH ###', + 'previous_metadata_location'='hdfs://### HDFS PATH ###', 'serialization.format'='1', 'table_type'='ICEBERG', #### A masked pattern was here #### @@ -288,11 +292,11 @@ POSTHOOK: Output: default@icepart01 PREHOOK: query: select * from icepart01 order by id PREHOOK: type: QUERY PREHOOK: Input: default@icepart01 -#### A masked pattern was here #### +PREHOOK: Output: hdfs://### HDFS PATH ### POSTHOOK: query: select * from icepart01 order by id POSTHOOK: type: QUERY POSTHOOK: Input: default@icepart01 -#### A masked pattern was here #### +POSTHOOK: Output: hdfs://### HDFS PATH ### 1 1 4 2 6 2 From 228bc5baf5bcf536b74890dc9f127c4c3baef6d5 Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Mon, 14 Nov 2022 13:31:59 +0530 Subject: [PATCH 3/5] Regen test file. 
--- .../test/results/positive/iceberg_v2_deletes.q.out | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out index e4cdb9214fd0..06883a05858f 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out @@ -35,7 +35,9 @@ TBLPROPERTIES ( #### A masked pattern was here #### 'uuid'='#Masked#', 'write.delete.mode'='merge-on-read', - 'write.format.default'='orc') + 'write.format.default'='orc', + 'write.merge.mode'='merge-on-read', + 'write.update.mode'='merge-on-read') PREHOOK: query: insert into ice01 values (1),(2),(3),(4) PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table @@ -141,7 +143,9 @@ TBLPROPERTIES ( #### A masked pattern was here #### 'uuid'='#Masked#', 'write.delete.mode'='merge-on-read', - 'write.format.default'='orc') + 'write.format.default'='orc', + 'write.merge.mode'='merge-on-read', + 'write.update.mode'='merge-on-read') PREHOOK: query: delete from ice01 where id=5 PREHOOK: type: QUERY PREHOOK: Input: default@ice01 @@ -280,7 +284,9 @@ TBLPROPERTIES ( #### A masked pattern was here #### 'uuid'='#Masked#', 'write.delete.mode'='merge-on-read', - 'write.format.default'='orc') + 'write.format.default'='orc', + 'write.merge.mode'='merge-on-read', + 'write.update.mode'='merge-on-read') PREHOOK: query: delete from icepart01 where id=5 PREHOOK: type: QUERY PREHOOK: Input: default@icepart01 From d5e0099d5f38db5c860bcbf8aa734bc1d89715ac Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Tue, 15 Nov 2022 02:42:47 +0530 Subject: [PATCH 4/5] Address Review Comments. 
--- .../hive/writer/HiveIcebergDeleteWriter.java | 17 ++++++-------- .../iceberg/mr/hive/writer/WriterBuilder.java | 13 ++++++----- .../writer/HiveIcebergWriterTestBase.java | 6 ++--- .../queries/positive/iceberg_v2_deletes.q | 16 +++++++------- .../results/positive/iceberg_v2_deletes.q.out | 22 +++++++++---------- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java index 04382c67f899..c1d8c606fa97 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java @@ -41,15 +41,15 @@ class HiveIcebergDeleteWriter extends HiveIcebergWriterBase { private final GenericRecord rowDataTemplate; - private final boolean isWriteDeleteRow; + private final boolean skipOriginalRow; HiveIcebergDeleteWriter(Schema schema, Map specs, FileWriterFactory writerFactory, OutputFileFactory fileFactory, FileIO io, - long targetFileSize, boolean isWriteDeleteRow) { + long targetFileSize, boolean skipOriginalRow) { super(schema, specs, io, new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, io, targetFileSize)); rowDataTemplate = GenericRecord.create(schema); - this.isWriteDeleteRow = isWriteDeleteRow; + this.skipOriginalRow = skipOriginalRow; } @Override @@ -57,15 +57,12 @@ public void write(Writable row) throws IOException { Record rec = ((Container) row).get(); PositionDelete positionDelete = IcebergAcidUtil.getPositionDelete(rec, rowDataTemplate); int specId = IcebergAcidUtil.parseSpecId(rec); - if (isWriteDeleteRow) { - writer.write(positionDelete, specs.get(specId), partition(positionDelete.row(), specId)); - } else { - // Extract the row data to get the partition detail, and remove it from the positional delete to avoid writing - // the row data in the delete file - Record rowData = positionDelete.row(); + Record rowData = positionDelete.row(); + if (skipOriginalRow) { + // Set null as the row data as we intend to avoid writing the actual row data in the delete file. positionDelete.set(positionDelete.path(), positionDelete.pos(), null); - writer.write(positionDelete, specs.get(specId), partition(rowData, specId)); } + writer.write(positionDelete, specs.get(specId), partition(rowData, specId)); } @Override diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java index 9bb296dd7814..8d6268f275ad 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java @@ -47,6 +47,9 @@ public class WriterBuilder { // A task may write multiple output files using multiple writers. Each of them must have a unique operationId. private static AtomicInteger operationNum = new AtomicInteger(0); + // To specify whether to write the actual row data while writing the delete files. 
+ public static final String ICEBERG_DELETE_SKIPROWDATA = "iceberg.delete.skiprowdata"; + private WriterBuilder(Table table) { this.table = table; } @@ -92,10 +95,9 @@ public HiveIcebergWriter build() { long targetFileSize = PropertyUtil.propertyAsLong(table.properties(), TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); - boolean isWriteDeleteRow = Boolean.parseBoolean(properties.getOrDefault("iceberg.write.deleterow", "false")); + boolean skipOriginalRow = Boolean.parseBoolean(properties.getOrDefault(ICEBERG_DELETE_SKIPROWDATA, "true")); Schema dataSchema = table.schema(); - Schema positionalDeleteSchema = isWriteDeleteRow ? dataSchema : null; FileIO io = table.io(); Map specs = table.specs(); int currentSpecId = table.spec().specId(); @@ -112,14 +114,15 @@ public HiveIcebergWriter build() { .operationId("delete-" + operationId) .build(); - HiveFileWriterFactory writerFactory = new HiveFileWriterFactory(table, dataFileFormat, dataSchema, null, - deleteFileFormat, null, null, null, positionalDeleteSchema); + HiveFileWriterFactory writerFactory = + new HiveFileWriterFactory(table, dataFileFormat, dataSchema, null, deleteFileFormat, null, null, null, + skipOriginalRow ? null : dataSchema); HiveIcebergWriter writer; switch (operation) { case DELETE: writer = new HiveIcebergDeleteWriter(dataSchema, specs, writerFactory, deleteOutputFileFactory, - io, targetFileSize, isWriteDeleteRow); + io, targetFileSize, skipOriginalRow); break; case OTHER: writer = new HiveIcebergRecordWriter(dataSchema, specs, currentSpecId, writerFactory, outputFileFactory, diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java index 57b4ffd9e2db..bd3d3aefce23 100644 --- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java +++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterTestBase.java @@ -85,9 +85,9 @@ public class HiveIcebergWriterTestBase { public boolean partitioned; @Parameterized.Parameter(2) - public boolean isWriteDeleteRowData; + public boolean skipRowData; - @Parameterized.Parameters(name = "fileFormat={0}, partitioned={1}, isWriteDeleteRowData={2}") + @Parameterized.Parameters(name = "fileFormat={0}, partitioned={1}, skipRowData={2}") public static Collection parameters() { return Lists.newArrayList(new Object[][] { { FileFormat.PARQUET, true, true }, @@ -115,7 +115,7 @@ public void init() throws IOException { .bucket("data", 3) .build(); this.helper = new TestHelper(new HiveConf(), tables, location.toString(), SCHEMA, spec, fileFormat, - Collections.singletonMap("iceberg.write.deleterow", String.valueOf(isWriteDeleteRowData)), temp); + Collections.singletonMap(WriterBuilder.ICEBERG_DELETE_SKIPROWDATA, String.valueOf(skipRowData)), temp); this.table = helper.createTable(); helper.appendToTable(RECORDS); diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q index 68b9d3164d3a..43a1a8033caf 100644 --- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_v2_deletes.q @@ -1,9 +1,9 @@ -- Mask random uuid --! 
qt:replace:/(\s+'uuid'=')\S+('\s*)/$1#Masked#$2/ --- create an unpartitioned table with delete data set to true +-- create an unpartitioned table with skip delete data set to false create table ice01 (id int) Stored by Iceberg stored as ORC - TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true'); + TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false'); -- check the property value show create table ice01; @@ -29,8 +29,8 @@ select * from ice01 order by id; -- delete one value delete from ice01 where id=7; --- change the delete row type -Alter table ice01 set TBLPROPERTIES('iceberg.write.deleterow'='false'); +-- change to skip the row data now +Alter table ice01 set TBLPROPERTIES('iceberg.delete.skiprowdata'='true'); -- check the property value show create table ice01; @@ -41,9 +41,9 @@ delete from ice01 where id=5; -- check the entries, the deleted entries shouldn't be there. select * from ice01 order by id; --- create a partitioned table with delete data set to true +-- create a partitioned table with skip row data set to false create table icepart01 (id int) partitioned by (part int) Stored by Iceberg stored as ORC - TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true'); + TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false'); -- insert some values insert into icepart01 values (1,1),(2,1),(3,2),(4,2); @@ -66,8 +66,8 @@ select * from icepart01 order by id; -- delete one value delete from icepart01 where id=7; --- change the delete row type -Alter table icepart01 set TBLPROPERTIES('iceberg.write.deleterow'='false'); +-- change to skip the row data now +Alter table icepart01 set TBLPROPERTIES('iceberg.delete.skiprowdata'='true'); -- check the property value show create table icepart01; diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out index 06883a05858f..bf7bb4d4bc78 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_v2_deletes.q.out @@ -1,10 +1,10 @@ PREHOOK: query: create table ice01 (id int) Stored by Iceberg stored as ORC - TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') + TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false') PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@ice01 POSTHOOK: query: create table ice01 (id int) Stored by Iceberg stored as ORC - TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') + TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false') POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@ice01 @@ -27,8 +27,8 @@ TBLPROPERTIES ( 'bucketing_version'='2', 'engine.hive.enabled'='true', 'format-version'='2', + 'iceberg.delete.skiprowdata'='false', 'iceberg.orc.files.only'='true', - 'iceberg.write.deleterow'='true', 'metadata_location'='hdfs://### HDFS PATH ###', 'serialization.format'='1', 'table_type'='ICEBERG', @@ -106,11 +106,11 @@ POSTHOOK: query: delete from ice01 where id=7 POSTHOOK: type: QUERY POSTHOOK: Input: default@ice01 POSTHOOK: Output: default@ice01 -PREHOOK: query: Alter table ice01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +PREHOOK: query: Alter table ice01 set TBLPROPERTIES('iceberg.delete.skiprowdata'='true') PREHOOK: type: ALTERTABLE_PROPERTIES PREHOOK: Input: default@ice01 PREHOOK: Output: default@ice01 
-POSTHOOK: query: Alter table ice01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +POSTHOOK: query: Alter table ice01 set TBLPROPERTIES('iceberg.delete.skiprowdata'='true') POSTHOOK: type: ALTERTABLE_PROPERTIES POSTHOOK: Input: default@ice01 POSTHOOK: Output: default@ice01 @@ -133,8 +133,8 @@ TBLPROPERTIES ( 'bucketing_version'='2', 'engine.hive.enabled'='true', 'format-version'='2', + 'iceberg.delete.skiprowdata'='true', 'iceberg.orc.files.only'='true', - 'iceberg.write.deleterow'='false', #### A masked pattern was here #### 'metadata_location'='hdfs://### HDFS PATH ###', 'previous_metadata_location'='hdfs://### HDFS PATH ###', @@ -167,12 +167,12 @@ POSTHOOK: Output: hdfs://### HDFS PATH ### 6 8 PREHOOK: query: create table icepart01 (id int) partitioned by (part int) Stored by Iceberg stored as ORC - TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') + TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false') PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@icepart01 POSTHOOK: query: create table icepart01 (id int) partitioned by (part int) Stored by Iceberg stored as ORC - TBLPROPERTIES('format-version'='2', 'iceberg.write.deleterow'='true') + TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false') POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@icepart01 @@ -244,11 +244,11 @@ POSTHOOK: query: delete from icepart01 where id=7 POSTHOOK: type: QUERY POSTHOOK: Input: default@icepart01 POSTHOOK: Output: default@icepart01 -PREHOOK: query: Alter table icepart01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +PREHOOK: query: Alter table icepart01 set TBLPROPERTIES('iceberg.delete.skiprowdata'='true') PREHOOK: type: ALTERTABLE_PROPERTIES PREHOOK: Input: default@icepart01 PREHOOK: Output: default@icepart01 -POSTHOOK: query: Alter table icepart01 set TBLPROPERTIES('iceberg.write.deleterow'='false') +POSTHOOK: query: Alter table icepart01 set TBLPROPERTIES('iceberg.delete.skiprowdata'='true') POSTHOOK: type: ALTERTABLE_PROPERTIES POSTHOOK: Input: default@icepart01 POSTHOOK: Output: default@icepart01 @@ -274,8 +274,8 @@ TBLPROPERTIES ( 'bucketing_version'='2', 'engine.hive.enabled'='true', 'format-version'='2', + 'iceberg.delete.skiprowdata'='true', 'iceberg.orc.files.only'='true', - 'iceberg.write.deleterow'='false', #### A masked pattern was here #### 'metadata_location'='hdfs://### HDFS PATH ###', 'previous_metadata_location'='hdfs://### HDFS PATH ###', From c18f9950f36ee5ec1581d9ce8b7c69b1b84f43d3 Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Tue, 15 Nov 2022 11:34:56 +0530 Subject: [PATCH 5/5] Address Review Comments. 
--- .../iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java | 8 ++++---- .../org/apache/iceberg/mr/hive/writer/WriterBuilder.java | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java index c1d8c606fa97..bd61f101cd95 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDeleteWriter.java @@ -41,15 +41,15 @@ class HiveIcebergDeleteWriter extends HiveIcebergWriterBase { private final GenericRecord rowDataTemplate; - private final boolean skipOriginalRow; + private final boolean skipRowData; HiveIcebergDeleteWriter(Schema schema, Map specs, FileWriterFactory writerFactory, OutputFileFactory fileFactory, FileIO io, - long targetFileSize, boolean skipOriginalRow) { + long targetFileSize, boolean skipRowData) { super(schema, specs, io, new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, io, targetFileSize)); rowDataTemplate = GenericRecord.create(schema); - this.skipOriginalRow = skipOriginalRow; + this.skipRowData = skipRowData; } @Override @@ -58,7 +58,7 @@ public void write(Writable row) throws IOException { PositionDelete positionDelete = IcebergAcidUtil.getPositionDelete(rec, rowDataTemplate); int specId = IcebergAcidUtil.parseSpecId(rec); Record rowData = positionDelete.row(); - if (skipOriginalRow) { + if (skipRowData) { // Set null as the row data as we intend to avoid writing the actual row data in the delete file. positionDelete.set(positionDelete.path(), positionDelete.pos(), null); } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java index 8d6268f275ad..31073f640186 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java @@ -49,6 +49,7 @@ public class WriterBuilder { // To specify whether to write the actual row data while writing the delete files. public static final String ICEBERG_DELETE_SKIPROWDATA = "iceberg.delete.skiprowdata"; + public static final String ICEBERG_DELETE_SKIPROWDATA_DEFAULT = "true"; private WriterBuilder(Table table) { this.table = table; @@ -95,7 +96,8 @@ public HiveIcebergWriter build() { long targetFileSize = PropertyUtil.propertyAsLong(table.properties(), TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); - boolean skipOriginalRow = Boolean.parseBoolean(properties.getOrDefault(ICEBERG_DELETE_SKIPROWDATA, "true")); + boolean skipRowData = + Boolean.parseBoolean(properties.getOrDefault(ICEBERG_DELETE_SKIPROWDATA, ICEBERG_DELETE_SKIPROWDATA_DEFAULT)); Schema dataSchema = table.schema(); FileIO io = table.io(); @@ -116,13 +118,13 @@ public HiveIcebergWriter build() { HiveFileWriterFactory writerFactory = new HiveFileWriterFactory(table, dataFileFormat, dataSchema, null, deleteFileFormat, null, null, null, - skipOriginalRow ? null : dataSchema); + skipRowData ? 
null : dataSchema); HiveIcebergWriter writer; switch (operation) { case DELETE: writer = new HiveIcebergDeleteWriter(dataSchema, specs, writerFactory, deleteOutputFileFactory, - io, targetFileSize, skipOriginalRow); + io, targetFileSize, skipRowData); break; case OTHER: writer = new HiveIcebergRecordWriter(dataSchema, specs, currentSpecId, writerFactory, outputFileFactory,
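
A minimal usage sketch of the option this series adds, based on the test queries above and the final property name 'iceberg.delete.skiprowdata' (default 'true') settled on in the review-comment patches; the table name ice_demo is illustrative only:

-- keep the full row data inside positional delete files for this v2 table
create table ice_demo (id int) Stored by Iceberg stored as ORC
  TBLPROPERTIES('format-version'='2', 'iceberg.delete.skiprowdata'='false');

delete from ice_demo where id > 2;

-- switch to position-only delete files (file path and position, row data omitted)
Alter table ice_demo set TBLPROPERTIES('iceberg.delete.skiprowdata'='true');

delete from ice_demo where id = 5;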