@@ -286,7 +286,9 @@ private Transaction newReplaceTableTransaction(boolean orCreate) {
*
* @param io a FileIO to use for deletes
* @param metadata the last valid TableMetadata instance for a dropped table.
* @deprecated will be removed in 0.11.0; use CatalogUtil.dropTableData instead.
*/
@Deprecated
protected static void dropTableData(FileIO io, TableMetadata metadata) {
// Reads and deletes are done using Tasks.foreach(...).suppressFailureWhenFinished to complete
// as much of the delete work as possible and avoid orphaned data or manifest files.
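The deprecation note above points existing catalog implementations at the new static helper. A minimal migration sketch of the purge step inside a hypothetical catalog's dropTable (only CatalogUtil.dropTableData, FileIO, and TableMetadata come from this PR; the wrapper class and method names are illustrative):

import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.io.FileIO;

// Hypothetical helper showing the purge step of a catalog's dropTable(identifier, purge).
final class DropTablePurge {
  static void purgeIfRequested(boolean purge, FileIO io, TableMetadata lastMetadata) {
    if (purge && lastMetadata != null) {
      // Previously: the protected, now-deprecated dropTableData(io, lastMetadata)
      // inherited from the base metastore catalog class.
      CatalogUtil.dropTableData(io, lastMetadata);
    }
  }
}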
119 changes: 119 additions & 0 deletions core/src/main/java/org/apache/iceberg/CatalogUtil.java
@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg;

import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.MapMaker;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.util.Tasks;
import org.apache.iceberg.util.ThreadPools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CatalogUtil {
private static final Logger LOG = LoggerFactory.getLogger(CatalogUtil.class);

private CatalogUtil() {
}

/**
* Drops all data and metadata files referenced by TableMetadata.
* <p>
* This should be called by dropTable implementations to clean up table files once the table has been dropped in the
* metastore.
*
* @param io a FileIO to use for deletes
* @param metadata the last valid TableMetadata instance for a dropped table.
*/
public static void dropTableData(FileIO io, TableMetadata metadata) {
// Reads and deletes are done using Tasks.foreach(...).suppressFailureWhenFinished to complete
// as much of the delete work as possible and avoid orphaned data or manifest files.

Set<String> manifestListsToDelete = Sets.newHashSet();
Set<ManifestFile> manifestsToDelete = Sets.newHashSet();
for (Snapshot snapshot : metadata.snapshots()) {
// add all manifests to the delete set because both data and delete files should be removed
Iterables.addAll(manifestsToDelete, snapshot.allManifests());
// add the manifest list to the delete set, if present
if (snapshot.manifestListLocation() != null) {
manifestListsToDelete.add(snapshot.manifestListLocation());
}
}

LOG.info("Manifests to delete: {}", Joiner.on(", ").join(manifestsToDelete));

// run all of the deletes

deleteFiles(io, manifestsToDelete);

Tasks.foreach(Iterables.transform(manifestsToDelete, ManifestFile::path))
.noRetry().suppressFailureWhenFinished()
.onFailure((manifest, exc) -> LOG.warn("Delete failed for manifest: {}", manifest, exc))
.run(io::deleteFile);

Tasks.foreach(manifestListsToDelete)
.noRetry().suppressFailureWhenFinished()
.onFailure((list, exc) -> LOG.warn("Delete failed for manifest list: {}", list, exc))
.run(io::deleteFile);

Tasks.foreach(metadata.metadataFileLocation())
.noRetry().suppressFailureWhenFinished()
.onFailure((list, exc) -> LOG.warn("Delete failed for metadata file: {}", list, exc))
.run(io::deleteFile);
}

private static void deleteFiles(FileIO io, Set<ManifestFile> allManifests) {
// keep track of deleted files in a map that can be cleaned up when memory runs low
Map<String, Boolean> deletedFiles = new MapMaker()
.concurrencyLevel(ThreadPools.WORKER_THREAD_POOL_SIZE)
.weakKeys()
.makeMap();

Tasks.foreach(allManifests)
.noRetry().suppressFailureWhenFinished()
.executeWith(ThreadPools.getWorkerPool())
.onFailure((item, exc) -> LOG.warn("Failed to get deleted files: this may cause orphaned data files", exc))
.run(manifest -> {
try (ManifestReader<?> reader = ManifestFiles.open(manifest, io)) {
for (ManifestEntry<?> entry : reader.entries()) {
// intern the file path because the weak key map uses identity (==) instead of equals
String path = entry.file().path().toString().intern();
Boolean alreadyDeleted = deletedFiles.putIfAbsent(path, true);
if (alreadyDeleted == null || !alreadyDeleted) {
try {
io.deleteFile(path);
} catch (RuntimeException e) {
// this may happen if the map of deleted files gets cleaned up by gc
LOG.warn("Delete failed for data file: {}", path, e);
}
}
}
} catch (IOException e) {
throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest.path());
}
});
}
}
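One subtle point in deleteFiles above: the deleted-file map is built with MapMaker().weakKeys(), which switches key comparison to identity (==), so file paths must be interned before use. A small standalone sketch of that behavior, using plain Guava rather than Iceberg's relocated package (the demo class and paths are assumptions):

import java.util.Map;
import com.google.common.collect.MapMaker;

public class WeakKeyInternDemo {
  public static void main(String[] args) {
    // Same construction as CatalogUtil.deleteFiles, minus the concurrency level.
    Map<String, Boolean> deleted = new MapMaker().weakKeys().makeMap();

    // weakKeys() makes the map compare keys by identity, so two equal but
    // distinct String instances are treated as different keys.
    String a = new String("s3://bucket/warehouse/t/data/file-1.parquet");
    String b = new String("s3://bucket/warehouse/t/data/file-1.parquet");
    deleted.put(a, true);
    System.out.println(deleted.containsKey(b));          // false: different instance

    // Interning collapses equal strings to one canonical instance, so repeated
    // manifest entries for the same path hit the same key and are skipped.
    deleted.put(a.intern(), true);
    System.out.println(deleted.containsKey(b.intern())); // true: same canonical instance
  }
}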
@@ -33,6 +33,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.iceberg.BaseMetastoreCatalog;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
@@ -198,7 +199,7 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) {
if (purge && lastMetadata != null) {
// Since the data files and the metadata files may be stored in different locations,
// dropTableData must be called to force deletion of the data files.
dropTableData(ops.io(), lastMetadata);
CatalogUtil.dropTableData(ops.io(), lastMetadata);
}
fs.delete(tablePath, true /* recursive */);
return true;
47 changes: 47 additions & 0 deletions core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java
@@ -19,11 +19,14 @@

package org.apache.iceberg.hadoop;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Map;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.BaseTable;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.MetadataTableUtils;
import org.apache.iceberg.PartitionSpec;
@@ -144,6 +147,50 @@ public Table create(Schema schema, PartitionSpec spec, SortOrder order,
return new BaseTable(ops, location);
}

/**
* Drop a table and delete all data and metadata files.
*
* @param location a path URI (e.g. hdfs:///warehouse/my_table)
* @return true if the table was dropped, false if it did not exist
*/
public boolean dropTable(String location) {
[Review thread on this method]
Contributor: I think this wasn't implemented before because it is not part of the Tables API, but now that this is the only implementation, maybe we should consider just deprecating the Tables API and making HadoopTables a stand-alone class.
Contributor (Author): Maybe this would merit another discussion, and another PR.
Contributor: Agreed.

return dropTable(location, true);
}

/**
* Drop a table; optionally delete data and metadata files.
* <p>
* If purge is set to true the implementation should delete all data and metadata files.
*
* @param location a path URI (e.g. hdfs:///warehouse/my_table)
* @param purge if true, delete all data and metadata files in the table
* @return true if the table was dropped, false if it did not exist
*/
public boolean dropTable(String location, boolean purge) {
TableOperations ops = newTableOps(location);
TableMetadata lastMetadata = null;
if (ops.current() != null) {
if (purge) {
lastMetadata = ops.current();
}
} else {
return false;
}

try {
if (purge && lastMetadata != null) {
// Since the data files and the metadata files may be stored in different locations,
// dropTableData must be called to force deletion of the data files.
CatalogUtil.dropTableData(ops.io(), lastMetadata);
}
Path tablePath = new Path(location);
Util.getFs(tablePath, conf).delete(tablePath, true /* recursive */);
return true;
} catch (IOException e) {
throw new UncheckedIOException("Failed to delete file: " + location, e);
}
}

@VisibleForTesting
TableOperations newTableOps(String location) {
if (location.contains(METADATA_JSON)) {
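The tests below exercise the new entry points; for reference, a hedged usage sketch of HadoopTables.dropTable outside a test (the schema, warehouse path, and Configuration are assumptions; create, load, and the two dropTable overloads are the APIs shown in this diff):

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class DropTableExample {
  public static void main(String[] args) {
    HadoopTables tables = new HadoopTables(new Configuration());
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.IntegerType.get()));

    String location = "hdfs:///warehouse/my_table";  // assumed path URI
    tables.create(schema, location);

    // purge = true also deletes the data, manifest, and metadata files referenced
    // by the last table metadata before removing the table directory.
    boolean dropped = tables.dropTable(location, true);
    System.out.println("dropped: " + dropped);        // true

    // A second drop returns false because the table no longer exists.
    System.out.println("again: " + tables.dropTable(location));
  }
}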
@@ -20,10 +20,20 @@
package org.apache.iceberg.hadoop;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.AssertHelpers;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.Table;
import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
@@ -38,7 +48,7 @@
import static org.apache.iceberg.SortDirection.ASC;
import static org.apache.iceberg.types.Types.NestedField.required;

public class TestHadoopTablesSortOrder {
public class TestHadoopTables {

private static final HadoopTables TABLES = new HadoopTables();
private static final Schema SCHEMA = new Schema(
@@ -48,20 +58,62 @@ public class TestHadoopTablesSortOrder {

@Rule
public TemporaryFolder temp = new TemporaryFolder();
private String tableLocation = null;
private File tableDir = null;

@Before
public void setupTableLocation() throws Exception {
File tableDir = temp.newFolder();
this.tableLocation = tableDir.toURI().toString();
tableDir = temp.newFolder();
}

@Test
public void testDropTable() {
TABLES.create(SCHEMA, tableDir.toURI().toString());
TABLES.dropTable(tableDir.toURI().toString());
AssertHelpers.assertThrows(
"Should complain about missing table", NoSuchTableException.class,
"Table does not exist", () -> TABLES.load(tableDir.toURI().toString()));
}

@Test
public void testDropTableWithPurge() throws IOException {
File dataDir = temp.newFolder();

createDummyTable(tableDir, dataDir);

TABLES.dropTable(tableDir.toURI().toString(), true);
AssertHelpers.assertThrows(
"Should complain about missing table", NoSuchTableException.class,
"Table does not exist", () -> TABLES.load(tableDir.toURI().toString()));

Assert.assertEquals(0, dataDir.listFiles().length);
Assert.assertFalse(tableDir.exists());

Assert.assertFalse(TABLES.dropTable(tableDir.toURI().toString()));
}

@Test
public void testDropTableWithoutPurge() throws IOException {
File dataDir = temp.newFolder();

createDummyTable(tableDir, dataDir);

TABLES.dropTable(tableDir.toURI().toString(), false);
AssertHelpers.assertThrows(
"Should complain about missing table", NoSuchTableException.class,
"Table does not exist", () -> TABLES.load(tableDir.toURI().toString()));

Assert.assertEquals(1, dataDir.listFiles().length);
Assert.assertFalse(tableDir.exists());

Assert.assertFalse(TABLES.dropTable(tableDir.toURI().toString()));
}

@Test
public void testDefaultSortOrder() {
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
.bucket("data", 16)
.build();
Table table = TABLES.create(SCHEMA, spec, tableLocation);
Table table = TABLES.create(SCHEMA, spec, tableDir.toURI().toString());

SortOrder sortOrder = table.sortOrder();
Assert.assertEquals("Order ID must match", 0, sortOrder.orderId());
@@ -76,7 +128,7 @@ public void testCustomSortOrder() {
SortOrder order = SortOrder.builderFor(SCHEMA)
.asc("id", NULLS_FIRST)
.build();
Table table = TABLES.create(SCHEMA, spec, order, Maps.newHashMap(), tableLocation);
Table table = TABLES.create(SCHEMA, spec, order, Maps.newHashMap(), tableDir.toURI().toString());

SortOrder sortOrder = table.sortOrder();
Assert.assertEquals("Order ID must match", 1, sortOrder.orderId());
Expand All @@ -86,4 +138,22 @@ public void testCustomSortOrder() {
Transform<?, ?> transform = Transforms.identity(Types.IntegerType.get());
Assert.assertEquals("Transform must match", transform, sortOrder.fields().get(0).transform());
}

private static void createDummyTable(File tableDir, File dataDir) throws IOException {
Table table = TABLES.create(SCHEMA, tableDir.toURI().toString());
AppendFiles append = table.newAppend();
String data = dataDir.getPath() + "/data.parquet";
Files.write(Paths.get(data), new ArrayList<>(), StandardCharsets.UTF_8);
DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned())
.withPath(data)
.withFileSizeInBytes(10)
.withRecordCount(1)
.build();
append.appendFile(dataFile);
append.commit();

// Make sure that the data file and the manifest dir are created
Assert.assertEquals(1, dataDir.listFiles().length);
Assert.assertEquals(1, tableDir.listFiles().length);
}
}
@@ -36,6 +36,7 @@
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.api.UnknownDBException;
import org.apache.iceberg.BaseMetastoreCatalog;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.Namespace;
@@ -140,7 +141,7 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) {
});

if (purge && lastMetadata != null) {
dropTableData(ops.io(), lastMetadata);
CatalogUtil.dropTableData(ops.io(), lastMetadata);
}

LOG.info("Dropped table: {}", identifier);