@@ -286,7 +286,9 @@ private Transaction newReplaceTableTransaction(boolean orCreate) {
*
* @param io a FileIO to use for deletes
* @param metadata the last valid TableMetadata instance for a dropped table.
* @deprecated will be removed in 0.11.0; use CatalogUtil.dropTableData instead.
*/
@Deprecated
protected static void dropTableData(FileIO io, TableMetadata metadata) {
// Reads and deletes are done using Tasks.foreach(...).suppressFailureWhenFinished to complete
// as much of the delete work as possible and avoid orphaned data or manifest files.
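The deprecation note above points existing catalog implementations at the new static helper. A minimal migration sketch of the purge step inside a hypothetical catalog's dropTable (only CatalogUtil.dropTableData, FileIO, and TableMetadata come from this PR; the wrapper class and method names are illustrative):

import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.io.FileIO;

// Hypothetical helper showing the purge step of a catalog's dropTable(identifier, purge).
final class DropTablePurge {
  static void purgeIfRequested(boolean purge, FileIO io, TableMetadata lastMetadata) {
    if (purge && lastMetadata != null) {
      // Previously: the protected, now-deprecated dropTableData(io, lastMetadata)
      // inherited from the base metastore catalog class.
      CatalogUtil.dropTableData(io, lastMetadata);
    }
  }
}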
119 changes: 119 additions & 0 deletions core/src/main/java/org/apache/iceberg/CatalogUtil.java
@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg;

import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.MapMaker;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.util.Tasks;
import org.apache.iceberg.util.ThreadPools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CatalogUtil {
private static final Logger LOG = LoggerFactory.getLogger(CatalogUtil.class);

private CatalogUtil() {
}

/**
* Drops all data and metadata files referenced by TableMetadata.
* <p>
* This should be called by dropTable implementations to clean up table files once the table has been dropped in the
* metastore.
*
* @param io a FileIO to use for deletes
* @param metadata the last valid TableMetadata instance for a dropped table.
*/
public static void dropTableData(FileIO io, TableMetadata metadata) {
// Reads and deletes are done using Tasks.foreach(...).suppressFailureWhenFinished to complete
// as much of the delete work as possible and avoid orphaned data or manifest files.

Set<String> manifestListsToDelete = Sets.newHashSet();
Set<ManifestFile> manifestsToDelete = Sets.newHashSet();
for (Snapshot snapshot : metadata.snapshots()) {
// add all manifests to the delete set because both data and delete files should be removed
Iterables.addAll(manifestsToDelete, snapshot.allManifests());
// add the manifest list to the delete set, if present
if (snapshot.manifestListLocation() != null) {
manifestListsToDelete.add(snapshot.manifestListLocation());
}
}

LOG.info("Manifests to delete: {}", Joiner.on(", ").join(manifestsToDelete));

// run all of the deletes

deleteFiles(io, manifestsToDelete);

Tasks.foreach(Iterables.transform(manifestsToDelete, ManifestFile::path))
.noRetry().suppressFailureWhenFinished()
.onFailure((manifest, exc) -> LOG.warn("Delete failed for manifest: {}", manifest, exc))
.run(io::deleteFile);

Tasks.foreach(manifestListsToDelete)
.noRetry().suppressFailureWhenFinished()
.onFailure((list, exc) -> LOG.warn("Delete failed for manifest list: {}", list, exc))
.run(io::deleteFile);

Tasks.foreach(metadata.metadataFileLocation())
.noRetry().suppressFailureWhenFinished()
.onFailure((list, exc) -> LOG.warn("Delete failed for metadata file: {}", list, exc))
.run(io::deleteFile);
}

private static void deleteFiles(FileIO io, Set<ManifestFile> allManifests) {
// keep track of deleted files in a map that can be cleaned up when memory runs low
Map<String, Boolean> deletedFiles = new MapMaker()
.concurrencyLevel(ThreadPools.WORKER_THREAD_POOL_SIZE)
.weakKeys()
.makeMap();

Tasks.foreach(allManifests)
.noRetry().suppressFailureWhenFinished()
.executeWith(ThreadPools.getWorkerPool())
.onFailure((item, exc) -> LOG.warn("Failed to get deleted files: this may cause orphaned data files", exc))
.run(manifest -> {
try (ManifestReader<?> reader = ManifestFiles.open(manifest, io)) {
for (ManifestEntry<?> entry : reader.entries()) {
// intern the file path because the weak key map uses identity (==) instead of equals
String path = entry.file().path().toString().intern();
Boolean alreadyDeleted = deletedFiles.putIfAbsent(path, true);
if (alreadyDeleted == null || !alreadyDeleted) {
try {
io.deleteFile(path);
} catch (RuntimeException e) {
// this may happen if the map of deleted files gets cleaned up by gc
LOG.warn("Delete failed for data file: {}", path, e);
}
}
}
} catch (IOException e) {
throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest.path());
}
});
}
}
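One subtle point in deleteFiles above: the deleted-file map is built with MapMaker().weakKeys(), which switches key comparison to identity (==), so file paths must be interned before use. A small standalone sketch of that behavior, using plain Guava rather than Iceberg's relocated package (the demo class and paths are assumptions):

import java.util.Map;
import com.google.common.collect.MapMaker;

public class WeakKeyInternDemo {
  public static void main(String[] args) {
    // Same construction as CatalogUtil.deleteFiles, minus the concurrency level.
    Map<String, Boolean> deleted = new MapMaker().weakKeys().makeMap();

    // weakKeys() makes the map compare keys by identity, so two equal but
    // distinct String instances are treated as different keys.
    String a = new String("s3://bucket/warehouse/t/data/file-1.parquet");
    String b = new String("s3://bucket/warehouse/t/data/file-1.parquet");
    deleted.put(a, true);
    System.out.println(deleted.containsKey(b));          // false: different instance

    // Interning collapses equal strings to one canonical instance, so repeated
    // manifest entries for the same path hit the same key and are skipped.
    deleted.put(a.intern(), true);
    System.out.println(deleted.containsKey(b.intern())); // true: same canonical instance
  }
}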
@@ -33,6 +33,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.iceberg.BaseMetastoreCatalog;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
@@ -198,7 +199,7 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) {
if (purge && lastMetadata != null) {
// Since the data files and the metadata files may be stored in different locations,
// dropTableData must be called to force deletion of the data files.
dropTableData(ops.io(), lastMetadata);
CatalogUtil.dropTableData(ops.io(), lastMetadata);
}
fs.delete(tablePath, true /* recursive */);
return true;
47 changes: 47 additions & 0 deletions core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java
@@ -19,11 +19,14 @@

package org.apache.iceberg.hadoop;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Map;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.BaseTable;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.MetadataTableUtils;
import org.apache.iceberg.PartitionSpec;
@@ -144,6 +147,50 @@ public Table create(Schema schema, PartitionSpec spec, SortOrder order,
return new BaseTable(ops, location);
}

/**
* Drop a table and delete all data and metadata files.
*
* @param location a path URI (e.g. hdfs:///warehouse/my_table)
* @return true if the table was dropped, false if it did not exist
*/
public boolean dropTable(String location) {
[Review thread on this method]
Contributor: I think this wasn't implemented before because it is not part of the Tables API, but now that this is the only implementation, maybe we should consider just deprecating the Tables API and making HadoopTables a stand-alone class.
Contributor (Author): Maybe this would merit another discussion, and another PR.
Contributor: Agreed.

return dropTable(location, true);
}

/**
* Drop a table; optionally delete data and metadata files.
* <p>
* If purge is set to true the implementation should delete all data and metadata files.
*
* @param location a path URI (e.g. hdfs:///warehouse/my_table)
* @param purge if true, delete all data and metadata files in the table
* @return true if the table was dropped, false if it did not exist
*/
public boolean dropTable(String location, boolean purge) {
TableOperations ops = newTableOps(location);
TableMetadata lastMetadata = null;
if (ops.current() != null) {
if (purge) {
lastMetadata = ops.current();
}
} else {
return false;
}

try {
if (purge && lastMetadata != null) {
// Since the data files and the metadata files may be stored in different locations,
// dropTableData must be called to force deletion of the data files.
CatalogUtil.dropTableData(ops.io(), lastMetadata);
}
Path tablePath = new Path(location);
Util.getFs(tablePath, conf).delete(tablePath, true /* recursive */);
return true;
} catch (IOException e) {
throw new UncheckedIOException("Failed to delete file: " + location, e);
}
}

@VisibleForTesting
TableOperations newTableOps(String location) {
if (location.contains(METADATA_JSON)) {
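The tests below exercise the new entry points; for reference, a hedged usage sketch of HadoopTables.dropTable outside a test (the schema, warehouse path, and Configuration are assumptions; create, load, and the two dropTable overloads are the APIs shown in this diff):

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class DropTableExample {
  public static void main(String[] args) {
    HadoopTables tables = new HadoopTables(new Configuration());
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.IntegerType.get()));

    String location = "hdfs:///warehouse/my_table";  // assumed path URI
    tables.create(schema, location);

    // purge = true also deletes the data, manifest, and metadata files referenced
    // by the last table metadata before removing the table directory.
    boolean dropped = tables.dropTable(location, true);
    System.out.println("dropped: " + dropped);        // true

    // A second drop returns false because the table no longer exists.
    System.out.println("again: " + tables.dropTable(location));
  }
}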
@@ -20,10 +20,20 @@
package org.apache.iceberg.hadoop;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.AssertHelpers;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.Table;
import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
@@ -38,7 +48,7 @@
import static org.apache.iceberg.SortDirection.ASC;
import static org.apache.iceberg.types.Types.NestedField.required;

public class TestHadoopTablesSortOrder {
public class TestHadoopTables {

private static final HadoopTables TABLES = new HadoopTables();
private static final Schema SCHEMA = new Schema(
@@ -48,20 +58,62 @@ public class TestHadoopTablesSortOrder {

@Rule
public TemporaryFolder temp = new TemporaryFolder();
private String tableLocation = null;
private File tableDir = null;

@Before
public void setupTableLocation() throws Exception {
File tableDir = temp.newFolder();
this.tableLocation = tableDir.toURI().toString();
tableDir = temp.newFolder();
}

@Test
public void testDropTable() {
TABLES.create(SCHEMA, tableDir.toURI().toString());
TABLES.dropTable(tableDir.toURI().toString());
AssertHelpers.assertThrows(
"Should complain about missing table", NoSuchTableException.class,
"Table does not exist", () -> TABLES.load(tableDir.toURI().toString()));
}

@Test
public void testDropTableWithPurge() throws IOException {
File dataDir = temp.newFolder();

createDummyTable(tableDir, dataDir);

TABLES.dropTable(tableDir.toURI().toString(), true);
AssertHelpers.assertThrows(
"Should complain about missing table", NoSuchTableException.class,
"Table does not exist", () -> TABLES.load(tableDir.toURI().toString()));

Assert.assertEquals(0, dataDir.listFiles().length);
Assert.assertFalse(tableDir.exists());

Assert.assertFalse(TABLES.dropTable(tableDir.toURI().toString()));
}

@Test
public void testDropTableWithoutPurge() throws IOException {
File dataDir = temp.newFolder();

createDummyTable(tableDir, dataDir);

TABLES.dropTable(tableDir.toURI().toString(), false);
AssertHelpers.assertThrows(
"Should complain about missing table", NoSuchTableException.class,
"Table does not exist", () -> TABLES.load(tableDir.toURI().toString()));

Assert.assertEquals(1, dataDir.listFiles().length);
Assert.assertFalse(tableDir.exists());

Assert.assertFalse(TABLES.dropTable(tableDir.toURI().toString()));
}

@Test
public void testDefaultSortOrder() {
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
.bucket("data", 16)
.build();
Table table = TABLES.create(SCHEMA, spec, tableLocation);
Table table = TABLES.create(SCHEMA, spec, tableDir.toURI().toString());

SortOrder sortOrder = table.sortOrder();
Assert.assertEquals("Order ID must match", 0, sortOrder.orderId());
@@ -76,7 +128,7 @@ public void testCustomSortOrder() {
SortOrder order = SortOrder.builderFor(SCHEMA)
.asc("id", NULLS_FIRST)
.build();
Table table = TABLES.create(SCHEMA, spec, order, Maps.newHashMap(), tableLocation);
Table table = TABLES.create(SCHEMA, spec, order, Maps.newHashMap(), tableDir.toURI().toString());

SortOrder sortOrder = table.sortOrder();
Assert.assertEquals("Order ID must match", 1, sortOrder.orderId());
Expand All @@ -86,4 +138,22 @@ public void testCustomSortOrder() {
Transform<?, ?> transform = Transforms.identity(Types.IntegerType.get());
Assert.assertEquals("Transform must match", transform, sortOrder.fields().get(0).transform());
}

private static void createDummyTable(File tableDir, File dataDir) throws IOException {
Table table = TABLES.create(SCHEMA, tableDir.toURI().toString());
AppendFiles append = table.newAppend();
String data = dataDir.getPath() + "/data.parquet";
Files.write(Paths.get(data), new ArrayList<>(), StandardCharsets.UTF_8);
DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned())
.withPath(data)
.withFileSizeInBytes(10)
.withRecordCount(1)
.build();
append.appendFile(dataFile);
append.commit();

// Make sure that the data file and the manifest dir are created
Assert.assertEquals(1, dataDir.listFiles().length);
Assert.assertEquals(1, tableDir.listFiles().length);
}
}
@@ -36,6 +36,7 @@
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.api.UnknownDBException;
import org.apache.iceberg.BaseMetastoreCatalog;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.Namespace;
@@ -140,7 +141,7 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) {
});

if (purge && lastMetadata != null) {
dropTableData(ops.io(), lastMetadata);
CatalogUtil.dropTableData(ops.io(), lastMetadata);
}

LOG.info("Dropped table: {}", identifier);