diff --git a/checkstyle/checkstyle.xml b/checkstyle/checkstyle.xml index 13cfdb82bd0a8..5f8554e45560b 100644 --- a/checkstyle/checkstyle.xml +++ b/checkstyle/checkstyle.xml @@ -107,7 +107,7 @@ - + @@ -124,7 +124,7 @@ - + diff --git a/clients/src/main/java/org/apache/kafka/common/record/AbstractLegacyRecordBatch.java b/clients/src/main/java/org/apache/kafka/common/record/AbstractLegacyRecordBatch.java index 83637640af49d..ea6c2b23bbad1 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/AbstractLegacyRecordBatch.java +++ b/clients/src/main/java/org/apache/kafka/common/record/AbstractLegacyRecordBatch.java @@ -217,6 +217,16 @@ public boolean isControlBatch() { return false; } + @Override + public long deleteHorizonMs() { + return RecordBatch.NO_TIMESTAMP; + } + + @Override + public boolean hasDeleteHorizonMs() { + return false; + } + /** * Get an iterator for the nested entries contained within this batch. Note that * if the batch is not compressed, then this method will return an iterator over the @@ -468,6 +478,16 @@ public long offset() { return buffer.getLong(OFFSET_OFFSET); } + @Override + public long deleteHorizonMs() { + return RecordBatch.NO_TIMESTAMP; + } + + @Override + public boolean hasDeleteHorizonMs() { + return false; + } + @Override public LegacyRecord outerRecord() { return record; @@ -557,6 +577,16 @@ public long baseOffset() { return loadFullBatch().baseOffset(); } + @Override + public long deleteHorizonMs() { + return RecordBatch.NO_TIMESTAMP; + } + + @Override + public boolean hasDeleteHorizonMs() { + return false; + } + @Override public long lastOffset() { return offset; diff --git a/clients/src/main/java/org/apache/kafka/common/record/DefaultRecordBatch.java b/clients/src/main/java/org/apache/kafka/common/record/DefaultRecordBatch.java index d4a9587ffa56e..8fbdcc1ef4cce 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/DefaultRecordBatch.java +++ 
b/clients/src/main/java/org/apache/kafka/common/record/DefaultRecordBatch.java @@ -89,11 +89,15 @@ * by the broker and is preserved after compaction. Additionally, the MaxTimestamp of an empty batch always retains * the previous value prior to becoming empty. * + * The delete horizon flag for the sixth bit is used to determine if the first timestamp of the batch had been set to + * the time for which tombstones / transaction markers needs to be removed. If it is true, then the first timestamp is + * the delete horizon, otherwise, it is merely the first timestamp of the record batch. + * * The current attributes are given below: * - * ------------------------------------------------------------------------------------------------- - * | Unused (6-15) | Control (5) | Transactional (4) | Timestamp Type (3) | Compression Type (0-2) | - * ------------------------------------------------------------------------------------------------- + * --------------------------------------------------------------------------------------------------------------------------- + * | Unused (7-15) | Delete Horizon Flag (6) | Control (5) | Transactional (4) | Timestamp Type (3) | Compression Type (0-2) | + * --------------------------------------------------------------------------------------------------------------------------- */ public class DefaultRecordBatch extends AbstractRecordBatch implements MutableRecordBatch { static final int BASE_OFFSET_OFFSET = 0; @@ -128,6 +132,7 @@ public class DefaultRecordBatch extends AbstractRecordBatch implements MutableRe private static final byte COMPRESSION_CODEC_MASK = 0x07; private static final byte TRANSACTIONAL_FLAG_MASK = 0x10; private static final int CONTROL_FLAG_MASK = 0x20; + private static final byte DELETE_HORIZON_FLAG_MASK = 0x40; private static final byte TIMESTAMP_TYPE_MASK = 0x08; private static final int MAX_SKIP_BUFFER_SIZE = 2048; @@ -155,13 +160,27 @@ public void ensureValid() { } /** - * Get the timestamp of the first record 
in this batch. It is always the create time of the record even if the + * Gets the base timestamp of the batch which is used to calculate the timestamp deltas. + * + * @return The base timestamp or + * {@link RecordBatch#NO_TIMESTAMP} if the batch is empty + */ + public long baseTimestamp() { + return buffer.getLong(FIRST_TIMESTAMP_OFFSET); + } + + /** + * Get the timestamp of the first record in this batch. It is usually the create time of the record even if the * timestamp type of the batch is log append time. - * - * @return The first timestamp or {@link RecordBatch#NO_TIMESTAMP} if the batch is empty + * + * @return The first timestamp if a record has been appended, unless the delete horizon has been set + * {@link RecordBatch#NO_TIMESTAMP} if the batch is empty or if the delete horizon is set */ public long firstTimestamp() { - return buffer.getLong(FIRST_TIMESTAMP_OFFSET); + final long baseTimestamp = baseTimestamp(); + if (hasDeleteHorizonMs()) + return RecordBatch.NO_TIMESTAMP; + return baseTimestamp; } @Override @@ -245,6 +264,19 @@ public boolean isTransactional() { return (attributes() & TRANSACTIONAL_FLAG_MASK) > 0; } + @Override + public boolean hasDeleteHorizonMs() { + return (attributes() & DELETE_HORIZON_FLAG_MASK) > 0; + } + + @Override + public long deleteHorizonMs() { + final long baseTimestamp = baseTimestamp(); + if (hasDeleteHorizonMs()) + return baseTimestamp; + return RecordBatch.NO_TIMESTAMP; + } + @Override public boolean isControlBatch() { return (attributes() & CONTROL_FLAG_MASK) > 0; @@ -360,7 +392,7 @@ public void setMaxTimestamp(TimestampType timestampType, long maxTimestamp) { if (timestampType() == timestampType && currentMaxTimestamp == maxTimestamp) return; - byte attributes = computeAttributes(compressionType(), timestampType, isTransactional(), isControlBatch()); + byte attributes = computeAttributes(compressionType(), timestampType, isTransactional(), isControlBatch(), hasDeleteHorizonMs()); buffer.putShort(ATTRIBUTES_OFFSET, 
attributes); buffer.putLong(MAX_TIMESTAMP_OFFSET, maxTimestamp); long crc = computeChecksum(); @@ -407,7 +439,7 @@ public int hashCode() { } private static byte computeAttributes(CompressionType type, TimestampType timestampType, - boolean isTransactional, boolean isControl) { + boolean isTransactional, boolean isControl, boolean isDeleteHorizonSet) { if (timestampType == TimestampType.NO_TIMESTAMP_TYPE) throw new IllegalArgumentException("Timestamp type must be provided to compute attributes for message " + "format v2 and above"); @@ -419,6 +451,8 @@ private static byte computeAttributes(CompressionType type, TimestampType timest attributes |= COMPRESSION_CODEC_MASK & type.id; if (timestampType == TimestampType.LOG_APPEND_TIME) attributes |= TIMESTAMP_TYPE_MASK; + if (isDeleteHorizonSet) + attributes |= DELETE_HORIZON_FLAG_MASK; return attributes; } @@ -436,8 +470,8 @@ public static void writeEmptyHeader(ByteBuffer buffer, boolean isControlRecord) { int offsetDelta = (int) (lastOffset - baseOffset); writeHeader(buffer, baseOffset, offsetDelta, DefaultRecordBatch.RECORD_BATCH_OVERHEAD, magic, - CompressionType.NONE, timestampType, RecordBatch.NO_TIMESTAMP, timestamp, producerId, - producerEpoch, baseSequence, isTransactional, isControlRecord, partitionLeaderEpoch, 0); + CompressionType.NONE, timestampType, RecordBatch.NO_TIMESTAMP, timestamp, producerId, + producerEpoch, baseSequence, isTransactional, isControlRecord, false, partitionLeaderEpoch, 0); } static void writeHeader(ByteBuffer buffer, @@ -454,6 +488,7 @@ static void writeHeader(ByteBuffer buffer, int sequence, boolean isTransactional, boolean isControlBatch, + boolean isDeleteHorizonSet, int partitionLeaderEpoch, int numRecords) { if (magic < RecordBatch.CURRENT_MAGIC_VALUE) @@ -461,7 +496,7 @@ static void writeHeader(ByteBuffer buffer, if (firstTimestamp < 0 && firstTimestamp != NO_TIMESTAMP) throw new IllegalArgumentException("Invalid message timestamp " + firstTimestamp); - short attributes = 
computeAttributes(compressionType, timestampType, isTransactional, isControlBatch); + short attributes = computeAttributes(compressionType, timestampType, isTransactional, isControlBatch, isDeleteHorizonSet); int position = buffer.position(); buffer.putLong(position + BASE_OFFSET_OFFSET, baseOffset); @@ -699,6 +734,18 @@ public boolean isTransactional() { return loadBatchHeader().isTransactional(); } + @Override + public boolean hasDeleteHorizonMs() { + return loadBatchHeader().hasDeleteHorizonMs(); + } + + @Override + public long deleteHorizonMs() { + if (hasDeleteHorizonMs()) + return super.loadBatchHeader().deleteHorizonMs(); + return RecordBatch.NO_TIMESTAMP; + } + @Override public boolean isControlBatch() { return loadBatchHeader().isControlBatch(); diff --git a/clients/src/main/java/org/apache/kafka/common/record/MemoryRecords.java b/clients/src/main/java/org/apache/kafka/common/record/MemoryRecords.java index 8f73565d1b40f..53d38318c431a 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/MemoryRecords.java +++ b/clients/src/main/java/org/apache/kafka/common/record/MemoryRecords.java @@ -19,6 +19,7 @@ import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.errors.CorruptRecordException; import org.apache.kafka.common.record.MemoryRecords.RecordFilter.BatchRetention; +import org.apache.kafka.common.record.MemoryRecords.RecordFilter.BatchRetentionResult; import org.apache.kafka.common.utils.AbstractIterator; import org.apache.kafka.common.utils.ByteBufferOutputStream; import org.apache.kafka.common.utils.CloseableIterator; @@ -150,15 +151,22 @@ public FilterResult filterTo(TopicPartition partition, RecordFilter filter, Byte return filterTo(partition, batches(), filter, destinationBuffer, maxRecordBatchSize, decompressionBufferSupplier); } + /** + * Note: This method is also used to convert the first timestamp of the batch (which is usually the timestamp of the first record) + * to the delete horizon of the tombstones or txn 
markers which are present in the batch. + */ private static FilterResult filterTo(TopicPartition partition, Iterable batches, RecordFilter filter, ByteBuffer destinationBuffer, int maxRecordBatchSize, BufferSupplier decompressionBufferSupplier) { FilterResult filterResult = new FilterResult(destinationBuffer); ByteBufferOutputStream bufferOutputStream = new ByteBufferOutputStream(destinationBuffer); - for (MutableRecordBatch batch : batches) { long maxOffset = -1L; - BatchRetention batchRetention = filter.checkBatchRetention(batch); + + final BatchRetentionResult batchRetentionResult = filter.checkBatchRetention(batch); + final boolean containsMarkerForEmptyTxn = batchRetentionResult.containsMarkerForEmptyTxn; + final BatchRetention batchRetention = batchRetentionResult.batchRetention; + filterResult.bytesRead += batch.sizeInBytes(); if (batchRetention == BatchRetention.DELETE) @@ -168,38 +176,40 @@ private static FilterResult filterTo(TopicPartition partition, Iterable retainedRecords = new ArrayList<>(); - try (final CloseableIterator iterator = batch.streamingIterator(decompressionBufferSupplier)) { - while (iterator.hasNext()) { - Record record = iterator.next(); - filterResult.messagesRead += 1; - - if (filter.shouldRetainRecord(batch, record)) { - // Check for log corruption due to KAFKA-4298. If we find it, make sure that we overwrite - // the corrupted batch with correct data. 
- if (!record.hasMagic(batchMagic)) - writeOriginalBatch = false; - - if (record.offset() > maxOffset) - maxOffset = record.offset(); - - retainedRecords.add(record); - } else { - writeOriginalBatch = false; - } - } - } + final BatchFilterResult iterationResult = filterBatch(batch, decompressionBufferSupplier, filterResult, filter, + batchMagic, true, maxOffset, retainedRecords); + boolean containsTombstones = iterationResult.containsTombstones(); + boolean writeOriginalBatch = iterationResult.shouldWriteOriginalBatch(); + maxOffset = iterationResult.maxOffset(); if (!retainedRecords.isEmpty()) { - if (writeOriginalBatch) { + // we check if the delete horizon should be set to a new value + // in which case, we need to reset the base timestamp and overwrite the timestamp deltas + // if the batch does not contain tombstones, then we don't need to overwrite batch + boolean needToSetDeleteHorizon = batch.magic() >= 2 && (containsTombstones || containsMarkerForEmptyTxn) + && !batch.hasDeleteHorizonMs(); + if (writeOriginalBatch && !needToSetDeleteHorizon) { + if (batch.deleteHorizonMs() > filterResult.latestDeleteHorizon()) + filterResult.updateLatestDeleteHorizon(batch.deleteHorizonMs()); batch.writeTo(bufferOutputStream); filterResult.updateRetainedBatchMetadata(batch, retainedRecords.size(), false); } else { - MemoryRecordsBuilder builder = buildRetainedRecordsInto(batch, retainedRecords, bufferOutputStream); + final MemoryRecordsBuilder builder; + if (needToSetDeleteHorizon) { + long deleteHorizonMs = filter.currentTime + filter.deleteRetentionMs; + builder = buildRetainedRecordsInto(batch, retainedRecords, bufferOutputStream, deleteHorizonMs); + if (deleteHorizonMs > filterResult.latestDeleteHorizon()) { + filterResult.updateLatestDeleteHorizon(deleteHorizonMs); + } + } else { + builder = buildRetainedRecordsInto(batch, retainedRecords, bufferOutputStream, batch.deleteHorizonMs()); + if (batch.deleteHorizonMs() > filterResult.latestDeleteHorizon()) + 
filterResult.updateLatestDeleteHorizon(batch.deleteHorizonMs()); + } + MemoryRecords records = builder.build(); int filteredBatchSize = records.sizeInBytes(); if (filteredBatchSize > batch.sizeInBytes() && filteredBatchSize > maxRecordBatchSize) @@ -236,9 +246,68 @@ private static FilterResult filterTo(TopicPartition partition, Iterable retainedRecords) { + boolean containsTombstones = false; + try (final CloseableIterator iterator = batch.streamingIterator(decompressionBufferSupplier)) { + while (iterator.hasNext()) { + Record record = iterator.next(); + filterResult.messagesRead += 1; + + if (filter.shouldRetainRecord(batch, record)) { + // Check for log corruption due to KAFKA-4298. If we find it, make sure that we overwrite + // the corrupted batch with correct data. + if (!record.hasMagic(batchMagic)) + writeOriginalBatch = false; + + if (record.offset() > maxOffset) + maxOffset = record.offset(); + + retainedRecords.add(record); + + if (!record.hasValue()) { + containsTombstones = true; + } + } else { + writeOriginalBatch = false; + } + } + return new BatchFilterResult(writeOriginalBatch, containsTombstones, maxOffset); + } + } + + private static class BatchFilterResult { + private final boolean writeOriginalBatch; + private final boolean containsTombstones; + private final long maxOffset; + public BatchFilterResult(final boolean writeOriginalBatch, + final boolean containsTombstones, + final long maxOffset) { + this.writeOriginalBatch = writeOriginalBatch; + this.containsTombstones = containsTombstones; + this.maxOffset = maxOffset; + } + public boolean shouldWriteOriginalBatch() { + return this.writeOriginalBatch; + } + public boolean containsTombstones() { + return this.containsTombstones; + } + public long maxOffset() { + return this.maxOffset; + } + } + private static MemoryRecordsBuilder buildRetainedRecordsInto(RecordBatch originalBatch, List retainedRecords, - ByteBufferOutputStream bufferOutputStream) { + ByteBufferOutputStream bufferOutputStream, + 
final long deleteHorizonMs) { byte magic = originalBatch.magic(); TimestampType timestampType = originalBatch.timestampType(); long logAppendTime = timestampType == TimestampType.LOG_APPEND_TIME ? @@ -249,7 +318,7 @@ private static MemoryRecordsBuilder buildRetainedRecordsInto(RecordBatch origina MemoryRecordsBuilder builder = new MemoryRecordsBuilder(bufferOutputStream, magic, originalBatch.compressionType(), timestampType, baseOffset, logAppendTime, originalBatch.producerId(), originalBatch.producerEpoch(), originalBatch.baseSequence(), originalBatch.isTransactional(), - originalBatch.isControlBatch(), originalBatch.partitionLeaderEpoch(), bufferOutputStream.limit()); + originalBatch.isControlBatch(), originalBatch.partitionLeaderEpoch(), bufferOutputStream.limit(), deleteHorizonMs); for (Record record : retainedRecords) builder.append(record); @@ -300,6 +369,24 @@ public int hashCode() { } public static abstract class RecordFilter { + public final long currentTime; + public final long deleteRetentionMs; + + public RecordFilter(final long currentTime, final long deleteRetentionMs) { + this.currentTime = currentTime; + this.deleteRetentionMs = deleteRetentionMs; + } + + public static class BatchRetentionResult { + public final BatchRetention batchRetention; + public final boolean containsMarkerForEmptyTxn; + public BatchRetentionResult(final BatchRetention batchRetention, + final boolean containsMarkerForEmptyTxn) { + this.batchRetention = batchRetention; + this.containsMarkerForEmptyTxn = containsMarkerForEmptyTxn; + } + } + public enum BatchRetention { DELETE, // Delete the batch without inspecting records RETAIN_EMPTY, // Retain the batch even if it is empty @@ -310,7 +397,7 @@ public enum BatchRetention { * Check whether the full batch can be discarded (i.e. whether we even need to * check the records individually). 
*/ - protected abstract BatchRetention checkBatchRetention(RecordBatch batch); + protected abstract BatchRetentionResult checkBatchRetention(RecordBatch batch); /** * Check whether a record should be retained in the log. Note that {@link #checkBatchRetention(RecordBatch)} @@ -331,11 +418,20 @@ public static class FilterResult { private long maxOffset = -1L; private long maxTimestamp = RecordBatch.NO_TIMESTAMP; private long shallowOffsetOfMaxTimestamp = -1L; + private long latestDeleteHorizonMs = RecordBatch.NO_TIMESTAMP; private FilterResult(ByteBuffer outputBuffer) { this.outputBuffer = outputBuffer; } + public void updateLatestDeleteHorizon(long deleteHorizon) { + this.latestDeleteHorizonMs = deleteHorizon; + } + + public long latestDeleteHorizon() { + return latestDeleteHorizonMs; + } + private void updateRetainedBatchMetadata(MutableRecordBatch retainedBatch, int numMessagesInBatch, boolean headerOnly) { int bytesRetained = headerOnly ? DefaultRecordBatch.RECORD_BATCH_OVERHEAD : retainedBatch.sizeInBytes(); updateRetainedBatchMetadata(retainedBatch.maxTimestamp(), retainedBatch.lastOffset(), diff --git a/clients/src/main/java/org/apache/kafka/common/record/MemoryRecordsBuilder.java b/clients/src/main/java/org/apache/kafka/common/record/MemoryRecordsBuilder.java index 054fb86199884..66dc654f0426d 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/MemoryRecordsBuilder.java +++ b/clients/src/main/java/org/apache/kafka/common/record/MemoryRecordsBuilder.java @@ -75,6 +75,7 @@ public void write(int b) { private int numRecords = 0; private float actualCompressionRatio = 1; private long maxTimestamp = RecordBatch.NO_TIMESTAMP; + private long deleteHorizonMs; private long offsetOfMaxTimestamp = -1; private Long lastOffset = null; private Long firstTimestamp = null; @@ -94,7 +95,8 @@ public MemoryRecordsBuilder(ByteBufferOutputStream bufferStream, boolean isTransactional, boolean isControlBatch, int partitionLeaderEpoch, - int writeLimit) { + int 
writeLimit, + long deleteHorizonMs) { if (magic > RecordBatch.MAGIC_VALUE_V0 && timestampType == TimestampType.NO_TIMESTAMP_TYPE) throw new IllegalArgumentException("TimestampType must be set for magic >= 0"); if (magic < RecordBatch.MAGIC_VALUE_V2) { @@ -120,6 +122,7 @@ public MemoryRecordsBuilder(ByteBufferOutputStream bufferStream, this.baseSequence = baseSequence; this.isTransactional = isTransactional; this.isControlBatch = isControlBatch; + this.deleteHorizonMs = deleteHorizonMs; this.partitionLeaderEpoch = partitionLeaderEpoch; this.writeLimit = writeLimit; this.initialPosition = bufferStream.position(); @@ -128,6 +131,28 @@ public MemoryRecordsBuilder(ByteBufferOutputStream bufferStream, bufferStream.position(initialPosition + batchHeaderSizeInBytes); this.bufferStream = bufferStream; this.appendStream = new DataOutputStream(compressionType.wrapForOutput(this.bufferStream, magic)); + + if (hasDeleteHorizonMs()) { + this.firstTimestamp = deleteHorizonMs; + } + } + + public MemoryRecordsBuilder(ByteBufferOutputStream bufferStream, + byte magic, + CompressionType compressionType, + TimestampType timestampType, + long baseOffset, + long logAppendTime, + long producerId, + short producerEpoch, + int baseSequence, + boolean isTransactional, + boolean isControlBatch, + int partitionLeaderEpoch, + int writeLimit) { + this(bufferStream, magic, compressionType, timestampType, baseOffset, logAppendTime, producerId, + producerEpoch, baseSequence, isTransactional, isControlBatch, partitionLeaderEpoch, writeLimit, + RecordBatch.NO_TIMESTAMP); } /** @@ -192,6 +217,10 @@ public boolean isTransactional() { return isTransactional; } + public boolean hasDeleteHorizonMs() { + return magic >= RecordBatch.MAGIC_VALUE_V2 && deleteHorizonMs >= 0L; + } + /** * Close this builder and return the resulting buffer. 
* @return The built log buffer @@ -365,7 +394,7 @@ private int writeDefaultBatchHeader() { DefaultRecordBatch.writeHeader(buffer, baseOffset, offsetDelta, size, magic, compressionType, timestampType, firstTimestamp, maxTimestamp, producerId, producerEpoch, baseSequence, isTransactional, isControlBatch, - partitionLeaderEpoch, numRecords); + hasDeleteHorizonMs(), partitionLeaderEpoch, numRecords); buffer.position(pos); return writtenCompressed; diff --git a/clients/src/main/java/org/apache/kafka/common/record/RecordBatch.java b/clients/src/main/java/org/apache/kafka/common/record/RecordBatch.java index 65a6a95fbe41f..af65ebaf9ad18 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/RecordBatch.java +++ b/clients/src/main/java/org/apache/kafka/common/record/RecordBatch.java @@ -210,6 +210,18 @@ public interface RecordBatch extends Iterable { */ boolean isTransactional(); + /** + * Whether or not the base timestamp has been set to the delete horizon + * @return true if it is, false otherwise + */ + boolean hasDeleteHorizonMs(); + + /** + * Get the delete horizon, returns -1L if the first timestamp is not the delete horizon + * @return timestamp of the delete horizon + */ + long deleteHorizonMs(); + /** * Get the partition leader epoch of this record batch. 
* @return The leader epoch or -1 if it is unknown diff --git a/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java b/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java index 041b8191f3f78..74409802118f3 100644 --- a/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java +++ b/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java @@ -2895,10 +2895,10 @@ public void testUpdatePositionWithLastRecordMissingFromBatch() { new SimpleRecord(null, "value".getBytes())); // Remove the last record to simulate compaction - MemoryRecords.FilterResult result = records.filterTo(tp0, new MemoryRecords.RecordFilter() { + MemoryRecords.FilterResult result = records.filterTo(tp0, new MemoryRecords.RecordFilter(0, 0) { @Override - protected BatchRetention checkBatchRetention(RecordBatch batch) { - return BatchRetention.DELETE_EMPTY; + protected BatchRetentionResult checkBatchRetention(RecordBatch batch) { + return new BatchRetentionResult(BatchRetention.DELETE_EMPTY, false); } @Override diff --git a/clients/src/test/java/org/apache/kafka/common/record/MemoryRecordsTest.java b/clients/src/test/java/org/apache/kafka/common/record/MemoryRecordsTest.java index b8824d3a8276c..db6cb67e1d777 100644 --- a/clients/src/test/java/org/apache/kafka/common/record/MemoryRecordsTest.java +++ b/clients/src/test/java/org/apache/kafka/common/record/MemoryRecordsTest.java @@ -19,6 +19,7 @@ import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.errors.CorruptRecordException; import org.apache.kafka.common.header.internals.RecordHeaders; +import org.apache.kafka.common.record.MemoryRecords.RecordFilter; import org.apache.kafka.common.record.MemoryRecords.RecordFilter.BatchRetention; import org.apache.kafka.common.utils.Utils; import org.apache.kafka.test.TestUtils; @@ -247,6 +248,41 @@ public void testFilterToPreservesPartitionLeaderEpoch() { } } + /** + * This test 
is used to see if the first timestamp of the batch has been successfully + * converted to a delete horizon for the tombstones / transaction markers of the batch. + */ + @Test + public void testFirstTimestampToDeleteHorizonConversion() { + if (magic >= RecordBatch.MAGIC_VALUE_V2) { + ByteBuffer buffer = ByteBuffer.allocate(2048); + MemoryRecordsBuilder builder = MemoryRecords.builder(buffer, magic, compression, TimestampType.CREATE_TIME, + 0L, RecordBatch.NO_TIMESTAMP, partitionLeaderEpoch); + builder.append(10L, "1".getBytes(), null); + + ByteBuffer filtered = ByteBuffer.allocate(2048); + final long deleteHorizon = Integer.MAX_VALUE / 2; + final RecordFilter recordFilter = new MemoryRecords.RecordFilter(deleteHorizon - 1, 1) { + @Override + protected boolean shouldRetainRecord(RecordBatch recordBatch, Record record) { + return true; + } + + @Override + protected BatchRetentionResult checkBatchRetention(RecordBatch batch) { + return new BatchRetentionResult(BatchRetention.RETAIN_EMPTY, true); + } + }; + builder.build().filterTo(new TopicPartition("random", 0), recordFilter, filtered, Integer.MAX_VALUE, BufferSupplier.NO_CACHING); + filtered.flip(); + MemoryRecords filteredRecords = MemoryRecords.readableRecords(filtered); + + List batches = TestUtils.toList(filteredRecords.batches()); + assertEquals(1, batches.size()); + assertEquals(deleteHorizon, batches.get(0).deleteHorizonMs()); + } + } + @Test public void testFilterToEmptyBatchRetention() { if (magic >= RecordBatch.MAGIC_VALUE_V2) { @@ -269,11 +305,11 @@ public void testFilterToEmptyBatchRetention() { ByteBuffer filtered = ByteBuffer.allocate(2048); MemoryRecords.FilterResult filterResult = records.filterTo(new TopicPartition("foo", 0), - new MemoryRecords.RecordFilter() { + new MemoryRecords.RecordFilter(0, 0) { @Override - protected BatchRetention checkBatchRetention(RecordBatch batch) { + protected BatchRetentionResult checkBatchRetention(RecordBatch batch) { // retain all batches - return 
BatchRetention.RETAIN_EMPTY; + return new BatchRetentionResult(BatchRetention.RETAIN_EMPTY, false); } @Override @@ -331,11 +367,11 @@ public void testEmptyBatchRetention() { ByteBuffer filtered = ByteBuffer.allocate(2048); MemoryRecords records = MemoryRecords.readableRecords(buffer); MemoryRecords.FilterResult filterResult = records.filterTo(new TopicPartition("foo", 0), - new MemoryRecords.RecordFilter() { + new MemoryRecords.RecordFilter(0, 0) { @Override - protected BatchRetention checkBatchRetention(RecordBatch batch) { + protected BatchRetentionResult checkBatchRetention(RecordBatch batch) { // retain all batches - return BatchRetention.RETAIN_EMPTY; + return new BatchRetentionResult(BatchRetention.RETAIN_EMPTY, false); } @Override @@ -381,10 +417,10 @@ public void testEmptyBatchDeletion() { ByteBuffer filtered = ByteBuffer.allocate(2048); MemoryRecords records = MemoryRecords.readableRecords(buffer); MemoryRecords.FilterResult filterResult = records.filterTo(new TopicPartition("foo", 0), - new MemoryRecords.RecordFilter() { + new MemoryRecords.RecordFilter(0, 0) { @Override - protected BatchRetention checkBatchRetention(RecordBatch batch) { - return deleteRetention; + protected BatchRetentionResult checkBatchRetention(RecordBatch batch) { + return new BatchRetentionResult(deleteRetention, false); } @Override @@ -469,13 +505,13 @@ public void testFilterToBatchDiscard() { buffer.flip(); ByteBuffer filtered = ByteBuffer.allocate(2048); - MemoryRecords.readableRecords(buffer).filterTo(new TopicPartition("foo", 0), new MemoryRecords.RecordFilter() { + MemoryRecords.readableRecords(buffer).filterTo(new TopicPartition("foo", 0), new MemoryRecords.RecordFilter(0, 0) { @Override - protected BatchRetention checkBatchRetention(RecordBatch batch) { + protected BatchRetentionResult checkBatchRetention(RecordBatch batch) { // discard the second and fourth batches if (batch.lastOffset() == 2L || batch.lastOffset() == 6L) - return BatchRetention.DELETE; - return 
BatchRetention.DELETE_EMPTY; + return new BatchRetentionResult(BatchRetention.DELETE, false); + return new BatchRetentionResult(BatchRetention.DELETE_EMPTY, false); } @Override @@ -909,9 +945,13 @@ public static Collection data() { } private static class RetainNonNullKeysFilter extends MemoryRecords.RecordFilter { + public RetainNonNullKeysFilter() { + super(0, 0); + } + @Override - protected BatchRetention checkBatchRetention(RecordBatch batch) { - return BatchRetention.DELETE_EMPTY; + protected BatchRetentionResult checkBatchRetention(RecordBatch batch) { + return new BatchRetentionResult(BatchRetention.DELETE_EMPTY, false); } @Override diff --git a/core/src/main/scala/kafka/log/Log.scala b/core/src/main/scala/kafka/log/Log.scala index ec2498721b902..62496b5528819 100644 --- a/core/src/main/scala/kafka/log/Log.scala +++ b/core/src/main/scala/kafka/log/Log.scala @@ -220,7 +220,8 @@ class Log(@volatile private var _dir: File, val producerIdExpirationCheckIntervalMs: Int, val topicPartition: TopicPartition, val producerStateManager: ProducerStateManager, - logDirFailureChannel: LogDirFailureChannel) extends Logging with KafkaMetricsGroup { + logDirFailureChannel: LogDirFailureChannel, + @volatile var latestDeleteHorizon: Long = RecordBatch.NO_TIMESTAMP) extends Logging with KafkaMetricsGroup { import kafka.log.Log._ diff --git a/core/src/main/scala/kafka/log/LogCleaner.scala b/core/src/main/scala/kafka/log/LogCleaner.scala index 2ce1a8546007c..2bc12806c5e19 100644 --- a/core/src/main/scala/kafka/log/LogCleaner.scala +++ b/core/src/main/scala/kafka/log/LogCleaner.scala @@ -489,14 +489,15 @@ private[log] class Cleaner(val id: Int, case None => 0L case Some(seg) => seg.lastModified - cleanable.log.config.deleteRetentionMs } - - doClean(cleanable, deleteHorizonMs) + doClean(cleanable, time.milliseconds(), legacyDeleteHorizonMs = deleteHorizonMs) } - private[log] def doClean(cleanable: LogToClean, deleteHorizonMs: Long): (Long, CleanerStats) = { + private[log] def 
doClean(cleanable: LogToClean, currentTime: Long, legacyDeleteHorizonMs: Long = -1L): (Long, CleanerStats) = { info("Beginning cleaning of log %s.".format(cleanable.log.name)) val log = cleanable.log + log.latestDeleteHorizon = RecordBatch.NO_TIMESTAMP + val stats = new CleanerStats() // build the offset map @@ -511,13 +512,13 @@ private[log] class Cleaner(val id: Int, val cleanableHorizonMs = log.logSegments(0, cleanable.firstUncleanableOffset).lastOption.map(_.lastModified).getOrElse(0L) // group the segments and clean the groups - info("Cleaning log %s (cleaning prior to %s, discarding tombstones prior to %s)...".format(log.name, new Date(cleanableHorizonMs), new Date(deleteHorizonMs))) + info("Cleaning log %s (cleaning prior to %s, discarding legacy tombstones prior to %s)...".format(log.name, new Date(cleanableHorizonMs), new Date(legacyDeleteHorizonMs))) val transactionMetadata = new CleanedTransactionMetadata val groupedSegments = groupSegmentsBySize(log.logSegments(0, endOffset), log.config.segmentSize, log.config.maxIndexSize, cleanable.firstUncleanableOffset) for (group <- groupedSegments) - cleanSegments(log, group, offsetMap, deleteHorizonMs, stats, transactionMetadata) + cleanSegments(log, group, offsetMap, currentTime, stats, transactionMetadata, legacyDeleteHorizonMs = legacyDeleteHorizonMs) // record buffer utilization stats.bufferUtilization = offsetMap.utilization @@ -533,17 +534,19 @@ private[log] class Cleaner(val id: Int, * @param log The log being cleaned * @param segments The group of segments being cleaned * @param map The offset map to use for cleaning segments - * @param deleteHorizonMs The time to retain delete tombstones + * @param currentTime The current time in milliseconds * @param stats Collector for cleaning statistics * @param transactionMetadata State of ongoing transactions which is carried between the cleaning * of the grouped segments + * @param legacyDeleteHorizonMs The delete horizon used for tombstones whose version is less 
than 2 */ private[log] def cleanSegments(log: Log, segments: Seq[LogSegment], map: OffsetMap, - deleteHorizonMs: Long, + currentTime: Long, stats: CleanerStats, - transactionMetadata: CleanedTransactionMetadata): Unit = { + transactionMetadata: CleanedTransactionMetadata, + legacyDeleteHorizonMs: Long = -1L): Unit = { // create a new segment with a suffix appended to the name of the log and indexes val cleaned = LogCleaner.createNewCleanedSegment(log, segments.head.baseOffset) transactionMetadata.cleanedIndex = Some(cleaned.txnIndex) @@ -563,14 +566,17 @@ private[log] class Cleaner(val id: Int, val abortedTransactions = log.collectAbortedTransactions(startOffset, upperBoundOffset) transactionMetadata.addAbortedTransactions(abortedTransactions) - val retainDeletesAndTxnMarkers = currentSegment.lastModified > deleteHorizonMs + val retainLegacyDeletesAndTxnMarkers = currentSegment.lastModified > legacyDeleteHorizonMs info(s"Cleaning $currentSegment in log ${log.name} into ${cleaned.baseOffset} " + - s"with deletion horizon $deleteHorizonMs, " + - s"${if(retainDeletesAndTxnMarkers) "retaining" else "discarding"} deletes.") + s"with legacy deletion horizon $legacyDeleteHorizonMs, " + + s"${if(retainLegacyDeletesAndTxnMarkers) "retaining" else "discarding"} deletes.") try { - cleanInto(log.topicPartition, currentSegment.log, cleaned, map, retainDeletesAndTxnMarkers, log.config.maxMessageSize, - transactionMetadata, lastOffsetOfActiveProducers, stats) + val latestDeleteHorizon: Long = cleanInto(log.topicPartition, currentSegment.log, cleaned, map, retainLegacyDeletesAndTxnMarkers, log.config.deleteRetentionMs, + log.config.maxMessageSize, transactionMetadata, lastOffsetOfActiveProducers, stats, currentTime = currentTime) + if (log.latestDeleteHorizon < latestDeleteHorizon) { + log.latestDeleteHorizon = latestDeleteHorizon + } } catch { case e: LogSegmentOffsetOverflowException => // Split the current segment. 
It's also safest to abort the current cleaning process, so that we retry from @@ -611,26 +617,43 @@ private[log] class Cleaner(val id: Int, * @param sourceRecords The dirty log segment * @param dest The cleaned log segment * @param map The key=>offset mapping - * @param retainDeletesAndTxnMarkers Should tombstones and markers be retained while cleaning this segment + * @param retainLegacyDeletesAndTxnMarkers Should tombstones (lower than version 2) and markers be retained while cleaning this segment + * @param deleteRetentionMs Defines how long a tombstone should be kept as defined by log configuration * @param maxLogMessageSize The maximum message size of the corresponding topic * @param stats Collector for cleaning statistics + * @param currentTime The time at which the clean was initiated */ private[log] def cleanInto(topicPartition: TopicPartition, sourceRecords: FileRecords, dest: LogSegment, map: OffsetMap, - retainDeletesAndTxnMarkers: Boolean, + retainLegacyDeletesAndTxnMarkers: Boolean, + deleteRetentionMs: Long, maxLogMessageSize: Int, transactionMetadata: CleanedTransactionMetadata, lastRecordsOfActiveProducers: Map[Long, LastRecord], - stats: CleanerStats): Unit = { - val logCleanerFilter: RecordFilter = new RecordFilter { + stats: CleanerStats, + currentTime: Long): Long = { + var latestDeleteHorizon: Long = RecordBatch.NO_TIMESTAMP + + val logCleanerFilter: RecordFilter = new RecordFilter(currentTime, deleteRetentionMs) { var discardBatchRecords: Boolean = _ - override def checkBatchRetention(batch: RecordBatch): BatchRetention = { + override def checkBatchRetention(batch: RecordBatch): RecordFilter.BatchRetentionResult = { // we piggy-back on the tombstone retention logic to delay deletion of transaction markers. // note that we will never delete a marker until all the records from that transaction are removed. 
- discardBatchRecords = shouldDiscardBatch(batch, transactionMetadata, retainTxnMarkers = retainDeletesAndTxnMarkers) + val canDiscardBatch = shouldDiscardBatch(batch, transactionMetadata) + + if (batch.isControlBatch) { + if (batch.magic() < 2) { + discardBatchRecords = canDiscardBatch && !retainLegacyDeletesAndTxnMarkers + } else { + discardBatchRecords = canDiscardBatch && + batch.hasDeleteHorizonMs() && batch.deleteHorizonMs() <= currentTime + } + } else { + discardBatchRecords = canDiscardBatch + } def isBatchLastRecordOfProducer: Boolean = { // We retain the batch in order to preserve the state of active producers. There are three cases: @@ -647,20 +670,24 @@ private[log] class Cleaner(val id: Int, } } - if (batch.hasProducerId && isBatchLastRecordOfProducer) - BatchRetention.RETAIN_EMPTY - else if (discardBatchRecords) - BatchRetention.DELETE - else - BatchRetention.DELETE_EMPTY + val batchRetention: BatchRetention = + if (batch.hasProducerId && isBatchLastRecordOfProducer) + BatchRetention.RETAIN_EMPTY + else if (discardBatchRecords) + BatchRetention.DELETE + else + BatchRetention.DELETE_EMPTY + new RecordFilter.BatchRetentionResult(batchRetention, canDiscardBatch) } override def shouldRetainRecord(batch: RecordBatch, record: Record): Boolean = { + var isRecordRetained: Boolean = true if (discardBatchRecords) // The batch is only retained to preserve producer sequence information; the records can be removed - false + isRecordRetained = false else - Cleaner.this.shouldRetainRecord(map, retainDeletesAndTxnMarkers, batch, record, stats) + isRecordRetained = Cleaner.this.shouldRetainRecord(map, retainLegacyDeletesAndTxnMarkers, batch, record, stats, currentTime = currentTime) + isRecordRetained } } @@ -675,6 +702,10 @@ private[log] class Cleaner(val id: Int, val records = MemoryRecords.readableRecords(readBuffer) throttler.maybeThrottle(records.sizeInBytes) val result = records.filterTo(topicPartition, logCleanerFilter, writeBuffer, maxLogMessageSize, 
decompressionBufferSupplier) + if (result.latestDeleteHorizon() > latestDeleteHorizon) { + latestDeleteHorizon = result.latestDeleteHorizon(); + } + stats.readMessages(result.messagesRead, result.bytesRead) stats.recopyMessages(result.messagesRetained, result.bytesRetained) @@ -700,6 +731,7 @@ private[log] class Cleaner(val id: Int, growBuffersOrFail(sourceRecords, position, maxLogMessageSize, records) } restoreBuffers() + latestDeleteHorizon } @@ -736,22 +768,19 @@ private[log] class Cleaner(val id: Int, } private def shouldDiscardBatch(batch: RecordBatch, - transactionMetadata: CleanedTransactionMetadata, - retainTxnMarkers: Boolean): Boolean = { - if (batch.isControlBatch) { - val canDiscardControlBatch = transactionMetadata.onControlBatchRead(batch) - canDiscardControlBatch && !retainTxnMarkers - } else { - val canDiscardBatch = transactionMetadata.onBatchRead(batch) - canDiscardBatch - } + transactionMetadata: CleanedTransactionMetadata): Boolean = { + if (batch.isControlBatch) + transactionMetadata.onControlBatchRead(batch) + else + transactionMetadata.onBatchRead(batch) } private def shouldRetainRecord(map: kafka.log.OffsetMap, - retainDeletes: Boolean, + retainDeletesForLegacyRecords: Boolean, batch: RecordBatch, record: Record, - stats: CleanerStats): Boolean = { + stats: CleanerStats, + currentTime: Long): Boolean = { val pastLatestOffset = record.offset > map.latestOffset if (pastLatestOffset) return true @@ -765,7 +794,13 @@ private[log] class Cleaner(val id: Int, * 2) The message doesn't has value but it can't be deleted now. 
*/ val latestOffsetForKey = record.offset() >= foundOffset - val isRetainedValue = record.hasValue || retainDeletes + val supportDeleteHorizon = batch.magic() >= RecordBatch.MAGIC_VALUE_V2 + val shouldRetainDeletes = + if (supportDeleteHorizon) + !batch.hasDeleteHorizonMs() || currentTime < batch.deleteHorizonMs() + else + retainDeletesForLegacyRecords + val isRetainedValue = record.hasValue || shouldRetainDeletes latestOffsetForKey && isRetainedValue } else { stats.invalidMessage() diff --git a/core/src/main/scala/kafka/log/LogCleanerManager.scala b/core/src/main/scala/kafka/log/LogCleanerManager.scala index 473f1fb8ebea2..2a3061ee20beb 100755 --- a/core/src/main/scala/kafka/log/LogCleanerManager.scala +++ b/core/src/main/scala/kafka/log/LogCleanerManager.scala @@ -30,6 +30,7 @@ import kafka.utils.{Logging, Pool} import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.utils.Time import org.apache.kafka.common.errors.KafkaStorageException +import org.apache.kafka.common.record.RecordBatch import scala.collection.{Iterable, Seq, mutable} @@ -169,11 +170,11 @@ private[log] class LogCleanerManager(val logDirs: Seq[File], val now = time.milliseconds this.timeOfLastRun = now val lastClean = allCleanerCheckpoints + val dirtyLogs = logs.filter { - case (_, log) => log.config.compact // match logs that are marked as compacted + case (_, log) => log.config.compact }.filterNot { case (topicPartition, log) => - // skip any logs already in-progress and uncleanable partitions inProgress.contains(topicPartition) || isUncleanablePartition(log, topicPartition) }.map { case (topicPartition, log) => // create a LogToClean instance for each @@ -198,8 +199,23 @@ private[log] class LogCleanerManager(val logDirs: Seq[File], val cleanableLogs = dirtyLogs.filter { ltc => (ltc.needCompactionNow && ltc.cleanableBytes > 0) || ltc.cleanableRatio > ltc.log.config.minCleanableRatio } + if(cleanableLogs.isEmpty) { - None + val logsWithTombstonesExpired = dirtyLogs.filter { + 
case ltc => + // in this case, we are probably in a low throughput situation + // therefore, we should take advantage of this fact and remove tombstones if we can + // under the condition that the log's latest delete horizon is less than the current time + // tracked + ltc.log.latestDeleteHorizon != RecordBatch.NO_TIMESTAMP && ltc.log.latestDeleteHorizon <= time.milliseconds() + } + if (!logsWithTombstonesExpired.isEmpty) { + val filthiest = logsWithTombstonesExpired.max + inProgress.put(filthiest.topicPartition, LogCleaningInProgress) + Some(filthiest) + } else { + None + } } else { preCleanStats.recordCleanablePartitions(cleanableLogs.size) val filthiest = cleanableLogs.max diff --git a/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala b/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala index 1ac9e62a7b1b0..7ab8573cab348 100644 --- a/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala +++ b/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala @@ -18,6 +18,7 @@ package kafka.log import java.io.PrintWriter +import java.util.Properties import com.yammer.metrics.core.{Gauge, MetricName} import kafka.metrics.{KafkaMetricsGroup, KafkaYammerMetrics} @@ -177,6 +178,36 @@ class LogCleanerIntegrationTest extends AbstractLogCleanerIntegrationTest with K s"but lastCleaned=$lastCleaned2", lastCleaned2 >= secondBlockCleanableSegmentOffset) } + @Test + def testTombstoneCleanWithLowThroughput() : Unit = { + val tombstoneRetentionMs = 1000 // this is in milliseconds -> 1 second + + val topicPartitions = Array(new TopicPartition("log-partition", 0)) + val props = new Properties() + props.put(LogConfig.DeleteRetentionMsProp, "1000") + cleaner = makeCleaner(partitions = topicPartitions, propertyOverrides = props, backOffMs = 100L) + + val log = cleaner.logs.get(topicPartitions(0)) + + val T0 = time.milliseconds + writeKeyDups(numKeys = 1, numDups = 1, log, CompressionType.NONE, timestamp = T0, + startValue = 0, step = 1, 
isRecordTombstone = true) + + // roll the active segment + log.roll() + + cleaner.startup() + + val latestOffset: Long = log.logEndOffset + + assertTrue(cleaner.awaitCleaned(new TopicPartition("log-partition", 0), + latestOffset, maxWaitMs = 5000)) + assertEquals(log.latestDeleteHorizon, T0 + tombstoneRetentionMs) + + time.sleep(tombstoneRetentionMs + 1) + TestUtils.waitUntilTrue(() => log.size == 0, "Log should be empty") + } + private def readFromLog(log: Log): Iterable[(Int, Int)] = { for (segment <- log.logSegments; record <- segment.log.records.asScala) yield { val key = TestUtils.readString(record.key).toInt @@ -185,12 +216,17 @@ class LogCleanerIntegrationTest extends AbstractLogCleanerIntegrationTest with K } } - private def writeKeyDups(numKeys: Int, numDups: Int, log: Log, codec: CompressionType, timestamp: Long, startValue: Int, step: Int): Seq[(Int, Int)] = { + private def writeKeyDups(numKeys: Int, numDups: Int, log: Log, codec: CompressionType, timestamp: Long, + startValue: Int, step: Int, isRecordTombstone: Boolean = false): Seq[(Int, Int)] = { var valCounter = startValue for (_ <- 0 until numDups; key <- 0 until numKeys) yield { val curValue = valCounter - log.appendAsLeader(TestUtils.singletonRecords(value = curValue.toString.getBytes, codec = codec, - key = key.toString.getBytes, timestamp = timestamp), leaderEpoch = 0) + if (isRecordTombstone) + log.appendAsLeader(TestUtils.singletonRecords(value = null, codec = codec, + key = key.toString.getBytes, timestamp = timestamp), leaderEpoch = 0) + else + log.appendAsLeader(TestUtils.singletonRecords(value = curValue.toString.getBytes, codec = codec, + key = key.toString.getBytes, timestamp = timestamp), leaderEpoch = 0) valCounter += step (key, curValue) } diff --git a/core/src/test/scala/unit/kafka/log/LogCleanerTest.scala b/core/src/test/scala/unit/kafka/log/LogCleanerTest.scala index cf0fb54959d8d..470f3ca93643b 100755 --- a/core/src/test/scala/unit/kafka/log/LogCleanerTest.scala +++ 
b/core/src/test/scala/unit/kafka/log/LogCleanerTest.scala @@ -53,6 +53,8 @@ class LogCleanerTest { val logConfig = LogConfig(logProps) val time = new MockTime() val throttler = new Throttler(desiredRatePerSec = Double.MaxValue, checkIntervalMs = Long.MaxValue, time = time) + val tombstoneRetentionMs = 86400000 + val largeTimestamp = Long.MaxValue - tombstoneRetentionMs - 1 @After def teardown(): Unit = { @@ -85,8 +87,9 @@ class LogCleanerTest { val segments = log.logSegments.take(3).toSeq val stats = new CleanerStats() val expectedBytesRead = segments.map(_.size).sum - cleaner.cleanSegments(log, segments, map, 0L, stats, new CleanedTransactionMetadata) val shouldRemain = LogTest.keysInLog(log).filter(!keys.contains(_)) + + cleaner.cleanSegments(log, segments, map, 0L, stats, new CleanedTransactionMetadata) assertEquals(shouldRemain, LogTest.keysInLog(log)) assertEquals(expectedBytesRead, stats.bytesRead) } @@ -348,7 +351,7 @@ class LogCleanerTest { log.roll() // cannot remove the marker in this pass because there are still valid records - var dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + var dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertEquals(List(1, 3, 2), LogTest.keysInLog(log)) assertEquals(List(0, 2, 3, 4, 5), offsetsInLog(log)) @@ -357,17 +360,17 @@ class LogCleanerTest { log.roll() // the first cleaning preserves the commit marker (at offset 3) since there were still records for the transaction - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertEquals(List(2, 1, 3), LogTest.keysInLog(log)) assertEquals(List(3, 4, 5, 6, 7, 8), offsetsInLog(log)) - // delete horizon forced to 0 to verify marker 
is not removed early - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = 0L)._1 + // current time is still before delete horizon + dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertEquals(List(2, 1, 3), LogTest.keysInLog(log)) assertEquals(List(3, 4, 5, 6, 7, 8), offsetsInLog(log)) // clean again with large delete horizon and verify the marker is removed - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = Long.MaxValue)._1 assertEquals(List(2, 1, 3), LogTest.keysInLog(log)) assertEquals(List(4, 5, 6, 7, 8), offsetsInLog(log)) } @@ -396,11 +399,12 @@ class LogCleanerTest { log.appendAsLeader(commitMarker(producerId, producerEpoch), leaderEpoch = 0, origin = AppendOrigin.Coordinator) log.roll() - cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(2), LogTest.keysInLog(log)) assertEquals(List(1, 3, 4), offsetsInLog(log)) - cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + // In the first pass, the deleteHorizon for {Producer2: Commit} is set. In the second pass, it's removed. 
+ runTwoPassClean(cleaner, LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(2), LogTest.keysInLog(log)) assertEquals(List(3, 4), offsetsInLog(log)) } @@ -437,14 +441,14 @@ class LogCleanerTest { // first time through the records are removed // Expected State: [{Producer1: EmptyBatch}, {Producer2: EmptyBatch}, {Producer2: Commit}, {2}, {3}, {Producer1: Commit}] - var dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + var dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertEquals(List(2, 3), LogTest.keysInLog(log)) assertEquals(List(4, 5, 6, 7), offsetsInLog(log)) assertEquals(List(1, 3, 4, 5, 6, 7), lastOffsetsPerBatchInLog(log)) // the empty batch remains if cleaned again because it still holds the last sequence // Expected State: [{Producer1: EmptyBatch}, {Producer2: EmptyBatch}, {Producer2: Commit}, {2}, {3}, {Producer1: Commit}] - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertEquals(List(2, 3), LogTest.keysInLog(log)) assertEquals(List(4, 5, 6, 7), offsetsInLog(log)) assertEquals(List(1, 3, 4, 5, 6, 7), lastOffsetsPerBatchInLog(log)) @@ -458,13 +462,15 @@ class LogCleanerTest { log.roll() // Expected State: [{Producer1: EmptyBatch}, {Producer2: Commit}, {2}, {3}, {Producer1: Commit}, {Producer2: 1}, {Producer2: Commit}] - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + // The deleteHorizon for {Producer2: Commit} is still not set yet. 
+ dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertEquals(List(2, 3, 1), LogTest.keysInLog(log)) assertEquals(List(4, 5, 6, 7, 8, 9), offsetsInLog(log)) assertEquals(List(1, 4, 5, 6, 7, 8, 9), lastOffsetsPerBatchInLog(log)) // Expected State: [{Producer1: EmptyBatch}, {2}, {3}, {Producer1: Commit}, {Producer2: 1}, {Producer2: Commit}] - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + // In the first pass, the deleteHorizon for {Producer2: Commit} is set. In the second pass, it's removed. + dirtyOffset = runTwoPassClean(cleaner, LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(2, 3, 1), LogTest.keysInLog(log)) assertEquals(List(5, 6, 7, 8, 9), offsetsInLog(log)) assertEquals(List(1, 5, 6, 7, 8, 9), lastOffsetsPerBatchInLog(log)) @@ -489,14 +495,16 @@ class LogCleanerTest { // first time through the control batch is retained as an empty batch // Expected State: [{Producer1: EmptyBatch}], [{2}, {3}] - var dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + // In the first pass, the deleteHorizon for the commit marker is set. In the second pass, the commit marker is removed + // but the empty batch is retained for preserving the producer epoch. 
+ var dirtyOffset = runTwoPassClean(cleaner, LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(2, 3), LogTest.keysInLog(log)) assertEquals(List(1, 2), offsetsInLog(log)) assertEquals(List(0, 1, 2), lastOffsetsPerBatchInLog(log)) // the empty control batch does not cause an exception when cleaned // Expected State: [{Producer1: EmptyBatch}], [{2}, {3}] - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = Long.MaxValue)._1 assertEquals(List(2, 3), LogTest.keysInLog(log)) assertEquals(List(1, 2), offsetsInLog(log)) assertEquals(List(0, 1, 2), lastOffsetsPerBatchInLog(log)) @@ -520,7 +528,7 @@ class LogCleanerTest { log.roll() // Both the record and the marker should remain after cleaning - cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + runTwoPassClean(cleaner, LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(0, 1), offsetsInLog(log)) assertEquals(List(0, 1), lastOffsetsPerBatchInLog(log)) } @@ -545,12 +553,12 @@ class LogCleanerTest { // Both the batch and the marker should remain after cleaning. The batch is retained // because it is the last entry for this producerId. The marker is retained because // there are still batches remaining from this transaction. - cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(1), offsetsInLog(log)) assertEquals(List(0, 1), lastOffsetsPerBatchInLog(log)) // The empty batch and the marker is still retained after a second cleaning. 
- cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = Long.MaxValue - 1) assertEquals(List(1), offsetsInLog(log)) assertEquals(List(0, 1), lastOffsetsPerBatchInLog(log)) } @@ -574,13 +582,13 @@ class LogCleanerTest { log.appendAsLeader(commitMarker(producerId, producerEpoch), leaderEpoch = 0, origin = AppendOrigin.Coordinator) log.roll() - // delete horizon set to 0 to verify marker is not removed early - val dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = 0L)._1 + // Aborted records are removed, but the abort marker is still preserved. + val dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertEquals(List(3), LogTest.keysInLog(log)) assertEquals(List(3, 4, 5), offsetsInLog(log)) - // clean again with large delete horizon and verify the marker is removed - cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + // In the first pass, the delete horizon for the abort marker is set. In the second pass, the abort marker is removed. + runTwoPassClean(cleaner, LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(3), LogTest.keysInLog(log)) assertEquals(List(4, 5), offsetsInLog(log)) } @@ -616,12 +624,12 @@ class LogCleanerTest { // Both transactional batches will be cleaned. 
The last one will remain in the log // as an empty batch in order to preserve the producer sequence number and epoch - cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(1, 3, 4, 5), offsetsInLog(log)) assertEquals(List(1, 2, 3, 4, 5), lastOffsetsPerBatchInLog(log)) - // On the second round of cleaning, the marker from the first transaction should be removed. - cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue) + // In the first pass, the delete horizon for the first marker is set. In the second pass, the first marker is removed. + runTwoPassClean(cleaner, LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(3, 4, 5), offsetsInLog(log)) assertEquals(List(2, 3, 4, 5), lastOffsetsPerBatchInLog(log)) } @@ -653,14 +661,14 @@ class LogCleanerTest { assertAbortedTransactionIndexed() // first time through the records are removed - var dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + var dirtyOffset = cleaner.doClean(LogToClean(tp, log, 0L, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertAbortedTransactionIndexed() assertEquals(List(), LogTest.keysInLog(log)) assertEquals(List(2), offsetsInLog(log)) // abort marker is retained assertEquals(List(1, 2), lastOffsetsPerBatchInLog(log)) // empty batch is retained // the empty batch remains if cleaned again because it still holds the last sequence - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + dirtyOffset = runTwoPassClean(cleaner, LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertAbortedTransactionIndexed() assertEquals(List(), 
LogTest.keysInLog(log)) assertEquals(List(2), offsetsInLog(log)) // abort marker is still retained @@ -670,13 +678,14 @@ class LogCleanerTest { appendProducer(Seq(1)) log.roll() - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp)._1 assertAbortedTransactionIndexed() assertEquals(List(1), LogTest.keysInLog(log)) assertEquals(List(2, 3), offsetsInLog(log)) // abort marker is not yet gone because we read the empty batch assertEquals(List(2, 3), lastOffsetsPerBatchInLog(log)) // but we do not preserve the empty batch - dirtyOffset = cleaner.doClean(LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), deleteHorizonMs = Long.MaxValue)._1 + // In the first pass, the delete horizon for the abort marker is set. In the second pass, the abort marker is removed. + dirtyOffset = runTwoPassClean(cleaner, LogToClean(tp, log, dirtyOffset, log.activeSegment.baseOffset), currentTime = largeTimestamp) assertEquals(List(1), LogTest.keysInLog(log)) assertEquals(List(3), offsetsInLog(log)) // abort marker is gone assertEquals(List(3), lastOffsetsPerBatchInLog(log)) @@ -1549,6 +1558,7 @@ class LogCleanerTest { key = "0".getBytes, timestamp = time.milliseconds() - logConfig.deleteRetentionMs - 10000), leaderEpoch = 0) log.roll() + cleaner.clean(LogToClean(new TopicPartition("test", 0), log, 1, log.activeSegment.baseOffset)) assertEquals("The tombstone should be retained.", 1, log.logSegments.head.log.batches.iterator.next().lastOffset) // Append a message and roll out another log segment. 
@@ -1722,6 +1732,18 @@ class LogCleanerTest { private def recoverAndCheck(config: LogConfig, expectedKeys: Iterable[Long]): Log = { LogTest.recoverAndCheck(dir, config, expectedKeys, new BrokerTopicStats(), time, time.scheduler) } + + /** + * We need to run a two pass clean to perform the following steps to simulate a proper clean: + * 1. On the first run, set the delete horizon in the batches with tombstone or markers with empty txn records. + * 2. For the second pass, we will advance the current time by tombstoneRetentionMs, which will cause the + * tombstones to expire, leading to their prompt removal from the log. + */ + private def runTwoPassClean(cleaner: Cleaner, logToClean: LogToClean, currentTime: Long, + legacyDeleteHorizonMs: Long = -1L, tombstoneRetentionMs: Long = 86400000) : Long = { + cleaner.doClean(logToClean, currentTime, legacyDeleteHorizonMs) + cleaner.doClean(logToClean, currentTime + tombstoneRetentionMs + 1, legacyDeleteHorizonMs)._1 + } } class FakeOffsetMap(val slots: Int) extends OffsetMap { diff --git a/core/src/test/scala/unit/kafka/log/LogTest.scala b/core/src/test/scala/unit/kafka/log/LogTest.scala index 12f0b1c4bf426..f84204c764240 100755 --- a/core/src/test/scala/unit/kafka/log/LogTest.scala +++ b/core/src/test/scala/unit/kafka/log/LogTest.scala @@ -36,7 +36,6 @@ import org.apache.kafka.common.{InvalidRecordException, KafkaException, TopicPar import org.apache.kafka.common.errors._ import org.apache.kafka.common.record.FileRecords.TimestampAndOffset import org.apache.kafka.common.record.MemoryRecords.RecordFilter -import org.apache.kafka.common.record.MemoryRecords.RecordFilter.BatchRetention import org.apache.kafka.common.record._ import org.apache.kafka.common.requests.FetchResponse.AbortedTransaction import org.apache.kafka.common.requests.{ListOffsetRequest, ListOffsetResponse} @@ -1046,8 +1045,9 @@ class LogTest { records.batches.asScala.foreach(_.setPartitionLeaderEpoch(0)) val filtered = ByteBuffer.allocate(2048) -
records.filterTo(new TopicPartition("foo", 0), new RecordFilter { - override def checkBatchRetention(batch: RecordBatch): BatchRetention = RecordFilter.BatchRetention.DELETE_EMPTY + records.filterTo(new TopicPartition("foo", 0), new RecordFilter(0, 0) { + override def checkBatchRetention(batch: RecordBatch): RecordFilter.BatchRetentionResult = + new RecordFilter.BatchRetentionResult(RecordFilter.BatchRetention.DELETE_EMPTY, false) override def shouldRetainRecord(recordBatch: RecordBatch, record: Record): Boolean = !record.hasKey }, filtered, Int.MaxValue, BufferSupplier.NO_CACHING) filtered.flip() @@ -1087,8 +1087,9 @@ class LogTest { records.batches.asScala.foreach(_.setPartitionLeaderEpoch(0)) val filtered = ByteBuffer.allocate(2048) - records.filterTo(new TopicPartition("foo", 0), new RecordFilter { - override def checkBatchRetention(batch: RecordBatch): BatchRetention = RecordFilter.BatchRetention.RETAIN_EMPTY + records.filterTo(new TopicPartition("foo", 0), new RecordFilter(0, 0) { + override def checkBatchRetention(batch: RecordBatch): RecordFilter.BatchRetentionResult = + new RecordFilter.BatchRetentionResult(RecordFilter.BatchRetention.RETAIN_EMPTY, true) override def shouldRetainRecord(recordBatch: RecordBatch, record: Record): Boolean = false }, filtered, Int.MaxValue, BufferSupplier.NO_CACHING) filtered.flip() @@ -1130,8 +1131,9 @@ class LogTest { records.batches.asScala.foreach(_.setPartitionLeaderEpoch(0)) val filtered = ByteBuffer.allocate(2048) - records.filterTo(new TopicPartition("foo", 0), new RecordFilter { - override def checkBatchRetention(batch: RecordBatch): BatchRetention = RecordFilter.BatchRetention.DELETE_EMPTY + records.filterTo(new TopicPartition("foo", 0), new RecordFilter(0, 0) { + override def checkBatchRetention(batch: RecordBatch): RecordFilter.BatchRetentionResult = + new RecordFilter.BatchRetentionResult(RecordFilter.BatchRetention.DELETE_EMPTY, false) override def shouldRetainRecord(recordBatch: RecordBatch, record: Record): 
Boolean = !record.hasKey }, filtered, Int.MaxValue, BufferSupplier.NO_CACHING) filtered.flip()