From 9330bf544c814c4b758016f5ec2b938c77bf897d Mon Sep 17 00:00:00 2001 From: Kiran Kumar Maturi Date: Wed, 26 Jun 2024 09:52:19 +0530 Subject: [PATCH 1/5] HBASE-28665 WALs not marked closed when there are errors in closing WALs --- .../hbase/regionserver/wal/AbstractFSWAL.java | 20 +++++++++---------- .../hadoop/hbase/regionserver/wal/FSHLog.java | 3 +++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java index 79033607deb5..2c575127c7ed 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java @@ -744,20 +744,18 @@ private synchronized void cleanOldLogs() { // For each log file, look at its Map of regions to the highest sequence id; if all sequence ids // are older than what is currently in memory, the WAL can be GC'd. for (Map.Entry e : this.walFile2Props.entrySet()) { - if (!e.getValue().closed) { - LOG.debug("{} is not closed yet, will try archiving it next time", e.getKey()); + Map sequenceNums = e.getValue().encodedName2HighestSequenceId; + if (!e.getValue().closed && !this.sequenceIdAccounting.areAllLower(sequenceNums)) { + LOG.debug("{} is not closed yet or has unflushed entries, will try archiving it next time", e.getKey()); continue; } Path log = e.getKey(); - Map sequenceNums = e.getValue().encodedName2HighestSequenceId; - if (this.sequenceIdAccounting.areAllLower(sequenceNums)) { - if (logsToArchive == null) { - logsToArchive = new ArrayList<>(); - } - logsToArchive.add(Pair.newPair(log, e.getValue().logSize)); - if (LOG.isTraceEnabled()) { - LOG.trace("WAL file ready for archiving " + log); - } + if (logsToArchive == null) { + logsToArchive = new ArrayList<>(); + } + logsToArchive.add(Pair.newPair(log, e.getValue().logSize)); + if (LOG.isTraceEnabled()) { + LOG.trace("WAL file ready for archiving " + log); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java index 6afe2e06794c..e9cbdd84ed81 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java @@ -390,6 +390,9 @@ protected void doReplaceWriter(Path oldPath, Path newPath, Writer nextWriter) th closeWriter(this.writer, oldPath, true); } finally { inflightWALClosures.remove(oldPath.getName()); + if (!isUnflushedEntries()) { + markClosedAndClean(oldPath); + } } } else { Writer localWriter = this.writer; From a20ab24a419b23430f33dbc846ed21cbb90da118 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Maturi Date: Fri, 28 Jun 2024 22:49:36 +0530 Subject: [PATCH 2/5] added test case --- .../hadoop/hbase/regionserver/wal/FSHLog.java | 4 ++ .../hbase/regionserver/wal/TestFSHLog.java | 63 +++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java index e9cbdd84ed81..3b401d8cce41 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java @@ -1219,4 +1219,8 @@ Writer getWriter() { void setWriter(Writer writer) { this.writer = writer; } + + protected int getClosedErrorCount(){ + return closeErrorCount.get(); + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java index a8a88553141a..5505021b6716 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java @@ -317,4 +317,67 @@ public void run() { region.close(); } } + + @Test + /** + * Test for jira https://issues.apache.org/jira/browse/HBASE-28665 + */ + public void testWALClosureFailureAndCleanup() throws IOException { + + class FailingWriter implements WALProvider.Writer { + @Override + public void sync(boolean forceSync) throws IOException { + + } + + @Override + public void append(WAL.Entry entry) throws IOException { + } + + @Override + public long getLength() { + return 0; + } + + @Override + public long getSyncedLength() { + return 0; + } + + @Override + public void close() throws IOException { + throw new IOException("WAL close injected failure.."); + } + } + final byte[] b = Bytes.toBytes("b"); + String name = this.name.getMethodName(); + // Have a FSHLog writer implementation that fails during close + try (FSHLog log = new FSHLog(FS, CommonFSUtils.getRootDir(CONF), name, + HConstants.HREGION_OLDLOGDIR_NAME, CONF, null, true, null, null)) { + log.init(); + + //create a region with the wal + TableDescriptor htd = + TableDescriptorBuilder.newBuilder(TableName.valueOf(this.name.getMethodName())) + .setColumnFamily(ColumnFamilyDescriptorBuilder.of(b)).build(); + RegionInfo hri = RegionInfoBuilder.newBuilder(htd.getTableName()).build(); + ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null, + MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT); + final HRegion region = TEST_UTIL.createLocalHRegion(hri, CONF, htd, log); + // Repeat the following steps twice + // * Append writes for the FSHLog + // * Create a new Writer replace the old writer + for(int i = 0; i < 2; i++) { + log.setWriter(new FailingWriter()); + region.put(new Put(b).addColumn(b, b, b)); + region.put(new Put(b).addColumn(b, b, b)); + log.rollWriter(); + } + log.markClosedAndClean(log.getOldPath()); + Threads.sleep(3000); + assertEquals(2, log.getClosedErrorCount()); + assertEquals("WAL Files not cleaned ", 0,log.walFile2Props.size()); + region.close(); + } + } } From 8ee3e400dc273276e34d1e12b7569fe338940ccc Mon Sep 17 00:00:00 2001 From: Kiran Kumar Maturi Date: Sun, 30 Jun 2024 02:16:16 +0530 Subject: [PATCH 3/5] fixed test cases --- .../hbase/regionserver/wal/AbstractFSWAL.java | 18 ++++++++++-------- .../hbase/regionserver/wal/TestFSHLog.java | 6 ++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java index 2c575127c7ed..ad141237964f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java @@ -744,18 +744,20 @@ private synchronized void cleanOldLogs() { // For each log file, look at its Map of regions to the highest sequence id; if all sequence ids // are older than what is currently in memory, the WAL can be GC'd. for (Map.Entry e : this.walFile2Props.entrySet()) { - Map sequenceNums = e.getValue().encodedName2HighestSequenceId; - if (!e.getValue().closed && !this.sequenceIdAccounting.areAllLower(sequenceNums)) { + if (!e.getValue().closed) { LOG.debug("{} is not closed yet or has unflushed entries, will try archiving it next time", e.getKey()); continue; } Path log = e.getKey(); - if (logsToArchive == null) { - logsToArchive = new ArrayList<>(); - } - logsToArchive.add(Pair.newPair(log, e.getValue().logSize)); - if (LOG.isTraceEnabled()) { - LOG.trace("WAL file ready for archiving " + log); + Map sequenceNums = e.getValue().encodedName2HighestSequenceId; + if (this.sequenceIdAccounting.areAllLower(sequenceNums)) { + if (logsToArchive == null) { + logsToArchive = new ArrayList<>(); + } + logsToArchive.add(Pair.newPair(log, e.getValue().logSize)); + if (LOG.isTraceEnabled()) { + LOG.trace("WAL file ready for archiving " + log); + } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java index 5505021b6716..2ff6a6fb9e2f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java @@ -373,9 +373,11 @@ public void close() throws IOException { region.put(new Put(b).addColumn(b, b, b)); log.rollWriter(); } - log.markClosedAndClean(log.getOldPath()); - Threads.sleep(3000); assertEquals(2, log.getClosedErrorCount()); + region.put(new Put(b).addColumn(b, b, b)); + region.put(new Put(b).addColumn(b, b, b)); + region.flush(true); + log.rollWriter(); assertEquals("WAL Files not cleaned ", 0,log.walFile2Props.size()); region.close(); } From 5cbb2f2f8505057bc7e0f175e9a996c11593233c Mon Sep 17 00:00:00 2001 From: Kiran Kumar Maturi Date: Sun, 30 Jun 2024 02:17:42 +0530 Subject: [PATCH 4/5] fixed log message --- .../org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java index ad141237964f..79033607deb5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java @@ -745,7 +745,7 @@ private synchronized void cleanOldLogs() { // are older than what is currently in memory, the WAL can be GC'd. for (Map.Entry e : this.walFile2Props.entrySet()) { if (!e.getValue().closed) { - LOG.debug("{} is not closed yet or has unflushed entries, will try archiving it next time", e.getKey()); + LOG.debug("{} is not closed yet, will try archiving it next time", e.getKey()); continue; } Path log = e.getKey(); From 3b290944abf1b51a7c75f5478c7226c59184e296 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Maturi Date: Sun, 30 Jun 2024 02:24:38 +0530 Subject: [PATCH 5/5] spotless fixes --- .../apache/hadoop/hbase/regionserver/wal/FSHLog.java | 2 +- .../hadoop/hbase/regionserver/wal/TestFSHLog.java | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java index 3b401d8cce41..8b558fdb2147 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java @@ -1220,7 +1220,7 @@ void setWriter(Writer writer) { this.writer = writer; } - protected int getClosedErrorCount(){ + protected int getClosedErrorCount() { return closeErrorCount.get(); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java index 2ff6a6fb9e2f..e2d31cc61290 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestFSHLog.java @@ -356,7 +356,7 @@ public void close() throws IOException { HConstants.HREGION_OLDLOGDIR_NAME, CONF, null, true, null, null)) { log.init(); - //create a region with the wal + // create a region with the wal TableDescriptor htd = TableDescriptorBuilder.newBuilder(TableName.valueOf(this.name.getMethodName())) .setColumnFamily(ColumnFamilyDescriptorBuilder.of(b)).build(); @@ -365,9 +365,9 @@ public void close() throws IOException { MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT); final HRegion region = TEST_UTIL.createLocalHRegion(hri, CONF, htd, log); // Repeat the following steps twice - // * Append writes for the FSHLog - // * Create a new Writer replace the old writer - for(int i = 0; i < 2; i++) { + // * Append writes for the FSHLog + // * Create a new Writer replace the old writer + for (int i = 0; i < 2; i++) { log.setWriter(new FailingWriter()); region.put(new Put(b).addColumn(b, b, b)); region.put(new Put(b).addColumn(b, b, b)); @@ -378,7 +378,7 @@ public void close() throws IOException { region.put(new Put(b).addColumn(b, b, b)); region.flush(true); log.rollWriter(); - assertEquals("WAL Files not cleaned ", 0,log.walFile2Props.size()); + assertEquals("WAL Files not cleaned ", 0, log.walFile2Props.size()); region.close(); } }