From 4c8591a96bb50f000b9323004a3d3aba6868fdea Mon Sep 17 00:00:00 2001 From: Enrico Olivelli Date: Mon, 28 Mar 2022 09:45:23 +0200 Subject: [PATCH 1/3] BookieAutoRecoveryTest.testEmptyLedgerLosesQuorumEventually fix flaky test, ensure that the Auditor is alive --- .../apache/bookkeeper/proto/BookieServer.java | 3 +- .../replication/AutoRecoveryMain.java | 5 ++++ .../replication/ReplicationWorker.java | 11 ++++++- .../replication/BookieAutoRecoveryTest.java | 6 +++- .../test/BookKeeperClusterTestCase.java | 29 ++++++++++++++++++- 5 files changed, 50 insertions(+), 4 deletions(-) diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java index 2c1eb3a7d42..2cba258dba2 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java @@ -270,7 +270,8 @@ public void run() { Thread.currentThread().interrupt(); } if (!isBookieRunning()) { - LOG.info("BookieDeathWatcher noticed the bookie is not running any more, exiting the watch loop!"); + LOG.info("BookieDeathWatcher noticed the bookie {} is not running any more, " + + "exiting the watch loop!", bookie); // death watcher has noticed that bookie is not running any more // throw an exception to fail the death watcher thread and it will // trigger the uncaught exception handler to handle this "bookie not running" situation. diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java index 39d1d740c61..0f2b2c009a1 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java @@ -185,6 +185,11 @@ public Auditor getAuditor() { return auditorElector.getAuditor(); } + @VisibleForTesting + public ReplicationWorker getReplicationWorker() { + return replicationWorker; + } + /** Is auto-recovery service running? */ public boolean isAutoRecoveryRunning() { return running; diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java index 6509cdd7996..6f41fe4fffe 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java @@ -25,6 +25,8 @@ import static org.apache.bookkeeper.replication.ReplicationStats.REPLICATE_EXCEPTION; import static org.apache.bookkeeper.replication.ReplicationStats.REPLICATION_WORKER_SCOPE; import static org.apache.bookkeeper.replication.ReplicationStats.REREPLICATE_OP; + +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Stopwatch; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; @@ -244,6 +246,12 @@ public void run() { LOG.error("UnavailableException " + "while replicating fragments", e); waitBackOffTime(rwRereplicateBackoffMs); + if (Thread.currentThread().isInterrupted()) { + LOG.error("Interrupted while replicating fragments"); + shutdown(); + Thread.currentThread().interrupt(); + return; + } } } LOG.info("ReplicationWorker exited loop!"); @@ -646,7 +654,8 @@ public void shutdown() { /** * Gives the running status of ReplicationWorker. */ - boolean isRunning() { + @VisibleForTesting + public boolean isRunning() { return workerRunning && workerThread.isAlive(); } diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java index 05937fe43e9..a8e89dcf0cf 100644 --- a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java +++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java @@ -403,8 +403,12 @@ public void testEmptyLedgerLosesQuorumEventually() throws Exception { assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS)); latch = new CountDownLatch(1); s = watchUrLedgerNode(urZNode, latch); // should be marked as replicated + + startNewBookie(); + getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs + if (s != null) { - assertTrue("Should be marked as replicated", latch.await(5, TimeUnit.SECONDS)); + assertTrue("Should be marked as replicated", latch.await(20, TimeUnit.SECONDS)); } // should be able to open ledger without issue diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java index aa4b510809b..1b14f172550 100644 --- a/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java +++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java @@ -73,6 +73,7 @@ import org.apache.bookkeeper.proto.BookieServer; import org.apache.bookkeeper.replication.Auditor; import org.apache.bookkeeper.replication.AutoRecoveryMain; +import org.apache.bookkeeper.replication.ReplicationWorker; import org.apache.bookkeeper.server.Main; import org.apache.bookkeeper.stats.StatsLogger; import org.apache.bookkeeper.util.DiskChecker; @@ -426,6 +427,11 @@ public ServerConfiguration getBkConf(BookieId addr) throws Exception { public ServerConfiguration killBookie(BookieId addr) throws Exception { Optional tester = byAddress(addr); if (tester.isPresent()) { + if (tester.get().autoRecovery != null + && tester.get().autoRecovery.getAuditor() != null + && tester.get().autoRecovery.getAuditor().isRunning()) { + LOG.warn("Killing bookie {} who is the current Auditor", addr); + } servers.remove(tester.get()); tester.get().shutdown(); return tester.get().getConfiguration(); @@ -763,7 +769,20 @@ public Auditor getAuditor(int timeout, TimeUnit unit) throws Exception { while (System.nanoTime() < timeoutAt) { for (ServerTester t : servers) { Auditor a = t.getAuditor(); - if (a != null) { + ReplicationWorker replicationWorker = t.getReplicationWorker(); + + // found a candidate Auditor + ReplicationWorker + if (a != null && a.isRunning() + && replicationWorker != null && replicationWorker.isRunning()) { + int deathWatchInterval = t.getConfiguration().getDeathWatchInterval(); + Thread.sleep(deathWatchInterval + 1000); + } + + // double check, because in the meantime AutoRecoveryDeathWatcher may have killed the + // AutoRecovery daemon + if (a != null && a.isRunning() + && replicationWorker != null && replicationWorker.isRunning()) { + LOG.info("Found Auditor Bookie {}", t.server.getBookieId()); return a; } } @@ -899,6 +918,14 @@ Auditor getAuditor() { } } + ReplicationWorker getReplicationWorker() { + if (autoRecovery != null) { + return autoRecovery.getReplicationWorker(); + } else { + return null; + } + } + ServerConfiguration getConfiguration() { return conf; } From 508a891daca557abcfae9d326f9a8fa5bc89322d Mon Sep 17 00:00:00 2001 From: Enrico Olivelli Date: Mon, 28 Mar 2022 11:17:44 +0200 Subject: [PATCH 2/3] restore line --- .../main/java/org/apache/bookkeeper/proto/BookieServer.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java index 2cba258dba2..2c1eb3a7d42 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/BookieServer.java @@ -270,8 +270,7 @@ public void run() { Thread.currentThread().interrupt(); } if (!isBookieRunning()) { - LOG.info("BookieDeathWatcher noticed the bookie {} is not running any more, " + - "exiting the watch loop!", bookie); + LOG.info("BookieDeathWatcher noticed the bookie is not running any more, exiting the watch loop!"); // death watcher has noticed that bookie is not running any more // throw an exception to fail the death watcher thread and it will // trigger the uncaught exception handler to handle this "bookie not running" situation. From 3c12d0842dad45d9d724c9272636788f1ac849ce Mon Sep 17 00:00:00 2001 From: Enrico Olivelli Date: Mon, 28 Mar 2022 12:01:40 +0200 Subject: [PATCH 3/3] Remove useless line --- .../org/apache/bookkeeper/replication/ReplicationWorker.java | 1 - 1 file changed, 1 deletion(-) diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java index 6f41fe4fffe..d897e77ea7c 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java @@ -249,7 +249,6 @@ public void run() { if (Thread.currentThread().isInterrupted()) { LOG.error("Interrupted while replicating fragments"); shutdown(); - Thread.currentThread().interrupt(); return; } }