From fb87f548ab4a479615a5a02fae10fb0e554d04e2 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 15 Jun 2022 10:14:44 -0700 Subject: [PATCH 1/2] Increase test timeout and decrease config times --- .../main/compose/upgrade/compose/ha/docker-config | 15 +++++++++++++++ .../src/main/smoketest/upgrade/finalize.robot | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config b/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config index 09ed5ab73cca..5b91fde61b56 100644 --- a/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config +++ b/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config @@ -34,6 +34,21 @@ OZONE-SITE.XML_ozone.scm.client.address=scm OZONE-SITE.XML_hdds.datanode.dir=/data/hdds +# If SCM sends container close commands as part of upgrade finalization while +# datanodes are doing a leader election, all 3 replicas may end up in the +# CLOSING state. The replication manager must be running to later move them to +# a CLOSED state so the datanodes can progress with finalization. +# +# This config sets the amount of time SCM will wait after safemode exit to +# start the replication manager and pipeline scrubber. The default of 5 minutes +# is fine in real clusters to prevent unnecessary over-replication, +# but it is too long for this test. +OZONE-SITE.XML_hdds.scm.wait.time.after.safemode.exit=5s +# If datanodes take too long to close pipelines during finalization, let the +# scrubber force close them to move the test forward. +OZONE-SITE.XML_ozone.scm.pipeline.scrub.interval=1m +OZONE-SITE.XML_ozone.scm.pipeline.allocated.timeout=1m + OZONE-SITE.XML_ozone.recon.db.dir=/data/metadata/recon OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m diff --git a/hadoop-ozone/dist/src/main/smoketest/upgrade/finalize.robot b/hadoop-ozone/dist/src/main/smoketest/upgrade/finalize.robot index b70f3ca14781..521147ff6a62 100644 --- a/hadoop-ozone/dist/src/main/smoketest/upgrade/finalize.robot +++ b/hadoop-ozone/dist/src/main/smoketest/upgrade/finalize.robot @@ -16,7 +16,7 @@ *** Settings *** Documentation Finalize Upgrade of OMs and SCM Resource ../commonlib.robot -Test Timeout 5 minutes +Test Timeout 10 minutes Test Setup Run Keyword if '${SECURITY_ENABLED}' == 'true' Kinit test user testuser testuser.keytab *** Test Cases *** From 16fe9d5703f5ada1c4bd4327138895a3ac5e943b Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 15 Jun 2022 10:23:20 -0700 Subject: [PATCH 2/2] More graceful pipeline scrubbing timeout --- .../dist/src/main/compose/upgrade/compose/ha/docker-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config b/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config index 5b91fde61b56..4b53107b6dff 100644 --- a/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config +++ b/hadoop-ozone/dist/src/main/compose/upgrade/compose/ha/docker-config @@ -47,7 +47,7 @@ OZONE-SITE.XML_hdds.scm.wait.time.after.safemode.exit=5s # If datanodes take too long to close pipelines during finalization, let the # scrubber force close them to move the test forward. OZONE-SITE.XML_ozone.scm.pipeline.scrub.interval=1m -OZONE-SITE.XML_ozone.scm.pipeline.allocated.timeout=1m +OZONE-SITE.XML_ozone.scm.pipeline.allocated.timeout=2m OZONE-SITE.XML_ozone.recon.db.dir=/data/metadata/recon OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m