From 8933f9f8538408776892598817f90117654db04c Mon Sep 17 00:00:00 2001 From: Daniel Watkins Date: Fri, 5 Feb 2021 09:55:36 -0500 Subject: [PATCH 1/4] instance: only retry for 10 minutes in _ssh_connect The previous implementation assumed that `client.connect` would take ~no time. This isn't the case: it will sometimes block for tens of seconds, which meant that the "10 minute" wait could sometimes be substantially longer than that. --- pycloudlib/instance.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pycloudlib/instance.py b/pycloudlib/instance.py index ed8f4b2b..726f3f09 100644 --- a/pycloudlib/instance.py +++ b/pycloudlib/instance.py @@ -312,9 +312,10 @@ def _ssh_connect(self): except SSHException: pass - retries = 60 + start = time.time() + end = start + 600 last_exception = None - while retries: + while True: try: client.connect( username=self.username, @@ -328,13 +329,15 @@ def _ssh_connect(self): except (ConnectionRefusedError, AuthenticationException, BadHostKeyException, ConnectionResetError, SSHException, OSError) as e: - self._log.info( - "%s\nRetrying ssh connection %d more time(s) to %s@%s:%s", - e, retries, self.username, self.ip, self.port - ) last_exception = e - retries -= 1 - time.sleep(10) + if time.time() > end: + break + self._log.info( + "%s\nRetrying SSH connection to %s@%s:%s (%ds left)", + last_exception, self.username, self.ip, self.port, + end - time.time() + ) + time.sleep(10) self._log.error('Failed ssh connection to %s@%s:%s after 10 minutes', self.username, self.ip, self.port) From 5d1b1cc81629ac0c614bbef4ec2d2fcfad7b1168 Mon Sep 17 00:00:00 2001 From: Daniel Watkins Date: Fri, 5 Feb 2021 13:11:41 -0500 Subject: [PATCH 2/4] instance: retry more frequently in _ssh_connect --- pycloudlib/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycloudlib/instance.py b/pycloudlib/instance.py index 726f3f09..becdcd48 100644 --- a/pycloudlib/instance.py +++ 
b/pycloudlib/instance.py @@ -337,7 +337,7 @@ def _ssh_connect(self): last_exception, self.username, self.ip, self.port, end - time.time() ) - time.sleep(10) + time.sleep(1) self._log.error('Failed ssh connection to %s@%s:%s after 10 minutes', self.username, self.ip, self.port) From 5d927b2fc1cbbd309c7343508da3683faba0e63c Mon Sep 17 00:00:00 2001 From: Daniel Watkins Date: Fri, 5 Feb 2021 09:57:21 -0500 Subject: [PATCH 3/4] instance: retry more frequently in _wait_for_execute `self.execute` generally fails the first time it's executed, but will often succeed much sooner than 10s after: reducing the interval between attempts reduces the time we wait for boot of instances. --- pycloudlib/instance.py | 4 ++-- pycloudlib/tests/test_instance.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pycloudlib/instance.py b/pycloudlib/instance.py index becdcd48..260f29dc 100644 --- a/pycloudlib/instance.py +++ b/pycloudlib/instance.py @@ -375,7 +375,7 @@ def _wait_for_execute(self): test_instance_command = "whoami" result = self.execute(test_instance_command) if result.failed: - retries = 10 + retries = 100 while retries: result = self.execute(test_instance_command) @@ -383,7 +383,7 @@ def _wait_for_execute(self): break retries -= 1 - time.sleep(10) + time.sleep(1) if result.failed: raise OSError( diff --git a/pycloudlib/tests/test_instance.py b/pycloudlib/tests/test_instance.py index ab42c519..c4387634 100644 --- a/pycloudlib/tests/test_instance.py +++ b/pycloudlib/tests/test_instance.py @@ -52,14 +52,14 @@ def test_wait_execute_failure( expected_msg = "{}\n{}".format( "Instance can't be reached", "Failed to execute whoami command" ) - expected_call_args = [mock.call("whoami")] * 11 + expected_call_args = [mock.call("whoami")] * 101 with pytest.raises(OSError) as excinfo: instance.wait() assert expected_msg == str(excinfo.value) assert expected_call_args == m_execute.call_args_list - assert m_sleep.call_count == 10 + assert m_sleep.call_count == 100 
class TestWaitForCloudinit: From bd7cdc04ea1d6494e44b12a21ff1130dd941a2eb Mon Sep 17 00:00:00 2001 From: Daniel Watkins Date: Fri, 5 Feb 2021 09:59:38 -0500 Subject: [PATCH 4/4] lxd/instance: retry more frequently in .ip LXD instances are very unlikely to have an IP address when we first launch them, but they generally gain an IP address much earlier than 20s after boot. Reducing this interval substantially speeds up booting LXD instances accessed via SSH. --- pycloudlib/lxd/instance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycloudlib/lxd/instance.py b/pycloudlib/lxd/instance.py index 0f223087..ccad1380 100644 --- a/pycloudlib/lxd/instance.py +++ b/pycloudlib/lxd/instance.py @@ -73,7 +73,7 @@ def ip(self): IP address assigned to instance. """ - retries = 5 + retries = 150 while retries != 0: command = 'lxc list {} -c 4 --format csv'.format(self.name) @@ -83,7 +83,7 @@ def ip(self): break retries -= 1 - time.sleep(20) + time.sleep(1) ip_address = result.split()[0] return ip_address