Skip to content

Commit 2df679a

Browse files
authored
Merge 6fdab5c into 7e9c6f3
2 parents 7e9c6f3 + 6fdab5c commit 2df679a

4 files changed

Lines changed: 204 additions & 33 deletions

File tree

NEWS.adoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ https://github.com/networkupstools/nut/milestone/12
119119
and `upscli_set_debug_level()` methods to facilitate NUT debugging
120120
for clients built with shared NUT private libraries. [issues #3328,
121121
#1771, #2800, PR #3330]
122+
* Revised OpenSSL handshake to support retries requested by the library,
123+
so it at least works reliably on different platforms within the scope
124+
it has now. [issue #3331, PR #3344]
122125

123126
- NUT for Windows specific updates:
124127
* Revised detection of (relative) paths to program and configuration files

clients/upsclient.c

Lines changed: 99 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include <unistd.h>
4040

4141
#ifndef WIN32
42+
# include <sys/select.h> /* fd_set and select(); (or sys/time.h on older BSDs) */
4243
# include <netdb.h>
4344
# include <sys/socket.h>
4445
# include <netinet/in.h>
@@ -913,20 +914,105 @@ static int upscli_sslinit(UPSCONN_t *ups, int verifycert)
913914
SSL_set_verify(ups->ssl, SSL_VERIFY_NONE, NULL);
914915
}
915916

916-
res = SSL_connect(ups->ssl);
917-
switch(res)
917+
/* SSL_connect() on a non-blocking socket requires a retry loop.
918+
* When SSL_connect() returns -1 with SSL_ERROR_WANT_READ or
919+
* SSL_ERROR_WANT_WRITE it is signalling a non-fatal "not done yet"
920+
* condition: the TLS handshake needs more I/O turns to complete.
921+
* The correct response is to wait for the fd to become ready in the
922+
* indicated direction and call SSL_connect() again with the SAME ssl
923+
* object (per OpenSSL docs for all versions >= 0.9.x).
924+
*
925+
* On Linux the loopback is fast enough that the handshake nearly always
926+
* completes in a single call, masking this requirement. On BSD, macOS,
927+
* illumos/OmniOS/OpenIndiana and other non-Linux platforms the loopback
928+
* socket behaviour differs enough that WANT_READ/WANT_WRITE are returned
929+
* regularly, causing the previous single-shot code to treat a transient
930+
* condition as a fatal error and tear down the connection.
931+
*
932+
* The retry behaviour and the SSL_ERROR_WANT_* codes are identical
933+
* across all supported OpenSSL versions (0.9.x, 1.0.x, 1.1.x, 3.x):
934+
* the API contract has never changed in this regard.
935+
*/
918936
{
919-
case 1:
920-
upsdebugx(3, "SSL connected (%s)", SSL_get_version(ups->ssl));
921-
break;
922-
case 0:
923-
upsdebug_with_errno(1, "SSL_connect do not accept handshake.");
924-
ssl_error(ups->ssl, res);
925-
return -1;
926-
default:
927-
upsdebug_with_errno(1, "Unknown return value from SSL_connect %d", res);
928-
ssl_error(ups->ssl, res);
929-
return -1;
937+
int ssl_err;
938+
int ssl_retries = 0;
939+
/* Cap retries to avoid spinning forever on a broken socket.
940+
* 250 * 20 ms = 5 s maximum wait, which is generous for a
941+
* local handshake while being safe for CI timeouts. */
942+
const int SSL_CONNECT_MAX_RETRIES = 250;
943+
fd_set fds;
944+
struct timeval tv;
945+
946+
res = -1;
947+
while (ssl_retries < SSL_CONNECT_MAX_RETRIES) {
948+
res = SSL_connect(ups->ssl);
949+
950+
if (res == 1) {
951+
upsdebugx(3, "SSL connected (%s)",
952+
SSL_get_version(ups->ssl));
953+
break;
954+
}
955+
956+
ssl_err = SSL_get_error(ups->ssl, res);
957+
958+
if (ssl_err == SSL_ERROR_WANT_READ
959+
|| ssl_err == SSL_ERROR_WANT_WRITE
960+
) {
961+
/* Non-fatal: handshake needs another I/O turn.
962+
* Wait up to 20 ms for the fd to be ready, then
963+
* retry SSL_connect() with the same ssl object. */
964+
FD_ZERO(&fds);
965+
FD_SET(ups->fd, &fds);
966+
tv.tv_sec = 0;
967+
tv.tv_usec = 20000; /* 20 ms */
968+
969+
upsdebugx(4,
970+
"%s: SSL_connect WANT_%s, retry %d/%d",
971+
__func__,
972+
(ssl_err == SSL_ERROR_WANT_READ)
973+
? "READ" : "WRITE",
974+
ssl_retries + 1,
975+
SSL_CONNECT_MAX_RETRIES);
976+
977+
if (select(ups->fd + 1,
978+
(ssl_err == SSL_ERROR_WANT_READ) ? &fds : NULL,
979+
(ssl_err == SSL_ERROR_WANT_WRITE) ? &fds : NULL,
980+
NULL, &tv) < 0
981+
) {
982+
upsdebug_with_errno(1,
983+
"%s: select() failed during SSL_connect",
984+
__func__);
985+
ssl_error(ups->ssl, res);
986+
return -1;
987+
}
988+
ssl_retries++;
989+
continue;
990+
}
991+
992+
/* Any other error is fatal */
993+
if (res == 0) {
994+
upsdebug_with_errno(1,
995+
"%s: SSL_connect did not accept handshake"
996+
" (SSL_ERROR %d)",
997+
__func__, ssl_err);
998+
} else {
999+
upsdebug_with_errno(1,
1000+
"%s: SSL_connect failed"
1001+
" (SSL_ERROR %d)",
1002+
__func__, ssl_err);
1003+
}
1004+
ssl_error(ups->ssl, res);
1005+
return -1;
1006+
}
1007+
1008+
if (ssl_retries >= SSL_CONNECT_MAX_RETRIES) {
1009+
upslogx(LOG_ERR,
1010+
"%s: SSL_connect timed out after %d retries"
1011+
" (non-blocking handshake never completed)",
1012+
__func__, ssl_retries);
1013+
ssl_error(ups->ssl, res);
1014+
return -1;
1015+
}
9301016
}
9311017

9321018
return 1;

server/netssl.c

Lines changed: 98 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#ifndef WIN32
3232
# include <netinet/in.h>
3333
# include <sys/socket.h>
34+
# include <sys/select.h> /* fd_set and select(); (or sys/time.h on older BSDs) */
3435
#else /* WIN32 */
3536
# include "wincompat.h"
3637
#endif /* WIN32 */
@@ -366,25 +367,106 @@ void net_starttls(nut_ctype_t *client, size_t numarg, const char **arg)
366367
return;
367368
}
368369

369-
ret = SSL_accept(client->ssl);
370-
switch (ret)
370+
/* SSL_accept() on a non-blocking socket (which upsd uses) requires a
371+
* retry loop. When SSL_accept() returns -1 with SSL_ERROR_WANT_READ or
372+
* SSL_ERROR_WANT_WRITE it is signalling a non-fatal "not done yet"
373+
* condition: the TLS handshake needs more I/O turns to complete.
374+
* The correct response is to wait for the fd to become ready in the
375+
* indicated direction and call SSL_accept() again with the SAME ssl
376+
* object and arguments (per OpenSSL docs for all versions >= 0.9.x).
377+
*
378+
* On Linux the loopback is fast enough that the handshake nearly always
379+
* completes in a single call, masking this requirement. On BSD, macOS,
380+
* illumos/OmniOS/OpenIndiana and other non-Linux platforms the loopback
381+
* socket behaviour differs enough that WANT_READ/WANT_WRITE are returned
382+
* regularly, causing the previous single-shot code to treat a transient
383+
* condition as a fatal error and tear down the connection.
384+
*
385+
* The retry behaviour and the SSL_ERROR_WANT_* codes are identical
386+
* across all supported OpenSSL versions (0.9.x, 1.0.x, 1.1.x, 3.x):
387+
* the API contract has never changed in this regard.
388+
*/
371389
{
372-
case 1:
373-
client->ssl_connected = 1;
374-
upsdebugx(3, "SSL connected (%s)", SSL_get_version(client->ssl));
375-
break;
390+
int ssl_err;
391+
int ssl_retries = 0;
392+
/* Cap retries to avoid spinning forever on a broken socket.
393+
* 250 * 20 ms = 5 s maximum wait, which is generous for a
394+
* local handshake while being safe for CI timeouts. */
395+
const int SSL_ACCEPT_MAX_RETRIES = 250;
396+
fd_set fds;
397+
struct timeval tv;
398+
399+
ret = -1;
400+
while (ssl_retries < SSL_ACCEPT_MAX_RETRIES) {
401+
ret = SSL_accept(client->ssl);
402+
403+
if (ret == 1) {
404+
client->ssl_connected = 1;
405+
upsdebugx(3, "SSL_accept succeeded (%s)",
406+
SSL_get_version(client->ssl));
407+
break;
408+
}
376409

377-
case 0:
378-
upslog_with_errno(LOG_ERR, "SSL_accept do not accept handshake.");
379-
ssl_error(client->ssl, ret);
380-
break;
410+
ssl_err = SSL_get_error(client->ssl, ret);
381411

382-
case -1:
383-
upslog_with_errno(LOG_ERR, "Unknown return value from SSL_accept");
384-
ssl_error(client->ssl, ret);
385-
break;
386-
default:
387-
break;
412+
if (ssl_err == SSL_ERROR_WANT_READ
413+
|| ssl_err == SSL_ERROR_WANT_WRITE
414+
) {
415+
/* Non-fatal: handshake needs another I/O turn.
416+
* Wait up to 20 ms for the fd to be ready, then
417+
* retry SSL_accept() with the same ssl object. */
418+
FD_ZERO(&fds);
419+
FD_SET(client->sock_fd, &fds);
420+
tv.tv_sec = 0;
421+
tv.tv_usec = 20000; /* 20 ms */
422+
423+
upsdebugx(4,
424+
"%s: SSL_accept WANT_%s, retry %d/%d",
425+
__func__,
426+
(ssl_err == SSL_ERROR_WANT_READ)
427+
? "READ" : "WRITE",
428+
ssl_retries + 1,
429+
SSL_ACCEPT_MAX_RETRIES);
430+
431+
if (select(client->sock_fd + 1,
432+
(ssl_err == SSL_ERROR_WANT_READ) ? &fds : NULL,
433+
(ssl_err == SSL_ERROR_WANT_WRITE) ? &fds : NULL,
434+
NULL, &tv) < 0
435+
) {
436+
upslog_with_errno(LOG_ERR,
437+
"%s: select() failed during SSL_accept",
438+
__func__);
439+
ssl_error(client->ssl, ret);
440+
return;
441+
}
442+
ssl_retries++;
443+
continue;
444+
}
445+
446+
/* Any other error is fatal */
447+
if (ret == 0) {
448+
upslog_with_errno(LOG_ERR,
449+
"%s: SSL_accept did not accept handshake"
450+
" (SSL_ERROR %d)",
451+
__func__, ssl_err);
452+
} else {
453+
upslog_with_errno(LOG_ERR,
454+
"%s: SSL_accept failed"
455+
" (SSL_ERROR %d)",
456+
__func__, ssl_err);
457+
}
458+
ssl_error(client->ssl, ret);
459+
return;
460+
}
461+
462+
if (ssl_retries >= SSL_ACCEPT_MAX_RETRIES) {
463+
upslogx(LOG_ERR,
464+
"%s: SSL_accept timed out after %d retries"
465+
" (non-blocking handshake never completed)",
466+
__func__, ssl_retries);
467+
ssl_error(client->ssl, ret);
468+
return;
469+
}
388470
}
389471

390472
# elif defined(WITH_NSS) /* not WITH_OPENSSL */

tests/NIT/Makefile.am

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ ABS_TOP_BUILDDIR = @ABS_TOP_BUILDDIR@
2727
# Early in issue #1711 OpenSSL tests failed more often than we liked, so
2828
# this toggle was added. It may still be useful for CI on some platforms.
2929
# Developers can run the script directly or override the make variable.
30-
if WITH_OPENSSL
31-
WITHOUT_SSL_TESTS = true
32-
else !WITH_OPENSSL
30+
### if WITH_OPENSSL
31+
### WITHOUT_SSL_TESTS = true
32+
### else !WITH_OPENSSL
3333
WITHOUT_SSL_TESTS = false
34-
endif !WITH_OPENSSL
34+
### endif !WITH_OPENSSL
3535

3636
# Run in builddir, use script from srcdir
3737
# Avoid running "$<" - not all make implementations handle that

0 commit comments

Comments
 (0)