Skip to content

Commit b4104c9

Browse files
authored
Merge 2107cbc into 7e9c6f3
2 parents 7e9c6f3 + 2107cbc commit b4104c9

4 files changed

Lines changed: 202 additions & 33 deletions

File tree

NEWS.adoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ https://github.com/networkupstools/nut/milestone/12
119119
and `upscli_set_debug_level()` methods to facilitate NUT debugging
120120
for clients built with shared NUT private libraries. [issues #3328,
121121
#1771, #2800, PR #3330]
122+
* Revised OpenSSL handshake to support retries requested by the library,
123+
so it at least works reliably on different platforms within the scope
124+
it has now. [issue #3331]
122125

123126
- NUT for Windows specific updates:
124127
* Revised detection of (relative) paths to program and configuration files

clients/upsclient.c

Lines changed: 98 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <stdlib.h>
3838
#include <string.h>
3939
#include <unistd.h>
40+
#include <sys/select.h> /* fd_set and select(); (or sys/time.h on older BSDs) */
4041

4142
#ifndef WIN32
4243
# include <netdb.h>
@@ -913,20 +914,104 @@ static int upscli_sslinit(UPSCONN_t *ups, int verifycert)
913914
SSL_set_verify(ups->ssl, SSL_VERIFY_NONE, NULL);
914915
}
915916

916-
res = SSL_connect(ups->ssl);
917-
switch(res)
917+
/* SSL_connect() on a non-blocking socket requires a retry loop.
918+
* When SSL_connect() returns -1 with SSL_ERROR_WANT_READ or
919+
* SSL_ERROR_WANT_WRITE it is signalling a non-fatal "not done yet"
920+
* condition: the TLS handshake needs more I/O turns to complete.
921+
* The correct response is to wait for the fd to become ready in the
922+
* indicated direction and call SSL_connect() again with the SAME ssl
923+
* object (per OpenSSL docs for all versions >= 0.9.x).
924+
*
925+
* On Linux the loopback is fast enough that the handshake nearly always
926+
* completes in a single call, masking this requirement. On BSD, macOS,
927+
* illumos/OmniOS/OpenIndiana and other non-Linux platforms the loopback
928+
* socket behaviour differs enough that WANT_READ/WANT_WRITE are returned
929+
* regularly, causing the previous single-shot code to treat a transient
930+
* condition as a fatal error and tear down the connection.
931+
*
932+
* The retry behaviour and the SSL_ERROR_WANT_* codes are identical
933+
* across all supported OpenSSL versions (0.9.x, 1.0.x, 1.1.x, 3.x):
934+
* the API contract has never changed in this regard.
935+
*/
918936
{
919-
case 1:
920-
upsdebugx(3, "SSL connected (%s)", SSL_get_version(ups->ssl));
921-
break;
922-
case 0:
923-
upsdebug_with_errno(1, "SSL_connect do not accept handshake.");
924-
ssl_error(ups->ssl, res);
925-
return -1;
926-
default:
927-
upsdebug_with_errno(1, "Unknown return value from SSL_connect %d", res);
928-
ssl_error(ups->ssl, res);
929-
return -1;
937+
int ssl_err;
938+
int ssl_retries = 0;
939+
/* Cap retries to avoid spinning forever on a broken socket.
940+
* 250 * 20 ms = 5 s maximum wait, which is generous for a
941+
* local handshake while being safe for CI timeouts. */
942+
const int SSL_CONNECT_MAX_RETRIES = 250;
943+
fd_set fds;
944+
struct timeval tv;
945+
946+
while (ssl_retries < SSL_CONNECT_MAX_RETRIES) {
947+
res = SSL_connect(ups->ssl);
948+
949+
if (res == 1) {
950+
upsdebugx(3, "SSL connected (%s)",
951+
SSL_get_version(ups->ssl));
952+
break;
953+
}
954+
955+
ssl_err = SSL_get_error(ups->ssl, res);
956+
957+
if (ssl_err == SSL_ERROR_WANT_READ
958+
|| ssl_err == SSL_ERROR_WANT_WRITE
959+
) {
960+
/* Non-fatal: handshake needs another I/O turn.
961+
* Wait up to 20 ms for the fd to be ready, then
962+
* retry SSL_connect() with the same ssl object. */
963+
FD_ZERO(&fds);
964+
FD_SET(ups->fd, &fds);
965+
tv.tv_sec = 0;
966+
tv.tv_usec = 20000; /* 20 ms */
967+
968+
upsdebugx(4,
969+
"%s: SSL_connect WANT_%s, retry %d/%d",
970+
__func__,
971+
(ssl_err == SSL_ERROR_WANT_READ)
972+
? "READ" : "WRITE",
973+
ssl_retries + 1,
974+
SSL_CONNECT_MAX_RETRIES);
975+
976+
if (select(ups->fd + 1,
977+
(ssl_err == SSL_ERROR_WANT_READ) ? &fds : NULL,
978+
(ssl_err == SSL_ERROR_WANT_WRITE) ? &fds : NULL,
979+
NULL, &tv) < 0
980+
) {
981+
upsdebug_with_errno(1,
982+
"%s: select() failed during SSL_connect",
983+
__func__);
984+
ssl_error(ups->ssl, res);
985+
return -1;
986+
}
987+
ssl_retries++;
988+
continue;
989+
}
990+
991+
/* Any other error is fatal */
992+
if (res == 0) {
993+
upsdebug_with_errno(1,
994+
"%s: SSL_connect did not accept handshake"
995+
" (SSL_ERROR %d)",
996+
__func__, ssl_err);
997+
} else {
998+
upsdebug_with_errno(1,
999+
"%s: SSL_connect failed"
1000+
" (SSL_ERROR %d)",
1001+
__func__, ssl_err);
1002+
}
1003+
ssl_error(ups->ssl, res);
1004+
return -1;
1005+
}
1006+
1007+
if (ssl_retries >= SSL_CONNECT_MAX_RETRIES) {
1008+
upslogx(LOG_ERR,
1009+
"%s: SSL_connect timed out after %d retries"
1010+
" (non-blocking handshake never completed)",
1011+
__func__, ssl_retries);
1012+
ssl_error(ups->ssl, res);
1013+
return -1;
1014+
}
9301015
}
9311016

9321017
return 1;

server/netssl.c

Lines changed: 97 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "common.h" /* for upsdebugx() etc */
2929

3030
#include <sys/types.h>
31+
#include <sys/select.h> /* fd_set and select(); (or sys/time.h on older BSDs) */
3132
#ifndef WIN32
3233
# include <netinet/in.h>
3334
# include <sys/socket.h>
@@ -366,25 +367,105 @@ void net_starttls(nut_ctype_t *client, size_t numarg, const char **arg)
366367
return;
367368
}
368369

369-
ret = SSL_accept(client->ssl);
370-
switch (ret)
370+
/* SSL_accept() on a non-blocking socket (which upsd uses) requires a
371+
* retry loop. When SSL_accept() returns -1 with SSL_ERROR_WANT_READ or
372+
* SSL_ERROR_WANT_WRITE it is signalling a non-fatal "not done yet"
373+
* condition: the TLS handshake needs more I/O turns to complete.
374+
* The correct response is to wait for the fd to become ready in the
375+
* indicated direction and call SSL_accept() again with the SAME ssl
376+
* object and arguments (per OpenSSL docs for all versions >= 0.9.x).
377+
*
378+
* On Linux the loopback is fast enough that the handshake nearly always
379+
* completes in a single call, masking this requirement. On BSD, macOS,
380+
* illumos/OmniOS/OpenIndiana and other non-Linux platforms the loopback
381+
* socket behaviour differs enough that WANT_READ/WANT_WRITE are returned
382+
* regularly, causing the previous single-shot code to treat a transient
383+
* condition as a fatal error and tear down the connection.
384+
*
385+
* The retry behaviour and the SSL_ERROR_WANT_* codes are identical
386+
* across all supported OpenSSL versions (0.9.x, 1.0.x, 1.1.x, 3.x):
387+
* the API contract has never changed in this regard.
388+
*/
371389
{
372-
case 1:
373-
client->ssl_connected = 1;
374-
upsdebugx(3, "SSL connected (%s)", SSL_get_version(client->ssl));
375-
break;
390+
int ssl_err;
391+
int ssl_retries = 0;
392+
/* Cap retries to avoid spinning forever on a broken socket.
393+
* 250 * 20 ms = 5 s maximum wait, which is generous for a
394+
* local handshake while being safe for CI timeouts. */
395+
const int SSL_ACCEPT_MAX_RETRIES = 250;
396+
fd_set fds;
397+
struct timeval tv;
398+
399+
while (ssl_retries < SSL_ACCEPT_MAX_RETRIES) {
400+
ret = SSL_accept(client->ssl);
401+
402+
if (ret == 1) {
403+
client->ssl_connected = 1;
404+
upsdebugx(3, "SSL_accept succeeded (%s)",
405+
SSL_get_version(client->ssl));
406+
break;
407+
}
376408

377-
case 0:
378-
upslog_with_errno(LOG_ERR, "SSL_accept do not accept handshake.");
379-
ssl_error(client->ssl, ret);
380-
break;
409+
ssl_err = SSL_get_error(client->ssl, ret);
381410

382-
case -1:
383-
upslog_with_errno(LOG_ERR, "Unknown return value from SSL_accept");
384-
ssl_error(client->ssl, ret);
385-
break;
386-
default:
387-
break;
411+
if (ssl_err == SSL_ERROR_WANT_READ
412+
|| ssl_err == SSL_ERROR_WANT_WRITE
413+
) {
414+
/* Non-fatal: handshake needs another I/O turn.
415+
* Wait up to 20 ms for the fd to be ready, then
416+
* retry SSL_accept() with the same ssl object. */
417+
FD_ZERO(&fds);
418+
FD_SET(client->sock_fd, &fds);
419+
tv.tv_sec = 0;
420+
tv.tv_usec = 20000; /* 20 ms */
421+
422+
upsdebugx(4,
423+
"%s: SSL_accept WANT_%s, retry %d/%d",
424+
__func__,
425+
(ssl_err == SSL_ERROR_WANT_READ)
426+
? "READ" : "WRITE",
427+
ssl_retries + 1,
428+
SSL_ACCEPT_MAX_RETRIES);
429+
430+
if (select(client->sock_fd + 1,
431+
(ssl_err == SSL_ERROR_WANT_READ) ? &fds : NULL,
432+
(ssl_err == SSL_ERROR_WANT_WRITE) ? &fds : NULL,
433+
NULL, &tv) < 0
434+
) {
435+
upslog_with_errno(LOG_ERR,
436+
"%s: select() failed during SSL_accept",
437+
__func__);
438+
ssl_error(client->ssl, ret);
439+
return;
440+
}
441+
ssl_retries++;
442+
continue;
443+
}
444+
445+
/* Any other error is fatal */
446+
if (ret == 0) {
447+
upslog_with_errno(LOG_ERR,
448+
"%s: SSL_accept did not accept handshake"
449+
" (SSL_ERROR %d)",
450+
__func__, ssl_err);
451+
} else {
452+
upslog_with_errno(LOG_ERR,
453+
"%s: SSL_accept failed"
454+
" (SSL_ERROR %d)",
455+
__func__, ssl_err);
456+
}
457+
ssl_error(client->ssl, ret);
458+
return;
459+
}
460+
461+
if (ssl_retries >= SSL_ACCEPT_MAX_RETRIES) {
462+
upslogx(LOG_ERR,
463+
"%s: SSL_accept timed out after %d retries"
464+
" (non-blocking handshake never completed)",
465+
__func__, ssl_retries);
466+
ssl_error(client->ssl, ret);
467+
return;
468+
}
388469
}
389470

390471
# elif defined(WITH_NSS) /* not WITH_OPENSSL */

tests/NIT/Makefile.am

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ ABS_TOP_BUILDDIR = @ABS_TOP_BUILDDIR@
2727
# Early in issue #1711 OpenSSL tests failed more often than we liked, so
2828
# this toggle was added. It may still be useful for CI on some platforms.
2929
# Developers can run the script directly or override the make variable.
30-
if WITH_OPENSSL
31-
WITHOUT_SSL_TESTS = true
32-
else !WITH_OPENSSL
30+
### if WITH_OPENSSL
31+
### WITHOUT_SSL_TESTS = true
32+
### else !WITH_OPENSSL
3333
WITHOUT_SSL_TESTS = false
34-
endif !WITH_OPENSSL
34+
### endif !WITH_OPENSSL
3535

3636
# Run in builddir, use script from srcdir
3737
# Avoid running "$<" - not all make implementations handle that

0 commit comments

Comments
 (0)