From 335b4d5abc5306dc22f1ec0ac164403faca2b862 Mon Sep 17 00:00:00 2001 From: cyli Date: Wed, 1 Feb 2017 02:47:42 +0400 Subject: [PATCH 1/4] Do slightly more validation of the join token when downloading the root certificate so it's more obvious whether a swarm token is mistyped (for instance, copy-and-pasted wrong). Signed-off-by: cyli --- ca/config.go | 2 +- ca/config_test.go | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ca/config.go b/ca/config.go index d2664bd635..d8fc833d5a 100644 --- a/ca/config.go +++ b/ca/config.go @@ -189,7 +189,7 @@ func GenerateJoinToken(rootCA *RootCA) string { func getCAHashFromToken(token string) (digest.Digest, error) { split := strings.Split(token, "-") - if len(split) != 4 || split[0] != "SWMTKN" || split[1] != "1" { + if len(split) != 4 || split[0] != "SWMTKN" || split[1] != "1" || len(split[2]) != base36DigestLen || len(split[3]) != maxGeneratedSecretLength { return "", errors.New("invalid join token") } diff --git a/ca/config_test.go b/ca/config_test.go index 2e5bea9f46..cb85d0fb46 100644 --- a/ca/config_test.go +++ b/ca/config_test.go @@ -54,9 +54,14 @@ func TestDownloadRootCAWrongCAHash(t *testing.T) { os.RemoveAll(tc.Paths.RootCA.Cert) // invalid token - _, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, "invalidtoken", tc.ConnBroker) - require.Error(t, err) - require.Contains(t, err.Error(), "invalid join token") + for _, invalid := range []string{ + "invalidtoken", // completely invalid + "SWMTKN-1-3wkodtpeoipd1u1hi0ykdcdwhw16dk73ulqqtn14b3indz68rf-4myj5xihyto11dg1cn55w8p6", // mistyped + } { + _, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, invalid, tc.ConnBroker) + require.Error(t, err) + require.Contains(t, err.Error(), "invalid join token") + } // invalid hash token splitToken := strings.Split(tc.ManagerToken, "-") @@ -65,7 +70,7 @@ func TestDownloadRootCAWrongCAHash(t *testing.T) { os.RemoveAll(tc.Paths.RootCA.Cert) - _, err = ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, replacementToken, tc.ConnBroker) + _, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, replacementToken, tc.ConnBroker) require.Error(t, err) require.Contains(t, err.Error(), "remote CA does not match fingerprint.") } From aefc3e91d98def2e68f32ecc83bcca76edd77218 Mon Sep 17 00:00:00 2001 From: cyli Date: Tue, 31 Jan 2017 17:37:25 -0800 Subject: [PATCH 2/4] Include timestamp information when validating a TLS certificate fails with an expiry error. Signed-off-by: cyli --- ca/certificates.go | 18 +++++++++++++++++- ca/config.go | 2 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/ca/certificates.go b/ca/certificates.go index 7258e30593..acc6dc6a27 100644 --- a/ca/certificates.go +++ b/ca/certificates.go @@ -151,6 +151,22 @@ func (rca *RootCA) IssueAndSaveNewCertificates(kw KeyWriter, cn, ou, org string) return &tlsKeyPair, nil } +// Normally we can just call cert.Verify(opts), but since we actually want more information about +// whether a certificate is not yet valid or expired, we also need to perform the expiry checks ourselves. +func verifyCertificate(cert *x509.Certificate, opts x509.VerifyOptions) error { + _, err := cert.Verify(opts) + if invalidErr, ok := err.(x509.CertificateInvalidError); ok && invalidErr.Reason == x509.Expired { + now := time.Now().UTC() + if now.Before(cert.NotBefore) { + return errors.Wrapf(err, "certificate not valid before %s, and it is currently %s", + cert.NotBefore.UTC().Format(time.RFC1123), now.Format(time.RFC1123)) + } + return errors.Wrapf(err, "certificate expires at %s, and it is currently %s", + cert.NotAfter.UTC().Format(time.RFC1123), now.Format(time.RFC1123)) + } + return err +} + // RequestAndSaveNewCertificates gets new certificates issued, either by signing them locally if a signer is // available, or by requesting them from the remote server at remoteAddr. func (rca *RootCA) RequestAndSaveNewCertificates(ctx context.Context, kw KeyWriter, config CertificateRequestConfig) (*tls.Certificate, error) { @@ -199,7 +215,7 @@ func (rca *RootCA) RequestAndSaveNewCertificates(ctx context.Context, kw KeyWrit Roots: rca.Pool, } // Check to see if this certificate was signed by our CA, and isn't expired - if _, err := X509Cert.Verify(opts); err != nil { + if err := verifyCertificate(X509Cert, opts); err != nil { return nil, err } diff --git a/ca/config.go b/ca/config.go index d8fc833d5a..48d4431e5f 100644 --- a/ca/config.go +++ b/ca/config.go @@ -273,7 +273,7 @@ func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) } // Check to see if this certificate was signed by our CA, and isn't expired - if _, err := X509Cert.Verify(opts); err != nil { + if err := verifyCertificate(X509Cert, opts); err != nil { return nil, err } From d1d1ae6c47f14102802cd7d5fba58886bb855c55 Mon Sep 17 00:00:00 2001 From: cyli Date: Wed, 1 Feb 2017 13:41:31 -0800 Subject: [PATCH 3/4] LoadSecurityConfig can now take a parameter that allows the expiry check to be bypassed (although the NotBefore time is still checked - this just bypasses the NotAfter check). This is needed in order to load TLS certificates that may be expired when forcing a new cluster. Signed-off-by: cyli --- ca/certificates.go | 7 +++-- ca/config.go | 4 +-- ca/config_test.go | 77 ++++++++++++++++++++++++++++++++++++++++++++-- node/node.go | 4 +-- 4 files changed, 83 insertions(+), 9 deletions(-) diff --git a/ca/certificates.go b/ca/certificates.go index acc6dc6a27..63fa323fb5 100644 --- a/ca/certificates.go +++ b/ca/certificates.go @@ -153,7 +153,7 @@ func (rca *RootCA) IssueAndSaveNewCertificates(kw KeyWriter, cn, ou, org string) // Normally we can just call cert.Verify(opts), but since we actually want more information about // whether a certificate is not yet valid or expired, we also need to perform the expiry checks ourselves. -func verifyCertificate(cert *x509.Certificate, opts x509.VerifyOptions) error { +func verifyCertificate(cert *x509.Certificate, opts x509.VerifyOptions, allowExpired bool) error { _, err := cert.Verify(opts) if invalidErr, ok := err.(x509.CertificateInvalidError); ok && invalidErr.Reason == x509.Expired { now := time.Now().UTC() @@ -161,6 +161,9 @@ func verifyCertificate(cert *x509.Certificate, opts x509.VerifyOptions) error { return errors.Wrapf(err, "certificate not valid before %s, and it is currently %s", cert.NotBefore.UTC().Format(time.RFC1123), now.Format(time.RFC1123)) } + if allowExpired { + return nil + } return errors.Wrapf(err, "certificate expires at %s, and it is currently %s", cert.NotAfter.UTC().Format(time.RFC1123), now.Format(time.RFC1123)) } @@ -215,7 +218,7 @@ func (rca *RootCA) RequestAndSaveNewCertificates(ctx context.Context, kw KeyWrit Roots: rca.Pool, } // Check to see if this certificate was signed by our CA, and isn't expired - if err := verifyCertificate(X509Cert, opts); err != nil { + if err := verifyCertificate(X509Cert, opts, false); err != nil { return nil, err } diff --git a/ca/config.go b/ca/config.go index 48d4431e5f..5d5aae3c9f 100644 --- a/ca/config.go +++ b/ca/config.go @@ -242,7 +242,7 @@ func DownloadRootCA(ctx context.Context, paths CertPaths, token string, connBrok // LoadSecurityConfig loads TLS credentials from disk, or returns an error if // these credentials do not exist or are unusable. -func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) (*SecurityConfig, error) { +func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter, allowExpired bool) (*SecurityConfig, error) { ctx = log.WithModule(ctx, "tls") // At this point we've successfully loaded the CA details from disk, or @@ -273,7 +273,7 @@ func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) } // Check to see if this certificate was signed by our CA, and isn't expired - if err := verifyCertificate(X509Cert, opts); err != nil { + if err := verifyCertificate(X509Cert, opts, allowExpired); err != nil { return nil, err } diff --git a/ca/config_test.go b/ca/config_test.go index cb85d0fb46..8d6f3d674f 100644 --- a/ca/config_test.go +++ b/ca/config_test.go @@ -1,7 +1,12 @@ package ca_test import ( + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" "io/ioutil" + "math/big" "os" "strings" "testing" @@ -10,10 +15,12 @@ import ( "golang.org/x/net/context" cfconfig "github.com/cloudflare/cfssl/config" + "github.com/cloudflare/cfssl/helpers" "github.com/docker/swarmkit/ca" "github.com/docker/swarmkit/ca/testutils" "github.com/docker/swarmkit/ioutils" "github.com/docker/swarmkit/manager/state/store" + "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -130,6 +137,70 @@ func TestCreateSecurityConfigNoCerts(t *testing.T) { assert.Equal(t, rootCA, *nodeConfig.RootCA()) } +func TestLoadSecurityConfigExpiredCert(t *testing.T) { + tc := testutils.NewTestCA(t) + defer tc.Stop() + + _, key, err := ca.GenerateNewCSR() + require.NoError(t, err) + require.NoError(t, ioutil.WriteFile(tc.Paths.Node.Key, key, 0600)) + certKey, err := helpers.ParsePrivateKeyPEM(key) + require.NoError(t, err) + + rootKey, err := helpers.ParsePrivateKeyPEM(tc.RootCA.Key) + require.NoError(t, err) + rootCert, err := helpers.ParseCertificatePEM(tc.RootCA.Cert) + require.NoError(t, err) + + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) + require.NoError(t, err) + + genCert := func(notBefore, notAfter time.Time) { + derBytes, err := x509.CreateCertificate(rand.Reader, &x509.Certificate{ + SerialNumber: serialNumber, + Subject: pkix.Name{ + CommonName: "CN", + OrganizationalUnit: []string{"OU"}, + Organization: []string{"ORG"}, + }, + NotBefore: notBefore, + NotAfter: notAfter, + }, rootCert, certKey.Public(), rootKey) + require.NoError(t, err) + certBytes := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE", + Bytes: derBytes, + }) + require.NoError(t, ioutil.WriteFile(tc.Paths.Node.Cert, certBytes, 0644)) + } + + krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil) + now := time.Now() + + // A cert that is not yet valid is not valid even if expiry is allowed + genCert(now.Add(time.Hour), now.Add(time.Hour*2)) + + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) + require.Error(t, err) + require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err)) + + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, true) + require.Error(t, err) + require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err)) + + // a cert that is expired is not valid if expiry is not allowed + genCert(now.Add(time.Hour*-3), now.Add(time.Hour*-1)) + + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) + require.Error(t, err) + require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err)) + + // but it is valid if expiry is allowed + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, true) + require.NoError(t, err) +} + func TestLoadSecurityConfigInvalidCert(t *testing.T) { tc := testutils.NewTestCA(t) defer tc.Stop() @@ -141,7 +212,7 @@ some random garbage\n krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil) - _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw) + _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) assert.Error(t, err) nodeConfig, err := tc.RootCA.CreateSecurityConfig(tc.Context, krw, @@ -167,7 +238,7 @@ some random garbage\n krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil) - _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw) + _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) assert.Error(t, err) nodeConfig, err := tc.RootCA.CreateSecurityConfig(tc.Context, krw, @@ -190,7 +261,7 @@ func TestLoadSecurityConfigIncorrectPassphrase(t *testing.T) { "nodeID", ca.WorkerRole, tc.Organization) require.NoError(t, err) - _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, ca.NewKeyReadWriter(paths.Node, nil, nil)) + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, ca.NewKeyReadWriter(paths.Node, nil, nil), false) require.IsType(t, ca.ErrInvalidKEK{}, err) } diff --git a/node/node.go b/node/node.go index d36d90a5a3..c897f9e4d7 100644 --- a/node/node.go +++ b/node/node.go @@ -568,7 +568,7 @@ func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, erro return nil, err } if err == nil { - securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw) + securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, false) if err != nil { _, isInvalidKEK := errors.Cause(err).(ca.ErrInvalidKEK) if isInvalidKEK { @@ -606,7 +606,7 @@ func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, erro // - We wait for CreateSecurityConfig to finish since we need a certificate to operate. // Attempt to load certificate from disk - securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw) + securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, false) if err == nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": securityConfig.ClientTLSCreds.NodeID(), From d6df02bb3b573989b6fcbfb0025bcf3ff0b20ac3 Mon Sep 17 00:00:00 2001 From: cyli Date: Wed, 1 Feb 2017 17:50:42 -0800 Subject: [PATCH 4/4] Allow nodes that are initialized with ForceNewCluster=true to start with expired TLS certificates. This also changes the certificate renewal loop to continue trying to renew if the TLS certificates are expired, since the renewal will still fail unless the node happens to be the CA server (since other nodes must connect to the CA in order to renew). Signed-off-by: cyli --- ca/config.go | 26 +++++---- integration/cluster.go | 12 +++-- integration/integration_test.go | 94 +++++++++++++++++++++++++++++++-- integration/node.go | 29 ++++++++-- node/node.go | 5 +- 5 files changed, 141 insertions(+), 25 deletions(-) diff --git a/ca/config.go b/ca/config.go index 5d5aae3c9f..38fb4b7ffe 100644 --- a/ca/config.go +++ b/ca/config.go @@ -15,6 +15,7 @@ import ( "github.com/Sirupsen/logrus" cfconfig "github.com/cloudflare/cfssl/config" + events "github.com/docker/go-events" "github.com/docker/swarmkit/api" "github.com/docker/swarmkit/connectionbroker" "github.com/docker/swarmkit/identity" @@ -445,8 +446,15 @@ func RenewTLSConfigNow(ctx context.Context, s *SecurityConfig, connBroker *conne func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connectionbroker.Broker, renew <-chan struct{}) <-chan CertificateUpdate { updates := make(chan CertificateUpdate) + backoffConfig := events.ExponentialBackoffConfig{ + Base: time.Second * 5, + Factor: time.Minute, + Max: 1 * time.Hour, + } + go func() { var retry time.Duration + expBackoff := events.NewExponentialBackoff(backoffConfig) defer close(updates) for { ctx = log.WithModule(ctx, "tls") @@ -472,18 +480,12 @@ func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connecti return } } else { - // If we have an expired certificate, we let's stick with the starting default in - // the hope that this is a temporary clock skew. + // If we have an expired certificate, try to renew immediately: the hope that this is a temporary clock skew, or + // we can issue our own TLS certs. if validUntil.Before(time.Now()) { - log.WithError(err).Errorf("failed to create a new client TLS config") - - select { - case updates <- CertificateUpdate{Err: errors.New("TLS certificate is expired")}: - case <-ctx.Done(): - log.Info("shutting down certificate renewal routine") - return - } - + log.Warn("the current TLS certificate is expired, so an attempt to renew it will be made immediately") + // retry immediately(ish) with exponential backoff + retry = expBackoff.Proceed(nil) } else { // Random retry time between 50% and 80% of the total time to expiration retry = calculateRandomExpiry(validFrom, validUntil) @@ -508,8 +510,10 @@ func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connecti var certUpdate CertificateUpdate if err := RenewTLSConfigNow(ctx, s, connBroker); err != nil { certUpdate.Err = err + expBackoff.Failure(nil, nil) } else { certUpdate.Role = s.ClientTLSCreds.Role() + expBackoff = events.NewExponentialBackoff(backoffConfig) } select { diff --git a/integration/cluster.go b/integration/cluster.go index ffdbf7ef30..94f0d112b0 100644 --- a/integration/cluster.go +++ b/integration/cluster.go @@ -9,6 +9,7 @@ import ( "google.golang.org/grpc" "github.com/docker/swarmkit/api" + "github.com/docker/swarmkit/ca" "github.com/docker/swarmkit/log" raftutils "github.com/docker/swarmkit/manager/state/raft/testutils" "golang.org/x/net/context" @@ -75,12 +76,13 @@ func (c *testCluster) RandomManager() *testNode { // AddManager adds a node with the Manager role. The node will function as both // an agent and a manager. If lateBind is set, the manager is started before a -// remote API port is bound. This setting only applies to the first manager. -func (c *testCluster) AddManager(lateBind bool) error { +// remote API port is bound. If rootCA is set, the manager is bootstrapped using +// said root CA. These settings only apply to the first manager. +func (c *testCluster) AddManager(lateBind bool, rootCA *ca.RootCA) error { // first node var n *testNode if len(c.nodes) == 0 { - node, err := newTestNode("", "", lateBind) + node, err := newTestNode("", "", lateBind, rootCA) if err != nil { return err } @@ -98,7 +100,7 @@ func (c *testCluster) AddManager(lateBind bool) error { if len(clusterInfo.Clusters) == 0 { return fmt.Errorf("joining manager: there is no cluster created in storage") } - node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Manager, false) + node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Manager, false, nil) if err != nil { return err } @@ -157,7 +159,7 @@ func (c *testCluster) AddAgent() error { if len(clusterInfo.Clusters) == 0 { return fmt.Errorf("joining agent: there is no cluster created in storage") } - node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Worker, false) + node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Worker, false, nil) if err != nil { return err } diff --git a/integration/integration_test.go b/integration/integration_test.go index f182808428..e5ab0195db 100644 --- a/integration/integration_test.go +++ b/integration/integration_test.go @@ -1,16 +1,24 @@ package integration import ( + "crypto/rand" + "crypto/x509" + "encoding/pem" "flag" "fmt" + "io/ioutil" "os" + "path/filepath" "runtime" "testing" + "time" "golang.org/x/net/context" "github.com/Sirupsen/logrus" + "github.com/cloudflare/cfssl/helpers" "github.com/docker/swarmkit/api" + "github.com/docker/swarmkit/ca" raftutils "github.com/docker/swarmkit/manager/state/raft/testutils" "github.com/pkg/errors" "github.com/stretchr/testify/require" @@ -125,7 +133,7 @@ func pollServiceReady(t *testing.T, c *testCluster, sid string) { func newCluster(t *testing.T, numWorker, numManager int) *testCluster { cl := newTestCluster() for i := 0; i < numManager; i++ { - require.NoError(t, cl.AddManager(false), "manager number %d", i+1) + require.NoError(t, cl.AddManager(false, nil), "manager number %d", i+1) } for i := 0; i < numWorker; i++ { require.NoError(t, cl.AddAgent(), "agent number %d", i+1) @@ -152,7 +160,7 @@ func TestServiceCreateLateBind(t *testing.T) { cl := newTestCluster() for i := 0; i < numManager; i++ { - require.NoError(t, cl.AddManager(true), "manager number %d", i+1) + require.NoError(t, cl.AddManager(true, nil), "manager number %d", i+1) } for i := 0; i < numWorker; i++ { require.NoError(t, cl.AddAgent(), "agent number %d", i+1) @@ -386,7 +394,7 @@ func TestDemoteDownedManager(t *testing.T) { // stop the node, then demote it, and start it back up again so when it comes back up it has to realize // it's not running anymore - require.NoError(t, demotee.Pause()) + require.NoError(t, demotee.Pause(false)) // demote node, but don't use SetNodeRole, which waits until it successfully becomes a worker, since // the node is currently down @@ -429,7 +437,7 @@ func TestRestartLeader(t *testing.T) { origLeaderID := leader.node.NodeID() - require.NoError(t, leader.Pause()) + require.NoError(t, leader.Pause(false)) require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error { resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{}) @@ -452,3 +460,81 @@ func TestRestartLeader(t *testing.T) { pollClusterReady(t, cl, numWorker, numManager) } + +func TestForceNewCluster(t *testing.T) { + t.Parallel() + logrus.SetLevel(logrus.DebugLevel) + + // create an external CA so that we can use it to generate expired certificates + tempDir, err := ioutil.TempDir("", "external-ca") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + rootCA, err := ca.CreateRootCA("externalRoot", ca.NewConfigPaths(tempDir).RootCA) + require.NoError(t, err) + + // start a new cluster with the external CA bootstrapped + numWorker, numManager := 0, 1 + cl := newTestCluster() + defer func() { + require.NoError(t, cl.Stop()) + }() + require.NoError(t, cl.AddManager(false, &rootCA), "manager number 1") + pollClusterReady(t, cl, numWorker, numManager) + + leader, err := cl.Leader() + require.NoError(t, err) + + sid, err := cl.CreateService("test_service", 2) + require.NoError(t, err) + pollServiceReady(t, cl, sid) + + // generate an expired certificate + rootKey, err := helpers.ParsePrivateKeyPEM(rootCA.Key) + require.NoError(t, err) + rootCert, err := helpers.ParseCertificatePEM(rootCA.Cert) + require.NoError(t, err) + + managerCertFile := filepath.Join(leader.stateDir, "certificates", "swarm-node.crt") + certBytes, err := ioutil.ReadFile(managerCertFile) + require.NoError(t, err) + managerCerts, err := helpers.ParseCertificatesPEM(certBytes) + require.NoError(t, err) + expiredCertTemplate := managerCerts[0] + expiredCertTemplate.NotBefore = time.Now().Add(time.Hour * -5) + expiredCertTemplate.NotAfter = time.Now().Add(time.Hour * -3) + expiredCertDERBytes, err := x509.CreateCertificate(rand.Reader, expiredCertTemplate, rootCert, expiredCertTemplate.PublicKey, rootKey) + require.NoError(t, err) + expiredCertPEM := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE", + Bytes: expiredCertDERBytes, + }) + + // restart node with an expired certificate while forcing a new cluster - it should start without error and the certificate should be renewed + nodeID := leader.node.NodeID() + require.NoError(t, leader.Pause(true)) + require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644)) + require.NoError(t, cl.StartNode(nodeID)) + pollClusterReady(t, cl, numWorker, numManager) + pollServiceReady(t, cl, sid) + + err = raftutils.PollFuncWithTimeout(nil, func() error { + certBytes, err := ioutil.ReadFile(managerCertFile) + if err != nil { + return err + } + managerCerts, err := helpers.ParseCertificatesPEM(certBytes) + if err != nil { + return err + } + if managerCerts[0].NotAfter.Before(time.Now()) { + return errors.New("certificate hasn't been renewed yet") + } + return nil + }, opsTimeout) + require.NoError(t, err) + + // restart node with an expired certificate without forcing a new cluster - it should error on start + require.NoError(t, leader.Pause(true)) + require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644)) + require.Error(t, cl.StartNode(nodeID)) +} diff --git a/integration/node.go b/integration/node.go index 056798d5d9..afc7cf91c7 100644 --- a/integration/node.go +++ b/integration/node.go @@ -9,6 +9,8 @@ import ( "google.golang.org/grpc" "github.com/docker/swarmkit/api" + "github.com/docker/swarmkit/ca" + "github.com/docker/swarmkit/identity" raftutils "github.com/docker/swarmkit/manager/state/raft/testutils" "github.com/docker/swarmkit/node" "golang.org/x/net/context" @@ -25,8 +27,8 @@ type testNode struct { // newNode creates new node with specific role(manager or agent) and joins to // existing cluster. if joinAddr is empty string, then new cluster will be initialized. // It uses TestExecutor as executor. If lateBind is set, the remote API port is not -// bound. -func newTestNode(joinAddr, joinToken string, lateBind bool) (*testNode, error) { +// bound. If rootCA is set, this root is used to bootstrap the node's TLS certs. +func newTestNode(joinAddr, joinToken string, lateBind bool, rootCA *ca.RootCA) (*testNode, error) { tmpDir, err := ioutil.TempDir("", "swarmkit-integration-") if err != nil { return nil, err @@ -43,6 +45,26 @@ func newTestNode(joinAddr, joinToken string, lateBind bool) (*testNode, error) { if !lateBind { cfg.ListenRemoteAPI = "127.0.0.1:0" } + if rootCA != nil { + certDir := filepath.Join(tmpDir, "certificates") + if err := os.MkdirAll(certDir, 0700); err != nil { + return nil, err + } + certPaths := ca.NewConfigPaths(certDir) + if err := ioutil.WriteFile(certPaths.RootCA.Cert, rootCA.Cert, 0644); err != nil { + return nil, err + } + if err := ioutil.WriteFile(certPaths.RootCA.Key, rootCA.Key, 0600); err != nil { + return nil, err + } + // generate TLS certs for this manager for bootstrapping, else the node will generate its own CA + _, err := rootCA.IssueAndSaveNewCertificates(ca.NewKeyReadWriter(certPaths.Node, nil, nil), + identity.NewID(), ca.ManagerRole, identity.NewID()) + if err != nil { + return nil, err + } + } + node, err := node.New(cfg) if err != nil { return nil, err @@ -55,7 +77,7 @@ func newTestNode(joinAddr, joinToken string, lateBind bool) (*testNode, error) { } // Pause stops the node, and creates a new swarm node while keeping all the state -func (n *testNode) Pause() error { +func (n *testNode) Pause(forceNewCluster bool) error { rAddr, err := n.node.RemoteAPIAddr() if err != nil { rAddr = "127.0.0.1:0" @@ -71,6 +93,7 @@ func (n *testNode) Pause() error { // other remotes that are stored in the raft directory. cfg.JoinAddr = "" cfg.JoinToken = "" + cfg.ForceNewCluster = forceNewCluster node, err := node.New(cfg) if err != nil { diff --git a/node/node.go b/node/node.go index c897f9e4d7..45b178b4dc 100644 --- a/node/node.go +++ b/node/node.go @@ -568,7 +568,8 @@ func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, erro return nil, err } if err == nil { - securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, false) + // if forcing a new cluster, we allow the certificates to be expired - a new set will be generated + securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, n.config.ForceNewCluster) if err != nil { _, isInvalidKEK := errors.Cause(err).(ca.ErrInvalidKEK) if isInvalidKEK { @@ -606,7 +607,7 @@ func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, erro // - We wait for CreateSecurityConfig to finish since we need a certificate to operate. // Attempt to load certificate from disk - securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, false) + securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, n.config.ForceNewCluster) if err == nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": securityConfig.ClientTLSCreds.NodeID(),