diff --git a/ca/certificates.go b/ca/certificates.go index 7258e30593..63fa323fb5 100644 --- a/ca/certificates.go +++ b/ca/certificates.go @@ -151,6 +151,25 @@ func (rca *RootCA) IssueAndSaveNewCertificates(kw KeyWriter, cn, ou, org string) return &tlsKeyPair, nil } +// Normally we can just call cert.Verify(opts), but since we actually want more information about +// whether a certificate is not yet valid or expired, we also need to perform the expiry checks ourselves. +func verifyCertificate(cert *x509.Certificate, opts x509.VerifyOptions, allowExpired bool) error { + _, err := cert.Verify(opts) + if invalidErr, ok := err.(x509.CertificateInvalidError); ok && invalidErr.Reason == x509.Expired { + now := time.Now().UTC() + if now.Before(cert.NotBefore) { + return errors.Wrapf(err, "certificate not valid before %s, and it is currently %s", + cert.NotBefore.UTC().Format(time.RFC1123), now.Format(time.RFC1123)) + } + if allowExpired { + return nil + } + return errors.Wrapf(err, "certificate expires at %s, and it is currently %s", + cert.NotAfter.UTC().Format(time.RFC1123), now.Format(time.RFC1123)) + } + return err +} + // RequestAndSaveNewCertificates gets new certificates issued, either by signing them locally if a signer is // available, or by requesting them from the remote server at remoteAddr. func (rca *RootCA) RequestAndSaveNewCertificates(ctx context.Context, kw KeyWriter, config CertificateRequestConfig) (*tls.Certificate, error) { @@ -199,7 +218,7 @@ func (rca *RootCA) RequestAndSaveNewCertificates(ctx context.Context, kw KeyWrit Roots: rca.Pool, } // Check to see if this certificate was signed by our CA, and isn't expired - if _, err := X509Cert.Verify(opts); err != nil { + if err := verifyCertificate(X509Cert, opts, false); err != nil { return nil, err } diff --git a/ca/config.go b/ca/config.go index d2664bd635..38fb4b7ffe 100644 --- a/ca/config.go +++ b/ca/config.go @@ -15,6 +15,7 @@ import ( "github.com/Sirupsen/logrus" cfconfig "github.com/cloudflare/cfssl/config" + events "github.com/docker/go-events" "github.com/docker/swarmkit/api" "github.com/docker/swarmkit/connectionbroker" "github.com/docker/swarmkit/identity" @@ -189,7 +190,7 @@ func GenerateJoinToken(rootCA *RootCA) string { func getCAHashFromToken(token string) (digest.Digest, error) { split := strings.Split(token, "-") - if len(split) != 4 || split[0] != "SWMTKN" || split[1] != "1" { + if len(split) != 4 || split[0] != "SWMTKN" || split[1] != "1" || len(split[2]) != base36DigestLen || len(split[3]) != maxGeneratedSecretLength { return "", errors.New("invalid join token") } @@ -242,7 +243,7 @@ func DownloadRootCA(ctx context.Context, paths CertPaths, token string, connBrok // LoadSecurityConfig loads TLS credentials from disk, or returns an error if // these credentials do not exist or are unusable. -func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) (*SecurityConfig, error) { +func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter, allowExpired bool) (*SecurityConfig, error) { ctx = log.WithModule(ctx, "tls") // At this point we've successfully loaded the CA details from disk, or @@ -273,7 +274,7 @@ func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) } // Check to see if this certificate was signed by our CA, and isn't expired - if _, err := X509Cert.Verify(opts); err != nil { + if err := verifyCertificate(X509Cert, opts, allowExpired); err != nil { return nil, err } @@ -445,8 +446,15 @@ func RenewTLSConfigNow(ctx context.Context, s *SecurityConfig, connBroker *conne func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connectionbroker.Broker, renew <-chan struct{}) <-chan CertificateUpdate { updates := make(chan CertificateUpdate) + backoffConfig := events.ExponentialBackoffConfig{ + Base: time.Second * 5, + Factor: time.Minute, + Max: 1 * time.Hour, + } + go func() { var retry time.Duration + expBackoff := events.NewExponentialBackoff(backoffConfig) defer close(updates) for { ctx = log.WithModule(ctx, "tls") @@ -472,18 +480,12 @@ func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connecti return } } else { - // If we have an expired certificate, we let's stick with the starting default in - // the hope that this is a temporary clock skew. + // If we have an expired certificate, try to renew immediately: the hope that this is a temporary clock skew, or + // we can issue our own TLS certs. if validUntil.Before(time.Now()) { - log.WithError(err).Errorf("failed to create a new client TLS config") - - select { - case updates <- CertificateUpdate{Err: errors.New("TLS certificate is expired")}: - case <-ctx.Done(): - log.Info("shutting down certificate renewal routine") - return - } - + log.Warn("the current TLS certificate is expired, so an attempt to renew it will be made immediately") + // retry immediately(ish) with exponential backoff + retry = expBackoff.Proceed(nil) } else { // Random retry time between 50% and 80% of the total time to expiration retry = calculateRandomExpiry(validFrom, validUntil) @@ -508,8 +510,10 @@ func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connecti var certUpdate CertificateUpdate if err := RenewTLSConfigNow(ctx, s, connBroker); err != nil { certUpdate.Err = err + expBackoff.Failure(nil, nil) } else { certUpdate.Role = s.ClientTLSCreds.Role() + expBackoff = events.NewExponentialBackoff(backoffConfig) } select { diff --git a/ca/config_test.go b/ca/config_test.go index 2e5bea9f46..8d6f3d674f 100644 --- a/ca/config_test.go +++ b/ca/config_test.go @@ -1,7 +1,12 @@ package ca_test import ( + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" "io/ioutil" + "math/big" "os" "strings" "testing" @@ -10,10 +15,12 @@ import ( "golang.org/x/net/context" cfconfig "github.com/cloudflare/cfssl/config" + "github.com/cloudflare/cfssl/helpers" "github.com/docker/swarmkit/ca" "github.com/docker/swarmkit/ca/testutils" "github.com/docker/swarmkit/ioutils" "github.com/docker/swarmkit/manager/state/store" + "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -54,9 +61,14 @@ func TestDownloadRootCAWrongCAHash(t *testing.T) { os.RemoveAll(tc.Paths.RootCA.Cert) // invalid token - _, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, "invalidtoken", tc.ConnBroker) - require.Error(t, err) - require.Contains(t, err.Error(), "invalid join token") + for _, invalid := range []string{ + "invalidtoken", // completely invalid + "SWMTKN-1-3wkodtpeoipd1u1hi0ykdcdwhw16dk73ulqqtn14b3indz68rf-4myj5xihyto11dg1cn55w8p6", // mistyped + } { + _, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, invalid, tc.ConnBroker) + require.Error(t, err) + require.Contains(t, err.Error(), "invalid join token") + } // invalid hash token splitToken := strings.Split(tc.ManagerToken, "-") @@ -65,7 +77,7 @@ func TestDownloadRootCAWrongCAHash(t *testing.T) { os.RemoveAll(tc.Paths.RootCA.Cert) - _, err = ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, replacementToken, tc.ConnBroker) + _, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, replacementToken, tc.ConnBroker) require.Error(t, err) require.Contains(t, err.Error(), "remote CA does not match fingerprint.") } @@ -125,6 +137,70 @@ func TestCreateSecurityConfigNoCerts(t *testing.T) { assert.Equal(t, rootCA, *nodeConfig.RootCA()) } +func TestLoadSecurityConfigExpiredCert(t *testing.T) { + tc := testutils.NewTestCA(t) + defer tc.Stop() + + _, key, err := ca.GenerateNewCSR() + require.NoError(t, err) + require.NoError(t, ioutil.WriteFile(tc.Paths.Node.Key, key, 0600)) + certKey, err := helpers.ParsePrivateKeyPEM(key) + require.NoError(t, err) + + rootKey, err := helpers.ParsePrivateKeyPEM(tc.RootCA.Key) + require.NoError(t, err) + rootCert, err := helpers.ParseCertificatePEM(tc.RootCA.Cert) + require.NoError(t, err) + + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) + require.NoError(t, err) + + genCert := func(notBefore, notAfter time.Time) { + derBytes, err := x509.CreateCertificate(rand.Reader, &x509.Certificate{ + SerialNumber: serialNumber, + Subject: pkix.Name{ + CommonName: "CN", + OrganizationalUnit: []string{"OU"}, + Organization: []string{"ORG"}, + }, + NotBefore: notBefore, + NotAfter: notAfter, + }, rootCert, certKey.Public(), rootKey) + require.NoError(t, err) + certBytes := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE", + Bytes: derBytes, + }) + require.NoError(t, ioutil.WriteFile(tc.Paths.Node.Cert, certBytes, 0644)) + } + + krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil) + now := time.Now() + + // A cert that is not yet valid is not valid even if expiry is allowed + genCert(now.Add(time.Hour), now.Add(time.Hour*2)) + + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) + require.Error(t, err) + require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err)) + + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, true) + require.Error(t, err) + require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err)) + + // a cert that is expired is not valid if expiry is not allowed + genCert(now.Add(time.Hour*-3), now.Add(time.Hour*-1)) + + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) + require.Error(t, err) + require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err)) + + // but it is valid if expiry is allowed + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, true) + require.NoError(t, err) +} + func TestLoadSecurityConfigInvalidCert(t *testing.T) { tc := testutils.NewTestCA(t) defer tc.Stop() @@ -136,7 +212,7 @@ some random garbage\n krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil) - _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw) + _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) assert.Error(t, err) nodeConfig, err := tc.RootCA.CreateSecurityConfig(tc.Context, krw, @@ -162,7 +238,7 @@ some random garbage\n krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil) - _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw) + _, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false) assert.Error(t, err) nodeConfig, err := tc.RootCA.CreateSecurityConfig(tc.Context, krw, @@ -185,7 +261,7 @@ func TestLoadSecurityConfigIncorrectPassphrase(t *testing.T) { "nodeID", ca.WorkerRole, tc.Organization) require.NoError(t, err) - _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, ca.NewKeyReadWriter(paths.Node, nil, nil)) + _, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, ca.NewKeyReadWriter(paths.Node, nil, nil), false) require.IsType(t, ca.ErrInvalidKEK{}, err) } diff --git a/integration/cluster.go b/integration/cluster.go index ffdbf7ef30..94f0d112b0 100644 --- a/integration/cluster.go +++ b/integration/cluster.go @@ -9,6 +9,7 @@ import ( "google.golang.org/grpc" "github.com/docker/swarmkit/api" + "github.com/docker/swarmkit/ca" "github.com/docker/swarmkit/log" raftutils "github.com/docker/swarmkit/manager/state/raft/testutils" "golang.org/x/net/context" @@ -75,12 +76,13 @@ func (c *testCluster) RandomManager() *testNode { // AddManager adds a node with the Manager role. The node will function as both // an agent and a manager. If lateBind is set, the manager is started before a -// remote API port is bound. This setting only applies to the first manager. -func (c *testCluster) AddManager(lateBind bool) error { +// remote API port is bound. If rootCA is set, the manager is bootstrapped using +// said root CA. These settings only apply to the first manager. +func (c *testCluster) AddManager(lateBind bool, rootCA *ca.RootCA) error { // first node var n *testNode if len(c.nodes) == 0 { - node, err := newTestNode("", "", lateBind) + node, err := newTestNode("", "", lateBind, rootCA) if err != nil { return err } @@ -98,7 +100,7 @@ func (c *testCluster) AddManager(lateBind bool) error { if len(clusterInfo.Clusters) == 0 { return fmt.Errorf("joining manager: there is no cluster created in storage") } - node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Manager, false) + node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Manager, false, nil) if err != nil { return err } @@ -157,7 +159,7 @@ func (c *testCluster) AddAgent() error { if len(clusterInfo.Clusters) == 0 { return fmt.Errorf("joining agent: there is no cluster created in storage") } - node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Worker, false) + node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Worker, false, nil) if err != nil { return err } diff --git a/integration/integration_test.go b/integration/integration_test.go index f182808428..e5ab0195db 100644 --- a/integration/integration_test.go +++ b/integration/integration_test.go @@ -1,16 +1,24 @@ package integration import ( + "crypto/rand" + "crypto/x509" + "encoding/pem" "flag" "fmt" + "io/ioutil" "os" + "path/filepath" "runtime" "testing" + "time" "golang.org/x/net/context" "github.com/Sirupsen/logrus" + "github.com/cloudflare/cfssl/helpers" "github.com/docker/swarmkit/api" + "github.com/docker/swarmkit/ca" raftutils "github.com/docker/swarmkit/manager/state/raft/testutils" "github.com/pkg/errors" "github.com/stretchr/testify/require" @@ -125,7 +133,7 @@ func pollServiceReady(t *testing.T, c *testCluster, sid string) { func newCluster(t *testing.T, numWorker, numManager int) *testCluster { cl := newTestCluster() for i := 0; i < numManager; i++ { - require.NoError(t, cl.AddManager(false), "manager number %d", i+1) + require.NoError(t, cl.AddManager(false, nil), "manager number %d", i+1) } for i := 0; i < numWorker; i++ { require.NoError(t, cl.AddAgent(), "agent number %d", i+1) @@ -152,7 +160,7 @@ func TestServiceCreateLateBind(t *testing.T) { cl := newTestCluster() for i := 0; i < numManager; i++ { - require.NoError(t, cl.AddManager(true), "manager number %d", i+1) + require.NoError(t, cl.AddManager(true, nil), "manager number %d", i+1) } for i := 0; i < numWorker; i++ { require.NoError(t, cl.AddAgent(), "agent number %d", i+1) @@ -386,7 +394,7 @@ func TestDemoteDownedManager(t *testing.T) { // stop the node, then demote it, and start it back up again so when it comes back up it has to realize // it's not running anymore - require.NoError(t, demotee.Pause()) + require.NoError(t, demotee.Pause(false)) // demote node, but don't use SetNodeRole, which waits until it successfully becomes a worker, since // the node is currently down @@ -429,7 +437,7 @@ func TestRestartLeader(t *testing.T) { origLeaderID := leader.node.NodeID() - require.NoError(t, leader.Pause()) + require.NoError(t, leader.Pause(false)) require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error { resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{}) @@ -452,3 +460,81 @@ func TestRestartLeader(t *testing.T) { pollClusterReady(t, cl, numWorker, numManager) } + +func TestForceNewCluster(t *testing.T) { + t.Parallel() + logrus.SetLevel(logrus.DebugLevel) + + // create an external CA so that we can use it to generate expired certificates + tempDir, err := ioutil.TempDir("", "external-ca") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + rootCA, err := ca.CreateRootCA("externalRoot", ca.NewConfigPaths(tempDir).RootCA) + require.NoError(t, err) + + // start a new cluster with the external CA bootstrapped + numWorker, numManager := 0, 1 + cl := newTestCluster() + defer func() { + require.NoError(t, cl.Stop()) + }() + require.NoError(t, cl.AddManager(false, &rootCA), "manager number 1") + pollClusterReady(t, cl, numWorker, numManager) + + leader, err := cl.Leader() + require.NoError(t, err) + + sid, err := cl.CreateService("test_service", 2) + require.NoError(t, err) + pollServiceReady(t, cl, sid) + + // generate an expired certificate + rootKey, err := helpers.ParsePrivateKeyPEM(rootCA.Key) + require.NoError(t, err) + rootCert, err := helpers.ParseCertificatePEM(rootCA.Cert) + require.NoError(t, err) + + managerCertFile := filepath.Join(leader.stateDir, "certificates", "swarm-node.crt") + certBytes, err := ioutil.ReadFile(managerCertFile) + require.NoError(t, err) + managerCerts, err := helpers.ParseCertificatesPEM(certBytes) + require.NoError(t, err) + expiredCertTemplate := managerCerts[0] + expiredCertTemplate.NotBefore = time.Now().Add(time.Hour * -5) + expiredCertTemplate.NotAfter = time.Now().Add(time.Hour * -3) + expiredCertDERBytes, err := x509.CreateCertificate(rand.Reader, expiredCertTemplate, rootCert, expiredCertTemplate.PublicKey, rootKey) + require.NoError(t, err) + expiredCertPEM := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE", + Bytes: expiredCertDERBytes, + }) + + // restart node with an expired certificate while forcing a new cluster - it should start without error and the certificate should be renewed + nodeID := leader.node.NodeID() + require.NoError(t, leader.Pause(true)) + require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644)) + require.NoError(t, cl.StartNode(nodeID)) + pollClusterReady(t, cl, numWorker, numManager) + pollServiceReady(t, cl, sid) + + err = raftutils.PollFuncWithTimeout(nil, func() error { + certBytes, err := ioutil.ReadFile(managerCertFile) + if err != nil { + return err + } + managerCerts, err := helpers.ParseCertificatesPEM(certBytes) + if err != nil { + return err + } + if managerCerts[0].NotAfter.Before(time.Now()) { + return errors.New("certificate hasn't been renewed yet") + } + return nil + }, opsTimeout) + require.NoError(t, err) + + // restart node with an expired certificate without forcing a new cluster - it should error on start + require.NoError(t, leader.Pause(true)) + require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644)) + require.Error(t, cl.StartNode(nodeID)) +} diff --git a/integration/node.go b/integration/node.go index 056798d5d9..afc7cf91c7 100644 --- a/integration/node.go +++ b/integration/node.go @@ -9,6 +9,8 @@ import ( "google.golang.org/grpc" "github.com/docker/swarmkit/api" + "github.com/docker/swarmkit/ca" + "github.com/docker/swarmkit/identity" raftutils "github.com/docker/swarmkit/manager/state/raft/testutils" "github.com/docker/swarmkit/node" "golang.org/x/net/context" @@ -25,8 +27,8 @@ type testNode struct { // newNode creates new node with specific role(manager or agent) and joins to // existing cluster. if joinAddr is empty string, then new cluster will be initialized. // It uses TestExecutor as executor. If lateBind is set, the remote API port is not -// bound. -func newTestNode(joinAddr, joinToken string, lateBind bool) (*testNode, error) { +// bound. If rootCA is set, this root is used to bootstrap the node's TLS certs. +func newTestNode(joinAddr, joinToken string, lateBind bool, rootCA *ca.RootCA) (*testNode, error) { tmpDir, err := ioutil.TempDir("", "swarmkit-integration-") if err != nil { return nil, err @@ -43,6 +45,26 @@ func newTestNode(joinAddr, joinToken string, lateBind bool) (*testNode, error) { if !lateBind { cfg.ListenRemoteAPI = "127.0.0.1:0" } + if rootCA != nil { + certDir := filepath.Join(tmpDir, "certificates") + if err := os.MkdirAll(certDir, 0700); err != nil { + return nil, err + } + certPaths := ca.NewConfigPaths(certDir) + if err := ioutil.WriteFile(certPaths.RootCA.Cert, rootCA.Cert, 0644); err != nil { + return nil, err + } + if err := ioutil.WriteFile(certPaths.RootCA.Key, rootCA.Key, 0600); err != nil { + return nil, err + } + // generate TLS certs for this manager for bootstrapping, else the node will generate its own CA + _, err := rootCA.IssueAndSaveNewCertificates(ca.NewKeyReadWriter(certPaths.Node, nil, nil), + identity.NewID(), ca.ManagerRole, identity.NewID()) + if err != nil { + return nil, err + } + } + node, err := node.New(cfg) if err != nil { return nil, err @@ -55,7 +77,7 @@ func newTestNode(joinAddr, joinToken string, lateBind bool) (*testNode, error) { } // Pause stops the node, and creates a new swarm node while keeping all the state -func (n *testNode) Pause() error { +func (n *testNode) Pause(forceNewCluster bool) error { rAddr, err := n.node.RemoteAPIAddr() if err != nil { rAddr = "127.0.0.1:0" @@ -71,6 +93,7 @@ func (n *testNode) Pause() error { // other remotes that are stored in the raft directory. cfg.JoinAddr = "" cfg.JoinToken = "" + cfg.ForceNewCluster = forceNewCluster node, err := node.New(cfg) if err != nil { diff --git a/node/node.go b/node/node.go index d36d90a5a3..45b178b4dc 100644 --- a/node/node.go +++ b/node/node.go @@ -568,7 +568,8 @@ func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, erro return nil, err } if err == nil { - securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw) + // if forcing a new cluster, we allow the certificates to be expired - a new set will be generated + securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, n.config.ForceNewCluster) if err != nil { _, isInvalidKEK := errors.Cause(err).(ca.ErrInvalidKEK) if isInvalidKEK { @@ -606,7 +607,7 @@ func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, erro // - We wait for CreateSecurityConfig to finish since we need a certificate to operate. // Attempt to load certificate from disk - securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw) + securityConfig, err = ca.LoadSecurityConfig(ctx, rootCA, krw, n.config.ForceNewCluster) if err == nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": securityConfig.ClientTLSCreds.NodeID(),