Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion ca/certificates.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,25 @@ func (rca *RootCA) IssueAndSaveNewCertificates(kw KeyWriter, cn, ou, org string)
return &tlsKeyPair, nil
}

// verifyCertificate validates cert against opts. It behaves like
// cert.Verify(opts), except that validity-period failures (which Go reports
// under the single x509.Expired reason for both not-yet-valid and expired
// certificates) are distinguished and given human-readable messages, and an
// expired-but-otherwise-valid certificate is accepted when allowExpired is set.
func verifyCertificate(cert *x509.Certificate, opts x509.VerifyOptions, allowExpired bool) error {
	_, err := cert.Verify(opts)
	invalidErr, ok := err.(x509.CertificateInvalidError)
	if !ok || invalidErr.Reason != x509.Expired {
		// Either verification succeeded, or it failed for a reason other
		// than the validity period; pass the result through unchanged.
		return err
	}
	now := time.Now().UTC()
	switch {
	case now.Before(cert.NotBefore):
		// The "Expired" reason also covers certificates that are not yet
		// valid; report that case distinctly.
		return errors.Wrapf(err, "certificate not valid before %s, and it is currently %s",
			cert.NotBefore.UTC().Format(time.RFC1123), now.Format(time.RFC1123))
	case allowExpired:
		// Caller explicitly tolerates expired certificates.
		return nil
	default:
		return errors.Wrapf(err, "certificate expires at %s, and it is currently %s",
			cert.NotAfter.UTC().Format(time.RFC1123), now.Format(time.RFC1123))
	}
}

// RequestAndSaveNewCertificates gets new certificates issued, either by signing them locally if a signer is
// available, or by requesting them from the remote server at remoteAddr.
func (rca *RootCA) RequestAndSaveNewCertificates(ctx context.Context, kw KeyWriter, config CertificateRequestConfig) (*tls.Certificate, error) {
Expand Down Expand Up @@ -199,7 +218,7 @@ func (rca *RootCA) RequestAndSaveNewCertificates(ctx context.Context, kw KeyWrit
Roots: rca.Pool,
}
// Check to see if this certificate was signed by our CA, and isn't expired
if _, err := X509Cert.Verify(opts); err != nil {
if err := verifyCertificate(X509Cert, opts, false); err != nil {
return nil, err
}

Expand Down
32 changes: 18 additions & 14 deletions ca/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

"github.com/Sirupsen/logrus"
cfconfig "github.com/cloudflare/cfssl/config"
events "github.com/docker/go-events"
"github.com/docker/swarmkit/api"
"github.com/docker/swarmkit/connectionbroker"
"github.com/docker/swarmkit/identity"
Expand Down Expand Up @@ -189,7 +190,7 @@ func GenerateJoinToken(rootCA *RootCA) string {

func getCAHashFromToken(token string) (digest.Digest, error) {
split := strings.Split(token, "-")
if len(split) != 4 || split[0] != "SWMTKN" || split[1] != "1" {
if len(split) != 4 || split[0] != "SWMTKN" || split[1] != "1" || len(split[2]) != base36DigestLen || len(split[3]) != maxGeneratedSecretLength {
return "", errors.New("invalid join token")
}

Expand Down Expand Up @@ -242,7 +243,7 @@ func DownloadRootCA(ctx context.Context, paths CertPaths, token string, connBrok

// LoadSecurityConfig loads TLS credentials from disk, or returns an error if
// these credentials do not exist or are unusable.
func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) (*SecurityConfig, error) {
func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter, allowExpired bool) (*SecurityConfig, error) {
ctx = log.WithModule(ctx, "tls")

// At this point we've successfully loaded the CA details from disk, or
Expand Down Expand Up @@ -273,7 +274,7 @@ func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter)
}

// Check to see if this certificate was signed by our CA, and isn't expired
if _, err := X509Cert.Verify(opts); err != nil {
if err := verifyCertificate(X509Cert, opts, allowExpired); err != nil {
return nil, err
}

Expand Down Expand Up @@ -445,8 +446,15 @@ func RenewTLSConfigNow(ctx context.Context, s *SecurityConfig, connBroker *conne
func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connectionbroker.Broker, renew <-chan struct{}) <-chan CertificateUpdate {
updates := make(chan CertificateUpdate)

backoffConfig := events.ExponentialBackoffConfig{
Base: time.Second * 5,
Factor: time.Minute,
Max: 1 * time.Hour,
}

go func() {
var retry time.Duration
expBackoff := events.NewExponentialBackoff(backoffConfig)
defer close(updates)
for {
ctx = log.WithModule(ctx, "tls")
Expand All @@ -472,18 +480,12 @@ func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connecti
return
}
} else {
// If we have an expired certificate, we let's stick with the starting default in
// the hope that this is a temporary clock skew.
// If we have an expired certificate, try to renew immediately: the hope is that this is a temporary clock skew, or that
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is clock skew, I see two cases, both of which have potential problems:

  • If a different manager is issuing certs, it looks like we'd go into a renewal loop, since we reset the exponential backoff after each attempt when the certificate comes back expired.
  • If this node is issuing its own certs, it would be issuing certs with validity periods based on its skewed clock, which means the rest of the world probably won't be able to communicate with it (since we're seeing an expired cert, the clock would be skewed towards the future, meaning NotBefore would become a timestamp in the future).

I think we should address the renewal loop issue by not retrying if we receive a certificate that appears expired.

The self-issuing case is an inherent consequence of reissuing certs if they expire. If we decide to behave this way (which I think is the best choice, overall), we probably have to live with it. The only alternatives I can think of are:

  • Require manual intervention to renew a certificate after it has expired (but provide some kind of flag for this, so it doesn't require starting over, like today)
  • Only automatically renew expired certificates in a single-node swarm

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should address the renewal loop issue by not retrying if we receive a certificate that appears expired.

Are we assuming clock skew can eventually fix itself (for instance, maybe it can't contact the NTP server for now)? If so, rather than terminate the loop entirely, would it make sense to set the retry interval to something really long?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But if it's really clock skew, is there any need to keep renewing? Presumably the certificate will become valid once the clock skew is fixed, unless we're talking about really short expiry intervals.

Copy link
Copy Markdown
Contributor Author

@cyli cyli Feb 2, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But if we terminate the renewal loop, even if the clock skew goes away, we won't be able to renew when the certificate actually nears expiry. Let's say that the cert expires in a month, but our clock skew is 4 months off. If we kill the renew loop because it's expired, then even if the clock skew fixes itself, the cert will eventually expire because it won't be renewed unless the node is restarted.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe that's ok, though, and our position is if there's a clock skew so drastic that it makes the certificate look expired, the only way to recover is to fix the clock skew and restart the node? (One has to restart, otherwise even if the clock skew fixes itself the node won't auto-renew certificates as it nears expiry)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That seems reasonable. Or we could renew every few minutes. But doing it every few seconds seems excessive.

Copy link
Copy Markdown
Contributor Author

@cyli cyli Feb 2, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally set it to renew every few seconds because it could take a little while for the cluster to start up in the ForceNewCluster case (since we have to wait for the cluster data to load).

But it renews based on exponential backoff, so given the current configuration, it should be:

  1. 0s
  2. 35s
  3. 1m5s
  4. 2m5s
  5. 4m5s

and then maxes out at 5 minutes (so it renews every 5 minutes). Although looking at the actual exponential backoff code, the actual time seems to be chosen from a uniform distribution between 0 and those times, so could be every few seconds if unlucky.

We could max it out at 1 hour instead.

Copy link
Copy Markdown
Contributor Author

@cyli cyli Feb 2, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh wait, sorry for the silly enumeration of the backoff, I think I misunderstood your comment:

it looks like we'd go into a renewal loop, since we reset the exponential backoff after each attempt when the certificate comes back expired.

If the certificate expires, then the renewal fails, since RenewTLSConfigNow calls RequestAndSaveNewCertificates, which validates the cert, including expiry before returning. If it fails the expiry validation, RequestAndSaveCertificates and thus RenewTLSConfigNow fails, which means the exponential backoff doesn't get reset to 0.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated the backoff config to use a factor of a minute and max out at an hour. So the progression would be:

0s -> 1m5s -> 2m5s -> 4m5s -> 8m5s -> 16m5s -> 32m5s -> 1h0m0s

This may make the integration test take longer though.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pausing 65 seconds in the integration test doesn't sound good to me. Let's make this backoff configurable by the test if necessary.

// we can issue our own TLS certs.
if validUntil.Before(time.Now()) {
log.WithError(err).Errorf("failed to create a new client TLS config")

select {
case updates <- CertificateUpdate{Err: errors.New("TLS certificate is expired")}:
case <-ctx.Done():
log.Info("shutting down certificate renewal routine")
return
}

log.Warn("the current TLS certificate is expired, so an attempt to renew it will be made immediately")
// retry immediately(ish) with exponential backoff
retry = expBackoff.Proceed(nil)
} else {
// Random retry time between 50% and 80% of the total time to expiration
retry = calculateRandomExpiry(validFrom, validUntil)
Expand All @@ -508,8 +510,10 @@ func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connecti
var certUpdate CertificateUpdate
if err := RenewTLSConfigNow(ctx, s, connBroker); err != nil {
certUpdate.Err = err
expBackoff.Failure(nil, nil)
} else {
certUpdate.Role = s.ClientTLSCreds.Role()
expBackoff = events.NewExponentialBackoff(backoffConfig)
}

select {
Expand Down
90 changes: 83 additions & 7 deletions ca/config_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
package ca_test

import (
"crypto/rand"
"crypto/x509"
"crypto/x509/pkix"
"encoding/pem"
"io/ioutil"
"math/big"
"os"
"strings"
"testing"
Expand All @@ -10,10 +15,12 @@ import (
"golang.org/x/net/context"

cfconfig "github.com/cloudflare/cfssl/config"
"github.com/cloudflare/cfssl/helpers"
"github.com/docker/swarmkit/ca"
"github.com/docker/swarmkit/ca/testutils"
"github.com/docker/swarmkit/ioutils"
"github.com/docker/swarmkit/manager/state/store"
"github.com/pkg/errors"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -54,9 +61,14 @@ func TestDownloadRootCAWrongCAHash(t *testing.T) {
os.RemoveAll(tc.Paths.RootCA.Cert)

// invalid token
_, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, "invalidtoken", tc.ConnBroker)
require.Error(t, err)
require.Contains(t, err.Error(), "invalid join token")
for _, invalid := range []string{
"invalidtoken", // completely invalid
"SWMTKN-1-3wkodtpeoipd1u1hi0ykdcdwhw16dk73ulqqtn14b3indz68rf-4myj5xihyto11dg1cn55w8p6", // mistyped
} {
_, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, invalid, tc.ConnBroker)
require.Error(t, err)
require.Contains(t, err.Error(), "invalid join token")
}

// invalid hash token
splitToken := strings.Split(tc.ManagerToken, "-")
Expand All @@ -65,7 +77,7 @@ func TestDownloadRootCAWrongCAHash(t *testing.T) {

os.RemoveAll(tc.Paths.RootCA.Cert)

_, err = ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, replacementToken, tc.ConnBroker)
_, err := ca.DownloadRootCA(tc.Context, tc.Paths.RootCA, replacementToken, tc.ConnBroker)
require.Error(t, err)
require.Contains(t, err.Error(), "remote CA does not match fingerprint.")
}
Expand Down Expand Up @@ -125,6 +137,70 @@ func TestCreateSecurityConfigNoCerts(t *testing.T) {
assert.Equal(t, rootCA, *nodeConfig.RootCA())
}

// TestLoadSecurityConfigExpiredCert exercises the allowExpired parameter of
// ca.LoadSecurityConfig: a certificate that is not yet valid must always be
// rejected, while an expired certificate must be rejected only when
// allowExpired is false.
func TestLoadSecurityConfigExpiredCert(t *testing.T) {
tc := testutils.NewTestCA(t)
defer tc.Stop()

// Generate a node key on disk, and parse both the node key and the test
// CA's root key/cert so we can hand-sign node certificates with arbitrary
// validity windows below.
_, key, err := ca.GenerateNewCSR()
require.NoError(t, err)
require.NoError(t, ioutil.WriteFile(tc.Paths.Node.Key, key, 0600))
certKey, err := helpers.ParsePrivateKeyPEM(key)
require.NoError(t, err)

rootKey, err := helpers.ParsePrivateKeyPEM(tc.RootCA.Key)
require.NoError(t, err)
rootCert, err := helpers.ParseCertificatePEM(tc.RootCA.Cert)
require.NoError(t, err)

// Random 128-bit serial, reused for every cert this test signs (fine for a
// test; uniqueness across certs is not under test here).
serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128)
serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
require.NoError(t, err)

// genCert signs a node certificate with the given validity window using the
// test CA's root key, and writes it PEM-encoded to the node cert path,
// overwriting any previous certificate.
genCert := func(notBefore, notAfter time.Time) {
derBytes, err := x509.CreateCertificate(rand.Reader, &x509.Certificate{
SerialNumber: serialNumber,
Subject: pkix.Name{
CommonName: "CN",
OrganizationalUnit: []string{"OU"},
Organization: []string{"ORG"},
},
NotBefore: notBefore,
NotAfter: notAfter,
}, rootCert, certKey.Public(), rootKey)
require.NoError(t, err)
certBytes := pem.EncodeToMemory(&pem.Block{
Type: "CERTIFICATE",
Bytes: derBytes,
})
require.NoError(t, ioutil.WriteFile(tc.Paths.Node.Cert, certBytes, 0644))
}

krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil)
now := time.Now()

// A cert that is not yet valid is not valid even if expiry is allowed
genCert(now.Add(time.Hour), now.Add(time.Hour*2))

_, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false)
require.Error(t, err)
require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err))

_, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, true)
require.Error(t, err)
require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err))

// a cert that is expired is not valid if expiry is not allowed
genCert(now.Add(time.Hour*-3), now.Add(time.Hour*-1))

_, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false)
require.Error(t, err)
require.IsType(t, x509.CertificateInvalidError{}, errors.Cause(err))

// but it is valid if expiry is allowed
_, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, true)
require.NoError(t, err)
}

func TestLoadSecurityConfigInvalidCert(t *testing.T) {
tc := testutils.NewTestCA(t)
defer tc.Stop()
Expand All @@ -136,7 +212,7 @@ some random garbage\n

krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil)

_, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw)
_, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false)
assert.Error(t, err)

nodeConfig, err := tc.RootCA.CreateSecurityConfig(tc.Context, krw,
Expand All @@ -162,7 +238,7 @@ some random garbage\n

krw := ca.NewKeyReadWriter(tc.Paths.Node, nil, nil)

_, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw)
_, err := ca.LoadSecurityConfig(tc.Context, tc.RootCA, krw, false)
assert.Error(t, err)

nodeConfig, err := tc.RootCA.CreateSecurityConfig(tc.Context, krw,
Expand All @@ -185,7 +261,7 @@ func TestLoadSecurityConfigIncorrectPassphrase(t *testing.T) {
"nodeID", ca.WorkerRole, tc.Organization)
require.NoError(t, err)

_, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, ca.NewKeyReadWriter(paths.Node, nil, nil))
_, err = ca.LoadSecurityConfig(tc.Context, tc.RootCA, ca.NewKeyReadWriter(paths.Node, nil, nil), false)
require.IsType(t, ca.ErrInvalidKEK{}, err)
}

Expand Down
12 changes: 7 additions & 5 deletions integration/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"google.golang.org/grpc"

"github.com/docker/swarmkit/api"
"github.com/docker/swarmkit/ca"
"github.com/docker/swarmkit/log"
raftutils "github.com/docker/swarmkit/manager/state/raft/testutils"
"golang.org/x/net/context"
Expand Down Expand Up @@ -75,12 +76,13 @@ func (c *testCluster) RandomManager() *testNode {

// AddManager adds a node with the Manager role. The node will function as both
// an agent and a manager. If lateBind is set, the manager is started before a
// remote API port is bound. This setting only applies to the first manager.
func (c *testCluster) AddManager(lateBind bool) error {
// remote API port is bound. If rootCA is set, the manager is bootstrapped using
// said root CA. These settings only apply to the first manager.
func (c *testCluster) AddManager(lateBind bool, rootCA *ca.RootCA) error {
// first node
var n *testNode
if len(c.nodes) == 0 {
node, err := newTestNode("", "", lateBind)
node, err := newTestNode("", "", lateBind, rootCA)
if err != nil {
return err
}
Expand All @@ -98,7 +100,7 @@ func (c *testCluster) AddManager(lateBind bool) error {
if len(clusterInfo.Clusters) == 0 {
return fmt.Errorf("joining manager: there is no cluster created in storage")
}
node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Manager, false)
node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Manager, false, nil)
if err != nil {
return err
}
Expand Down Expand Up @@ -157,7 +159,7 @@ func (c *testCluster) AddAgent() error {
if len(clusterInfo.Clusters) == 0 {
return fmt.Errorf("joining agent: there is no cluster created in storage")
}
node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Worker, false)
node, err := newTestNode(joinAddr, clusterInfo.Clusters[0].RootCA.JoinTokens.Worker, false, nil)
if err != nil {
return err
}
Expand Down
Loading