From 81ef7af0ed3758834f815bb8ae29cf060487f375 Mon Sep 17 00:00:00 2001 From: Holger Hans Peter Freyther Date: Sun, 15 Mar 2020 21:37:16 +0800 Subject: [PATCH] doc: Propose using Memberlist Keyring to protect a cluster Create a document to propose an easy (implementation and operation) way to protect the production cluster from accidental and unwanted members. Provide a reference implementation in addition to the design document. TODO(zecke): Figure out how to test this feature properly. Signed-off-by: Holger Hans Peter Freyther --- cluster/cluster.go | 10 ++++ cluster/cluster_test.go | 6 +++ cmd/alertmanager/main.go | 24 ++++++++++ doc/design/protect-cluster-membership.md | 58 ++++++++++++++++++++++++ 4 files changed, 98 insertions(+) create mode 100644 doc/design/protect-cluster-membership.md diff --git a/cluster/cluster.go b/cluster/cluster.go index 322481ea83..409d6ecf49 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -118,6 +118,7 @@ func Create( tcpTimeout time.Duration, probeTimeout time.Duration, probeInterval time.Duration, + keys [][]byte, ) (*Peer, error) { bindHost, bindPortStr, err := net.SplitHostPort(bindAddr) if err != nil { @@ -212,6 +213,15 @@ func Create( p.setInitialFailed(resolvedPeers, bindAddr) } + if len(keys) > 0 { + cfg.Keyring, err = memberlist.NewKeyring(keys, keys[0]) + if err != nil { + return nil, errors.Wrap(err, "create keyring") + } + cfg.GossipVerifyIncoming = true + cfg.GossipVerifyOutgoing = true + } + ml, err := memberlist.Create(cfg) if err != nil { return nil, errors.Wrap(err, "create memberlist") diff --git a/cluster/cluster_test.go b/cluster/cluster_test.go index 36ed42031a..2f695d43c0 100644 --- a/cluster/cluster_test.go +++ b/cluster/cluster_test.go @@ -51,6 +51,7 @@ func testJoinLeave(t *testing.T) { DefaultTcpTimeout, DefaultProbeTimeout, DefaultProbeInterval, + nil, ) require.NoError(t, err) require.NotNil(t, p) @@ -78,6 +79,7 @@ func testJoinLeave(t *testing.T) { DefaultTcpTimeout, DefaultProbeTimeout, 
DefaultProbeInterval, + nil, ) require.NoError(t, err) require.NotNil(t, p2) @@ -110,6 +112,7 @@ func testReconnect(t *testing.T) { DefaultTcpTimeout, DefaultProbeTimeout, DefaultProbeInterval, + nil, ) require.NoError(t, err) require.NotNil(t, p) @@ -133,6 +136,7 @@ func testReconnect(t *testing.T) { DefaultTcpTimeout, DefaultProbeTimeout, DefaultProbeInterval, + nil, ) require.NoError(t, err) require.NotNil(t, p2) @@ -171,6 +175,7 @@ func testRemoveFailedPeers(t *testing.T) { DefaultTcpTimeout, DefaultProbeTimeout, DefaultProbeInterval, + nil, ) require.NoError(t, err) require.NotNil(t, p) @@ -220,6 +225,7 @@ func testInitiallyFailingPeers(t *testing.T) { DefaultTcpTimeout, DefaultProbeTimeout, DefaultProbeInterval, + nil, ) require.NoError(t, err) require.NotNil(t, p) diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index aefaf59e5d..512387ead8 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -14,8 +14,10 @@ package main import ( + "bytes" "context" "fmt" + "io/ioutil" "net" "net/http" "net/url" @@ -172,6 +174,20 @@ func buildReceiverIntegrations(nc *config.Receiver, tmpl *template.Template, log return integrations, nil } +func readKeys(files []string) ([][]byte, error) { + keys := make([][]byte, 0, len(files)) + + for _, file := range files { + key, err := ioutil.ReadFile(file) + if err != nil { + return nil, err + } + keys = append(keys, bytes.TrimSpace(key)) + } + + return keys, nil +} + func main() { os.Exit(run()) } @@ -207,6 +223,7 @@ func run() int { settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration() reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration() peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to 
attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration() + gossipKeyFiles = kingpin.Flag("cluster.key-file", "File containing private keys for the cluster").ExistingFiles() ) promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig) @@ -226,6 +243,12 @@ func run() int { return 1 } + gossipKeys, err := readKeys(*gossipKeyFiles) + if err != nil { + level.Error(logger).Log("msg", "Unable to read gossip key file", "err", err) + return 1 + } + var peer *cluster.Peer if *clusterBindAddr != "" { peer, err = cluster.Create( @@ -240,6 +263,7 @@ func run() int { *tcpTimeout, *probeTimeout, *probeInterval, + gossipKeys, ) if err != nil { level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err) diff --git a/doc/design/protect-cluster-membership.md b/doc/design/protect-cluster-membership.md new file mode 100644 index 0000000000..f3f58f920a --- /dev/null +++ b/doc/design/protect-cluster-membership.md @@ -0,0 +1,58 @@ +# Protect Alertmanager cluster from foreign membership + +Type: Design document + +Date: 2020-03-08 + +Author: Holger Hans Peter Freyther + +Status: Draft + +## Status Quo + +Alertmanager supports [high +availability](https://github.com/prometheus/alertmanager/blob/master/README.md#high-availability) +by interconnecting multiple Alertmanager instances building an Alertmanager +cluster. Instances of a cluster communicate on top of a gossip protocol managed +via Hashicorps [_Memberlist_](https://github.com/hashicorp/memberlist) library. +_Memberlist_ uses two channels to communicate: TCP for reliable and UDP for +best-effort communication. + +Today knowing the address of any peer is enough to join the cluster and +(accidentally) gossip silences and the alert notification log. + + +## Goal + +Prevent non-production Alertmanager instances to accidentally gossip silences +to members of a production cluster. 
+ +## Proposed Solution - Memberlist Keys/Keyring + +HashiCorp's [_Memberlist_](https://github.com/hashicorp/memberlist) allows +managing a _Keyring_ with one or more keys, encrypting outgoing messages and +verifying the encryption of received gossip messages. Enabling encryption has an +impact on the size of messages exchanged and requires extra compute. + +Introduce the _cluster.key-file_ command line flag to specify zero or more files +containing encryption keys to be used as keys in the _Memberlist_ _Keyring_. The first +key specified will be the primary key, and specifying any key enables the protection of the cluster. + +Keys can be rotated by adding an additional _cluster.key-file_, restarting +all instances of the cluster, and then removing the old key. + + +## Discarded Solutions + +### Implement the secure cluster traffic document + +Implementing and operating an X.509 PKI is a major challenge. An implementation +must honor certificate expiration, check revocation lists/OCSP and many more +details. Operating a PKI is equally challenging and many high-profile companies +fail[1][2][3] at the basics. A more manageable solution is preferable. + + +[1] https://www.theverge.com/2020/2/3/21120248/microsoft-teams-down-outage-certificate-issue-status +[2] https://www.zdnet.com/article/ericsson-expired-certificate-caused-o2-and-softbank-outages/ +[3] https://www.theregister.co.uk/2017/11/30/linkedin_ssl_certificates_expire/ +