prometheus · zecke · Mar 15, 2020
diff --git a/cluster/cluster.go b/cluster/cluster.go
@@ -118,6 +118,7 @@ func Create(
 	tcpTimeout time.Duration,
 	probeTimeout time.Duration,
 	probeInterval time.Duration,
+	keys [][]byte,
 ) (*Peer, error) {
 	bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
 	if err != nil {
@@ -212,6 +213,15 @@ func Create(
 		p.setInitialFailed(resolvedPeers, bindAddr)
 	}
 
+	if len(keys) > 0 {
+		cfg.Keyring, err = memberlist.NewKeyring(keys, keys[0])
+		if err != nil {
+			return nil, errors.Wrap(err, "create keyring")
+		}
+		cfg.GossipVerifyIncoming = true
+		cfg.GossipVerifyOutgoing = true
+	}
+
 	ml, err := memberlist.Create(cfg)
 	if err != nil {
 		return nil, errors.Wrap(err, "create memberlist")

diff --git a/cluster/cluster_test.go b/cluster/cluster_test.go
@@ -51,6 +51,7 @@ func testJoinLeave(t *testing.T) {
 		DefaultTcpTimeout,
 		DefaultProbeTimeout,
 		DefaultProbeInterval,
+		nil,
 	)
 	require.NoError(t, err)
 	require.NotNil(t, p)
@@ -78,6 +79,7 @@ func testJoinLeave(t *testing.T) {
 		DefaultTcpTimeout,
 		DefaultProbeTimeout,
 		DefaultProbeInterval,
+		nil,
 	)
 	require.NoError(t, err)
 	require.NotNil(t, p2)
@@ -110,6 +112,7 @@ func testReconnect(t *testing.T) {
 		DefaultTcpTimeout,
 		DefaultProbeTimeout,
 		DefaultProbeInterval,
+		nil,
 	)
 	require.NoError(t, err)
 	require.NotNil(t, p)
@@ -133,6 +136,7 @@ func testReconnect(t *testing.T) {
 		DefaultTcpTimeout,
 		DefaultProbeTimeout,
 		DefaultProbeInterval,
+		nil,
 	)
 	require.NoError(t, err)
 	require.NotNil(t, p2)
@@ -171,6 +175,7 @@ func testRemoveFailedPeers(t *testing.T) {
 		DefaultTcpTimeout,
 		DefaultProbeTimeout,
 		DefaultProbeInterval,
+		nil,
 	)
 	require.NoError(t, err)
 	require.NotNil(t, p)
@@ -220,6 +225,7 @@ func testInitiallyFailingPeers(t *testing.T) {
 		DefaultTcpTimeout,
 		DefaultProbeTimeout,
 		DefaultProbeInterval,
+		nil,
 	)
 	require.NoError(t, err)
 	require.NotNil(t, p)

diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go
@@ -14,8 +14,10 @@
 package main
 
 import (
+	"bytes"
 	"context"
 	"fmt"
+	"io/ioutil"
 	"net"
 	"net/http"
 	"net/url"
@@ -172,6 +174,20 @@ func buildReceiverIntegrations(nc *config.Receiver, tmpl *template.Template, log
 	return integrations, nil
 }
 
+func readKeys(files []string) ([][]byte, error) { // readKeys returns the whitespace-trimmed contents of each file, in argument order.
+	keys := make([][]byte, 0, len(files)) // capacity for one key per file
+
+	for _, file := range files {
+		key, err := ioutil.ReadFile(file)
+		if err != nil {
+			return nil, err // abort on the first unreadable file; keys read so far are discarded
+		}
+		keys = append(keys, bytes.TrimSpace(key)) // strip leading/trailing whitespace (e.g. a final newline); content is not validated here
+	}
+
+	return keys, nil
+}
+
 func main() {
 	os.Exit(run())
 }
@@ -207,6 +223,7 @@ func run() int {
 		settleTimeout        = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
 		reconnectInterval    = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
 		peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
+		gossipKeyFiles       = kingpin.Flag("cluster.key-file", "File containing private keys for the cluster").ExistingFiles()
 	)
 
 	promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig)
@@ -226,6 +243,12 @@ func run() int {
 		return 1
 	}
 
+	gossipKeys, err := readKeys(*gossipKeyFiles)
+	if err != nil {
+		level.Error(logger).Log("msg", "Unable to read gossip key file", "err", err)
+		return 1
+	}
+
 	var peer *cluster.Peer
 	if *clusterBindAddr != "" {
 		peer, err = cluster.Create(
@@ -240,6 +263,7 @@ func run() int {
 			*tcpTimeout,
 			*probeTimeout,
 			*probeInterval,
+			gossipKeys,
 		)
 		if err != nil {
 			level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)

diff --git a/doc/design/protect-cluster-membership.md b/doc/design/protect-cluster-membership.md
@@ -0,0 +1,58 @@
+# Protect Alertmanager cluster from foreign membership
+
+Type: Design document
+
+Date: 2020-03-08
+
+Author: Holger Hans Peter Freyther <automatic+am@freyther.de>
+
+Status: Draft
+
+## Status Quo
+
+Alertmanager supports [high
+availability](https://github.com/prometheus/alertmanager/blob/master/README.md#high-availability)
+by interconnecting multiple Alertmanager instances to form an Alertmanager
+cluster. Instances of a cluster communicate on top of a gossip protocol managed
+via HashiCorp's [_Memberlist_](https://github.com/hashicorp/memberlist) library.
+_Memberlist_ uses two channels to communicate: TCP for reliable and UDP for
+best-effort communication.
+
+Today, knowing the address of any peer is enough to join the cluster and
+(accidentally) gossip silences and the alert notification log.
+
+
+## Goal
+
+Prevent non-production Alertmanager instances from accidentally gossiping
+silences to members of a production cluster.
+
+## Proposed Solution - Memberlist Keys/Keyring
+
+HashiCorp's [_Memberlist_](https://github.com/hashicorp/memberlist) can manage
+a _Keyring_ with one or more keys, encrypt outgoing messages, and verify that
+received gossip messages are encrypted. Enabling encryption increases the size
+of the messages exchanged and requires extra compute.
+
+Introduce the _cluster.key-file_ command-line flag to specify zero or more
+files, each containing one encryption key for the _Memberlist_ _Keyring_. The
+first key specified is the primary key; any key enables cluster protection.
+
+Keys can be rotated by adding an additional _cluster.key-file_, restarting
+all instances of the cluster, and then removing the old key.
+
+
+## Discarded Solutions
+
+### Implement the secure cluster traffic document
+
+Implementing and operating an X.509 PKI is a major challenge. An implementation
+must honor certificate expiration, check revocation lists/OCSP, and handle many
+more details. Operating a PKI is equally challenging, and many high-profile
+companies fail[1][2][3] at the basics. A more manageable solution is preferable.
+
+
+[1] https://www.theverge.com/2020/2/3/21120248/microsoft-teams-down-outage-certificate-issue-status
+[2] https://www.zdnet.com/article/ericsson-expired-certificate-caused-o2-and-softbank-outages/
+[3] https://www.theregister.co.uk/2017/11/30/linkedin_ssl_certificates_expire/
+