Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ func Create(
tcpTimeout time.Duration,
probeTimeout time.Duration,
probeInterval time.Duration,
keys [][]byte,
) (*Peer, error) {
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
if err != nil {
Expand Down Expand Up @@ -212,6 +213,15 @@ func Create(
p.setInitialFailed(resolvedPeers, bindAddr)
}

if len(keys) > 0 {
cfg.Keyring, err = memberlist.NewKeyring(keys, keys[0])
if err != nil {
return nil, errors.Wrap(err, "create keyring")
}
cfg.GossipVerifyIncoming = true
cfg.GossipVerifyOutgoing = true
}

ml, err := memberlist.Create(cfg)
if err != nil {
return nil, errors.Wrap(err, "create memberlist")
Expand Down
6 changes: 6 additions & 0 deletions cluster/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ func testJoinLeave(t *testing.T) {
DefaultTcpTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
)
require.NoError(t, err)
require.NotNil(t, p)
Expand Down Expand Up @@ -78,6 +79,7 @@ func testJoinLeave(t *testing.T) {
DefaultTcpTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
)
require.NoError(t, err)
require.NotNil(t, p2)
Expand Down Expand Up @@ -110,6 +112,7 @@ func testReconnect(t *testing.T) {
DefaultTcpTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
)
require.NoError(t, err)
require.NotNil(t, p)
Expand All @@ -133,6 +136,7 @@ func testReconnect(t *testing.T) {
DefaultTcpTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
)
require.NoError(t, err)
require.NotNil(t, p2)
Expand Down Expand Up @@ -171,6 +175,7 @@ func testRemoveFailedPeers(t *testing.T) {
DefaultTcpTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
)
require.NoError(t, err)
require.NotNil(t, p)
Expand Down Expand Up @@ -220,6 +225,7 @@ func testInitiallyFailingPeers(t *testing.T) {
DefaultTcpTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
)
require.NoError(t, err)
require.NotNil(t, p)
Expand Down
24 changes: 24 additions & 0 deletions cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
package main

import (
"bytes"
"context"
"fmt"
"io/ioutil"
"net"
"net/http"
"net/url"
Expand Down Expand Up @@ -172,6 +174,20 @@ func buildReceiverIntegrations(nc *config.Receiver, tmpl *template.Template, log
return integrations, nil
}

// readKeys loads one gossip encryption key from each of the given files.
//
// Leading and trailing whitespace (for example the trailing newline left by
// an editor) is trimmed from every key. The returned slice has one entry per
// file, in the same order as files. A nil or empty files slice yields an
// empty result and a nil error.
//
// An error is returned when a file cannot be read, or when a file holds no
// key material after trimming. A zero-length key is never usable as an
// encryption key, so it is rejected here, where the offending file can still
// be named, rather than surfacing later as a generic keyring error.
func readKeys(files []string) ([][]byte, error) {
	keys := make([][]byte, 0, len(files))

	for _, file := range files {
		raw, err := ioutil.ReadFile(file)
		if err != nil {
			// The error returned by ReadFile already names the path.
			return nil, err
		}

		key := bytes.TrimSpace(raw)
		if len(key) == 0 {
			return nil, fmt.Errorf("key file %q is empty", file)
		}
		keys = append(keys, key)
	}

	return keys, nil
}

// main is the process entry point. It hands all work to run and terminates
// the process with the integer status that run returns.
func main() {
	status := run()
	os.Exit(status)
}
Expand Down Expand Up @@ -207,6 +223,7 @@ func run() int {
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
gossipKeyFiles = kingpin.Flag("cluster.key-file", "File containing private keys for the cluster").ExistingFiles()
)

promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig)
Expand All @@ -226,6 +243,12 @@ func run() int {
return 1
}

gossipKeys, err := readKeys(*gossipKeyFiles)
if err != nil {
level.Error(logger).Log("msg", "Unable to read gossip key file", "err", err)
return 1
}

var peer *cluster.Peer
if *clusterBindAddr != "" {
peer, err = cluster.Create(
Expand All @@ -240,6 +263,7 @@ func run() int {
*tcpTimeout,
*probeTimeout,
*probeInterval,
gossipKeys,
)
if err != nil {
level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)
Expand Down
58 changes: 58 additions & 0 deletions doc/design/protect-cluster-membership.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Protect Alertmanager cluster from foreign membership

Type: Design document

Date: 2020-03-08

Author: Holger Hans Peter Freyther <automatic+am@freyther.de>

Status: Draft

## Status Quo

Alertmanager supports [high
availability](https://github.com/prometheus/alertmanager/blob/master/README.md#high-availability)
by interconnecting multiple Alertmanager instances building an Alertmanager
cluster. Instances of a cluster communicate on top of a gossip protocol managed
via HashiCorp's [_Memberlist_](https://github.com/hashicorp/memberlist) library.
_Memberlist_ uses two channels to communicate: TCP for reliable and UDP for
best-effort communication.

Today knowing the address of any peer is enough to join the cluster and
(accidentally) gossip silences and the alert notification log.


## Goal

Prevent non-production Alertmanager instances from accidentally gossiping
silences to members of a production cluster.

## Proposed Solution - Memberlist Keys/Keyring

HashiCorp's [_Memberlist_](https://github.com/hashicorp/memberlist) supports
managing a _Keyring_ with one or more keys, encrypting outgoing messages, and
verifying that received gossip messages are encrypted. Enabling encryption has
an impact on the size of messages exchanged and requires extra compute.

Introduce the _cluster.key-file_ command line flag to specify zero or more
files containing encryption keys to be used as keys in the _Memberlist_
keyring. The first key specified will be the primary key, and specifying at
least one key enables the protection of the cluster.

Keys can be rotated by adding an additional _cluster.key-file_, restarting
all instances of the cluster, and then removing the old key.


## Discarded Solutions

### Implement the secure cluster traffic document

Implementing and operating an X.509 PKI is a major challenge. An implementation
must honor certificate expiration, check revocation lists/OCSP and many more
details. Operating a PKI is equally challenging and many high profile companies
fail[1][2][3] at the basics. A more manageable solution is preferable.


[1] https://www.theverge.com/2020/2/3/21120248/microsoft-teams-down-outage-certificate-issue-status
[2] https://www.zdnet.com/article/ericsson-expired-certificate-caused-o2-and-softbank-outages/
[3] https://www.theregister.co.uk/2017/11/30/linkedin_ssl_certificates_expire/