diff --git a/manager/manager.go b/manager/manager.go index 856554d290..9e986ac40d 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -875,8 +875,39 @@ func (m *Manager) rotateRootCAKEK(ctx context.Context, clusterID string) error { }) } +// getLeaderNodeID is a small helper function returning a string with the +// leader's node ID. it is only used for logging, and should not be relied on +// to give a node ID for actual operational purposes (because it returns errors +// as nicely decorated strings) +func (m *Manager) getLeaderNodeID() string { + // get the current leader ID. this variable tracks the leader *only* for + // the purposes of logging leadership changes, and should not be relied on + // for other purposes + leader, leaderErr := m.raftNode.Leader() + switch leaderErr { + case raft.ErrNoRaftMember: + // this is an unlikely case, but we have to handle it. this means this + // node is not a member of the raft quorum. this won't look very pretty + // in logs ("leadership changed from aslkdjfa to ErrNoRaftMember") but + // it also won't be very common + return "not yet part of a raft cluster" + case raft.ErrNoClusterLeader: + return "no cluster leader" + default: + id, err := m.raftNode.GetNodeIDByRaftID(leader) + // the only possible error here is "ErrMemberUnknown" + if err != nil { + return "an unknown node" + } + return id + } +} + // handleLeadershipEvents handles the is leader event or is follower event. func (m *Manager) handleLeadershipEvents(ctx context.Context, leadershipCh chan events.Event) { + // get the current leader and save it for logging leadership changes in + // this loop + oldLeader := m.getLeaderNodeID() for { select { case leadershipEvent := <-leadershipCh: @@ -895,6 +926,12 @@ func (m *Manager) handleLeadershipEvents(ctx context.Context, leadershipCh chan leaderMetric.Set(0) } m.mu.Unlock() + + newLeader := m.getLeaderNodeID() + // maybe we should use logrus fields for old and new leader, so + // that users are better able to ingest leadership changes into log + // aggregators? + log.G(ctx).Infof("leadership changed from %v to %v", oldLeader, newLeader) case <-ctx.Done(): return } diff --git a/manager/state/raft/raft.go b/manager/state/raft/raft.go index 56b7c7c966..a707913682 100644 --- a/manager/state/raft/raft.go +++ b/manager/state/raft/raft.go @@ -1703,6 +1703,18 @@ func (n *Node) GetMemberByNodeID(nodeID string) *membership.Member { return nil } +// GetNodeIDByRaftID returns the generic Node ID of a member given its raft ID. +// It returns ErrMemberUnknown if the raft ID is unknown. +func (n *Node) GetNodeIDByRaftID(raftID uint64) (string, error) { + if member, ok := n.cluster.Members()[raftID]; ok { + return member.NodeID, nil + } + // this is the only possible error value that should be returned; the + // manager code depends on this. if you need to add more errors later, make + // sure that you update the callers of this method accordingly + return "", ErrMemberUnknown +} + // IsMember checks if the raft node has effectively joined // a cluster of existing members. func (n *Node) IsMember() bool { diff --git a/manager/state/raft/raft_test.go b/manager/state/raft/raft_test.go index 3b3172e3dd..b33baa91a7 100644 --- a/manager/state/raft/raft_test.go +++ b/manager/state/raft/raft_test.go @@ -1024,3 +1024,42 @@ func TestStreamRaftMessage(t *testing.T) { errStr = fmt.Sprintf("Raft message chunk is not of type %d", raftpb.MsgSnap) assert.Equal(t, errStr, s.Message()) } + +// TestGetNodeIDByRaftID tests the GetNodeIDByRaftID function. It's a very +// simple test but those are the kind that make a difference over time +func TestGetNodeIDByRaftID(t *testing.T) { + t.Parallel() + + nodes, _ := raftutils.NewRaftCluster(t, tc) + defer raftutils.TeardownCluster(nodes) + + // get the member list + members := nodes[1].GetMemberlist() + // get all of the raft ids + raftIDs := make([]uint64, 0, len(members)) + for _, member := range members { + raftIDs = append(raftIDs, member.RaftID) + } + + // now go and get the nodeID of every raftID + for _, id := range raftIDs { + nodeid, err := nodes[1].GetNodeIDByRaftID(id) + assert.NoError(t, err, "raft ID %v should give us a node ID", id) + // now go through the member manually list and make sure this is + // correct + for _, member := range members { + assert.True(t, + // either both should match, or both should not match. if they + // are different, then there is an error + (member.RaftID == id) == (member.NodeID == nodeid), + "member with id %v has node id %v, but we expected member with id %v to have node id %v", + member.RaftID, member.NodeID, id, nodeid, + ) + } + } + + // now expect a nonexistent raft member to return ErrNoMember + id, err := nodes[1].GetNodeIDByRaftID(8675309) + assert.Equal(t, err, raft.ErrMemberUnknown) + assert.Empty(t, id) +}