From 4c5d6443cbe0cfdb754543b0ed06a69673a7e0b4 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 20 Aug 2021 10:59:53 +0100 Subject: [PATCH 01/22] Parent tweaks --- router/peer.go | 14 ++++------ router/tree.go | 75 ++++++++++++++++++++++++++------------------------ 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/router/peer.go b/router/peer.go index 1c552bfc..55c8531f 100644 --- a/router/peer.go +++ b/router/peer.go @@ -161,14 +161,12 @@ func (p *Peer) stop() error { return nil } -/* func (p *Peer) generateKeepalive() *types.Frame { frame := types.GetFrame() frame.Version = types.Version0 frame.Type = types.TypeKeepalive return frame } -*/ func (p *Peer) generateAnnouncement() *types.Frame { if p.port == 0 { @@ -311,8 +309,8 @@ var bufPool = sync.Pool{ } func (p *Peer) writer(ctx context.Context) { - //tick := time.NewTicker(PeerKeepaliveInterval) - //defer tick.Stop() + tick := time.NewTicker(PeerKeepaliveInterval) + defer tick.Stop() send := func(frame *types.Frame) { if frame == nil { return @@ -419,10 +417,10 @@ func (p *Peer) writer(ctx context.Context) { p.statistics.txTrafficDropped.Inc() } continue - //case <-tick.C: - // send(p.generateKeepalive()) - // p.statistics.txProtoSuccessful.Inc() - // continue + case <-tick.C: + send(p.generateKeepalive()) + p.statistics.txProtoSuccessful.Inc() + continue } } } diff --git a/router/tree.go b/router/tree.go index e66d1972..15d0efbf 100644 --- a/router/tree.go +++ b/router/tree.go @@ -28,7 +28,7 @@ import ( // announcementInterval is the frequency at which this // node will send root announcements to other peers. -const announcementInterval = PeerKeepaliveInterval // time.Minute * 15 +const announcementInterval = time.Minute * 15 // announcementTimeout is the amount of time that must // pass without receiving a root announcement before we @@ -41,7 +41,7 @@ func (r *Router) handleAnnouncement(peer *Peer, rx *types.Frame) { r.log.Println("Error unmarshalling announcement:", err) return } - if err := r.tree.Update(peer, new); err != nil { + if err := r.tree.Update(peer, new, false); err != nil { r.log.Println("Error handling announcement on port", peer.port, ":", err) } } @@ -114,7 +114,6 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { func (t *spanningTree) selectNewParent() { t.updateMutex.Lock() defer t.updateMutex.Unlock() - t.becomeRoot() bestDist := math.MaxInt32 bestKey := t.r.public var bestTime time.Time @@ -155,10 +154,11 @@ func (t *spanningTree) selectNewParent() { return ann.RootPublicKey.CompareTo(bestKey) == 0 && ann.Sequence == bestSeq && hops == bestDist && ann.at.Before(bestTime) }) if bestAnn != nil { - t.parent.Store(bestPort) - if err := t.Update(t.r.ports[bestPort], bestAnn.SwitchAnnouncement); err != nil { + if err := t.Update(t.r.ports[bestPort], bestAnn.SwitchAnnouncement, true); err != nil { t.r.log.Println("t.Update: %w", err) } + } else { + t.becomeRoot() } } @@ -251,7 +251,7 @@ func (t *spanningTree) Parent() types.SwitchPortID { return 0 } -func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement) error { +func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globalUpdate bool) error { sigs := make(map[string]struct{}) isLoopbackUpdate := false for index, sig := range newUpdate.Signatures { @@ -314,42 +314,43 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement) error t.updateMutex.Lock() defer t.updateMutex.Unlock() - lastGlobalUpdate := t.Root() - globalKeyDelta := newUpdate.RootPublicKey.CompareTo(lastGlobalUpdate.RootPublicKey) - globalTimeSince := time.Since(lastGlobalUpdate.at) - globalUpdate := false + if !globalUpdate { + lastGlobalUpdate := t.Root() + globalKeyDelta := newUpdate.RootPublicKey.CompareTo(lastGlobalUpdate.RootPublicKey) + globalTimeSince := time.Since(lastGlobalUpdate.at) - switch { - case globalTimeSince > announcementTimeout: - // We haven't seen a suitable root update recently so we'll accept - // this one instead - globalUpdate = true + switch { + case globalTimeSince > announcementTimeout: + // We haven't seen a suitable root update recently so we'll accept + // this one instead + globalUpdate = true - case globalKeyDelta < 0: - // The root key is weaker than our existing root, so it's no good - return nil + case globalKeyDelta < 0: + // The root key is weaker than our existing root, so it's no good + return nil - case globalKeyDelta == 0: - // The update is from the same root node, let's see if it matches - // any other useful conditions + case globalKeyDelta == 0: + // The update is from the same root node, let's see if it matches + // any other useful conditions - switch { - case newUpdate.Sequence < lastGlobalUpdate.Sequence: - // This is a replay of an earlier update, therefore we should - // ignore it, even if it came from our parent node - return nil + switch { + case newUpdate.Sequence < lastGlobalUpdate.Sequence: + // This is a replay of an earlier update, therefore we should + // ignore it, even if it came from our parent node + return nil + + case len(newUpdate.Signatures) < len(lastGlobalUpdate.Signatures): + // The path to the root is shorter than our last update, so + // we'll accept it + globalUpdate = true + } - case len(newUpdate.Signatures) < len(lastGlobalUpdate.Signatures): - // The path to the root is shorter than our last update, so - // we'll accept it + case globalKeyDelta > 0: + // The root key is stronger than our existing root, therefore we'll + // accept it anyway, since we always want to converge on the + // strongest root key globalUpdate = true } - - case globalKeyDelta > 0: - // The root key is stronger than our existing root, therefore we'll - // accept it anyway, since we always want to converge on the - // strongest root key - globalUpdate = true } if parent := t.parent.Load(); p.port == parent || globalUpdate { @@ -362,9 +363,9 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement) error at: time.Now(), SwitchAnnouncement: newUpdate, } - t.parent.Store(p.port) oldcoords := t.Coords() newcoords := t.root.Coords() + t.parent.Store(p.port) t.coords.Store(newcoords) t.rootMutex.Unlock() @@ -375,7 +376,9 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement) error if oldRootKey != newUpdate.RootPublicKey { defer t.r.snake.rootNodeChanged(newUpdate.RootPublicKey) } + } + if parent := t.parent.Load(); p.port == parent { t.advertise() t.rootReset <- struct{}{} } From 167597cb3266b8e396134e70c3b36d0d80e90d94 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 20 Aug 2021 12:17:34 +0100 Subject: [PATCH 02/22] Remove separate root record --- router/tree.go | 55 +++++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/router/tree.go b/router/tree.go index 15d0efbf..f1ce74b8 100644 --- a/router/tree.go +++ b/router/tree.go @@ -52,14 +52,11 @@ type rootAnnouncementWithTime struct { } type spanningTree struct { - r *Router // - context context.Context // - root *rootAnnouncementWithTime // - rootMutex sync.RWMutex // - rootReset chan struct{} // - updateMutex sync.Mutex // - parent atomic.Value // types.SwitchPortID - coords atomic.Value // types.SwitchPorts + r *Router // + context context.Context // + rootReset chan struct{} // + updateMutex sync.Mutex // + parent atomic.Value // types.SwitchPortID callback func(parent types.SwitchPortID, coords types.SwitchPorts) } @@ -71,17 +68,22 @@ func newSpanningTree(r *Router, f func(parent types.SwitchPortID, coords types.S callback: f, } t.becomeRoot() + t.advertise() go t.workerForRoot() go t.workerForAnnouncements() return t } func (t *spanningTree) Coords() types.SwitchPorts { - coords, ok := t.coords.Load().(types.SwitchPorts) - if ok { - return coords + parent, ok := t.parent.Load().(types.SwitchPortID) + if !ok { + return types.SwitchPorts{} + } + ann := t.r.ports[parent].lastAnnouncement() + if ann == nil { + return types.SwitchPorts{} } - return types.SwitchPorts{} + return ann.Coords() } func (t *spanningTree) Ancestors() ([]types.PublicKey, types.SwitchPortID) { @@ -113,7 +115,7 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { func (t *spanningTree) selectNewParent() { t.updateMutex.Lock() - defer t.updateMutex.Unlock() + t.becomeRoot() bestDist := math.MaxInt32 bestKey := t.r.public var bestTime time.Time @@ -153,12 +155,11 @@ func (t *spanningTree) selectNewParent() { checkWithCondition(func(ann *rootAnnouncementWithTime, hops int) bool { return ann.RootPublicKey.CompareTo(bestKey) == 0 && ann.Sequence == bestSeq && hops == bestDist && ann.at.Before(bestTime) }) + t.updateMutex.Unlock() if bestAnn != nil { if err := t.Update(t.r.ports[bestPort], bestAnn.SwitchAnnouncement, true); err != nil { t.r.log.Println("t.Update: %w", err) } - } else { - t.becomeRoot() } } @@ -180,7 +181,6 @@ func (t *spanningTree) becomeRoot() { if !t.Coords().EqualTo(newCoords) { go t.callback(0, types.SwitchPorts{}) } - t.advertise() } func (t *spanningTree) workerForAnnouncements() { @@ -222,9 +222,7 @@ func (t *spanningTree) IsRoot() bool { } func (t *spanningTree) Root() *rootAnnouncementWithTime { - t.rootMutex.RLock() - root := t.root - t.rootMutex.RUnlock() + root := t.r.ports[t.Parent()].lastAnnouncement() if root == nil || time.Since(root.at) > announcementTimeout { return &rootAnnouncementWithTime{ at: time.Now(), @@ -354,27 +352,16 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globa } if parent := t.parent.Load(); p.port == parent || globalUpdate { - t.rootMutex.Lock() - var oldRootKey types.PublicKey - if t.root != nil { - oldRootKey = t.root.RootPublicKey - } - t.root = &rootAnnouncementWithTime{ - at: time.Now(), - SwitchAnnouncement: newUpdate, - } - oldcoords := t.Coords() - newcoords := t.root.Coords() + oldcoords, oldroot := t.Coords(), t.Root().RootPublicKey t.parent.Store(p.port) - t.coords.Store(newcoords) - t.rootMutex.Unlock() + newcoords, newroot := t.Coords(), t.Root().RootPublicKey if t.callback != nil && !oldcoords.EqualTo(newcoords) { go t.callback(p.port, newcoords) } - if oldRootKey != newUpdate.RootPublicKey { - defer t.r.snake.rootNodeChanged(newUpdate.RootPublicKey) + if oldroot != newroot { + defer t.r.snake.rootNodeChanged(newroot) } } From d5083e7e9d86ac6e71524ca59075caf90b39141c Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 20 Aug 2021 12:44:04 +0100 Subject: [PATCH 03/22] Fix bugs in spanning tree --- router/tree.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/router/tree.go b/router/tree.go index f1ce74b8..9f0d3442 100644 --- a/router/tree.go +++ b/router/tree.go @@ -108,7 +108,7 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { t.becomeRoot() return } - if parent := t.parent.Load().(types.SwitchPortID); parent == port { + if parent, ok := t.parent.Load().(types.SwitchPortID); !ok || parent == port { t.selectNewParent() } } @@ -223,7 +223,7 @@ func (t *spanningTree) IsRoot() bool { func (t *spanningTree) Root() *rootAnnouncementWithTime { root := t.r.ports[t.Parent()].lastAnnouncement() - if root == nil || time.Since(root.at) > announcementTimeout { + if root == nil { return &rootAnnouncementWithTime{ at: time.Now(), SwitchAnnouncement: types.SwitchAnnouncement{ @@ -351,7 +351,7 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globa } } - if parent := t.parent.Load(); p.port == parent || globalUpdate { + if parent, ok := t.parent.Load().(types.SwitchPortID); !ok || p.port == parent || globalUpdate { oldcoords, oldroot := t.Coords(), t.Root().RootPublicKey t.parent.Store(p.port) newcoords, newroot := t.Coords(), t.Root().RootPublicKey From b4ae9d9f7f7897c5960b73cc32f90416bd1c326d Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 20 Aug 2021 14:35:29 +0100 Subject: [PATCH 04/22] Other bug fixes --- router/tree.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router/tree.go b/router/tree.go index 9f0d3442..666ca04e 100644 --- a/router/tree.go +++ b/router/tree.go @@ -365,7 +365,7 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globa } } - if parent := t.parent.Load(); p.port == parent { + if parent, ok := t.parent.Load().(types.SwitchPortID); ok && p.port == parent { t.advertise() t.rootReset <- struct{}{} } From 919be9776a19694dde47c95c22cc3c9587d17bed Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Mon, 23 Aug 2021 13:03:47 +0100 Subject: [PATCH 05/22] Tweaks --- router/tree.go | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/router/tree.go b/router/tree.go index 666ca04e..1dd06b38 100644 --- a/router/tree.go +++ b/router/tree.go @@ -68,7 +68,6 @@ func newSpanningTree(r *Router, f func(parent types.SwitchPortID, coords types.S callback: f, } t.becomeRoot() - t.advertise() go t.workerForRoot() go t.workerForAnnouncements() return t @@ -115,7 +114,6 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { func (t *spanningTree) selectNewParent() { t.updateMutex.Lock() - t.becomeRoot() bestDist := math.MaxInt32 bestKey := t.r.public var bestTime time.Time @@ -160,6 +158,8 @@ func (t *spanningTree) selectNewParent() { if err := t.Update(t.r.ports[bestPort], bestAnn.SwitchAnnouncement, true); err != nil { t.r.log.Println("t.Update: %w", err) } + } else { + t.becomeRoot() } } @@ -181,6 +181,7 @@ func (t *spanningTree) becomeRoot() { if !t.Coords().EqualTo(newCoords) { go t.callback(0, types.SwitchPorts{}) } + t.advertise() } func (t *spanningTree) workerForAnnouncements() { @@ -249,7 +250,7 @@ func (t *spanningTree) Parent() types.SwitchPortID { return 0 } -func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globalUpdate bool) error { +func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updateParent bool) error { sigs := make(map[string]struct{}) isLoopbackUpdate := false for index, sig := range newUpdate.Signatures { @@ -299,6 +300,7 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globa } } + lastGlobalUpdate := t.Root() if err := p.updateAnnouncement(&newUpdate); err != nil { return fmt.Errorf("p.updateAnnouncement: %w", err) } @@ -312,8 +314,7 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globa t.updateMutex.Lock() defer t.updateMutex.Unlock() - if !globalUpdate { - lastGlobalUpdate := t.Root() + if !updateParent { globalKeyDelta := newUpdate.RootPublicKey.CompareTo(lastGlobalUpdate.RootPublicKey) globalTimeSince := time.Since(lastGlobalUpdate.at) @@ -321,7 +322,7 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globa case globalTimeSince > announcementTimeout: // We haven't seen a suitable root update recently so we'll accept // this one instead - globalUpdate = true + updateParent = true case globalKeyDelta < 0: // The root key is weaker than our existing root, so it's no good @@ -340,19 +341,19 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, globa case len(newUpdate.Signatures) < len(lastGlobalUpdate.Signatures): // The path to the root is shorter than our last update, so // we'll accept it - globalUpdate = true + updateParent = true } case globalKeyDelta > 0: // The root key is stronger than our existing root, therefore we'll // accept it anyway, since we always want to converge on the // strongest root key - globalUpdate = true + updateParent = true } } - if parent, ok := t.parent.Load().(types.SwitchPortID); !ok || p.port == parent || globalUpdate { - oldcoords, oldroot := t.Coords(), t.Root().RootPublicKey + if parent, ok := t.parent.Load().(types.SwitchPortID); !ok || p.port == parent || updateParent { + oldcoords, oldroot := lastGlobalUpdate.Coords(), lastGlobalUpdate.RootPublicKey t.parent.Store(p.port) newcoords, newroot := t.Coords(), t.Root().RootPublicKey From b8a32eed5e04a10912c1c40e433b3905f81dd8a5 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Mon, 23 Aug 2021 17:37:52 +0100 Subject: [PATCH 06/22] Some light refactoring to see if we can improve de-parenting performance (although still not functioning as well as I'd like...) --- router/peer.go | 10 +++++++++ router/tree.go | 61 +++++++++++++++++++++++++++----------------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/router/peer.go b/router/peer.go index 55c8531f..5eccf550 100644 --- a/router/peer.go +++ b/router/peer.go @@ -43,6 +43,7 @@ type Peer struct { port types.SwitchPortID // started atomic.Bool // worker goroutines started? alive atomic.Bool // have we received a handshake? + child atomic.Bool // is this node a child of ours? mutex sync.RWMutex // protects everything below this line zone string // peertype int // @@ -126,6 +127,13 @@ func (p *Peer) updateAnnouncement(new *types.SwitchAnnouncement) error { at: time.Now(), } p.coords = coords + isChild := false + for _, sig := range new.Signatures { + if sig.PublicKey.EqualTo(p.r.public) { + isChild = true + } + } + p.child.Store(isChild) return nil } @@ -146,6 +154,7 @@ func (p *Peer) start() error { return errors.New("switch peer is already started") } p.alive.Store(false) + p.child.Store(false) go p.reader(p.context) go p.writer(p.context) return nil @@ -156,6 +165,7 @@ func (p *Peer) stop() error { return errors.New("switch peer is already stopped") } p.alive.Store(false) + p.child.Store(false) p.cancel() _ = p.conn.Close() return nil diff --git a/router/tree.go b/router/tree.go index 1dd06b38..f0a8a6db 100644 --- a/router/tree.go +++ b/router/tree.go @@ -107,7 +107,7 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { t.becomeRoot() return } - if parent, ok := t.parent.Load().(types.SwitchPortID); !ok || parent == port { + if parent := t.parent.Load().(types.SwitchPortID); parent == port { t.selectNewParent() } } @@ -122,6 +122,9 @@ func (t *spanningTree) selectNewParent() { var bestSeq types.Varu64 portsToCheck := map[*Peer]*rootAnnouncementWithTime{} for _, p := range t.r.activePorts() { + if p.child.Load() { + continue + } ann := p.lastAnnouncement() if ann == nil { continue @@ -180,6 +183,7 @@ func (t *spanningTree) becomeRoot() { newCoords := types.SwitchPorts{} if !t.Coords().EqualTo(newCoords) { go t.callback(0, types.SwitchPorts{}) + defer t.r.snake.rootNodeChanged(t.r.public) } t.advertise() } @@ -252,7 +256,6 @@ func (t *spanningTree) Parent() types.SwitchPortID { func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updateParent bool) error { sigs := make(map[string]struct{}) - isLoopbackUpdate := false for index, sig := range newUpdate.Signatures { if index == 0 && sig.PublicKey != newUpdate.RootPublicKey { // The first signature in the announcement must be from the @@ -272,15 +275,6 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updat // trying to replay someone else's announcement to us. return fmt.Errorf("rejecting update (last signature must be from peer)") } - if sig.PublicKey.EqualTo(t.r.public) { - // A child update is one that contains our public key in the - // signatures already - it's probably one of our direct peers - // sending our root announcement back to us. In this case there - // is some special behaviour: we usually will need to accept - // the update on the port, but we don't want to do anything that - // would influence root or coordinate changes. - isLoopbackUpdate = true - } pk := hex.EncodeToString(sig.PublicKey[:]) if _, ok := sigs[pk]; ok { // One of the signatures has appeared in the update more than @@ -300,14 +294,29 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updat } } - lastGlobalUpdate := t.Root() + lastParentUpdate := t.Root() if err := p.updateAnnouncement(&newUpdate); err != nil { return fmt.Errorf("p.updateAnnouncement: %w", err) } - if isLoopbackUpdate { - // The update contains our own signature already, so using it for - // a root update would create a loop + if lastPortUpdate != nil { + // Check if anything strange has happened to our parent that might + // indicate a problem with their parent, like a sudden lengthening + // of the path or a root key change altogether. + if parent := t.parent.Load().(types.SwitchPortID); p.port == parent { + switch { + case newUpdate.RootPublicKey.CompareTo(lastPortUpdate.RootPublicKey) < 0: // the root key got weaker + t.selectNewParent() + return nil + + case len(newUpdate.Signatures)-len(lastPortUpdate.Signatures) > 0: // the path got longer + t.selectNewParent() + return nil + } + } + } + + if p.child.Load() { return nil } @@ -315,36 +324,34 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updat defer t.updateMutex.Unlock() if !updateParent { - globalKeyDelta := newUpdate.RootPublicKey.CompareTo(lastGlobalUpdate.RootPublicKey) - globalTimeSince := time.Since(lastGlobalUpdate.at) + keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) switch { - case globalTimeSince > announcementTimeout: + case time.Since(lastParentUpdate.at) > announcementTimeout: // We haven't seen a suitable root update recently so we'll accept // this one instead updateParent = true - case globalKeyDelta < 0: - // The root key is weaker than our existing root, so it's no good + case keyDeltaSinceLastParentUpdate < 0: + // The root key is weaker than our existing root so ignore it. return nil - case globalKeyDelta == 0: + case keyDeltaSinceLastParentUpdate == 0: // The update is from the same root node, let's see if it matches // any other useful conditions - switch { - case newUpdate.Sequence < lastGlobalUpdate.Sequence: + case newUpdate.Sequence < lastParentUpdate.Sequence: // This is a replay of an earlier update, therefore we should // ignore it, even if it came from our parent node return nil - case len(newUpdate.Signatures) < len(lastGlobalUpdate.Signatures): + case len(newUpdate.Signatures) < len(lastParentUpdate.Signatures): // The path to the root is shorter than our last update, so // we'll accept it updateParent = true } - case globalKeyDelta > 0: + case keyDeltaSinceLastParentUpdate > 0: // The root key is stronger than our existing root, therefore we'll // accept it anyway, since we always want to converge on the // strongest root key @@ -353,7 +360,7 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updat } if parent, ok := t.parent.Load().(types.SwitchPortID); !ok || p.port == parent || updateParent { - oldcoords, oldroot := lastGlobalUpdate.Coords(), lastGlobalUpdate.RootPublicKey + oldcoords, oldroot := lastParentUpdate.Coords(), lastParentUpdate.RootPublicKey t.parent.Store(p.port) newcoords, newroot := t.Coords(), t.Root().RootPublicKey @@ -364,9 +371,7 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updat if oldroot != newroot { defer t.r.snake.rootNodeChanged(newroot) } - } - if parent, ok := t.parent.Load().(types.SwitchPortID); ok && p.port == parent { t.advertise() t.rootReset <- struct{}{} } From 199423cec6e91c2b4018d76c872ec171e9bba9a5 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Wed, 1 Sep 2021 11:03:40 +0100 Subject: [PATCH 07/22] Detect ancestor parent changes, tweaks --- router/nexthop_virtualsnake.go | 4 ++-- router/tree.go | 8 +++++--- types/announcement.go | 7 +++++++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index 361468de..043ee2ce 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -372,7 +372,7 @@ func (t *virtualSnake) getVirtualSnakeTeardownNextHop(from *Peer, rx *types.Fram } if asc := t.ascending(); asc != nil && t.r.public.EqualTo(rx.DestinationKey) && asc.PathID == teardown.PathID { t.setAscending(nil) - defer time.AfterFunc(time.Millisecond*500, t.bootstrapNow) + defer time.AfterFunc(time.Second, t.bootstrapNow) } t.tableMutex.Lock() defer t.tableMutex.Unlock() @@ -584,7 +584,7 @@ func (t *virtualSnake) handleSetup(from *Peer, rx *types.Frame, nextHops types.S // Did the setup hit a dead end on the way to the ascending node? if nextHops.EqualTo(types.SwitchPorts{0}) || nextHops.EqualTo(types.SwitchPorts{}) { - if !rx.DestinationKey.EqualTo(t.r.public) || !rx.Destination.EqualTo(t.r.Coords()) { + if !rx.DestinationKey.EqualTo(t.r.public) { t.teardownPath(rx.SourceKey, setup.PathID, from.port, false, fmt.Errorf("rejecting setup (hit dead end)")) return fmt.Errorf("setup for %q (%s) en route to %q %s hit dead end at %s", rx.SourceKey, hex.EncodeToString(setup.PathID[:]), rx.DestinationKey, rx.Destination, t.r.Coords()) } diff --git a/router/tree.go b/router/tree.go index f0a8a6db..4f4e4d6d 100644 --- a/router/tree.go +++ b/router/tree.go @@ -305,11 +305,13 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updat // of the path or a root key change altogether. if parent := t.parent.Load().(types.SwitchPortID); p.port == parent { switch { + case newUpdate.AncestorParent() != lastPortUpdate.AncestorParent(): // our ancestor's parent changed + fallthrough + case newUpdate.RootPublicKey.CompareTo(lastPortUpdate.RootPublicKey) < 0: // the root key got weaker - t.selectNewParent() - return nil + fallthrough - case len(newUpdate.Signatures)-len(lastPortUpdate.Signatures) > 0: // the path got longer + case len(newUpdate.Signatures) > len(lastPortUpdate.Signatures): // the path got suddenly longer t.selectNewParent() return nil } diff --git a/types/announcement.go b/types/announcement.go index 8382e604..162c38c8 100644 --- a/types/announcement.go +++ b/types/announcement.go @@ -109,3 +109,10 @@ func (a *SwitchAnnouncement) PeerCoords(public PublicKey) (SwitchPorts, error) { } return coords, nil } + +func (a *SwitchAnnouncement) AncestorParent() PublicKey { + if len(a.Signatures) < 2 { + return a.RootPublicKey + } + return a.Signatures[len(a.Signatures)-2].PublicKey +} From 3e28429bc2009901bcb9bd49def64a74ee6511dd Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Thu, 2 Sep 2021 17:50:37 +0100 Subject: [PATCH 08/22] Various refactoring, fixes and tweaks that might hopefully make parent selection less horrible --- router/dht.go | 4 +- router/nexthop_greedy.go | 7 +- router/nexthop_source.go | 2 +- router/nexthop_virtualsnake.go | 4 - router/peer.go | 45 ++++--- router/router.go | 15 +-- router/tree.go | 229 +++++++++++++++++++-------------- 7 files changed, 168 insertions(+), 138 deletions(-) diff --git a/router/dht.go b/router/dht.go index 0f46edd7..6ecd8eff 100644 --- a/router/dht.go +++ b/router/dht.go @@ -31,7 +31,7 @@ import ( type dhtEntry interface { PublicKey() types.PublicKey Coordinates() types.SwitchPorts - SeenRecently() bool + Alive() bool } type dht struct { @@ -59,7 +59,7 @@ func (d *dht) table() []dhtEntry { results := make([]dhtEntry, 0, len(d.sorted)) for _, n := range d.sorted { - if n.SeenRecently() { + if n.Alive() { results = append(results, n) } } diff --git a/router/nexthop_greedy.go b/router/nexthop_greedy.go index a38959a6..a4d6d76b 100644 --- a/router/nexthop_greedy.go +++ b/router/nexthop_greedy.go @@ -58,11 +58,12 @@ func (r *Router) getGreedyRoutedNextHop(from *Peer, rx *types.Frame) types.Switc peerCoords := p.Coordinates() peerDist := int64(peerCoords.DistanceTo(rx.Destination)) switch { - case peerDist == 0: - return []types.SwitchPortID{p.port} - case rx.Destination.EqualTo(peerCoords): + case peerDist == 0 || rx.Destination.EqualTo(peerCoords): + // The peer is the actual destination. return []types.SwitchPortID{p.port} + case peerDist < bestDist: + // The peer is closer to the destination. bestPeer, bestDist = p.port, peerDist default: } diff --git a/router/nexthop_source.go b/router/nexthop_source.go index 140f5f91..a652c4f1 100644 --- a/router/nexthop_source.go +++ b/router/nexthop_source.go @@ -51,7 +51,7 @@ func (r *Router) getSourceRoutedNextHop(from *Peer, rx *types.Frame) types.Switc return nil } - if peer := r.ports[to]; !peer.started.Load() || !peer.alive.Load() { + if peer := r.ports[to]; !peer.started.Load() || !peer.Alive() { // Don't try to send packets to a port that has nothing // connected to it or isn't alive. return nil diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index 043ee2ce..949d256e 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -203,10 +203,6 @@ func (t *virtualSnake) rootNodeChanged(root types.PublicKey) { } } -func (t *virtualSnake) portWasConnected(port types.SwitchPortID) { - // time.AfterFunc(time.Second, t.bootstrapNow) -} - // portWasDisconnected is called by the router when a peer disconnects // allowing us to clean up the virtual snake state. func (t *virtualSnake) portWasDisconnected(port types.SwitchPortID) { diff --git a/router/peer.go b/router/peer.go index 5eccf550..f4930b02 100644 --- a/router/peer.go +++ b/router/peer.go @@ -18,6 +18,7 @@ import ( "bytes" "context" "encoding/binary" + "encoding/hex" "errors" "fmt" "io" @@ -42,7 +43,6 @@ type Peer struct { r *Router // port types.SwitchPortID // started atomic.Bool // worker goroutines started? - alive atomic.Bool // have we received a handshake? child atomic.Bool // is this node a child of ours? mutex sync.RWMutex // protects everything below this line zone string // @@ -77,6 +77,16 @@ func (s *peerStatistics) reset() { s.rxTraffic.Store(0) } +func (p *Peer) IsParent() bool { + return p.r.tree.Parent() == p.port +} + +func (p *Peer) Alive() bool { + p.mutex.RLock() + defer p.mutex.RUnlock() + return p.announcement != nil && time.Since(p.announcement.at) < announcementTimeout +} + func (p *Peer) PublicKey() types.PublicKey { p.mutex.RLock() defer p.mutex.RUnlock() @@ -90,7 +100,7 @@ func (p *Peer) Coordinates() types.SwitchPorts { } func (p *Peer) SeenCommonRootRecently() bool { - if !p.alive.Load() { + if !p.Alive() { return false } last := p.lastAnnouncement() @@ -99,14 +109,10 @@ func (p *Peer) SeenCommonRootRecently() bool { } lpk := last.RootPublicKey rpk := p.r.RootPublicKey() - return lpk == rpk -} - -func (p *Peer) SeenRecently() bool { - if last := p.lastAnnouncement(); last != nil { - return true + if lpk != rpk { + fmt.Println("Root disagreement", lpk, "vs", rpk) } - return false + return lpk == rpk } func (p *Peer) updateAnnouncement(new *types.SwitchAnnouncement) error { @@ -114,14 +120,10 @@ func (p *Peer) updateAnnouncement(new *types.SwitchAnnouncement) error { defer p.mutex.Unlock() coords, err := new.PeerCoords(p.public) if err != nil { - p.alive.Store(false) p.announcement = nil p.coords = nil return fmt.Errorf("new.PeerCoords: %w", err) } - if p.alive.CAS(false, true) { - p.r.snake.portWasConnected(p.port) - } p.announcement = &rootAnnouncementWithTime{ SwitchAnnouncement: *new, at: time.Now(), @@ -131,6 +133,7 @@ func (p *Peer) updateAnnouncement(new *types.SwitchAnnouncement) error { for _, sig := range new.Signatures { if sig.PublicKey.EqualTo(p.r.public) { isChild = true + break } } p.child.Store(isChild) @@ -153,10 +156,11 @@ func (p *Peer) start() error { if !p.started.CAS(false, true) { return errors.New("switch peer is already started") } - p.alive.Store(false) + p.announcement = nil p.child.Store(false) go p.reader(p.context) go p.writer(p.context) + p.r.active.Store(hex.EncodeToString(p.public[:])+p.zone, p.port) return nil } @@ -164,7 +168,16 @@ func (p *Peer) stop() error { if !p.started.CAS(true, false) { return errors.New("switch peer is already stopped") } - p.alive.Store(false) + p.r.active.Delete(hex.EncodeToString(p.public[:]) + p.zone) + p.mutex.Lock() + p.peertype = 0 + p.zone = "" + p.public = types.PublicKey{} + p.coords = nil + p.announcement = nil + p.protoOut.reset() + p.trafficOut.reset() + p.mutex.Unlock() p.child.Store(false) p.cancel() _ = p.conn.Close() @@ -265,7 +278,7 @@ func (p *Peer) reader(ctx context.Context) { for _, port := range p.getNextHops(frame, p.port) { // Ignore ports that are not good candidates. dest := p.r.ports[port] - if !dest.started.Load() || (dest.port != 0 && !dest.alive.Load()) { + if !dest.started.Load() || (dest.port != 0 && !dest.Alive()) { continue } if p.port != 0 && dest.port != 0 { diff --git a/router/router.go b/router/router.go index 7f8d12bf..a483d82f 100644 --- a/router/router.go +++ b/router/router.go @@ -314,7 +314,7 @@ func (r *Router) activePorts() peers { peers := make(peers, 0, PortCount) for _, p := range r.startedPorts() { switch { - case !p.alive.Load(): + case !p.Alive(): continue default: peers = append(peers, p) @@ -411,7 +411,6 @@ func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, pee if err := r.ports[i].start(); err != nil { return 0, fmt.Errorf("port.start: %w", err) } - r.active.Store(hex.EncodeToString(public[:])+zone, i) if i != 0 { r.dht.insertNode(r.ports[i]) } @@ -438,16 +437,6 @@ func (r *Router) Disconnect(i types.SwitchPortID, err error) error { if stoperr := r.ports[i].stop(); stoperr != nil { return fmt.Errorf("port.stop: %w", stoperr) } - r.active.Delete(hex.EncodeToString(r.ports[i].public[:]) + r.ports[i].zone) - r.ports[i].mutex.Lock() - r.ports[i].peertype = 0 - r.ports[i].zone = "" - r.ports[i].public = types.PublicKey{} - r.ports[i].coords = nil - r.ports[i].announcement = nil - r.ports[i].protoOut.reset() - r.ports[i].trafficOut.reset() - r.ports[i].mutex.Unlock() if r.ports[i].port != 0 { r.dht.deleteNode(r.ports[i].public) } @@ -488,7 +477,7 @@ func (r *Router) IsConnected(key types.PublicKey, zone string) bool { return false } port := v.(types.SwitchPortID) - return r.ports[port].started.Load() + return r.ports[port].Alive() } // PeerInfo is a gomobile-friendly type that represents a peer diff --git a/router/tree.go b/router/tree.go index 4f4e4d6d..bf771739 100644 --- a/router/tree.go +++ b/router/tree.go @@ -17,13 +17,12 @@ package router import ( "context" "encoding/hex" - "fmt" "math" "sync" - "sync/atomic" "time" "github.com/matrix-org/pinecone/types" + "go.uber.org/atomic" ) // announcementInterval is the frequency at which this @@ -36,12 +35,56 @@ const announcementInterval = time.Minute * 15 const announcementTimeout = announcementInterval * 2 func (r *Router) handleAnnouncement(peer *Peer, rx *types.Frame) { - var new types.SwitchAnnouncement - if _, err := new.UnmarshalBinary(rx.Payload); err != nil { + var newUpdate types.SwitchAnnouncement + if _, err := newUpdate.UnmarshalBinary(rx.Payload); err != nil { r.log.Println("Error unmarshalling announcement:", err) return } - if err := r.tree.Update(peer, new, false); err != nil { + sigs := make(map[string]struct{}) + for index, sig := range newUpdate.Signatures { + if index == 0 && sig.PublicKey != newUpdate.RootPublicKey { + // The first signature in the announcement must be from the + // key that claims to be the root. + return + } + if sig.Hop == 0 { + // None of the hops in the update should have a port number of 0 + // as this would imply that another node has sent their router + // port, which is impossible. We'll therefore reject any update + // that tries to do that. + return + } + if index == len(newUpdate.Signatures)-1 && peer.PublicKey() != sig.PublicKey { + // The last signature in the announcement must be from the + // direct peer. If it isn't then it sounds like someone is + // trying to replay someone else's announcement to us. + return + } + pk := hex.EncodeToString(sig.PublicKey[:]) + if _, ok := sigs[pk]; ok { + // One of the signatures has appeared in the update more than + // once, which would suggest that there's a loop somewhere. + return + } + sigs[pk] = struct{}{} + } + + lastPortUpdate := peer.lastAnnouncement() + if lastPortUpdate != nil && lastPortUpdate.RootPublicKey == newUpdate.RootPublicKey { + if newUpdate.Sequence < lastPortUpdate.Sequence { + // The update is a replay of a previous announcement which doesn't + // make sense + peer.cancel() + return + } + } + + if err := peer.updateAnnouncement(&newUpdate); err != nil { + r.log.Println("Error updating announcement on port", peer.port, ":", err) + return + } + + if err := r.tree.Update(peer, newUpdate, false); err != nil { r.log.Println("Error handling announcement on port", peer.port, ":", err) } } @@ -74,11 +117,7 @@ func newSpanningTree(r *Router, f func(parent types.SwitchPortID, coords types.S } func (t *spanningTree) Coords() types.SwitchPorts { - parent, ok := t.parent.Load().(types.SwitchPortID) - if !ok { - return types.SwitchPorts{} - } - ann := t.r.ports[parent].lastAnnouncement() + ann := t.r.ports[t.Parent()].lastAnnouncement() if ann == nil { return types.SwitchPorts{} } @@ -86,9 +125,8 @@ func (t *spanningTree) Coords() types.SwitchPorts { } func (t *spanningTree) Ancestors() ([]types.PublicKey, types.SwitchPortID) { - root := t.Root() - port, ok := t.parent.Load().(types.SwitchPortID) - if !ok || port == 0 { + root, port := t.Root(), t.Parent() + if port == 0 { return nil, 0 } ancestors := make([]types.PublicKey, 0, 1+len(root.Signatures)) @@ -107,19 +145,46 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { t.becomeRoot() return } - if parent := t.parent.Load().(types.SwitchPortID); parent == port { - t.selectNewParent() + if t.Parent() == port { + t.selectNewParentAndAdvertise() + } +} + +func (t *spanningTree) selectNewParentAndAdvertise() { + lastParentUpdate := t.Root() + lastParentPort := t.Parent() + + t.selectNewParent() + + newParentUpdate := t.Root() + newParentPort := t.Parent() + + switch { + case lastParentUpdate.RootPublicKey != newParentUpdate.RootPublicKey: + fallthrough + + case lastParentUpdate.AncestorParent() != newParentUpdate.AncestorParent(): + fallthrough + + case lastParentPort != newParentPort: + t.advertise() + t.rootReset <- struct{}{} } } func (t *spanningTree) selectNewParent() { t.updateMutex.Lock() + defer t.updateMutex.Unlock() + + lastParentUpdate := t.Root() + bestDist := math.MaxInt32 bestKey := t.r.public var bestTime time.Time var bestPort types.SwitchPortID var bestAnn *rootAnnouncementWithTime var bestSeq types.Varu64 + portsToCheck := map[*Peer]*rootAnnouncementWithTime{} for _, p := range t.r.activePorts() { if p.child.Load() { @@ -131,6 +196,7 @@ func (t *spanningTree) selectNewParent() { } portsToCheck[p] = ann } + checkWithCondition := func(f func(ann *rootAnnouncementWithTime, hops int) bool) { for p, ann := range portsToCheck { hops := len(ann.Signatures) @@ -144,6 +210,7 @@ func (t *spanningTree) selectNewParent() { } } } + checkWithCondition(func(ann *rootAnnouncementWithTime, hops int) bool { return ann.RootPublicKey.CompareTo(bestKey) > 0 }) @@ -156,10 +223,17 @@ func (t *spanningTree) selectNewParent() { checkWithCondition(func(ann *rootAnnouncementWithTime, hops int) bool { return ann.RootPublicKey.CompareTo(bestKey) == 0 && ann.Sequence == bestSeq && hops == bestDist && ann.at.Before(bestTime) }) - t.updateMutex.Unlock() - if bestAnn != nil { - if err := t.Update(t.r.ports[bestPort], bestAnn.SwitchAnnouncement, true); err != nil { - t.r.log.Println("t.Update: %w", err) + + if bestAnn != nil && bestAnn.RootPublicKey.CompareTo(t.r.public) > 0 { + t.parent.Store(bestPort) + newcoords, newroot := t.Coords(), t.Root().RootPublicKey + + if t.callback != nil && !lastParentUpdate.Coords().EqualTo(newcoords) { + defer t.callback(bestPort, newcoords) + } + + if newroot != lastParentUpdate.RootPublicKey { + defer t.r.snake.rootNodeChanged(newroot) } } else { t.becomeRoot() @@ -215,7 +289,7 @@ func (t *spanningTree) workerForRoot() { case <-time.After(announcementTimeout): if !t.IsRoot() { - t.selectNewParent() + t.selectNewParentAndAdvertise() } } } @@ -255,55 +329,24 @@ func (t *spanningTree) Parent() types.SwitchPortID { } func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updateParent bool) error { - sigs := make(map[string]struct{}) - for index, sig := range newUpdate.Signatures { - if index == 0 && sig.PublicKey != newUpdate.RootPublicKey { - // The first signature in the announcement must be from the - // key that claims to be the root. - return fmt.Errorf("rejecting update (first signature must be from root)") - } - if sig.Hop == 0 { - // None of the hops in the update should have a port number of 0 - // as this would imply that another node has sent their router - // port, which is impossible. We'll therefore reject any update - // that tries to do that. - return fmt.Errorf("rejecting update (invalid 0 hop)") - } - if index == len(newUpdate.Signatures)-1 && p.PublicKey() != sig.PublicKey { - // The last signature in the announcement must be from the - // direct peer. If it isn't then it sounds like someone is - // trying to replay someone else's announcement to us. - return fmt.Errorf("rejecting update (last signature must be from peer)") - } - pk := hex.EncodeToString(sig.PublicKey[:]) - if _, ok := sigs[pk]; ok { - // One of the signatures has appeared in the update more than - // once, which would suggest that there's a loop somewhere. - return fmt.Errorf("rejecting update (detected routing loop)") - } - sigs[pk] = struct{}{} - } - - lastPortUpdate := p.lastAnnouncement() - if lastPortUpdate != nil && lastPortUpdate.RootPublicKey == newUpdate.RootPublicKey { - if newUpdate.Sequence < lastPortUpdate.Sequence { - // The update has a lower sequence number than our last update - // on this port from this root. This shouldn't happen, but if it - // does, then just drop the update. - return nil - } - } - - lastParentUpdate := t.Root() - if err := p.updateAnnouncement(&newUpdate); err != nil { - return fmt.Errorf("p.updateAnnouncement: %w", err) + if newUpdate.RootPublicKey.CompareTo(t.Root().RootPublicKey) < 0 { + // The update that this peer sent us, for some reason, has a weaker + // root than the one that we know about. Send an announcement back + // to them with our stronger root and do nothing further with the + // update. If our own root timed out then t.Root() will return an + // update with our own key, so it's equivalent to seeing if our key + // is stronger than theirs. + p.announce <- struct{}{} + return nil } - if lastPortUpdate != nil { + if p.IsParent() { // Check if anything strange has happened to our parent that might // indicate a problem with their parent, like a sudden lengthening - // of the path or a root key change altogether. - if parent := t.parent.Load().(types.SwitchPortID); p.port == parent { + // of the path or a root key change altogether. If these cases happen + // then it might be that the parent we have is no longer the best one + // so we should try and select a new one. + if lastPortUpdate := p.lastAnnouncement(); lastPortUpdate != nil { switch { case newUpdate.AncestorParent() != lastPortUpdate.AncestorParent(): // our ancestor's parent changed fallthrough @@ -312,68 +355,56 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updat fallthrough case len(newUpdate.Signatures) > len(lastPortUpdate.Signatures): // the path got suddenly longer - t.selectNewParent() + t.selectNewParentAndAdvertise() return nil } } } - if p.child.Load() { - return nil - } - - t.updateMutex.Lock() - defer t.updateMutex.Unlock() - if !updateParent { + // At this point we're looking to see if there are any reasons + // that we should select a new parent. By this point, the peer's + // update has already been saved, so positive cases here will mean + // that we may end up switching to this node. + lastParentUpdate := t.Root() keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) switch { case time.Since(lastParentUpdate.at) > announcementTimeout: - // We haven't seen a suitable root update recently so we'll accept - // this one instead + // We haven't seen a suitable root update recently from our parent, + // so try and select a new parent. updateParent = true case keyDeltaSinceLastParentUpdate < 0: - // The root key is weaker than our existing root so ignore it. - return nil + // The root key is weaker than our existing root so there's no point + // in attempting parent selection. + + case keyDeltaSinceLastParentUpdate > 0: + // The root key is stronger than our existing root, therefore this + // is likely a good candidate to be a new parent. + updateParent = true case keyDeltaSinceLastParentUpdate == 0: // The update is from the same root node, let's see if it matches - // any other useful conditions + // any other useful conditions. switch { case newUpdate.Sequence < lastParentUpdate.Sequence: - // This is a replay of an earlier update, therefore we should - // ignore it, even if it came from our parent node - return nil + // This is a replay of an earlier update, therefore we should + // ignore it, even if it came from our parent node case len(newUpdate.Signatures) < len(lastParentUpdate.Signatures): // The path to the root is shorter than our last update, so // we'll accept it updateParent = true } - - case keyDeltaSinceLastParentUpdate > 0: - // The root key is stronger than our existing root, therefore we'll - // accept it anyway, since we always want to converge on the - // strongest root key - updateParent = true } } - if parent, ok := t.parent.Load().(types.SwitchPortID); !ok || p.port == parent || updateParent { - oldcoords, oldroot := lastParentUpdate.Coords(), lastParentUpdate.RootPublicKey - t.parent.Store(p.port) - newcoords, newroot := t.Coords(), t.Root().RootPublicKey - - if t.callback != nil && !oldcoords.EqualTo(newcoords) { - go t.callback(p.port, newcoords) - } - - if oldroot != newroot { - defer t.r.snake.rootNodeChanged(newroot) - } + switch { + case updateParent: + t.selectNewParentAndAdvertise() + case p.IsParent(): t.advertise() t.rootReset <- struct{}{} } From fd385ef02ec0b7daff9747ee911782fe3d2a0dd7 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 3 Sep 2021 15:23:59 +0100 Subject: [PATCH 09/22] Simplify parent selection again (extra iterations are ultimately unnecessary) --- router/nexthop_virtualsnake.go | 9 +- router/peer.go | 3 - router/tree.go | 183 +++++++++++++++------------------ 3 files changed, 90 insertions(+), 105 deletions(-) diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index 949d256e..2af676bc 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -24,6 +24,7 @@ import ( "github.com/matrix-org/pinecone/types" "github.com/matrix-org/pinecone/util" + "go.uber.org/atomic" ) const virtualSnakeNeighExpiryPeriod = time.Minute * 30 @@ -36,6 +37,7 @@ type virtualSnake struct { ascendingMutex sync.RWMutex _descending *virtualSnakeNeighbour descendingMutex sync.RWMutex + bootstrapping atomic.Bool } type virtualSnakeIndex struct { @@ -145,13 +147,14 @@ func (t *virtualSnake) maintain() { // predefined interval, but for now we'll continue to send // them on a regular interval until we can derive some better // connection state. - if bootstrapNow { + if bootstrapNow && t.bootstrapping.CAS(false, true) { t.bootstrapNow() } } } func (t *virtualSnake) bootstrapNow() { + t.bootstrapping.Store(false) if t.r.IsRoot() { return } @@ -180,7 +183,9 @@ func (t *virtualSnake) bootstrapNow() { func (t *virtualSnake) rootNodeChanged(root types.PublicKey) { if asc := t.ascending(); asc != nil && !asc.RootPublicKey.EqualTo(root) { t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("root changed and asc no longer matches")) - defer t.bootstrapNow() + if t.bootstrapping.CAS(false, true) { + time.AfterFunc(time.Second, t.bootstrapNow) + } } if desc := t.descending(); desc != nil && !desc.RootPublicKey.EqualTo(root) { t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("root changed and desc no longer matches")) diff --git a/router/peer.go b/router/peer.go index f4930b02..bad97f7a 100644 --- a/router/peer.go +++ b/router/peer.go @@ -109,9 +109,6 @@ func (p *Peer) SeenCommonRootRecently() bool { } lpk := last.RootPublicKey rpk := p.r.RootPublicKey() - if lpk != rpk { - fmt.Println("Root disagreement", lpk, "vs", rpk) - } return lpk == rpk } diff --git a/router/tree.go b/router/tree.go index bf771739..86d6576f 100644 --- a/router/tree.go +++ b/router/tree.go @@ -84,7 +84,7 @@ func (r *Router) handleAnnouncement(peer *Peer, rx *types.Frame) { return } - if err := r.tree.Update(peer, newUpdate, false); err != nil { + if err := r.tree.Update(peer, newUpdate); err != nil { r.log.Println("Error handling announcement on port", peer.port, ":", err) } } @@ -100,6 +100,7 @@ type spanningTree struct { rootReset chan struct{} // updateMutex sync.Mutex // parent atomic.Value // types.SwitchPortID + reparenting atomic.Bool // callback func(parent types.SwitchPortID, coords types.SwitchPorts) } @@ -145,12 +146,14 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { t.becomeRoot() return } - if t.Parent() == port { + if t.Parent() == port && t.reparenting.CAS(false, true) { t.selectNewParentAndAdvertise() } } func (t *spanningTree) selectNewParentAndAdvertise() { + t.reparenting.Store(false) + lastParentUpdate := t.Root() lastParentPort := t.Parent() @@ -168,7 +171,6 @@ func (t *spanningTree) selectNewParentAndAdvertise() { case lastParentPort != newParentPort: t.advertise() - t.rootReset <- struct{}{} } } @@ -176,16 +178,14 @@ func (t *spanningTree) selectNewParent() { t.updateMutex.Lock() defer t.updateMutex.Unlock() - lastParentUpdate := t.Root() - + lastUpdate := t.Root() bestDist := math.MaxInt32 bestKey := t.r.public - var bestTime time.Time + bestTime := t.Root().at var bestPort types.SwitchPortID var bestAnn *rootAnnouncementWithTime var bestSeq types.Varu64 - portsToCheck := map[*Peer]*rootAnnouncementWithTime{} for _, p := range t.r.activePorts() { if p.child.Load() { continue @@ -194,45 +194,50 @@ func (t *spanningTree) selectNewParent() { if ann == nil { continue } - portsToCheck[p] = ann - } - - checkWithCondition := func(f func(ann *rootAnnouncementWithTime, hops int) bool) { - for p, ann := range portsToCheck { - hops := len(ann.Signatures) - if f(ann, hops) { - bestKey = ann.RootPublicKey - bestDist = hops - bestPort = p.port - bestTime = ann.at - bestSeq = ann.Sequence - bestAnn = ann - } + if time.Since(ann.at) > announcementTimeout { + continue + } + accept := func() { + bestKey = ann.RootPublicKey + bestDist = len(ann.Signatures) + bestPort = p.port + bestTime = ann.at + bestSeq = ann.Sequence + bestAnn = ann + } + annAt := ann.at.Round(time.Millisecond) + keyDelta := ann.RootPublicKey.CompareTo(bestKey) + switch { + case keyDelta > 0: + accept() + case keyDelta < 0: + // ignore + case ann.Sequence > bestSeq: + accept() + case ann.Sequence < bestSeq: + // ignore + case annAt.Before(bestTime): + accept() + case annAt.After(bestTime): + // ignore + case len(ann.Signatures) < bestDist: + accept() + case len(ann.Signatures) > bestDist: + // ignore + case p.public.CompareTo(bestKey) > 0: + accept() } } - checkWithCondition(func(ann *rootAnnouncementWithTime, hops int) bool { - return ann.RootPublicKey.CompareTo(bestKey) > 0 - }) - checkWithCondition(func(ann *rootAnnouncementWithTime, hops int) bool { - return ann.RootPublicKey.CompareTo(bestKey) == 0 && ann.Sequence > bestSeq - }) - checkWithCondition(func(ann *rootAnnouncementWithTime, hops int) bool { - return ann.RootPublicKey.CompareTo(bestKey) == 0 && ann.Sequence == bestSeq && hops < bestDist - }) - checkWithCondition(func(ann *rootAnnouncementWithTime, hops int) bool { - return ann.RootPublicKey.CompareTo(bestKey) == 0 && ann.Sequence == bestSeq && hops == bestDist && ann.at.Before(bestTime) - }) - if bestAnn != nil && bestAnn.RootPublicKey.CompareTo(t.r.public) > 0 { t.parent.Store(bestPort) newcoords, newroot := t.Coords(), t.Root().RootPublicKey - if t.callback != nil && !lastParentUpdate.Coords().EqualTo(newcoords) { + if t.callback != nil && !lastUpdate.Coords().EqualTo(newcoords) { defer t.callback(bestPort, newcoords) } - if newroot != lastParentUpdate.RootPublicKey { + if newroot != lastUpdate.RootPublicKey { defer t.r.snake.rootNodeChanged(newroot) } } else { @@ -288,7 +293,7 @@ func (t *spanningTree) workerForRoot() { case <-t.rootReset: case <-time.After(announcementTimeout): - if !t.IsRoot() { + if !t.IsRoot() && t.reparenting.CAS(false, true) { t.selectNewParentAndAdvertise() } } @@ -328,85 +333,63 @@ func (t *spanningTree) Parent() types.SwitchPortID { return 0 } -func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement, updateParent bool) error { - if newUpdate.RootPublicKey.CompareTo(t.Root().RootPublicKey) < 0 { - // The update that this peer sent us, for some reason, has a weaker - // root than the one that we know about. Send an announcement back - // to them with our stronger root and do nothing further with the - // update. If our own root timed out then t.Root() will return an - // update with our own key, so it's equivalent to seeing if our key - // is stronger than theirs. - p.announce <- struct{}{} - return nil - } - - if p.IsParent() { - // Check if anything strange has happened to our parent that might - // indicate a problem with their parent, like a sudden lengthening - // of the path or a root key change altogether. If these cases happen - // then it might be that the parent we have is no longer the best one - // so we should try and select a new one. - if lastPortUpdate := p.lastAnnouncement(); lastPortUpdate != nil { - switch { - case newUpdate.AncestorParent() != lastPortUpdate.AncestorParent(): // our ancestor's parent changed - fallthrough - - case newUpdate.RootPublicKey.CompareTo(lastPortUpdate.RootPublicKey) < 0: // the root key got weaker - fallthrough - - case len(newUpdate.Signatures) > len(lastPortUpdate.Signatures): // the path got suddenly longer - t.selectNewParentAndAdvertise() - return nil - } +func (t *spanningTree) strongestRootKeyOfPeers() types.PublicKey { + strongest := t.r.public + for _, p := range t.r.activePorts() { + if p.announcement.RootPublicKey.CompareTo(strongest) > 0 { + strongest = p.announcement.RootPublicKey } } + return strongest +} - if !updateParent { - // At this point we're looking to see if there are any reasons - // that we should select a new parent. By this point, the peer's - // update has already been saved, so positive cases here will mean - // that we may end up switching to this node. - lastParentUpdate := t.Root() - keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) +func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement) error { + updateParent := false + lastParentUpdate := t.Root() + keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) + keyDeltaFromStrongestRootKeyOfPeers := newUpdate.RootPublicKey.CompareTo(t.strongestRootKeyOfPeers()) - switch { - case time.Since(lastParentUpdate.at) > announcementTimeout: - // We haven't seen a suitable root update recently from our parent, - // so try and select a new parent. - updateParent = true + switch { + case time.Since(lastParentUpdate.at) > announcementTimeout: + updateParent = true + + case keyDeltaFromStrongestRootKeyOfPeers < 0 && p.port == t.Parent(): + updateParent = true - case keyDeltaSinceLastParentUpdate < 0: - // The root key is weaker than our existing root so there's no point - // in attempting parent selection. + case keyDeltaSinceLastParentUpdate != 0: + updateParent = true - case keyDeltaSinceLastParentUpdate > 0: - // The root key is stronger than our existing root, therefore this - // is likely a good candidate to be a new parent. + case keyDeltaSinceLastParentUpdate == 0: + switch { + case newUpdate.Sequence < lastParentUpdate.Sequence: + // Ignore an update with an old sequence number + + case len(newUpdate.Signatures) != len(lastParentUpdate.Signatures): updateParent = true - case keyDeltaSinceLastParentUpdate == 0: - // The update is from the same root node, let's see if it matches - // any other useful conditions. - switch { - case newUpdate.Sequence < lastParentUpdate.Sequence: - // This is a replay of an earlier update, therefore we should - // ignore it, even if it came from our parent node - - case len(newUpdate.Signatures) < len(lastParentUpdate.Signatures): - // The path to the root is shorter than our last update, so - // we'll accept it - updateParent = true + default: + for i := 0; i < len(newUpdate.Signatures); i++ { + if newUpdate.Signatures[i] != lastParentUpdate.Signatures[i] { + updateParent = true + break + } } } } switch { case updateParent: - t.selectNewParentAndAdvertise() + if t.reparenting.CAS(false, true) { + time.AfterFunc(time.Second, t.selectNewParentAndAdvertise) + } case p.IsParent(): - t.advertise() - t.rootReset <- struct{}{} + if newUpdate.Sequence >= lastParentUpdate.Sequence { + t.advertise() + } + if newUpdate.Sequence > lastParentUpdate.Sequence { + t.rootReset <- struct{}{} + } } return nil From 58332b45f5b99464c93af5c0676fc8371c338627 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 3 Sep 2021 16:18:15 +0100 Subject: [PATCH 10/22] Reuse port numbers if possible, bug fixes --- router/peer.go | 4 ++-- router/router.go | 61 ++++++++++++++++++++++++++++++------------------ router/tree.go | 8 +++---- 3 files changed, 44 insertions(+), 29 deletions(-) diff --git a/router/peer.go b/router/peer.go index bad97f7a..407b77ec 100644 --- a/router/peer.go +++ b/router/peer.go @@ -153,11 +153,11 @@ func (p *Peer) start() error { if !p.started.CAS(false, true) { return errors.New("switch peer is already started") } + defer p.r.active.Store(hex.EncodeToString(p.public[:])+p.zone, p.port) p.announcement = nil p.child.Store(false) go p.reader(p.context) go p.writer(p.context) - p.r.active.Store(hex.EncodeToString(p.public[:])+p.zone, p.port) return nil } @@ -165,7 +165,7 @@ func (p *Peer) stop() error { if !p.started.CAS(true, false) { return errors.New("switch peer is already stopped") } - p.r.active.Delete(hex.EncodeToString(p.public[:]) + p.zone) + defer p.r.active.Delete(hex.EncodeToString(p.public[:]) + p.zone) p.mutex.Lock() p.peertype = 0 p.zone = "" diff --git a/router/router.go b/router/router.go index a483d82f..bb061df6 100644 --- a/router/router.go +++ b/router/router.go @@ -384,11 +384,46 @@ func (r *Router) AuthenticatedConnect(conn net.Conn, zone string, peertype int) // port number that the node was connected to will be // returned in the event of a successful connection. func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, peertype int) (types.SwitchPortID, error) { + var reusePort types.SwitchPortID if p, ok := r.active.Load(hex.EncodeToString(public[:]) + zone); ok { - _ = r.Disconnect(p.(types.SwitchPortID), fmt.Errorf("another incoming connection from the same node")) + if err := r.Disconnect(p.(types.SwitchPortID), fmt.Errorf("another incoming connection from the same node")); err == nil { + reusePort = p.(types.SwitchPortID) + } + } + usePort := func(port *Peer) error { + port.mutex.Lock() + port.context, port.cancel = context.WithCancel(r.context) + port.zone = zone + port.peertype = peertype + port.conn = conn // util.NewBufferedRWCSize(conn, MaxFrameSize) + port.public = public + port.coords = nil + port.protoOut.reset() + port.trafficOut.reset() + port.announcement = nil + port.statistics.reset() + port.mutex.Unlock() + if err := port.start(); err != nil { + return fmt.Errorf("port.start: %w", err) + } + if port.port != 0 { + r.dht.insertNode(port) + } + if r.simulator != nil { + r.simulator.ReportNewLink(conn, r.public, public) + } + go r.callbacks.onConnected(port.port, public, peertype) + r.log.Printf("Connected port %d to %s (zone %q)\n", port.port, conn.RemoteAddr(), zone) + return nil } r.connections.Lock() defer r.connections.Unlock() + if reusePort != 0 { + if err := usePort(r.ports[reusePort]); err != nil { + return 0, fmt.Errorf("error reusing port: %w", err) + } + return reusePort, nil + } for i := types.SwitchPortID(0); i < PortCount; i++ { if i != 0 && bytes.Equal(r.public[:], public[:]) { return 0, fmt.Errorf("loopback connection prohibited") @@ -396,29 +431,9 @@ func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, pee if r.ports[i].started.Load() { continue } - r.ports[i].mutex.Lock() - r.ports[i].context, r.ports[i].cancel = context.WithCancel(r.context) - r.ports[i].zone = zone - r.ports[i].peertype = peertype - r.ports[i].conn = conn // util.NewBufferedRWCSize(conn, MaxFrameSize) - r.ports[i].public = public - r.ports[i].coords = nil - r.ports[i].protoOut.reset() - r.ports[i].trafficOut.reset() - r.ports[i].announcement = nil - r.ports[i].statistics.reset() - r.ports[i].mutex.Unlock() - if err := r.ports[i].start(); err != nil { - return 0, fmt.Errorf("port.start: %w", err) - } - if i != 0 { - r.dht.insertNode(r.ports[i]) - } - if r.simulator != nil { - r.simulator.ReportNewLink(conn, r.public, public) + if err := usePort(r.ports[i]); err != nil { + return 0, fmt.Errorf("error allocating new port: %w", err) } - go r.callbacks.onConnected(i, public, peertype) - r.log.Printf("Connected port %d to %s (zone %q)\n", i, conn.RemoteAddr(), zone) return i, nil } return 0, fmt.Errorf("no free switch ports") diff --git a/router/tree.go b/router/tree.go index 86d6576f..6aa0331d 100644 --- a/router/tree.go +++ b/router/tree.go @@ -216,14 +216,14 @@ func (t *spanningTree) selectNewParent() { accept() case ann.Sequence < bestSeq: // ignore - case annAt.Before(bestTime): - accept() - case annAt.After(bestTime): - // ignore case len(ann.Signatures) < bestDist: accept() case len(ann.Signatures) > bestDist: // ignore + case annAt.Before(bestTime): + accept() + case annAt.After(bestTime): + // ignore case p.public.CompareTo(bestKey) > 0: accept() } From ae12ac4a78cf5b21ab8e4e6245c21bc4cf0d8980 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Mon, 6 Sep 2021 17:57:11 +0100 Subject: [PATCH 11/22] Well this isn't an improvement at all --- cmd/pineconeip/main.go | 38 ++++++++++- cmd/pineconesim/simulator/links.go | 3 + cmd/pineconesim/simulator/nodes.go | 3 + router/nexthop.go | 4 +- router/nexthop_greedy.go | 1 + router/nexthop_virtualsnake.go | 7 +- router/peer.go | 105 ++++++++++------------------- router/queue.go | 12 ---- router/queuefifo.go | 81 ++++++++++------------ router/router.go | 12 +--- router/tree.go | 82 +++++++--------------- 11 files changed, 148 insertions(+), 200 deletions(-) delete mode 100644 router/queue.go diff --git a/cmd/pineconeip/main.go b/cmd/pineconeip/main.go index f807fe74..608207e2 100644 --- a/cmd/pineconeip/main.go +++ b/cmd/pineconeip/main.go @@ -21,6 +21,10 @@ import ( "log" "net" "os" + "os/signal" + "runtime/pprof" + "syscall" + "time" "net/http" _ "net/http/pprof" @@ -52,9 +56,15 @@ func main() { logger := log.New(os.Stdout, "", 0) if hostPort := os.Getenv("PPROFLISTEN"); hostPort != "" { - logger.Println("Starting pprof on", hostPort) go func() { - _ = http.ListenAndServe(hostPort, nil) + listener, err := net.Listen("tcp", hostPort) + if err != nil { + panic(err) + } + logger.Println("Starting pprof on", listener.Addr()) + if err := http.Serve(listener, nil); err != nil { + panic(err) + } }() } @@ -133,5 +143,27 @@ func main() { } }() - select {} + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGUSR1) + for { + switch <-sigs { + case syscall.SIGUSR1: + fn := fmt.Sprintf("/tmp/profile.%d", os.Getpid()) + logger.Println("Requested profile:", fn) + fp, err := os.Create(fn) + if err != nil { + logger.Println("Failed to create profile:", err) + return + } + defer fp.Close() + if err := pprof.StartCPUProfile(fp); err != nil { + logger.Println("Failed to start profiling:", err) + return + } + time.AfterFunc(time.Second*10, func() { + pprof.StopCPUProfile() + logger.Println("Profile written:", fn) + }) + } + } } diff --git a/cmd/pineconesim/simulator/links.go b/cmd/pineconesim/simulator/links.go index ab327768..c7401b8f 100644 --- a/cmd/pineconesim/simulator/links.go +++ b/cmd/pineconesim/simulator/links.go @@ -47,6 +47,9 @@ func (sim *Simulator) ConnectNodes(a, b string) error { if err := c.SetNoDelay(true); err != nil { panic(err) } + if err := c.SetLinger(0); err != nil { + panic(err) + } sc := &util.SlowConn{ Conn: c, //ReadDelay: 5 * time.Millisecond, diff --git a/cmd/pineconesim/simulator/nodes.go b/cmd/pineconesim/simulator/nodes.go index d46a5057..e3f54819 100644 --- a/cmd/pineconesim/simulator/nodes.go +++ b/cmd/pineconesim/simulator/nodes.go @@ -67,6 +67,9 @@ func (sim *Simulator) CreateNode(t string) error { if err := c.SetNoDelay(true); err != nil { panic(err) } + if err := c.SetLinger(0); err != nil { + panic(err) + } if _, err = n.AuthenticatedConnect(c, "sim", router.PeerTypeRemote); err != nil { continue } diff --git a/router/nexthop.go b/router/nexthop.go index 62de91c2..80b199ad 100644 --- a/router/nexthop.go +++ b/router/nexthop.go @@ -31,7 +31,9 @@ func (p *Peer) getNextHops(frame *types.Frame, from types.SwitchPortID) types.Sw switch frame.Type { case types.TypeSTP: if from != 0 { - p.r.handleAnnouncement(p, frame) + if err := p.r.handleAnnouncement(p, frame); err != nil { + p.r.log.Println("Failed to handle announcement:", err) + } } case types.TypeVirtualSnakeBootstrap: diff --git a/router/nexthop_greedy.go b/router/nexthop_greedy.go index a4d6d76b..6d46eb39 100644 --- a/router/nexthop_greedy.go +++ b/router/nexthop_greedy.go @@ -65,6 +65,7 @@ func (r *Router) getGreedyRoutedNextHop(from *Peer, rx *types.Frame) types.Switc case peerDist < bestDist: // The peer is closer to the destination. bestPeer, bestDist = p.port, peerDist + default: } } diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index 2af676bc..c762b7c7 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -314,8 +314,11 @@ func (t *virtualSnake) getVirtualSnakeNextHop(from *Peer, destKey types.PublicKe newCheckedCandidate(ancestor, parentPort) } + // The next section needs us to check direct peers + activePorts := t.r.activePorts() + // Check our direct peers ancestors - for _, peer := range t.r.activePorts() { + for _, peer := range activePorts { peerAnn := peer.lastAnnouncement() if peerAnn == nil { continue @@ -327,7 +330,7 @@ func (t *virtualSnake) getVirtualSnakeNextHop(from *Peer, destKey types.PublicKe } // Check our direct peers - for _, peer := range t.r.activePorts() { + for _, peer := range activePorts { peerKey := peer.PublicKey() switch { case bestKey.EqualTo(peerKey): diff --git a/router/peer.go b/router/peer.go index 407b77ec..740b9fc0 100644 --- a/router/peer.go +++ b/router/peer.go @@ -30,7 +30,7 @@ import ( "go.uber.org/atomic" ) -const PeerKeepaliveInterval = time.Second * 2 +const PeerKeepaliveInterval = time.Second * 3 const PeerKeepaliveTimeout = PeerKeepaliveInterval * 3 const ( @@ -51,14 +51,30 @@ type Peer struct { cancel context.CancelFunc // conn net.Conn // underlying connection to peer public types.PublicKey // - trafficOut queue // queue traffic message to peer - protoOut queue // queue protocol message to peer + trafficOut *lifoQueue // queue traffic message to peer + protoOut *fifoQueue // queue protocol message to peer coords types.SwitchPorts // - announce chan struct{} // announcement *rootAnnouncementWithTime // statistics peerStatistics // } +func (p *Peer) reset() { + p.mutex.Lock() + defer p.mutex.Unlock() + p.started.Store(false) + p.child.Store(false) + p.zone = "" + p.peertype = 0 + p.context, p.cancel = nil, nil + //p.conn = nil + p.public = types.PublicKey{} + p.trafficOut.reset() + p.protoOut.reset() + p.coords = nil + p.announcement = nil + p.statistics.reset() +} + type peerStatistics struct { txProtoSuccessful atomic.Uint64 txProtoDropped atomic.Uint64 @@ -153,9 +169,7 @@ func (p *Peer) start() error { if !p.started.CAS(false, true) { return errors.New("switch peer is already started") } - defer p.r.active.Store(hex.EncodeToString(p.public[:])+p.zone, p.port) - p.announcement = nil - p.child.Store(false) + p.r.active.Store(hex.EncodeToString(p.public[:])+p.zone, p.port) go p.reader(p.context) go p.writer(p.context) return nil @@ -167,17 +181,10 @@ func (p *Peer) stop() error { } defer p.r.active.Delete(hex.EncodeToString(p.public[:]) + p.zone) p.mutex.Lock() - p.peertype = 0 - p.zone = "" - p.public = types.PublicKey{} - p.coords = nil - p.announcement = nil - p.protoOut.reset() - p.trafficOut.reset() - p.mutex.Unlock() - p.child.Store(false) - p.cancel() _ = p.conn.Close() + p.cancel() + p.mutex.Unlock() + p.reset() return nil } @@ -346,28 +353,19 @@ func (p *Peer) writer(ctx context.Context) { if !bytes.Equal(buf[:4], types.FrameMagicBytes) { panic("expected magic bytes") } - remaining := buf[:fn] - for len(remaining) > 0 { - n, err := p.conn.Write(remaining) - if err != nil { - _ = p.r.Disconnect(p.port, fmt.Errorf("p.conn.Write: %w", err)) - return - } - remaining = remaining[n:] + if _, err = p.conn.Write(buf[:fn]); err != nil { + _ = p.r.Disconnect(p.port, fmt.Errorf("p.conn.Write: %w", err)) + return } } - // The very first thing we send should be a tree announcement, - // so that the remote side can work out our coords and consider - // us to be "alive". - send(p.generateAnnouncement()) - for { select { case <-ctx.Done(): return - case <-p.announce: - send(p.generateAnnouncement()) + case frame := <-p.protoOut.pop(): + p.protoOut.ack() + send(frame) p.statistics.txProtoSuccessful.Inc() continue default: @@ -375,35 +373,11 @@ func (p *Peer) writer(ctx context.Context) { select { case <-ctx.Done(): return - case <-p.announce: - send(p.generateAnnouncement()) + case frame := <-p.protoOut.pop(): + p.protoOut.ack() + send(frame) p.statistics.txProtoSuccessful.Inc() continue - case <-p.protoOut.wait(): - if frame, ok := p.protoOut.pop(); ok { - send(frame) - p.statistics.txProtoSuccessful.Inc() - } else { - p.statistics.txProtoDropped.Inc() - } - continue - default: - } - select { - case <-ctx.Done(): - return - case <-p.announce: - send(p.generateAnnouncement()) - p.statistics.txProtoSuccessful.Inc() - continue - case <-p.protoOut.wait(): - if frame, ok := p.protoOut.pop(); ok { - send(frame) - p.statistics.txProtoSuccessful.Inc() - } else { - p.statistics.txProtoDropped.Inc() - } - continue case <-p.trafficOut.wait(): if frame, ok := p.trafficOut.pop(); ok { send(frame) @@ -417,18 +391,11 @@ func (p *Peer) writer(ctx context.Context) { select { case <-ctx.Done(): return - case <-p.announce: - send(p.generateAnnouncement()) + case frame := <-p.protoOut.pop(): + p.protoOut.ack() + send(frame) p.statistics.txProtoSuccessful.Inc() continue - case <-p.protoOut.wait(): - if frame, ok := p.protoOut.pop(); ok { - send(frame) - p.statistics.txProtoSuccessful.Inc() - } else { - p.statistics.txProtoDropped.Inc() - } - continue case <-p.trafficOut.wait(): if frame, ok := p.trafficOut.pop(); ok { send(frame) diff --git a/router/queue.go b/router/queue.go deleted file mode 100644 index a2219a14..00000000 --- a/router/queue.go +++ /dev/null @@ -1,12 +0,0 @@ -package router - -import "github.com/matrix-org/pinecone/types" - -type queue interface { - queuecount() int - queuesize() int - push(frame *types.Frame) bool - pop() (*types.Frame, bool) - reset() - wait() <-chan struct{} -} diff --git a/router/queuefifo.go b/router/queuefifo.go index 0bcb4381..b46a6732 100644 --- a/router/queuefifo.go +++ b/router/queuefifo.go @@ -7,85 +7,74 @@ import ( ) type fifoQueue struct { - frames []*types.Frame - count int - mutex sync.Mutex - notifs chan struct{} + entries []chan *types.Frame + mutex sync.Mutex } func newFIFOQueue() *fifoQueue { - q := &fifoQueue{ - notifs: make(chan struct{}), - } + q := &fifoQueue{} + q.reset() return q } +func (q *fifoQueue) _initialise() { + for i := range q.entries { + q.entries[i] = nil + } + q.entries = []chan *types.Frame{ + make(chan *types.Frame, 1), + } +} + func (q *fifoQueue) queuecount() int { q.mutex.Lock() defer q.mutex.Unlock() - return q.count + return len(q.entries) } func (q *fifoQueue) queuesize() int { q.mutex.Lock() defer q.mutex.Unlock() - return cap(q.frames) + return cap(q.entries) } func (q *fifoQueue) push(frame *types.Frame) bool { q.mutex.Lock() defer q.mutex.Unlock() - q.frames = append(q.frames, frame) - q.count++ - select { - case q.notifs <- struct{}{}: - default: + if len(q.entries) == 0 { + q._initialise() } + ch := q.entries[len(q.entries)-1] + ch <- frame + close(ch) + q.entries = append(q.entries, make(chan *types.Frame, 1)) return true } -func (q *fifoQueue) pop() (*types.Frame, bool) { +func (q *fifoQueue) reset() { q.mutex.Lock() defer q.mutex.Unlock() - if q.count == 0 { - return nil, false - } - frame := q.frames[0] - q.frames[0] = nil - q.frames = q.frames[1:] - q.count-- - if q.count == 0 { - // Force a GC of the underlying array, since it might have - // grown significantly if the queue was hammered for some reason - q.frames = nil + for _, ch := range q.entries { + frame := <-ch + if frame != nil { + frame.Done() + } } - return frame, true + q._initialise() } -func (q *fifoQueue) reset() { +func (q *fifoQueue) pop() <-chan *types.Frame { q.mutex.Lock() defer q.mutex.Unlock() - q.count = 0 - for i := range q.frames { - if q.frames[i] != nil { - q.frames[i].Done() - q.frames[i] = nil - } - } - q.frames = nil - close(q.notifs) - for range q.notifs { + if len(q.entries) == 0 { + q._initialise() } - q.notifs = make(chan struct{}) + entry := q.entries[0] + return entry } -func (q *fifoQueue) wait() <-chan struct{} { +func (q *fifoQueue) ack() { q.mutex.Lock() defer q.mutex.Unlock() - if q.count > 0 { - ch := make(chan struct{}) - close(ch) - return ch - } - return q.notifs + q.entries = q.entries[1:] } diff --git a/router/router.go b/router/router.go index bb061df6..3e8e913c 100644 --- a/router/router.go +++ b/router/router.go @@ -110,7 +110,6 @@ func NewRouter(log *log.Logger, id string, private ed25519.PrivateKey, public ed sw.ports[i] = &Peer{ r: sw, port: types.SwitchPortID(i), - announce: make(chan struct{}), protoOut: newFIFOQueue(), trafficOut: newLIFOQueue(TrafficBufferSize), } @@ -313,10 +312,7 @@ func (r *Router) startedPorts() peers { func (r *Router) activePorts() peers { peers := make(peers, 0, PortCount) for _, p := range r.startedPorts() { - switch { - case !p.Alive(): - continue - default: + if p.Alive() { peers = append(peers, p) } } @@ -397,12 +393,8 @@ func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, pee port.peertype = peertype port.conn = conn // util.NewBufferedRWCSize(conn, MaxFrameSize) port.public = public - port.coords = nil - port.protoOut.reset() - port.trafficOut.reset() - port.announcement = nil - port.statistics.reset() port.mutex.Unlock() + port.protoOut.push(port.generateAnnouncement()) if err := port.start(); err != nil { return fmt.Errorf("port.start: %w", err) } diff --git a/router/tree.go b/router/tree.go index 6aa0331d..24848818 100644 --- a/router/tree.go +++ b/router/tree.go @@ -17,6 +17,7 @@ package router import ( "context" "encoding/hex" + "fmt" "math" "sync" "time" @@ -34,37 +35,36 @@ const announcementInterval = time.Minute * 15 // will assume that the peer is dead. const announcementTimeout = announcementInterval * 2 -func (r *Router) handleAnnouncement(peer *Peer, rx *types.Frame) { +func (r *Router) handleAnnouncement(peer *Peer, rx *types.Frame) error { var newUpdate types.SwitchAnnouncement if _, err := newUpdate.UnmarshalBinary(rx.Payload); err != nil { - r.log.Println("Error unmarshalling announcement:", err) - return + return fmt.Errorf("failed to unmarshal root announcement: %w", err) } sigs := make(map[string]struct{}) for index, sig := range newUpdate.Signatures { if index == 0 && sig.PublicKey != newUpdate.RootPublicKey { // The first signature in the announcement must be from the // key that claims to be the root. - return + return fmt.Errorf("root announcement first signature is not from the root node") } if sig.Hop == 0 { // None of the hops in the update should have a port number of 0 // as this would imply that another node has sent their router // port, which is impossible. We'll therefore reject any update // that tries to do that. - return + return fmt.Errorf("root announcement contains an invalid port number") } if index == len(newUpdate.Signatures)-1 && peer.PublicKey() != sig.PublicKey { // The last signature in the announcement must be from the // direct peer. If it isn't then it sounds like someone is // trying to replay someone else's announcement to us. - return + return fmt.Errorf("root announcement last signature is not from the direct peer") } pk := hex.EncodeToString(sig.PublicKey[:]) if _, ok := sigs[pk]; ok { // One of the signatures has appeared in the update more than // once, which would suggest that there's a loop somewhere. - return + return fmt.Errorf("root announcement contains a routing loop") } sigs[pk] = struct{}{} } @@ -74,19 +74,16 @@ func (r *Router) handleAnnouncement(peer *Peer, rx *types.Frame) { if newUpdate.Sequence < lastPortUpdate.Sequence { // The update is a replay of a previous announcement which doesn't // make sense - peer.cancel() - return + return fmt.Errorf("root announcement is a replay of an old sequence number") } } if err := peer.updateAnnouncement(&newUpdate); err != nil { - r.log.Println("Error updating announcement on port", peer.port, ":", err) - return + return fmt.Errorf("updating the peer last root announcement failed: %w", err) } - if err := r.tree.Update(peer, newUpdate); err != nil { - r.log.Println("Error handling announcement on port", peer.port, ":", err) - } + r.tree.UpdateParentIfNeeded(peer, newUpdate) + return nil } type rootAnnouncementWithTime struct { @@ -205,25 +202,25 @@ func (t *spanningTree) selectNewParent() { bestSeq = ann.Sequence bestAnn = ann } - annAt := ann.at.Round(time.Millisecond) keyDelta := ann.RootPublicKey.CompareTo(bestKey) + annAt := ann.at.Round(time.Millisecond) switch { case keyDelta > 0: accept() case keyDelta < 0: - // ignore + // ignore weaker root keys case ann.Sequence > bestSeq: accept() case ann.Sequence < bestSeq: - // ignore - case len(ann.Signatures) < bestDist: - accept() - case len(ann.Signatures) > bestDist: - // ignore + // ignore lower sequence numbers case annAt.Before(bestTime): accept() case annAt.After(bestTime): - // ignore + // ignore updates that arrived more recently + case len(ann.Signatures) < bestDist: + accept() + case len(ann.Signatures) > bestDist: + // ignore longer paths case p.public.CompareTo(bestKey) > 0: accept() } @@ -247,13 +244,7 @@ func (t *spanningTree) selectNewParent() { func (t *spanningTree) advertise() { for _, p := range t.r.startedPorts() { - go func(p *Peer) { - select { - case <-p.context.Done(): - case <-time.After(announcementTimeout): - case p.announce <- struct{}{}: - } - }(p) + p.protoOut.push(p.generateAnnouncement()) } } @@ -333,29 +324,15 @@ func (t *spanningTree) Parent() types.SwitchPortID { return 0 } -func (t *spanningTree) strongestRootKeyOfPeers() types.PublicKey { - strongest := t.r.public - for _, p := range t.r.activePorts() { - if p.announcement.RootPublicKey.CompareTo(strongest) > 0 { - strongest = p.announcement.RootPublicKey - } - } - return strongest -} - -func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement) error { +func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnouncement) { updateParent := false lastParentUpdate := t.Root() keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) - keyDeltaFromStrongestRootKeyOfPeers := newUpdate.RootPublicKey.CompareTo(t.strongestRootKeyOfPeers()) switch { case time.Since(lastParentUpdate.at) > announcementTimeout: updateParent = true - case keyDeltaFromStrongestRootKeyOfPeers < 0 && p.port == t.Parent(): - updateParent = true - case keyDeltaSinceLastParentUpdate != 0: updateParent = true @@ -366,31 +343,22 @@ func (t *spanningTree) Update(p *Peer, newUpdate types.SwitchAnnouncement) error case len(newUpdate.Signatures) != len(lastParentUpdate.Signatures): updateParent = true - - default: - for i := 0; i < len(newUpdate.Signatures); i++ { - if newUpdate.Signatures[i] != lastParentUpdate.Signatures[i] { - updateParent = true - break - } - } } } switch { case updateParent: if t.reparenting.CAS(false, true) { - time.AfterFunc(time.Second, t.selectNewParentAndAdvertise) + t.selectNewParentAndAdvertise() } case p.IsParent(): if newUpdate.Sequence >= lastParentUpdate.Sequence { t.advertise() } - if newUpdate.Sequence > lastParentUpdate.Sequence { - t.rootReset <- struct{}{} - } } - return nil + if newUpdate.Sequence > lastParentUpdate.Sequence { + t.rootReset <- struct{}{} + } } From 30d8b9f13d2e810d08b2a289e62752afe39cc8ba Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Tue, 7 Sep 2021 14:13:24 +0100 Subject: [PATCH 12/22] Refactor announcements a bit --- router/peer.go | 33 ------------------- router/router.go | 4 +-- router/tree.go | 83 +++++++++++++++++++++++++++++++++--------------- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/router/peer.go b/router/peer.go index 740b9fc0..34581346 100644 --- a/router/peer.go +++ b/router/peer.go @@ -195,39 +195,6 @@ func (p *Peer) generateKeepalive() *types.Frame { return frame } -func (p *Peer) generateAnnouncement() *types.Frame { - if p.port == 0 { - return nil - } - announcement := p.r.tree.Root() - for _, sig := range announcement.Signatures { - if p.r.public.EqualTo(sig.PublicKey) { - // For some reason the announcement that we want to send already - // includes our signature. This shouldn't really happen but if we - // did send it, other nodes would end up ignoring the announcement - // anyway since it would appear to be a routing loop. - return nil - } - } - // Sign the announcement. - if err := announcement.Sign(p.r.private[:], p.port); err != nil { - p.r.log.Println("Failed to sign switch announcement:", err) - return nil - } - var payload [MaxPayloadSize]byte - n, err := announcement.MarshalBinary(payload[:]) - if err != nil { - p.r.log.Println("Failed to marshal switch announcement:", err) - return nil - } - frame := types.GetFrame() - frame.Version = types.Version0 - frame.Type = types.TypeSTP - frame.Destination = types.SwitchPorts{} - frame.Payload = payload[:n] - return frame -} - func (p *Peer) reader(ctx context.Context) { buf := make([]byte, MaxFrameSize) for { diff --git a/router/router.go b/router/router.go index 3e8e913c..19a9f605 100644 --- a/router/router.go +++ b/router/router.go @@ -394,7 +394,7 @@ func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, pee port.conn = conn // util.NewBufferedRWCSize(conn, MaxFrameSize) port.public = public port.mutex.Unlock() - port.protoOut.push(port.generateAnnouncement()) + port.protoOut.push(r.tree.Root().ForPeer(port)) if err := port.start(); err != nil { return fmt.Errorf("port.start: %w", err) } @@ -451,8 +451,8 @@ func (r *Router) Disconnect(i types.SwitchPortID, err error) error { r.simulator.ReportDeadLink(r.public, r.ports[i].public) } r.log.Printf("Disconnected port %d: %s\n", i, err) - r.tree.portWasDisconnected(i) r.snake.portWasDisconnected(i) + r.tree.portWasDisconnected(i) go r.callbacks.onDisconnected(i, r.ports[i].public, r.ports[i].peertype, err) return nil } diff --git a/router/tree.go b/router/tree.go index 24848818..1318a753 100644 --- a/router/tree.go +++ b/router/tree.go @@ -23,7 +23,6 @@ import ( "time" "github.com/matrix-org/pinecone/types" - "go.uber.org/atomic" ) // announcementInterval is the frequency at which this @@ -91,14 +90,47 @@ type rootAnnouncementWithTime struct { at time.Time } +func (a *rootAnnouncementWithTime) ForPeer(p *Peer) *types.Frame { + if p.port == 0 { + return nil + } + announcement := a.SwitchAnnouncement + announcement.Signatures = append([]types.SignatureWithHop{}, a.Signatures...) + for _, sig := range announcement.Signatures { + if p.r.public.EqualTo(sig.PublicKey) { + // For some reason the announcement that we want to send already + // includes our signature. This shouldn't really happen but if we + // did send it, other nodes would end up ignoring the announcement + // anyway since it would appear to be a routing loop. + return nil + } + } + // Sign the announcement. + if err := announcement.Sign(p.r.private[:], p.port); err != nil { + p.r.log.Println("Failed to sign switch announcement:", err) + return nil + } + var payload [MaxPayloadSize]byte + n, err := announcement.MarshalBinary(payload[:]) + if err != nil { + p.r.log.Println("Failed to marshal switch announcement:", err) + return nil + } + frame := types.GetFrame() + frame.Version = types.Version0 + frame.Type = types.TypeSTP + frame.Destination = types.SwitchPorts{} + frame.Payload = payload[:n] + return frame +} + type spanningTree struct { - r *Router // - context context.Context // - rootReset chan struct{} // - updateMutex sync.Mutex // - parent atomic.Value // types.SwitchPortID - reparenting atomic.Bool // - callback func(parent types.SwitchPortID, coords types.SwitchPorts) + r *Router + context context.Context + rootReset chan struct{} + mutex sync.Mutex + parent types.SwitchPortID + callback func(parent types.SwitchPortID, coords types.SwitchPorts) } func newSpanningTree(r *Router, f func(parent types.SwitchPortID, coords types.SwitchPorts)) *spanningTree { @@ -143,14 +175,12 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { t.becomeRoot() return } - if t.Parent() == port && t.reparenting.CAS(false, true) { + if t.Parent() == port { t.selectNewParentAndAdvertise() } } func (t *spanningTree) selectNewParentAndAdvertise() { - t.reparenting.Store(false) - lastParentUpdate := t.Root() lastParentPort := t.Parent() @@ -172,9 +202,6 @@ func (t *spanningTree) selectNewParentAndAdvertise() { } func (t *spanningTree) selectNewParent() { - t.updateMutex.Lock() - defer t.updateMutex.Unlock() - lastUpdate := t.Root() bestDist := math.MaxInt32 bestKey := t.r.public @@ -183,6 +210,8 @@ func (t *spanningTree) selectNewParent() { var bestAnn *rootAnnouncementWithTime var bestSeq types.Varu64 + t.mutex.Lock() + for _, p := range t.r.activePorts() { if p.child.Load() { continue @@ -226,8 +255,10 @@ func (t *spanningTree) selectNewParent() { } } + t.mutex.Unlock() + if bestAnn != nil && bestAnn.RootPublicKey.CompareTo(t.r.public) > 0 { - t.parent.Store(bestPort) + t.parent = bestPort newcoords, newroot := t.Coords(), t.Root().RootPublicKey if t.callback != nil && !lastUpdate.Coords().EqualTo(newcoords) { @@ -243,18 +274,23 @@ func (t *spanningTree) selectNewParent() { } func (t *spanningTree) advertise() { + ann := t.Root() for _, p := range t.r.startedPorts() { - p.protoOut.push(p.generateAnnouncement()) + p.protoOut.push(ann.ForPeer(p)) } } func (t *spanningTree) becomeRoot() { - t.parent.Store(types.SwitchPortID(0)) + t.mutex.Lock() + t.parent = 0 + t.mutex.Unlock() + newCoords := types.SwitchPorts{} if !t.Coords().EqualTo(newCoords) { go t.callback(0, types.SwitchPorts{}) defer t.r.snake.rootNodeChanged(t.r.public) } + t.advertise() } @@ -284,7 +320,7 @@ func (t *spanningTree) workerForRoot() { case <-t.rootReset: case <-time.After(announcementTimeout): - if !t.IsRoot() && t.reparenting.CAS(false, true) { + if !t.IsRoot() { t.selectNewParentAndAdvertise() } } @@ -318,10 +354,9 @@ func (t *spanningTree) Root() *rootAnnouncementWithTime { } func (t *spanningTree) Parent() types.SwitchPortID { - if parent, ok := t.parent.Load().(types.SwitchPortID); ok { - return parent - } - return 0 + t.mutex.Lock() + defer t.mutex.Unlock() + return t.parent } func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnouncement) { @@ -348,9 +383,7 @@ func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnou switch { case updateParent: - if t.reparenting.CAS(false, true) { - t.selectNewParentAndAdvertise() - } + t.selectNewParentAndAdvertise() case p.IsParent(): if newUpdate.Sequence >= lastParentUpdate.Sequence { From 3c89e4ad717e4ab4d332b054d90a31349b1dcade Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Tue, 7 Sep 2021 15:34:39 +0100 Subject: [PATCH 13/22] Refactor peer management --- router/peer.go | 153 ++++++++++++++++++++++++++++++----------------- router/router.go | 44 +++++--------- router/tree.go | 2 +- 3 files changed, 113 insertions(+), 86 deletions(-) diff --git a/router/peer.go b/router/peer.go index 34581346..ed9b3e0d 100644 --- a/router/peer.go +++ b/router/peer.go @@ -19,7 +19,6 @@ import ( "context" "encoding/binary" "encoding/hex" - "errors" "fmt" "io" "net" @@ -42,6 +41,7 @@ const ( type Peer struct { r *Router // port types.SwitchPortID // + allocated atomic.Bool // started atomic.Bool // worker goroutines started? child atomic.Bool // is this node a child of ours? mutex sync.RWMutex // protects everything below this line @@ -58,21 +58,70 @@ type Peer struct { statistics peerStatistics // } +func (p *Peer) start() { + if !p.started.CAS(false, true) { + return + } + + var err error + ctx := p.context + index := hex.EncodeToString(p.public[:]) + p.zone + errors := make(chan error) + wg := &sync.WaitGroup{} + wg.Add(2) + + p.r.active.Store(index, p.port) + + go func() { + errors <- p.reader(wg, ctx) + }() + go func() { + errors <- p.writer(wg, ctx) + }() + + select { + case <-ctx.Done(): + case err = <-errors: + p.r.log.Println("Peer", p.port, "workers stopping due to error:", err) + ctx.Done() + } + + p.started.Store(false) + + if p.port != 0 { + p.r.dht.deleteNode(p.public) + } + if p.r.simulator != nil { + p.r.simulator.ReportDeadLink(p.r.public, p.public) + } + p.r.log.Printf("Disconnected port %d: %s\n", p.port, err) + p.r.snake.portWasDisconnected(p.port) + p.r.tree.portWasDisconnected(p.port) + go p.r.callbacks.onDisconnected(p.port, p.public, p.peertype, err) + + wg.Wait() + _ = p.conn.Close() + + p.reset() + p.r.active.Delete(index) +} + func (p *Peer) reset() { p.mutex.Lock() defer p.mutex.Unlock() + p.allocated.Store(false) p.started.Store(false) p.child.Store(false) p.zone = "" p.peertype = 0 p.context, p.cancel = nil, nil - //p.conn = nil + p.conn = nil p.public = types.PublicKey{} p.trafficOut.reset() p.protoOut.reset() + p.statistics.reset() p.coords = nil p.announcement = nil - p.statistics.reset() } type peerStatistics struct { @@ -165,26 +214,10 @@ func (p *Peer) lastAnnouncement() *rootAnnouncementWithTime { return p.announcement } -func (p *Peer) start() error { - if !p.started.CAS(false, true) { - return errors.New("switch peer is already started") - } - p.r.active.Store(hex.EncodeToString(p.public[:])+p.zone, p.port) - go p.reader(p.context) - go p.writer(p.context) - return nil -} - func (p *Peer) stop() error { - if !p.started.CAS(true, false) { - return errors.New("switch peer is already stopped") - } - defer p.r.active.Delete(hex.EncodeToString(p.public[:]) + p.zone) p.mutex.Lock() - _ = p.conn.Close() p.cancel() p.mutex.Unlock() - p.reset() return nil } @@ -195,34 +228,31 @@ func (p *Peer) generateKeepalive() *types.Frame { return frame } -func (p *Peer) reader(ctx context.Context) { +func (p *Peer) reader(wg *sync.WaitGroup, ctx context.Context) error { + defer wg.Done() buf := make([]byte, MaxFrameSize) for { select { case <-ctx.Done(): // The switch peer is shutting down. - return + return context.Canceled default: if p.port != 0 { if err := p.conn.SetReadDeadline(time.Now().Add(PeerKeepaliveTimeout)); err != nil { - _ = p.r.Disconnect(p.port, fmt.Errorf("conn.SetReadDeadline: %w", err)) - return + return fmt.Errorf("conn.SetReadDeadline: %w", err) } } if _, err := io.ReadFull(p.conn, buf[:8]); err != nil { - _ = p.r.Disconnect(p.port, fmt.Errorf("p.conn.Peek: %w", err)) - return + return fmt.Errorf("p.conn.Peek: %w", err) } if !bytes.Equal(buf[:4], types.FrameMagicBytes) { - _ = p.r.Disconnect(p.port, fmt.Errorf("missing magic bytes")) - return + return fmt.Errorf("missing magic bytes") } expecting := int(binary.BigEndian.Uint16(buf[6:8])) n, err := io.ReadFull(p.conn, buf[8:expecting]) if err != nil { - _ = p.r.Disconnect(p.port, fmt.Errorf("io.ReadFull: %w", err)) - return + return fmt.Errorf("io.ReadFull: %w", err) } if n < expecting-8 { p.r.log.Println("Expecting", expecting, "bytes but got", n, "bytes") @@ -230,16 +260,13 @@ func (p *Peer) reader(ctx context.Context) { } frame := types.GetFrame() if _, err := frame.UnmarshalBinary(buf[:n+8]); err != nil { - p.r.log.Println("Port", p.port, "error unmarshalling frame:", err) - frame.Done() - return - } - if frame.Version != types.Version0 { - p.r.log.Println("Port", p.port, "incorrect version in frame") frame.Done() - return + return fmt.Errorf("frame.UnmarshalBinary: %w", err) } - if frame.Type == types.TypeKeepalive { + switch { + case frame.Version != types.Version0: + fallthrough + case frame.Type == types.TypeKeepalive: frame.Done() continue } @@ -302,52 +329,62 @@ var bufPool = sync.Pool{ }, } -func (p *Peer) writer(ctx context.Context) { +func (p *Peer) writer(wg *sync.WaitGroup, ctx context.Context) error { + defer wg.Done() + tick := time.NewTicker(PeerKeepaliveInterval) defer tick.Stop() - send := func(frame *types.Frame) { + + send := func(frame *types.Frame) error { if frame == nil { - return + return nil } buf := bufPool.Get().([]byte) defer bufPool.Put(buf) // nolint:staticcheck fn, err := frame.MarshalBinary(buf) frame.Done() if err != nil { - p.r.log.Println("Port", p.port, "error marshalling frame:", err) - return + return nil + //return fmt.Errorf("frame.MarshalBinary: %w", err) } if !bytes.Equal(buf[:4], types.FrameMagicBytes) { - panic("expected magic bytes") + return nil + //return fmt.Errorf("expected magic bytes") + } + if err := p.conn.SetWriteDeadline(time.Now().Add(PeerKeepaliveTimeout)); err != nil { + return fmt.Errorf("p.conn.SetWriteDeadline: %w", err) } if _, err = p.conn.Write(buf[:fn]); err != nil { - _ = p.r.Disconnect(p.port, fmt.Errorf("p.conn.Write: %w", err)) - return + return fmt.Errorf("p.conn.Write: %w", err) } + return nil } for { select { case <-ctx.Done(): - return + return context.Canceled case frame := <-p.protoOut.pop(): + if err := send(frame); err != nil { + return fmt.Errorf("send: %w", err) + } p.protoOut.ack() - send(frame) p.statistics.txProtoSuccessful.Inc() continue default: } select { case <-ctx.Done(): - return + return context.Canceled case frame := <-p.protoOut.pop(): + if err := send(frame); err != nil { + return fmt.Errorf("send: %w", err) + } p.protoOut.ack() - send(frame) p.statistics.txProtoSuccessful.Inc() continue case <-p.trafficOut.wait(): - if frame, ok := p.trafficOut.pop(); ok { - send(frame) + if frame, ok := p.trafficOut.pop(); ok && send(frame) == nil { p.statistics.txTrafficSuccessful.Inc() } else { p.statistics.txTrafficDropped.Inc() @@ -357,22 +394,26 @@ func (p *Peer) writer(ctx context.Context) { } select { case <-ctx.Done(): - return + return context.Canceled case frame := <-p.protoOut.pop(): + if err := send(frame); err != nil { + return fmt.Errorf("send: %w", err) + } p.protoOut.ack() - send(frame) p.statistics.txProtoSuccessful.Inc() continue case <-p.trafficOut.wait(): - if frame, ok := p.trafficOut.pop(); ok { - send(frame) + if frame, ok := p.trafficOut.pop(); ok && send(frame) == nil { + p.statistics.txTrafficSuccessful.Inc() } else { p.statistics.txTrafficDropped.Inc() } continue case <-tick.C: - send(p.generateKeepalive()) + if err := send(p.generateKeepalive()); err != nil { + return fmt.Errorf("send: %w", err) + } p.statistics.txProtoSuccessful.Inc() continue } diff --git a/router/router.go b/router/router.go index 19a9f605..dd5a00e5 100644 --- a/router/router.go +++ b/router/router.go @@ -381,23 +381,26 @@ func (r *Router) AuthenticatedConnect(conn net.Conn, zone string, peertype int) // returned in the event of a successful connection. func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, peertype int) (types.SwitchPortID, error) { var reusePort types.SwitchPortID - if p, ok := r.active.Load(hex.EncodeToString(public[:]) + zone); ok { - if err := r.Disconnect(p.(types.SwitchPortID), fmt.Errorf("another incoming connection from the same node")); err == nil { - reusePort = p.(types.SwitchPortID) + if _, ok := r.active.Load(hex.EncodeToString(public[:]) + zone); ok { + if p, ok := r.active.Load(hex.EncodeToString(public[:]) + zone); ok { + if err := r.Disconnect(p.(types.SwitchPortID), fmt.Errorf("another incoming connection from the same node")); err != nil { + return 0, conn.Close() + } } } - usePort := func(port *Peer) error { + usePort := func(port *Peer) bool { + if !port.allocated.CAS(false, true) { + return false + } port.mutex.Lock() port.context, port.cancel = context.WithCancel(r.context) port.zone = zone port.peertype = peertype - port.conn = conn // util.NewBufferedRWCSize(conn, MaxFrameSize) + port.conn = conn port.public = public port.mutex.Unlock() port.protoOut.push(r.tree.Root().ForPeer(port)) - if err := port.start(); err != nil { - return fmt.Errorf("port.start: %w", err) - } + go port.start() if port.port != 0 { r.dht.insertNode(port) } @@ -406,27 +409,20 @@ func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, pee } go r.callbacks.onConnected(port.port, public, peertype) r.log.Printf("Connected port %d to %s (zone %q)\n", port.port, conn.RemoteAddr(), zone) - return nil + return true } r.connections.Lock() defer r.connections.Unlock() - if reusePort != 0 { - if err := usePort(r.ports[reusePort]); err != nil { - return 0, fmt.Errorf("error reusing port: %w", err) - } + if reusePort != 0 && usePort(r.ports[reusePort]) { return reusePort, nil } for i := types.SwitchPortID(0); i < PortCount; i++ { if i != 0 && bytes.Equal(r.public[:], public[:]) { return 0, fmt.Errorf("loopback connection prohibited") } - if r.ports[i].started.Load() { - continue + if usePort(r.ports[i]) { + return i, nil } - if err := usePort(r.ports[i]); err != nil { - return 0, fmt.Errorf("error allocating new port: %w", err) - } - return i, nil } return 0, fmt.Errorf("no free switch ports") } @@ -444,16 +440,6 @@ func (r *Router) Disconnect(i types.SwitchPortID, err error) error { if stoperr := r.ports[i].stop(); stoperr != nil { return fmt.Errorf("port.stop: %w", stoperr) } - if r.ports[i].port != 0 { - r.dht.deleteNode(r.ports[i].public) - } - if r.simulator != nil { - r.simulator.ReportDeadLink(r.public, r.ports[i].public) - } - r.log.Printf("Disconnected port %d: %s\n", i, err) - r.snake.portWasDisconnected(i) - r.tree.portWasDisconnected(i) - go r.callbacks.onDisconnected(i, r.ports[i].public, r.ports[i].peertype, err) return nil } diff --git a/router/tree.go b/router/tree.go index 1318a753..78e90d34 100644 --- a/router/tree.go +++ b/router/tree.go @@ -368,7 +368,7 @@ func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnou case time.Since(lastParentUpdate.at) > announcementTimeout: updateParent = true - case keyDeltaSinceLastParentUpdate != 0: + case keyDeltaSinceLastParentUpdate > 0: updateParent = true case keyDeltaSinceLastParentUpdate == 0: From d1d2d8c3eb2b4ccc22e3460bae9a0abea541377e Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Tue, 7 Sep 2021 18:13:41 +0100 Subject: [PATCH 14/22] Refactor root sequence --- router/peer.go | 7 +++++-- router/tree.go | 17 ++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/router/peer.go b/router/peer.go index ed9b3e0d..9f585650 100644 --- a/router/peer.go +++ b/router/peer.go @@ -82,7 +82,6 @@ func (p *Peer) start() { select { case <-ctx.Done(): case err = <-errors: - p.r.log.Println("Peer", p.port, "workers stopping due to error:", err) ctx.Done() } @@ -94,7 +93,11 @@ func (p *Peer) start() { if p.r.simulator != nil { p.r.simulator.ReportDeadLink(p.r.public, p.public) } - p.r.log.Printf("Disconnected port %d: %s\n", p.port, err) + if err != nil { + p.r.log.Printf("Disconnected port %d: %s\n", p.port, err) + } else { + p.r.log.Printf("Disconnected port %d\n", p.port) + } p.r.snake.portWasDisconnected(p.port) p.r.tree.portWasDisconnected(p.port) go p.r.callbacks.onDisconnected(p.port, p.public, p.peertype, err) diff --git a/router/tree.go b/router/tree.go index 78e90d34..62019d44 100644 --- a/router/tree.go +++ b/router/tree.go @@ -23,6 +23,7 @@ import ( "time" "github.com/matrix-org/pinecone/types" + "go.uber.org/atomic" ) // announcementInterval is the frequency at which this @@ -130,6 +131,7 @@ type spanningTree struct { rootReset chan struct{} mutex sync.Mutex parent types.SwitchPortID + sequence atomic.Uint64 callback func(parent types.SwitchPortID, coords types.SwitchPorts) } @@ -242,14 +244,14 @@ func (t *spanningTree) selectNewParent() { accept() case ann.Sequence < bestSeq: // ignore lower sequence numbers - case annAt.Before(bestTime): - accept() - case annAt.After(bestTime): - // ignore updates that arrived more recently case len(ann.Signatures) < bestDist: accept() case len(ann.Signatures) > bestDist: // ignore longer paths + case annAt.Before(bestTime): + accept() + case annAt.After(bestTime): + // ignore updates that arrived more recently case p.public.CompareTo(bestKey) > 0: accept() } @@ -274,6 +276,7 @@ func (t *spanningTree) selectNewParent() { } func (t *spanningTree) advertise() { + t.sequence.Inc() ann := t.Root() for _, p := range t.r.startedPorts() { p.protoOut.push(ann.ForPeer(p)) @@ -339,7 +342,7 @@ func (t *spanningTree) Root() *rootAnnouncementWithTime { at: time.Now(), SwitchAnnouncement: types.SwitchAnnouncement{ RootPublicKey: t.r.public, - Sequence: types.Varu64(time.Now().UnixNano()), + Sequence: types.Varu64(t.sequence.Load()), }, } } @@ -368,12 +371,12 @@ func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnou case time.Since(lastParentUpdate.at) > announcementTimeout: updateParent = true - case keyDeltaSinceLastParentUpdate > 0: + case keyDeltaSinceLastParentUpdate != 0: updateParent = true case keyDeltaSinceLastParentUpdate == 0: switch { - case newUpdate.Sequence < lastParentUpdate.Sequence: + case newUpdate.Sequence <= lastParentUpdate.Sequence: // Ignore an update with an old sequence number case len(newUpdate.Signatures) != len(lastParentUpdate.Signatures): From fa612d8c763a88b1e7c2ee43bd3c82997b0f8725 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Wed, 8 Sep 2021 17:55:18 +0100 Subject: [PATCH 15/22] Various tweaks --- cmd/pineconesim/page.html | 4 +- cmd/pineconesim/simulator/links.go | 3 - multicast/multicast.go | 2 +- router/nexthop_virtualsnake.go | 27 ++++--- router/peer.go | 31 +++++--- router/router.go | 122 ++++++++++++---------------- router/tree.go | 124 ++++++++++++++++++++--------- 7 files changed, 175 insertions(+), 138 deletions(-) diff --git a/cmd/pineconesim/page.html b/cmd/pineconesim/page.html index 2483df31..8f878cb9 100644 --- a/cmd/pineconesim/page.html +++ b/cmd/pineconesim/page.html @@ -125,7 +125,7 @@

Node Summary

{{range .Nodes}} {{if not .IsExternal}} - {{.Name}} + {{.Name}} {{.Port}} {{.Coords}} {{.Root}} @@ -208,7 +208,7 @@

Peers

{{range .NodeInfo.Peers}} - {{.Name}} + {{.Name}} {{.PublicKey}} {{.Port}} {{.RootPublicKey}} diff --git a/cmd/pineconesim/simulator/links.go b/cmd/pineconesim/simulator/links.go index c7401b8f..ab327768 100644 --- a/cmd/pineconesim/simulator/links.go +++ b/cmd/pineconesim/simulator/links.go @@ -47,9 +47,6 @@ func (sim *Simulator) ConnectNodes(a, b string) error { if err := c.SetNoDelay(true); err != nil { panic(err) } - if err := c.SetLinger(0); err != nil { - panic(err) - } sc := &util.SlowConn{ Conn: c, //ReadDelay: 5 * time.Millisecond, diff --git a/multicast/multicast.go b/multicast/multicast.go index a0e6e834..2029f53f 100644 --- a/multicast/multicast.go +++ b/multicast/multicast.go @@ -308,7 +308,7 @@ func (m *Multicast) listen(intf *multicastInterface, conn net.PacketConn, srcadd } if _, err := m.r.AuthenticatedConnect(peer, udpaddr.Zone, router.PeerTypeMulticast); err != nil { - m.log.Println("m.s.AuthenticatedConnect:", err) + m.log.Println("m.s.AuthenticatedConnect:", err, peer.Close()) continue } } diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index c762b7c7..d9cfc25d 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -24,7 +24,6 @@ import ( "github.com/matrix-org/pinecone/types" "github.com/matrix-org/pinecone/util" - "go.uber.org/atomic" ) const virtualSnakeNeighExpiryPeriod = time.Minute * 30 @@ -37,7 +36,7 @@ type virtualSnake struct { ascendingMutex sync.RWMutex _descending *virtualSnakeNeighbour descendingMutex sync.RWMutex - bootstrapping atomic.Bool + bootstrap *time.Timer } type virtualSnakeIndex struct { @@ -74,6 +73,8 @@ func newVirtualSnake(r *Router) *virtualSnake { r: r, table: make(virtualSnakeTable), } + snake.bootstrap = time.AfterFunc(time.Second, snake.bootstrapNow) + snake.bootstrap.Stop() go snake.maintain() return snake } @@ -147,14 +148,17 @@ func (t *virtualSnake) maintain() { // predefined interval, but for now we'll continue to send // them on a regular interval until we can derive some better // connection state. - if bootstrapNow && t.bootstrapping.CAS(false, true) { - t.bootstrapNow() + if bootstrapNow { + t.bootstrapIn(0) } } } +func (t *virtualSnake) bootstrapIn(d time.Duration) { + t.bootstrap.Reset(d) +} + func (t *virtualSnake) bootstrapNow() { - t.bootstrapping.Store(false) if t.r.IsRoot() { return } @@ -183,9 +187,7 @@ func (t *virtualSnake) bootstrapNow() { func (t *virtualSnake) rootNodeChanged(root types.PublicKey) { if asc := t.ascending(); asc != nil && !asc.RootPublicKey.EqualTo(root) { t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("root changed and asc no longer matches")) - if t.bootstrapping.CAS(false, true) { - time.AfterFunc(time.Second, t.bootstrapNow) - } + t.bootstrapIn(time.Second) } if desc := t.descending(); desc != nil && !desc.RootPublicKey.EqualTo(root) { t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("root changed and desc no longer matches")) @@ -227,6 +229,7 @@ func (t *virtualSnake) portWasDisconnected(port types.SwitchPortID) { // this port change. if asc := t.ascending(); asc != nil && asc.Port == port { t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("port teardown")) + t.bootstrapIn(time.Second) } if desc := t.descending(); desc != nil && desc.Port == port { t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("port teardown")) @@ -376,7 +379,7 @@ func (t *virtualSnake) getVirtualSnakeTeardownNextHop(from *Peer, rx *types.Fram } if asc := t.ascending(); asc != nil && t.r.public.EqualTo(rx.DestinationKey) && asc.PathID == teardown.PathID { t.setAscending(nil) - defer time.AfterFunc(time.Second, t.bootstrapNow) + t.bootstrapIn(time.Second) } t.tableMutex.Lock() defer t.tableMutex.Unlock() @@ -441,7 +444,7 @@ func (t *virtualSnake) handleBootstrap(from *Peer, rx *types.Frame) error { return fmt.Errorf("util.VerifySignedTimestamp") } if !bootstrap.RootPublicKey.EqualTo(t.r.RootPublicKey()) { - return fmt.Errorf("root key doesn't match") + return fmt.Errorf("bootstrap root key doesn't match") } bootstrapACK := types.VirtualSnakeBootstrapACK{ // nolint:gosimple PathID: bootstrap.PathID, @@ -478,7 +481,7 @@ func (t *virtualSnake) handleBootstrapACK(from *Peer, rx *types.Frame) error { return fmt.Errorf("util.VerifySignedTimestamp") } if !bootstrapACK.RootPublicKey.EqualTo(t.r.RootPublicKey()) { - return fmt.Errorf("root key doesn't match") + return fmt.Errorf("bootstrap ACK root key doesn't match") } update := false asc := t.ascending() @@ -583,7 +586,7 @@ func (t *virtualSnake) handleSetup(from *Peer, rx *types.Frame, nextHops types.S } if !setup.RootPublicKey.EqualTo(t.r.RootPublicKey()) { t.teardownPath(rx.SourceKey, setup.PathID, from.port, false, fmt.Errorf("rejecting setup (root key doesn't match)")) - return fmt.Errorf("root key doesn't match") + return fmt.Errorf("setup root key doesn't match") } // Did the setup hit a dead end on the way to the ascending node? diff --git a/router/peer.go b/router/peer.go index 9f585650..c2b9827c 100644 --- a/router/peer.go +++ b/router/peer.go @@ -62,6 +62,7 @@ func (p *Peer) start() { if !p.started.CAS(false, true) { return } + defer p.reset() var err error ctx := p.context @@ -71,6 +72,7 @@ func (p *Peer) start() { wg.Add(2) p.r.active.Store(index, p.port) + defer p.r.active.Delete(index) go func() { errors <- p.reader(wg, ctx) @@ -79,13 +81,23 @@ func (p *Peer) start() { errors <- p.writer(wg, ctx) }() + if p.port != 0 { + p.r.dht.insertNode(p) + } + if p.r.simulator != nil { + p.r.simulator.ReportNewLink(p.conn, p.r.public, p.public) + } + go p.r.callbacks.onConnected(p.port, p.public, p.peertype) + + p.r.log.Printf("Connected port %d to %s (zone %q)\n", p.port, p.conn.RemoteAddr(), p.zone) + select { case <-ctx.Done(): case err = <-errors: ctx.Done() } - p.started.Store(false) + p.stop() if p.port != 0 { p.r.dht.deleteNode(p.public) @@ -98,15 +110,13 @@ func (p *Peer) start() { } else { p.r.log.Printf("Disconnected port %d\n", p.port) } + p.r.snake.portWasDisconnected(p.port) p.r.tree.portWasDisconnected(p.port) go p.r.callbacks.onDisconnected(p.port, p.public, p.peertype, err) wg.Wait() _ = p.conn.Close() - - p.reset() - p.r.active.Delete(index) } func (p *Peer) reset() { @@ -217,11 +227,11 @@ func (p *Peer) lastAnnouncement() *rootAnnouncementWithTime { return p.announcement } -func (p *Peer) stop() error { +func (p *Peer) stop() { + p.started.Store(false) p.mutex.Lock() p.cancel() p.mutex.Unlock() - return nil } func (p *Peer) generateKeepalive() *types.Frame { @@ -249,6 +259,9 @@ func (p *Peer) reader(wg *sync.WaitGroup, ctx context.Context) error { if _, err := io.ReadFull(p.conn, buf[:8]); err != nil { return fmt.Errorf("p.conn.Peek: %w", err) } + if err := p.conn.SetReadDeadline(time.Time{}); err != nil { + return fmt.Errorf("conn.SetReadDeadline: %w", err) + } if !bytes.Equal(buf[:4], types.FrameMagicBytes) { return fmt.Errorf("missing magic bytes") } @@ -334,7 +347,6 @@ var bufPool = sync.Pool{ func (p *Peer) writer(wg *sync.WaitGroup, ctx context.Context) error { defer wg.Done() - tick := time.NewTicker(PeerKeepaliveInterval) defer tick.Stop() @@ -348,11 +360,9 @@ func (p *Peer) writer(wg *sync.WaitGroup, ctx context.Context) error { frame.Done() if err != nil { return nil - //return fmt.Errorf("frame.MarshalBinary: %w", err) } if !bytes.Equal(buf[:4], types.FrameMagicBytes) { return nil - //return fmt.Errorf("expected magic bytes") } if err := p.conn.SetWriteDeadline(time.Now().Add(PeerKeepaliveTimeout)); err != nil { return fmt.Errorf("p.conn.SetWriteDeadline: %w", err) @@ -360,6 +370,9 @@ func (p *Peer) writer(wg *sync.WaitGroup, ctx context.Context) error { if _, err = p.conn.Write(buf[:fn]); err != nil { return fmt.Errorf("p.conn.Write: %w", err) } + if err := p.conn.SetWriteDeadline(time.Time{}); err != nil { + return fmt.Errorf("p.conn.SetWriteDeadline: %w", err) + } return nil } diff --git a/router/router.go b/router/router.go index dd5a00e5..018b32db 100644 --- a/router/router.go +++ b/router/router.go @@ -25,7 +25,6 @@ import ( "net" "sort" "sync" - "time" "github.com/matrix-org/pinecone/types" "go.uber.org/atomic" @@ -147,7 +146,7 @@ func (r *Router) Close() error { r.cancel() for _, port := range r.ports { if port.started.Load() { - _ = port.stop() + port.stop() } } return nil @@ -326,52 +325,49 @@ func (r *Router) activePorts() peers { // the node was connected to will be returned in the event // of a successful connection. func (r *Router) AuthenticatedConnect(conn net.Conn, zone string, peertype int) (types.SwitchPortID, error) { - select { - case <-time.After(time.Second * 5): - return 0, fmt.Errorf("handshake timed out") - default: - handshake := []byte{ - ourVersion, - ourCapabilities, - 0, // unused - 0, // unused - } - handshake = append(handshake, r.public[:ed25519.PublicKeySize]...) - handshake = append(handshake, ed25519.Sign(r.private[:], handshake)...) - _ = conn.SetDeadline(time.Now().Add(time.Second * 5)) - if _, err := conn.Write(handshake); err != nil { - conn.Close() - return 0, fmt.Errorf("conn.Write: %w", err) - } - if _, err := io.ReadFull(conn, handshake); err != nil { - conn.Close() - return 0, fmt.Errorf("io.ReadFull: %w", err) - } - _ = conn.SetDeadline(time.Time{}) - if theirVersion := handshake[0]; theirVersion != ourVersion { - conn.Close() - return 0, fmt.Errorf("mismatched node version") - } - if theirCapabilities := handshake[1]; theirCapabilities&ourCapabilities != ourCapabilities { - conn.Close() - return 0, fmt.Errorf("mismatched node capabilities") - } - var public types.PublicKey - var signature types.Signature - offset := 4 - offset += copy(public[:], handshake[offset:offset+ed25519.PublicKeySize]) - copy(signature[:], handshake[offset:offset+ed25519.SignatureSize]) - if !ed25519.Verify(public[:], handshake[:offset], signature[:]) { - conn.Close() - return 0, fmt.Errorf("peer sent invalid signature") - } - port, err := r.Connect(conn, public, zone, peertype) - if err != nil { - conn.Close() - return 0, err - } - return port, nil + handshake := []byte{ + ourVersion, + ourCapabilities, + 0, // unused + 0, // unused + } + handshake = append(handshake, r.public[:ed25519.PublicKeySize]...) + handshake = append(handshake, ed25519.Sign(r.private[:], handshake)...) + //_ = conn.SetWriteDeadline(time.Now().Add(time.Second * 5)) + if _, err := conn.Write(handshake); err != nil { + conn.Close() + return 0, fmt.Errorf("conn.Write: %w", err) + } + //_ = conn.SetWriteDeadline(time.Time{}) + //_ = conn.SetReadDeadline(time.Now().Add(time.Second * 5)) + if _, err := io.ReadFull(conn, handshake); err != nil { + conn.Close() + return 0, fmt.Errorf("io.ReadFull: %w", err) } + //_ = conn.SetReadDeadline(time.Time{}) + if theirVersion := handshake[0]; theirVersion != ourVersion { + conn.Close() + return 0, fmt.Errorf("mismatched node version") + } + if theirCapabilities := handshake[1]; theirCapabilities&ourCapabilities != ourCapabilities { + conn.Close() + return 0, fmt.Errorf("mismatched node capabilities") + } + var public types.PublicKey + var signature types.Signature + offset := 4 + offset += copy(public[:], handshake[offset:offset+ed25519.PublicKeySize]) + copy(signature[:], handshake[offset:offset+ed25519.SignatureSize]) + if !ed25519.Verify(public[:], handshake[:offset], signature[:]) { + conn.Close() + return 0, fmt.Errorf("peer sent invalid signature") + } + port, err := r.Connect(conn, public, zone, peertype) + if err != nil { + conn.Close() + return 0, err + } + return port, nil } // Connect initiates a peer connection using the given @@ -380,13 +376,11 @@ func (r *Router) AuthenticatedConnect(conn net.Conn, zone string, peertype int) // port number that the node was connected to will be // returned in the event of a successful connection. func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, peertype int) (types.SwitchPortID, error) { + r.connections.Lock() + defer r.connections.Unlock() var reusePort types.SwitchPortID - if _, ok := r.active.Load(hex.EncodeToString(public[:]) + zone); ok { - if p, ok := r.active.Load(hex.EncodeToString(public[:]) + zone); ok { - if err := r.Disconnect(p.(types.SwitchPortID), fmt.Errorf("another incoming connection from the same node")); err != nil { - return 0, conn.Close() - } - } + if r.IsConnected(public, zone) { + return 0, conn.Close() } usePort := func(port *Peer) bool { if !port.allocated.CAS(false, true) { @@ -401,18 +395,8 @@ func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, pee port.mutex.Unlock() port.protoOut.push(r.tree.Root().ForPeer(port)) go port.start() - if port.port != 0 { - r.dht.insertNode(port) - } - if r.simulator != nil { - r.simulator.ReportNewLink(conn, r.public, public) - } - go r.callbacks.onConnected(port.port, public, peertype) - r.log.Printf("Connected port %d to %s (zone %q)\n", port.port, conn.RemoteAddr(), zone) return true } - r.connections.Lock() - defer r.connections.Unlock() if reusePort != 0 && usePort(r.ports[reusePort]) { return reusePort, nil } @@ -437,9 +421,7 @@ func (r *Router) Disconnect(i types.SwitchPortID, err error) error { } r.connections.Lock() defer r.connections.Unlock() - if stoperr := r.ports[i].stop(); stoperr != nil { - return fmt.Errorf("port.stop: %w", stoperr) - } + r.ports[i].stop() return nil } @@ -465,12 +447,8 @@ func (r *Router) PeerCount(peertype int) int { // IsConnected returns true if the node is connected within the // given zone, or false otherwise. func (r *Router) IsConnected(key types.PublicKey, zone string) bool { - v, ok := r.active.Load(hex.EncodeToString(key[:]) + zone) - if !ok { - return false - } - port := v.(types.SwitchPortID) - return r.ports[port].Alive() + _, ok := r.active.Load(hex.EncodeToString(key[:]) + zone) + return ok } // PeerInfo is a gomobile-friendly type that represents a peer diff --git a/router/tree.go b/router/tree.go index 62019d44..93965548 100644 --- a/router/tree.go +++ b/router/tree.go @@ -129,8 +129,9 @@ type spanningTree struct { r *Router context context.Context rootReset chan struct{} - mutex sync.Mutex + mutex *sync.Mutex parent types.SwitchPortID + reparent *time.Timer sequence atomic.Uint64 callback func(parent types.SwitchPortID, coords types.SwitchPorts) } @@ -140,8 +141,11 @@ func newSpanningTree(r *Router, f func(parent types.SwitchPortID, coords types.S r: r, context: r.context, rootReset: make(chan struct{}), + mutex: &sync.Mutex{}, callback: f, } + t.reparent = time.AfterFunc(time.Second, t.selectNewParentAndAdvertise) + t.reparent.Stop() t.becomeRoot() go t.workerForRoot() go t.workerForAnnouncements() @@ -178,15 +182,22 @@ func (t *spanningTree) portWasDisconnected(port types.SwitchPortID) { return } if t.Parent() == port { - t.selectNewParentAndAdvertise() + t.selectNewParentAndAdvertiseIn(0) } } +func (t *spanningTree) selectNewParentAndAdvertiseIn(d time.Duration) { + t.reparent.Reset(d) +} + func (t *spanningTree) selectNewParentAndAdvertise() { lastParentUpdate := t.Root() lastParentPort := t.Parent() - t.selectNewParent() + if !t.selectNewParent() { + // We became the root so no point in continuing. + return + } newParentUpdate := t.Root() newParentPort := t.Parent() @@ -203,7 +214,7 @@ func (t *spanningTree) selectNewParentAndAdvertise() { } } -func (t *spanningTree) selectNewParent() { +func (t *spanningTree) selectNewParent() bool { lastUpdate := t.Root() bestDist := math.MaxInt32 bestKey := t.r.public @@ -222,9 +233,6 @@ func (t *spanningTree) selectNewParent() { if ann == nil { continue } - if time.Since(ann.at) > announcementTimeout { - continue - } accept := func() { bestKey = ann.RootPublicKey bestDist = len(ann.Signatures) @@ -244,39 +252,53 @@ func (t *spanningTree) selectNewParent() { accept() case ann.Sequence < bestSeq: // ignore lower sequence numbers - case len(ann.Signatures) < bestDist: - accept() - case len(ann.Signatures) > bestDist: - // ignore longer paths case annAt.Before(bestTime): accept() case annAt.After(bestTime): // ignore updates that arrived more recently + case len(ann.Signatures) < bestDist: + accept() + case len(ann.Signatures) > bestDist: + // ignore longer paths case p.public.CompareTo(bestKey) > 0: accept() } } - t.mutex.Unlock() + if bestAnn != nil { + switch { + case bestAnn.RootPublicKey.CompareTo(t.r.public) == 0 && bestSeq >= lastUpdate.Sequence: + fallthrough - if bestAnn != nil && bestAnn.RootPublicKey.CompareTo(t.r.public) > 0 { - t.parent = bestPort - newcoords, newroot := t.Coords(), t.Root().RootPublicKey + case bestAnn.RootPublicKey.CompareTo(t.r.public) > 0: + t.parent = bestPort + t.mutex.Unlock() - if t.callback != nil && !lastUpdate.Coords().EqualTo(newcoords) { - defer t.callback(bestPort, newcoords) - } + newAnn := t.Root() + newCoords, newRoot := newAnn.Coords(), newAnn.RootPublicKey + + if t.callback != nil && !lastUpdate.Coords().EqualTo(newCoords) { + defer t.callback(bestPort, newCoords) + } - if newroot != lastUpdate.RootPublicKey { - defer t.r.snake.rootNodeChanged(newroot) + if newRoot != lastUpdate.RootPublicKey { + defer t.r.snake.rootNodeChanged(newRoot) + } + + return true } - } else { - t.becomeRoot() } + + t.mutex.Unlock() + t.becomeRoot() + + return false } func (t *spanningTree) advertise() { - t.sequence.Inc() + if t.IsRoot() { + t.sequence.Inc() + } ann := t.Root() for _, p := range t.r.startedPorts() { p.protoOut.push(ann.ForPeer(p)) @@ -286,6 +308,9 @@ func (t *spanningTree) advertise() { func (t *spanningTree) becomeRoot() { t.mutex.Lock() t.parent = 0 + for _, p := range t.r.ports { + p.child.Store(false) + } t.mutex.Unlock() newCoords := types.SwitchPorts{} @@ -324,7 +349,7 @@ func (t *spanningTree) workerForRoot() { case <-time.After(announcementTimeout): if !t.IsRoot() { - t.selectNewParentAndAdvertise() + t.becomeRoot() } } } @@ -363,38 +388,59 @@ func (t *spanningTree) Parent() types.SwitchPortID { } func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnouncement) { - updateParent := false - lastParentUpdate := t.Root() + updateParent, gotBadNews := false, false + lastParentUpdate, isParent := t.Root(), p.IsParent() keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) switch { - case time.Since(lastParentUpdate.at) > announcementTimeout: + case keyDeltaSinceLastParentUpdate > 0: + // The peer has sent us a key that is stronger than our last update. updateParent = true - case keyDeltaSinceLastParentUpdate != 0: + case time.Since(lastParentUpdate.at) >= announcementTimeout: + // It's been a while since we last heard from our parent so we should + // really choose a new one if we can. updateParent = true - case keyDeltaSinceLastParentUpdate == 0: - switch { - case newUpdate.Sequence <= lastParentUpdate.Sequence: - // Ignore an update with an old sequence number + case isParent && keyDeltaSinceLastParentUpdate < 0: + // Our parent sent us a weaker key than before — this implies that + // something bad happened. + gotBadNews = true - case len(newUpdate.Signatures) != len(lastParentUpdate.Signatures): - updateParent = true + case isParent && keyDeltaSinceLastParentUpdate == 0: + if newUpdate.Sequence <= lastParentUpdate.Sequence { + // Our parent sent us a lower sequence number than before for the + // same root — this isn't good news either. + gotBadNews = true } } switch { + case gotBadNews: + t.becomeRoot() + t.selectNewParentAndAdvertiseIn(time.Second) + return + case updateParent: t.selectNewParentAndAdvertise() + return case p.IsParent(): - if newUpdate.Sequence >= lastParentUpdate.Sequence { + switch { + case newUpdate.RootPublicKey != lastParentUpdate.RootPublicKey: + // The parent's root key has changed, for better or worse. + fallthrough + + case newUpdate.RootPublicKey == lastParentUpdate.RootPublicKey && newUpdate.Sequence > lastParentUpdate.Sequence: + // The parent's root key has not changed but we've got a nice + // new sequence number. t.advertise() - } - } + t.rootReset <- struct{}{} - if newUpdate.Sequence > lastParentUpdate.Sequence { - t.rootReset <- struct{}{} + default: + // If we aren't notifying anyone else, we should at least + // notify our parent that we chose them as a parent. + p.protoOut.push(t.Root().ForPeer(p)) + } } } From eb08e940206935fa76a034220cec951a6a479dd3 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Thu, 9 Sep 2021 17:08:45 +0100 Subject: [PATCH 16/22] Well that was incredibly tedious --- cmd/pinecone/main.go | 7 +- cmd/pineconeip/main.go | 8 -- cmd/pineconesim/simulator/interface.go | 6 + multicast/multicast.go | 11 +- multicast/platform_darwin.go | 1 + router/peer.go | 107 ++++++++++-------- router/queuefifo.go | 9 +- router/queuefifo_test.go | 34 ++++++ router/router.go | 15 +-- router/tree.go | 151 ++++++++----------------- types/announcement.go | 26 +++-- 11 files changed, 188 insertions(+), 187 deletions(-) create mode 100644 router/queuefifo_test.go diff --git a/cmd/pinecone/main.go b/cmd/pinecone/main.go index 182691b9..c426618c 100644 --- a/cmd/pinecone/main.go +++ b/cmd/pinecone/main.go @@ -47,12 +47,9 @@ func main() { } dialer := net.Dialer{ - Timeout: time.Second * 5, - KeepAlive: time.Second * 2, - } - listener := net.ListenConfig{ - KeepAlive: time.Second * 2, + Timeout: time.Second * 5, } + listener := net.ListenConfig{} pineconeRouter := router.NewRouter(logger, "router", sk, pk, nil) _ = sessions.NewSessions(logger, pineconeRouter) diff --git a/cmd/pineconeip/main.go b/cmd/pineconeip/main.go index 608207e2..0ca09b9a 100644 --- a/cmd/pineconeip/main.go +++ b/cmd/pineconeip/main.go @@ -82,14 +82,6 @@ func main() { if err := conn.SetNoDelay(true); err != nil { return fmt.Errorf("conn.SetNoDelay: %w", err) } - /* - if err := conn.SetKeepAlive(true); err != nil { - return fmt.Errorf("conn.SetKeepAlive: %w", err) - } - if err := conn.SetKeepAlivePeriod(time.Second); err != nil { - return fmt.Errorf("conn.SetKeepAlivePeriod: %w", err) - } - */ if err := conn.SetLinger(0); err != nil { return fmt.Errorf("conn.SetLinger: %w", err) } diff --git a/cmd/pineconesim/simulator/interface.go b/cmd/pineconesim/simulator/interface.go index 40cfbd31..1de50601 100644 --- a/cmd/pineconesim/simulator/interface.go +++ b/cmd/pineconesim/simulator/interface.go @@ -23,6 +23,8 @@ import ( ) func (sim *Simulator) LookupCoords(target string) (types.SwitchPorts, error) { + sim.nodesMutex.RLock() + defer sim.nodesMutex.RUnlock() node, ok := sim.nodes[target] if !ok { return nil, fmt.Errorf("node %q not known", target) @@ -31,6 +33,8 @@ func (sim *Simulator) LookupCoords(target string) (types.SwitchPorts, error) { } func (sim *Simulator) LookupNodeID(target types.SwitchPorts) (string, error) { + sim.nodesMutex.RLock() + defer sim.nodesMutex.RUnlock() for id, n := range sim.nodes { if n.Coords().EqualTo(target) { return id, nil @@ -40,6 +44,8 @@ func (sim *Simulator) LookupNodeID(target types.SwitchPorts) (string, error) { } func (sim *Simulator) LookupPublicKey(target types.PublicKey) (string, error) { + sim.nodesMutex.RLock() + defer sim.nodesMutex.RUnlock() for id, n := range sim.nodes { if n.PublicKey().EqualTo(target) { return id, nil diff --git a/multicast/multicast.go b/multicast/multicast.go index 2029f53f..3d291a1a 100644 --- a/multicast/multicast.go +++ b/multicast/multicast.go @@ -65,15 +65,14 @@ func NewMulticast( } m.tcpLC = net.ListenConfig{ Control: m.tcpOptions, - KeepAlive: time.Second, + KeepAlive: time.Second * 3, } m.udpLC = net.ListenConfig{ Control: m.udpOptions, } m.dialer = net.Dialer{ - Control: m.tcpOptions, - Timeout: time.Second * 5, - KeepAlive: time.Second, + Control: m.tcpOptions, + // Timeout: time.Second * 5, } return m } @@ -163,6 +162,7 @@ func (m *Multicast) accept(listener net.Listener) { if _, err := m.r.AuthenticatedConnect(conn, tcpaddr.Zone, router.PeerTypeMulticast); err != nil { //m.log.Println("m.s.AuthenticatedConnect:", err) + _ = conn.Close() continue } } @@ -308,7 +308,8 @@ func (m *Multicast) listen(intf *multicastInterface, conn net.PacketConn, srcadd } if _, err := m.r.AuthenticatedConnect(peer, udpaddr.Zone, router.PeerTypeMulticast); err != nil { - m.log.Println("m.s.AuthenticatedConnect:", err, peer.Close()) + m.log.Println("m.s.AuthenticatedConnect:", err) + _ = peer.Close() continue } } diff --git a/multicast/platform_darwin.go b/multicast/platform_darwin.go index 138e2b09..2ee8f07e 100644 --- a/multicast/platform_darwin.go +++ b/multicast/platform_darwin.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build darwin // +build darwin package multicast diff --git a/router/peer.go b/router/peer.go index c2b9827c..601844c5 100644 --- a/router/peer.go +++ b/router/peer.go @@ -43,7 +43,7 @@ type Peer struct { port types.SwitchPortID // allocated atomic.Bool // started atomic.Bool // worker goroutines started? - child atomic.Bool // is this node a child of ours? + wg *sync.WaitGroup // wait group for worker goroutines mutex sync.RWMutex // protects everything below this line zone string // peertype int // @@ -54,7 +54,7 @@ type Peer struct { trafficOut *lifoQueue // queue traffic message to peer protoOut *fifoQueue // queue protocol message to peer coords types.SwitchPorts // - announcement *rootAnnouncementWithTime // + announcement *rootAnnouncementWithTime // last received announcement from peer statistics peerStatistics // } @@ -62,25 +62,46 @@ func (p *Peer) start() { if !p.started.CAS(false, true) { return } - defer p.reset() - var err error - ctx := p.context - index := hex.EncodeToString(p.public[:]) + p.zone - errors := make(chan error) - wg := &sync.WaitGroup{} - wg.Add(2) + // When the peer dies, we need to clean up. + var lasterr error + var lasterrMutex sync.Mutex + defer func() { + p.reset() + p.r.snake.portWasDisconnected(p.port) + p.r.tree.portWasDisconnected(p.port) + go p.r.callbacks.onDisconnected(p.port, p.public, p.peertype, lasterr) + }() + // Store the fact that we're connected to this public key in + // this zone, so that the multicast code can ignore nodes we + // are already connected to. + index := hex.EncodeToString(p.public[:]) + p.zone p.r.active.Store(index, p.port) defer p.r.active.Delete(index) + // Push a root update to our new peer. This will notify them + // of our coordinates and that we are alive. + p.protoOut.push(p.r.tree.Root().ForPeer(p)) + + // Start the reader and writer goroutines for this peer. + p.wg.Add(2) go func() { - errors <- p.reader(wg, ctx) + if err := p.reader(p.context); err != nil && err != context.Canceled { + lasterrMutex.Lock() + lasterr = fmt.Errorf("reader error: %w", err) + lasterrMutex.Unlock() + } }() go func() { - errors <- p.writer(wg, ctx) + if err := p.writer(p.context); err != nil && err != context.Canceled { + lasterrMutex.Lock() + lasterr = fmt.Errorf("writer error: %w", err) + lasterrMutex.Unlock() + } }() + // Report the new connection. if p.port != 0 { p.r.dht.insertNode(p) } @@ -91,32 +112,29 @@ func (p *Peer) start() { p.r.log.Printf("Connected port %d to %s (zone %q)\n", p.port, p.conn.RemoteAddr(), p.zone) - select { - case <-ctx.Done(): - case err = <-errors: - ctx.Done() - } - - p.stop() + // Wait for the cancellation, and then for the goroutines to stop. + <-p.context.Done() + p.wg.Wait() + // Report the dicsonnection. if p.port != 0 { p.r.dht.deleteNode(p.public) } if p.r.simulator != nil { p.r.simulator.ReportDeadLink(p.r.public, p.public) } - if err != nil { - p.r.log.Printf("Disconnected port %d: %s\n", p.port, err) + + // Make sure the connection is closed. + _ = p.conn.Close() + + // ... and finally, yell about it. + lasterrMutex.Lock() + if lasterr != nil { + p.r.log.Printf("Disconnected port %d: %s\n", p.port, lasterr) } else { p.r.log.Printf("Disconnected port %d\n", p.port) } - - p.r.snake.portWasDisconnected(p.port) - p.r.tree.portWasDisconnected(p.port) - go p.r.callbacks.onDisconnected(p.port, p.public, p.peertype, err) - - wg.Wait() - _ = p.conn.Close() + lasterrMutex.Unlock() } func (p *Peer) reset() { @@ -124,7 +142,6 @@ func (p *Peer) reset() { defer p.mutex.Unlock() p.allocated.Store(false) p.started.Store(false) - p.child.Store(false) p.zone = "" p.peertype = 0 p.context, p.cancel = nil, nil @@ -193,25 +210,18 @@ func (p *Peer) SeenCommonRootRecently() bool { func (p *Peer) updateAnnouncement(new *types.SwitchAnnouncement) error { p.mutex.Lock() defer p.mutex.Unlock() - coords, err := new.PeerCoords(p.public) - if err != nil { - p.announcement = nil - p.coords = nil - return fmt.Errorf("new.PeerCoords: %w", err) + if p.announcement != nil { + if new.RootPublicKey == p.announcement.RootPublicKey && new.Sequence < p.announcement.Sequence { + p.announcement = nil + p.coords = nil + return fmt.Errorf("root announcement replays sequence number") + } } p.announcement = &rootAnnouncementWithTime{ SwitchAnnouncement: *new, at: time.Now(), } - p.coords = coords - isChild := false - for _, sig := range new.Signatures { - if sig.PublicKey.EqualTo(p.r.public) { - isChild = true - break - } - } - p.child.Store(isChild) + p.coords = new.PeerCoords() return nil } @@ -241,8 +251,10 @@ func (p *Peer) generateKeepalive() *types.Frame { return frame } -func (p *Peer) reader(wg *sync.WaitGroup, ctx context.Context) error { - defer wg.Done() +func (p *Peer) reader(ctx context.Context) error { + defer p.wg.Done() + defer p.cancel() + buf := make([]byte, MaxFrameSize) for { select { @@ -345,8 +357,10 @@ var bufPool = sync.Pool{ }, } -func (p *Peer) writer(wg *sync.WaitGroup, ctx context.Context) error { - defer wg.Done() +func (p *Peer) writer(ctx context.Context) error { + defer p.wg.Done() + defer p.cancel() + tick := time.NewTicker(PeerKeepaliveInterval) defer tick.Stop() @@ -420,7 +434,6 @@ func (p *Peer) writer(wg *sync.WaitGroup, ctx context.Context) error { continue case <-p.trafficOut.wait(): if frame, ok := p.trafficOut.pop(); ok && send(frame) == nil { - p.statistics.txTrafficSuccessful.Inc() } else { p.statistics.txTrafficDropped.Inc() diff --git a/router/queuefifo.go b/router/queuefifo.go index b46a6732..43e8e829 100644 --- a/router/queuefifo.go +++ b/router/queuefifo.go @@ -55,9 +55,12 @@ func (q *fifoQueue) reset() { q.mutex.Lock() defer q.mutex.Unlock() for _, ch := range q.entries { - frame := <-ch - if frame != nil { - frame.Done() + select { + case frame := <-ch: + if frame != nil { + frame.Done() + } + default: } } q._initialise() diff --git a/router/queuefifo_test.go b/router/queuefifo_test.go new file mode 100644 index 00000000..cdc91988 --- /dev/null +++ b/router/queuefifo_test.go @@ -0,0 +1,34 @@ +package router + +import ( + "testing" + + "github.com/matrix-org/pinecone/types" +) + +func TestFIFOQueue(t *testing.T) { + q := newFIFOQueue() + iterations := types.SwitchPortID(1024) + + go func() { + for i := types.SwitchPortID(0); i < iterations; i++ { + q.push(&types.Frame{ + Destination: types.SwitchPorts{i}, + }) + } + }() + + got := types.SwitchPortID(0) + + for i := types.SwitchPortID(0); i < iterations; i++ { + frame := <-q.pop() + if frame == nil { + t.Fatalf("unexpected nil frame") + } + if frame.Destination[0] < got || frame.Destination[0] > got+1 { + t.Fatalf("ordering problem") + } + got = frame.Destination[0] + q.ack() + } +} diff --git a/router/router.go b/router/router.go index 018b32db..ac60f9ab 100644 --- a/router/router.go +++ b/router/router.go @@ -109,6 +109,7 @@ func NewRouter(log *log.Logger, id string, private ed25519.PrivateKey, public ed sw.ports[i] = &Peer{ r: sw, port: types.SwitchPortID(i), + wg: &sync.WaitGroup{}, protoOut: newFIFOQueue(), trafficOut: newLIFOQueue(TrafficBufferSize), } @@ -376,12 +377,12 @@ func (r *Router) AuthenticatedConnect(conn net.Conn, zone string, peertype int) // port number that the node was connected to will be // returned in the event of a successful connection. func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, peertype int) (types.SwitchPortID, error) { - r.connections.Lock() - defer r.connections.Unlock() - var reusePort types.SwitchPortID if r.IsConnected(public, zone) { - return 0, conn.Close() + _ = conn.Close() + return 0, fmt.Errorf("already connected") } + r.connections.Lock() + defer r.connections.Unlock() usePort := func(port *Peer) bool { if !port.allocated.CAS(false, true) { return false @@ -393,13 +394,9 @@ func (r *Router) Connect(conn net.Conn, public types.PublicKey, zone string, pee port.conn = conn port.public = public port.mutex.Unlock() - port.protoOut.push(r.tree.Root().ForPeer(port)) go port.start() return true } - if reusePort != 0 && usePort(r.ports[reusePort]) { - return reusePort, nil - } for i := types.SwitchPortID(0); i < PortCount; i++ { if i != 0 && bytes.Equal(r.public[:], public[:]) { return 0, fmt.Errorf("loopback connection prohibited") @@ -419,8 +416,6 @@ func (r *Router) Disconnect(i types.SwitchPortID, err error) error { if i == 0 { return fmt.Errorf("cannot disconnect port %d", i) } - r.connections.Lock() - defer r.connections.Unlock() r.ports[i].stop() return nil } diff --git a/router/tree.go b/router/tree.go index 93965548..55426bc3 100644 --- a/router/tree.go +++ b/router/tree.go @@ -69,21 +69,8 @@ func (r *Router) handleAnnouncement(peer *Peer, rx *types.Frame) error { sigs[pk] = struct{}{} } - lastPortUpdate := peer.lastAnnouncement() - if lastPortUpdate != nil && lastPortUpdate.RootPublicKey == newUpdate.RootPublicKey { - if newUpdate.Sequence < lastPortUpdate.Sequence { - // The update is a replay of a previous announcement which doesn't - // make sense - return fmt.Errorf("root announcement is a replay of an old sequence number") - } - } - - if err := peer.updateAnnouncement(&newUpdate); err != nil { - return fmt.Errorf("updating the peer last root announcement failed: %w", err) - } - - r.tree.UpdateParentIfNeeded(peer, newUpdate) - return nil + defer r.tree.UpdateParentIfNeeded(peer, newUpdate) + return peer.updateAnnouncement(&newUpdate) } type rootAnnouncementWithTime struct { @@ -191,44 +178,17 @@ func (t *spanningTree) selectNewParentAndAdvertiseIn(d time.Duration) { } func (t *spanningTree) selectNewParentAndAdvertise() { - lastParentUpdate := t.Root() - lastParentPort := t.Parent() - - if !t.selectNewParent() { - // We became the root so no point in continuing. - return - } - - newParentUpdate := t.Root() - newParentPort := t.Parent() - - switch { - case lastParentUpdate.RootPublicKey != newParentUpdate.RootPublicKey: - fallthrough - - case lastParentUpdate.AncestorParent() != newParentUpdate.AncestorParent(): - fallthrough - - case lastParentPort != newParentPort: - t.advertise() - } -} - -func (t *spanningTree) selectNewParent() bool { lastUpdate := t.Root() + bestKey := lastUpdate.RootPublicKey + bestSeq := lastUpdate.Sequence bestDist := math.MaxInt32 - bestKey := t.r.public - bestTime := t.Root().at + bestTime := time.Now() var bestPort types.SwitchPortID var bestAnn *rootAnnouncementWithTime - var bestSeq types.Varu64 t.mutex.Lock() for _, p := range t.r.activePorts() { - if p.child.Load() { - continue - } ann := p.lastAnnouncement() if ann == nil { continue @@ -244,6 +204,8 @@ func (t *spanningTree) selectNewParent() bool { keyDelta := ann.RootPublicKey.CompareTo(bestKey) annAt := ann.at.Round(time.Millisecond) switch { + case ann.IsLoopOrChildOf(p.r.public): + // ignore our children or loopy announcements case keyDelta > 0: accept() case keyDelta < 0: @@ -266,40 +228,48 @@ func (t *spanningTree) selectNewParent() bool { } if bestAnn != nil { - switch { - case bestAnn.RootPublicKey.CompareTo(t.r.public) == 0 && bestSeq >= lastUpdate.Sequence: - fallthrough - - case bestAnn.RootPublicKey.CompareTo(t.r.public) > 0: - t.parent = bestPort - t.mutex.Unlock() - - newAnn := t.Root() - newCoords, newRoot := newAnn.Coords(), newAnn.RootPublicKey + t.parent = bestPort + t.mutex.Unlock() - if t.callback != nil && !lastUpdate.Coords().EqualTo(newCoords) { - defer t.callback(bestPort, newCoords) - } - - if newRoot != lastUpdate.RootPublicKey { - defer t.r.snake.rootNodeChanged(newRoot) - } + newCoords := bestAnn.Coords() + coordsChanged := !lastUpdate.Coords().EqualTo(bestAnn.Coords()) + rootChanged := bestAnn.RootPublicKey != lastUpdate.RootPublicKey - return true + if rootChanged { + t.r.snake.rootNodeChanged(bestAnn.RootPublicKey) } + if rootChanged || coordsChanged { + t.callback(bestPort, newCoords) + } + + t.advertise() + return } + // No suitable other peer was found, so we'll just become the root + // and hope that one of our peers corrects us if it matters. t.mutex.Unlock() t.becomeRoot() - - return false } func (t *spanningTree) advertise() { - if t.IsRoot() { - t.sequence.Inc() + t.mutex.Lock() + defer t.mutex.Unlock() + + t.r.ports[t.parent].mutex.RLock() + ann := t.r.ports[t.parent].announcement + t.r.ports[t.parent].mutex.RUnlock() + + if t.parent == 0 || ann == nil { // we are the root + ann = &rootAnnouncementWithTime{ + at: time.Now(), + SwitchAnnouncement: types.SwitchAnnouncement{ + RootPublicKey: t.r.public, + Sequence: types.Varu64(t.sequence.Inc()), + }, + } } - ann := t.Root() + for _, p := range t.r.startedPorts() { p.protoOut.push(ann.ForPeer(p)) } @@ -308,9 +278,6 @@ func (t *spanningTree) advertise() { func (t *spanningTree) becomeRoot() { t.mutex.Lock() t.parent = 0 - for _, p := range t.r.ports { - p.child.Store(false) - } t.mutex.Unlock() newCoords := types.SwitchPorts{} @@ -388,59 +355,41 @@ func (t *spanningTree) Parent() types.SwitchPortID { } func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnouncement) { - updateParent, gotBadNews := false, false + updateParentNow, gotBadNews := false, false lastParentUpdate, isParent := t.Root(), p.IsParent() keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) switch { case keyDeltaSinceLastParentUpdate > 0: // The peer has sent us a key that is stronger than our last update. - updateParent = true + updateParentNow = true case time.Since(lastParentUpdate.at) >= announcementTimeout: // It's been a while since we last heard from our parent so we should // really choose a new one if we can. - updateParent = true + updateParentNow = true case isParent && keyDeltaSinceLastParentUpdate < 0: // Our parent sent us a weaker key than before — this implies that // something bad happened. gotBadNews = true - case isParent && keyDeltaSinceLastParentUpdate == 0: - if newUpdate.Sequence <= lastParentUpdate.Sequence { - // Our parent sent us a lower sequence number than before for the - // same root — this isn't good news either. - gotBadNews = true - } + case isParent && keyDeltaSinceLastParentUpdate == 0 && newUpdate.Sequence <= lastParentUpdate.Sequence: + // Our parent sent us a lower sequence number than before for the + // same root — this isn't good news either. + gotBadNews = true } switch { case gotBadNews: t.becomeRoot() t.selectNewParentAndAdvertiseIn(time.Second) - return - case updateParent: - t.selectNewParentAndAdvertise() - return + case updateParentNow: + t.selectNewParentAndAdvertiseIn(0) - case p.IsParent(): - switch { - case newUpdate.RootPublicKey != lastParentUpdate.RootPublicKey: - // The parent's root key has changed, for better or worse. - fallthrough - - case newUpdate.RootPublicKey == lastParentUpdate.RootPublicKey && newUpdate.Sequence > lastParentUpdate.Sequence: - // The parent's root key has not changed but we've got a nice - // new sequence number. - t.advertise() - t.rootReset <- struct{}{} - - default: - // If we aren't notifying anyone else, we should at least - // notify our parent that we chose them as a parent. - p.protoOut.push(t.Root().ForPeer(p)) - } + case isParent && keyDeltaSinceLastParentUpdate == 0 && newUpdate.Sequence > lastParentUpdate.Sequence: + t.advertise() + t.rootReset <- struct{}{} } } diff --git a/types/announcement.go b/types/announcement.go index 162c38c8..cd61802b 100644 --- a/types/announcement.go +++ b/types/announcement.go @@ -97,17 +97,13 @@ func (a *SwitchAnnouncement) Coords() SwitchPorts { return coords } -func (a *SwitchAnnouncement) PeerCoords(public PublicKey) (SwitchPorts, error) { +func (a *SwitchAnnouncement) PeerCoords() SwitchPorts { sigs := a.Signatures - last := len(sigs) - 1 - if sigs[last].PublicKey != public { - return nil, fmt.Errorf("invalid last hop") - } - coords := make(SwitchPorts, 0, len(sigs)) - for _, sig := range sigs[:last] { + coords := make(SwitchPorts, 0, len(sigs)-1) + for _, sig := range sigs[:len(sigs)-1] { coords = append(coords, SwitchPortID(sig.Hop)) } - return coords, nil + return coords } func (a *SwitchAnnouncement) AncestorParent() PublicKey { @@ -116,3 +112,17 @@ func (a *SwitchAnnouncement) AncestorParent() PublicKey { } return a.Signatures[len(a.Signatures)-2].PublicKey } + +func (a *SwitchAnnouncement) IsLoopOrChildOf(pk PublicKey) bool { + m := map[PublicKey]struct{}{} + for _, sig := range a.Signatures { + if sig.PublicKey.EqualTo(pk) { + return true + } + if _, ok := m[sig.PublicKey]; ok { + return true + } + m[sig.PublicKey] = struct{}{} + } + return false +} From 498bcb5ee10d2ba3429eb5d178284904be4486e5 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 10 Sep 2021 10:52:52 +0100 Subject: [PATCH 17/22] Tuning --- router/nexthop_virtualsnake.go | 22 +++++++++++----------- router/tree.go | 30 ++++++++++++++++++------------ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index d9cfc25d..9e50c0a6 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -332,6 +332,17 @@ func (t *virtualSnake) getVirtualSnakeNextHop(from *Peer, destKey types.PublicKe } } + // Check our DHT entries + t.tableMutex.RLock() + for dhtKey, entry := range t.table { + switch { + case !entry.Valid(): + continue + } + newCheckedCandidate(dhtKey.PublicKey, entry.SourcePort) + } + t.tableMutex.RUnlock() + // Check our direct peers for _, peer := range activePorts { peerKey := peer.PublicKey() @@ -345,17 +356,6 @@ func (t *virtualSnake) getVirtualSnakeNextHop(from *Peer, destKey types.PublicKe } } - // Check our DHT entries - t.tableMutex.RLock() - for dhtKey, entry := range t.table { - switch { - case !entry.Valid(): - continue - } - newCheckedCandidate(dhtKey.PublicKey, entry.SourcePort) - } - t.tableMutex.RUnlock() - if bootstrap { return types.SwitchPorts{bestPort} } else { diff --git a/router/tree.go b/router/tree.go index 55426bc3..2328944a 100644 --- a/router/tree.go +++ b/router/tree.go @@ -355,38 +355,44 @@ func (t *spanningTree) Parent() types.SwitchPortID { } func (t *spanningTree) UpdateParentIfNeeded(p *Peer, newUpdate types.SwitchAnnouncement) { - updateParentNow, gotBadNews := false, false + const immediately = time.Duration(0) + reparentIn, becomeRoot := time.Duration(-1), false lastParentUpdate, isParent := t.Root(), p.IsParent() keyDeltaSinceLastParentUpdate := newUpdate.RootPublicKey.CompareTo(lastParentUpdate.RootPublicKey) switch { case keyDeltaSinceLastParentUpdate > 0: // The peer has sent us a key that is stronger than our last update. - updateParentNow = true + reparentIn = immediately case time.Since(lastParentUpdate.at) >= announcementTimeout: // It's been a while since we last heard from our parent so we should // really choose a new one if we can. - updateParentNow = true + reparentIn = immediately case isParent && keyDeltaSinceLastParentUpdate < 0: // Our parent sent us a weaker key than before — this implies that // something bad happened. - gotBadNews = true + reparentIn, becomeRoot = time.Second/2, true - case isParent && keyDeltaSinceLastParentUpdate == 0 && newUpdate.Sequence <= lastParentUpdate.Sequence: + case isParent && keyDeltaSinceLastParentUpdate == 0 && newUpdate.Sequence < lastParentUpdate.Sequence: // Our parent sent us a lower sequence number than before for the // same root — this isn't good news either. - gotBadNews = true + reparentIn, becomeRoot = time.Second/2, true + + case isParent && keyDeltaSinceLastParentUpdate == 0 && newUpdate.Sequence == lastParentUpdate.Sequence: + // Our parent sent us an equal sequence number, which probably means + // that their path to the root has changed. This isn't as bad news as + // a weaker key but we should still try to find a new parent. + reparentIn, becomeRoot = time.Second/4, false } switch { - case gotBadNews: - t.becomeRoot() - t.selectNewParentAndAdvertiseIn(time.Second) - - case updateParentNow: - t.selectNewParentAndAdvertiseIn(0) + case reparentIn >= 0: + if becomeRoot { + t.becomeRoot() + } + t.selectNewParentAndAdvertiseIn(reparentIn) case isParent && keyDeltaSinceLastParentUpdate == 0 && newUpdate.Sequence > lastParentUpdate.Sequence: t.advertise() From 9a5de50cb22b39a3aff8c609d5d0b36832ec03db Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 10 Sep 2021 11:17:27 +0100 Subject: [PATCH 18/22] Check direct peers in default case --- router/nexthop_virtualsnake.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index 9e50c0a6..f97d722c 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -338,8 +338,9 @@ func (t *virtualSnake) getVirtualSnakeNextHop(from *Peer, destKey types.PublicKe switch { case !entry.Valid(): continue + default: + newCheckedCandidate(dhtKey.PublicKey, entry.SourcePort) } - newCheckedCandidate(dhtKey.PublicKey, entry.SourcePort) } t.tableMutex.RUnlock() @@ -353,6 +354,8 @@ func (t *virtualSnake) getVirtualSnakeNextHop(from *Peer, destKey types.PublicKe // are directly peered with that node, so use the more direct // path instead newCandidate(peerKey, peer.port) + default: + newCheckedCandidate(peerKey, peer.port) } } From cad3a62b8f2bd998ebfb62b1b689469801662a18 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 10 Sep 2021 12:33:37 +0100 Subject: [PATCH 19/22] Bootstrap faster if we lose the port that our ascending node was reachable via --- router/nexthop_virtualsnake.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index f97d722c..9cab9f11 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -229,7 +229,7 @@ func (t *virtualSnake) portWasDisconnected(port types.SwitchPortID) { // this port change. if asc := t.ascending(); asc != nil && asc.Port == port { t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("port teardown")) - t.bootstrapIn(time.Second) + t.bootstrapIn(0) } if desc := t.descending(); desc != nil && desc.Port == port { t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("port teardown")) From 02ac1059ded9d7b375e4343fe2ff3e45a7dca703 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 10 Sep 2021 13:40:45 +0100 Subject: [PATCH 20/22] De-race SNEK a bit --- router/nexthop_virtualsnake.go | 40 +++++++++++++++------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/router/nexthop_virtualsnake.go b/router/nexthop_virtualsnake.go index 9cab9f11..a021bcbe 100644 --- a/router/nexthop_virtualsnake.go +++ b/router/nexthop_virtualsnake.go @@ -26,7 +26,7 @@ import ( "github.com/matrix-org/pinecone/util" ) -const virtualSnakeNeighExpiryPeriod = time.Minute * 30 +const virtualSnakeNeighExpiryPeriod = time.Hour type virtualSnake struct { r *Router @@ -108,37 +108,34 @@ func (t *virtualSnake) setDescending(desc *virtualSnakeNeighbour) { // bootstraps and setup messages as needed. func (t *virtualSnake) maintain() { for { - peerCount := t.r.PeerCount(-1) - bootstrapNow := false select { case <-t.r.context.Done(): return case <-time.After(time.Second): } - if peerCount == 0 { - // If there are no peers connected then we don't need - // to do any hard maintenance work. - continue - } + + rootKey := t.r.RootPublicKey() + canBootstrap := rootKey != t.r.public && t.r.PeerCount(-1) > 0 + willBootstrap := false if asc := t.ascending(); asc != nil { switch { case time.Since(asc.LastSeen) > virtualSnakeNeighExpiryPeriod: t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("ascending neighbour expired")) - bootstrapNow = true - case asc.RootPublicKey != t.r.RootPublicKey(): + willBootstrap = canBootstrap + case asc.RootPublicKey != rootKey: t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("ascending root changed")) - bootstrapNow = true + willBootstrap = canBootstrap } } else { - bootstrapNow = true + willBootstrap = canBootstrap } if desc := t.descending(); desc != nil { switch { case time.Since(desc.LastSeen) > virtualSnakeNeighExpiryPeriod: t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("descending neighbour expired")) - case !desc.RootPublicKey.EqualTo(t.r.RootPublicKey()): + case !desc.RootPublicKey.EqualTo(rootKey): t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("descending root changed")) } } @@ -148,8 +145,8 @@ func (t *virtualSnake) maintain() { // predefined interval, but for now we'll continue to send // them on a regular interval until we can derive some better // connection state. - if bootstrapNow { - t.bootstrapIn(0) + if willBootstrap { + t.bootstrapIn(time.Second) } } } @@ -159,7 +156,7 @@ func (t *virtualSnake) bootstrapIn(d time.Duration) { } func (t *virtualSnake) bootstrapNow() { - if t.r.IsRoot() { + if t.r.IsRoot() || t.r.PeerCount(-1) == 0 { return } ts, err := util.SignedTimestamp(t.r.private) @@ -187,7 +184,7 @@ func (t *virtualSnake) bootstrapNow() { func (t *virtualSnake) rootNodeChanged(root types.PublicKey) { if asc := t.ascending(); asc != nil && !asc.RootPublicKey.EqualTo(root) { t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("root changed and asc no longer matches")) - t.bootstrapIn(time.Second) + defer t.bootstrapIn(time.Second) } if desc := t.descending(); desc != nil && !desc.RootPublicKey.EqualTo(root) { t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("root changed and desc no longer matches")) @@ -229,7 +226,7 @@ func (t *virtualSnake) portWasDisconnected(port types.SwitchPortID) { // this port change. if asc := t.ascending(); asc != nil && asc.Port == port { t.teardownPath(t.r.public, asc.PathID, asc.Port, true, fmt.Errorf("port teardown")) - t.bootstrapIn(0) + defer t.bootstrapIn(0) } if desc := t.descending(); desc != nil && desc.Port == port { t.teardownPath(desc.PublicKey, desc.PathID, desc.Port, false, fmt.Errorf("port teardown")) @@ -382,7 +379,7 @@ func (t *virtualSnake) getVirtualSnakeTeardownNextHop(from *Peer, rx *types.Fram } if asc := t.ascending(); asc != nil && t.r.public.EqualTo(rx.DestinationKey) && asc.PathID == teardown.PathID { t.setAscending(nil) - t.bootstrapIn(time.Second) + defer t.bootstrapIn(time.Second / 4) } t.tableMutex.Lock() defer t.tableMutex.Unlock() @@ -423,10 +420,9 @@ func (t *virtualSnake) teardownPath(pk types.PublicKey, pathID types.VirtualSnak Payload: payload[:], } _ = t.getVirtualSnakeTeardownNextHop(t.r.ports[0], &frame) - if !t.r.ports[via].started.Load() { - return + if t.r.ports[via].started.Load() { + t.r.ports[via].protoOut.push(frame.Borrow()) } - t.r.ports[via].protoOut.push(frame.Borrow()) } // handleBootstrap is called in response to an incoming bootstrap From 5b02c13582463e24e2f8d07c746c9c83622fe6cb Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 10 Sep 2021 14:37:42 +0100 Subject: [PATCH 21/22] The moment a peer is cancelled, it should be marked as no longer started --- router/peer.go | 3 ++- router/router.go | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/router/peer.go b/router/peer.go index 601844c5..602fb5f4 100644 --- a/router/peer.go +++ b/router/peer.go @@ -114,9 +114,10 @@ func (p *Peer) start() { // Wait for the cancellation, and then for the goroutines to stop. <-p.context.Done() + p.started.Store(false) p.wg.Wait() - // Report the dicsonnection. + // Report the disconnection. if p.port != 0 { p.r.dht.deleteNode(p.public) } diff --git a/router/router.go b/router/router.go index ac60f9ab..d398f36c 100644 --- a/router/router.go +++ b/router/router.go @@ -442,8 +442,12 @@ func (r *Router) PeerCount(peertype int) int { // IsConnected returns true if the node is connected within the // given zone, or false otherwise. func (r *Router) IsConnected(key types.PublicKey, zone string) bool { - _, ok := r.active.Load(hex.EncodeToString(key[:]) + zone) - return ok + v, ok := r.active.Load(hex.EncodeToString(key[:]) + zone) + if !ok { + return false + } + port := v.(types.SwitchPortID) + return r.ports[port].started.Load() } // PeerInfo is a gomobile-friendly type that represents a peer From e0b542fb1afb39404447b97b9f84d2025726d9d6 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Fri, 10 Sep 2021 14:45:53 +0100 Subject: [PATCH 22/22] This is backward-incompatible --- router/version.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/router/version.go b/router/version.go index 75a5ca58..a56e3af5 100644 --- a/router/version.go +++ b/router/version.go @@ -16,12 +16,13 @@ package router const ( // reserved = 1 - capabilityVirtualSnake = 2 - capabilityHardState = 4 - capabilityPathIDs = 8 - capabilityRootInUpdates = 16 - capabilityNewHeaders = 32 + capabilityVirtualSnake = 2 + capabilityHardState = 4 + capabilityPathIDs = 8 + capabilityRootInUpdates = 16 + capabilityNewHeaders = 32 + capabilitySlowRootInterval = 64 ) const ourVersion = 0 -const ourCapabilities = capabilityVirtualSnake | capabilityHardState | capabilityPathIDs | capabilityRootInUpdates | capabilityNewHeaders +const ourCapabilities = capabilityVirtualSnake | capabilityHardState | capabilityPathIDs | capabilityRootInUpdates | capabilityNewHeaders | capabilitySlowRootInterval