From 3755f80aa0b178acd0447585539a5c13072fe85e Mon Sep 17 00:00:00 2001 From: Chris Telfer Date: Fri, 8 Jun 2018 13:53:36 -0400 Subject: [PATCH] Fix deadlock introduced in bd613df2 Commit bd613df2 prevented data corruption due to simultaneous driver.CreateNetwork()/driver.DeleteNetwork() by holding the network lock through the read/modify part of the operation. However, part of the DeleteNetwork operation entails sending a message to the peerDB to tell that goroutine to flush entries on deletion. This can lead to a deadlock where: * driver.DeleteNetwork() starts and acquires driver.Lock() * peerDB receives some other request (e.g. EventNotify) and blocks on driver.Lock() * driver.DeleteNetwork() attempts a peerDB flush and blocks waiting on the synchronous peerDB operation channel This patch fixes the issue by deferring the peerDB flush operation until after DeleteNetwork() unlocks driver.Lock(). Commit bd613df2 only modified CreateNetwork() and DeleteNetwork() and the critical section that driver.Lock() protects in CreateNetwork() does not perform any peerDB notifications or other locks of driver data structures. So this solution should be a complete fix for any regressions introduced in bd613df2. Signed-off-by: Chris Telfer --- drivers/overlay/ov_network.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index 9c83127d4f..8b6839656b 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -244,7 +244,15 @@ func (d *driver) DeleteNetwork(nid string) error { } d.Lock() - defer d.Unlock() + // Only perform a peer flush operation (if required) AFTER unlocking + // the driver lock to avoid deadlocking w/ the peerDB. + var doPeerFlush bool + defer func() { + d.Unlock() + if doPeerFlush { + d.peerFlush(nid) + } + }() // This is similar to d.network(), but we need to keep holding the lock // until we are done removing this network. @@ -270,7 +278,7 @@ func (d *driver) DeleteNetwork(nid string) error { } } // flush the peerDB entries - d.peerFlush(nid) + doPeerFlush = true delete(d.networks, nid) vnis, err := n.releaseVxlanID()