From 0db9138abb6b07f5ec6a910bf26dbbdec45c6c69 Mon Sep 17 00:00:00 2001 From: ziggie Date: Tue, 12 Nov 2024 23:16:17 +0100 Subject: [PATCH 1/4] contractcourt: make channel arb starts async. When starting the channel arbitrators we make sure they are started concurrently. --- contractcourt/chain_arbitrator.go | 63 +++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/contractcourt/chain_arbitrator.go b/contractcourt/chain_arbitrator.go index c29178b4384..11eaed6fe22 100644 --- a/contractcourt/chain_arbitrator.go +++ b/contractcourt/chain_arbitrator.go @@ -1,6 +1,7 @@ package contractcourt import ( + "context" "errors" "fmt" "sync" @@ -22,11 +23,18 @@ import ( "github.com/lightningnetwork/lnd/lnwallet" "github.com/lightningnetwork/lnd/lnwallet/chainfee" "github.com/lightningnetwork/lnd/lnwire" + "golang.org/x/sync/errgroup" ) // ErrChainArbExiting signals that the chain arbitrator is shutting down. var ErrChainArbExiting = errors.New("ChainArbitrator exiting") +const ( + // chainArbTimeout is the timeout for the chain arbitrator to start + // the channel arbitrators for each channel. + chainArbTimeout = 5 * time.Minute +) + // ResolutionMsg is a message sent by resolvers to outside sub-systems once an // outgoing contract has been fully resolved. For multi-hop contracts, if we // resolve the outgoing contract, we'll also need to ensure that the incoming @@ -767,20 +775,67 @@ func (c *ChainArbitrator) Start() error { // Launch all the goroutines for each arbitrator so they can carry out // their duties. + // Set a timeout for the group so we don't wait indefinitely if one + // startup function blocks. + ctx, cancel := context.WithTimeout( + context.Background(), chainArbTimeout, + ) + + eg, egCtx := errgroup.WithContext(ctx) + for _, arbitrator := range c.activeChannels { startState, ok := startStates[arbitrator.cfg.ChanPoint] if !ok { stopAndLog() + + // In case we encounter an error we need to cancel the + // context to ensure all goroutines are cleaned up. + cancel() return fmt.Errorf("arbitrator: %v has no start state", arbitrator.cfg.ChanPoint) } - if err := arbitrator.Start(startState); err != nil { - stopAndLog() - return err - } + arb := arbitrator // Create new variable for closure + eg.Go(func() error { + // Create a non-blocking goroutine for the actual Start + // call + startErrChan := make(chan error, 1) + go func() { + startErrChan <- arb.Start(startState) + }() + + select { + case startErr := <-startErrChan: + if startErr != nil { + cancel() + } + + return startErr + + case <-egCtx.Done(): + return egCtx.Err() + + case <-c.quit: + + return ErrChainArbExiting + } + }) } + // Wait for the error group to finish and if an error occurred we + // trigger a graceful shutdown of LND. + // + // NOTE: We do not add this collector to the waitGroup because we want + // to stop the chain arbitrator if there occurs an error. + go func() { + defer cancel() + + if err := eg.Wait(); err != nil { + log.Criticalf("ChainArbitrator failed to start all "+ + "channel arbitrators: %v", err) + } + }() + // Subscribe to a single stream of block epoch notifications that we // will dispatch to all active arbitrators. blockEpoch, err := c.cfg.Notifier.RegisterBlockEpochNtfn(nil) From 1ca237387a60e894ad06bbc467e2b62aca8440de Mon Sep 17 00:00:00 2001 From: ziggie Date: Wed, 13 Nov 2024 13:46:38 +0100 Subject: [PATCH 2/4] multi: make timeout configurable The channel arbitrator startups have now a configurable timeout but this config value is hidden. --- config.go | 12 ++++++++++++ contractcourt/chain_arbitrator.go | 21 +++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/config.go b/config.go index 9b9e9573e00..355e2191d4a 100644 --- a/config.go +++ b/config.go @@ -28,6 +28,7 @@ import ( "github.com/lightningnetwork/lnd/chainreg" "github.com/lightningnetwork/lnd/chanbackup" "github.com/lightningnetwork/lnd/channeldb" + "github.com/lightningnetwork/lnd/contractcourt" "github.com/lightningnetwork/lnd/discovery" "github.com/lightningnetwork/lnd/funding" "github.com/lightningnetwork/lnd/htlcswitch" @@ -405,6 +406,8 @@ type Config struct { ChannelCommitBatchSize uint32 `long:"channel-commit-batch-size" description:"The maximum number of channel state updates that is accumulated before signing a new commitment."` + ChannelArbStartTimeout time.Duration `long:"channel-arb-start-timeout" hidden:"true" description:"The maximum time that is allowed to pass while the chain arbitrator starts the individual channel arbitrators. This value might need to be increased if lnd is running with aux components that depend on all subsystems to be fully started."` + KeepFailedPaymentAttempts bool `long:"keep-failed-payment-attempts" description:"Keeps persistent record of all failed payment attempts for successfully settled payments."` StoreFinalHtlcResolutions bool `long:"store-final-htlc-resolutions" description:"Persistently store the final resolution of incoming htlcs."` @@ -720,6 +723,7 @@ func DefaultConfig() Config { ChannelCommitInterval: defaultChannelCommitInterval, PendingCommitInterval: defaultPendingCommitInterval, ChannelCommitBatchSize: defaultChannelCommitBatchSize, + ChannelArbStartTimeout: contractcourt.DefaultChainArbTimeout, CoinSelectionStrategy: defaultCoinSelectionStrategy, KeepFailedPaymentAttempts: defaultKeepFailedPaymentAttempts, RemoteSigner: &lncfg.RemoteSigner{ @@ -1668,6 +1672,14 @@ func ValidateConfig(cfg Config, interceptor signal.Interceptor, fileParser, maxPendingCommitInterval) } + // Limit the chain arbitrator startup timeout to a reasonable minimum + // value. Too short might run into restart loops. + if cfg.ChannelArbStartTimeout < contractcourt.DefaultChainArbTimeout { + return nil, mkErr("channel-arb-start-timeout (%v) must be at "+ + "least %v", cfg.ChannelArbStartTimeout, + contractcourt.DefaultChainArbTimeout) + } + if err := cfg.Gossip.Parse(); err != nil { return nil, mkErr("error parsing gossip syncer: %v", err) } diff --git a/contractcourt/chain_arbitrator.go b/contractcourt/chain_arbitrator.go index 11eaed6fe22..15b76b0921f 100644 --- a/contractcourt/chain_arbitrator.go +++ b/contractcourt/chain_arbitrator.go @@ -30,9 +30,12 @@ import ( var ErrChainArbExiting = errors.New("ChainArbitrator exiting") const ( - // chainArbTimeout is the timeout for the chain arbitrator to start - // the channel arbitrators for each channel. - chainArbTimeout = 5 * time.Minute + // DefaultChainArbTimeout is the default timeout for the chain + // arbitrator to start the channel arbitrators for each channel. This + // value must be long enough because starting arbitrators might depend + // on external components (e.g. aux components, hooks), which might + // require other lnd subsystems to be fully started. + DefaultChainArbTimeout = 5 * time.Minute ) // ResolutionMsg is a message sent by resolvers to outside sub-systems once an @@ -226,6 +229,10 @@ type ChainArbitratorConfig struct { // lower package. QueryIncomingCircuit func(circuit models.CircuitKey) *models.CircuitKey + // StartupTimeout is the maximum time the arbitrator will wait when + // starting the individual channel arbitrators. + StartupTimeout time.Duration + // AuxLeafStore is an optional store that can be used to store auxiliary // leaves for certain custom channel types. AuxLeafStore fn.Option[lnwallet.AuxLeafStore] @@ -280,6 +287,12 @@ type ChainArbitrator struct { func NewChainArbitrator(cfg ChainArbitratorConfig, db *channeldb.DB) *ChainArbitrator { + // Let's use the default timeout for tests. For normal operation, the + // value is validated when parsing the configuration. + if cfg.StartupTimeout == 0 { + cfg.StartupTimeout = DefaultChainArbTimeout + } + return &ChainArbitrator{ cfg: cfg, activeChannels: make(map[wire.OutPoint]*ChannelArbitrator), @@ -778,7 +791,7 @@ func (c *ChainArbitrator) Start() error { // Set a timeout for the group so we don't wait indefinitely if one // startup function blocks. ctx, cancel := context.WithTimeout( - context.Background(), chainArbTimeout, + context.Background(), c.cfg.StartupTimeout, ) eg, egCtx := errgroup.WithContext(ctx) From ae120e36af28e9da9fe4f508cda67f2b82883f19 Mon Sep 17 00:00:00 2001 From: ziggie Date: Wed, 13 Nov 2024 16:58:55 +0100 Subject: [PATCH 3/4] contractcourt: use concurrent safe waitGroup. --- contractcourt/channel_arbitrator.go | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/contractcourt/channel_arbitrator.go b/contractcourt/channel_arbitrator.go index cc1ee69589a..ddaac07c0cd 100644 --- a/contractcourt/channel_arbitrator.go +++ b/contractcourt/channel_arbitrator.go @@ -382,7 +382,7 @@ type ChannelArbitrator struct { // upon start up to decide which actions to take. state ArbitratorState - wg sync.WaitGroup + wg *fn.GoroutineManager quit chan struct{} } @@ -413,6 +413,7 @@ func NewChannelArbitrator(cfg ChannelArbitratorConfig, activeHTLCs: htlcSets, unmergedSet: unmerged, cfg: cfg, + wg: fn.NewGoroutineManager(context.Background()), quit: make(chan struct{}), } } @@ -570,9 +571,9 @@ func (c *ChannelArbitrator) Start(state *chanArbStartState) error { } } - c.wg.Add(1) - go c.channelAttendant(bestHeight) - return nil + return c.wg.Go(func(ctx context.Context) { + c.channelAttendant(bestHeight) + }) } // maybeAugmentTaprootResolvers will update the contract resolution information @@ -844,7 +845,7 @@ func (c *ChannelArbitrator) Stop() error { c.activeResolversLock.RUnlock() close(c.quit) - c.wg.Wait() + c.wg.Stop() return nil } @@ -1568,8 +1569,13 @@ func (c *ChannelArbitrator) launchResolvers(resolvers []ContractResolver, c.activeResolvers = resolvers for _, contract := range resolvers { - c.wg.Add(1) - go c.resolveContract(contract, immediate) + err := c.wg.Go(func(ctx context.Context) { + c.resolveContract(contract, immediate) + }) + if err != nil { + log.Criticalf("ChannelArbitrator(%v): unable to "+ + "launch resolver: %v", c.cfg.ChanPoint, err) + } } } From b0ee8656b5ba6e3ce58fb7acf99bbe09f0571a94 Mon Sep 17 00:00:00 2001 From: ziggie Date: Wed, 13 Nov 2024 17:01:29 +0100 Subject: [PATCH 4/4] docs: add release-notes --- docs/release-notes/release-notes-0.19.0.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/release-notes/release-notes-0.19.0.md b/docs/release-notes/release-notes-0.19.0.md index c3048c78c58..5737341e9f3 100644 --- a/docs/release-notes/release-notes-0.19.0.md +++ b/docs/release-notes/release-notes-0.19.0.md @@ -54,6 +54,11 @@ * Make sure the RPC clients used to access the chain backend are [properly shutdown](https://github.com/lightningnetwork/lnd/pull/9261). + +* [Start channel arbitrators concurrently]( + https://github.com/lightningnetwork/lnd/pull/9262) to prevent potential + deadlocks when lnd depends on external components (e.g. aux components, hooks) + which might require other lnd subsystems to be fully started. # New Features ## Functional Enhancements