Skip to content

Commit

Permalink
Merge pull request #7711 from Roasbeef/commit-dance-stall-disconnect
Browse files Browse the repository at this point in the history
htlcswitch+peer: actually actually disconnect if we detect a stalled remote channel state machine
  • Loading branch information
Roasbeef authored May 23, 2023
2 parents 7dacf5e + 4d633f0 commit b9b20ac
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 31 deletions.
15 changes: 12 additions & 3 deletions docs/release-notes/release-notes-0.16.3.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
# Release Notes

## Mempool
## Mempool Optimizations

* Optimized [mempool
management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the CPU
usage.
management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the
CPU usage.

## Misc

* [Re-encrypt/regenerate](https://github.com/lightningnetwork/lnd/pull/7705)
all macaroon DB root keys on `ChangePassword`/`GenerateNewRootKey`
respectively.

## Channel Link Bug Fix

* If we detect the remote link is inactive, [we'll now tear down the
connection](https://github.com/lightningnetwork/lnd/pull/7711) in addition to
stopping the link's statemachine. If we're persistently connected with the
peer, then this'll force a reconnect, which may restart things and help avoid
certain force close scenarios.

# Contributors (Alphabetical Order)

* Elle Mouton
* Olaoluwa Osuntokun
* Yong Yu
27 changes: 16 additions & 11 deletions htlcswitch/link.go
Original file line number Diff line number Diff line change
Expand Up @@ -1036,8 +1036,8 @@ func (l *channelLink) htlcManager() {
// storing the transaction in the db.
l.fail(
LinkFailureError{
code: ErrSyncError,
ForceClose: true,
code: ErrSyncError,
FailureAction: LinkFailureForceClose, //nolint:lll
},
"unable to synchronize channel "+
"states: %v", err,
Expand Down Expand Up @@ -1077,8 +1077,8 @@ func (l *channelLink) htlcManager() {

l.fail(
LinkFailureError{
code: ErrRecoveryError,
ForceClose: false,
code: ErrRecoveryError,
FailureAction: LinkFailureForceNone,
},
"unable to synchronize channel "+
"states: %v", err,
Expand Down Expand Up @@ -1239,8 +1239,13 @@ func (l *channelLink) htlcManager() {
}

case <-l.cfg.PendingCommitTicker.Ticks():
l.fail(LinkFailureError{code: ErrRemoteUnresponsive},
"unable to complete dance")
l.fail(
LinkFailureError{
code: ErrRemoteUnresponsive,
FailureAction: LinkFailureDisconnect,
},
"unable to complete dance",
)
return

// A message from the switch was just received. This indicates
Expand Down Expand Up @@ -1782,8 +1787,8 @@ func (l *channelLink) handleUpstreamMsg(msg lnwire.Message) {
if err := l.channel.ReceiveHTLCSettle(pre, idx); err != nil {
l.fail(
LinkFailureError{
code: ErrInvalidUpdate,
ForceClose: true,
code: ErrInvalidUpdate,
FailureAction: LinkFailureForceClose,
},
"unable to handle upstream settle HTLC: %v", err,
)
Expand Down Expand Up @@ -1947,9 +1952,9 @@ func (l *channelLink) handleUpstreamMsg(msg lnwire.Message) {
}
l.fail(
LinkFailureError{
code: ErrInvalidCommitment,
ForceClose: true,
SendData: sendData,
code: ErrInvalidCommitment,
FailureAction: LinkFailureForceClose,
SendData: sendData,
},
"ChannelPoint(%v): unable to accept new "+
"commitment: %v",
Expand Down
21 changes: 12 additions & 9 deletions htlcswitch/link_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5457,8 +5457,10 @@ func TestChannelLinkFail(t *testing.T) {
// If we expect the link to force close the channel in this
// case, check that it happens. If not, make sure it does not
// happen.
require.Equal(
t, test.shouldForceClose, linkErr.ForceClose, test.name,
isForceCloseErr := (linkErr.FailureAction ==
LinkFailureForceClose)
require.True(
t, test.shouldForceClose == isForceCloseErr, test.name,
)
require.Equal(
t, test.permanentFailure, linkErr.PermanentFailure,
Expand Down Expand Up @@ -6342,11 +6344,12 @@ func TestPendingCommitTicker(t *testing.T) {
// Assert that we get the expected link failure from Alice.
select {
case linkErr := <-linkErrs:
if linkErr.code != ErrRemoteUnresponsive {
t.Fatalf("error code mismatch, "+
"want: ErrRemoteUnresponsive, got: %v",
linkErr.code)
}
require.Equal(
t, linkErr.code, ErrRemoteUnresponsive,
fmt.Sprintf("error code mismatch, want: "+
"ErrRemoteUnresponsive, got: %v", linkErr.code),
)
require.Equal(t, linkErr.FailureAction, LinkFailureDisconnect)

case <-time.After(time.Second):
t.Fatalf("did not receive failure")
Expand Down Expand Up @@ -6523,7 +6526,7 @@ func TestPipelineSettle(t *testing.T) {
// ForceClose should be false.
select {
case linkErr := <-linkErrors:
require.False(t, linkErr.ForceClose)
require.False(t, linkErr.FailureAction == LinkFailureForceClose)
case <-forwardChan:
t.Fatal("packet was erroneously forwarded")
}
Expand Down Expand Up @@ -6559,7 +6562,7 @@ func TestPipelineSettle(t *testing.T) {
// ForceClose should be false.
select {
case linkErr := <-linkErrors:
require.False(t, linkErr.ForceClose)
require.False(t, linkErr.FailureAction == LinkFailureForceClose)
case <-forwardChan:
t.Fatal("packet was erroneously forwarded")
}
Expand Down
23 changes: 20 additions & 3 deletions htlcswitch/linkfailure.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,24 @@ const (
ErrCircuitError
)

// LinkFailureAction is an enum-like type that describes the action that should
// be taken in response to a link failure.
type LinkFailureAction uint8

const (
// LinkFailureForceNone indicates no action is to be taken.
LinkFailureForceNone LinkFailureAction = iota

// LinkFailureForceClose indicates that the channel should be force
// closed.
LinkFailureForceClose

// LinkFailureDisconnect indicates that we should disconnect in an
// attempt to recycle the connection. This can be useful if we think a
// TCP connection or state machine is stalled.
LinkFailureDisconnect
)

// LinkFailureError encapsulates an error that will make us fail the current
// link. It contains the necessary information needed to determine if we should
// force close the channel in the process, and if any error data should be sent
Expand All @@ -61,9 +79,8 @@ type LinkFailureError struct {
// code is the type of error this LinkFailureError encapsulates.
code errorCode

// ForceClose indicates whether we should force close the channel
// because of this error.
ForceClose bool
// FailureAction describes what we should do to fail the channel.
FailureAction LinkFailureAction

// PermanentFailure indicates whether this failure is permanent, and
// the channel should not be attempted loaded again.
Expand Down
16 changes: 11 additions & 5 deletions peer/brontide.go
Original file line number Diff line number Diff line change
Expand Up @@ -3094,11 +3094,10 @@ func (p *Brontide) handleLinkFailure(failure linkFailureReport) {
// being applied.
p.WipeChannel(&failure.chanPoint)

// If the error encountered was severe enough, we'll now force close the
// channel to prevent reading it to the switch in the future.
if failure.linkErr.ForceClose {
p.log.Warnf("Force closing link(%v)",
failure.shortChanID)
// If the error encountered was severe enough, we'll now force close
// the channel to prevent reading it to the switch in the future.
if failure.linkErr.FailureAction == htlcswitch.LinkFailureForceClose {
p.log.Warnf("Force closing link(%v)", failure.shortChanID)

closeTx, err := p.cfg.ChainArb.ForceCloseContract(
failure.chanPoint,
Expand Down Expand Up @@ -3143,6 +3142,13 @@ func (p *Brontide) handleLinkFailure(failure linkFailureReport) {
"remote peer: %v", err)
}
}

// If the failure action is disconnect, then we'll execute that now. If
// we had to send an error above, it was a sync call, so we expect the
// message to be flushed on the wire by now.
if failure.linkErr.FailureAction == htlcswitch.LinkFailureDisconnect {
p.Disconnect(fmt.Errorf("link requested disconnect"))
}
}

// tryLinkShutdown attempts to fetch a target link from the switch, calls
Expand Down

0 comments on commit b9b20ac

Please sign in to comment.