server: always backoff for unstable peers

This commit modifies the connection peer backoff
logic such that it will always back off for "unstable"
peers. Unstable in this context means a peer whose
connection lasted less than 10 minutes. If a
disconnect happens with a peer whose connection
lasted longer than 10 minutes, we will scale back
our stored backoff for that peer.

This resolves an issue that would result in a tight
connection loop with remote peers. This stemmed
from the connection duration being very short,
which always drove the backoff back down to the
default of 1 second. Short connections like this
are now caught by the stable connection
threshold.
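
For illustration only, the sketch below is not the
server code itself: it assumes a simplified
nextBackoff that merely doubles the previous value
(the real computeNextBackoff is randomized and
capped), and oldPeerBackoff/newPeerBackoff are
hypothetical stand-ins. It shows how a peer that
drops us after two seconds used to pin the backoff
at the 1 second default, while the stability
threshold now lets it keep growing:

    package main

    import (
        "fmt"
        "time"
    )

    const (
        defaultBackoff            = time.Second
        defaultStableConnDuration = 10 * time.Minute
    )

    // nextBackoff stands in for the server's randomized exponential
    // backoff; here it simply doubles the previous value.
    func nextBackoff(backoff time.Duration) time.Duration {
        return 2 * backoff
    }

    // oldPeerBackoff mirrors the pre-commit behavior: the connection
    // duration is subtracted before deciding whether to back off further.
    func oldPeerBackoff(backoff, connDuration time.Duration) time.Duration {
        relaxed := backoff - connDuration
        if relaxed > defaultBackoff {
            return nextBackoff(relaxed)
        }
        return defaultBackoff
    }

    // newPeerBackoff mirrors the post-commit behavior: connections shorter
    // than the stability threshold always back off further.
    func newPeerBackoff(backoff, connDuration time.Duration) time.Duration {
        if connDuration < defaultStableConnDuration {
            return nextBackoff(backoff)
        }
        // Stable connections are handled by the relaxation logic, which is
        // omitted from this sketch.
        return defaultBackoff
    }

    func main() {
        connDuration := 2 * time.Second // the peer drops us after two seconds

        oldBackoff, newBackoff := defaultBackoff, defaultBackoff
        for attempt := 1; attempt <= 4; attempt++ {
            oldBackoff = oldPeerBackoff(oldBackoff, connDuration)
            newBackoff = newPeerBackoff(newBackoff, connDuration)
            fmt.Printf("attempt %d: old=%v new=%v\n", attempt, oldBackoff, newBackoff)
        }
        // The old logic stays pinned at 1s (a tight reconnection loop);
        // the new logic grows 2s, 4s, 8s, 16s.
    }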

This also modifies the backoff relaxation
computation to subtract the connection duration
after applying randomized exponential backoff,
which offers better stability when the connection
duration and backoff are roughly equal.
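
As a rough sketch of why the ordering matters
(again assuming a simplified doubling backoff in
place of the real randomized computeNextBackoff;
simplifiedNextBackoff is a hypothetical stand-in):
when the stored backoff and the connection duration
are both about one minute, subtracting first
collapses the backoff to the 1 second default,
while backing off first keeps it near its previous
value:

    package main

    import (
        "fmt"
        "time"
    )

    const defaultBackoff = time.Second

    // simplifiedNextBackoff stands in for the server's randomized
    // exponential backoff; here it simply doubles the previous value.
    func simplifiedNextBackoff(backoff time.Duration) time.Duration {
        return 2 * backoff
    }

    func main() {
        // A stable peer whose connection lasted about as long as the
        // stored backoff.
        backoff := time.Minute
        connDuration := time.Minute

        // Old ordering: subtract first, then (if large enough) back off.
        // 1m - 1m = 0 <= defaultBackoff, so the backoff collapses to 1s.
        oldRelaxed := backoff - connDuration
        if oldRelaxed > defaultBackoff {
            oldRelaxed = simplifiedNextBackoff(oldRelaxed)
        } else {
            oldRelaxed = defaultBackoff
        }

        // New ordering: back off first, then subtract.
        // 2m - 1m = 1m > defaultBackoff, so the backoff stays near its
        // previous value instead of resetting.
        newRelaxed := simplifiedNextBackoff(backoff) - connDuration
        if newRelaxed <= defaultBackoff {
            newRelaxed = defaultBackoff
        }

        fmt.Println("old ordering:", oldRelaxed) // 1s
        fmt.Println("new ordering:", newRelaxed) // 1m0s
    }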
Conner Fromknecht 2018-09-01 14:09:16 -07:00
parent 4f43c1c943
commit 8d7eb41d48

@@ -53,6 +53,12 @@ const (
 	// maximumBackoff is the largest backoff we will permit when
 	// reattempting connections to persistent peers.
 	maximumBackoff = time.Hour
+
+	// defaultStableConnDuration is a floor under which all reconnection
+	// attempts will apply exponential randomized backoff. Connection
+	// durations exceeding this value will be eligible to have their
+	// backoffs reduced.
+	defaultStableConnDuration = 10 * time.Minute
 )
 
 var (
@@ -1951,19 +1957,28 @@ func (s *server) nextPeerBackoff(pubStr string,
 		return computeNextBackoff(backoff)
 	}
 
-	// The peer succeeded in starting. We'll reduce the timeout duration
-	// by the length of the connection before applying randomized
-	// exponential backoff. We'll only apply this if:
-	//   backoff - connDuration > defaultBackoff
+	// The peer succeeded in starting. If the connection didn't last long
+	// enough to be considered stable, we'll continue to back off retries
+	// with this peer.
 	connDuration := time.Now().Sub(startTime)
+	if connDuration < defaultStableConnDuration {
+		return computeNextBackoff(backoff)
+	}
 
-	relaxedBackoff := backoff - connDuration
-	if relaxedBackoff > defaultBackoff {
-		return computeNextBackoff(relaxedBackoff)
+	// The peer succeeded in starting and this was a stable peer, so we'll
+	// reduce the timeout duration by the length of the connection after
+	// applying randomized exponential backoff. We'll only apply this in
+	// the case that:
+	//   reb(currBackoff) - connDuration > defaultBackoff
+	relaxedBackoff := computeNextBackoff(backoff) - connDuration
+	if relaxedBackoff > defaultBackoff {
+		return relaxedBackoff
 	}
 
-	// Otherwise, backoff - connDuration <= defaultBackoff, meaning the
-	// connection lasted much longer than our previous backoff. To reward
-	// such good behavior, we'll reconnect after the default timeout.
+	// Lastly, if reb(currBackoff) - connDuration <= defaultBackoff, then
+	// the stable connection lasted much longer than our previous backoff.
+	// To reward such good behavior, we'll reconnect after the default
+	// timeout.
 	return defaultBackoff
 }
@@ -2451,7 +2466,7 @@ func (s *server) peerTerminationWatcher(p *peer, ready chan struct{}) {
 	links, err := p.server.htlcSwitch.GetLinksByInterface(p.pubKeyBytes)
 	if err != nil && err != htlcswitch.ErrNoLinksFound {
 		srvrLog.Errorf("Unable to get channel links for %x: %v",
-			p.pubKeyBytes, err)
+			p.PubKey(), err)
 	}
 
 	for _, link := range links {