Merge pull request #1706 from joostjager/errorprocessing

routing: make routing retry behaviour consistent
This commit is contained in:
Olaoluwa Osuntokun 2018-12-03 18:29:07 -08:00 committed by GitHub
commit 5075394617
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 143 additions and 164 deletions

@ -164,6 +164,12 @@ type paymentSession struct {
bandwidthHints map[uint64]lnwire.MilliSatoshi bandwidthHints map[uint64]lnwire.MilliSatoshi
// errFailedPolicyChans is a map of the short channel ID's that were the
// source of policy related routing failures during this payment attempt.
// We'll use this map to prune out channels when the first error may not
// require pruning, but any subsequent ones do.
errFailedPolicyChans map[uint64]struct{}
mc *missionControl mc *missionControl
haveRoutes bool haveRoutes bool
@ -236,10 +242,11 @@ func (m *missionControl) NewPaymentSession(routeHints [][]HopHint,
} }
return &paymentSession{ return &paymentSession{
pruneViewSnapshot: viewSnapshot, pruneViewSnapshot: viewSnapshot,
additionalEdges: edges, additionalEdges: edges,
bandwidthHints: bandwidthHints, bandwidthHints: bandwidthHints,
mc: m, errFailedPolicyChans: make(map[uint64]struct{}),
mc: m,
}, nil }, nil
} }
@ -249,10 +256,11 @@ func (m *missionControl) NewPaymentSession(routeHints [][]HopHint,
// used for things like channel rebalancing, and swaps. // used for things like channel rebalancing, and swaps.
func (m *missionControl) NewPaymentSessionFromRoutes(routes []*Route) *paymentSession { func (m *missionControl) NewPaymentSessionFromRoutes(routes []*Route) *paymentSession {
return &paymentSession{ return &paymentSession{
pruneViewSnapshot: m.GraphPruneView(), pruneViewSnapshot: m.GraphPruneView(),
haveRoutes: true, haveRoutes: true,
preBuiltRoutes: routes, preBuiltRoutes: routes,
mc: m, errFailedPolicyChans: make(map[uint64]struct{}),
mc: m,
} }
} }
@ -331,6 +339,31 @@ func (p *paymentSession) ReportChannelFailure(e uint64) {
p.mc.Unlock() p.mc.Unlock()
} }
// ReportChannelPolicyFailure processes a failure message that relates to a
// channel policy. On the first such failure for a channel, the channel is only
// marked as 'policy failed once': its policy has been updated and we want to
// keep it included during path finding. If the same channel fails again,
// errSource is reported as a failed vertex instead. This prevents a node from
// keeping us busy by continuously sending new channel updates.
func (p *paymentSession) ReportChannelPolicyFailure(
	errSource Vertex, failedChanID uint64) {

	// If this is the first policy related failure we see for this channel,
	// record it and move on.
	if _, seen := p.errFailedPolicyChans[failedChanID]; !seen {
		p.errFailedPolicyChans[failedChanID] = struct{}{}
		return
	}

	// A policy failure was already reported for this channel, so report
	// the vertex that sent the error.
	//
	// TODO(joostjager): is this aggressive pruning still necessary?
	// Just pruning edges may also work unless there is a huge
	// number of failing channels from that node?
	p.ReportVertexFailure(errSource)
}
// RequestRoute returns a route which is likely to be capable for successfully // RequestRoute returns a route which is likely to be capable for successfully
// routing the specified HTLC payment to the target node. Initially the first // routing the specified HTLC payment to the target node. Initially the first
// set of paths returned from this method may encounter routing failure along // set of paths returned from this method may encounter routing failure along

@ -1641,12 +1641,6 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
sendError error sendError error
) )
// errFailedFeeChans is a map of the short channel ID's that were the
// source of fee related routing failures during this payment attempt.
// We'll use this map to prune out channels when the first error may
// not require pruning, but any subsequent ones do.
errFailedFeeChans := make(map[lnwire.ShortChannelID]struct{})
// We'll also fetch the current block height so we can properly // We'll also fetch the current block height so we can properly
// calculate the required HTLC time locks within the route. // calculate the required HTLC time locks within the route.
_, currentHeight, err := r.cfg.Chain.GetBestBlock() _, currentHeight, err := r.cfg.Chain.GetBestBlock()
@ -1758,11 +1752,55 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
} }
errSource := fErr.ErrorSource errSource := fErr.ErrorSource
errVertex := NewVertex(errSource)
log.Tracef("node=%x reported failure when sending "+ log.Tracef("node=%x reported failure when sending "+
"htlc=%x", errSource.SerializeCompressed(), "htlc=%x", errSource.SerializeCompressed(),
payment.PaymentHash[:]) payment.PaymentHash[:])
// Always determine chan id ourselves, because a channel
// update with id may not be available.
failedChanID, err := getFailedChannelID(route, errSource)
if err != nil {
return preImage, nil, err
}
// processChannelUpdateAndRetry is a closure that deals with a failure
// message carrying a channel update. It always attempts to apply the
// update and forwards the outcome to the payment session, so the session
// can adjust its view on the reliability of the network.
//
// The channel id used is the one determined locally (failedChanID), not
// the one embedded in the channel update message: the remote node may
// lie to us or the update may be corrupt.
processChannelUpdateAndRetry := func(
	update *lnwire.ChannelUpdate,
	pubKey *btcec.PublicKey) {

	// Try to apply the channel update. If that fails, prune the edge,
	// as there is no reason to continue trying this channel.
	//
	// TODO: Could even prune the node completely? Or is there a valid
	// reason for the channel update to fail?
	if applied := r.applyChannelUpdate(update, pubKey); !applied {
		paySession.ReportChannelFailure(failedChanID)
	}

	// In every case, report the policy failure for this channel to the
	// payment session.
	paySession.ReportChannelPolicyFailure(
		NewVertex(errSource), failedChanID,
	)
}
switch onionErr := fErr.FailureMessage.(type) { switch onionErr := fErr.FailureMessage.(type) {
// If the end destination didn't know the payment // If the end destination didn't know the payment
// hash, then we'll terminate immediately. // hash, then we'll terminate immediately.
@ -1803,16 +1841,8 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
// that sent us this error, as it doesn't know what the // that sent us this error, as it doesn't know what the
// correct block height is. // correct block height is.
case *lnwire.FailExpiryTooSoon: case *lnwire.FailExpiryTooSoon:
update := onionErr.Update r.applyChannelUpdate(&onionErr.Update, errSource)
err := r.applyChannelUpdate(&update, errSource) paySession.ReportVertexFailure(errVertex)
if err != nil {
log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
}
pruneVertexFailure(
paySession, route, errSource, false,
)
continue continue
// If we hit an instance of onion payload corruption or // If we hit an instance of onion payload corruption or
@ -1825,66 +1855,30 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
case *lnwire.FailInvalidOnionKey: case *lnwire.FailInvalidOnionKey:
return preImage, nil, sendError return preImage, nil, sendError
// If the onion error includes a channel update, and // If we get a failure due to violating the minimum
// isn't necessarily fatal, then we'll apply the update // amount, we'll apply the new minimum amount and retry
// and continue with the rest of the routes. // routing.
case *lnwire.FailAmountBelowMinimum: case *lnwire.FailAmountBelowMinimum:
update := onionErr.Update processChannelUpdateAndRetry(
err := r.applyChannelUpdate(&update, errSource) &onionErr.Update, errSource,
if err != nil { )
log.Errorf("unable to apply channel "+ continue
"update for onion error: %v", err)
}
return preImage, nil, sendError // If we get a failure due to a fee, we'll apply the
// If we get a failure due to a fee, so we'll apply the
// new fee update, and retry our attempt using the // new fee update, and retry our attempt using the
// newly updated fees. // newly updated fees.
case *lnwire.FailFeeInsufficient: case *lnwire.FailFeeInsufficient:
update := onionErr.Update processChannelUpdateAndRetry(
err := r.applyChannelUpdate(&update, errSource) &onionErr.Update, errSource,
if err != nil { )
log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
pruneEdgeFailure(
paySession, route, errSource,
)
}
// We'll now check to see if we've already
// reported a fee related failure for this
// node. If so, then we'll actually prune out
// the vertex for now.
chanID := update.ShortChannelID
_, ok := errFailedFeeChans[chanID]
if ok {
pruneVertexFailure(
paySession, route, errSource, false,
)
continue
}
// Finally, we'll record a fee failure from
// this node and move on.
errFailedFeeChans[chanID] = struct{}{}
continue continue
// If we get the failure for an intermediate node that // If we get the failure for an intermediate node that
// disagrees with our time lock values, then we'll // disagrees with our time lock values, then we'll
// prune it out for now, and continue with path // apply the new delta value and try it once more.
// finding.
case *lnwire.FailIncorrectCltvExpiry: case *lnwire.FailIncorrectCltvExpiry:
update := onionErr.Update processChannelUpdateAndRetry(
err := r.applyChannelUpdate(&update, errSource) &onionErr.Update, errSource,
if err != nil {
log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
}
pruneVertexFailure(
paySession, route, errSource, false,
) )
continue continue
@ -1892,46 +1886,30 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
// forward one is currently disabled, so we'll apply // forward one is currently disabled, so we'll apply
// the update and continue. // the update and continue.
case *lnwire.FailChannelDisabled: case *lnwire.FailChannelDisabled:
update := onionErr.Update r.applyChannelUpdate(&onionErr.Update, errSource)
err := r.applyChannelUpdate(&update, errSource) paySession.ReportChannelFailure(failedChanID)
if err != nil {
log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
}
pruneEdgeFailure(paySession, route, errSource)
continue continue
// It's likely that the outgoing channel didn't have // It's likely that the outgoing channel didn't have
// sufficient capacity, so we'll prune this edge for // sufficient capacity, so we'll prune this edge for
// now, and continue onwards with our path finding. // now, and continue onwards with our path finding.
case *lnwire.FailTemporaryChannelFailure: case *lnwire.FailTemporaryChannelFailure:
update := onionErr.Update r.applyChannelUpdate(onionErr.Update, errSource)
err := r.applyChannelUpdate(update, errSource) paySession.ReportChannelFailure(failedChanID)
if err != nil {
log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
}
pruneEdgeFailure(paySession, route, errSource)
continue continue
// If the send fail due to a node not having the // If the send fail due to a node not having the
// required features, then we'll note this error and // required features, then we'll note this error and
// continue. // continue.
case *lnwire.FailRequiredNodeFeatureMissing: case *lnwire.FailRequiredNodeFeatureMissing:
pruneVertexFailure( paySession.ReportVertexFailure(errVertex)
paySession, route, errSource, false,
)
continue continue
// If the send fail due to a node not having the // If the send fail due to a node not having the
// required features, then we'll note this error and // required features, then we'll note this error and
// continue. // continue.
case *lnwire.FailRequiredChannelFeatureMissing: case *lnwire.FailRequiredChannelFeatureMissing:
pruneVertexFailure( paySession.ReportVertexFailure(errVertex)
paySession, route, errSource, false,
)
continue continue
// If the next hop in the route wasn't known or // If the next hop in the route wasn't known or
@ -1942,22 +1920,18 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
// returning errors in order to attempt to black list // returning errors in order to attempt to black list
// another node. // another node.
case *lnwire.FailUnknownNextPeer: case *lnwire.FailUnknownNextPeer:
pruneEdgeFailure(paySession, route, errSource) paySession.ReportChannelFailure(failedChanID)
continue continue
// If the node wasn't able to forward for which ever // If the node wasn't able to forward for which ever
// reason, then we'll note this and continue with the // reason, then we'll note this and continue with the
// routes. // routes.
case *lnwire.FailTemporaryNodeFailure: case *lnwire.FailTemporaryNodeFailure:
pruneVertexFailure( paySession.ReportVertexFailure(errVertex)
paySession, route, errSource, false,
)
continue continue
case *lnwire.FailPermanentNodeFailure: case *lnwire.FailPermanentNodeFailure:
pruneVertexFailure( paySession.ReportVertexFailure(errVertex)
paySession, route, errSource, false,
)
continue continue
// If we crafted a route that contains a too long time // If we crafted a route that contains a too long time
@ -1970,16 +1944,14 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
// that as a hint during future path finding through // that as a hint during future path finding through
// that node. // that node.
case *lnwire.FailExpiryTooFar: case *lnwire.FailExpiryTooFar:
pruneVertexFailure( paySession.ReportVertexFailure(errVertex)
paySession, route, errSource, false,
)
continue continue
// If we get a permanent channel or node failure, then // If we get a permanent channel or node failure, then
// we'll note this (exclude the vertex/edge), and // we'll note this (exclude the vertex/edge), and
// continue with the rest of the routes. // continue with the rest of the routes.
case *lnwire.FailPermanentChannelFailure: case *lnwire.FailPermanentChannelFailure:
pruneEdgeFailure(paySession, route, errSource) paySession.ReportChannelFailure(failedChanID)
continue continue
default: default:
@ -1991,74 +1963,46 @@ func (r *ChannelRouter) sendPayment(payment *LightningPayment,
} }
} }
// pruneVertexFailure will attempt to prune a vertex from the current available // getFailedChannelID tries to locate the failing channel given a route and the
// vertexes of the target payment session in response to an encountered routing // pubkey of the node that sent the error. It will assume that the error is
// error. // associated with the outgoing channel of the error node.
func pruneVertexFailure(paySession *paymentSession, route *Route, func getFailedChannelID(route *Route, errSource *btcec.PublicKey) (
errSource *btcec.PublicKey, nextNode bool) { uint64, error) {
// By default, we'll try to prune the node that actually sent us the
// error.
errNode := NewVertex(errSource)
// If this failure indicates that the node _after_ the source of the
// error was not found. As a result, we'll locate the vertex for that
// node itself.
if nextNode {
nodeToPrune, ok := route.nextHopVertex(errSource)
if ok {
errNode = nodeToPrune
}
}
// Once we've located the vertex, we'll report this failure to
// missionControl and restart path finding.
paySession.ReportVertexFailure(errNode)
}
// pruneEdgeFailure will attempts to prune an edge from the current available
// edges of the target payment session in response to an encountered routing
// error.
func pruneEdgeFailure(paySession *paymentSession, route *Route,
errSource *btcec.PublicKey) {
// As this error indicates that the target channel was unable to carry // As this error indicates that the target channel was unable to carry
// this HTLC (for w/e reason), we'll query the index to find the // this HTLC (for w/e reason), we'll query the index to find the
// _outgoing_ channel the source of the error was meant to pass the // _outgoing_ channel the source of the error was meant to pass the
// HTLC along to. // HTLC along to.
badChan, ok := route.nextHopChannel(errSource) if badChan, ok := route.nextHopChannel(errSource); ok {
if !ok { return badChan.ChannelID, nil
// If we weren't able to find the hop *after* this node, then
// we'll attempt to disable the previous channel.
prevChan, ok := route.prevHopChannel(
errSource,
)
if !ok {
return
}
badChan = prevChan
} }
// If the channel was found, then we'll inform mission control of this // If we weren't able to find the hop *after* this node, then we'll
// failure so future attempts avoid this link temporarily. // attempt to disable the previous channel.
paySession.ReportChannelFailure(badChan.ChannelID) //
// TODO(joostjager): errSource must be the final hop then? In that case,
// certain types of errors are not expected. For example
// FailUnknownNextPeer. This could be a reason to prune the node?
if prevChan, ok := route.prevHopChannel(errSource); ok {
return prevChan.ChannelID, nil
}
return 0, fmt.Errorf("cannot find channel in route")
} }
// applyChannelUpdate validates a channel update and if valid, applies it to the // applyChannelUpdate validates a channel update and if valid, applies it to the
// database. // database. It returns a bool indicating whether the update was successful.
func (r *ChannelRouter) applyChannelUpdate(msg *lnwire.ChannelUpdate, func (r *ChannelRouter) applyChannelUpdate(msg *lnwire.ChannelUpdate,
pubKey *btcec.PublicKey) error { pubKey *btcec.PublicKey) bool {
// If we get passed a nil channel update (as it's optional with some // If we get passed a nil channel update (as it's optional with some
// onion errors), then we'll exit early with a nil error. // onion errors), then we'll exit early with a success result.
if msg == nil { if msg == nil {
return nil return true
} }
if err := ValidateChannelUpdateAnn(pubKey, msg); err != nil { if err := ValidateChannelUpdateAnn(pubKey, msg); err != nil {
return err log.Errorf("Unable to validate channel update: %v", err)
return false
} }
err := r.UpdateEdge(&channeldb.ChannelEdgePolicy{ err := r.UpdateEdge(&channeldb.ChannelEdgePolicy{
@ -2072,10 +2016,11 @@ func (r *ChannelRouter) applyChannelUpdate(msg *lnwire.ChannelUpdate,
FeeProportionalMillionths: lnwire.MilliSatoshi(msg.FeeRate), FeeProportionalMillionths: lnwire.MilliSatoshi(msg.FeeRate),
}) })
if err != nil && !IsError(err, ErrIgnored, ErrOutdated) { if err != nil && !IsError(err, ErrIgnored, ErrOutdated) {
return fmt.Errorf("Unable to apply channel update: %v", err) log.Errorf("Unable to apply channel update: %v", err)
return false
} }
return nil return true
} }
// AddNode is used to add information about a node to the router database. If // AddNode is used to add information about a node to the router database. If

@ -421,9 +421,10 @@ func TestChannelUpdateValidation(t *testing.T) {
}, },
} }
route := &Route{ route := NewRouteFromHops(
Hops: hops, lnwire.MilliSatoshi(10000), 100,
} NewVertex(ctx.aliases["a"]), hops,
)
// Set up a channel update message with an invalid signature to be // Set up a channel update message with an invalid signature to be
// returned to the sender. // returned to the sender.