Merge pull request #893 from Roasbeef/routing-error-handling-bug-fix

routing: add timeout during pay attempts, don't exit if unable apply update, account for all onion errors
This commit is contained in:
Olaoluwa Osuntokun 2018-03-21 15:24:29 -07:00 committed by GitHub
commit 90cb2944eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 279 additions and 86 deletions

@ -38,6 +38,10 @@ const (
// this update can't bring us something new, or because a node // this update can't bring us something new, or because a node
// announcement was given for node not found in any channel. // announcement was given for node not found in any channel.
ErrIgnored ErrIgnored
// ErrPaymentAttemptTimeout is an error that indicates that a payment
// attempt timed out before we were able to successfully route an HTLC.
ErrPaymentAttemptTimeout
) )
// routerError is a structure that represent the error inside the routing package, // routerError is a structure that represent the error inside the routing package,

@ -31,6 +31,11 @@ const (
// DefaultFinalCLTVDelta is the default value to be used as the final // DefaultFinalCLTVDelta is the default value to be used as the final
// CLTV delta for a route if one is unspecified. // CLTV delta for a route if one is unspecified.
DefaultFinalCLTVDelta = 9 DefaultFinalCLTVDelta = 9
// defaultPayAttemptTimeout is a duration that we'll use to determine
// if we should give up on a payment attempt. This will be used if a
// value isn't specified in the LightningNode struct.
defaultPayAttemptTimeout = time.Duration(time.Second * 60)
) )
// ChannelGraphSource represents the source of information about the topology // ChannelGraphSource represents the source of information about the topology
@ -1453,6 +1458,12 @@ type LightningPayment struct {
// used. // used.
FinalCLTVDelta *uint16 FinalCLTVDelta *uint16
// PayAttemptTimeout is a timeout value that we'll use to determine
// when we should should abandon the payment attempt after consecutive
// payment failure. This prevents us from attempting to send a payment
// indefinitely.
PayAttemptTimeout time.Duration
// TODO(roasbeef): add e2e message? // TODO(roasbeef): add e2e message?
} }
@ -1476,6 +1487,12 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
sendError error sendError error
) )
// errFailedFeeChans is a map of the short channel ID's that were the
// source of fee related routing failures during this payment attempt.
// We'll use this map to prune out channels when the first error may
// not require pruning, but any subsequent ones do.
errFailedFeeChans := make(map[lnwire.ShortChannelID]struct{})
// We'll also fetch the current block height so we can properly // We'll also fetch the current block height so we can properly
// calculate the required HTLC time locks within the route. // calculate the required HTLC time locks within the route.
_, currentHeight, err := r.cfg.Chain.GetBestBlock() _, currentHeight, err := r.cfg.Chain.GetBestBlock()
@ -1490,6 +1507,15 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
finalCLTVDelta = *payment.FinalCLTVDelta finalCLTVDelta = *payment.FinalCLTVDelta
} }
var payAttemptTimeout time.Duration
if payment.PayAttemptTimeout == time.Duration(0) {
payAttemptTimeout = defaultPayAttemptTimeout
} else {
payAttemptTimeout = payment.PayAttemptTimeout
}
timeoutChan := time.After(payAttemptTimeout)
// Before starting the HTLC routing attempt, we'll create a fresh // Before starting the HTLC routing attempt, we'll create a fresh
// payment session which will report our errors back to mission // payment session which will report our errors back to mission
// control. // control.
@ -1498,6 +1524,27 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
// We'll continue until either our payment succeeds, or we encounter a // We'll continue until either our payment succeeds, or we encounter a
// critical error during path finding. // critical error during path finding.
for { for {
// Before we attempt this next payment, we'll check to see if
// either we've gone past the payment attempt timeout, or the
// router is exiting. In either case, we'll stop this payment
// attempt short.
select {
case <-timeoutChan:
errStr := fmt.Sprintf("payment attempt not completed "+
"before timeout of %v", payAttemptTimeout)
return preImage, nil, newErr(
ErrPaymentAttemptTimeout, errStr,
)
case <-r.quit:
return preImage, nil, fmt.Errorf("router shutting down")
default:
// Fall through if we haven't hit our time limit, or
// are expiring.
}
// We'll kick things off by requesting a new route from mission // We'll kick things off by requesting a new route from mission
// control, which will incorporate the current best known state // control, which will incorporate the current best known state
// of the channel graph and our past HTLC routing // of the channel graph and our past HTLC routing
@ -1526,8 +1573,9 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
// Generate the raw encoded sphinx packet to be included along // Generate the raw encoded sphinx packet to be included along
// with the htlcAdd message that we send directly to the // with the htlcAdd message that we send directly to the
// switch. // switch.
onionBlob, circuit, err := generateSphinxPacket(route, onionBlob, circuit, err := generateSphinxPacket(
payment.PaymentHash[:]) route, payment.PaymentHash[:],
)
if err != nil { if err != nil {
return preImage, nil, err return preImage, nil, err
} }
@ -1546,8 +1594,9 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
// the payment. If this attempt fails, then we'll continue on // the payment. If this attempt fails, then we'll continue on
// to the next available route. // to the next available route.
firstHop := route.Hops[0].Channel.Node.PubKeyBytes firstHop := route.Hops[0].Channel.Node.PubKeyBytes
preImage, sendError = r.cfg.SendToSwitch(firstHop, htlcAdd, preImage, sendError = r.cfg.SendToSwitch(
circuit) firstHop, htlcAdd, circuit,
)
if sendError != nil { if sendError != nil {
// An error occurred when attempting to send the // An error occurred when attempting to send the
// payment, depending on the error type, we'll either // payment, depending on the error type, we'll either
@ -1608,7 +1657,8 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
case *lnwire.FailExpiryTooSoon: case *lnwire.FailExpiryTooSoon:
update := onionErr.Update update := onionErr.Update
if err := r.applyChannelUpdate(&update); err != nil { if err := r.applyChannelUpdate(&update); err != nil {
return preImage, nil, err log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
} }
return preImage, nil, sendError return preImage, nil, sendError
@ -1625,145 +1675,129 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
// If the onion error includes a channel update, and // If the onion error includes a channel update, and
// isn't necessarily fatal, then we'll apply the update // isn't necessarily fatal, then we'll apply the update
// an continue with the rest of the routes. // an continue with the rest of the routes.
//
// TODO(roasbeef): should re-query for routes with new updates
case *lnwire.FailAmountBelowMinimum: case *lnwire.FailAmountBelowMinimum:
update := onionErr.Update update := onionErr.Update
if err := r.applyChannelUpdate(&update); err != nil { if err := r.applyChannelUpdate(&update); err != nil {
return preImage, nil, err log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
} }
return preImage, nil, sendError return preImage, nil, sendError
// If we get a failure due to a fee, so we'll apply the
// new fee update, and retry our attempt using the
// newly updated fees.
case *lnwire.FailFeeInsufficient: case *lnwire.FailFeeInsufficient:
update := onionErr.Update update := onionErr.Update
if err := r.applyChannelUpdate(&update); err != nil { if err := r.applyChannelUpdate(&update); err != nil {
return preImage, nil, err log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
pruneEdgeFailure(
paySession, route, errSource,
)
} }
return preImage, nil, sendError // We'll now check to see if we've already
// reported a fee related failure for this
// node. If so, then we'll actually prune out
// the edge for now.
chanID := update.ShortChannelID
_, ok := errFailedFeeChans[chanID]
if ok {
pruneEdgeFailure(
paySession, route, errSource,
)
continue
}
// Finally, we'll record a fee failure from
// this node and move on.
errFailedFeeChans[chanID] = struct{}{}
continue
case *lnwire.FailIncorrectCltvExpiry: case *lnwire.FailIncorrectCltvExpiry:
update := onionErr.Update update := onionErr.Update
if err := r.applyChannelUpdate(&update); err != nil { if err := r.applyChannelUpdate(&update); err != nil {
return preImage, nil, err log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
} }
return preImage, nil, sendError return preImage, nil, sendError
// The outgoing channel that this node was meant to
// forward one is currently disabled, so we'll apply
// the update and continue.
case *lnwire.FailChannelDisabled: case *lnwire.FailChannelDisabled:
update := onionErr.Update update := onionErr.Update
if err := r.applyChannelUpdate(&update); err != nil { if err := r.applyChannelUpdate(&update); err != nil {
return preImage, nil, err log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
} }
return preImage, nil, sendError pruneEdgeFailure(paySession, route, errSource)
continue
// It's likely that the outgoing channel didn't have
// sufficient capacity, so we'll prune this edge for
// now, and continue onwards with our path finding.
case *lnwire.FailTemporaryChannelFailure: case *lnwire.FailTemporaryChannelFailure:
update := onionErr.Update update := onionErr.Update
if err := r.applyChannelUpdate(update); err != nil { if err := r.applyChannelUpdate(update); err != nil {
return preImage, nil, err log.Errorf("unable to apply channel "+
"update for onion error: %v", err)
} }
// As this error indicates that the target pruneEdgeFailure(paySession, route, errSource)
// channel was unable to carry this HTLC (for
// w/e reason), we'll query the index to find
// the _outgoing_ channel the source of the
// error was meant to pass the HTLC along to.
badChan, ok := route.nextHopChannel(errSource)
if !ok {
// If we weren't able to find the hop
// *after* this node, then we'll
// attempt to disable the previous
// channel.
badChan, ok = route.prevHopChannel(
errSource,
)
if !ok {
continue
}
}
// If the channel was found, then we'll inform
// mission control of this failure so future
// attempts avoid this link temporarily.
paySession.ReportChannelFailure(badChan.ChannelID)
continue continue
// If the send fail due to a node not having the // If the send fail due to a node not having the
// required features, then we'll note this error and // required features, then we'll note this error and
// continue. // continue.
//
// TODO(roasbeef): remove node from path
case *lnwire.FailRequiredNodeFeatureMissing: case *lnwire.FailRequiredNodeFeatureMissing:
pruneVertexFailure(
paySession, route, errSource, false,
)
continue continue
// If the send fail due to a node not having the // If the send fail due to a node not having the
// required features, then we'll note this error and // required features, then we'll note this error and
// continue. // continue.
//
// TODO(roasbeef): remove channel from path
case *lnwire.FailRequiredChannelFeatureMissing: case *lnwire.FailRequiredChannelFeatureMissing:
pruneVertexFailure(
paySession, route, errSource, false,
)
continue continue
// If the next hop in the route wasn't known or // If the next hop in the route wasn't known or
// offline, we'll prune the _next_ hop from the set of // offline, we'll prune the _next_ hop from the set of
// routes and retry. // routes and retry.
case *lnwire.FailUnknownNextPeer: case *lnwire.FailUnknownNextPeer:
// This failure indicates that the node _after_ pruneVertexFailure(
// the source of the error was not found. As a paySession, route, errSource, true,
// result, we'll locate the vertex for that )
// node itself.
missingNode, ok := route.nextHopVertex(errSource)
if !ok {
continue
}
// Once we've located the vertex, we'll report
// this failure to missionControl and restart
// path finding.
paySession.ReportVertexFailure(missingNode)
continue continue
// If the node wasn't able to forward for which ever // If the node wasn't able to forward for which ever
// reason, then we'll note this and continue with the // reason, then we'll note this and continue with the
// routes. // routes.
case *lnwire.FailTemporaryNodeFailure: case *lnwire.FailTemporaryNodeFailure:
missingNode, ok := route.nextHopVertex(errSource) pruneVertexFailure(
if !ok { paySession, route, errSource, false,
)
continue continue
}
paySession.ReportVertexFailure(missingNode) case *lnwire.FailPermanentNodeFailure:
pruneVertexFailure(
paySession, route, errSource, false,
)
continue continue
// If we get a permanent channel or node failure, then // If we get a permanent channel or node failure, then
// we'll note this (exclude the vertex/edge), and // we'll note this (exclude the vertex/edge), and
// continue with the rest of the routes. // continue with the rest of the routes.
case *lnwire.FailPermanentChannelFailure: case *lnwire.FailPermanentChannelFailure:
// As this error indicates that the target pruneEdgeFailure(paySession, route, errSource)
// channel was unable to carry this HTLC (for
// w/e reason), we'll query the index to find
// the _outgoing_ channel the source of the
// error was meant to pass the HTLC along to.
badChan, ok := route.nextHopChannel(errSource)
if !ok {
// If we weren't able to find the hop
// *after* this node, then we'll
// attempt to disable the previous
// channel.
badChan, ok = route.prevHopChannel(
errSource,
)
if !ok {
continue
}
}
// If the channel was found, then we'll inform
// mission control of this failure so future
// attempts avoid this link temporarily.
paySession.ReportChannelFailure(badChan.ChannelID)
continue
case *lnwire.FailPermanentNodeFailure:
// TODO(roasbeef): remove node from path
continue continue
default: default:
@ -1775,6 +1809,62 @@ func (r *ChannelRouter) SendPayment(payment *LightningPayment) ([32]byte, *Route
} }
} }
// pruneVertexFailure will attempt to prune a vertex from the current available
// vertexes of the target payment session in response to an encountered routing
// error.
func pruneVertexFailure(paySession *paymentSession, route *Route,
errSource *btcec.PublicKey, nextNode bool) {
// By default, we'll try to prune the node that actually sent us the
// error.
errNode := NewVertex(errSource)
// If this failure indicates that the node _after_ the source of the
// error was not found. As a result, we'll locate the vertex for that
// node itself.
if nextNode {
nodeToPrune, ok := route.nextHopVertex(errSource)
if ok {
errNode = nodeToPrune
}
}
// Once we've located the vertex, we'll report this failure to
// missionControl and restart path finding.
paySession.ReportVertexFailure(errNode)
}
// pruneEdgeFailure will attempts to prune an edge from the current available
// edges of the target payment session in response to an encountered routing
// error.
func pruneEdgeFailure(paySession *paymentSession, route *Route,
errSource *btcec.PublicKey) {
// As this error indicates that the target channel was unable to carry
// this HTLC (for w/e reason), we'll query the index to find the
// _outgoing_ channel the source of the error was meant to pass the
// HTLC along to.
badChan, ok := route.nextHopChannel(errSource)
if !ok {
// If we weren't able to find the hop *after* this node, then
// we'll attempt to disable the previous channel.
prevChan, ok := route.prevHopChannel(
errSource,
)
if !ok {
return
}
badChan = prevChan
}
// If the channel was found, then we'll inform mission control of this
// failure so future attempts avoid this link temporarily.
paySession.ReportChannelFailure(badChan.ChannelID)
}
// applyChannelUpdate applies a channel update directly to the database, // applyChannelUpdate applies a channel update directly to the database,
// skipping preliminary validation. // skipping preliminary validation.
func (r *ChannelRouter) applyChannelUpdate(msg *lnwire.ChannelUpdate) error { func (r *ChannelRouter) applyChannelUpdate(msg *lnwire.ChannelUpdate) error {

@ -280,6 +280,105 @@ func TestSendPaymentRouteFailureFallback(t *testing.T) {
} }
} }
// TestSendPaymentErrorRepeatedFeeInsufficient tests that if we receive
// multiple fee related errors from a channel that we're attempting to route
// through, then we'll prune the channel after the second attempt.
func TestSendPaymentErrorRepeatedFeeInsufficient(t *testing.T) {
t.Parallel()
const startingBlockHeight = 101
ctx, cleanUp, err := createTestCtx(startingBlockHeight, basicGraphFilePath)
defer cleanUp()
if err != nil {
t.Fatalf("unable to create router: %v", err)
}
// Craft a LightningPayment struct that'll send a payment from roasbeef
// to luo ji for 100 satoshis.
var payHash [32]byte
payment := LightningPayment{
Target: ctx.aliases["luoji"],
Amount: lnwire.NewMSatFromSatoshis(1000),
PaymentHash: payHash,
}
var preImage [32]byte
copy(preImage[:], bytes.Repeat([]byte{9}, 32))
// We'll also fetch the first outgoing channel edge from roasbeef to
// luo ji. We'll obtain this as we'll need to to generate the
// FeeInsufficient error that we'll send back.
chanID := uint64(689530843)
_, _, edgeUpateToFail, err := ctx.graph.FetchChannelEdgesByID(chanID)
if err != nil {
t.Fatalf("unable to fetch chan id: %v", err)
}
errChanUpdate := lnwire.ChannelUpdate{
ShortChannelID: lnwire.NewShortChanIDFromInt(chanID),
Timestamp: uint32(edgeUpateToFail.LastUpdate.Unix()),
Flags: edgeUpateToFail.Flags,
TimeLockDelta: edgeUpateToFail.TimeLockDelta,
HtlcMinimumMsat: edgeUpateToFail.MinHTLC,
BaseFee: uint32(edgeUpateToFail.FeeBaseMSat),
FeeRate: uint32(edgeUpateToFail.FeeProportionalMillionths),
}
sourceNode := ctx.router.selfNode
// We'll now modify the SendToSwitch method to return an error for the
// outgoing channel to luo ji. This will be a fee related error, so it
// should only cause the edge to be pruned after the second attempt.
ctx.router.cfg.SendToSwitch = func(n [33]byte,
_ *lnwire.UpdateAddHTLC, _ *sphinx.Circuit) ([32]byte, error) {
if bytes.Equal(ctx.aliases["luoji"].SerializeCompressed(), n[:]) {
pub, err := sourceNode.PubKey()
if err != nil {
return preImage, err
}
return [32]byte{}, &htlcswitch.ForwardingError{
ErrorSource: pub,
// Within our error, we'll add a channel update
// which is meant to refelct he new fee
// schedule for the node/channel.
FailureMessage: &lnwire.FailFeeInsufficient{
Update: errChanUpdate,
},
}
}
return preImage, nil
}
// Send off the payment request to the router, route through satoshi
// should've been selected as a fall back and succeeded correctly.
paymentPreImage, route, err := ctx.router.SendPayment(&payment)
if err != nil {
t.Fatalf("unable to send payment: %v", err)
}
// The route selected should have two hops
if len(route.Hops) != 2 {
t.Fatalf("incorrect route length: expected %v got %v", 2,
len(route.Hops))
}
// The preimage should match up with the once created above.
if !bytes.Equal(paymentPreImage[:], preImage[:]) {
t.Fatalf("incorrect preimage used: expected %x got %x",
preImage[:], paymentPreImage[:])
}
// The route should have satoshi as the first hop.
if route.Hops[0].Channel.Node.Alias != "satoshi" {
t.Fatalf("route should go through satoshi as first hop, "+
"instead passes through: %v",
route.Hops[0].Channel.Node.Alias)
}
}
// TestSendPaymentErrorPathPruning tests that the send of candidate routes // TestSendPaymentErrorPathPruning tests that the send of candidate routes
// properly gets pruned in response to ForwardingError response from the // properly gets pruned in response to ForwardingError response from the
// underlying SendToSwitch function. // underlying SendToSwitch function.