routing: stricter payment result interpretation

This commit overhauls the interpretation of failed payments. It changes the interpretation rules so that we always apply the strongest possible set of penalties, without making assumptions that would hurt good nodes.

Main changes are:

- Apply different rule sets for intermediate and final nodes. Both types of nodes have different sets of failures that we expect. Penalize nodes that send unexpected failure messages.
- Distinguish between direct payments and multi-hop payments. For direct payments, we can infer more about the performance of our peer because we trust ourselves.
- In many cases it is impossible for the sender to determine which of the two nodes in a pair is responsible for the failure. In this situation, we now penalize bidirectionally. This does not hurt the good node of the pair, because only its connection to a bad node is penalized.
- Previously we always penalized the outgoing connection of the reporting node. This is incorrect for policy-related failures. For policy-related failures, it could also be that the reporting node received a wrongly crafted htlc from its predecessor. By penalizing the incoming channel, we surely hit the responsible node.
- FailExpiryTooSoon is a failure that could have been caused by any node up to the reporting node by delaying forwarding of the htlc. We don't know which node is responsible, therefore we now penalize all node pairs in the route.
2019-06-25 10:51:55 +02:00 · 2019-06-25 10:51:55 +02:00 · d9ec158412
commit d9ec158412
parent e7a457f1ce
2 changed files with 280 additions and 208 deletions
--- a/routing/result_interpretation.go
+++ b/routing/result_interpretation.go
@ -2,7 +2,6 @@ package routing
 import (
 	"github.com/lightningnetwork/lnd/channeldb"
 	"github.com/lightningnetwork/lnd/lnwire"
 	"github.com/lightningnetwork/lnd/routing/route"
 )
@ -10,6 +9,7 @@ import (
 // Instantiate variables to allow taking a reference from the failure reason.
 var (
 	reasonError            = channeldb.FailureReasonError
 	reasonIncorrectDetails = channeldb.FailureReasonIncorrectPaymentDetails
 )
 // interpretedResult contains the result of the interpretation of a payment
@ -44,10 +44,7 @@ func interpretResult(rt *route.Route, failureSrcIdx *int,
 		pairResults: make(map[DirectedNodePair]lnwire.MilliSatoshi),
 	}
-	final, reason := i.processFail(rt, failureSrcIdx, failure)
+	i.processFail(rt, failureSrcIdx, failure)
 	if final {
 		i.finalFailureReason = &reason
 	}
 	return i
 }
@ -55,187 +52,240 @@ func interpretResult(rt *route.Route, failureSrcIdx *int,
 // processFail processes a failed payment attempt.
 func (i *interpretedResult) processFail(
 	rt *route.Route, errSourceIdx *int,
-	failure lnwire.FailureMessage) (bool, channeldb.FailureReason) {
+	failure lnwire.FailureMessage) {
 	if errSourceIdx == nil {
 		i.processPaymentOutcomeUnknown(rt)
-		return false, 0
+		return
 	}
-	var failureVertex route.Vertex
+	switch *errSourceIdx {
-	failureSourceIdxInt := *errSourceIdx
+	// We are the source of the failure.
-	if failureSourceIdxInt > 0 {
+	case 0:
-		failureVertex = rt.Hops[failureSourceIdxInt-1].PubKeyBytes
+		i.processPaymentOutcomeSelf(rt, failure)
 	} else {
 		failureVertex = rt.SourcePubKey
 	}
 	log.Tracef("Node %x (index %v) reported failure when sending htlc",
 		failureVertex, errSourceIdx)
-	// Always determine chan id ourselves, because a channel update with id
+	// A failure from the final hop was received.
-	// may not be available.
+	case len(rt.Hops):
-	failedPair, failedAmt := getFailedPair(
+		i.processPaymentOutcomeFinal(
-		rt, failureSourceIdxInt,
+			rt, failure,
 		)
 	// An intermediate hop failed. Interpret the outcome, update reputation
 	// and try again.
 	default:
 		i.processPaymentOutcomeIntermediate(
 			rt, *errSourceIdx, failure,
 		)
 	}
 }
 // processPaymentOutcomeSelf handles failures sent by ourselves.
 func (i *interpretedResult) processPaymentOutcomeSelf(
 	rt *route.Route, failure lnwire.FailureMessage) {
 	switch failure.(type) {
-	// If the end destination didn't know the payment hash or we sent the
+	// We receive a malformed htlc failure from our peer. We trust ourselves
-	// wrong payment amount to the destination, then we'll terminate
+	// to send the correct htlc, so our peer must be at fault.
-	// immediately.
+	case *lnwire.FailInvalidOnionVersion,
-	case *lnwire.FailIncorrectDetails:
+		*lnwire.FailInvalidOnionHmac,
-		// TODO(joostjager): Check onionErr.Amount() whether it matches
+		*lnwire.FailInvalidOnionKey:
 		// what we expect. (Will it ever not match, because if not
 		// final_incorrect_htlc_amount would be returned?)
-		return true, channeldb.FailureReasonIncorrectPaymentDetails
+		i.failNode(rt, 1)
-	// If we sent the wrong amount to the destination, then we'll exit
+		// If this was a payment to a direct peer, we can stop trying.
-	// early.
+		if len(rt.Hops) == 1 {
-	case *lnwire.FailIncorrectPaymentAmount:
+			i.finalFailureReason = &reasonError
-		return true, channeldb.FailureReasonIncorrectPaymentDetails
+		}
-	// If the time-lock that was extended to the final node was incorrect,
+	// Any other failure originating from ourselves should be temporary and
-	// then we can't proceed.
+	// caused by changing conditions between path finding and execution of
-	case *lnwire.FailFinalIncorrectCltvExpiry:
+	// the payment. We just retry and trust that the information locally
-		// TODO(joostjager): Take into account that second last hop may
+	// available in the link has been updated.
 		// have deliberately handed out an htlc that expires too soon.
 		// In that case we should continue routing.
 		return true, channeldb.FailureReasonError
 	// If we crafted an invalid onion payload for the final node, then we'll
 	// exit early.
 	case *lnwire.FailFinalIncorrectHtlcAmount:
 		// TODO(joostjager): Take into account that second last hop may
 		// have deliberately handed out an htlc with a too low value. In
 		// that case we should continue routing.
 		return true, channeldb.FailureReasonError
 	// Similarly, if the HTLC expiry that we extended to the final hop
 	// expires too soon, then will fail the payment.
 	//
 	// TODO(roasbeef): can happen due to a race condition, try again with
 	// recent block height
 	case *lnwire.FailFinalExpiryTooSoon:
 		// TODO(joostjager): Take into account that any hop may have
 		// delayed. Ideally we should continue routing. Knowing the
 		// delaying node at this point would help.
 		return true, channeldb.FailureReasonIncorrectPaymentDetails
 	// If we erroneously attempted to cross a chain border, then we'll
 	// cancel the payment.
 	case *lnwire.FailInvalidRealm:
 		return true, channeldb.FailureReasonError
 	// If we get a notice that the expiry was too soon for an intermediate
 	// node, then we'll prune out the node that sent us this error, as it
 	// doesn't know what the correct block height is.
 	case *lnwire.FailExpiryTooSoon:
 		i.nodeFailure = &failureVertex
 		return false, 0
 	// If we hit an instance of onion payload corruption or an invalid
 	// version, then we'll exit early as this shouldn't happen in the
 	// typical case.
 	//
 	// TODO(joostjager): Take into account that the previous hop may have
 	// tampered with the onion. Routing should continue using other paths.
 	case *lnwire.FailInvalidOnionVersion:
 		return true, channeldb.FailureReasonError
 	case *lnwire.FailInvalidOnionHmac:
 		return true, channeldb.FailureReasonError
 	case *lnwire.FailInvalidOnionKey:
 		return true, channeldb.FailureReasonError
 	// If we get a failure due to violating the minimum amount, we'll apply
 	// the new minimum amount and retry routing.
 	case *lnwire.FailAmountBelowMinimum:
 		i.policyFailure = &failedPair
 		i.pairResults[failedPair] = 0
 		return false, 0
 	// If we get a failure due to a fee, we'll apply the new fee update, and
 	// retry our attempt using the newly updated fees.
 	case *lnwire.FailFeeInsufficient:
 		i.policyFailure = &failedPair
 		i.pairResults[failedPair] = 0
 		return false, 0
 	// If we get the failure for an intermediate node that disagrees with
 	// our time lock values, then we'll apply the new delta value and try it
 	// once more.
 	case *lnwire.FailIncorrectCltvExpiry:
 		i.policyFailure = &failedPair
 		i.pairResults[failedPair] = 0
 		return false, 0
 	// The outgoing channel that this node was meant to forward one is
 	// currently disabled, so we'll apply the update and continue.
 	case *lnwire.FailChannelDisabled:
 		i.pairResults[failedPair] = 0
 		return false, 0
 	// It's likely that the outgoing channel didn't have sufficient
 	// capacity, so we'll prune this edge for now, and continue onwards with
 	// our path finding.
 	case *lnwire.FailTemporaryChannelFailure:
 		i.pairResults[failedPair] = failedAmt
 		return false, 0
 	// If the send fail due to a node not having the required features, then
 	// we'll note this error and continue.
 	case *lnwire.FailRequiredNodeFeatureMissing:
 		i.nodeFailure = &failureVertex
 		return false, 0
 	// If the send fail due to a node not having the required features, then
 	// we'll note this error and continue.
 	case *lnwire.FailRequiredChannelFeatureMissing:
 		i.nodeFailure = &failureVertex
 		return false, 0
 	// If the next hop in the route wasn't known or offline, we'll only the
 	// channel which we attempted to route over. This is conservative, and
 	// it can handle faulty channels between nodes properly. Additionally,
 	// this guards against routing nodes returning errors in order to
 	// attempt to black list another node.
 	case *lnwire.FailUnknownNextPeer:
 		i.pairResults[failedPair] = 0
 		return false, 0
 	// If the node wasn't able to forward for which ever reason, then we'll
 	// note this and continue with the routes.
 	case *lnwire.FailTemporaryNodeFailure:
 		i.nodeFailure = &failureVertex
 		return false, 0
 	case *lnwire.FailPermanentNodeFailure:
 		i.nodeFailure = &failureVertex
 		return false, 0
 	// If we crafted a route that contains a too long time lock for an
 	// intermediate node, we'll prune the node. As there currently is no way
 	// of knowing that node's maximum acceptable cltv, we cannot take this
 	// constraint into account during routing.
 	//
 	// TODO(joostjager): Record the rejected cltv and use that as a hint
 	// during future path finding through that node.
 	case *lnwire.FailExpiryTooFar:
 		i.nodeFailure = &failureVertex
 		return false, 0
 	// If we get a permanent channel or node failure, then we'll prune the
 	// channel in both directions and continue with the rest of the routes.
 	case *lnwire.FailPermanentChannelFailure:
 		i.pairResults[failedPair] = 0
 		i.pairResults[failedPair.Reverse()] = 0
 		return false, 0
 	// Any other failure or an empty failure will get the node pruned.
 	default:
-		i.nodeFailure = &failureVertex
+		log.Warnf("Routing failure for local channel %v occurred",
-		return false, 0
+			rt.Hops[0].ChannelID)
 	}
 }
 // processPaymentOutcomeFinal interprets a failure message that was sent by the
 // final hop of the route. Most failures from the final hop end the payment;
 // only an htlc that doesn't match the onion on a multi-hop route leads to a
 // retry via another route. Invalid onion failures are not expected here,
 // because the final node wouldn't be able to encrypt such a failure.
 func (i *interpretedResult) processPaymentOutcomeFinal(
 	route *route.Route, failure lnwire.FailureMessage) {
 	// The final node is addressed by the number of hops in the route.
 	finalIdx := len(route.Hops)
 	// penalizeAndStop marks the final node as failed and ends the payment
 	// with a generic error reason.
 	penalizeAndStop := func() {
 		i.failNode(route, finalIdx)
 		i.finalFailureReason = &reasonError
 	}
 	switch failure.(type) {
 	// Either we are using the wrong payment hash or amount, or the HTLC
 	// that was extended to the final hop expires too soon (we may be using
 	// the wrong final cltv delta). In both situations the payment is
 	// failed without penalizing anyone.
 	//
 	// TODO(roasbeef): expiry too soon can happen due to a race condition,
 	// try again with recent block height
 	// TODO(joostjager): expiry too soon can also happen because a node
 	// delayed deliberately. What to penalize?
 	case *lnwire.FailIncorrectPaymentAmount,
 		*lnwire.FailIncorrectDetails,
 		*lnwire.FailFinalExpiryTooSoon:
 		i.finalFailureReason = &reasonIncorrectDetails
 	// Expiry or amount of the HTLC doesn't match the onion.
 	case *lnwire.FailFinalIncorrectCltvExpiry,
 		*lnwire.FailFinalIncorrectHtlcAmount:
 		// On a route that is not a direct payment, penalize the last
 		// pair of the route and retry. Either the final node is at
 		// fault, or it gets sent a bad htlc from its predecessor.
 		if finalIdx != 1 {
 			i.failPair(route, finalIdx-1)
 			return
 		}
 		// We trust ourselves. For a direct payment, the final node must
 		// be to blame, so we penalize it and fail the payment.
 		penalizeAndStop()
 	// All other errors are considered terminal if coming from the final
 	// hop. They indicate that something is wrong at the recipient, so we
 	// do apply a penalty.
 	default:
 		penalizeAndStop()
 	}
 }
 // processPaymentOutcomeIntermediate interprets a failure message that was sent
 // by an intermediate hop. errorSourceIdx is the position of the reporting node
 // in the route, where the node at position k is route.Hops[k-1].
 func (i *interpretedResult) processPaymentOutcomeIntermediate(
 	route *route.Route, errorSourceIdx int,
 	failure lnwire.FailureMessage) {
 	// We trust ourselves. If the error comes from the first hop, there is
 	// no uncertainty as to which node to blame, so wherever we would
 	// otherwise penalize pairs on the incoming side we can penalize the
 	// whole node instead.
 	fromFirstHop := errorSourceIdx == 1
 	// penalizeOutgoing fails the pair on the outgoing side of the
 	// reporting node.
 	penalizeOutgoing := func() {
 		i.failPair(route, errorSourceIdx)
 	}
 	// penalizeIncoming fails the pair on the incoming side of the
 	// reporting node, or the node itself when it is our direct peer.
 	penalizeIncoming := func() {
 		if fromFirstHop {
 			i.failNode(route, errorSourceIdx)
 		} else {
 			i.failPair(route, errorSourceIdx-1)
 		}
 	}
 	switch failure.(type) {
 	// Three situations in which the outgoing pair is penalized in both
 	// directions:
 	//
 	// - A node reports onion payload corruption or an invalid version.
 	//   That node may be responsible, but it could also be that it is
 	//   just relaying a malformed htlc failure from its successor. By
 	//   reporting the outgoing channel set, we will surely hit the
 	//   responsible node. At this point, it is not possible that the
 	//   node's predecessor corrupted the onion blob. If the predecessor
 	//   would have corrupted the payload, the error source wouldn't have
 	//   been able to encrypt this failure message for us.
 	//
 	// - The next hop in the route wasn't known or offline. We only
 	//   penalize the channel set which we attempted to route over. This
 	//   is conservative, and it can handle faulty channels between nodes
 	//   properly. Additionally, this guards against routing nodes
 	//   returning errors in order to attempt to black list another node.
 	//
 	// - A permanent channel failure. We prune the channel set in both
 	//   directions and continue with the rest of the routes.
 	case *lnwire.FailInvalidOnionVersion,
 		*lnwire.FailInvalidOnionHmac,
 		*lnwire.FailInvalidOnionKey,
 		*lnwire.FailUnknownNextPeer,
 		*lnwire.FailPermanentChannelFailure:
 		penalizeOutgoing()
 	// When an HTLC parameter is incorrect, the node sending the error may
 	// be doing something wrong. But it could also be that its predecessor
 	// is intentionally modifying the htlc parameters that we instructed it
 	// via the hop payload. Therefore we penalize the incoming node pair. A
 	// third cause of this error may be that we have an out of date channel
 	// update. This is handled by the second chance logic up in mission
 	// control.
 	case *lnwire.FailAmountBelowMinimum,
 		*lnwire.FailFeeInsufficient,
 		*lnwire.FailIncorrectCltvExpiry,
 		*lnwire.FailChannelDisabled:
 		// Record the node pair for which a channel update may be out of
 		// date. The second chance logic uses the policyFailure field.
 		staleFrom := route.Hops[errorSourceIdx-1].PubKeyBytes
 		staleTo := route.Hops[errorSourceIdx].PubKeyBytes
 		i.policyFailure = &DirectedNodePair{
 			From: staleFrom,
 			To:   staleTo,
 		}
 		// We report the incoming channel. If mission control grants a
 		// second chance, this report is ignored.
 		penalizeIncoming()
 	// If the outgoing channel doesn't have enough capacity, we penalize.
 	// But we penalize only in a single direction and only for amounts
 	// greater than the attempted amount.
 	case *lnwire.FailTemporaryChannelFailure:
 		i.failPairBalance(route, errorSourceIdx)
 	// If FailExpiryTooSoon is received, there must have been some delay
 	// along the path. We can't know which node is causing the delay, so we
 	// penalize all of them up to the error source.
 	//
 	// Alternatively it could also be that we ourselves have fallen behind
 	// somehow. We ignore that case for now.
 	case *lnwire.FailExpiryTooSoon:
 		if fromFirstHop {
 			i.failNode(route, errorSourceIdx)
 			return
 		}
 		// Penalize all pairs up to the error source. This includes our
 		// own outgoing connection.
 		i.failPairRange(route, 0, errorSourceIdx-1)
 	// In all other cases, we penalize the reporting node. These are all
 	// failures that should not happen.
 	default:
 		i.failNode(route, errorSourceIdx)
 	}
 }
@ -263,7 +313,8 @@ func (i *interpretedResult) failNode(rt *route.Route, idx int) {
 	i.nodeFailure = &rt.Hops[idx-1].PubKeyBytes
 }
-// failPairRange marks the node pairs from node fromIdx to node toIdx as failed.
+// failPairRange marks the node pairs from node fromIdx to node toIdx as failed
 // in both directions.
 func (i *interpretedResult) failPairRange(
 	rt *route.Route, fromIdx, toIdx int) {
@ -283,6 +334,15 @@ func (i *interpretedResult) failPair(
 	i.pairResults[pair.Reverse()] = 0
 }
 // failPairBalance records a failure for the node pair at channelIdx in the
 // forward direction only, storing the amount that getPair reports for that
 // pair as the penalization amount.
 func (i *interpretedResult) failPairBalance(
 	rt *route.Route, channelIdx int) {
 	failedPair, attemptedAmt := getPair(rt, channelIdx)
 	i.pairResults[failedPair] = attemptedAmt
 }
 // getPair returns a node pair from the route and the amount passed between that
 // pair.
 func getPair(rt *route.Route, channelIdx int) (DirectedNodePair,
@ -306,36 +366,3 @@ func getPair(rt *route.Route, channelIdx int) (DirectedNodePair,
 	return pair, amt
 }
 // getFailedPair identifies the node pair that a failure is attributed to, given
 // the route and the position of the node that reported the failure (position 0
 // being the route source). The failure is attributed to the outgoing pair of
 // the reporting node, except when the final hop reports it, in which case the
 // pair leading into the final hop is used. The second return value is the
 // amount that was sent between the returned pair.
 func getFailedPair(route *route.Route, failureSource int) (DirectedNodePair,
 	lnwire.MilliSatoshi) {
 	// A failure reported by the final hop has no outgoing pair, so step
 	// back one position and assume the incoming channel failed.
 	//
 	// TODO(joostjager): In this case, certain types of failures are not
 	// expected. For example FailUnknownNextPeer. This could be a reason to
 	// prune the node?
 	if len(route.Hops) == failureSource {
 		failureSource--
 	}
 	// The failure indicates that the target channel was unable to carry
 	// this HTLC, so we return the _outgoing_ pair that the source of the
 	// failure was meant to pass the HTLC along to. For any node other than
 	// the route source, that pair and its amount come from the hop list.
 	if failureSource != 0 {
 		return NewDirectedNodePair(
 			route.Hops[failureSource-1].PubKeyBytes,
 			route.Hops[failureSource].PubKeyBytes,
 		), route.Hops[failureSource-1].AmtToForward
 	}
 	// The route source itself failed: the pair runs from the source to the
 	// first hop and carries the total amount of the route.
 	return NewDirectedNodePair(
 		route.SourcePubKey,
 		route.Hops[0].PubKeyBytes,
 	), route.TotalAmount
 }
--- a/routing/result_interpretation_test.go
+++ b/routing/result_interpretation_test.go
@ -14,6 +14,14 @@ var (
 		{1, 0}, {1, 1}, {1, 2}, {1, 3}, {1, 4},
 	}
 	routeOneHop = route.Route{
 		SourcePubKey: hops[0],
 		TotalAmount:  100,
 		Hops: []*route.Hop{
 			{PubKeyBytes: hops[1], AmtToForward: 99},
 		},
 	}
 	routeTwoHop = route.Route{
 		SourcePubKey: hops[0],
 		TotalAmount:  100,
@ -35,6 +43,10 @@ var (
 	}
 )
 // getTestPair builds the directed node pair that runs from hops[from] to
 // hops[to] in the test fixture.
 func getTestPair(from, to int) DirectedNodePair {
 	source, target := hops[from], hops[to]
 	return NewDirectedNodePair(source, target)
 }
 type resultTestCase struct {
 	name          string
 	route         *route.Route
@ -55,7 +67,7 @@ var resultTestCases = []resultTestCase{
 		expectedResult: &interpretedResult{
 			pairResults: map[DirectedNodePair]lnwire.MilliSatoshi{
-				NewDirectedNodePair(hops[1], hops[2]): 99,
+				getTestPair(1, 2): 99,
 			},
 		},
 	},
@ -68,7 +80,40 @@ var resultTestCases = []resultTestCase{
 		failure:       lnwire.NewExpiryTooSoon(lnwire.ChannelUpdate{}),
 		expectedResult: &interpretedResult{
-			nodeFailure: &hops[3],
+			pairResults: map[DirectedNodePair]lnwire.MilliSatoshi{
 				getTestPair(0, 1): 0,
 				getTestPair(1, 0): 0,
 				getTestPair(1, 2): 0,
 				getTestPair(2, 1): 0,
 				getTestPair(2, 3): 0,
 				getTestPair(3, 2): 0,
 			},
 		},
 	},
 	// Tests a malformed htlc from a direct peer.
 	{
 		name:          "fail malformed htlc from direct peer",
 		route:         &routeTwoHop,
 		failureSrcIdx: 0,
 		failure:       lnwire.NewInvalidOnionKey(nil),
 		expectedResult: &interpretedResult{
 			nodeFailure: &hops[1],
 		},
 	},
 	// Tests a malformed htlc from a direct peer that is also the final
 	// destination.
 	{
 		name:          "fail malformed htlc from direct final peer",
 		route:         &routeOneHop,
 		failureSrcIdx: 0,
 		failure:       lnwire.NewInvalidOnionKey(nil),
 		expectedResult: &interpretedResult{
 			finalFailureReason: &reasonError,
 			nodeFailure:        &hops[1],
 		},
 	},
 }