routing: move failure interpretation into mission control

This commit is contained in:
Joost Jager 2019-06-26 09:49:16 +02:00
parent add905d17f
commit 934ea8e78d
No known key found for this signature in database
GPG Key ID: A61B9D4C393C59C7
4 changed files with 312 additions and 264 deletions

@ -5,6 +5,7 @@ import (
"sync"
"time"
"github.com/lightningnetwork/lnd/channeldb"
"github.com/lightningnetwork/lnd/lnwire"
"github.com/lightningnetwork/lnd/routing/route"
)
@ -282,8 +283,8 @@ func (m *MissionControl) createHistoryIfNotExists(vertex route.Vertex) *nodeHist
return node
}
// ReportVertexFailure reports a node level failure.
func (m *MissionControl) ReportVertexFailure(v route.Vertex) {
// reportVertexFailure reports a node level failure.
func (m *MissionControl) reportVertexFailure(v route.Vertex) {
log.Debugf("Reporting vertex %v failure to Mission Control", v)
now := m.now()
@ -295,8 +296,8 @@ func (m *MissionControl) ReportVertexFailure(v route.Vertex) {
history.lastFail = &now
}
// ReportEdgePolicyFailure reports a policy related failure.
func (m *MissionControl) ReportEdgePolicyFailure(failedEdge edge) {
// reportEdgePolicyFailure reports a policy related failure.
func (m *MissionControl) reportEdgePolicyFailure(failedEdge edge) {
now := m.now()
m.Lock()
@ -315,10 +316,10 @@ func (m *MissionControl) ReportEdgePolicyFailure(failedEdge edge) {
history.lastFail = &now
}
// ReportEdgeFailure reports a channel level failure.
// reportEdgeFailure reports a channel level failure.
//
// TODO(roasbeef): also add value attempted to send and capacity of channel
func (m *MissionControl) ReportEdgeFailure(failedEdge edge,
func (m *MissionControl) reportEdgeFailure(failedEdge edge,
minPenalizeAmt lnwire.MilliSatoshi) {
log.Debugf("Reporting channel %v failure to Mission Control",
@ -387,3 +388,238 @@ func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
return &snapshot
}
// ReportPaymentFail reports a failed payment to mission control as input for
// future probability estimates. It returns a bool indicating whether this error
// is a final error and no further payment attempts need to be made.
func (m *MissionControl) ReportPaymentFail(rt *route.Route,
failureSourceIdx int, failure lnwire.FailureMessage) (bool,
channeldb.FailureReason) {
var (
failureVertex route.Vertex
)
// For any non-self failure, look up the source pub key in the hops
// slice. Otherwise return the self node pubkey.
if failureSourceIdx > 0 {
failureVertex = rt.Hops[failureSourceIdx-1].PubKeyBytes
} else {
failureVertex = rt.SourcePubKey
}
log.Tracef("Node %x (index %v) reported failure when sending htlc",
failureVertex, failureSourceIdx)
// Always determine chan id ourselves, because a channel
// update with id may not be available.
failedEdge, failedAmt := getFailedEdge(rt, failureSourceIdx)
switch failure.(type) {
// If the end destination didn't know the payment
// hash or we sent the wrong payment amount to the
// destination, then we'll terminate immediately.
case *lnwire.FailUnknownPaymentHash:
// TODO(joostjager): Check onionErr.Amount() whether it matches
// what we expect. (Will it ever not match, because if not
// final_incorrect_htlc_amount would be returned?)
return true, channeldb.FailureReasonIncorrectPaymentDetails
// If we sent the wrong amount to the destination, then
// we'll exit early.
case *lnwire.FailIncorrectPaymentAmount:
return true, channeldb.FailureReasonIncorrectPaymentDetails
// If the time-lock that was extended to the final node
// was incorrect, then we can't proceed.
case *lnwire.FailFinalIncorrectCltvExpiry:
// TODO(joostjager): Take into account that second last hop may
// have deliberately handed out an htlc that expires too soon.
// In that case we should continue routing.
return true, channeldb.FailureReasonError
// If we crafted an invalid onion payload for the final
// node, then we'll exit early.
case *lnwire.FailFinalIncorrectHtlcAmount:
// TODO(joostjager): Take into account that second last hop may
// have deliberately handed out an htlc with a too low value. In
// that case we should continue routing.
return true, channeldb.FailureReasonError
// Similarly, if the HTLC expiry that we extended to
// the final hop expires too soon, then will fail the
// payment.
//
// TODO(roasbeef): can happen to to race condition, try
// again with recent block height
case *lnwire.FailFinalExpiryTooSoon:
// TODO(joostjager): Take into account that any hop may have
// delayed. Ideally we should continue routing. Knowing the
// delaying node at this point would help.
return true, channeldb.FailureReasonIncorrectPaymentDetails
// If we erroneously attempted to cross a chain border,
// then we'll cancel the payment.
case *lnwire.FailInvalidRealm:
return true, channeldb.FailureReasonError
// If we get a notice that the expiry was too soon for
// an intermediate node, then we'll prune out the node
// that sent us this error, as it doesn't now what the
// correct block height is.
case *lnwire.FailExpiryTooSoon:
m.reportVertexFailure(failureVertex)
return false, 0
// If we hit an instance of onion payload corruption or an invalid
// version, then we'll exit early as this shouldn't happen in the
// typical case.
//
// TODO(joostjager): Take into account that the previous hop may have
// tampered with the onion. Routing should continue using other paths.
case *lnwire.FailInvalidOnionVersion:
return true, channeldb.FailureReasonError
case *lnwire.FailInvalidOnionHmac:
return true, channeldb.FailureReasonError
case *lnwire.FailInvalidOnionKey:
return true, channeldb.FailureReasonError
// If we get a failure due to violating the minimum
// amount, we'll apply the new minimum amount and retry
// routing.
case *lnwire.FailAmountBelowMinimum:
m.reportEdgePolicyFailure(failedEdge)
return false, 0
// If we get a failure due to a fee, we'll apply the
// new fee update, and retry our attempt using the
// newly updated fees.
case *lnwire.FailFeeInsufficient:
m.reportEdgePolicyFailure(failedEdge)
return false, 0
// If we get the failure for an intermediate node that
// disagrees with our time lock values, then we'll
// apply the new delta value and try it once more.
case *lnwire.FailIncorrectCltvExpiry:
m.reportEdgePolicyFailure(failedEdge)
return false, 0
// The outgoing channel that this node was meant to
// forward one is currently disabled, so we'll apply
// the update and continue.
case *lnwire.FailChannelDisabled:
m.reportEdgeFailure(failedEdge, 0)
return false, 0
// It's likely that the outgoing channel didn't have
// sufficient capacity, so we'll prune this edge for
// now, and continue onwards with our path finding.
case *lnwire.FailTemporaryChannelFailure:
m.reportEdgeFailure(failedEdge, failedAmt)
return false, 0
// If the send fail due to a node not having the
// required features, then we'll note this error and
// continue.
case *lnwire.FailRequiredNodeFeatureMissing:
m.reportVertexFailure(failureVertex)
return false, 0
// If the send fail due to a node not having the
// required features, then we'll note this error and
// continue.
case *lnwire.FailRequiredChannelFeatureMissing:
m.reportVertexFailure(failureVertex)
return false, 0
// If the next hop in the route wasn't known or
// offline, we'll only the channel which we attempted
// to route over. This is conservative, and it can
// handle faulty channels between nodes properly.
// Additionally, this guards against routing nodes
// returning errors in order to attempt to black list
// another node.
case *lnwire.FailUnknownNextPeer:
m.reportEdgeFailure(failedEdge, 0)
return false, 0
// If the node wasn't able to forward for which ever
// reason, then we'll note this and continue with the
// routes.
case *lnwire.FailTemporaryNodeFailure:
m.reportVertexFailure(failureVertex)
return false, 0
case *lnwire.FailPermanentNodeFailure:
m.reportVertexFailure(failureVertex)
return false, 0
// If we crafted a route that contains a too long time
// lock for an intermediate node, we'll prune the node.
// As there currently is no way of knowing that node's
// maximum acceptable cltv, we cannot take this
// constraint into account during routing.
//
// TODO(joostjager): Record the rejected cltv and use
// that as a hint during future path finding through
// that node.
case *lnwire.FailExpiryTooFar:
m.reportVertexFailure(failureVertex)
return false, 0
// If we get a permanent channel or node failure, then
// we'll prune the channel in both directions and
// continue with the rest of the routes.
case *lnwire.FailPermanentChannelFailure:
m.reportEdgeFailure(failedEdge, 0)
m.reportEdgeFailure(edge{
from: failedEdge.to,
to: failedEdge.from,
channel: failedEdge.channel,
}, 0)
return false, 0
// Any other failure or an empty failure will get the node pruned.
default:
m.reportVertexFailure(failureVertex)
return false, 0
}
}
// getFailedEdge tries to locate the failing channel given a route and the
// pubkey of the node that sent the failure. It will assume that the failure is
// associated with the outgoing channel of the failing node. As a second result,
// it returns the amount sent over the edge.
func getFailedEdge(route *route.Route, failureSource int) (edge,
lnwire.MilliSatoshi) {
// Determine if we have a failure from the final hop. If it is, we
// assume that the failing channel is the incoming channel.
//
// TODO(joostjager): In this case, certain types of failures are not
// expected. For example FailUnknownNextPeer. This could be a reason to
// prune the node?
if failureSource == len(route.Hops) {
failureSource--
}
// As this failure indicates that the target channel was unable to carry
// this HTLC (for w/e reason), we'll return the _outgoing_ channel that
// the source of the failure was meant to pass the HTLC along to.
if failureSource == 0 {
return edge{
from: route.SourcePubKey,
to: route.Hops[0].PubKeyBytes,
channel: route.Hops[0].ChannelID,
}, route.TotalAmount
}
return edge{
from: route.Hops[failureSource-1].PubKeyBytes,
to: route.Hops[failureSource].PubKeyBytes,
channel: route.Hops[failureSource].ChannelID,
}, route.Hops[failureSource-1].AmtToForward
}

@ -9,11 +9,28 @@ import (
)
var (
mcTestNode = route.Vertex{}
mcTestEdge = EdgeLocator{
ChannelID: 123,
ChannelID: 2,
}
mcTestTime = time.Date(2018, time.January, 9, 14, 00, 00, 0, time.UTC)
mcTestRoute = &route.Route{
SourcePubKey: route.Vertex{10},
Hops: []*route.Hop{
{
ChannelID: 1,
PubKeyBytes: route.Vertex{11},
AmtToForward: 1000,
},
{
ChannelID: 2,
PubKeyBytes: route.Vertex{12},
},
},
}
mcTestTime = time.Date(2018, time.January, 9, 14, 00, 00, 0, time.UTC)
mcTestNode1 = mcTestRoute.Hops[0].PubKeyBytes
mcTestNode2 = mcTestRoute.Hops[1].PubKeyBytes
)
type mcTestContext struct {
@ -47,12 +64,24 @@ func (ctx *mcTestContext) expectP(amt lnwire.MilliSatoshi,
ctx.t.Helper()
p := ctx.mc.GetEdgeProbability(mcTestNode, mcTestEdge, amt)
p := ctx.mc.GetEdgeProbability(mcTestNode1, mcTestEdge, amt)
if p != expected {
ctx.t.Fatalf("unexpected probability %v", p)
}
}
// reportFailure reports a failure by using a test route.
func (ctx *mcTestContext) reportFailure(t time.Time,
amt lnwire.MilliSatoshi, failure lnwire.FailureMessage) {
mcTestRoute.Hops[0].AmtToForward = amt
errorSourceIdx := 1
ctx.mc.ReportPaymentFail(
mcTestRoute, errorSourceIdx, failure,
)
}
// TestMissionControl tests mission control probability estimation.
func TestMissionControl(t *testing.T) {
ctx := createMcTestContext(t)
@ -61,16 +90,14 @@ func TestMissionControl(t *testing.T) {
testTime := time.Date(2018, time.January, 9, 14, 00, 00, 0, time.UTC)
testNode := route.Vertex{}
testEdge := edge{
channel: 123,
}
// Initial probability is expected to be 1.
ctx.expectP(1000, 0.8)
// Expect probability to be zero after reporting the edge as failed.
ctx.mc.ReportEdgeFailure(testEdge, 1000)
ctx.reportFailure(
testTime, 1000,
lnwire.NewTemporaryChannelFailure(nil),
)
ctx.expectP(1000, 0)
// As we reported with a min penalization amt, a lower amt than reported
@ -83,7 +110,10 @@ func TestMissionControl(t *testing.T) {
// Edge fails again, this time without a min penalization amt. The edge
// should be penalized regardless of amount.
ctx.mc.ReportEdgeFailure(testEdge, 0)
ctx.reportFailure(
ctx.now, 0,
lnwire.NewTemporaryChannelFailure(nil),
)
ctx.expectP(1000, 0)
ctx.expectP(500, 0)
@ -93,7 +123,10 @@ func TestMissionControl(t *testing.T) {
// A node level failure should bring probability of every channel back
// to zero.
ctx.mc.ReportVertexFailure(testNode)
ctx.reportFailure(
ctx.now, 0,
lnwire.NewExpiryTooSoon(lnwire.ChannelUpdate{}),
)
ctx.expectP(1000, 0)
// Check whether history snapshot looks sane.
@ -112,19 +145,19 @@ func TestMissionControl(t *testing.T) {
func TestMissionControlChannelUpdate(t *testing.T) {
ctx := createMcTestContext(t)
testEdge := edge{
channel: 123,
}
// Report a policy related failure. Because it is the first, we don't
// expect a penalty.
ctx.mc.ReportEdgePolicyFailure(testEdge)
ctx.reportFailure(
ctx.now, 0,
lnwire.NewFeeInsufficient(0, lnwire.ChannelUpdate{}),
)
ctx.expectP(0, 0.8)
// Report another failure for the same channel. We expect it to be
// pruned.
ctx.mc.ReportEdgePolicyFailure(testEdge)
ctx.reportFailure(
ctx.now, 0,
lnwire.NewFeeInsufficient(0, lnwire.ChannelUpdate{}),
)
ctx.expectP(0, 0)
}

@ -98,6 +98,13 @@ type mockMissionControl struct {
var _ MissionController = (*mockMissionControl)(nil)
func (m *mockMissionControl) ReportPaymentFail(rt *route.Route,
failureSourceIdx int, failure lnwire.FailureMessage) (bool,
channeldb.FailureReason) {
return false, 0
}
func (m *mockMissionControl) ReportEdgeFailure(failedEdge edge,
minPenalizeAmt lnwire.MilliSatoshi) {
}

@ -174,15 +174,13 @@ type PaymentSessionSource interface {
// MissionController is an interface that exposes failure reporting and
// probability estimation.
type MissionController interface {
// ReportEdgeFailure reports a channel level failure.
ReportEdgeFailure(failedEdge edge,
minPenalizeAmt lnwire.MilliSatoshi)
// ReportEdgePolicyFailure reports a policy related failure.
ReportEdgePolicyFailure(failedEdge edge)
// ReportVertexFailure reports a node level failure.
ReportVertexFailure(v route.Vertex)
// ReportPaymentFail reports a failed payment to mission control as
// input for future probability estimates. It returns a bool indicating
// whether this error is a final error and no further payment attempts
// need to be made.
ReportPaymentFail(rt *route.Route,
failureSourceIdx int, failure lnwire.FailureMessage) (bool,
channeldb.FailureReason)
// GetEdgeProbability is expected to return the success probability of a
// payment from fromNode along edge.
@ -1929,195 +1927,9 @@ func (r *ChannelRouter) processSendError(rt *route.Route, sendErr error) (
}
}
var failureVertex route.Vertex
// For any non-self failure, look up the source pub key in the hops
// slice. Otherwise return the self node pubkey.
if failureSourceIdx > 0 {
failureVertex = rt.Hops[failureSourceIdx-1].PubKeyBytes
} else {
failureVertex = r.selfNode.PubKeyBytes
}
log.Tracef("Node %x (index %v) reported failure when sending htlc",
failureVertex, failureSourceIdx)
// Always determine chan id ourselves, because a channel
// update with id may not be available.
failedEdge, failedAmt := getFailedEdge(rt, failureSourceIdx)
switch fErr.FailureMessage.(type) {
// If the end destination didn't know the payment
// hash or we sent the wrong payment amount to the
// destination, then we'll terminate immediately.
case *lnwire.FailUnknownPaymentHash:
// TODO(joostjager): Check onionErr.Amount() whether it matches
// what we expect. (Will it ever not match, because if not
// final_incorrect_htlc_amount would be returned?)
return true, channeldb.FailureReasonIncorrectPaymentDetails
// If we sent the wrong amount to the destination, then
// we'll exit early.
case *lnwire.FailIncorrectPaymentAmount:
return true, channeldb.FailureReasonIncorrectPaymentDetails
// If the time-lock that was extended to the final node
// was incorrect, then we can't proceed.
case *lnwire.FailFinalIncorrectCltvExpiry:
// TODO(joostjager): Take into account that second last hop may
// have deliberately handed out an htlc that expires too soon.
// In that case we should continue routing.
return true, channeldb.FailureReasonError
// If we crafted an invalid onion payload for the final
// node, then we'll exit early.
case *lnwire.FailFinalIncorrectHtlcAmount:
// TODO(joostjager): Take into account that second last hop may
// have deliberately handed out an htlc with a too low value. In
// that case we should continue routing.
return true, channeldb.FailureReasonError
// Similarly, if the HTLC expiry that we extended to
// the final hop expires too soon, then will fail the
// payment.
//
// TODO(roasbeef): can happen to to race condition, try
// again with recent block height
case *lnwire.FailFinalExpiryTooSoon:
// TODO(joostjager): Take into account that any hop may have
// delayed. Ideally we should continue routing. Knowing the
// delaying node at this point would help.
return true, channeldb.FailureReasonIncorrectPaymentDetails
// If we erroneously attempted to cross a chain border,
// then we'll cancel the payment.
case *lnwire.FailInvalidRealm:
return true, channeldb.FailureReasonError
// If we get a notice that the expiry was too soon for
// an intermediate node, then we'll prune out the node
// that sent us this error, as it doesn't now what the
// correct block height is.
case *lnwire.FailExpiryTooSoon:
r.cfg.MissionControl.ReportVertexFailure(failureVertex)
return false, 0
// If we hit an instance of onion payload corruption or an invalid
// version, then we'll exit early as this shouldn't happen in the
// typical case.
//
// TODO(joostjager): Take into account that the previous hop may have
// tampered with the onion. Routing should continue using other paths.
case *lnwire.FailInvalidOnionVersion:
return true, channeldb.FailureReasonError
case *lnwire.FailInvalidOnionHmac:
return true, channeldb.FailureReasonError
case *lnwire.FailInvalidOnionKey:
return true, channeldb.FailureReasonError
// If we get a failure due to violating the minimum
// amount, we'll apply the new minimum amount and retry
// routing.
case *lnwire.FailAmountBelowMinimum:
r.cfg.MissionControl.ReportEdgePolicyFailure(failedEdge)
return false, 0
// If we get a failure due to a fee, we'll apply the
// new fee update, and retry our attempt using the
// newly updated fees.
case *lnwire.FailFeeInsufficient:
r.cfg.MissionControl.ReportEdgePolicyFailure(failedEdge)
return false, 0
// If we get the failure for an intermediate node that
// disagrees with our time lock values, then we'll
// apply the new delta value and try it once more.
case *lnwire.FailIncorrectCltvExpiry:
r.cfg.MissionControl.ReportEdgePolicyFailure(failedEdge)
return false, 0
// The outgoing channel that this node was meant to
// forward one is currently disabled, so we'll apply
// the update and continue.
case *lnwire.FailChannelDisabled:
r.cfg.MissionControl.ReportEdgeFailure(failedEdge, 0)
return false, 0
// It's likely that the outgoing channel didn't have
// sufficient capacity, so we'll prune this edge for
// now, and continue onwards with our path finding.
case *lnwire.FailTemporaryChannelFailure:
r.cfg.MissionControl.ReportEdgeFailure(failedEdge, failedAmt)
return false, 0
// If the send fail due to a node not having the
// required features, then we'll note this error and
// continue.
case *lnwire.FailRequiredNodeFeatureMissing:
r.cfg.MissionControl.ReportVertexFailure(failureVertex)
return false, 0
// If the send fail due to a node not having the
// required features, then we'll note this error and
// continue.
case *lnwire.FailRequiredChannelFeatureMissing:
r.cfg.MissionControl.ReportVertexFailure(failureVertex)
return false, 0
// If the next hop in the route wasn't known or
// offline, we'll only the channel which we attempted
// to route over. This is conservative, and it can
// handle faulty channels between nodes properly.
// Additionally, this guards against routing nodes
// returning errors in order to attempt to black list
// another node.
case *lnwire.FailUnknownNextPeer:
r.cfg.MissionControl.ReportEdgeFailure(failedEdge, 0)
return false, 0
// If the node wasn't able to forward for which ever
// reason, then we'll note this and continue with the
// routes.
case *lnwire.FailTemporaryNodeFailure:
r.cfg.MissionControl.ReportVertexFailure(failureVertex)
return false, 0
case *lnwire.FailPermanentNodeFailure:
r.cfg.MissionControl.ReportVertexFailure(failureVertex)
return false, 0
// If we crafted a route that contains a too long time
// lock for an intermediate node, we'll prune the node.
// As there currently is no way of knowing that node's
// maximum acceptable cltv, we cannot take this
// constraint into account during routing.
//
// TODO(joostjager): Record the rejected cltv and use
// that as a hint during future path finding through
// that node.
case *lnwire.FailExpiryTooFar:
r.cfg.MissionControl.ReportVertexFailure(failureVertex)
return false, 0
// If we get a permanent channel or node failure, then
// we'll prune the channel in both directions and
// continue with the rest of the routes.
case *lnwire.FailPermanentChannelFailure:
r.cfg.MissionControl.ReportEdgeFailure(failedEdge, 0)
r.cfg.MissionControl.ReportEdgeFailure(edge{
from: failedEdge.to,
to: failedEdge.from,
channel: failedEdge.channel,
}, 0)
return false, 0
// Any other failure or an empty failure will get the node pruned.
default:
r.cfg.MissionControl.ReportVertexFailure(failureVertex)
return false, 0
}
return r.cfg.MissionControl.ReportPaymentFail(
rt, failureSourceIdx, failureMessage,
)
}
// extractChannelUpdate examines the error and extracts the channel update.
@ -2143,46 +1955,6 @@ func (r *ChannelRouter) extractChannelUpdate(
return update
}
// getFailedEdge tries to locate the failing channel given a route and the
// pubkey of the node that sent the failure. It will assume that the failure is
// associated with the outgoing channel of the failing node. As a second result,
// it returns the amount sent over the edge.
func getFailedEdge(route *route.Route, failureSource int) (edge,
lnwire.MilliSatoshi) {
// Determine if we have a failure from the final hop. If it is, we
// assume that the failing channel is the incoming channel. In this
// function the outgoing channel of the hop indicated by failureSource
// is returned, where index zero is the self node. By decrementing
// failureSource by one, the outgoing channel of the penultimate hop is
// returned, which is the same as the incoming channel of the final
// node.
//
// TODO(joostjager): In this case, certain types of failures are not
// expected. For example FailUnknownNextPeer. This could be a reason to
// prune the node?
if failureSource == len(route.Hops) {
failureSource--
}
// As this failure indicates that the target channel was unable to carry
// this HTLC (for w/e reason), we'll return the _outgoing_ channel that
// the source of the failure was meant to pass the HTLC along to.
if failureSource == 0 {
return edge{
from: route.SourcePubKey,
to: route.Hops[0].PubKeyBytes,
channel: route.Hops[0].ChannelID,
}, route.TotalAmount
}
return edge{
from: route.Hops[failureSource-1].PubKeyBytes,
to: route.Hops[failureSource].PubKeyBytes,
channel: route.Hops[failureSource].ChannelID,
}, route.Hops[failureSource-1].AmtToForward
}
// applyChannelUpdate validates a channel update and if valid, applies it to the
// database. It returns a bool indicating whether the updates was successful.
func (r *ChannelRouter) applyChannelUpdate(msg *lnwire.ChannelUpdate,