lnd.xprv/routing/missioncontrol.go
Joost Jager dc13da5abb
routing: move second chance logic into mission control
If nodes return a channel policy related failure, they may get a second
chance. Our graph may not be up to date. Previously this logic was
contained in the payment session.

This commit moves that into global mission control and thereby removes
the last mission control state that was kept on the payment level.

Because mission control is not aware of the relation between payment
attempts and payments, the second chance logic is no longer based
tracking second chances given per payment.

Instead a time based approach is used. If a node reports a policy
failure that prevents forwarding to its peer, it will get a second
chance. But it will get it only if the previous second chance was
long enough ago.

Also those second chances are no longer dependent on whether an
associated channel update is valid. It will get the second chance
regardless, to prevent creating a dependency between mission control and
the graph. This would interfer with (future) replay of history, because
the graph may not be the same anymore at that point.
2019-07-13 22:38:23 +02:00

390 lines
12 KiB
Go

package routing
import (
"math"
"sync"
"time"
"github.com/lightningnetwork/lnd/lnwire"
"github.com/lightningnetwork/lnd/routing/route"
)
const (
// DefaultPenaltyHalfLife is the default half-life duration. The
// half-life duration defines after how much time a penalized node or
// channel is back at 50% probability.
DefaultPenaltyHalfLife = time.Hour
// minSecondChanceInterval is the minimum time required between
// second-chance failures.
//
// If nodes return a channel policy related failure, they may get a
// second chance to forward the payment. It could be that the channel
// policy that we are aware of is not up to date. This is especially
// important in case of mobile apps that are mostly offline.
//
// However, we don't want to give nodes the option to endlessly return
// new channel updates so that we are kept busy trying to route through
// that node until the payment loop times out.
//
// Therefore we only grant a second chance to a node if the previous
// second chance is sufficiently long ago. This is what
// minSecondChanceInterval defines. If a second policy failure comes in
// within that interval, we will apply a penalty.
//
// Second chances granted are tracked on the level of node pairs. This
// means that if a node has multiple channels to the same peer, they
// will only get a single second chance to route to that peer again.
// Nodes forward non-strict, so it isn't necessary to apply a less
// restrictive channel level tracking scheme here.
minSecondChanceInterval = time.Minute
)
// MissionControl contains state which summarizes the past attempts of HTLC
// routing by external callers when sending payments throughout the network. It
// acts as a shared memory during routing attempts with the goal to optimize the
// payment attempt success rate.
//
// Failed payment attempts are reported to mission control. These reports are
// used to track the time of the last node or channel level failure. The time
// since the last failure is used to estimate a success probability that is fed
// into the path finding process for subsequent payment attempts.
type MissionControl struct {
history map[route.Vertex]*nodeHistory
// lastSecondChance tracks the last time a second chance was granted for
// a directed node pair.
lastSecondChance map[DirectedNodePair]time.Time
// now is expected to return the current time. It is supplied as an
// external function to enable deterministic unit tests.
now func() time.Time
cfg *MissionControlConfig
sync.Mutex
// TODO(roasbeef): further counters, if vertex continually unavailable,
// add to another generation
// TODO(roasbeef): also add favorable metrics for nodes
}
// MissionControlConfig defines parameters that control mission control
// behaviour.
type MissionControlConfig struct {
// PenaltyHalfLife defines after how much time a penalized node or
// channel is back at 50% probability.
PenaltyHalfLife time.Duration
// AprioriHopProbability is the assumed success probability of a hop in
// a route when no other information is available.
AprioriHopProbability float64
}
// nodeHistory contains a summary of payment attempt outcomes involving a
// particular node.
type nodeHistory struct {
// lastFail is the last time a node level failure occurred, if any.
lastFail *time.Time
// channelLastFail tracks history per channel, if available for that
// channel.
channelLastFail map[uint64]*channelHistory
}
// channelHistory contains a summary of payment attempt outcomes involving a
// particular channel.
type channelHistory struct {
// lastFail is the last time a channel level failure occurred.
lastFail time.Time
// minPenalizeAmt is the minimum amount for which to take this failure
// into account.
minPenalizeAmt lnwire.MilliSatoshi
}
// MissionControlSnapshot contains a snapshot of the current state of mission
// control.
type MissionControlSnapshot struct {
// Nodes contains the per node information of this snapshot.
Nodes []MissionControlNodeSnapshot
}
// MissionControlNodeSnapshot contains a snapshot of the current node state in
// mission control.
type MissionControlNodeSnapshot struct {
// Node pubkey.
Node route.Vertex
// Lastfail is the time of last failure, if any.
LastFail *time.Time
// Channels is a list of channels for which specific information is
// logged.
Channels []MissionControlChannelSnapshot
// OtherChanSuccessProb is the success probability for channels not in
// the Channels slice.
OtherChanSuccessProb float64
}
// MissionControlChannelSnapshot contains a snapshot of the current channel
// state in mission control.
type MissionControlChannelSnapshot struct {
// ChannelID is the short channel id of the snapshot.
ChannelID uint64
// LastFail is the time of last failure.
LastFail time.Time
// MinPenalizeAmt is the minimum amount for which the channel will be
// penalized.
MinPenalizeAmt lnwire.MilliSatoshi
// SuccessProb is the success probability estimation for this channel.
SuccessProb float64
}
// NewMissionControl returns a new instance of missionControl.
func NewMissionControl(cfg *MissionControlConfig) *MissionControl {
log.Debugf("Instantiating mission control with config: "+
"PenaltyHalfLife=%v, AprioriHopProbability=%v",
cfg.PenaltyHalfLife, cfg.AprioriHopProbability)
return &MissionControl{
history: make(map[route.Vertex]*nodeHistory),
lastSecondChance: make(map[DirectedNodePair]time.Time),
now: time.Now,
cfg: cfg,
}
}
// ResetHistory resets the history of MissionControl returning it to a state as
// if no payment attempts have been made.
func (m *MissionControl) ResetHistory() {
m.Lock()
defer m.Unlock()
m.history = make(map[route.Vertex]*nodeHistory)
m.lastSecondChance = make(map[DirectedNodePair]time.Time)
log.Debugf("Mission control history cleared")
}
// GetEdgeProbability is expected to return the success probability of a payment
// from fromNode along edge.
func (m *MissionControl) GetEdgeProbability(fromNode route.Vertex,
edge EdgeLocator, amt lnwire.MilliSatoshi) float64 {
m.Lock()
defer m.Unlock()
// Get the history for this node. If there is no history available,
// assume that it's success probability is a constant a priori
// probability. After the attempt new information becomes available to
// adjust this probability.
nodeHistory, ok := m.history[fromNode]
if !ok {
return m.cfg.AprioriHopProbability
}
return m.getEdgeProbabilityForNode(nodeHistory, edge.ChannelID, amt)
}
// getEdgeProbabilityForNode estimates the probability of successfully
// traversing a channel based on the node history.
func (m *MissionControl) getEdgeProbabilityForNode(nodeHistory *nodeHistory,
channelID uint64, amt lnwire.MilliSatoshi) float64 {
// Calculate the last failure of the given edge. A node failure is
// considered a failure that would have affected every edge. Therefore
// we insert a node level failure into the history of every channel.
lastFailure := nodeHistory.lastFail
// Take into account a minimum penalize amount. For balance errors, a
// failure may be reported with such a minimum to prevent too aggresive
// penalization. We only take into account a previous failure if the
// amount that we currently get the probability for is greater or equal
// than the minPenalizeAmt of the previous failure.
channelHistory, ok := nodeHistory.channelLastFail[channelID]
if ok && channelHistory.minPenalizeAmt <= amt {
// If there is both a node level failure recorded and a channel
// level failure is applicable too, we take the most recent of
// the two.
if lastFailure == nil ||
channelHistory.lastFail.After(*lastFailure) {
lastFailure = &channelHistory.lastFail
}
}
if lastFailure == nil {
return m.cfg.AprioriHopProbability
}
timeSinceLastFailure := m.now().Sub(*lastFailure)
// Calculate success probability. It is an exponential curve that brings
// the probability down to zero when a failure occurs. From there it
// recovers asymptotically back to the a priori probability. The rate at
// which this happens is controlled by the penaltyHalfLife parameter.
exp := -timeSinceLastFailure.Hours() / m.cfg.PenaltyHalfLife.Hours()
probability := m.cfg.AprioriHopProbability * (1 - math.Pow(2, exp))
return probability
}
// requestSecondChance checks whether the node fromNode can have a second chance
// at providing a channel update for its channel with toNode.
func (m *MissionControl) requestSecondChance(timestamp time.Time,
fromNode, toNode route.Vertex) bool {
// Look up previous second chance time.
pair := DirectedNodePair{
From: fromNode,
To: toNode,
}
lastSecondChance, ok := m.lastSecondChance[pair]
// If the channel hasn't already be given a second chance or its last
// second chance was long ago, we give it another chance.
if !ok || timestamp.Sub(lastSecondChance) > minSecondChanceInterval {
m.lastSecondChance[pair] = timestamp
log.Debugf("Second chance granted for %v->%v", fromNode, toNode)
return true
}
// Otherwise penalize the channel, because we don't allow channel
// updates that are that frequent. This is to prevent nodes from keeping
// us busy by continuously sending new channel updates.
log.Debugf("Second chance denied for %v->%v, remaining interval: %v",
fromNode, toNode, timestamp.Sub(lastSecondChance))
return false
}
// createHistoryIfNotExists returns the history for the given node. If the node
// is yet unknown, it will create an empty history structure.
func (m *MissionControl) createHistoryIfNotExists(vertex route.Vertex) *nodeHistory {
if node, ok := m.history[vertex]; ok {
return node
}
node := &nodeHistory{
channelLastFail: make(map[uint64]*channelHistory),
}
m.history[vertex] = node
return node
}
// ReportVertexFailure reports a node level failure.
func (m *MissionControl) ReportVertexFailure(v route.Vertex) {
log.Debugf("Reporting vertex %v failure to Mission Control", v)
now := m.now()
m.Lock()
defer m.Unlock()
history := m.createHistoryIfNotExists(v)
history.lastFail = &now
}
// ReportEdgePolicyFailure reports a policy related failure.
func (m *MissionControl) ReportEdgePolicyFailure(failedEdge edge) {
now := m.now()
m.Lock()
defer m.Unlock()
// We may have an out of date graph. Therefore we don't always penalize
// immediately. If some time has passed since the last policy failure,
// we grant the node a second chance at forwarding the payment.
if m.requestSecondChance(
now, failedEdge.from, failedEdge.to,
) {
return
}
history := m.createHistoryIfNotExists(failedEdge.from)
history.lastFail = &now
}
// ReportEdgeFailure reports a channel level failure.
//
// TODO(roasbeef): also add value attempted to send and capacity of channel
func (m *MissionControl) ReportEdgeFailure(failedEdge edge,
minPenalizeAmt lnwire.MilliSatoshi) {
log.Debugf("Reporting channel %v failure to Mission Control",
failedEdge.channel)
now := m.now()
m.Lock()
defer m.Unlock()
history := m.createHistoryIfNotExists(failedEdge.from)
history.channelLastFail[failedEdge.channel] = &channelHistory{
lastFail: now,
minPenalizeAmt: minPenalizeAmt,
}
}
// GetHistorySnapshot takes a snapshot from the current mission control state
// and actual probability estimates.
func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
m.Lock()
defer m.Unlock()
log.Debugf("Requesting history snapshot from mission control: "+
"node_count=%v", len(m.history))
nodes := make([]MissionControlNodeSnapshot, 0, len(m.history))
for v, h := range m.history {
channelSnapshot := make([]MissionControlChannelSnapshot, 0,
len(h.channelLastFail),
)
for id, lastFail := range h.channelLastFail {
// Show probability assuming amount meets min
// penalization amount.
prob := m.getEdgeProbabilityForNode(
h, id, lastFail.minPenalizeAmt,
)
channelSnapshot = append(channelSnapshot,
MissionControlChannelSnapshot{
ChannelID: id,
LastFail: lastFail.lastFail,
MinPenalizeAmt: lastFail.minPenalizeAmt,
SuccessProb: prob,
},
)
}
otherProb := m.getEdgeProbabilityForNode(h, 0, 0)
nodes = append(nodes,
MissionControlNodeSnapshot{
Node: v,
LastFail: h.lastFail,
OtherChanSuccessProb: otherProb,
Channels: channelSnapshot,
},
)
}
snapshot := MissionControlSnapshot{
Nodes: nodes,
}
return &snapshot
}