lnd.xprv/routing/missioncontrol.go

390 lines
12 KiB
Go
Raw Normal View History

package routing
import (
"math"
"sync"
"time"
"github.com/lightningnetwork/lnd/lnwire"
"github.com/lightningnetwork/lnd/routing/route"
)
const (
// DefaultPenaltyHalfLife is the default half-life duration. The
// half-life duration defines after how much time a penalized node or
// channel is back at 50% probability.
DefaultPenaltyHalfLife = time.Hour
// minSecondChanceInterval is the minimum time required between
// second-chance failures.
//
// If nodes return a channel policy related failure, they may get a
// second chance to forward the payment. It could be that the channel
// policy that we are aware of is not up to date. This is especially
// important in case of mobile apps that are mostly offline.
//
// However, we don't want to give nodes the option to endlessly return
// new channel updates so that we are kept busy trying to route through
// that node until the payment loop times out.
//
// Therefore we only grant a second chance to a node if the previous
// second chance is sufficiently long ago. This is what
// minSecondChanceInterval defines. If a second policy failure comes in
// within that interval, we will apply a penalty.
//
// Second chances granted are tracked on the level of node pairs. This
// means that if a node has multiple channels to the same peer, they
// will only get a single second chance to route to that peer again.
// Nodes forward non-strict, so it isn't necessary to apply a less
// restrictive channel level tracking scheme here.
minSecondChanceInterval = time.Minute
)
2019-05-23 21:05:30 +03:00
// MissionControl contains state which summarizes the past attempts of HTLC
// routing by external callers when sending payments throughout the network. It
// acts as a shared memory during routing attempts with the goal to optimize the
// payment attempt success rate.
//
// Failed payment attempts are reported to mission control. These reports are
// used to track the time of the last node or channel level failure. The time
// since the last failure is used to estimate a success probability that is fed
// into the path finding process for subsequent payment attempts.
2019-05-23 21:05:30 +03:00
type MissionControl struct {
history map[route.Vertex]*nodeHistory
// lastSecondChance tracks the last time a second chance was granted for
// a directed node pair.
lastSecondChance map[DirectedNodePair]time.Time
// now is expected to return the current time. It is supplied as an
// external function to enable deterministic unit tests.
now func() time.Time
cfg *MissionControlConfig
sync.Mutex
// TODO(roasbeef): further counters, if vertex continually unavailable,
// add to another generation
// TODO(roasbeef): also add favorable metrics for nodes
}
// MissionControlConfig defines parameters that control mission control
// behaviour.
type MissionControlConfig struct {
// PenaltyHalfLife defines after how much time a penalized node or
// channel is back at 50% probability.
PenaltyHalfLife time.Duration
// AprioriHopProbability is the assumed success probability of a hop in
// a route when no other information is available.
AprioriHopProbability float64
}
// nodeHistory contains a summary of payment attempt outcomes involving a
// particular node.
type nodeHistory struct {
// lastFail is the last time a node level failure occurred, if any.
lastFail *time.Time
// channelLastFail tracks history per channel, if available for that
// channel.
channelLastFail map[uint64]*channelHistory
}
// channelHistory contains a summary of payment attempt outcomes involving a
// particular channel.
type channelHistory struct {
// lastFail is the last time a channel level failure occurred.
lastFail time.Time
// minPenalizeAmt is the minimum amount for which to take this failure
// into account.
minPenalizeAmt lnwire.MilliSatoshi
}
// MissionControlSnapshot contains a snapshot of the current state of mission
// control.
type MissionControlSnapshot struct {
// Nodes contains the per node information of this snapshot.
Nodes []MissionControlNodeSnapshot
}
// MissionControlNodeSnapshot contains a snapshot of the current node state in
// mission control.
type MissionControlNodeSnapshot struct {
// Node pubkey.
Node route.Vertex
// Lastfail is the time of last failure, if any.
LastFail *time.Time
// Channels is a list of channels for which specific information is
// logged.
Channels []MissionControlChannelSnapshot
// OtherChanSuccessProb is the success probability for channels not in
// the Channels slice.
OtherChanSuccessProb float64
}
// MissionControlChannelSnapshot contains a snapshot of the current channel
// state in mission control.
type MissionControlChannelSnapshot struct {
// ChannelID is the short channel id of the snapshot.
ChannelID uint64
// LastFail is the time of last failure.
LastFail time.Time
// MinPenalizeAmt is the minimum amount for which the channel will be
// penalized.
MinPenalizeAmt lnwire.MilliSatoshi
// SuccessProb is the success probability estimation for this channel.
SuccessProb float64
}
// NewMissionControl returns a new instance of missionControl.
func NewMissionControl(cfg *MissionControlConfig) *MissionControl {
log.Debugf("Instantiating mission control with config: "+
"PenaltyHalfLife=%v, AprioriHopProbability=%v",
cfg.PenaltyHalfLife, cfg.AprioriHopProbability)
return &MissionControl{
history: make(map[route.Vertex]*nodeHistory),
lastSecondChance: make(map[DirectedNodePair]time.Time),
now: time.Now,
cfg: cfg,
}
}
2019-05-23 21:05:30 +03:00
// ResetHistory resets the history of MissionControl returning it to a state as
// if no payment attempts have been made.
2019-05-23 21:05:30 +03:00
func (m *MissionControl) ResetHistory() {
m.Lock()
defer m.Unlock()
m.history = make(map[route.Vertex]*nodeHistory)
m.lastSecondChance = make(map[DirectedNodePair]time.Time)
log.Debugf("Mission control history cleared")
}
// GetEdgeProbability is expected to return the success probability of a payment
// from fromNode along edge.
func (m *MissionControl) GetEdgeProbability(fromNode route.Vertex,
edge EdgeLocator, amt lnwire.MilliSatoshi) float64 {
m.Lock()
defer m.Unlock()
// Get the history for this node. If there is no history available,
// assume that it's success probability is a constant a priori
// probability. After the attempt new information becomes available to
// adjust this probability.
nodeHistory, ok := m.history[fromNode]
if !ok {
return m.cfg.AprioriHopProbability
}
return m.getEdgeProbabilityForNode(nodeHistory, edge.ChannelID, amt)
}
// getEdgeProbabilityForNode estimates the probability of successfully
// traversing a channel based on the node history.
func (m *MissionControl) getEdgeProbabilityForNode(nodeHistory *nodeHistory,
channelID uint64, amt lnwire.MilliSatoshi) float64 {
// Calculate the last failure of the given edge. A node failure is
// considered a failure that would have affected every edge. Therefore
// we insert a node level failure into the history of every channel.
lastFailure := nodeHistory.lastFail
// Take into account a minimum penalize amount. For balance errors, a
// failure may be reported with such a minimum to prevent too aggresive
// penalization. We only take into account a previous failure if the
// amount that we currently get the probability for is greater or equal
// than the minPenalizeAmt of the previous failure.
channelHistory, ok := nodeHistory.channelLastFail[channelID]
if ok && channelHistory.minPenalizeAmt <= amt {
// If there is both a node level failure recorded and a channel
// level failure is applicable too, we take the most recent of
// the two.
if lastFailure == nil ||
channelHistory.lastFail.After(*lastFailure) {
lastFailure = &channelHistory.lastFail
}
}
if lastFailure == nil {
return m.cfg.AprioriHopProbability
}
timeSinceLastFailure := m.now().Sub(*lastFailure)
// Calculate success probability. It is an exponential curve that brings
// the probability down to zero when a failure occurs. From there it
// recovers asymptotically back to the a priori probability. The rate at
// which this happens is controlled by the penaltyHalfLife parameter.
exp := -timeSinceLastFailure.Hours() / m.cfg.PenaltyHalfLife.Hours()
probability := m.cfg.AprioriHopProbability * (1 - math.Pow(2, exp))
return probability
}
// requestSecondChance checks whether the node fromNode can have a second chance
// at providing a channel update for its channel with toNode.
func (m *MissionControl) requestSecondChance(timestamp time.Time,
fromNode, toNode route.Vertex) bool {
// Look up previous second chance time.
pair := DirectedNodePair{
From: fromNode,
To: toNode,
}
lastSecondChance, ok := m.lastSecondChance[pair]
// If the channel hasn't already be given a second chance or its last
// second chance was long ago, we give it another chance.
if !ok || timestamp.Sub(lastSecondChance) > minSecondChanceInterval {
m.lastSecondChance[pair] = timestamp
log.Debugf("Second chance granted for %v->%v", fromNode, toNode)
return true
}
// Otherwise penalize the channel, because we don't allow channel
// updates that are that frequent. This is to prevent nodes from keeping
// us busy by continuously sending new channel updates.
log.Debugf("Second chance denied for %v->%v, remaining interval: %v",
fromNode, toNode, timestamp.Sub(lastSecondChance))
return false
}
// createHistoryIfNotExists returns the history for the given node. If the node
// is yet unknown, it will create an empty history structure.
func (m *MissionControl) createHistoryIfNotExists(vertex route.Vertex) *nodeHistory {
if node, ok := m.history[vertex]; ok {
return node
}
node := &nodeHistory{
channelLastFail: make(map[uint64]*channelHistory),
}
m.history[vertex] = node
return node
}
// ReportVertexFailure reports a node level failure.
func (m *MissionControl) ReportVertexFailure(v route.Vertex) {
log.Debugf("Reporting vertex %v failure to Mission Control", v)
now := m.now()
m.Lock()
defer m.Unlock()
history := m.createHistoryIfNotExists(v)
history.lastFail = &now
}
// ReportEdgePolicyFailure reports a policy related failure.
func (m *MissionControl) ReportEdgePolicyFailure(failedEdge edge) {
now := m.now()
m.Lock()
defer m.Unlock()
// We may have an out of date graph. Therefore we don't always penalize
// immediately. If some time has passed since the last policy failure,
// we grant the node a second chance at forwarding the payment.
if m.requestSecondChance(
now, failedEdge.from, failedEdge.to,
) {
return
}
history := m.createHistoryIfNotExists(failedEdge.from)
history.lastFail = &now
}
// ReportEdgeFailure reports a channel level failure.
//
// TODO(roasbeef): also add value attempted to send and capacity of channel
func (m *MissionControl) ReportEdgeFailure(failedEdge edge,
minPenalizeAmt lnwire.MilliSatoshi) {
log.Debugf("Reporting channel %v failure to Mission Control",
failedEdge.channel)
now := m.now()
m.Lock()
defer m.Unlock()
history := m.createHistoryIfNotExists(failedEdge.from)
history.channelLastFail[failedEdge.channel] = &channelHistory{
lastFail: now,
minPenalizeAmt: minPenalizeAmt,
}
}
// GetHistorySnapshot takes a snapshot from the current mission control state
// and actual probability estimates.
func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
m.Lock()
defer m.Unlock()
log.Debugf("Requesting history snapshot from mission control: "+
"node_count=%v", len(m.history))
nodes := make([]MissionControlNodeSnapshot, 0, len(m.history))
for v, h := range m.history {
channelSnapshot := make([]MissionControlChannelSnapshot, 0,
len(h.channelLastFail),
)
for id, lastFail := range h.channelLastFail {
// Show probability assuming amount meets min
// penalization amount.
prob := m.getEdgeProbabilityForNode(
h, id, lastFail.minPenalizeAmt,
)
channelSnapshot = append(channelSnapshot,
MissionControlChannelSnapshot{
ChannelID: id,
LastFail: lastFail.lastFail,
MinPenalizeAmt: lastFail.minPenalizeAmt,
SuccessProb: prob,
},
)
}
otherProb := m.getEdgeProbabilityForNode(h, 0, 0)
nodes = append(nodes,
MissionControlNodeSnapshot{
Node: v,
LastFail: h.lastFail,
OtherChanSuccessProb: otherProb,
Channels: channelSnapshot,
},
)
}
snapshot := MissionControlSnapshot{
Nodes: nodes,
}
return &snapshot
}