routing+routerrpc: improve prob. estimation for untried connections

This commit changes mission control to partially base the estimated
probability for untried connections on historical results obtained in
previous payment attempts. This incentivizes routing nodes to keep all
of their channels in good shape.
This commit is contained in:
Joost Jager 2019-09-04 17:40:14 +02:00
parent fab13900e2
commit 1fac41deed
No known key found for this signature in database
GPG Key ID: A61B9D4C393C59C7
9 changed files with 410 additions and 97 deletions

@ -16,6 +16,15 @@ type RoutingConfig struct {
// a route when no other information is available.
AprioriHopProbability float64 `long:"apriorihopprob" description:"Assumed success probability of a hop in a route when no other information is available."`
// AprioriWeight is a value in the range [0, 1] that defines to what
// extent historical results should be extrapolated to untried
// connections. Setting it to one will completely ignore historical
// results and always assume the configured a priori probability for
// untried connections. A value of zero will ignore the a priori
// probability completely and only base the probability on historical
// results, unless there are none available.
AprioriWeight float64 `long:"aprioriweight" description:"Weight of the a priori probability in success probability estimation. Valid values are in [0, 1]."`
// PenaltyHalfLife defines after how much time a penalized node or
// channel is back at 50% probability.
PenaltyHalfLife time.Duration `long:"penaltyhalflife" description:"Defines the duration after which a penalized node or channel is back at 50% probability"`

@ -45,6 +45,7 @@ type Config struct {
func DefaultConfig() *Config {
defaultRoutingConfig := RoutingConfig{
AprioriHopProbability: routing.DefaultAprioriHopProbability,
AprioriWeight: routing.DefaultAprioriWeight,
MinRouteProbability: routing.DefaultMinRouteProbability,
PenaltyHalfLife: routing.DefaultPenaltyHalfLife,
AttemptCost: routing.DefaultPaymentAttemptPenalty.
@ -61,6 +62,7 @@ func DefaultConfig() *Config {
func GetRoutingConfig(cfg *Config) *RoutingConfig {
return &RoutingConfig{
AprioriHopProbability: cfg.AprioriHopProbability,
AprioriWeight: cfg.AprioriWeight,
MinRouteProbability: cfg.MinRouteProbability,
AttemptCost: cfg.AttemptCost,
PenaltyHalfLife: cfg.PenaltyHalfLife,

@ -18,6 +18,7 @@ func DefaultConfig() *Config {
func GetRoutingConfig(cfg *Config) *RoutingConfig {
return &RoutingConfig{
AprioriHopProbability: routing.DefaultAprioriHopProbability,
AprioriWeight: routing.DefaultAprioriWeight,
MinRouteProbability: routing.DefaultMinRouteProbability,
AttemptCost: routing.DefaultPaymentAttemptPenalty.
ToSatoshis(),

@ -1,7 +1,6 @@
package routing
import (
"math"
"sync"
"time"
@ -47,6 +46,10 @@ const (
// prevSuccessProbability is the assumed probability for node pairs that
// successfully relayed the previous attempt.
prevSuccessProbability = 0.95
// DefaultAprioriWeight is the default a priori weight. See
// MissionControlConfig for further explanation.
DefaultAprioriWeight = 0.5
)
// NodeResults contains previous results from a node to its peers.
@ -68,9 +71,6 @@ type MissionControl struct {
// particular node.
lastPairResult map[route.Vertex]NodeResults
// lastNodeFailure tracks the last node level failure per node.
lastNodeFailure map[route.Vertex]time.Time
// lastSecondChance tracks the last time a second chance was granted for
// a directed node pair.
lastSecondChance map[DirectedNodePair]time.Time
@ -83,6 +83,10 @@ type MissionControl struct {
store *missionControlStore
// estimator is the probability estimator that is used with the payment
// results that mission control collects.
estimator *probabilityEstimator
sync.Mutex
// TODO(roasbeef): further counters, if vertex continually unavailable,
@ -105,6 +109,15 @@ type MissionControlConfig struct {
// MaxMcHistory defines the maximum number of payment results that are
// held on disk.
MaxMcHistory int
// AprioriWeight is a value in the range [0, 1] that defines to what
// extent historical results should be extrapolated to untried
// connections. Setting it to one will completely ignore historical
// results and always assume the configured a priori probability for
// untried connections. A value of zero will ignore the a priori
// probability completely and only base the probability on historical
// results, unless there are none available.
AprioriWeight float64
}
// timedPairResult describes a timestamped pair result.
@ -157,21 +170,29 @@ func NewMissionControl(db *bbolt.DB, cfg *MissionControlConfig) (
*MissionControl, error) {
log.Debugf("Instantiating mission control with config: "+
"PenaltyHalfLife=%v, AprioriHopProbability=%v",
cfg.PenaltyHalfLife, cfg.AprioriHopProbability)
"PenaltyHalfLife=%v, AprioriHopProbability=%v, "+
"AprioriWeight=%v", cfg.PenaltyHalfLife,
cfg.AprioriHopProbability, cfg.AprioriWeight)
store, err := newMissionControlStore(db, cfg.MaxMcHistory)
if err != nil {
return nil, err
}
estimator := &probabilityEstimator{
aprioriHopProbability: cfg.AprioriHopProbability,
aprioriWeight: cfg.AprioriWeight,
penaltyHalfLife: cfg.PenaltyHalfLife,
prevSuccessProbability: prevSuccessProbability,
}
mc := &MissionControl{
lastPairResult: make(map[route.Vertex]NodeResults),
lastNodeFailure: make(map[route.Vertex]time.Time),
lastSecondChance: make(map[DirectedNodePair]time.Time),
now: time.Now,
cfg: cfg,
store: store,
estimator: estimator,
}
if err := mc.init(); err != nil {
@ -213,7 +234,6 @@ func (m *MissionControl) ResetHistory() error {
}
m.lastPairResult = make(map[route.Vertex]NodeResults)
m.lastNodeFailure = make(map[route.Vertex]time.Time)
m.lastSecondChance = make(map[DirectedNodePair]time.Time)
log.Debugf("Mission control history cleared")
@ -229,56 +249,15 @@ func (m *MissionControl) GetProbability(fromNode, toNode route.Vertex,
m.Lock()
defer m.Unlock()
return m.getPairProbability(fromNode, toNode, amt)
}
now := m.now()
results := m.lastPairResult[fromNode]
// getProbAfterFail returns a probability estimate based on a last failure time.
func (m *MissionControl) getProbAfterFail(lastFailure time.Time) float64 {
if lastFailure.IsZero() {
return m.cfg.AprioriHopProbability
}
timeSinceLastFailure := m.now().Sub(lastFailure)
// Calculate success probability based on the weight of the last
// failure. When the failure is fresh, its weight is 1 and we'll return
// probability 0. Over time the probability recovers to the a priori
// probability.
weight := m.getWeight(timeSinceLastFailure)
probability := m.cfg.AprioriHopProbability * (1 - weight)
return probability
}
// getWeight calculates a weight in the range [0, 1] that should be assigned to
// a payment result. Weight follows an exponential curve that starts at 1 when
// the result is fresh and asymptotically approaches zero over time. The rate at
// which this happens is controlled by the penaltyHalfLife parameter.
func (m *MissionControl) getWeight(age time.Duration) float64 {
exp := -age.Hours() / m.cfg.PenaltyHalfLife.Hours()
return math.Pow(2, exp)
}
// getLastPairResult gets the last recorded result for a node pair.
func (m *MissionControl) getLastPairResult(fromNode,
toNode route.Vertex) *timedPairResult {
nodePairs, ok := m.lastPairResult[fromNode]
if !ok {
return nil
}
lastResult, ok := nodePairs[toNode]
if !ok {
return nil
}
return &lastResult
return m.estimator.getPairProbability(now, results, toNode, amt)
}
// setLastPairResult stores a result for a node pair.
func (m *MissionControl) setLastPairResult(fromNode,
toNode route.Vertex, result *timedPairResult) {
toNode route.Vertex, result timedPairResult) {
nodePairs, ok := m.lastPairResult[fromNode]
if !ok {
@ -286,43 +265,24 @@ func (m *MissionControl) setLastPairResult(fromNode,
m.lastPairResult[fromNode] = nodePairs
}
nodePairs[toNode] = *result
nodePairs[toNode] = result
}
// getPairProbability estimates the probability of successfully
// traversing from fromNode to toNode based on historical payment outcomes.
func (m *MissionControl) getPairProbability(fromNode,
toNode route.Vertex, amt lnwire.MilliSatoshi) float64 {
// setAllFail stores a fail result for all known connections of the given node.
func (m *MissionControl) setAllFail(fromNode route.Vertex,
timestamp time.Time) {
// Start by getting the last node level failure. A node failure is
// considered a failure that would have affected every edge. Therefore
// we insert a node level failure into the history of every channel. If
// there is none, lastFail will be zero.
lastFail := m.lastNodeFailure[fromNode]
// Retrieve the last pair outcome.
lastPairResult := m.getLastPairResult(fromNode, toNode)
// Only look at the last pair outcome if it happened after the last node
// level failure. Otherwise the node level failure is the most recent
// and used as the basis for calculation of the probability.
if lastPairResult != nil && lastPairResult.timestamp.After(lastFail) {
if lastPairResult.success {
return prevSuccessProbability
}
// Take into account a minimum penalize amount. For balance
// errors, a failure may be reported with such a minimum to
// prevent too aggressive penalization. We only take into account
// a previous failure if the amount that we currently get the
// probability for is greater or equal than the minPenalizeAmt
// of the previous failure.
if amt >= lastPairResult.minPenalizeAmt {
lastFail = lastPairResult.timestamp
}
nodePairs, ok := m.lastPairResult[fromNode]
if !ok {
return
}
return m.getProbAfterFail(lastFail)
for connection := range nodePairs {
nodePairs[connection] = timedPairResult{
timestamp: timestamp,
pairResult: failPairResult(0),
}
}
}
// requestSecondChance checks whether the node fromNode can have a second chance
@ -363,8 +323,7 @@ func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
defer m.Unlock()
log.Debugf("Requesting history snapshot from mission control: "+
"node_failure_count=%v, pair_result_count=%v",
len(m.lastNodeFailure), len(m.lastPairResult))
"pair_result_count=%v", len(m.lastPairResult))
pairs := make([]MissionControlPairSnapshot, 0, len(m.lastPairResult))
@ -475,11 +434,28 @@ func (m *MissionControl) applyPaymentResult(
}
}
// If there is a node-level failure, record a failure for every tried
// connection of that node. A node-level failure can be considered as a
// failure that would have occurred with any of the node's channels.
//
// Ideally we'd also record the failure for the untried connections of
// the node. Unfortunately this would require access to the graph and
// adding this dependency and db calls does not outweigh the benefits.
//
// Untried connections will fall back to the node probability. After the
// call to setAllFail below, the node probability will be equal to
// the probability of the tried channels except that the a priori
// probability is mixed in too. This effect is controlled by the
// aprioriWeight parameter. If that parameter isn't set to an extreme
// and there are a few known connections, there shouldn't be much of a
// difference. The largest difference occurs when aprioriWeight is 1. In
// that case, a node-level failure would not be applied to untried
// channels.
if i.nodeFailure != nil {
log.Debugf("Reporting node failure to Mission Control: "+
"node=%v", *i.nodeFailure)
m.lastNodeFailure[*i.nodeFailure] = result.timeReply
m.setAllFail(*i.nodeFailure, result.timeReply)
}
for pair, pairResult := range i.pairResults {
@ -492,7 +468,7 @@ func (m *MissionControl) applyPaymentResult(
pair, pairResult.minPenalizeAmt)
}
m.setLastPairResult(pair.From, pair.To, &timedPairResult{
m.setLastPairResult(pair.From, pair.To, timedPairResult{
timestamp: result.timeReply,
pairResult: pairResult,
})

@ -34,7 +34,8 @@ var (
mcTestNode2 = mcTestRoute.Hops[1].PubKeyBytes
testPenaltyHalfLife = 30 * time.Minute
testAprioriHopProbability = 0.8
testAprioriHopProbability = 0.9
testAprioriWeight = 0.5
)
type mcTestContext struct {
@ -78,6 +79,7 @@ func (ctx *mcTestContext) restartMc() {
&MissionControlConfig{
PenaltyHalfLife: testPenaltyHalfLife,
AprioriHopProbability: testAprioriHopProbability,
AprioriWeight: testAprioriWeight,
},
)
if err != nil {
@ -136,20 +138,23 @@ func TestMissionControl(t *testing.T) {
testTime := time.Date(2018, time.January, 9, 14, 00, 00, 0, time.UTC)
// Initial probability is expected to be 1.
ctx.expectP(1000, 0.8)
// Initial probability is expected to be the a priori.
ctx.expectP(1000, testAprioriHopProbability)
// Expect probability to be zero after reporting the edge as failed.
ctx.reportFailure(1000, lnwire.NewTemporaryChannelFailure(nil))
ctx.expectP(1000, 0)
// As we reported with a min penalization amt, a lower amt than reported
// should be unaffected.
// should return the node probability, which is the a priori
// probability.
ctx.expectP(500, testAprioriHopProbability)
// Edge decay started.
// Edge decay started. The node probability weighted average should now
// have shifted from 1:1 to 1:0.5 -> 60%. The connection probability is
// half way through the recovery, so we expect 30% here.
ctx.now = testTime.Add(30 * time.Minute)
ctx.expectP(1000, 0.4)
ctx.expectP(1000, 0.3)
// Edge fails again, this time without a min penalization amt. The edge
// should be penalized regardless of amount.
@ -159,11 +164,11 @@ func TestMissionControl(t *testing.T) {
// Edge decay started.
ctx.now = testTime.Add(60 * time.Minute)
ctx.expectP(1000, 0.4)
ctx.expectP(1000, 0.3)
// Restart mission control to test persistence.
ctx.restartMc()
ctx.expectP(1000, 0.4)
ctx.expectP(1000, 0.3)
// A node level failure should bring probability of all known channels
// back to zero.

@ -0,0 +1,155 @@
package routing
import (
"math"
"time"
"github.com/lightningnetwork/lnd/lnwire"
"github.com/lightningnetwork/lnd/routing/route"
)
// probabilityEstimator returns node and pair probabilities based on historical
// payment results. It holds only configuration values; the results themselves
// are passed into every call.
type probabilityEstimator struct {
	// penaltyHalfLife defines after how much time a penalized node or
	// channel is back at 50% probability. getWeight divides the age of a
	// result by this duration, so a failure loses half of its weight for
	// every half life that has passed.
	penaltyHalfLife time.Duration

	// aprioriHopProbability is the assumed success probability of a hop in
	// a route when no other information is available.
	aprioriHopProbability float64

	// aprioriWeight is a value in the range [0, 1] that defines to what
	// extent historical results should be extrapolated to untried
	// connections. Setting it to one will completely ignore historical
	// results and always assume the configured a priori probability for
	// untried connections. A value of zero will ignore the a priori
	// probability completely and only base the probability on historical
	// results, unless there are none available.
	aprioriWeight float64

	// prevSuccessProbability is the assumed probability for node pairs that
	// successfully relayed the previous attempt. It is also the value that
	// every recorded success contributes to the node probability average.
	prevSuccessProbability float64
}
// getNodeProbability calculates the probability for connections from a node
// that have not been tried before. The results parameter holds the last
// payment result towards each peer of that node that was tried.
//
// The returned value is a weighted average of the a priori probability and
// the recorded results. If no weight is accumulated at all, the a priori
// probability is returned.
func (p *probabilityEstimator) getNodeProbability(now time.Time,
	results NodeResults, amt lnwire.MilliSatoshi) float64 {

	// If the channel history is not to be taken into account, we can return
	// early here with the configured a priori probability. This also keeps
	// the division below from dividing by zero.
	if p.aprioriWeight == 1 {
		return p.aprioriHopProbability
	}

	// If there is no channel history, our best estimate is still the a
	// priori probability.
	if len(results) == 0 {
		return p.aprioriHopProbability
	}

	// The value of the apriori weight is in the range [0, 1]. Convert it to
	// a factor that properly expresses the intention of the weight in the
	// following weight average calculation. When the apriori weight is 0,
	// the apriori factor is also 0. This means it won't have any effect on
	// the weighted average calculation below. When the apriori weight
	// approaches 1, the apriori factor goes to infinity. It will heavily
	// outweigh any observations that have been collected.
	aprioriFactor := 1/(1-p.aprioriWeight) - 1

	// Calculate a weighted average consisting of the apriori probability
	// and historical observations. This is the part that incentivizes nodes
	// to make sure that all (not just some) of their channels are in good
	// shape. Senders will steer around nodes that have shown a few
	// failures, even though there may be many channels still untried.
	//
	// If there is just a single observation and the apriori weight is 0,
	// this single observation will totally determine the node probability.
	// The node probability is returned for all other channels of the node.
	// This means that one failure will lead to the success probability
	// estimates for all other channels being 0 too. The probability for the
	// channel that was tried will not even recover, because it is
	// recovering to the node probability (which is zero). So one failure
	// effectively prunes all channels of the node forever. This is the most
	// aggressive way in which we can penalize nodes and unlikely to yield
	// good results in a real network.
	probabilitiesTotal := p.aprioriHopProbability * aprioriFactor
	totalWeight := aprioriFactor

	for _, result := range results {
		switch {
		// Weigh success with a constant high weight of 1. There is no
		// decay.
		case result.success:
			totalWeight++
			probabilitiesTotal += p.prevSuccessProbability

		// Weigh failures in accordance with their age. The base
		// probability of a failure is considered zero, so nothing needs
		// to be added to probabilitiesTotal. Failures that were
		// reported for a larger amount than the current one do not
		// match this case and are left out of the average.
		case amt >= result.minPenalizeAmt:
			age := now.Sub(result.timestamp)
			totalWeight += p.getWeight(age)
		}
	}

	// With an apriori weight of 0 the apriori factor contributes no weight.
	// If in addition none of the results applied to this amount (or their
	// weights decayed all the way to zero), there is effectively no history
	// available. Fall back to the a priori probability instead of returning
	// the NaN that 0/0 would produce.
	if totalWeight == 0 {
		return p.aprioriHopProbability
	}

	return probabilitiesTotal / totalWeight
}
// getWeight returns the weight that a payment result of the given age carries.
// The weight is 2^(-age/penaltyHalfLife): it equals 1 for a fresh result, is
// halved for every penaltyHalfLife that passes and asymptotically approaches
// zero over time.
func (p *probabilityEstimator) getWeight(age time.Duration) float64 {
	// Express the age as a number of elapsed half lives.
	halfLives := age.Hours() / p.penaltyHalfLife.Hours()

	return math.Pow(2, -halfLives)
}
// getPairProbability estimates the probability of successfully traversing to
// toNode. The estimate is based on the historical payment outcomes of the from
// node, which the caller supplies through the results parameter.
func (p *probabilityEstimator) getPairProbability(
	now time.Time, results NodeResults,
	toNode route.Vertex, amt lnwire.MilliSatoshi) float64 {

	// Look up the last outcome recorded for this pair, if any.
	last, tried := results[toNode]

	// A pair that relayed the previous attempt successfully gets a fixed
	// (high) probability. It is assumed good until proven otherwise.
	if tried && last.success {
		return p.prevSuccessProbability
	}

	nodeProb := p.getNodeProbability(now, results, amt)

	// Without history for this pair, the node probability is the estimate
	// for an untried channel. The same applies when the last failure was
	// reported with a minimum penalize amount above the current amount.
	// Such a minimum is used for balance errors to prevent too aggressive
	// penalization.
	if !tried || amt < last.minPenalizeAmt {
		return nodeProb
	}

	// Scale the node probability by the weight of the last failure. A
	// fresh failure has weight 1 and results in probability 0. Over time
	// the probability recovers to the node probability, as if this channel
	// was never tried before.
	failureAge := now.Sub(last.timestamp)

	return nodeProb * (1 - p.getWeight(failureAge))
}

@ -0,0 +1,163 @@
package routing
import (
"testing"
"time"
"github.com/lightningnetwork/lnd/lnwire"
"github.com/lightningnetwork/lnd/routing/route"
)
const (
	// Define node identifiers.
	node1 = 1
	node2 = 2
	node3 = 3

	// untriedNode is a node id for which we don't record any results in
	// this test. This can be used to assert the probability for untried
	// nodes.
	untriedNode = 255

	// Define test estimator parameters.
	aprioriHopProb     = 0.6
	aprioriWeight      = 0.75
	aprioriPrevSucProb = 0.95
)
// estimatorTestContext bundles a probability estimator with the payment
// results that the assertions of a test are evaluated against.
type estimatorTestContext struct {
	t         *testing.T
	estimator *probabilityEstimator

	// results contains the last results per node. Every entry in the map
	// corresponds to the last result towards a node and the map key equals
	// the node id. assertPairProbability turns each key into a vertex whose
	// first byte is that node id.
	results map[int]timedPairResult
}
// newEstimatorTestContext creates a test context around an estimator that is
// configured with the test parameters defined in this file and a penalty half
// life of one hour. The context starts out without any recorded results.
func newEstimatorTestContext(t *testing.T) *estimatorTestContext {
	estimator := &probabilityEstimator{
		penaltyHalfLife:        time.Hour,
		aprioriHopProbability:  aprioriHopProb,
		aprioriWeight:          aprioriWeight,
		prevSuccessProbability: aprioriPrevSucProb,
	}

	return &estimatorTestContext{
		t:         t,
		estimator: estimator,
	}
}
// assertPairProbability asserts that the success probability calculated for
// toNode at time now and for amount amt equals expectedProb within a small
// tolerance. The results stored in the test context are converted to
// NodeResults before being handed to the estimator. The test is failed
// immediately on a mismatch, which includes a probability that is not a
// number.
func (c *estimatorTestContext) assertPairProbability(now time.Time,
	toNode byte, amt lnwire.MilliSatoshi, expectedProb float64) {

	c.t.Helper()

	// Key the results by vertex. The node id becomes the first byte of
	// the vertex.
	results := make(NodeResults, len(c.results))
	for i, r := range c.results {
		results[route.Vertex{byte(i)}] = r
	}

	const tolerance = 0.01

	p := c.estimator.getPairProbability(
		now, results, route.Vertex{toNode}, amt,
	)
	diff := p - expectedProb

	// Express the check as "not within tolerance" rather than "outside of
	// tolerance". Every comparison with NaN is false, so the latter form
	// would silently accept a NaN probability.
	if !(diff >= -tolerance && diff <= tolerance) {
		c.t.Fatalf("expected probability %v for node %v, but got %v",
			expectedProb, toNode, p)
	}
}
// TestProbabilityEstimatorNoResults verifies that the estimator returns the a
// priori hop probability when no results have been recorded at all.
func TestProbabilityEstimatorNoResults(t *testing.T) {
	// Without any history, the queried node and amount are arbitrary.
	const (
		anyNode = 0
		anyAmt  = 0
	)

	newEstimatorTestContext(t).assertPairProbability(
		testTime, anyNode, anyAmt, aprioriHopProb,
	)
}
// TestProbabilityEstimatorOneSuccess verifies the probability estimates for a
// node for which a single success is the only recorded result.
func TestProbabilityEstimatorOneSuccess(t *testing.T) {
	tc := newEstimatorTestContext(t)

	hourAgo := testTime.Add(-time.Hour)
	tc.results = map[int]timedPairResult{
		node1: {
			timestamp:  hourAgo,
			pairResult: successPairResult(),
		},
	}

	// The channel that succeeded before keeps reporting the fixed high
	// probability for previous successes.
	tc.assertPairProbability(testTime, node1, 100, aprioriPrevSucProb)

	// The success also lifts the estimate for untried channels. An
	// aprioriWeight of 0.75 gives the a priori probability a weight of 3
	// against a weight of 1 for the success.
	untriedProb := (3*aprioriHopProb + aprioriPrevSucProb) / 4
	tc.assertPairProbability(testTime, untriedNode, 100, untriedProb)
}
// TestProbabilityEstimatorOneFailure verifies the probability estimates for a
// node for which a single failure is the only recorded result.
func TestProbabilityEstimatorOneFailure(t *testing.T) {
	tc := newEstimatorTestContext(t)

	hourAgo := testTime.Add(-time.Hour)
	tc.results = map[int]timedPairResult{
		node1: {
			timestamp:  hourAgo,
			pairResult: failPairResult(0),
		},
	}

	// The failure is exactly one half life (one hour) old, so it carries a
	// weight of 0.5.
	const failureWeight = 0.5

	// An untried node gets the node probability. The a priori probability
	// has weight 3 and the failure contributes a probability of zero with
	// weight 0.5, which results in roughly 0.51.
	expectedNodeProb := (3*aprioriHopProb + failureWeight*0) /
		(3 + failureWeight)
	tc.assertPairProbability(testTime, untriedNode, 100, expectedNodeProb)

	// The failed pair recovers towards the node probability. After one
	// half life it is half way there: 0.5 * 0.51 is roughly 0.26.
	tc.assertPairProbability(testTime, node1, 100, expectedNodeProb/2)
}
// TestProbabilityEstimatorMix tests the probability estimation for nodes for
// which a mix of successes and failures is recorded.
func TestProbabilityEstimatorMix(t *testing.T) {
	ctx := newEstimatorTestContext(t)

	ctx.results = map[int]timedPairResult{
		node1: {
			timestamp:  testTime.Add(-time.Hour),
			pairResult: successPairResult(),
		},
		node2: {
			timestamp:  testTime.Add(-2 * time.Hour),
			pairResult: failPairResult(0),
		},
		node3: {
			timestamp:  testTime.Add(-3 * time.Hour),
			pairResult: failPairResult(0),
		},
	}

	// We expect the probability for a previously successful channel to
	// remain high. The expectation is expressed with the value that the
	// estimator under test is configured with, not with the package level
	// default.
	ctx.assertPairProbability(testTime, node1, 100, aprioriPrevSucProb)

	// For an untried node, we expect the node probability to be returned.
	// This is a weighted average of the results above and the a priori
	// probability. The a priori probability has weight 3, the success
	// weight 1 and the failures of two and three hours ago have decayed to
	// weights 0.25 and 0.125. This results in roughly 0.63.
	expectedNodeProb := (3*aprioriHopProb + 1*aprioriPrevSucProb) /
		(3 + 1 + 0.25 + 0.125)

	ctx.assertPairProbability(testTime, untriedNode, 100, expectedNodeProb)

	// For the previously failed connection with node 2, the failure weight
	// has decayed to 0.25. We therefore expect 0.75 * the node probability
	// = 0.47.
	ctx.assertPairProbability(testTime, node2, 100, expectedNodeProb*0.75)
}

@ -91,6 +91,7 @@ func createTestCtxFromGraphInstance(startingHeight uint32, graphInstance *testGr
mcConfig := &MissionControlConfig{
PenaltyHalfLife: time.Hour,
AprioriHopProbability: 0.9,
AprioriWeight: 0.5,
}
mc, err := NewMissionControl(

@ -660,6 +660,7 @@ func newServer(listenAddrs []net.Addr, chanDB *channeldb.DB,
AprioriHopProbability: routingConfig.AprioriHopProbability,
PenaltyHalfLife: routingConfig.PenaltyHalfLife,
MaxMcHistory: routingConfig.MaxMcHistory,
AprioriWeight: routingConfig.AprioriWeight,
},
)
if err != nil {