6cf66aea47
Since we store all-time flap count for a peer, we add a cooldown factor which will discount poor flap counts in the past. This is only applied to peers that have not flapped for at least a cooldown period, so that we do not downgrade our rate limiting for badly behaved peers.
419 lines
13 KiB
Go
419 lines
13 KiB
Go
package chanfitness
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/btcsuite/btcd/wire"
|
|
"github.com/lightningnetwork/lnd/clock"
|
|
)
|
|
|
|
// eventType enumerates the kinds of events that can be stored in a peer's
// event log.
type eventType int

const (
	// peerOnlineEvent is the event type used when a peer is online.
	peerOnlineEvent eventType = iota

	// peerOfflineEvent is the event type used when a peer is offline.
	peerOfflineEvent
)

// String provides string representations of channel events. Any value that
// is not a known event type is rendered as "unknown".
func (e eventType) String() string {
	switch e {
	case peerOnlineEvent:
		return "peer_online"

	case peerOfflineEvent:
		return "peer_offline"

	default:
		return "unknown"
	}
}
|
|
|
|
type event struct {
|
|
timestamp time.Time
|
|
eventType eventType
|
|
}
|
|
|
|
// peerLog tracks events for a peer and its channels. If we currently have no
|
|
// channels with the peer, it will simply track its current online state. If we
|
|
// do have channels open with the peer, it will track the peer's online and
|
|
// offline events so that we can calculate uptime for our channels. A single
|
|
// event log is used for these online and offline events, and uptime for a
|
|
// channel is calculated by examining a subsection of this log.
|
|
type peerLog struct {
|
|
// online stores whether the peer is currently online.
|
|
online bool
|
|
|
|
// onlineEvents is a log of timestamped events observed for the peer
|
|
// that we have committed to allocating memory to.
|
|
onlineEvents []*event
|
|
|
|
// stagedEvent represents an event that is pending addition to the
|
|
// events list. It has not yet been added because we rate limit the
|
|
// frequency that we store events at. We need to store this value
|
|
// in the log (rather than just ignore events) so that we can flush the
|
|
// aggregate outcome to our event log once the rate limiting period has
|
|
// ended.
|
|
//
|
|
// Take the following example:
|
|
// - Peer online event recorded
|
|
// - Peer offline event, not recorded due to rate limit
|
|
// - No more events, we incorrectly believe our peer to be online
|
|
// Instead of skipping events, we stage the most recent event during the
|
|
// rate limited period so that we know what happened (on aggregate)
|
|
// while we were rate limiting events.
|
|
//
|
|
// Note that we currently only store offline/online events so we can
|
|
// use this field to track our online state. With the addition of other
|
|
// event types, we need to only stage online/offline events, or split
|
|
// them out.
|
|
stagedEvent *event
|
|
|
|
// flapCount is the number of times this peer has been observed as
|
|
// going offline.
|
|
flapCount int
|
|
|
|
// lastFlap is the timestamp of the last flap we recorded for the peer.
|
|
// This value will be nil if we have never recorded a flap for the peer.
|
|
lastFlap *time.Time
|
|
|
|
// clock allows creation of deterministic unit tests.
|
|
clock clock.Clock
|
|
|
|
// channels contains a set of currently open channels. Channels will be
|
|
// added and removed from this map as they are opened and closed.
|
|
channels map[wire.OutPoint]*channelInfo
|
|
}
|
|
|
|
// newPeerLog creates a log for a peer, taking its historical flap count and
|
|
// last flap time as parameters. These values may be zero/nil if we have no
|
|
// record of historical flap count for the peer.
|
|
func newPeerLog(clock clock.Clock, flapCount int,
|
|
lastFlap *time.Time) *peerLog {
|
|
|
|
return &peerLog{
|
|
clock: clock,
|
|
flapCount: flapCount,
|
|
lastFlap: lastFlap,
|
|
channels: make(map[wire.OutPoint]*channelInfo),
|
|
}
|
|
}
|
|
|
|
// channelInfo contains information about a channel.
type channelInfo struct {
	// openedAt tracks the first time this channel was seen. This is not
	// necessarily the time that it confirmed on chain because channel
	// events are not persisted at present.
	openedAt time.Time
}
|
|
|
|
func newChannelInfo(openedAt time.Time) *channelInfo {
|
|
return &channelInfo{
|
|
openedAt: openedAt,
|
|
}
|
|
}
|
|
|
|
// onlineEvent records a peer online or offline event in the log and increments
|
|
// the peer's flap count.
|
|
func (p *peerLog) onlineEvent(online bool) {
|
|
eventTime := p.clock.Now()
|
|
|
|
// If we have a non-nil last flap time, potentially apply a cooldown
|
|
// factor to the peer's flap count before we rate limit it. This allows
|
|
// us to decrease the penalty for historical flaps over time, provided
|
|
// the peer has not flapped for a while.
|
|
if p.lastFlap != nil {
|
|
p.flapCount = cooldownFlapCount(
|
|
p.clock.Now(), p.flapCount, *p.lastFlap,
|
|
)
|
|
}
|
|
|
|
// Record flap count information and online state regardless of whether
|
|
// we have any channels open with this peer.
|
|
p.flapCount++
|
|
p.lastFlap = &eventTime
|
|
p.online = online
|
|
|
|
// If we have no channels currently open with the peer, we do not want
|
|
// to commit resources to tracking their online state beyond a simple
|
|
// online boolean, so we exit early.
|
|
if p.channelCount() == 0 {
|
|
return
|
|
}
|
|
|
|
p.addEvent(online, eventTime)
|
|
}
|
|
|
|
// addEvent records an online or offline event in our event log. and increments
|
|
// the peer's flap count.
|
|
func (p *peerLog) addEvent(online bool, time time.Time) {
|
|
eventType := peerOnlineEvent
|
|
if !online {
|
|
eventType = peerOfflineEvent
|
|
}
|
|
|
|
event := &event{
|
|
timestamp: time,
|
|
eventType: eventType,
|
|
}
|
|
|
|
// If we have no staged events, we can just stage this event and return.
|
|
if p.stagedEvent == nil {
|
|
p.stagedEvent = event
|
|
return
|
|
}
|
|
|
|
// We get the amount of time we require between events according to
|
|
// peer flap count.
|
|
aggregation := getRateLimit(p.flapCount)
|
|
nextRecordTime := p.stagedEvent.timestamp.Add(aggregation)
|
|
flushEvent := nextRecordTime.Before(event.timestamp)
|
|
|
|
// If enough time has passed since our last staged event, we add our
|
|
// event to our in-memory list.
|
|
if flushEvent {
|
|
p.onlineEvents = append(p.onlineEvents, p.stagedEvent)
|
|
}
|
|
|
|
// Finally, we replace our staged event with the new event we received.
|
|
p.stagedEvent = event
|
|
}
|
|
|
|
// addChannel adds a channel to our log. If we have not tracked any online
|
|
// events for our peer yet, we create one with our peer's current online state
|
|
// so that we know the state that the peer had at channel start, which is
|
|
// required to calculate uptime over the channel's lifetime.
|
|
func (p *peerLog) addChannel(channelPoint wire.OutPoint) error {
|
|
_, ok := p.channels[channelPoint]
|
|
if ok {
|
|
return fmt.Errorf("channel: %v already present", channelPoint)
|
|
}
|
|
|
|
openTime := p.clock.Now()
|
|
p.channels[channelPoint] = newChannelInfo(openTime)
|
|
|
|
// If we do not have any online events tracked for our peer (which is
|
|
// the case when we have no other channels open with the peer), we add
|
|
// an event with the peer's current online state so that we know that
|
|
// starting state for this peer when a channel was connected (which
|
|
// allows us to calculate uptime over the lifetime of the channel).
|
|
if len(p.onlineEvents) == 0 {
|
|
p.addEvent(p.online, openTime)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// removeChannel removes a channel from our log. If we have no more channels
|
|
// with the peer after removing this one, we clear our list of events.
|
|
func (p *peerLog) removeChannel(channelPoint wire.OutPoint) error {
|
|
_, ok := p.channels[channelPoint]
|
|
if !ok {
|
|
return fmt.Errorf("channel: %v not present", channelPoint)
|
|
}
|
|
|
|
delete(p.channels, channelPoint)
|
|
|
|
// If we have no more channels in our event log, we can discard all of
|
|
// our online events in memory, since we don't need them anymore.
|
|
// TODO(carla): this could be done on a per channel basis.
|
|
if p.channelCount() == 0 {
|
|
p.onlineEvents = nil
|
|
p.stagedEvent = nil
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// channelCount returns the number of channels that we currently have
|
|
// with the peer.
|
|
func (p *peerLog) channelCount() int {
|
|
return len(p.channels)
|
|
}
|
|
|
|
// channelUptime looks up a channel and returns the amount of time that the
|
|
// channel has been monitored for and its uptime over this period.
|
|
func (p *peerLog) channelUptime(channelPoint wire.OutPoint) (time.Duration,
|
|
time.Duration, error) {
|
|
|
|
channel, ok := p.channels[channelPoint]
|
|
if !ok {
|
|
return 0, 0, ErrChannelNotFound
|
|
}
|
|
|
|
now := p.clock.Now()
|
|
|
|
uptime, err := p.uptime(channel.openedAt, now)
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
|
|
return now.Sub(channel.openedAt), uptime, nil
|
|
}
|
|
|
|
// getFlapCount returns the peer's flap count and the timestamp that we last
|
|
// recorded a flap.
|
|
func (p *peerLog) getFlapCount() (int, *time.Time) {
|
|
return p.flapCount, p.lastFlap
|
|
}
|
|
|
|
// listEvents returns all of the events that our event log has tracked,
|
|
// including events that are staged for addition to our set of events but have
|
|
// not yet been committed to (because we rate limit and store only the aggregate
|
|
// outcome over a period).
|
|
func (p *peerLog) listEvents() []*event {
|
|
if p.stagedEvent == nil {
|
|
return p.onlineEvents
|
|
}
|
|
|
|
return append(p.onlineEvents, p.stagedEvent)
|
|
}
|
|
|
|
// onlinePeriod represents a period of time over which a peer was online.
type onlinePeriod struct {
	// start and end are the timestamps bounding the online period.
	start, end time.Time
}
|
|
|
|
// getOnlinePeriods returns a list of all the periods that the event log has
|
|
// recorded the remote peer as being online. In the unexpected case where there
|
|
// are no events, the function returns early. Online periods are defined as a
|
|
// peer online event which is terminated by a peer offline event. If the event
|
|
// log ends on a peer online event, it appends a final period which is
|
|
// calculated until the present. This function expects the event log provided
|
|
// to be ordered by ascending timestamp, and can tolerate multiple consecutive
|
|
// online or offline events.
|
|
func (p *peerLog) getOnlinePeriods() []*onlinePeriod {
|
|
events := p.listEvents()
|
|
|
|
// Return early if there are no events, there are no online periods.
|
|
if len(events) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var (
|
|
// lastEvent tracks the last event that we had that was of
|
|
// a different type to our own. It is used to determine the
|
|
// start time of our online periods when we experience an
|
|
// offline event, and to track our last recorded state.
|
|
lastEvent *event
|
|
onlinePeriods []*onlinePeriod
|
|
)
|
|
|
|
// Loop through all events to build a list of periods that the peer was
|
|
// online. Online periods are added when they are terminated with a peer
|
|
// offline event. If the log ends on an online event, the period between
|
|
// the online event and the present is not tracked. The type of the most
|
|
// recent event is tracked using the offline bool so that we can add a
|
|
// final online period if necessary.
|
|
for _, event := range events {
|
|
switch event.eventType {
|
|
case peerOnlineEvent:
|
|
// If our previous event is nil, we just set it and
|
|
// break out of the switch.
|
|
if lastEvent == nil {
|
|
lastEvent = event
|
|
break
|
|
}
|
|
|
|
// If our previous event was an offline event, we update
|
|
// it to this event. We do not do this if it was an
|
|
// online event because duplicate online events would
|
|
// progress our online timestamp forward (rather than
|
|
// keep it at our earliest online event timestamp).
|
|
if lastEvent.eventType == peerOfflineEvent {
|
|
lastEvent = event
|
|
}
|
|
|
|
case peerOfflineEvent:
|
|
// If our previous event is nil, we just set it and
|
|
// break out of the switch since we cannot record an
|
|
// online period from this single event.
|
|
if lastEvent == nil {
|
|
lastEvent = event
|
|
break
|
|
}
|
|
|
|
// If the last event we saw was an online event, we
|
|
// add an online period to our set and progress our
|
|
// previous event to this offline event. We do not
|
|
// do this if we have had duplicate offline events
|
|
// because we would be tracking the most recent offline
|
|
// event (rather than keep it at our earliest offline
|
|
// event timestamp).
|
|
if lastEvent.eventType == peerOnlineEvent {
|
|
onlinePeriods = append(
|
|
onlinePeriods, &onlinePeriod{
|
|
start: lastEvent.timestamp,
|
|
end: event.timestamp,
|
|
},
|
|
)
|
|
|
|
lastEvent = event
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the last event was an peer offline event, we do not need to
|
|
// calculate a final online period and can return online periods as is.
|
|
if lastEvent.eventType == peerOfflineEvent {
|
|
return onlinePeriods
|
|
}
|
|
|
|
// The log ended on an online event, so we need to add a final online
|
|
// period which terminates at the present.
|
|
finalEvent := &onlinePeriod{
|
|
start: lastEvent.timestamp,
|
|
end: p.clock.Now(),
|
|
}
|
|
|
|
// Add the final online period to the set and return.
|
|
return append(onlinePeriods, finalEvent)
|
|
}
|
|
|
|
// uptime calculates the total uptime we have recorded for a peer over the
|
|
// inclusive range specified. An error is returned if the end of the range is
|
|
// before the start or a zero end time is returned.
|
|
func (p *peerLog) uptime(start, end time.Time) (time.Duration, error) {
|
|
// Error if we are provided with an invalid range to calculate uptime
|
|
// for.
|
|
if end.Before(start) {
|
|
return 0, fmt.Errorf("end time: %v before start time: %v",
|
|
end, start)
|
|
}
|
|
if end.IsZero() {
|
|
return 0, fmt.Errorf("zero end time")
|
|
}
|
|
|
|
var uptime time.Duration
|
|
|
|
for _, p := range p.getOnlinePeriods() {
|
|
// The online period ends before the range we're looking at, so
|
|
// we can skip over it.
|
|
if p.end.Before(start) {
|
|
continue
|
|
}
|
|
// The online period starts after the range we're looking at, so
|
|
// can stop calculating uptime.
|
|
if p.start.After(end) {
|
|
break
|
|
}
|
|
|
|
// If the online period starts before our range, shift the start
|
|
// time up so that we only calculate uptime from the start of
|
|
// our range.
|
|
if p.start.Before(start) {
|
|
p.start = start
|
|
}
|
|
|
|
// If the online period ends before our range, shift the end
|
|
// time forward so that we only calculate uptime until the end
|
|
// of the range.
|
|
if p.end.After(end) {
|
|
p.end = end
|
|
}
|
|
|
|
uptime += p.end.Sub(p.start)
|
|
}
|
|
|
|
return uptime, nil
|
|
}
|