35a2dbcabb
In this commit, we bump the log level for failed health checks from `debug` to `info`. This should help us get to the bottom of the current set of reported false positives that are causing `lnd` nodes to shut down erroneously.
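For context, the change is the log level of the per-call failure message in `retryCheck` below. A hedged before/after sketch, reconstructed from the commit message (only the `info` form appears in the file; the `debug` form is inferred):

	// Before this commit (inferred): only visible with debug logging on.
	log.Debugf("Health check: %v, call: %v failed with: %v, "+
		"backing off for: %v", o, count, err, o.Backoff)

	// After this commit: failures between retries surface at the default
	// info level, making false-positive shutdowns easier to diagnose.
	log.Infof("Health check: %v, call: %v failed with: %v, "+
		"backing off for: %v", o, count, err, o.Backoff)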
232 lines
6.1 KiB
Go
// Package healthcheck contains a monitor which takes a set of liveliness checks
// which it periodically checks. If a check fails after its configured number
// of allowed call attempts, the monitor will send a request to shutdown using
// the function it is provided in its config. Checks are dispatched in their own
// goroutines so that they do not block each other.
package healthcheck

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/lightningnetwork/lnd/ticker"
)

// Config contains configuration settings for our monitor.
type Config struct {
	// Checks is a set of health checks that assert that lnd has access to
	// critical resources.
	Checks []*Observation

	// Shutdown should be called to request safe shutdown on failure of a
	// health check.
	Shutdown shutdownFunc
}

// shutdownFunc is the signature we use for a shutdown function which allows us
// to print our reason for shutdown.
type shutdownFunc func(format string, params ...interface{})

// Monitor periodically checks a series of configured liveliness checks to
// ensure that lnd has access to all critical resources.
type Monitor struct {
	started int32 // To be used atomically.
	stopped int32 // To be used atomically.

	cfg *Config

	quit chan struct{}
	wg   sync.WaitGroup
}

// NewMonitor returns a monitor with the provided config.
func NewMonitor(cfg *Config) *Monitor {
	return &Monitor{
		cfg:  cfg,
		quit: make(chan struct{}),
	}
}

// Start launches the goroutines required to run our monitor.
func (m *Monitor) Start() error {
	if !atomic.CompareAndSwapInt32(&m.started, 0, 1) {
		return errors.New("monitor already started")
	}

	// Run through all of the health checks that we have configured and
	// start a goroutine for each check.
	for _, check := range m.cfg.Checks {
		check := check

		// Skip over health checks that are disabled by setting zero
		// attempts.
		if check.Attempts == 0 {
			log.Warnf("check: %v configured with 0 attempts, "+
				"skipping it", check.Name)

			continue
		}

		m.wg.Add(1)
		go func() {
			defer m.wg.Done()
			check.monitor(m.cfg.Shutdown, m.quit)
		}()
	}

	return nil
}

// Stop sends all goroutines the signal to exit and waits for them to exit.
func (m *Monitor) Stop() error {
	if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) {
		return errors.New("monitor already stopped")
	}

	close(m.quit)
	m.wg.Wait()

	return nil
}

// CreateCheck is a helper function that takes a function which produces an
// error and wraps it in a function that returns its result on an error
// channel. We do not add the goroutine running our checkFunc to a wait group
// because we expect to be dealing with health checks that may block; if we
// waited on them, we could wait forever. Ideally, future health checks will
// allow callers to cancel them early, and then we can wait on them safely.
func CreateCheck(checkFunc func() error) func() chan error {
	return func() chan error {
		errChan := make(chan error, 1)
		go func() {
			errChan <- checkFunc()
		}()

		return errChan
	}
}
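
// Illustrative note (a hedged sketch, not part of the original file): the
// buffered channel above is what lets retryCheck abandon a timed-out call
// without leaking a blocked send. Assuming a hypothetical blocking probe
// pingPeer func() error, a caller would use the wrapper like so:
//
//	errChan := CreateCheck(pingPeer)()
//	select {
//	case err := <-errChan:
//		// Check completed; err may be nil.
//	case <-time.After(timeout):
//		// Stop waiting; the goroutine's eventual send still succeeds
//		// because errChan has a capacity of one.
//	}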

// Observation represents a liveliness check that we periodically check.
type Observation struct {
	// Name describes the health check.
	Name string

	// Check runs the health check itself, returning an error channel that
	// is expected to receive either nil or an error.
	Check func() chan error

	// Interval is a ticker which triggers running our check function. This
	// ticker must be started and stopped by the observation.
	Interval ticker.Ticker

	// Attempts is the number of calls we make for a single check before
	// failing.
	Attempts int

	// Timeout is the amount of time we allow our check function to take
	// before we time it out.
	Timeout time.Duration

	// Backoff is the amount of time we back off between retries for failed
	// checks.
	Backoff time.Duration
}

// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
	timeout, backoff time.Duration, attempts int) *Observation {

	return &Observation{
		Name:     name,
		Check:    CreateCheck(check),
		Interval: ticker.New(interval),
		Attempts: attempts,
		Timeout:  timeout,
		Backoff:  backoff,
	}
}

// String returns a string representation of an observation.
func (o *Observation) String() string {
	return o.Name
}

// monitor executes a health check every time its interval ticks until the quit
// channel signals that we should shut down. This function is also responsible
// for starting and stopping our ticker.
func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
	log.Debugf("Monitoring: %v", o)

	o.Interval.Resume()
	defer o.Interval.Stop()

	for {
		select {
		case <-o.Interval.Ticks():
			o.retryCheck(quit, shutdown)

		// Exit if we receive the instruction to shut down.
		case <-quit:
			return
		}
	}
}

// retryCheck calls a check function until it succeeds, or we reach our
// configured number of attempts, waiting for our backoff period between failed
// calls. If we fail to obtain a passing health check after the allowed number
// of calls, we will request shutdown.
func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
	var count int

	for count < o.Attempts {
		// Increment our call count and call the health check endpoint.
		count++

		// Wait for our check to return, our timeout to elapse, or the
		// quit signal to be received.
		var err error
		select {
		case err = <-o.Check():

		case <-time.After(o.Timeout):
			err = fmt.Errorf("health check: %v timed out after: "+
				"%v", o, o.Timeout)

		case <-quit:
			return
		}

		// If our error is nil, we have passed our health check, so we
		// can exit.
		if err == nil {
			return
		}

		// If we have reached our allowed number of attempts, this
		// check has failed, so we request shutdown.
		if count == o.Attempts {
			shutdown("Health check: %v failed after %v "+
				"calls", o, o.Attempts)

			return
		}

		log.Infof("Health check: %v, call: %v failed with: %v, "+
			"backing off for: %v", o, count, err, o.Backoff)

		// If we are still within the number of calls allowed for this
		// check, we wait for our backoff period to elapse, or exit if
		// we get the signal to shut down.
		select {
		case <-time.After(o.Backoff):

		case <-quit:
			return
		}
	}
}
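
To see how these pieces fit together, here is a minimal usage sketch. It is an assumption-laden example, not code from `lnd`: the `pingBackend` probe, the address, and the durations are all hypothetical; only the `healthcheck` API calls mirror the file above.

	package main

	import (
		"fmt"
		"net"
		"time"

		"github.com/lightningnetwork/lnd/healthcheck"
	)

	func main() {
		// Hypothetical probe: pass if we can dial a local backend.
		pingBackend := func() error {
			conn, err := net.DialTimeout(
				"tcp", "localhost:8332", 5*time.Second,
			)
			if err != nil {
				return err
			}
			return conn.Close()
		}

		// Check once a minute, allowing 3 calls of up to 10s each and
		// backing off 30s between failed calls.
		chainCheck := healthcheck.NewObservation(
			"chain backend", pingBackend,
			time.Minute,    // interval between checks
			10*time.Second, // timeout per call
			30*time.Second, // backoff between failed calls
			3,              // attempts before requesting shutdown
		)

		monitor := healthcheck.NewMonitor(&healthcheck.Config{
			Checks: []*healthcheck.Observation{chainCheck},
			// In lnd this would request a graceful daemon shutdown;
			// here we only log the reason.
			Shutdown: func(format string, params ...interface{}) {
				fmt.Printf("shutdown requested: "+format+"\n",
					params...)
			},
		})

		if err := monitor.Start(); err != nil {
			fmt.Println("start failed:", err)
			return
		}
		defer monitor.Stop()

		// Run the rest of the application; the monitor polls in the
		// background until Stop is called.
		time.Sleep(5 * time.Minute)
	}

With the log bump in this commit, each failed `pingBackend` call now produces an info-level line before the backoff, rather than requiring debug logging to diagnose.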