lnd.xprv/healthcheck/healthcheck.go
carla c365a16656
healthcheck: monitor access to chain backend
Add a new health check package which will periodically poll health
check functions and shutdown if we do not succeed after our set number
of attempts. The first check that we add is one for our chain backend,
to ensure that we are connected to a bitcoin node.
2020-08-24 09:35:36 +02:00

232 lines
6.1 KiB
Go

// Package healthcheck contains a monitor which takes a set of liveliness checks
// which it periodically checks. If a check fails after its configured number
// of allowed call attempts, the monitor will send a request to shutdown using
// the function is is provided in its config. Checks are dispatched in their own
// goroutines so that they do not block each other.
package healthcheck
import (
"errors"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/lightningnetwork/lnd/ticker"
)
// Config contains configuration settings for our monitor.
type Config struct {
// Checks is a set of health checks that assert that lnd has access to
// critical resources.
Checks []*Observation
// Shutdown should be called to request safe shutdown on failure of a
// health check.
Shutdown shutdownFunc
}
// shutdownFunc is the signature we use for a shutdown function which allows us
// to print our reason for shutdown.
type shutdownFunc func(format string, params ...interface{})
// Monitor periodically checks a series of configured liveliness checks to
// ensure that lnd has access to all critical resources.
type Monitor struct {
started int32 // To be used atomically.
stopped int32 // To be used atomically.
cfg *Config
quit chan struct{}
wg sync.WaitGroup
}
// NewMonitor returns a monitor with the provided config.
func NewMonitor(cfg *Config) *Monitor {
return &Monitor{
cfg: cfg,
quit: make(chan struct{}),
}
}
// Start launches the goroutines required to run our monitor.
func (m *Monitor) Start() error {
if !atomic.CompareAndSwapInt32(&m.started, 0, 1) {
return errors.New("monitor already started")
}
// Run through all of the health checks that we have configured and
// start a goroutine for each check.
for _, check := range m.cfg.Checks {
check := check
// Skip over health checks that are disabled by setting zero
// attempts.
if check.Attempts == 0 {
log.Warnf("check: %v configured with 0 attempts, "+
"skipping it", check.Name)
continue
}
m.wg.Add(1)
go func() {
defer m.wg.Done()
check.monitor(m.cfg.Shutdown, m.quit)
}()
}
return nil
}
// Stop sends all goroutines the signal to exit and waits for them to exit.
func (m *Monitor) Stop() error {
if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) {
return fmt.Errorf("monitor already stopped")
}
close(m.quit)
m.wg.Wait()
return nil
}
// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
// to be dealing with health checks that may block; if we wait group them, we
// may wait forever. Ideally future health checks will allow callers to cancel
// them early, and we can wait group this.
func CreateCheck(checkFunc func() error) func() chan error {
return func() chan error {
errChan := make(chan error, 1)
go func() {
errChan <- checkFunc()
}()
return errChan
}
}
// Observation represents a liveliness check that we periodically check.
type Observation struct {
// Name describes the health check.
Name string
// Check runs the health check itself, returning an error channel that
// is expected to receive nil or an error.
Check func() chan error
// Interval is a ticker which triggers running our check function. This
// ticker must be started and stopped by the observation.
Interval ticker.Ticker
// Attempts is the number of calls we make for a single check before
// failing.
Attempts int
// Timeout is the amount of time we allow our check function to take
// before we time it out.
Timeout time.Duration
// Backoff is the amount of time we back off between retries for failed
// checks.
Backoff time.Duration
}
// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
timeout, backoff time.Duration, attempts int) *Observation {
return &Observation{
Name: name,
Check: CreateCheck(check),
Interval: ticker.New(interval),
Attempts: attempts,
Timeout: timeout,
Backoff: backoff,
}
}
// String returns a string representation of an observation.
func (o *Observation) String() string {
return o.Name
}
// monitor executes a health check every time its interval ticks until the quit
// channel signals that we should shutdown. This function is also responsible
// for starting and stopping our ticker.
func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
log.Debugf("Monitoring: %v", o)
o.Interval.Resume()
defer o.Interval.Stop()
for {
select {
case <-o.Interval.Ticks():
o.retryCheck(quit, shutdown)
// Exit if we receive the instruction to shutdown.
case <-quit:
return
}
}
}
// retryCheck calls a check function until it succeeds, or we reach our
// configured number of attempts, waiting for our back off period between failed
// calls. If we fail to obtain a passing health check after the allowed number
// of calls, we will request shutdown.
func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
var count int
for count < o.Attempts {
// Increment our call count and call the health check endpoint.
count++
// Wait for our check to return, timeout to elapse, or quit
// signal to be received.
var err error
select {
case err = <-o.Check():
case <-time.After(o.Timeout):
err = fmt.Errorf("health check: %v timed out after: "+
"%v", o, o.Timeout)
case <-quit:
return
}
// If our error is nil, we have passed our health check, so we
// can exit.
if err == nil {
return
}
// If we have reached our allowed number of attempts, this
// check has failed so we request shutdown.
if count == o.Attempts {
shutdown("Health check: %v failed after %v "+
"calls", o, o.Attempts)
return
}
log.Debugf("Health check: %v, call: %v failed with: %v, "+
"backing off for: %v", o, count, err, o.Backoff)
// If we are still within the number of calls allowed for this
// check, we wait for our back off period to elapse, or exit if
// we get the signal to shutdown.
select {
case <-time.After(o.Backoff):
case <-quit:
return
}
}
}