Merge pull request #4479 from carlaKC/4306-shutdownlogger
healthcheck: add regular chain backend and disk space checks
Commit 1b8fbba615
build/log_shutdown.go | 53 (new file)
@@ -0,0 +1,53 @@
package build

import (
	"github.com/btcsuite/btclog"

	"github.com/lightningnetwork/lnd/signal"
)

// ShutdownLogger wraps an existing logger with a shutdown function which will
// be called on Critical/Criticalf to prompt shutdown.
type ShutdownLogger struct {
	btclog.Logger
}

// NewShutdownLogger creates a shutdown logger for the log provided which will
// use the signal package to request shutdown on critical errors.
func NewShutdownLogger(logger btclog.Logger) *ShutdownLogger {
	return &ShutdownLogger{
		Logger: logger,
	}
}

// Criticalf formats message according to format specifier and writes to
// log with LevelCritical. It will then call the shutdown logger's shutdown
// function to prompt safe shutdown.
//
// Note: it is part of the btclog.Logger interface.
func (s *ShutdownLogger) Criticalf(format string, params ...interface{}) {
	s.Logger.Criticalf(format, params...)
	s.shutdown()
}

// Critical formats message using the default formats for its operands
// and writes to log with LevelCritical. It will then call the shutdown
// logger's shutdown function to prompt safe shutdown.
//
// Note: it is part of the btclog.Logger interface.
func (s *ShutdownLogger) Critical(v ...interface{}) {
	s.Logger.Critical(v)
	s.shutdown()
}

// shutdown checks whether we are listening for interrupts, since a shutdown
// request to the signal package will block if it is not running, and requests
// shutdown if possible.
func (s *ShutdownLogger) shutdown() {
	if !signal.Listening() {
		s.Logger.Info("Request for shutdown ignored")
		return
	}

	s.Logger.Info("Sending request for shutdown")
	signal.RequestShutdown()
}
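For orientation, here is a minimal usage sketch of the new wrapper. It is not part of this commit; the stdout backend and the "SRVR" subsystem tag are illustrative, but all calls are to APIs that exist in this diff or in btclog: any btclog.Logger can be wrapped so that logging at critical level also requests daemon shutdown through the signal package.

package main

import (
	"os"

	"github.com/btcsuite/btclog"
	"github.com/lightningnetwork/lnd/build"
	"github.com/lightningnetwork/lnd/signal"
)

func main() {
	// Start the interrupt handler first so that shutdown requests from
	// critical logs are not ignored.
	if err := signal.Intercept(); err != nil {
		os.Exit(1)
	}

	// Wrap a plain btclog logger; "SRVR" is just an illustrative tag.
	backend := btclog.NewBackend(os.Stdout)
	logger := build.NewShutdownLogger(backend.Logger("SRVR"))

	// Logging at critical level now also calls signal.RequestShutdown,
	// which the interrupt handler started above will consume.
	logger.Criticalf("cannot reach chain backend: %v", "connection refused")
}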
@@ -39,7 +39,10 @@ func NewRotatingLogWriter() *RotatingLogWriter {
 	logWriter := &LogWriter{}
 	backendLog := btclog.NewBackend(logWriter)
 	return &RotatingLogWriter{
-		GenSubLogger:     backendLog.Logger,
+		GenSubLogger: func(tag string) btclog.Logger {
+			logger := backendLog.Logger(tag)
+			return NewShutdownLogger(logger)
+		},
 		logWriter:        logWriter,
 		backendLog:       backendLog,
 		subsystemLoggers: SubLoggers{},
@@ -414,7 +414,10 @@ func openChannelPsbt(ctx *cli.Context, client lnrpc.LightningClient,
 	if err != nil {
 		return fmt.Errorf("opening stream to server failed: %v", err)
 	}
-	signal.Intercept()
+
+	if err := signal.Intercept(); err != nil {
+		return err
+	}
 
 	// We also need to spawn a goroutine that reads from the server. This
 	// will copy the messages to the channel as long as they come in or add
@@ -19,7 +19,10 @@ func main() {
 	}
 
 	// Hook interceptor for os signals.
-	signal.Intercept()
+	if err := signal.Intercept(); err != nil {
+		_, _ = fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
 
 	// Call the "real" main in a nested manner so the defers will properly
 	// be executed in the case of a graceful shutdown.
config.go | 35
@@ -86,6 +86,21 @@ const (
 	// HostAnnouncer will wait between DNS resolutions to check if the
 	// backing IP of a host has changed.
 	defaultHostSampleInterval = time.Minute * 5
+
+	defaultChainInterval = time.Minute
+	defaultChainTimeout  = time.Second * 10
+	defaultChainBackoff  = time.Second * 30
+	defaultChainAttempts = 3
+
+	// By default, we will shutdown if less than 10% of disk space is
+	// available. We allow a longer interval for disk space checks, because
+	// this check is less likely to deteriorate quickly. However, we allow
+	// fewer retries because this should not be a flakey check.
+	defaultRequiredDisk = 0.1
+	defaultDiskInterval = time.Hour * 12
+	defaultDiskTimeout  = time.Second * 5
+	defaultDiskBackoff  = time.Minute
+	defaultDiskAttempts = 2
 )
 
 var (
@@ -265,6 +280,8 @@ type Config struct {
 
 	AllowCircularRoute bool `long:"allow-circular-route" description:"If true, our node will allow htlc forwards that arrive and depart on the same channel."`
 
+	HealthChecks *lncfg.HealthCheckConfig `group:"healthcheck" namespace:"healthcheck"`
+
 	DB *lncfg.DB `group:"db" namespace:"db"`
 
 	// LogWriter is the root logger that all of the daemon's subloggers are
@@ -383,6 +400,23 @@ func DefaultConfig() Config {
 		Watchtower: &lncfg.Watchtower{
 			TowerDir: defaultTowerDir,
 		},
+		HealthChecks: &lncfg.HealthCheckConfig{
+			ChainCheck: &lncfg.CheckConfig{
+				Interval: defaultChainInterval,
+				Timeout:  defaultChainTimeout,
+				Attempts: defaultChainAttempts,
+				Backoff:  defaultChainBackoff,
+			},
+			DiskCheck: &lncfg.DiskCheckConfig{
+				RequiredRemaining: defaultRequiredDisk,
+				CheckConfig: &lncfg.CheckConfig{
+					Interval: defaultDiskInterval,
+					Attempts: defaultDiskAttempts,
+					Timeout:  defaultDiskTimeout,
+					Backoff:  defaultDiskBackoff,
+				},
+			},
+		},
 		MaxOutgoingCltvExpiry:   htlcswitch.DefaultMaxOutgoingCltvExpiry,
 		MaxChannelFeeAllocation: htlcswitch.DefaultMaxLinkFeeAllocation,
 		LogWriter:               build.NewRotatingLogWriter(),
@@ -1124,6 +1158,7 @@ func ValidateConfig(cfg Config, usageMessage string) (*Config, error) {
 		cfg.Caches,
 		cfg.WtClient,
 		cfg.DB,
+		cfg.HealthChecks,
 	)
 	if err != nil {
 		return nil, err
go.mod | 1
@@ -66,6 +66,7 @@ require (
 	go.uber.org/zap v1.14.1 // indirect
 	golang.org/x/crypto v0.0.0-20200709230013-948cd5f35899
 	golang.org/x/net v0.0.0-20191002035440-2ec189313ef0
+	golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5
 	golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2
 	google.golang.org/grpc v1.24.0
 	gopkg.in/errgo.v1 v1.0.1 // indirect
go.sum | 1
@@ -332,6 +332,7 @@ golang.org/x/sys v0.0.0-20190904154756-749cb33beabd h1:DBH9mDw0zluJT/R+nGuV3jWFW
 golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0=
 golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200724161237-0e2f3a69832c h1:UIcGWL6/wpCfyGuJnRFJRurA+yj8RrW7Q6x2YMCXt6c=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2 h1:z99zHgr7hKfrUcX/KsoJk5FJfjTceCKIp96+biqP4To=
 golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
healthcheck/diskcheck.go | 18 (new file)
@@ -0,0 +1,18 @@
// +build !windows,!solaris

package healthcheck

import "syscall"

// AvailableDiskSpace returns ratio of available disk space to total capacity.
func AvailableDiskSpace(path string) (float64, error) {
	s := syscall.Statfs_t{}
	err := syscall.Statfs(path, &s)
	if err != nil {
		return 0, err
	}

	// Calculate our free blocks/total blocks to get our total ratio of
	// free blocks.
	return float64(s.Bfree) / float64(s.Blocks), nil
}
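A brief sketch of how this helper is consumed; it mirrors the disk-space check wired up in server.go later in this diff. The checkDiskSpace wrapper is hypothetical, and the 0.1 threshold is just the defaultRequiredDisk value from config.go.

package main

import (
	"fmt"

	"github.com/lightningnetwork/lnd/healthcheck"
)

// checkDiskSpace fails when less than 10% of the volume holding dir is free
// (0.1 mirrors defaultRequiredDisk from config.go).
func checkDiskSpace(dir string) error {
	free, err := healthcheck.AvailableDiskSpace(dir)
	if err != nil {
		return err
	}
	if free > 0.1 {
		return nil
	}
	return fmt.Errorf("only %.2f of disk capacity remaining", free)
}

func main() {
	fmt.Println(checkDiskSpace("."))
}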
healthcheck/diskcheck_solaris.go | 17 (new file)
@@ -0,0 +1,17 @@
package healthcheck

import "golang.org/x/sys/unix"

// AvailableDiskSpace returns ratio of available disk space to total capacity
// for solaris.
func AvailableDiskSpace(path string) (float64, error) {
	s := unix.Statvfs_t{}
	err := unix.Statvfs(path, &s)
	if err != nil {
		return 0, err
	}

	// Calculate our free blocks/total blocks to get our total ratio of
	// free blocks.
	return float64(s.Bfree) / float64(s.Blocks), nil
}
healthcheck/diskcheck_windows.go | 17 (new file)
@@ -0,0 +1,17 @@
package healthcheck

import "golang.org/x/sys/windows"

// AvailableDiskSpace returns ratio of available disk space to total capacity
// for windows.
func AvailableDiskSpace(path string) (float64, error) {
	var free, total, avail uint64

	pathPtr, err := windows.UTF16PtrFromString(path)
	if err != nil {
		panic(err)
	}
	err = windows.GetDiskFreeSpaceEx(pathPtr, &free, &total, &avail)

	return float64(avail) / float64(total), nil
}
healthcheck/healthcheck.go | 231 (new file)
@@ -0,0 +1,231 @@
// Package healthcheck contains a monitor which takes a set of liveliness checks
// which it periodically checks. If a check fails after its configured number
// of allowed call attempts, the monitor will send a request to shutdown using
// the function it is provided in its config. Checks are dispatched in their own
// goroutines so that they do not block each other.
package healthcheck

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/lightningnetwork/lnd/ticker"
)

// Config contains configuration settings for our monitor.
type Config struct {
	// Checks is a set of health checks that assert that lnd has access to
	// critical resources.
	Checks []*Observation

	// Shutdown should be called to request safe shutdown on failure of a
	// health check.
	Shutdown shutdownFunc
}

// shutdownFunc is the signature we use for a shutdown function which allows us
// to print our reason for shutdown.
type shutdownFunc func(format string, params ...interface{})

// Monitor periodically checks a series of configured liveliness checks to
// ensure that lnd has access to all critical resources.
type Monitor struct {
	started int32 // To be used atomically.
	stopped int32 // To be used atomically.

	cfg *Config

	quit chan struct{}
	wg   sync.WaitGroup
}

// NewMonitor returns a monitor with the provided config.
func NewMonitor(cfg *Config) *Monitor {
	return &Monitor{
		cfg:  cfg,
		quit: make(chan struct{}),
	}
}

// Start launches the goroutines required to run our monitor.
func (m *Monitor) Start() error {
	if !atomic.CompareAndSwapInt32(&m.started, 0, 1) {
		return errors.New("monitor already started")
	}

	// Run through all of the health checks that we have configured and
	// start a goroutine for each check.
	for _, check := range m.cfg.Checks {
		check := check

		// Skip over health checks that are disabled by setting zero
		// attempts.
		if check.Attempts == 0 {
			log.Warnf("check: %v configured with 0 attempts, "+
				"skipping it", check.Name)

			continue
		}

		m.wg.Add(1)
		go func() {
			defer m.wg.Done()
			check.monitor(m.cfg.Shutdown, m.quit)
		}()
	}

	return nil
}

// Stop sends all goroutines the signal to exit and waits for them to exit.
func (m *Monitor) Stop() error {
	if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) {
		return fmt.Errorf("monitor already stopped")
	}

	close(m.quit)
	m.wg.Wait()

	return nil
}

// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
// to be dealing with health checks that may block; if we wait group them, we
// may wait forever. Ideally future health checks will allow callers to cancel
// them early, and we can wait group this.
func CreateCheck(checkFunc func() error) func() chan error {
	return func() chan error {
		errChan := make(chan error, 1)
		go func() {
			errChan <- checkFunc()
		}()

		return errChan
	}
}

// Observation represents a liveliness check that we periodically check.
type Observation struct {
	// Name describes the health check.
	Name string

	// Check runs the health check itself, returning an error channel that
	// is expected to receive nil or an error.
	Check func() chan error

	// Interval is a ticker which triggers running our check function. This
	// ticker must be started and stopped by the observation.
	Interval ticker.Ticker

	// Attempts is the number of calls we make for a single check before
	// failing.
	Attempts int

	// Timeout is the amount of time we allow our check function to take
	// before we time it out.
	Timeout time.Duration

	// Backoff is the amount of time we back off between retries for failed
	// checks.
	Backoff time.Duration
}

// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
	timeout, backoff time.Duration, attempts int) *Observation {

	return &Observation{
		Name:     name,
		Check:    CreateCheck(check),
		Interval: ticker.New(interval),
		Attempts: attempts,
		Timeout:  timeout,
		Backoff:  backoff,
	}
}

// String returns a string representation of an observation.
func (o *Observation) String() string {
	return o.Name
}

// monitor executes a health check every time its interval ticks until the quit
// channel signals that we should shutdown. This function is also responsible
// for starting and stopping our ticker.
func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
	log.Debugf("Monitoring: %v", o)

	o.Interval.Resume()
	defer o.Interval.Stop()

	for {
		select {
		case <-o.Interval.Ticks():
			o.retryCheck(quit, shutdown)

		// Exit if we receive the instruction to shutdown.
		case <-quit:
			return
		}
	}
}

// retryCheck calls a check function until it succeeds, or we reach our
// configured number of attempts, waiting for our back off period between failed
// calls. If we fail to obtain a passing health check after the allowed number
// of calls, we will request shutdown.
func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
	var count int

	for count < o.Attempts {
		// Increment our call count and call the health check endpoint.
		count++

		// Wait for our check to return, timeout to elapse, or quit
		// signal to be received.
		var err error
		select {
		case err = <-o.Check():

		case <-time.After(o.Timeout):
			err = fmt.Errorf("health check: %v timed out after: "+
				"%v", o, o.Timeout)

		case <-quit:
			return
		}

		// If our error is nil, we have passed our health check, so we
		// can exit.
		if err == nil {
			return
		}

		// If we have reached our allowed number of attempts, this
		// check has failed so we request shutdown.
		if count == o.Attempts {
			shutdown("Health check: %v failed after %v "+
				"calls", o, o.Attempts)

			return
		}

		log.Debugf("Health check: %v, call: %v failed with: %v, "+
			"backing off for: %v", o, count, err, o.Backoff)

		// If we are still within the number of calls allowed for this
		// check, we wait for our back off period to elapse, or exit if
		// we get the signal to shutdown.
		select {
		case <-time.After(o.Backoff):

		case <-quit:
			return
		}
	}
}
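To make the moving pieces above concrete, here is a minimal sketch of wiring an Observation into a Monitor. It is not part of this diff: the "database" check, its intervals, and the use of log.Fatalf as the shutdown callback are illustrative, while server.go below does the same thing for the real chain backend and disk space checks.

package main

import (
	"errors"
	"log"
	"time"

	"github.com/lightningnetwork/lnd/healthcheck"
)

func main() {
	// pingDB is an illustrative check function; any func() error works.
	pingDB := func() error {
		return errors.New("database unreachable")
	}

	// One call per minute, each attempt limited to 10s, up to 3 attempts
	// with a 30s back off before the monitor requests shutdown.
	check := healthcheck.NewObservation(
		"database", pingDB, time.Minute, 10*time.Second,
		30*time.Second, 3,
	)

	monitor := healthcheck.NewMonitor(&healthcheck.Config{
		Checks:   []*healthcheck.Observation{check},
		Shutdown: log.Fatalf,
	})
	if err := monitor.Start(); err != nil {
		log.Fatal(err)
	}
	defer func() {
		_ = monitor.Stop()
	}()

	// Block so the monitor's goroutines can run (illustration only).
	time.Sleep(5 * time.Minute)
}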
healthcheck/healthcheck_test.go | 225 (new file)
@@ -0,0 +1,225 @@
package healthcheck

import (
	"errors"
	"testing"
	"time"

	"github.com/lightningnetwork/lnd/ticker"
	"github.com/stretchr/testify/require"
)

var (
	errNonNil = errors.New("non-nil test error")
	timeout   = time.Second
	testTime  = time.Unix(1, 2)
)

type mockedCheck struct {
	t       *testing.T
	errChan chan error
}

// newMockCheck creates a new mock.
func newMockCheck(t *testing.T) *mockedCheck {
	return &mockedCheck{
		t:       t,
		errChan: make(chan error),
	}
}

// call returns our mock's error channel, which we can send responses on.
func (m *mockedCheck) call() chan error {
	return m.errChan
}

// sendError sends an error into our mock's error channel, mocking the sending
// of a response from our check function.
func (m *mockedCheck) sendError(err error) {
	select {
	case m.errChan <- err:
	case <-time.After(timeout):
		m.t.Fatalf("could not send error: %v", err)
	}
}

// TestMonitor tests creation and triggering of a monitor with a health check.
func TestMonitor(t *testing.T) {
	intervalTicker := ticker.NewForce(time.Hour)

	mock := newMockCheck(t)
	shutdown := make(chan struct{})

	// Create our config for monitoring. We will use a 0 back off so that
	// our test does not need to wait.
	cfg := &Config{
		Checks: []*Observation{
			{
				Check:    mock.call,
				Interval: intervalTicker,
				Attempts: 2,
				Backoff:  0,
				Timeout:  time.Hour,
			},
		},
		Shutdown: func(string, ...interface{}) {
			shutdown <- struct{}{}
		},
	}
	monitor := NewMonitor(cfg)

	require.NoError(t, monitor.Start(), "could not start monitor")

	// Tick is a helper we will use to tick our interval.
	tick := func() {
		select {
		case intervalTicker.Force <- testTime:
		case <-time.After(timeout):
			t.Fatal("could not tick timer")
		}
	}

	// Tick our timer and provide our error channel with a nil error. This
	// mocks our check function succeeding on the first call.
	tick()
	mock.sendError(nil)

	// Now we tick our timer again. This time send a non-nil error, followed
	// by a nil error. This tests our retry logic, because we allow 2
	// retries, so should recover without needing to shutdown.
	tick()
	mock.sendError(errNonNil)
	mock.sendError(nil)

	// Finally, we tick our timer once more, and send two non-nil errors
	// into our error channel. This mocks our check function failing twice.
	tick()
	mock.sendError(errNonNil)
	mock.sendError(errNonNil)

	// Since we have failed within our allowed number of retries, we now
	// expect a call to our shutdown function.
	select {
	case <-shutdown:
	case <-time.After(timeout):
		t.Fatal("expected shutdown")
	}

	require.NoError(t, monitor.Stop(), "could not stop monitor")
}

// TestRetryCheck tests our retry logic. It does not include a test for exiting
// during the back off period.
func TestRetryCheck(t *testing.T) {
	tests := []struct {
		name string

		// errors provides an in-order list of errors that we expect our
		// health check to respond with. The number of errors in this
		// list indicates the number of times we expect our check to
		// be called, because our test will fail if we do not consume
		// every error.
		errors []error

		// attempts is the number of times we call a check before
		// failing.
		attempts int

		// timeout is the time we allow our check to take before we
		// fail it.
		timeout time.Duration

		// expectedShutdown is true if we expect a shutdown to be
		// triggered because all of our calls failed.
		expectedShutdown bool
	}{
		{
			name:             "first call succeeds",
			errors:           []error{nil},
			attempts:         2,
			timeout:          time.Hour,
			expectedShutdown: false,
		},
		{
			name:             "first call fails",
			errors:           []error{errNonNil},
			attempts:         1,
			timeout:          time.Hour,
			expectedShutdown: true,
		},
		{
			name:             "fail then recover",
			errors:           []error{errNonNil, nil},
			attempts:         2,
			timeout:          time.Hour,
			expectedShutdown: false,
		},
		{
			name:             "always fail",
			errors:           []error{errNonNil, errNonNil},
			attempts:         2,
			timeout:          time.Hour,
			expectedShutdown: true,
		},
		{
			name:             "no calls",
			errors:           nil,
			attempts:         0,
			timeout:          time.Hour,
			expectedShutdown: false,
		},
		{
			name:             "call times out",
			errors:           nil,
			attempts:         1,
			timeout:          1,
			expectedShutdown: true,
		},
	}

	for _, test := range tests {
		test := test

		t.Run(test.name, func(t *testing.T) {
			var shutdown bool
			shutdownFunc := func(string, ...interface{}) {
				shutdown = true
			}

			mock := newMockCheck(t)

			// Create an observation that calls our call counting
			// function. We set a zero back off so that the test
			// will not wait.
			observation := &Observation{
				Check:    mock.call,
				Attempts: test.attempts,
				Timeout:  test.timeout,
				Backoff:  0,
			}
			quit := make(chan struct{})

			// Run our retry check in a goroutine because it blocks
			// on us sending errors into the mocked caller's error
			// channel.
			done := make(chan struct{})
			go func() {
				observation.retryCheck(quit, shutdownFunc)
				close(done)
			}()

			// Prompt our mock caller to send responses for calls
			// to our call function.
			for _, err := range test.errors {
				mock.sendError(err)
			}

			// Make sure that we have finished running our retry
			// check function before we start checking results.
			<-done

			require.Equal(t, test.expectedShutdown, shutdown,
				"unexpected shutdown state")
		})
	}
}
healthcheck/log.go | 32 (new file)
@@ -0,0 +1,32 @@
package healthcheck

import (
	"github.com/btcsuite/btclog"
	"github.com/lightningnetwork/lnd/build"
)

// Subsystem defines the logging code for this subsystem.
const Subsystem = "HLCK"

// log is a logger that is initialized with no output filters. This
// means the package will not perform any logging by default until the caller
// requests it.
var log btclog.Logger

// The default amount of logging is none.
func init() {
	UseLogger(build.NewSubLogger(Subsystem, nil))
}

// DisableLog disables all library log output. Logging output is disabled
// by default until UseLogger is called.
func DisableLog() {
	UseLogger(btclog.Disabled)
}

// UseLogger uses a specified Logger to output package logging info.
// This should be used in preference to SetLogWriter if the caller is also
// using btclog.
func UseLogger(logger btclog.Logger) {
	log = logger
}
lncfg/healthcheck.go | 90 (new file)
@@ -0,0 +1,90 @@
package lncfg

import (
	"errors"
	"fmt"
	"time"
)

var (
	// MinHealthCheckInterval is the minimum interval we allow between
	// health checks.
	MinHealthCheckInterval = time.Minute

	// MinHealthCheckTimeout is the minimum timeout we allow for health
	// check calls.
	MinHealthCheckTimeout = time.Second

	// MinHealthCheckBackoff is the minimum back off we allow between health
	// check retries.
	MinHealthCheckBackoff = time.Second
)

// HealthCheckConfig contains the configuration for the different health checks
// that lnd runs.
type HealthCheckConfig struct {
	ChainCheck *CheckConfig `group:"chainbackend" namespace:"chainbackend"`

	DiskCheck *DiskCheckConfig `group:"diskspace" namespace:"diskspace"`
}

// Validate checks the values configured for our health checks.
func (h *HealthCheckConfig) Validate() error {
	if err := h.ChainCheck.validate("chain backend"); err != nil {
		return err
	}

	if err := h.DiskCheck.validate("disk space"); err != nil {
		return err
	}

	if h.DiskCheck.RequiredRemaining < 0 ||
		h.DiskCheck.RequiredRemaining >= 1 {

		return errors.New("disk required ratio must be in [0:1)")
	}

	return nil
}

// CheckConfig contains the configuration for a single health check.
type CheckConfig struct {
	Interval time.Duration `long:"interval" description:"How often to run a health check."`

	Attempts int `long:"attempts" description:"The number of calls we will make for the check before failing. Set this value to 0 to disable a check."`

	Timeout time.Duration `long:"timeout" description:"The amount of time we allow the health check to take before failing due to timeout."`

	Backoff time.Duration `long:"backoff" description:"The amount of time to back-off between failed health checks."`
}

// validate checks the values in a health check config entry if it is enabled.
func (c *CheckConfig) validate(name string) error {
	if c.Attempts == 0 {
		return nil
	}

	if c.Backoff < MinHealthCheckBackoff {
		return fmt.Errorf("%v backoff: %v below minimum: %v", name,
			c.Backoff, MinHealthCheckBackoff)
	}

	if c.Timeout < MinHealthCheckTimeout {
		return fmt.Errorf("%v timeout: %v below minimum: %v", name,
			c.Timeout, MinHealthCheckTimeout)
	}

	if c.Interval < MinHealthCheckInterval {
		return fmt.Errorf("%v interval: %v below minimum: %v", name,
			c.Interval, MinHealthCheckInterval)
	}

	return nil
}

// DiskCheckConfig contains configuration for ensuring that our node has
// sufficient disk space.
type DiskCheckConfig struct {
	RequiredRemaining float64 `long:"diskrequired" description:"The minimum ratio of free disk space to total capacity that we allow before shutting lnd down safely."`

	*CheckConfig
}
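As a hedged illustration of how these validation rules combine (the values below are simply the defaults from config.go, not part of this file): an entry whose Attempts is 0 is treated as disabled and skips the minimum interval/timeout/backoff checks, while RequiredRemaining must stay inside [0, 1).

package main

import (
	"fmt"
	"time"

	"github.com/lightningnetwork/lnd/lncfg"
)

func main() {
	cfg := &lncfg.HealthCheckConfig{
		ChainCheck: &lncfg.CheckConfig{
			Interval: time.Minute,
			Timeout:  10 * time.Second,
			Backoff:  30 * time.Second,
			Attempts: 3,
		},
		DiskCheck: &lncfg.DiskCheckConfig{
			RequiredRemaining: 0.1,
			// Attempts is left at zero, so the disk check is
			// treated as disabled and its interval, timeout and
			// backoff are not validated.
			CheckConfig: &lncfg.CheckConfig{},
		},
	}

	// Prints <nil>; a RequiredRemaining of 1 or more would instead fail
	// with "disk required ratio must be in [0:1)".
	fmt.Println(cfg.Validate())
}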
log.go | 2
@@ -16,6 +16,7 @@ import (
 	"github.com/lightningnetwork/lnd/channelnotifier"
 	"github.com/lightningnetwork/lnd/contractcourt"
 	"github.com/lightningnetwork/lnd/discovery"
+	"github.com/lightningnetwork/lnd/healthcheck"
 	"github.com/lightningnetwork/lnd/htlcswitch"
 	"github.com/lightningnetwork/lnd/invoices"
 	"github.com/lightningnetwork/lnd/lnrpc/autopilotrpc"
@@ -129,6 +130,7 @@ func SetupLoggers(root *build.RotatingLogWriter) {
 	AddSubLogger(root, routerrpc.Subsystem, routerrpc.UseLogger)
 	AddSubLogger(root, chanfitness.Subsystem, chanfitness.UseLogger)
 	AddSubLogger(root, verrpc.Subsystem, verrpc.UseLogger)
+	AddSubLogger(root, healthcheck.Subsystem, healthcheck.UseLogger)
 }
 
 // AddSubLogger is a helper method to conveniently create and register the
@@ -54,7 +54,10 @@ func Start(extraArgs string, unlockerReady, rpcReady Callback) {
 	}
 
 	// Hook interceptor for os signals.
-	signal.Intercept()
+	if err := signal.Intercept(); err != nil {
+		_, _ = fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
 
 	// Set up channels that will be notified when the RPC servers are ready
 	// to accept calls.
@@ -400,3 +400,39 @@ litecoin.node=ltcd
 ; sweep funds if a breach occurs while being offline. The fee rate should be
 ; specified in sat/byte, the default is 10 sat/byte.
 ; wtclient.sweep-fee-rate=10
+
+[healthcheck]
+; The number of times we should attempt to query our chain backend before
+; gracefully shutting down. Set this value to 0 to disable this health check.
+; healthcheck.chainbackend.attempts=3
+
+; The amount of time we allow a call to our chain backend to take before we fail
+; the attempt. This value must be >= 1s.
+; healthcheck.chainbackend.timeout=10s
+
+; The amount of time we should backoff between failed attempts to query chain
+; backend. This value must be >= 1s.
+; healthcheck.chainbackend.backoff=30s
+
+; The amount of time we should wait between chain backend health checks. This
+; value must be >= 1m.
+; healthcheck.chainbackend.interval=1m
+
+; The minimum ratio of free disk space to total capacity that we require.
+; healthcheck.diskspace.diskrequired=0.1
+
+; The number of times we should attempt to query our available disk space before
+; gracefully shutting down. Set this value to 0 to disable this health check.
+; healthcheck.diskspace.attempts=2
+
+; The amount of time we allow a query for our available disk space to take
+; before we fail the attempt. This value must be >= 1s.
+; healthcheck.diskspace.timeout=5s
+
+; The amount of time we should backoff between failed attempts to query
+; available disk space. This value must be >= 1s.
+; healthcheck.diskspace.backoff=1m
+
+; The amount of time we should wait between disk space health checks. This
+; value must be >= 1m.
+; healthcheck.diskspace.interval=6h
server.go | 68
@@ -37,6 +37,7 @@ import (
 	"github.com/lightningnetwork/lnd/contractcourt"
 	"github.com/lightningnetwork/lnd/discovery"
 	"github.com/lightningnetwork/lnd/feature"
+	"github.com/lightningnetwork/lnd/healthcheck"
 	"github.com/lightningnetwork/lnd/htlcswitch"
 	"github.com/lightningnetwork/lnd/htlcswitch/hop"
 	"github.com/lightningnetwork/lnd/input"
@@ -276,6 +277,9 @@ type server struct {
 
 	hostAnn *netann.HostAnnouncer
 
+	// livelinessMonitor monitors that lnd has access to critical resources.
+	livelinessMonitor *healthcheck.Monitor
+
 	quit chan struct{}
 
 	wg sync.WaitGroup
@@ -1254,6 +1258,56 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
 		})
 	}
 
+	// Create a set of health checks using our configured values. If a
+	// health check has been disabled by setting attempts to 0, our monitor
+	// will not run it.
+	chainHealthCheck := healthcheck.NewObservation(
+		"chain backend",
+		func() error {
+			_, _, err := cc.chainIO.GetBestBlock()
+			return err
+		},
+		cfg.HealthChecks.ChainCheck.Interval,
+		cfg.HealthChecks.ChainCheck.Timeout,
+		cfg.HealthChecks.ChainCheck.Backoff,
+		cfg.HealthChecks.ChainCheck.Attempts,
+	)
+
+	diskCheck := healthcheck.NewObservation(
+		"disk space",
+		func() error {
+			free, err := healthcheck.AvailableDiskSpace(cfg.LndDir)
+			if err != nil {
+				return err
+			}
+
+			// If we have more free space than we require,
+			// we return a nil error.
+			if free > cfg.HealthChecks.DiskCheck.RequiredRemaining {
+				return nil
+			}
+
+			return fmt.Errorf("require: %v free space, got: %v",
+				cfg.HealthChecks.DiskCheck.RequiredRemaining,
+				free)
+		},
+		cfg.HealthChecks.DiskCheck.Interval,
+		cfg.HealthChecks.DiskCheck.Timeout,
+		cfg.HealthChecks.DiskCheck.Backoff,
+		cfg.HealthChecks.DiskCheck.Attempts,
+	)
+
+	// If we have not disabled all of our health checks, we create a
+	// liveliness monitor with our configured checks.
+	s.livelinessMonitor = healthcheck.NewMonitor(
+		&healthcheck.Config{
+			Checks: []*healthcheck.Observation{
+				chainHealthCheck, diskCheck,
+			},
+			Shutdown: srvrLog.Criticalf,
+		},
+	)
+
 	// Create the connection manager which will be responsible for
 	// maintaining persistent outbound connections and also accepting new
 	// incoming connections
@@ -1304,6 +1358,13 @@ func (s *server) Start() error {
 		}
 	}
 
+	if s.livelinessMonitor != nil {
+		if err := s.livelinessMonitor.Start(); err != nil {
+			startErr = err
+			return
+		}
+	}
+
 	// Start the notification server. This is used so channel
 	// management goroutines can be notified when a funding
 	// transaction reaches a sufficient number of confirmations, or
@@ -1535,6 +1596,13 @@ func (s *server) Stop() error {
 		}
 	}
 
+	if s.livelinessMonitor != nil {
+		if err := s.livelinessMonitor.Stop(); err != nil {
+			srvrLog.Warnf("unable to shutdown liveliness "+
+				"monitor: %v", err)
+		}
+	}
+
 	// Wait for all lingering goroutines to quit.
 	s.wg.Wait()
@@ -6,8 +6,10 @@
 package signal
 
 import (
+	"errors"
 	"os"
 	"os/signal"
+	"sync/atomic"
 	"syscall"
 )
 
@@ -19,6 +21,10 @@ var (
 	// gracefully, similar to when receiving SIGINT.
 	shutdownRequestChannel = make(chan struct{})
 
+	// started indicates whether we have started our main interrupt handler.
+	// This field should be used atomically.
+	started int32
+
 	// quit is closed when instructing the main interrupt handler to exit.
 	quit = make(chan struct{})
 
@@ -26,8 +32,13 @@ var (
 	shutdownChannel = make(chan struct{})
 )
 
-// Intercept starts the interception of interrupt signals.
-func Intercept() {
+// Intercept starts the interception of interrupt signals. Note that this
+// function can only be called once.
+func Intercept() error {
+	if !atomic.CompareAndSwapInt32(&started, 0, 1) {
+		return errors.New("intercept already started")
+	}
+
 	signalsToCatch := []os.Signal{
 		os.Interrupt,
 		os.Kill,
@@ -37,6 +48,8 @@ func Intercept() {
 	}
 	signal.Notify(interruptChannel, signalsToCatch...)
 	go mainInterruptHandler()
+
+	return nil
 }
 
 // mainInterruptHandler listens for SIGINT (Ctrl+C) signals on the
@@ -85,6 +98,20 @@ func mainInterruptHandler() {
 	}
 }
 
+// Listening returns true if the main interrupt handler has been started, and
+// has not been killed.
+func Listening() bool {
+	// If our started field is not set, we are not yet listening for
+	// interrupts.
+	if atomic.LoadInt32(&started) != 1 {
+		return false
+	}
+
+	// If we have started our main goroutine, we check whether we have
+	// stopped it yet.
+	return Alive()
+}
+
 // Alive returns true if the main interrupt handler has not been killed.
 func Alive() bool {
 	select {
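A short sketch of the new behaviour, outside this diff and purely illustrative: Intercept can only succeed once per process, and Listening gates shutdown requests so that they are only made when the handler goroutine is actually there to consume them.

package main

import (
	"fmt"
	"os"

	"github.com/lightningnetwork/lnd/signal"
)

func main() {
	if err := signal.Intercept(); err != nil {
		_, _ = fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// A second call fails rather than spawning another handler.
	if err := signal.Intercept(); err != nil {
		fmt.Println(err) // "intercept already started"
	}

	// Only request shutdown when the handler is running; per the package
	// comments, a request would otherwise block. This is the same guard
	// used by build.ShutdownLogger.
	if signal.Listening() {
		signal.RequestShutdown()
	}
}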