diff --git a/build/log_shutdown.go b/build/log_shutdown.go new file mode 100644 index 00000000..d83ff99e --- /dev/null +++ b/build/log_shutdown.go @@ -0,0 +1,53 @@ +package build + +import ( + "github.com/btcsuite/btclog" + "github.com/lightningnetwork/lnd/signal" +) + +// ShutdownLogger wraps an existing logger with a shutdown function which will +// be called on Critical/Criticalf to prompt shutdown. +type ShutdownLogger struct { + btclog.Logger +} + +// NewShutdownLogger creates a shutdown logger for the log provided which will +// use the signal package to request shutdown on critical errors. +func NewShutdownLogger(logger btclog.Logger) *ShutdownLogger { + return &ShutdownLogger{ + Logger: logger, + } +} + +// Criticalf formats message according to format specifier and writes to +// log with LevelCritical. It will then call the shutdown logger's shutdown +// function to prompt safe shutdown. +// +// Note: it is part of the btclog.Logger interface. +func (s *ShutdownLogger) Criticalf(format string, params ...interface{}) { + s.Logger.Criticalf(format, params...) + s.shutdown() +} + +// Critical formats message using the default formats for its operands +// and writes to log with LevelCritical. It will then call the shutdown +// logger's shutdown function to prompt safe shutdown. +// +// Note: it is part of the btclog.Logger interface. +func (s *ShutdownLogger) Critical(v ...interface{}) { + s.Logger.Critical(v) + s.shutdown() +} + +// shutdown checks whether we are listening for interrupts, since a shutdown +// request to the signal package will block if it is not running, and requests +// shutdown if possible. +func (s *ShutdownLogger) shutdown() { + if !signal.Listening() { + s.Logger.Info("Request for shutdown ignored") + return + } + + s.Logger.Info("Sending request for shutdown") + signal.RequestShutdown() +} diff --git a/build/logrotator.go b/build/logrotator.go index 734a3bd7..2474d2d8 100644 --- a/build/logrotator.go +++ b/build/logrotator.go @@ -39,7 +39,10 @@ func NewRotatingLogWriter() *RotatingLogWriter { logWriter := &LogWriter{} backendLog := btclog.NewBackend(logWriter) return &RotatingLogWriter{ - GenSubLogger: backendLog.Logger, + GenSubLogger: func(tag string) btclog.Logger { + logger := backendLog.Logger(tag) + return NewShutdownLogger(logger) + }, logWriter: logWriter, backendLog: backendLog, subsystemLoggers: SubLoggers{}, diff --git a/cmd/lncli/cmd_open_channel.go b/cmd/lncli/cmd_open_channel.go index b7e78d13..412306ba 100644 --- a/cmd/lncli/cmd_open_channel.go +++ b/cmd/lncli/cmd_open_channel.go @@ -414,7 +414,10 @@ func openChannelPsbt(ctx *cli.Context, client lnrpc.LightningClient, if err != nil { return fmt.Errorf("opening stream to server failed: %v", err) } - signal.Intercept() + + if err := signal.Intercept(); err != nil { + return err + } // We also need to spawn a goroutine that reads from the server. This // will copy the messages to the channel as long as they come in or add diff --git a/cmd/lnd/main.go b/cmd/lnd/main.go index 9364afbe..8fb99ed0 100644 --- a/cmd/lnd/main.go +++ b/cmd/lnd/main.go @@ -19,7 +19,10 @@ func main() { } // Hook interceptor for os signals. - signal.Intercept() + if err := signal.Intercept(); err != nil { + _, _ = fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } // Call the "real" main in a nested manner so the defers will properly // be executed in the case of a graceful shutdown. diff --git a/config.go b/config.go index 06f74979..8713aa8f 100644 --- a/config.go +++ b/config.go @@ -86,6 +86,21 @@ const ( // HostAnnouncer will wait between DNS resolutions to check if the // backing IP of a host has changed. defaultHostSampleInterval = time.Minute * 5 + + defaultChainInterval = time.Minute + defaultChainTimeout = time.Second * 10 + defaultChainBackoff = time.Second * 30 + defaultChainAttempts = 3 + + // By default, we will shutdown if less than 10% of disk space is + // available. We allow a longer interval for disk space checks, because + // this check is less likely to deteriorate quickly. However, we allow + // fewer retries because this should not be a flakey check. + defaultRequiredDisk = 0.1 + defaultDiskInterval = time.Hour * 12 + defaultDiskTimeout = time.Second * 5 + defaultDiskBackoff = time.Minute + defaultDiskAttempts = 2 ) var ( @@ -265,6 +280,8 @@ type Config struct { AllowCircularRoute bool `long:"allow-circular-route" description:"If true, our node will allow htlc forwards that arrive and depart on the same channel."` + HealthChecks *lncfg.HealthCheckConfig `group:"healthcheck" namespace:"healthcheck"` + DB *lncfg.DB `group:"db" namespace:"db"` // LogWriter is the root logger that all of the daemon's subloggers are @@ -383,6 +400,23 @@ func DefaultConfig() Config { Watchtower: &lncfg.Watchtower{ TowerDir: defaultTowerDir, }, + HealthChecks: &lncfg.HealthCheckConfig{ + ChainCheck: &lncfg.CheckConfig{ + Interval: defaultChainInterval, + Timeout: defaultChainTimeout, + Attempts: defaultChainAttempts, + Backoff: defaultChainBackoff, + }, + DiskCheck: &lncfg.DiskCheckConfig{ + RequiredRemaining: defaultRequiredDisk, + CheckConfig: &lncfg.CheckConfig{ + Interval: defaultDiskInterval, + Attempts: defaultDiskAttempts, + Timeout: defaultDiskTimeout, + Backoff: defaultDiskBackoff, + }, + }, + }, MaxOutgoingCltvExpiry: htlcswitch.DefaultMaxOutgoingCltvExpiry, MaxChannelFeeAllocation: htlcswitch.DefaultMaxLinkFeeAllocation, LogWriter: build.NewRotatingLogWriter(), @@ -1124,6 +1158,7 @@ func ValidateConfig(cfg Config, usageMessage string) (*Config, error) { cfg.Caches, cfg.WtClient, cfg.DB, + cfg.HealthChecks, ) if err != nil { return nil, err diff --git a/go.mod b/go.mod index 160da68a..3b83e961 100644 --- a/go.mod +++ b/go.mod @@ -66,6 +66,7 @@ require ( go.uber.org/zap v1.14.1 // indirect golang.org/x/crypto v0.0.0-20200709230013-948cd5f35899 golang.org/x/net v0.0.0-20191002035440-2ec189313ef0 + golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2 google.golang.org/grpc v1.24.0 gopkg.in/errgo.v1 v1.0.1 // indirect diff --git a/go.sum b/go.sum index e1c65788..3f2b8b91 100644 --- a/go.sum +++ b/go.sum @@ -332,6 +332,7 @@ golang.org/x/sys v0.0.0-20190904154756-749cb33beabd h1:DBH9mDw0zluJT/R+nGuV3jWFW golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200724161237-0e2f3a69832c h1:UIcGWL6/wpCfyGuJnRFJRurA+yj8RrW7Q6x2YMCXt6c= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2 h1:z99zHgr7hKfrUcX/KsoJk5FJfjTceCKIp96+biqP4To= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/healthcheck/diskcheck.go b/healthcheck/diskcheck.go new file mode 100644 index 00000000..c3e2cbb2 --- /dev/null +++ b/healthcheck/diskcheck.go @@ -0,0 +1,18 @@ +// +build !windows,!solaris + +package healthcheck + +import "syscall" + +// AvailableDiskSpace returns ratio of available disk space to total capacity. +func AvailableDiskSpace(path string) (float64, error) { + s := syscall.Statfs_t{} + err := syscall.Statfs(path, &s) + if err != nil { + return 0, err + } + + // Calculate our free blocks/total blocks to get our total ratio of + // free blocks. + return float64(s.Bfree) / float64(s.Blocks), nil +} diff --git a/healthcheck/diskcheck_solaris.go b/healthcheck/diskcheck_solaris.go new file mode 100644 index 00000000..d44330b7 --- /dev/null +++ b/healthcheck/diskcheck_solaris.go @@ -0,0 +1,17 @@ +package healthcheck + +import "golang.org/x/sys/unix" + +// AvailableDiskSpace returns ratio of available disk space to total capacity +// for solaris. +func AvailableDiskSpace(path string) (float64, error) { + s := unix.Statvfs_t{} + err := unix.Statvfs(path, &s) + if err != nil { + return 0, err + } + + // Calculate our free blocks/total blocks to get our total ratio of + // free blocks. + return float64(s.Bfree) / float64(s.Blocks), nil +} diff --git a/healthcheck/diskcheck_windows.go b/healthcheck/diskcheck_windows.go new file mode 100644 index 00000000..7fed088b --- /dev/null +++ b/healthcheck/diskcheck_windows.go @@ -0,0 +1,17 @@ +package healthcheck + +import "golang.org/x/sys/windows" + +// AvailableDiskSpace returns ratio of available disk space to total capacity +// for windows. +func AvailableDiskSpace(path string) (float64, error) { + var free, total, avail uint64 + + pathPtr, err := windows.UTF16PtrFromString(path) + if err != nil { + panic(err) + } + err = windows.GetDiskFreeSpaceEx(pathPtr, &free, &total, &avail) + + return float64(avail) / float64(total), nil +} diff --git a/healthcheck/healthcheck.go b/healthcheck/healthcheck.go new file mode 100644 index 00000000..a1f7864a --- /dev/null +++ b/healthcheck/healthcheck.go @@ -0,0 +1,231 @@ +// Package healthcheck contains a monitor which takes a set of liveliness checks +// which it periodically checks. If a check fails after its configured number +// of allowed call attempts, the monitor will send a request to shutdown using +// the function is is provided in its config. Checks are dispatched in their own +// goroutines so that they do not block each other. +package healthcheck + +import ( + "errors" + "fmt" + "sync" + "sync/atomic" + "time" + + "github.com/lightningnetwork/lnd/ticker" +) + +// Config contains configuration settings for our monitor. +type Config struct { + // Checks is a set of health checks that assert that lnd has access to + // critical resources. + Checks []*Observation + + // Shutdown should be called to request safe shutdown on failure of a + // health check. + Shutdown shutdownFunc +} + +// shutdownFunc is the signature we use for a shutdown function which allows us +// to print our reason for shutdown. +type shutdownFunc func(format string, params ...interface{}) + +// Monitor periodically checks a series of configured liveliness checks to +// ensure that lnd has access to all critical resources. +type Monitor struct { + started int32 // To be used atomically. + stopped int32 // To be used atomically. + + cfg *Config + + quit chan struct{} + wg sync.WaitGroup +} + +// NewMonitor returns a monitor with the provided config. +func NewMonitor(cfg *Config) *Monitor { + return &Monitor{ + cfg: cfg, + quit: make(chan struct{}), + } +} + +// Start launches the goroutines required to run our monitor. +func (m *Monitor) Start() error { + if !atomic.CompareAndSwapInt32(&m.started, 0, 1) { + return errors.New("monitor already started") + } + + // Run through all of the health checks that we have configured and + // start a goroutine for each check. + for _, check := range m.cfg.Checks { + check := check + + // Skip over health checks that are disabled by setting zero + // attempts. + if check.Attempts == 0 { + log.Warnf("check: %v configured with 0 attempts, "+ + "skipping it", check.Name) + + continue + } + + m.wg.Add(1) + go func() { + defer m.wg.Done() + check.monitor(m.cfg.Shutdown, m.quit) + }() + } + + return nil +} + +// Stop sends all goroutines the signal to exit and waits for them to exit. +func (m *Monitor) Stop() error { + if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) { + return fmt.Errorf("monitor already stopped") + } + + close(m.quit) + m.wg.Wait() + + return nil +} + +// CreateCheck is a helper function that takes a function that produces an error +// and wraps it in a function that returns its result on an error channel. +// We do not wait group the goroutine running our checkFunc because we expect +// to be dealing with health checks that may block; if we wait group them, we +// may wait forever. Ideally future health checks will allow callers to cancel +// them early, and we can wait group this. +func CreateCheck(checkFunc func() error) func() chan error { + return func() chan error { + errChan := make(chan error, 1) + go func() { + errChan <- checkFunc() + }() + + return errChan + } +} + +// Observation represents a liveliness check that we periodically check. +type Observation struct { + // Name describes the health check. + Name string + + // Check runs the health check itself, returning an error channel that + // is expected to receive nil or an error. + Check func() chan error + + // Interval is a ticker which triggers running our check function. This + // ticker must be started and stopped by the observation. + Interval ticker.Ticker + + // Attempts is the number of calls we make for a single check before + // failing. + Attempts int + + // Timeout is the amount of time we allow our check function to take + // before we time it out. + Timeout time.Duration + + // Backoff is the amount of time we back off between retries for failed + // checks. + Backoff time.Duration +} + +// NewObservation creates an observation. +func NewObservation(name string, check func() error, interval, + timeout, backoff time.Duration, attempts int) *Observation { + + return &Observation{ + Name: name, + Check: CreateCheck(check), + Interval: ticker.New(interval), + Attempts: attempts, + Timeout: timeout, + Backoff: backoff, + } +} + +// String returns a string representation of an observation. +func (o *Observation) String() string { + return o.Name +} + +// monitor executes a health check every time its interval ticks until the quit +// channel signals that we should shutdown. This function is also responsible +// for starting and stopping our ticker. +func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) { + log.Debugf("Monitoring: %v", o) + + o.Interval.Resume() + defer o.Interval.Stop() + + for { + select { + case <-o.Interval.Ticks(): + o.retryCheck(quit, shutdown) + + // Exit if we receive the instruction to shutdown. + case <-quit: + return + } + } +} + +// retryCheck calls a check function until it succeeds, or we reach our +// configured number of attempts, waiting for our back off period between failed +// calls. If we fail to obtain a passing health check after the allowed number +// of calls, we will request shutdown. +func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) { + var count int + + for count < o.Attempts { + // Increment our call count and call the health check endpoint. + count++ + + // Wait for our check to return, timeout to elapse, or quit + // signal to be received. + var err error + select { + case err = <-o.Check(): + + case <-time.After(o.Timeout): + err = fmt.Errorf("health check: %v timed out after: "+ + "%v", o, o.Timeout) + + case <-quit: + return + } + + // If our error is nil, we have passed our health check, so we + // can exit. + if err == nil { + return + } + + // If we have reached our allowed number of attempts, this + // check has failed so we request shutdown. + if count == o.Attempts { + shutdown("Health check: %v failed after %v "+ + "calls", o, o.Attempts) + + return + } + + log.Debugf("Health check: %v, call: %v failed with: %v, "+ + "backing off for: %v", o, count, err, o.Backoff) + + // If we are still within the number of calls allowed for this + // check, we wait for our back off period to elapse, or exit if + // we get the signal to shutdown. + select { + case <-time.After(o.Backoff): + + case <-quit: + return + } + } +} diff --git a/healthcheck/healthcheck_test.go b/healthcheck/healthcheck_test.go new file mode 100644 index 00000000..ed810ed7 --- /dev/null +++ b/healthcheck/healthcheck_test.go @@ -0,0 +1,225 @@ +package healthcheck + +import ( + "errors" + "testing" + "time" + + "github.com/lightningnetwork/lnd/ticker" + "github.com/stretchr/testify/require" +) + +var ( + errNonNil = errors.New("non-nil test error") + timeout = time.Second + testTime = time.Unix(1, 2) +) + +type mockedCheck struct { + t *testing.T + errChan chan error +} + +// newMockCheck creates a new mock. +func newMockCheck(t *testing.T) *mockedCheck { + return &mockedCheck{ + t: t, + errChan: make(chan error), + } +} + +// call returns our mock's error channel, which we can send responses on. +func (m *mockedCheck) call() chan error { + return m.errChan +} + +// sendError sends an error into our mock's error channel, mocking the sending +// of a response from our check function. +func (m *mockedCheck) sendError(err error) { + select { + case m.errChan <- err: + case <-time.After(timeout): + m.t.Fatalf("could not send error: %v", err) + } +} + +// TestMonitor tests creation and triggering of a monitor with a health check. +func TestMonitor(t *testing.T) { + intervalTicker := ticker.NewForce(time.Hour) + + mock := newMockCheck(t) + shutdown := make(chan struct{}) + + // Create our config for monitoring. We will use a 0 back off so that + // out test does not need to wait. + cfg := &Config{ + Checks: []*Observation{ + { + Check: mock.call, + Interval: intervalTicker, + Attempts: 2, + Backoff: 0, + Timeout: time.Hour, + }, + }, + Shutdown: func(string, ...interface{}) { + shutdown <- struct{}{} + }, + } + monitor := NewMonitor(cfg) + + require.NoError(t, monitor.Start(), "could not start monitor") + + // Tick is a helper we will use to tick our interval. + tick := func() { + select { + case intervalTicker.Force <- testTime: + case <-time.After(timeout): + t.Fatal("could not tick timer") + } + } + + // Tick our timer and provide our error channel with a nil error. This + // mocks our check function succeeding on the first call. + tick() + mock.sendError(nil) + + // Now we tick our timer again. This time send a non-nil error, followed + // by a nil error. This tests our retry logic, because we allow 2 + // retries, so should recover without needing to shutdown. + tick() + mock.sendError(errNonNil) + mock.sendError(nil) + + // Finally, we tick our timer once more, and send two non-nil errors + // into our error channel. This mocks our check function failing twice. + tick() + mock.sendError(errNonNil) + mock.sendError(errNonNil) + + // Since we have failed within our allowed number of retries, we now + // expect a call to our shutdown function. + select { + case <-shutdown: + case <-time.After(timeout): + t.Fatal("expected shutdown") + } + + require.NoError(t, monitor.Stop(), "could not stop monitor") +} + +// TestRetryCheck tests our retry logic. It does not include a test for exiting +// during the back off period. +func TestRetryCheck(t *testing.T) { + tests := []struct { + name string + + // errors provides an in-order list of errors that we expect our + // health check to respond with. The number of errors in this + // list indicates the number of times we expect our check to + // be called, because our test will fail if we do not consume + // every error. + errors []error + + // attempts is the number of times we call a check before + // failing. + attempts int + + // timeout is the time we allow our check to take before we + // fail them. + timeout time.Duration + + // expectedShutdown is true if we expect a shutdown to be + // triggered because all of our calls failed. + expectedShutdown bool + }{ + { + name: "first call succeeds", + errors: []error{nil}, + attempts: 2, + timeout: time.Hour, + expectedShutdown: false, + }, + { + name: "first call fails", + errors: []error{errNonNil}, + attempts: 1, + timeout: time.Hour, + expectedShutdown: true, + }, + { + name: "fail then recover", + errors: []error{errNonNil, nil}, + attempts: 2, + timeout: time.Hour, + expectedShutdown: false, + }, + { + name: "always fail", + errors: []error{errNonNil, errNonNil}, + attempts: 2, + timeout: time.Hour, + expectedShutdown: true, + }, + { + name: "no calls", + errors: nil, + attempts: 0, + timeout: time.Hour, + expectedShutdown: false, + }, + { + name: "call times out", + errors: nil, + attempts: 1, + timeout: 1, + expectedShutdown: true, + }, + } + + for _, test := range tests { + test := test + + t.Run(test.name, func(t *testing.T) { + var shutdown bool + shutdownFunc := func(string, ...interface{}) { + shutdown = true + } + + mock := newMockCheck(t) + + // Create an observation that calls our call counting + // function. We set a zero back off so that the test + // will not wait. + observation := &Observation{ + Check: mock.call, + Attempts: test.attempts, + Timeout: test.timeout, + Backoff: 0, + } + quit := make(chan struct{}) + + // Run our retry check in a goroutine because it blocks + // on us sending errors into the mocked caller's error + // channel. + done := make(chan struct{}) + go func() { + observation.retryCheck(quit, shutdownFunc) + close(done) + }() + + // Prompt our mock caller to send responses for calls + // to our call function. + for _, err := range test.errors { + mock.sendError(err) + } + + // Make sure that we have finished running our retry + // check function before we start checking results. + <-done + + require.Equal(t, test.expectedShutdown, shutdown, + "unexpected shutdown state") + }) + } +} diff --git a/healthcheck/log.go b/healthcheck/log.go new file mode 100644 index 00000000..159aa102 --- /dev/null +++ b/healthcheck/log.go @@ -0,0 +1,32 @@ +package healthcheck + +import ( + "github.com/btcsuite/btclog" + "github.com/lightningnetwork/lnd/build" +) + +// Subsystem defines the logging code for this subsystem. +const Subsystem = "HLCK" + +// log is a logger that is initialized with no output filters. This +// means the package will not perform any logging by default until the caller +// requests it. +var log btclog.Logger + +// The default amount of logging is none. +func init() { + UseLogger(build.NewSubLogger(Subsystem, nil)) +} + +// DisableLog disables all library log output. Logging output is disabled +// by default until UseLogger is called. +func DisableLog() { + UseLogger(btclog.Disabled) +} + +// UseLogger uses a specified Logger to output package logging info. +// This should be used in preference to SetLogWriter if the caller is also +// using btclog. +func UseLogger(logger btclog.Logger) { + log = logger +} diff --git a/lncfg/healthcheck.go b/lncfg/healthcheck.go new file mode 100644 index 00000000..a43505ca --- /dev/null +++ b/lncfg/healthcheck.go @@ -0,0 +1,90 @@ +package lncfg + +import ( + "errors" + "fmt" + "time" +) + +var ( + // MinHealthCheckInterval is the minimum interval we allow between + // health checks. + MinHealthCheckInterval = time.Minute + + // MinHealthCheckTimeout is the minimum timeout we allow for health + // check calls. + MinHealthCheckTimeout = time.Second + + // MinHealthCheckBackoff is the minimum back off we allow between health + // check retries. + MinHealthCheckBackoff = time.Second +) + +// HealthCheckConfig contains the configuration for the different health checks +// the lnd runs. +type HealthCheckConfig struct { + ChainCheck *CheckConfig `group:"chainbackend" namespace:"chainbackend"` + + DiskCheck *DiskCheckConfig `group:"diskspace" namespace:"diskspace"` +} + +// Validate checks the values configured for our health checks. +func (h *HealthCheckConfig) Validate() error { + if err := h.ChainCheck.validate("chain backend"); err != nil { + return err + } + + if err := h.DiskCheck.validate("disk space"); err != nil { + return err + } + + if h.DiskCheck.RequiredRemaining < 0 || + h.DiskCheck.RequiredRemaining >= 1 { + + return errors.New("disk required ratio must be in [0:1)") + } + + return nil +} + +type CheckConfig struct { + Interval time.Duration `long:"interval" description:"How often to run a health check."` + + Attempts int `long:"attempts" description:"The number of calls we will make for the check before failing. Set this value to 0 to disable a check."` + + Timeout time.Duration `long:"timeout" description:"The amount of time we allow the health check to take before failing due to timeout."` + + Backoff time.Duration `long:"backoff" description:"The amount of time to back-off between failed health checks."` +} + +// validate checks the values in a health check config entry if it is enabled. +func (c *CheckConfig) validate(name string) error { + if c.Attempts == 0 { + return nil + } + + if c.Backoff < MinHealthCheckBackoff { + return fmt.Errorf("%v backoff: %v below minimum: %v", name, + c.Backoff, MinHealthCheckBackoff) + } + + if c.Timeout < MinHealthCheckTimeout { + return fmt.Errorf("%v timeout: %v below minimum: %v", name, + c.Timeout, MinHealthCheckTimeout) + } + + if c.Interval < MinHealthCheckInterval { + return fmt.Errorf("%v interval: %v below minimum: %v", name, + c.Interval, MinHealthCheckInterval) + } + + return nil +} + +// DiskCheckConfig contains configuration for ensuring that our node has +// sufficient disk space. +type DiskCheckConfig struct { + RequiredRemaining float64 `long:"diskrequired" description:"The minimum ratio of free disk space to total capacity that we allow before shutting lnd down safely."` + + *CheckConfig +} diff --git a/log.go b/log.go index f809acdf..59183b7c 100644 --- a/log.go +++ b/log.go @@ -16,6 +16,7 @@ import ( "github.com/lightningnetwork/lnd/channelnotifier" "github.com/lightningnetwork/lnd/contractcourt" "github.com/lightningnetwork/lnd/discovery" + "github.com/lightningnetwork/lnd/healthcheck" "github.com/lightningnetwork/lnd/htlcswitch" "github.com/lightningnetwork/lnd/invoices" "github.com/lightningnetwork/lnd/lnrpc/autopilotrpc" @@ -129,6 +130,7 @@ func SetupLoggers(root *build.RotatingLogWriter) { AddSubLogger(root, routerrpc.Subsystem, routerrpc.UseLogger) AddSubLogger(root, chanfitness.Subsystem, chanfitness.UseLogger) AddSubLogger(root, verrpc.Subsystem, verrpc.UseLogger) + AddSubLogger(root, healthcheck.Subsystem, healthcheck.UseLogger) } // AddSubLogger is a helper method to conveniently create and register the diff --git a/mobile/bindings.go b/mobile/bindings.go index fcf8c11d..aff3c819 100644 --- a/mobile/bindings.go +++ b/mobile/bindings.go @@ -54,7 +54,10 @@ func Start(extraArgs string, unlockerReady, rpcReady Callback) { } // Hook interceptor for os signals. - signal.Intercept() + if err := signal.Intercept(); err != nil { + _, _ = fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } // Set up channels that will be notified when the RPC servers are ready // to accept calls. diff --git a/sample-lnd.conf b/sample-lnd.conf index b9cc0db6..2eb0b6bd 100644 --- a/sample-lnd.conf +++ b/sample-lnd.conf @@ -400,3 +400,39 @@ litecoin.node=ltcd ; sweep funds if a breach occurs while being offline. The fee rate should be ; specified in sat/byte, the default is 10 sat/byte. ; wtclient.sweep-fee-rate=10 + +[healthcheck] +; The number of times we should attempt to query our chain backend before +; gracefully shutting down. Set this value to 0 to disable this health check. +; healthcheck.chainbackend.attempts=3 + +; The amount of time we allow a call to our chain backend to take before we fail +; the attempt. This value must be >= 1s. +; healthcheck.chainbackend.timeout=10s + +; The amount of time we should backoff between failed attempts to query chain +; backend. This value must be >= 1s. +; healthcheck.chainbackend.backoff=30s + +; The amount of time we should wait between chain backend health checks. This +; value must be >= 1m. +; healthcheck.chainbackend.interval=1m + +; The minimum ratio of free disk space to total capacity that we require. +; healthcheck.diskspace.diskrequired=0.1 + +; The number of times we should attempt to query our available disk space before +; gracefully shutting down. Set this value to 0 to disable this health check. +; healthcheck.diskspace.attempts=2 + +; The amount of time we allow a query for our available disk space to take +; before we fail the attempt. This value must be >= 1s. +; healthcheck.diskspace.timeout=5s + +; The amount of time we should backoff between failed attempts to query +; available disk space. This value must be >= 1s. +; healthcheck.diskspace.backoff=1m + +; The amount of time we should wait between disk space health checks. This +; value must be >= 1m. +; healthcheck.diskspace.interval=6h diff --git a/server.go b/server.go index 5db6e7be..626c9099 100644 --- a/server.go +++ b/server.go @@ -37,6 +37,7 @@ import ( "github.com/lightningnetwork/lnd/contractcourt" "github.com/lightningnetwork/lnd/discovery" "github.com/lightningnetwork/lnd/feature" + "github.com/lightningnetwork/lnd/healthcheck" "github.com/lightningnetwork/lnd/htlcswitch" "github.com/lightningnetwork/lnd/htlcswitch/hop" "github.com/lightningnetwork/lnd/input" @@ -276,6 +277,9 @@ type server struct { hostAnn *netann.HostAnnouncer + // livelinessMonitor monitors that lnd has access to critical resources. + livelinessMonitor *healthcheck.Monitor + quit chan struct{} wg sync.WaitGroup @@ -1254,6 +1258,56 @@ func newServer(cfg *Config, listenAddrs []net.Addr, }) } + // Create a set of health checks using our configured values. If a + // health check has been disabled by setting attempts to 0, our monitor + // will not run it. + chainHealthCheck := healthcheck.NewObservation( + "chain backend", + func() error { + _, _, err := cc.chainIO.GetBestBlock() + return err + }, + cfg.HealthChecks.ChainCheck.Interval, + cfg.HealthChecks.ChainCheck.Timeout, + cfg.HealthChecks.ChainCheck.Backoff, + cfg.HealthChecks.ChainCheck.Attempts, + ) + + diskCheck := healthcheck.NewObservation( + "disk space", + func() error { + free, err := healthcheck.AvailableDiskSpace(cfg.LndDir) + if err != nil { + return err + } + + // If we have more free space than we require, + // we return a nil error. + if free > cfg.HealthChecks.DiskCheck.RequiredRemaining { + return nil + } + + return fmt.Errorf("require: %v free space, got: %v", + cfg.HealthChecks.DiskCheck.RequiredRemaining, + free) + }, + cfg.HealthChecks.DiskCheck.Interval, + cfg.HealthChecks.DiskCheck.Timeout, + cfg.HealthChecks.DiskCheck.Backoff, + cfg.HealthChecks.DiskCheck.Attempts, + ) + + // If we have not disabled all of our health checks, we create a + // liveliness monitor with our configured checks. + s.livelinessMonitor = healthcheck.NewMonitor( + &healthcheck.Config{ + Checks: []*healthcheck.Observation{ + chainHealthCheck, diskCheck, + }, + Shutdown: srvrLog.Criticalf, + }, + ) + // Create the connection manager which will be responsible for // maintaining persistent outbound connections and also accepting new // incoming connections @@ -1304,6 +1358,13 @@ func (s *server) Start() error { } } + if s.livelinessMonitor != nil { + if err := s.livelinessMonitor.Start(); err != nil { + startErr = err + return + } + } + // Start the notification server. This is used so channel // management goroutines can be notified when a funding // transaction reaches a sufficient number of confirmations, or @@ -1535,6 +1596,13 @@ func (s *server) Stop() error { } } + if s.livelinessMonitor != nil { + if err := s.livelinessMonitor.Stop(); err != nil { + srvrLog.Warnf("unable to shutdown liveliness "+ + "monitor: %v", err) + } + } + // Wait for all lingering goroutines to quit. s.wg.Wait() diff --git a/signal/signal.go b/signal/signal.go index 82e503d0..927931fd 100644 --- a/signal/signal.go +++ b/signal/signal.go @@ -6,8 +6,10 @@ package signal import ( + "errors" "os" "os/signal" + "sync/atomic" "syscall" ) @@ -19,6 +21,10 @@ var ( // gracefully, similar to when receiving SIGINT. shutdownRequestChannel = make(chan struct{}) + // started indicates whether we have started our main interrupt handler. + // This field should be used atomically. + started int32 + // quit is closed when instructing the main interrupt handler to exit. quit = make(chan struct{}) @@ -26,8 +32,13 @@ var ( shutdownChannel = make(chan struct{}) ) -// Intercept starts the interception of interrupt signals. -func Intercept() { +// Intercept starts the interception of interrupt signals. Note that this +// function can only be called once. +func Intercept() error { + if !atomic.CompareAndSwapInt32(&started, 0, 1) { + return errors.New("intercept already started") + } + signalsToCatch := []os.Signal{ os.Interrupt, os.Kill, @@ -37,6 +48,8 @@ func Intercept() { } signal.Notify(interruptChannel, signalsToCatch...) go mainInterruptHandler() + + return nil } // mainInterruptHandler listens for SIGINT (Ctrl+C) signals on the @@ -85,6 +98,20 @@ func mainInterruptHandler() { } } +// Listening returns true if the main interrupt handler has been started, and +// has not been killed. +func Listening() bool { + // If our started field is not set, we are not yet listening for + // interrupts. + if atomic.LoadInt32(&started) != 1 { + return false + } + + // If we have started our main goroutine, we check whether we have + // stopped it yet. + return Alive() +} + // Alive returns true if the main interrupt handler has not been killed. func Alive() bool { select {