Merge pull request #4479 from carlaKC/4306-shutdownlogger
healthcheck: add regular chain backend and disk space checks
commit 1b8fbba615

build/log_shutdown.go  (new file, 53 lines)
@@ -0,0 +1,53 @@
package build

import (
    "github.com/btcsuite/btclog"

    "github.com/lightningnetwork/lnd/signal"
)

// ShutdownLogger wraps an existing logger with a shutdown function which will
// be called on Critical/Criticalf to prompt shutdown.
type ShutdownLogger struct {
    btclog.Logger
}

// NewShutdownLogger creates a shutdown logger for the log provided which will
// use the signal package to request shutdown on critical errors.
func NewShutdownLogger(logger btclog.Logger) *ShutdownLogger {
    return &ShutdownLogger{
        Logger: logger,
    }
}

// Criticalf formats message according to format specifier and writes to
// log with LevelCritical. It will then call the shutdown logger's shutdown
// function to prompt safe shutdown.
//
// Note: it is part of the btclog.Logger interface.
func (s *ShutdownLogger) Criticalf(format string, params ...interface{}) {
    s.Logger.Criticalf(format, params...)
    s.shutdown()
}

// Critical formats message using the default formats for its operands
// and writes to log with LevelCritical. It will then call the shutdown
// logger's shutdown function to prompt safe shutdown.
//
// Note: it is part of the btclog.Logger interface.
func (s *ShutdownLogger) Critical(v ...interface{}) {
    s.Logger.Critical(v)
    s.shutdown()
}

// shutdown checks whether we are listening for interrupts, since a shutdown
// request to the signal package will block if it is not running, and requests
// shutdown if possible.
func (s *ShutdownLogger) shutdown() {
    if !signal.Listening() {
        s.Logger.Info("Request for shutdown ignored")
        return
    }

    s.Logger.Info("Sending request for shutdown")
    signal.RequestShutdown()
}
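A minimal usage sketch of the wrapper above (not part of the commit): it wraps a plain btclog backend directly instead of going through the rotating log writer, and assumes signal.ShutdownChannel() exposes the signal package's shutdown channel.

package main

import (
    "errors"
    "os"

    "github.com/btcsuite/btclog"

    "github.com/lightningnetwork/lnd/build"
    "github.com/lightningnetwork/lnd/signal"
)

func main() {
    // Start the interrupt handler first: ShutdownLogger only calls
    // signal.RequestShutdown() when signal.Listening() reports true.
    if err := signal.Intercept(); err != nil {
        panic(err)
    }

    // Wrap a plain btclog logger so that critical logs also request a
    // graceful shutdown.
    backend := btclog.NewBackend(os.Stdout)
    logger := build.NewShutdownLogger(backend.Logger("XMPL"))

    // Writes the critical log line, then asks the signal package to shut
    // the process down.
    logger.Criticalf("unrecoverable error: %v", errors.New("example"))

    // Wait for the shutdown request to be delivered (assumed accessor for
    // the package's shutdown channel).
    <-signal.ShutdownChannel()
}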
@@ -39,7 +39,10 @@ func NewRotatingLogWriter() *RotatingLogWriter {
    logWriter := &LogWriter{}
    backendLog := btclog.NewBackend(logWriter)
    return &RotatingLogWriter{
-       GenSubLogger: backendLog.Logger,
+       GenSubLogger: func(tag string) btclog.Logger {
+           logger := backendLog.Logger(tag)
+           return NewShutdownLogger(logger)
+       },
        logWriter:        logWriter,
        backendLog:       backendLog,
        subsystemLoggers: SubLoggers{},
@@ -414,7 +414,10 @@ func openChannelPsbt(ctx *cli.Context, client lnrpc.LightningClient,
    if err != nil {
        return fmt.Errorf("opening stream to server failed: %v", err)
    }
-   signal.Intercept()
+
+   if err := signal.Intercept(); err != nil {
+       return err
+   }

    // We also need to spawn a goroutine that reads from the server. This
    // will copy the messages to the channel as long as they come in or add
@@ -19,7 +19,10 @@ func main() {
    }

    // Hook interceptor for os signals.
-   signal.Intercept()
+   if err := signal.Intercept(); err != nil {
+       _, _ = fmt.Fprintln(os.Stderr, err)
+       os.Exit(1)
+   }

    // Call the "real" main in a nested manner so the defers will properly
    // be executed in the case of a graceful shutdown.
config.go  (35 lines changed)
@@ -86,6 +86,21 @@ const (
    // HostAnnouncer will wait between DNS resolutions to check if the
    // backing IP of a host has changed.
    defaultHostSampleInterval = time.Minute * 5

    defaultChainInterval = time.Minute
    defaultChainTimeout  = time.Second * 10
    defaultChainBackoff  = time.Second * 30
    defaultChainAttempts = 3

    // By default, we will shutdown if less than 10% of disk space is
    // available. We allow a longer interval for disk space checks, because
    // this check is less likely to deteriorate quickly. However, we allow
    // fewer retries because this should not be a flakey check.
    defaultRequiredDisk = 0.1
    defaultDiskInterval = time.Hour * 12
    defaultDiskTimeout  = time.Second * 5
    defaultDiskBackoff  = time.Minute
    defaultDiskAttempts = 2
)

var (

@@ -265,6 +280,8 @@ type Config struct {
    AllowCircularRoute bool `long:"allow-circular-route" description:"If true, our node will allow htlc forwards that arrive and depart on the same channel."`

    HealthChecks *lncfg.HealthCheckConfig `group:"healthcheck" namespace:"healthcheck"`

    DB *lncfg.DB `group:"db" namespace:"db"`

    // LogWriter is the root logger that all of the daemon's subloggers are

@@ -383,6 +400,23 @@ func DefaultConfig() Config {
        Watchtower: &lncfg.Watchtower{
            TowerDir: defaultTowerDir,
        },
        HealthChecks: &lncfg.HealthCheckConfig{
            ChainCheck: &lncfg.CheckConfig{
                Interval: defaultChainInterval,
                Timeout:  defaultChainTimeout,
                Attempts: defaultChainAttempts,
                Backoff:  defaultChainBackoff,
            },
            DiskCheck: &lncfg.DiskCheckConfig{
                RequiredRemaining: defaultRequiredDisk,
                CheckConfig: &lncfg.CheckConfig{
                    Interval: defaultDiskInterval,
                    Attempts: defaultDiskAttempts,
                    Timeout:  defaultDiskTimeout,
                    Backoff:  defaultDiskBackoff,
                },
            },
        },
        MaxOutgoingCltvExpiry:   htlcswitch.DefaultMaxOutgoingCltvExpiry,
        MaxChannelFeeAllocation: htlcswitch.DefaultMaxLinkFeeAllocation,
        LogWriter:               build.NewRotatingLogWriter(),

@@ -1124,6 +1158,7 @@ func ValidateConfig(cfg Config, usageMessage string) (*Config, error) {
        cfg.Caches,
        cfg.WtClient,
        cfg.DB,
        cfg.HealthChecks,
    )
    if err != nil {
        return nil, err
go.mod  (1 line changed)
@@ -66,6 +66,7 @@ require (
    go.uber.org/zap v1.14.1 // indirect
    golang.org/x/crypto v0.0.0-20200709230013-948cd5f35899
    golang.org/x/net v0.0.0-20191002035440-2ec189313ef0
    golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5
    golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2
    google.golang.org/grpc v1.24.0
    gopkg.in/errgo.v1 v1.0.1 // indirect

go.sum  (1 line changed)
@@ -332,6 +332,7 @@ golang.org/x/sys v0.0.0-20190904154756-749cb33beabd h1:DBH9mDw0zluJT/R+nGuV3jWFW
golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200724161237-0e2f3a69832c h1:UIcGWL6/wpCfyGuJnRFJRurA+yj8RrW7Q6x2YMCXt6c=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2 h1:z99zHgr7hKfrUcX/KsoJk5FJfjTceCKIp96+biqP4To=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
healthcheck/diskcheck.go  (new file, 18 lines)
@@ -0,0 +1,18 @@
// +build !windows,!solaris

package healthcheck

import "syscall"

// AvailableDiskSpace returns ratio of available disk space to total capacity.
func AvailableDiskSpace(path string) (float64, error) {
    s := syscall.Statfs_t{}
    err := syscall.Statfs(path, &s)
    if err != nil {
        return 0, err
    }

    // Calculate our free blocks/total blocks to get our total ratio of
    // free blocks.
    return float64(s.Bfree) / float64(s.Blocks), nil
}
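A rough, hypothetical illustration (not part of the commit) of how this ratio is consumed, mirroring the server wiring further down; the 0.1 threshold is the defaultRequiredDisk value from config.go.

package main

import (
    "fmt"
    "log"

    "github.com/lightningnetwork/lnd/healthcheck"
)

func main() {
    // Ratio of free blocks to total blocks for the volume backing the
    // given directory, in the range [0, 1].
    free, err := healthcheck.AvailableDiskSpace("/tmp")
    if err != nil {
        log.Fatal(err)
    }

    // Treat anything at or below the 10% default threshold as a failed
    // health check.
    if free <= 0.1 {
        fmt.Printf("only %.2f of capacity free, check would fail\n", free)
        return
    }

    fmt.Printf("disk ok: %.2f of capacity free\n", free)
}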
healthcheck/diskcheck_solaris.go  (new file, 17 lines)
@@ -0,0 +1,17 @@
package healthcheck

import "golang.org/x/sys/unix"

// AvailableDiskSpace returns ratio of available disk space to total capacity
// for solaris.
func AvailableDiskSpace(path string) (float64, error) {
    s := unix.Statvfs_t{}
    err := unix.Statvfs(path, &s)
    if err != nil {
        return 0, err
    }

    // Calculate our free blocks/total blocks to get our total ratio of
    // free blocks.
    return float64(s.Bfree) / float64(s.Blocks), nil
}

healthcheck/diskcheck_windows.go  (new file, 17 lines)
@@ -0,0 +1,17 @@
package healthcheck

import "golang.org/x/sys/windows"

// AvailableDiskSpace returns ratio of available disk space to total capacity
// for windows.
func AvailableDiskSpace(path string) (float64, error) {
    var free, total, avail uint64

    pathPtr, err := windows.UTF16PtrFromString(path)
    if err != nil {
        panic(err)
    }
    err = windows.GetDiskFreeSpaceEx(pathPtr, &free, &total, &avail)

    return float64(avail) / float64(total), nil
}
healthcheck/healthcheck.go  (new file, 231 lines)
@@ -0,0 +1,231 @@
// Package healthcheck contains a monitor which takes a set of liveliness checks
// which it periodically checks. If a check fails after its configured number
// of allowed call attempts, the monitor will send a request to shutdown using
// the function it is provided in its config. Checks are dispatched in their own
// goroutines so that they do not block each other.
package healthcheck

import (
    "errors"
    "fmt"
    "sync"
    "sync/atomic"
    "time"

    "github.com/lightningnetwork/lnd/ticker"
)

// Config contains configuration settings for our monitor.
type Config struct {
    // Checks is a set of health checks that assert that lnd has access to
    // critical resources.
    Checks []*Observation

    // Shutdown should be called to request safe shutdown on failure of a
    // health check.
    Shutdown shutdownFunc
}

// shutdownFunc is the signature we use for a shutdown function which allows us
// to print our reason for shutdown.
type shutdownFunc func(format string, params ...interface{})

// Monitor periodically checks a series of configured liveliness checks to
// ensure that lnd has access to all critical resources.
type Monitor struct {
    started int32 // To be used atomically.
    stopped int32 // To be used atomically.

    cfg *Config

    quit chan struct{}
    wg   sync.WaitGroup
}

// NewMonitor returns a monitor with the provided config.
func NewMonitor(cfg *Config) *Monitor {
    return &Monitor{
        cfg:  cfg,
        quit: make(chan struct{}),
    }
}

// Start launches the goroutines required to run our monitor.
func (m *Monitor) Start() error {
    if !atomic.CompareAndSwapInt32(&m.started, 0, 1) {
        return errors.New("monitor already started")
    }

    // Run through all of the health checks that we have configured and
    // start a goroutine for each check.
    for _, check := range m.cfg.Checks {
        check := check

        // Skip over health checks that are disabled by setting zero
        // attempts.
        if check.Attempts == 0 {
            log.Warnf("check: %v configured with 0 attempts, "+
                "skipping it", check.Name)

            continue
        }

        m.wg.Add(1)
        go func() {
            defer m.wg.Done()
            check.monitor(m.cfg.Shutdown, m.quit)
        }()
    }

    return nil
}

// Stop sends all goroutines the signal to exit and waits for them to exit.
func (m *Monitor) Stop() error {
    if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) {
        return fmt.Errorf("monitor already stopped")
    }

    close(m.quit)
    m.wg.Wait()

    return nil
}

// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
// to be dealing with health checks that may block; if we wait group them, we
// may wait forever. Ideally future health checks will allow callers to cancel
// them early, and we can wait group this.
func CreateCheck(checkFunc func() error) func() chan error {
    return func() chan error {
        errChan := make(chan error, 1)
        go func() {
            errChan <- checkFunc()
        }()

        return errChan
    }
}

// Observation represents a liveliness check that we periodically check.
type Observation struct {
    // Name describes the health check.
    Name string

    // Check runs the health check itself, returning an error channel that
    // is expected to receive nil or an error.
    Check func() chan error

    // Interval is a ticker which triggers running our check function. This
    // ticker must be started and stopped by the observation.
    Interval ticker.Ticker

    // Attempts is the number of calls we make for a single check before
    // failing.
    Attempts int

    // Timeout is the amount of time we allow our check function to take
    // before we time it out.
    Timeout time.Duration

    // Backoff is the amount of time we back off between retries for failed
    // checks.
    Backoff time.Duration
}

// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
    timeout, backoff time.Duration, attempts int) *Observation {

    return &Observation{
        Name:     name,
        Check:    CreateCheck(check),
        Interval: ticker.New(interval),
        Attempts: attempts,
        Timeout:  timeout,
        Backoff:  backoff,
    }
}

// String returns a string representation of an observation.
func (o *Observation) String() string {
    return o.Name
}

// monitor executes a health check every time its interval ticks until the quit
// channel signals that we should shutdown. This function is also responsible
// for starting and stopping our ticker.
func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
    log.Debugf("Monitoring: %v", o)

    o.Interval.Resume()
    defer o.Interval.Stop()

    for {
        select {
        case <-o.Interval.Ticks():
            o.retryCheck(quit, shutdown)

        // Exit if we receive the instruction to shutdown.
        case <-quit:
            return
        }
    }
}

// retryCheck calls a check function until it succeeds, or we reach our
// configured number of attempts, waiting for our back off period between failed
// calls. If we fail to obtain a passing health check after the allowed number
// of calls, we will request shutdown.
func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
    var count int

    for count < o.Attempts {
        // Increment our call count and call the health check endpoint.
        count++

        // Wait for our check to return, timeout to elapse, or quit
        // signal to be received.
        var err error
        select {
        case err = <-o.Check():

        case <-time.After(o.Timeout):
            err = fmt.Errorf("health check: %v timed out after: "+
                "%v", o, o.Timeout)

        case <-quit:
            return
        }

        // If our error is nil, we have passed our health check, so we
        // can exit.
        if err == nil {
            return
        }

        // If we have reached our allowed number of attempts, this
        // check has failed so we request shutdown.
        if count == o.Attempts {
            shutdown("Health check: %v failed after %v "+
                "calls", o, o.Attempts)

            return
        }

        log.Debugf("Health check: %v, call: %v failed with: %v, "+
            "backing off for: %v", o, count, err, o.Backoff)

        // If we are still within the number of calls allowed for this
        // check, we wait for our back off period to elapse, or exit if
        // we get the signal to shutdown.
        select {
        case <-time.After(o.Backoff):

        case <-quit:
            return
        }
    }
}
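To make the API above concrete, a hypothetical usage sketch (not part of the commit); the "datastore" check, its parameters, and the plain-log shutdown function are invented for illustration.

package main

import (
    "log"
    "time"

    "github.com/lightningnetwork/lnd/healthcheck"
)

// pingDatastore is a stand-in for a real liveliness probe.
func pingDatastore() error {
    return nil
}

func main() {
    // An invented check: healthy as long as pingDatastore returns nil
    // within the configured timeout.
    dbCheck := healthcheck.NewObservation(
        "datastore", pingDatastore,
        time.Minute,    // interval between checks
        10*time.Second, // per-call timeout
        30*time.Second, // backoff between failed calls
        3,              // attempts before requesting shutdown
    )

    monitor := healthcheck.NewMonitor(&healthcheck.Config{
        Checks: []*healthcheck.Observation{dbCheck},
        // In lnd this is srvrLog.Criticalf, which the ShutdownLogger turns
        // into a shutdown request; here we just log the reason.
        Shutdown: func(format string, params ...interface{}) {
            log.Printf("shutting down: "+format, params...)
        },
    })

    if err := monitor.Start(); err != nil {
        log.Fatal(err)
    }

    // Let the monitor run; in lnd it lives for the lifetime of the server.
    time.Sleep(5 * time.Minute)

    if err := monitor.Stop(); err != nil {
        log.Fatal(err)
    }
}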
healthcheck/healthcheck_test.go  (new file, 225 lines)
@@ -0,0 +1,225 @@
package healthcheck

import (
    "errors"
    "testing"
    "time"

    "github.com/lightningnetwork/lnd/ticker"
    "github.com/stretchr/testify/require"
)

var (
    errNonNil = errors.New("non-nil test error")
    timeout   = time.Second
    testTime  = time.Unix(1, 2)
)

type mockedCheck struct {
    t       *testing.T
    errChan chan error
}

// newMockCheck creates a new mock.
func newMockCheck(t *testing.T) *mockedCheck {
    return &mockedCheck{
        t:       t,
        errChan: make(chan error),
    }
}

// call returns our mock's error channel, which we can send responses on.
func (m *mockedCheck) call() chan error {
    return m.errChan
}

// sendError sends an error into our mock's error channel, mocking the sending
// of a response from our check function.
func (m *mockedCheck) sendError(err error) {
    select {
    case m.errChan <- err:
    case <-time.After(timeout):
        m.t.Fatalf("could not send error: %v", err)
    }
}

// TestMonitor tests creation and triggering of a monitor with a health check.
func TestMonitor(t *testing.T) {
    intervalTicker := ticker.NewForce(time.Hour)

    mock := newMockCheck(t)
    shutdown := make(chan struct{})

    // Create our config for monitoring. We will use a 0 back off so that
    // our test does not need to wait.
    cfg := &Config{
        Checks: []*Observation{
            {
                Check:    mock.call,
                Interval: intervalTicker,
                Attempts: 2,
                Backoff:  0,
                Timeout:  time.Hour,
            },
        },
        Shutdown: func(string, ...interface{}) {
            shutdown <- struct{}{}
        },
    }
    monitor := NewMonitor(cfg)

    require.NoError(t, monitor.Start(), "could not start monitor")

    // Tick is a helper we will use to tick our interval.
    tick := func() {
        select {
        case intervalTicker.Force <- testTime:
        case <-time.After(timeout):
            t.Fatal("could not tick timer")
        }
    }

    // Tick our timer and provide our error channel with a nil error. This
    // mocks our check function succeeding on the first call.
    tick()
    mock.sendError(nil)

    // Now we tick our timer again. This time send a non-nil error, followed
    // by a nil error. This tests our retry logic, because we allow 2
    // retries, so should recover without needing to shutdown.
    tick()
    mock.sendError(errNonNil)
    mock.sendError(nil)

    // Finally, we tick our timer once more, and send two non-nil errors
    // into our error channel. This mocks our check function failing twice.
    tick()
    mock.sendError(errNonNil)
    mock.sendError(errNonNil)

    // Since we have failed within our allowed number of retries, we now
    // expect a call to our shutdown function.
    select {
    case <-shutdown:
    case <-time.After(timeout):
        t.Fatal("expected shutdown")
    }

    require.NoError(t, monitor.Stop(), "could not stop monitor")
}

// TestRetryCheck tests our retry logic. It does not include a test for exiting
// during the back off period.
func TestRetryCheck(t *testing.T) {
    tests := []struct {
        name string

        // errors provides an in-order list of errors that we expect our
        // health check to respond with. The number of errors in this
        // list indicates the number of times we expect our check to
        // be called, because our test will fail if we do not consume
        // every error.
        errors []error

        // attempts is the number of times we call a check before
        // failing.
        attempts int

        // timeout is the time we allow our check to take before we
        // fail them.
        timeout time.Duration

        // expectedShutdown is true if we expect a shutdown to be
        // triggered because all of our calls failed.
        expectedShutdown bool
    }{
        {
            name:             "first call succeeds",
            errors:           []error{nil},
            attempts:         2,
            timeout:          time.Hour,
            expectedShutdown: false,
        },
        {
            name:             "first call fails",
            errors:           []error{errNonNil},
            attempts:         1,
            timeout:          time.Hour,
            expectedShutdown: true,
        },
        {
            name:             "fail then recover",
            errors:           []error{errNonNil, nil},
            attempts:         2,
            timeout:          time.Hour,
            expectedShutdown: false,
        },
        {
            name:             "always fail",
            errors:           []error{errNonNil, errNonNil},
            attempts:         2,
            timeout:          time.Hour,
            expectedShutdown: true,
        },
        {
            name:             "no calls",
            errors:           nil,
            attempts:         0,
            timeout:          time.Hour,
            expectedShutdown: false,
        },
        {
            name:             "call times out",
            errors:           nil,
            attempts:         1,
            timeout:          1,
            expectedShutdown: true,
        },
    }

    for _, test := range tests {
        test := test

        t.Run(test.name, func(t *testing.T) {
            var shutdown bool
            shutdownFunc := func(string, ...interface{}) {
                shutdown = true
            }

            mock := newMockCheck(t)

            // Create an observation that calls our call counting
            // function. We set a zero back off so that the test
            // will not wait.
            observation := &Observation{
                Check:    mock.call,
                Attempts: test.attempts,
                Timeout:  test.timeout,
                Backoff:  0,
            }
            quit := make(chan struct{})

            // Run our retry check in a goroutine because it blocks
            // on us sending errors into the mocked caller's error
            // channel.
            done := make(chan struct{})
            go func() {
                observation.retryCheck(quit, shutdownFunc)
                close(done)
            }()

            // Prompt our mock caller to send responses for calls
            // to our call function.
            for _, err := range test.errors {
                mock.sendError(err)
            }

            // Make sure that we have finished running our retry
            // check function before we start checking results.
            <-done

            require.Equal(t, test.expectedShutdown, shutdown,
                "unexpected shutdown state")
        })
    }
}
healthcheck/log.go  (new file, 32 lines)
@@ -0,0 +1,32 @@
package healthcheck

import (
    "github.com/btcsuite/btclog"
    "github.com/lightningnetwork/lnd/build"
)

// Subsystem defines the logging code for this subsystem.
const Subsystem = "HLCK"

// log is a logger that is initialized with no output filters. This
// means the package will not perform any logging by default until the caller
// requests it.
var log btclog.Logger

// The default amount of logging is none.
func init() {
    UseLogger(build.NewSubLogger(Subsystem, nil))
}

// DisableLog disables all library log output. Logging output is disabled
// by default until UseLogger is called.
func DisableLog() {
    UseLogger(btclog.Disabled)
}

// UseLogger uses a specified Logger to output package logging info.
// This should be used in preference to SetLogWriter if the caller is also
// using btclog.
func UseLogger(logger btclog.Logger) {
    log = logger
}
lncfg/healthcheck.go  (new file, 90 lines)
@@ -0,0 +1,90 @@
package lncfg

import (
    "errors"
    "fmt"
    "time"
)

var (
    // MinHealthCheckInterval is the minimum interval we allow between
    // health checks.
    MinHealthCheckInterval = time.Minute

    // MinHealthCheckTimeout is the minimum timeout we allow for health
    // check calls.
    MinHealthCheckTimeout = time.Second

    // MinHealthCheckBackoff is the minimum back off we allow between health
    // check retries.
    MinHealthCheckBackoff = time.Second
)

// HealthCheckConfig contains the configuration for the different health checks
// that lnd runs.
type HealthCheckConfig struct {
    ChainCheck *CheckConfig `group:"chainbackend" namespace:"chainbackend"`

    DiskCheck *DiskCheckConfig `group:"diskspace" namespace:"diskspace"`
}

// Validate checks the values configured for our health checks.
func (h *HealthCheckConfig) Validate() error {
    if err := h.ChainCheck.validate("chain backend"); err != nil {
        return err
    }

    if err := h.DiskCheck.validate("disk space"); err != nil {
        return err
    }

    if h.DiskCheck.RequiredRemaining < 0 ||
        h.DiskCheck.RequiredRemaining >= 1 {

        return errors.New("disk required ratio must be in [0:1)")
    }

    return nil
}

type CheckConfig struct {
    Interval time.Duration `long:"interval" description:"How often to run a health check."`

    Attempts int `long:"attempts" description:"The number of calls we will make for the check before failing. Set this value to 0 to disable a check."`

    Timeout time.Duration `long:"timeout" description:"The amount of time we allow the health check to take before failing due to timeout."`

    Backoff time.Duration `long:"backoff" description:"The amount of time to back-off between failed health checks."`
}

// validate checks the values in a health check config entry if it is enabled.
func (c *CheckConfig) validate(name string) error {
    if c.Attempts == 0 {
        return nil
    }

    if c.Backoff < MinHealthCheckBackoff {
        return fmt.Errorf("%v backoff: %v below minimum: %v", name,
            c.Backoff, MinHealthCheckBackoff)
    }

    if c.Timeout < MinHealthCheckTimeout {
        return fmt.Errorf("%v timeout: %v below minimum: %v", name,
            c.Timeout, MinHealthCheckTimeout)
    }

    if c.Interval < MinHealthCheckInterval {
        return fmt.Errorf("%v interval: %v below minimum: %v", name,
            c.Interval, MinHealthCheckInterval)
    }

    return nil
}

// DiskCheckConfig contains configuration for ensuring that our node has
// sufficient disk space.
type DiskCheckConfig struct {
    RequiredRemaining float64 `long:"diskrequired" description:"The minimum ratio of free disk space to total capacity that we allow before shutting lnd down safely."`

    *CheckConfig
}
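A hypothetical standalone validation sketch (not part of the commit): a check configured with zero attempts is skipped entirely, while an enabled check must respect the minimums defined above.

package main

import (
    "fmt"
    "time"

    "github.com/lightningnetwork/lnd/lncfg"
)

func main() {
    cfg := &lncfg.HealthCheckConfig{
        // Attempts=0 disables the chain backend check, so its remaining
        // zero values are not validated at all.
        ChainCheck: &lncfg.CheckConfig{Attempts: 0},
        DiskCheck: &lncfg.DiskCheckConfig{
            RequiredRemaining: 0.1,
            CheckConfig: &lncfg.CheckConfig{
                Attempts: 2,
                Interval: 12 * time.Hour,
                Timeout:  5 * time.Second,
                Backoff:  time.Minute,
            },
        },
    }

    // <nil>: the enabled disk check meets every minimum and the required
    // ratio lies in [0, 1).
    fmt.Println(cfg.Validate())

    // Dropping the interval below the one minute minimum now fails.
    cfg.DiskCheck.Interval = time.Second
    fmt.Println(cfg.Validate())
}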
log.go  (2 lines changed)
@@ -16,6 +16,7 @@ import (
    "github.com/lightningnetwork/lnd/channelnotifier"
    "github.com/lightningnetwork/lnd/contractcourt"
    "github.com/lightningnetwork/lnd/discovery"
    "github.com/lightningnetwork/lnd/healthcheck"
    "github.com/lightningnetwork/lnd/htlcswitch"
    "github.com/lightningnetwork/lnd/invoices"
    "github.com/lightningnetwork/lnd/lnrpc/autopilotrpc"

@@ -129,6 +130,7 @@ func SetupLoggers(root *build.RotatingLogWriter) {
    AddSubLogger(root, routerrpc.Subsystem, routerrpc.UseLogger)
    AddSubLogger(root, chanfitness.Subsystem, chanfitness.UseLogger)
    AddSubLogger(root, verrpc.Subsystem, verrpc.UseLogger)
    AddSubLogger(root, healthcheck.Subsystem, healthcheck.UseLogger)
}

// AddSubLogger is a helper method to conveniently create and register the
@@ -54,7 +54,10 @@ func Start(extraArgs string, unlockerReady, rpcReady Callback) {
    }

    // Hook interceptor for os signals.
-   signal.Intercept()
+   if err := signal.Intercept(); err != nil {
+       _, _ = fmt.Fprintln(os.Stderr, err)
+       os.Exit(1)
+   }

    // Set up channels that will be notified when the RPC servers are ready
    // to accept calls.
@@ -400,3 +400,39 @@ litecoin.node=ltcd
; sweep funds if a breach occurs while being offline. The fee rate should be
; specified in sat/byte, the default is 10 sat/byte.
; wtclient.sweep-fee-rate=10

[healthcheck]

; The number of times we should attempt to query our chain backend before
; gracefully shutting down. Set this value to 0 to disable this health check.
; healthcheck.chainbackend.attempts=3

; The amount of time we allow a call to our chain backend to take before we fail
; the attempt. This value must be >= 1s.
; healthcheck.chainbackend.timeout=10s

; The amount of time we should backoff between failed attempts to query chain
; backend. This value must be >= 1s.
; healthcheck.chainbackend.backoff=30s

; The amount of time we should wait between chain backend health checks. This
; value must be >= 1m.
; healthcheck.chainbackend.interval=1m

; The minimum ratio of free disk space to total capacity that we require.
; healthcheck.diskspace.diskrequired=0.1

; The number of times we should attempt to query our available disk space before
; gracefully shutting down. Set this value to 0 to disable this health check.
; healthcheck.diskspace.attempts=2

; The amount of time we allow a query for our available disk space to take
; before we fail the attempt. This value must be >= 1s.
; healthcheck.diskspace.timeout=5s

; The amount of time we should backoff between failed attempts to query
; available disk space. This value must be >= 1s.
; healthcheck.diskspace.backoff=1m

; The amount of time we should wait between disk space health checks. This
; value must be >= 1m.
; healthcheck.diskspace.interval=6h
server.go  (68 lines changed)
@@ -37,6 +37,7 @@ import (
    "github.com/lightningnetwork/lnd/contractcourt"
    "github.com/lightningnetwork/lnd/discovery"
    "github.com/lightningnetwork/lnd/feature"
    "github.com/lightningnetwork/lnd/healthcheck"
    "github.com/lightningnetwork/lnd/htlcswitch"
    "github.com/lightningnetwork/lnd/htlcswitch/hop"
    "github.com/lightningnetwork/lnd/input"

@@ -276,6 +277,9 @@ type server struct {
    hostAnn *netann.HostAnnouncer

    // livelinessMonitor monitors that lnd has access to critical resources.
    livelinessMonitor *healthcheck.Monitor

    quit chan struct{}

    wg sync.WaitGroup

@@ -1254,6 +1258,56 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
        })
    }

    // Create a set of health checks using our configured values. If a
    // health check has been disabled by setting attempts to 0, our monitor
    // will not run it.
    chainHealthCheck := healthcheck.NewObservation(
        "chain backend",
        func() error {
            _, _, err := cc.chainIO.GetBestBlock()
            return err
        },
        cfg.HealthChecks.ChainCheck.Interval,
        cfg.HealthChecks.ChainCheck.Timeout,
        cfg.HealthChecks.ChainCheck.Backoff,
        cfg.HealthChecks.ChainCheck.Attempts,
    )

    diskCheck := healthcheck.NewObservation(
        "disk space",
        func() error {
            free, err := healthcheck.AvailableDiskSpace(cfg.LndDir)
            if err != nil {
                return err
            }

            // If we have more free space than we require,
            // we return a nil error.
            if free > cfg.HealthChecks.DiskCheck.RequiredRemaining {
                return nil
            }

            return fmt.Errorf("require: %v free space, got: %v",
                cfg.HealthChecks.DiskCheck.RequiredRemaining,
                free)
        },
        cfg.HealthChecks.DiskCheck.Interval,
        cfg.HealthChecks.DiskCheck.Timeout,
        cfg.HealthChecks.DiskCheck.Backoff,
        cfg.HealthChecks.DiskCheck.Attempts,
    )

    // If we have not disabled all of our health checks, we create a
    // liveliness monitor with our configured checks.
    s.livelinessMonitor = healthcheck.NewMonitor(
        &healthcheck.Config{
            Checks: []*healthcheck.Observation{
                chainHealthCheck, diskCheck,
            },
            Shutdown: srvrLog.Criticalf,
        },
    )

    // Create the connection manager which will be responsible for
    // maintaining persistent outbound connections and also accepting new
    // incoming connections

@@ -1304,6 +1358,13 @@ func (s *server) Start() error {
        }
    }

    if s.livelinessMonitor != nil {
        if err := s.livelinessMonitor.Start(); err != nil {
            startErr = err
            return
        }
    }

    // Start the notification server. This is used so channel
    // management goroutines can be notified when a funding
    // transaction reaches a sufficient number of confirmations, or

@@ -1535,6 +1596,13 @@ func (s *server) Stop() error {
        }
    }

    if s.livelinessMonitor != nil {
        if err := s.livelinessMonitor.Stop(); err != nil {
            srvrLog.Warnf("unable to shutdown liveliness "+
                "monitor: %v", err)
        }
    }

    // Wait for all lingering goroutines to quit.
    s.wg.Wait()
|
@ -6,8 +6,10 @@
|
||||
package signal
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
@ -19,6 +21,10 @@ var (
|
||||
// gracefully, similar to when receiving SIGINT.
|
||||
shutdownRequestChannel = make(chan struct{})
|
||||
|
||||
// started indicates whether we have started our main interrupt handler.
|
||||
// This field should be used atomically.
|
||||
started int32
|
||||
|
||||
// quit is closed when instructing the main interrupt handler to exit.
|
||||
quit = make(chan struct{})
|
||||
|
||||
@ -26,8 +32,13 @@ var (
|
||||
shutdownChannel = make(chan struct{})
|
||||
)
|
||||
|
||||
// Intercept starts the interception of interrupt signals.
|
||||
func Intercept() {
|
||||
// Intercept starts the interception of interrupt signals. Note that this
|
||||
// function can only be called once.
|
||||
func Intercept() error {
|
||||
if !atomic.CompareAndSwapInt32(&started, 0, 1) {
|
||||
return errors.New("intercept already started")
|
||||
}
|
||||
|
||||
signalsToCatch := []os.Signal{
|
||||
os.Interrupt,
|
||||
os.Kill,
|
||||
@ -37,6 +48,8 @@ func Intercept() {
|
||||
}
|
||||
signal.Notify(interruptChannel, signalsToCatch...)
|
||||
go mainInterruptHandler()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// mainInterruptHandler listens for SIGINT (Ctrl+C) signals on the
|
||||
@ -85,6 +98,20 @@ func mainInterruptHandler() {
|
||||
}
|
||||
}
|
||||
|
||||
// Listening returns true if the main interrupt handler has been started, and
|
||||
// has not been killed.
|
||||
func Listening() bool {
|
||||
// If our started field is not set, we are not yet listening for
|
||||
// interrupts.
|
||||
if atomic.LoadInt32(&started) != 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
// If we have started our main goroutine, we check whether we have
|
||||
// stopped it yet.
|
||||
return Alive()
|
||||
}
|
||||
|
||||
// Alive returns true if the main interrupt handler has not been killed.
|
||||
func Alive() bool {
|
||||
select {
|
||||
|
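A small, hypothetical sketch of the changed signal semantics shown in the hunks above: Listening() reports false until Intercept() has run, and a second Intercept() call now returns an error instead of installing the handler twice.

package main

import (
    "fmt"

    "github.com/lightningnetwork/lnd/signal"
)

func main() {
    // Nothing is listening yet, so a ShutdownLogger would log
    // "Request for shutdown ignored" rather than request shutdown.
    fmt.Println(signal.Listening()) // false

    if err := signal.Intercept(); err != nil {
        panic(err)
    }
    fmt.Println(signal.Listening()) // true

    // The started guard makes a second call fail.
    fmt.Println(signal.Intercept()) // intercept already started
}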