From 5a73029442a05470019660142217141c263889a0 Mon Sep 17 00:00:00 2001 From: carla Date: Mon, 24 Aug 2020 08:54:39 +0200 Subject: [PATCH] multi: add minimum disk space check --- config.go | 19 +++++++++++++++++++ go.mod | 1 + go.sum | 1 + healthcheck/diskcheck.go | 18 ++++++++++++++++++ healthcheck/diskcheck_solaris.go | 17 +++++++++++++++++ healthcheck/diskcheck_windows.go | 17 +++++++++++++++++ lncfg/healthcheck.go | 27 ++++++++++++++++++++++++++- sample-lnd.conf | 18 ++++++++++++++++++ server.go | 26 +++++++++++++++++++++++++- 9 files changed, 142 insertions(+), 2 deletions(-) create mode 100644 healthcheck/diskcheck.go create mode 100644 healthcheck/diskcheck_solaris.go create mode 100644 healthcheck/diskcheck_windows.go diff --git a/config.go b/config.go index 5f066493..8713aa8f 100644 --- a/config.go +++ b/config.go @@ -91,6 +91,16 @@ const ( defaultChainTimeout = time.Second * 10 defaultChainBackoff = time.Second * 30 defaultChainAttempts = 3 + + // By default, we will shut down if less than 10% of disk space is + // available. We allow a longer interval for disk space checks, because + // disk space is less likely to deteriorate quickly. However, we allow + // fewer retries because this should not be a flaky check. 
+ defaultRequiredDisk = 0.1 + defaultDiskInterval = time.Hour * 12 + defaultDiskTimeout = time.Second * 5 + defaultDiskBackoff = time.Minute + defaultDiskAttempts = 2 ) var ( @@ -397,6 +407,15 @@ func DefaultConfig() Config { Attempts: defaultChainAttempts, Backoff: defaultChainBackoff, }, + DiskCheck: &lncfg.DiskCheckConfig{ + RequiredRemaining: defaultRequiredDisk, + CheckConfig: &lncfg.CheckConfig{ + Interval: defaultDiskInterval, + Attempts: defaultDiskAttempts, + Timeout: defaultDiskTimeout, + Backoff: defaultDiskBackoff, + }, + }, }, MaxOutgoingCltvExpiry: htlcswitch.DefaultMaxOutgoingCltvExpiry, MaxChannelFeeAllocation: htlcswitch.DefaultMaxLinkFeeAllocation, diff --git a/go.mod b/go.mod index 160da68a..3b83e961 100644 --- a/go.mod +++ b/go.mod @@ -66,6 +66,7 @@ require ( go.uber.org/zap v1.14.1 // indirect golang.org/x/crypto v0.0.0-20200709230013-948cd5f35899 golang.org/x/net v0.0.0-20191002035440-2ec189313ef0 + golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2 google.golang.org/grpc v1.24.0 gopkg.in/errgo.v1 v1.0.1 // indirect diff --git a/go.sum b/go.sum index e1c65788..3f2b8b91 100644 --- a/go.sum +++ b/go.sum @@ -332,6 +332,7 @@ golang.org/x/sys v0.0.0-20190904154756-749cb33beabd h1:DBH9mDw0zluJT/R+nGuV3jWFW golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200724161237-0e2f3a69832c h1:UIcGWL6/wpCfyGuJnRFJRurA+yj8RrW7Q6x2YMCXt6c= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2 h1:z99zHgr7hKfrUcX/KsoJk5FJfjTceCKIp96+biqP4To= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod 
h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/healthcheck/diskcheck.go b/healthcheck/diskcheck.go new file mode 100644 index 00000000..c3e2cbb2 --- /dev/null +++ b/healthcheck/diskcheck.go @@ -0,0 +1,18 @@ +// +build !windows,!solaris + +package healthcheck + +import "syscall" + +// AvailableDiskSpace returns the free/total block ratio of path's filesystem. +func AvailableDiskSpace(path string) (float64, error) { + s := syscall.Statfs_t{} + err := syscall.Statfs(path, &s) + if err != nil { + return 0, err + } + + // Bfree counts all free blocks, including any reserved for privileged + // users, so this may overstate what an unprivileged process can use. + return float64(s.Bfree) / float64(s.Blocks), nil +} diff --git a/healthcheck/diskcheck_solaris.go b/healthcheck/diskcheck_solaris.go new file mode 100644 index 00000000..d44330b7 --- /dev/null +++ b/healthcheck/diskcheck_solaris.go @@ -0,0 +1,17 @@ +package healthcheck + +import "golang.org/x/sys/unix" + +// AvailableDiskSpace returns the free/total block ratio of path's filesystem +// for solaris. +func AvailableDiskSpace(path string) (float64, error) { + s := unix.Statvfs_t{} + err := unix.Statvfs(path, &s) + if err != nil { + return 0, err + } + + // Bfree counts all free blocks, including any reserved for privileged + // users, so this may overstate what an unprivileged process can use. + return float64(s.Bfree) / float64(s.Blocks), nil +} diff --git a/healthcheck/diskcheck_windows.go b/healthcheck/diskcheck_windows.go new file mode 100644 index 00000000..7fed088b --- /dev/null +++ b/healthcheck/diskcheck_windows.go @@ -0,0 +1,24 @@ +package healthcheck + +import "golang.org/x/sys/windows" + +// AvailableDiskSpace returns ratio of available disk space to total capacity +// for windows. An error is returned if path cannot be encoded or queried. 
+func AvailableDiskSpace(path string) (float64, error) { + pathPtr, err := windows.UTF16PtrFromString(path) + if err != nil { + return 0, err + } + + // GetDiskFreeSpaceEx reports, in order: the bytes available to the + // caller, the total bytes on the disk and the total free bytes. + var callerAvail, total, totalFree uint64 + err = windows.GetDiskFreeSpaceEx( + pathPtr, &callerAvail, &total, &totalFree, + ) + if err != nil { + return 0, err + } + + return float64(totalFree) / float64(total), nil +} diff --git a/lncfg/healthcheck.go b/lncfg/healthcheck.go index 740d87d3..a43505ca 100644 --- a/lncfg/healthcheck.go +++ b/lncfg/healthcheck.go @@ -1,6 +1,7 @@ package lncfg import ( + "errors" "fmt" "time" ) @@ -23,11 +24,27 @@ var ( // the lnd runs. type HealthCheckConfig struct { ChainCheck *CheckConfig `group:"chainbackend" namespace:"chainbackend"` + + DiskCheck *DiskCheckConfig `group:"diskspace" namespace:"diskspace"` } // Validate checks the values configured for our health checks. func (h *HealthCheckConfig) Validate() error { - return h.ChainCheck.validate("chain backend") + if err := h.ChainCheck.validate("chain backend"); err != nil { + return err + } + + if err := h.DiskCheck.validate("disk space"); err != nil { + return err + } + + if h.DiskCheck.RequiredRemaining < 0 || + h.DiskCheck.RequiredRemaining >= 1 { + + return errors.New("disk required ratio must be in [0:1)") + } + + return nil } type CheckConfig struct { @@ -63,3 +80,11 @@ func (c *CheckConfig) validate(name string) error { return nil } + +// DiskCheckConfig contains configuration for ensuring that our node has +// sufficient disk space. +type DiskCheckConfig struct { + RequiredRemaining float64 `long:"diskrequired" description:"The minimum ratio of free disk space to total capacity that we allow before shutting lnd down safely."` + + *CheckConfig +} diff --git a/sample-lnd.conf b/sample-lnd.conf index 027ebeb1..2eb0b6bd 100644 --- a/sample-lnd.conf +++ b/sample-lnd.conf @@ -418,3 +418,21 @@ litecoin.node=ltcd ; value must be >= 1m. ; healthcheck.chainbackend.interval=1m +; The minimum ratio of free disk space to total capacity that we require. 
+; healthcheck.diskspace.diskrequired=0.1 + +; The number of times we should attempt to query our available disk space before +; gracefully shutting down. Set this value to 0 to disable this health check. +; healthcheck.diskspace.attempts=2 + +; The amount of time we allow a query for our available disk space to take +; before we fail the attempt. This value must be >= 1s. +; healthcheck.diskspace.timeout=5s + +; The amount of time we should backoff between failed attempts to query +; available disk space. This value must be >= 1s. +; healthcheck.diskspace.backoff=1m + +; The amount of time we should wait between disk space health checks. This +; value must be >= 1m. +; healthcheck.diskspace.interval=12h diff --git a/server.go b/server.go index d29225f7..626c9099 100644 --- a/server.go +++ b/server.go @@ -1273,12 +1273,36 @@ func newServer(cfg *Config, listenAddrs []net.Addr, cfg.HealthChecks.ChainCheck.Attempts, ) + diskCheck := healthcheck.NewObservation( + "disk space", + func() error { + free, err := healthcheck.AvailableDiskSpace(cfg.LndDir) + if err != nil { + return err + } + + // If we have more free space than we require, + // we return a nil error. + if free > cfg.HealthChecks.DiskCheck.RequiredRemaining { + return nil + } + + return fmt.Errorf("require: %v free space, got: %v", + cfg.HealthChecks.DiskCheck.RequiredRemaining, + free) + }, + cfg.HealthChecks.DiskCheck.Interval, + cfg.HealthChecks.DiskCheck.Timeout, + cfg.HealthChecks.DiskCheck.Backoff, + cfg.HealthChecks.DiskCheck.Attempts, + ) + // If we have not disabled all of our health checks, we create a // liveliness monitor with our configured checks. s.livelinessMonitor = healthcheck.NewMonitor( &healthcheck.Config{ Checks: []*healthcheck.Observation{ - chainHealthCheck, + chainHealthCheck, diskCheck, }, Shutdown: srvrLog.Criticalf, },