Merge pull request #4789 from cfromknecht/wt-clean-shutdown

wtclient: schedule ForceQuit first during Stop
This commit is contained in:
Olaoluwa Osuntokun 2020-11-30 17:03:18 -08:00 committed by GitHub
commit 39d2ea752e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 87 additions and 22 deletions

@ -462,13 +462,7 @@ func (c *TowerClient) Stop() error {
c.stopped.Do(func() {
log.Debugf("Stopping watchtower client")
// 1. Shutdown the backup queue, which will prevent any further
// updates from being accepted. In practice, the links should be
// shutdown before the client has been stopped, so all updates
// would have been added prior.
c.pipeline.Stop()
// 2. To ensure we don't hang forever on shutdown due to
// 1. To ensure we don't hang forever on shutdown due to
// unintended failures, we'll delay a call to force quit the
// pipeline if a ForceQuitDelay is specified. This will have no
// effect if the pipeline shuts down cleanly before the delay
@ -483,6 +477,12 @@ func (c *TowerClient) Stop() error {
time.AfterFunc(c.cfg.ForceQuitDelay, c.ForceQuit)
}
// 2. Shutdown the backup queue, which will prevent any further
// updates from being accepted. In practice, the links should be
// shutdown before the client has been stopped, so all updates
// would have been added prior.
c.pipeline.Stop()
// 3. Once the backup queue has shutdown, wait for the main
// dispatcher to exit. The backup queue will signal its
// completion to the dispatcher, which releases the wait group

@ -446,10 +446,11 @@ func newHarness(t *testing.T, cfg harnessCfg) *testHarness {
NewAddress: func() ([]byte, error) {
return addrScript, nil
},
ReadTimeout: timeout,
WriteTimeout: timeout,
MinBackoff: time.Millisecond,
MaxBackoff: 10 * time.Millisecond,
ReadTimeout: timeout,
WriteTimeout: timeout,
MinBackoff: time.Millisecond,
MaxBackoff: time.Second,
ForceQuitDelay: 10 * time.Second,
}
client, err := wtclient.New(clientCfg)
if err != nil {
@ -1475,6 +1476,61 @@ var clientTests = []clientTest{
h.waitServerUpdates(hints[numUpdates/2:], 5*time.Second)
},
},
{
// Asserts that the client's force quit delay will properly
// shut down the client if it is unable to completely drain the
// task pipeline.
name: "force unclean shutdown",
cfg: harnessCfg{
localBalance: localBalance,
remoteBalance: remoteBalance,
policy: wtpolicy.Policy{
TxPolicy: wtpolicy.TxPolicy{
BlobType: blob.TypeAltruistCommit,
SweepFeeRate: wtpolicy.DefaultSweepFeeRate,
},
MaxUpdates: 5,
},
},
fn: func(h *testHarness) {
const (
chanID = 0
numUpdates = 6
maxUpdates = 5
)
// Advance the channel to create all states.
hints := h.advanceChannelN(chanID, numUpdates)
// Back up 4 of the 5 states for the negotiated session.
h.backupStates(chanID, 0, maxUpdates-1, nil)
h.waitServerUpdates(hints[:maxUpdates-1], 5*time.Second)
// Now, restart the tower and prevent it from acking any
// new sessions. We do this here as once the last slot
// is exhausted the client will attempt to renegotiate.
err := h.server.Stop()
require.Nil(h.t, err)
h.serverCfg.NoAckCreateSession = true
h.startServer()
// Back up the remaining two states. Once the first is
// processed, the session will be exhausted but the
// client won't be able to renegotiate a session for
// the final state. We'll only wait for the first five
// states to arrive at the tower.
h.backupStates(chanID, maxUpdates-1, numUpdates, nil)
h.waitServerUpdates(hints[:maxUpdates], 5*time.Second)
// Finally, stop the client which will continue to
// attempt session negotiation since it has one more
// state to process. After the force quit delay
// expires, the client should force quit itself and
// allow the test to complete.
err = h.client.Stop()
require.Nil(h.t, err)
},
},
}
// TestClient executes the client test suite, asserting the ability to backup

@ -231,6 +231,19 @@ func (n *sessionNegotiator) negotiate() {
// backoff.
var backoff time.Duration
// Create a closure to update the backoff upon failure such that it
// stays within our min and max backoff parameters.
updateBackoff := func() {
if backoff == 0 {
backoff = n.cfg.MinBackoff
} else {
backoff *= 2
if backoff > n.cfg.MaxBackoff {
backoff = n.cfg.MaxBackoff
}
}
}
retryWithBackoff:
// If we are retrying, wait out the delay before continuing.
if backoff > 0 {
@ -251,16 +264,8 @@ retryWithBackoff:
// Pull the next candidate from our list of addresses.
tower, err := n.cfg.Candidates.Next()
if err != nil {
if backoff == 0 {
backoff = n.cfg.MinBackoff
} else {
// We've run out of addresses, double and clamp
// backoff.
backoff *= 2
if backoff > n.cfg.MaxBackoff {
backoff = n.cfg.MaxBackoff
}
}
// We've run out of addresses, update our backoff.
updateBackoff()
log.Debugf("Unable to get new tower candidate, "+
"retrying after %v -- reason: %v", backoff, err)
@ -293,10 +298,14 @@ retryWithBackoff:
// get a new session, trying all addresses if necessary.
err = n.createSession(tower, keyIndex)
if err != nil {
// An unexpected error occurred, update our backoff.
updateBackoff()
log.Debugf("Session negotiation with tower=%x "+
"failed, trying again -- reason: %v",
tower.IdentityKey.SerializeCompressed(), err)
continue
goto retryWithBackoff
}
// Success.