From f7c5236bf601c1cf0edbc87dafd4d6220b4e16cf Mon Sep 17 00:00:00 2001
From: Conner Fromknecht
Date: Wed, 17 Feb 2021 18:13:29 -0800
Subject: [PATCH 1/2] routing: dial back max concurrent block fetches

This commit reduces the number of concurrent validation operations the
router will perform when fully validating the channel graph. Reports
from several users indicate that GetInfo would hang for several
minutes, which is believed to be caused by attempting to validate a
massive number of channels in parallel.

This commit restores the limit to the value it had before the batched
gossip improvements were added. We keep the 1000 concurrent validation
request limit for AssumeChannelValid, since we don't fetch blocks in
that case. This preserves the performance benefits on mobile and other
low-resource devices.
---
 routing/router.go | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/routing/router.go b/routing/router.go
index b308338a..0cb0ced8 100644
--- a/routing/router.go
+++ b/routing/router.go
@@ -3,6 +3,7 @@ package routing
 import (
     "bytes"
     "fmt"
+    "runtime"
     "sync"
     "sync/atomic"
     "time"
@@ -914,7 +915,28 @@ func (r *ChannelRouter) networkHandler() {
 
     // We'll use this validation barrier to ensure that we process all jobs
     // in the proper order during parallel validation.
-    validationBarrier := NewValidationBarrier(1000, r.quit)
+    //
+    // NOTE: For AssumeChannelValid, we bump up the maximum number of
+    // concurrent validation requests since there are no blocks being
+    // fetched. This significantly increases the performance of IGD for
+    // neutrino nodes.
+    //
+    // However, we dial back to use multiples of the number of cores when
+    // fully validating, to avoid fetching up to 1000 blocks from the
+    // backend. On bitcoind, this will empirically cause massive latency
+    // spikes when executing this many concurrent RPC calls. Critical
+    // subsystems or basic rpc calls that rely on calls such as GetBestBlock
+    // will hang due to excessive load.
+    //
+    // See https://github.com/lightningnetwork/lnd/issues/4892.
+    var validationBarrier *ValidationBarrier
+    if r.cfg.AssumeChannelValid {
+        validationBarrier = NewValidationBarrier(1000, r.quit)
+    } else {
+        validationBarrier = NewValidationBarrier(
+            4*runtime.NumCPU(), r.quit,
+        )
+    }
 
     for {
 

From 250bc8560edd0ac91112e7944aab5ed81c90ba3d Mon Sep 17 00:00:00 2001
From: Conner Fromknecht
Date: Wed, 17 Feb 2021 18:55:56 -0800
Subject: [PATCH 2/2] routing: avoid modifying AssumeChannelValid in unit tests

Modifying AssumeChannelValid at runtime produces a race condition when
the router reads the field from a different goroutine. Instead, we
isolate the test cases and initialize AssumeChannelValid properly when
the test context is created.
---
 routing/router_test.go | 57 +++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/routing/router_test.go b/routing/router_test.go
index a04aa7b3..19ad1ef3 100644
--- a/routing/router_test.go
+++ b/routing/router_test.go
@@ -72,6 +72,14 @@ func (c *testCtx) RestartRouter() error {
 func createTestCtxFromGraphInstance(startingHeight uint32, graphInstance *testGraphInstance) (
     *testCtx, func(), error) {
 
+    return createTestCtxFromGraphInstanceAssumeValid(
+        startingHeight, graphInstance, false,
+    )
+}
+
+func createTestCtxFromGraphInstanceAssumeValid(startingHeight uint32,
+    graphInstance *testGraphInstance, assumeValid bool) (*testCtx, func(), error) {
+
     // We'll initialize an instance of the channel router with mock
     // versions of the chain and channel notifier. As we don't need to test
     // any p2p functionality, the peer send and switch send messages won't
@@ -126,8 +134,9 @@ func createTestCtxFromGraphInstance(startingHeight uint32, graphInstance *testGr
             next := atomic.AddUint64(&uniquePaymentID, 1)
             return next, nil
         },
-        PathFindingConfig: pathFindingConfig,
-        Clock:             clock.NewTestClock(time.Unix(1, 0)),
+        PathFindingConfig:  pathFindingConfig,
+        Clock:              clock.NewTestClock(time.Unix(1, 0)),
+        AssumeChannelValid: assumeValid,
     })
     if err != nil {
         return nil, nil, fmt.Errorf("unable to create router %v", err)
@@ -2034,6 +2043,15 @@ func TestPruneChannelGraphStaleEdges(t *testing.T) {
 func TestPruneChannelGraphDoubleDisabled(t *testing.T) {
     t.Parallel()
 
+    t.Run("no_assumechannelvalid", func(t *testing.T) {
+        testPruneChannelGraphDoubleDisabled(t, false)
+    })
+    t.Run("assumechannelvalid", func(t *testing.T) {
+        testPruneChannelGraphDoubleDisabled(t, true)
+    })
+}
+
+func testPruneChannelGraphDoubleDisabled(t *testing.T, assumeValid bool) {
     // We'll create the following test graph so that only the last channel
     // is pruned. We'll use a fresh timestamp to ensure they're not pruned
     // according to that heuristic.
@@ -2125,34 +2143,37 @@ func TestPruneChannelGraphDoubleDisabled(t *testing.T) {
     defer testGraph.cleanUp()
 
     const startingHeight = 100
-    ctx, cleanUp, err := createTestCtxFromGraphInstance(
-        startingHeight, testGraph,
+    ctx, cleanUp, err := createTestCtxFromGraphInstanceAssumeValid(
+        startingHeight, testGraph, assumeValid,
     )
     if err != nil {
        t.Fatalf("unable to create test context: %v", err)
     }
     defer cleanUp()
 
-    // All the channels should exist within the graph before pruning them.
-    assertChannelsPruned(t, ctx.graph, testChannels)
+    // All the channels should exist within the graph before pruning them
+    // when not using AssumeChannelValid, otherwise we should have pruned
+    // the last channel on startup.
+    if !assumeValid {
+        assertChannelsPruned(t, ctx.graph, testChannels)
+    } else {
+        prunedChannel := testChannels[len(testChannels)-1].ChannelID
+        assertChannelsPruned(t, ctx.graph, testChannels, prunedChannel)
+    }
 
-    // If we attempt to prune them without AssumeChannelValid being set,
-    // none should be pruned.
     if err := ctx.router.pruneZombieChans(); err != nil {
         t.Fatalf("unable to prune zombie channels: %v", err)
     }
 
-    assertChannelsPruned(t, ctx.graph, testChannels)
-
-    // Now that AssumeChannelValid is set, we'll prune the graph again and
-    // the last channel should be the only one pruned.
-    ctx.router.cfg.AssumeChannelValid = true
-    if err := ctx.router.pruneZombieChans(); err != nil {
-        t.Fatalf("unable to prune zombie channels: %v", err)
+    // If we attempted to prune them without AssumeChannelValid being set,
+    // none should be pruned. Otherwise the last channel should still be
+    // pruned.
+    if !assumeValid {
+        assertChannelsPruned(t, ctx.graph, testChannels)
+    } else {
+        prunedChannel := testChannels[len(testChannels)-1].ChannelID
+        assertChannelsPruned(t, ctx.graph, testChannels, prunedChannel)
     }
-
-    prunedChannel := testChannels[len(testChannels)-1].ChannelID
-    assertChannelsPruned(t, ctx.graph, testChannels, prunedChannel)
 }
 
 // TestFindPathFeeWeighting tests that the findPath method will properly prefer
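
A note on the sizing trade-off in the first patch: the following self-contained Go sketch shows the same idea in isolation, capping in-flight validation work with a buffered-channel semaphore that gets 1000 slots when AssumeChannelValid is set (no blocks are fetched) and 4*runtime.NumCPU() slots otherwise. The limiter type and its method names are illustrative assumptions for this sketch, not lnd's actual ValidationBarrier API; the real barrier also enforces ordering between dependent channel and node announcements, which is omitted here.

package main

import (
    "fmt"
    "runtime"
    "sync"
)

// limiter caps the number of concurrent validation jobs by using a
// buffered channel as a counting semaphore.
type limiter struct {
    slots chan struct{}
}

// newLimiter mirrors the sizing policy from the patch: a generous cap
// when no blocks need to be fetched, and a small multiple of the core
// count when full validation has to hit the chain backend.
func newLimiter(assumeChannelValid bool) *limiter {
    size := 4 * runtime.NumCPU()
    if assumeChannelValid {
        size = 1000
    }
    return &limiter{slots: make(chan struct{}, size)}
}

// acquire blocks until a validation slot is free.
func (l *limiter) acquire() { l.slots <- struct{}{} }

// release frees a slot for the next pending job.
func (l *limiter) release() { <-l.slots }

func main() {
    l := newLimiter(false)
    fmt.Println("max concurrent validations:", cap(l.slots))

    var wg sync.WaitGroup
    for i := 0; i < 32; i++ {
        l.acquire()
        wg.Add(1)
        go func(id int) {
            defer wg.Done()
            defer l.release()
            // Stand-in for fetching the funding block and validating
            // one channel announcement.
            _ = id
        }(i)
    }
    wg.Wait()
}

The buffered channel is what makes the cap effective: acquire blocks as soon as all slots are taken, so no matter how many gossip messages are queued, at most cap(l.slots) block fetches are ever outstanding against the backend, which is the behaviour the patch is after for bitcoind nodes.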