bcache: reduce gc latency by processing fewer nodes and sleeping less time
When the bcache device is busy with high I/O load, there are two ways to reduce garbage collection latency:
- Process fewer nodes in each loop of incremental garbage collection in btree_gc_recurse().
- Sleep less time between two incremental GC loops in bch_btree_gc().

This patch introduces two helper routines that provide different garbage collection node counts and sleep intervals.
- btree_gc_min_nodes()
  If there is no front-end I/O, return 1/128 of the total btree node count (but no fewer than 10 nodes) to process in each incremental loop; otherwise only 10 nodes are returned, so front-end I/O can access the btree earlier.
- btree_gc_sleep_ms()
  If nobody is synchronously waiting for bucket allocation, sleep 100 ms between two incremental GC loops. Otherwise sleep only 10 ms before the next incremental GC loop, so a faster GC can provide available buckets earlier and avoid most bcache working threads being starved by bucket allocation.

The idea is inspired by work from Mingzhe Zou and Robert Pang, but it is much simpler and its expected behavior is more predictable.

Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Robert Pang <robertpang@google.com>
Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
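As an illustration of the pacing policy described above, the sketch below pulls the decision logic out of bcache into a standalone, compilable C program. The four #define values match the patch; struct gc_pacing_state, pacing_min_nodes(), pacing_sleep_ms() and the sample numbers in main() are made up for this sketch and do not exist in the kernel sources. The real implementations are btree_gc_min_nodes() and btree_gc_sleep_ms() in the diff below.

    /* Sketch only: stand-in state instead of struct cache_set / gc_stat. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_GC_TIMES_SHIFT      7       /* aim for roughly 128 incremental loops */
    #define GC_NODES_MIN            10
    #define GC_SLEEP_MS_MIN         10
    #define GC_SLEEP_MS             100

    struct gc_pacing_state {
            int     search_inflight;        /* front-end requests in flight */
            int     bucket_wait_cnt;        /* allocators sleeping for free buckets */
            size_t  btree_nodes;            /* current number of btree nodes */
    };

    /*
     * Nodes to process before yielding: only a handful when front-end I/O
     * is in flight, otherwise about 1/128 of the btree so a full GC still
     * finishes within a bounded number of incremental loops.
     */
    static size_t pacing_min_nodes(const struct gc_pacing_state *s)
    {
            size_t min_nodes = GC_NODES_MIN;

            if (s->search_inflight == 0) {
                    size_t n = s->btree_nodes >> MAX_GC_TIMES_SHIFT;

                    if (min_nodes < n)
                            min_nodes = n;
            }

            return min_nodes;
    }

    /*
     * Sleep between incremental loops: shorten it when somebody is blocked
     * waiting for a free bucket, so GC can hand buckets back sooner.
     */
    static uint64_t pacing_sleep_ms(const struct gc_pacing_state *s)
    {
            return s->bucket_wait_cnt > 0 ? GC_SLEEP_MS_MIN : GC_SLEEP_MS;
    }

    int main(void)
    {
            struct gc_pacing_state busy = {
                    .search_inflight = 4, .bucket_wait_cnt = 2, .btree_nodes = 100000,
            };
            struct gc_pacing_state idle = {
                    .search_inflight = 0, .bucket_wait_cnt = 0, .btree_nodes = 100000,
            };

            printf("busy: %zu nodes, %llu ms\n", pacing_min_nodes(&busy),
                   (unsigned long long)pacing_sleep_ms(&busy));
            printf("idle: %zu nodes, %llu ms\n", pacing_min_nodes(&idle),
                   (unsigned long long)pacing_sleep_ms(&idle));
            return 0;
    }

With these sample numbers the busy case yields after 10 nodes and sleeps only 10 ms, while the idle case processes 781 nodes (100000 >> 7) per pass and sleeps the full 100 ms, which is the same trade-off the two new helpers make in the diff below.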
@@ -399,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
 				TASK_UNINTERRUPTIBLE);
 
 		mutex_unlock(&ca->set->bucket_lock);
+
+		atomic_inc(&ca->set->bucket_wait_cnt);
 		schedule();
+		atomic_dec(&ca->set->bucket_wait_cnt);
+
 		mutex_lock(&ca->set->bucket_lock);
 	} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
 		 !fifo_pop(&ca->free[reserve], r));
@@ -604,6 +604,7 @@ struct cache_set {
 	 */
 	atomic_t		prio_blocked;
 	wait_queue_head_t	bucket_wait;
+	atomic_t		bucket_wait_cnt;
 
 	/*
 	 * For any bio we don't skip we subtract the number of sectors from
@@ -89,8 +89,9 @@
  * Test module load/unload
  */
 
-#define MAX_GC_TIMES		100
-#define MIN_GC_NODES		100
+#define MAX_GC_TIMES_SHIFT	7	/* 128 loops */
+#define GC_NODES_MIN		10
+#define GC_SLEEP_MS_MIN		10
 #define GC_SLEEP_MS		100
 
 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
@@ -1578,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b)
 
 static size_t btree_gc_min_nodes(struct cache_set *c)
 {
-	size_t min_nodes;
+	size_t min_nodes = GC_NODES_MIN;
 
-	/*
-	 * Since incremental GC would stop 100ms when front
-	 * side I/O comes, so when there are many btree nodes,
-	 * if GC only processes constant (100) nodes each time,
-	 * GC would last a long time, and the front side I/Os
-	 * would run out of the buckets (since no new bucket
-	 * can be allocated during GC), and be blocked again.
-	 * So GC should not process constant nodes, but varied
-	 * nodes according to the number of btree nodes, which
-	 * realized by dividing GC into constant(100) times,
-	 * so when there are many btree nodes, GC can process
-	 * more nodes each time, otherwise, GC will process less
-	 * nodes each time (but no less than MIN_GC_NODES)
-	 */
-	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
-	if (min_nodes < MIN_GC_NODES)
-		min_nodes = MIN_GC_NODES;
+	if (atomic_read(&c->search_inflight) == 0) {
+		size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT;
+
+		if (min_nodes < n)
+			min_nodes = n;
+	}
 
 	return min_nodes;
 }
 
+static uint64_t btree_gc_sleep_ms(struct cache_set *c)
+{
+	uint64_t sleep_ms;
+
+	if (atomic_read(&c->bucket_wait_cnt) > 0)
+		sleep_ms = GC_SLEEP_MS_MIN;
+	else
+		sleep_ms = GC_SLEEP_MS;
+
+	return sleep_ms;
+}
+
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			    struct closure *writes, struct gc_stat *gc)
@@ -1668,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
 		r->b = NULL;
 
-		if (atomic_read(&b->c->search_inflight) &&
-		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+		if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) {
 			gc->nodes_pre = gc->nodes;
 			ret = -EAGAIN;
 			break;
@@ -1846,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c)
 		cond_resched();
 
 		if (ret == -EAGAIN)
-			schedule_timeout_interruptible(msecs_to_jiffies
-						       (GC_SLEEP_MS));
+			schedule_timeout_interruptible(
+				msecs_to_jiffies(btree_gc_sleep_ms(c)));
 		else if (ret)
 			pr_warn("gc failed!\n");
 	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));