linux/block/blk-throttle.h
Han Guangjiang bd9fd5be6b blk-throttle: fix access race during throttle policy activation
On repeated cold boots we occasionally hit a NULL pointer crash in
blk_should_throtl() when throttling is consulted before the throttle
policy is fully enabled for the queue. Checking q->td != NULL alone is
insufficient during early initialization: blkg_to_pd() for the throttle
policy can still return NULL, so blkg_to_tg() returns a NULL throtl_grp
which is then dereferenced.

 Unable to handle kernel NULL pointer dereference
 at virtual address 0000000000000156
 ...
 pc : submit_bio_noacct+0x14c/0x4c8
 lr : submit_bio_noacct+0x48/0x4c8
 sp : ffff800087f0b690
 x29: ffff800087f0b690 x28: 0000000000005f90 x27: ffff00068af393c0
 x26: 0000000000080000 x25: 000000000002fbc0 x24: ffff000684ddcc70
 x23: 0000000000000000 x22: 0000000000000000 x21: 0000000000000000
 x20: 0000000000080000 x19: ffff000684ddcd08 x18: ffffffffffffffff
 x17: 0000000000000000 x16: ffff80008132a550 x15: 0000ffff98020fff
 x14: 0000000000000000 x13: 1fffe000d11d7021 x12: ffff000688eb810c
 x11: ffff00077ec4bb80 x10: ffff000688dcb720 x9 : ffff80008068ef60
 x8 : 00000a6fb8a86e85 x7 : 000000000000111e x6 : 0000000000000002
 x5 : 0000000000000246 x4 : 0000000000015cff x3 : 0000000000394500
 x2 : ffff000682e35e40 x1 : 0000000000364940 x0 : 000000000000001a
 Call trace:
  submit_bio_noacct+0x14c/0x4c8
  verity_map+0x178/0x2c8
  __map_bio+0x228/0x250
  dm_submit_bio+0x1c4/0x678
  __submit_bio+0x170/0x230
  submit_bio_noacct_nocheck+0x16c/0x388
  submit_bio_noacct+0x16c/0x4c8
  submit_bio+0xb4/0x210
  f2fs_submit_read_bio+0x4c/0xf0
  f2fs_mpage_readpages+0x3b0/0x5f0
  f2fs_readahead+0x90/0xe8

Tighten blk_throtl_activated() to also require that the throttle policy
has been enabled on the queue, by checking blkcg_policy_enabled():

  return q->td != NULL &&
         blkcg_policy_enabled(q, &blkcg_policy_throtl);

This prevents blk_should_throtl() from accessing throttle group state
until policy data has been attached to blkgs.
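
For reference, blkcg_policy_enabled() is the existing helper from
block/blk-cgroup.h; roughly, it boils down to a test_bit() of the
policy's plid against the queue's policy bitmap. A minimal sketch of
the check being relied on (not a new helper):

  /* sketch of the existing helper in block/blk-cgroup.h */
  static inline bool blkcg_policy_enabled(struct request_queue *q,
                                          const struct blkcg_policy *pol)
  {
          return pol && test_bit(pol->plid, q->blkcg_pols);
  }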

Fixes: a3166c5170 ("blk-throttle: delay initialization until configuration")
Co-developed-by: Liang Jie <liangjie@lixiang.com>
Signed-off-by: Liang Jie <liangjie@lixiang.com>
Signed-off-by: Han Guangjiang <hanguangjiang@lixiang.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-09-08 08:24:44 -06:00


/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H

#include "blk-cgroup-rwstat.h"

/*
 * To implement hierarchical throttling, throtl_grps form a tree and bios
 * are dispatched upwards level by level until they reach the top and get
 * issued. When dispatching bios from the children and local group at each
 * level, if the bios are dispatched into a single bio_list, there's a risk
 * that a local or child group which can queue many bios at once will fill
 * up the list, starving others.
 *
 * To avoid such starvation, dispatched bios are queued separately
 * according to where they came from. When they are again dispatched to
 * the parent, they're popped in round-robin order so that no single source
 * hogs the dispatch window.
 *
 * throtl_qnode is used to keep the queued bios separated by their sources.
 * Bios are queued to throtl_qnode which in turn is queued to
 * throtl_service_queue and then dispatched in round-robin order.
 *
 * It's also used to track the reference counts on blkg's. A qnode always
 * belongs to a throtl_grp and gets queued on itself or the parent, so
 * incrementing the reference of the associated throtl_grp when a qnode is
 * queued and decrementing when dequeued is enough to keep the whole blkg
 * tree pinned while bios are in flight.
 */
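/*
 * Illustrative example: if group P has children A and B, bios dispatched
 * from A, bios dispatched from B, and bios issued directly against P each
 * sit on their own throtl_qnode on P's service_queue. Roughly, bios are
 * popped from the qnode at the head of the queued list and a qnode that
 * still holds bios is rotated to the tail, so a large burst queued by A
 * cannot starve B or P's own bios.
 */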
struct throtl_qnode {
	struct list_head	node;		/* service_queue->queued[] */
	struct bio_list		bios_bps;	/* queued bios for bps limit */
	struct bio_list		bios_iops;	/* queued bios for iops limit */
	struct throtl_grp	*tg;		/* tg this qnode belongs to */
};

struct throtl_service_queue {
	struct throtl_service_queue *parent_sq;	/* the parent service_queue */

	/*
	 * Bios queued directly to this service_queue or dispatched from
	 * children throtl_grp's.
	 */
	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
	unsigned int		nr_queued_bps[2];	/* number of queued bps bios */
	unsigned int		nr_queued_iops[2];	/* number of queued iops bios */

	/*
	 * RB tree of active children throtl_grp's, which are sorted by
	 * their ->disptime.
	 */
	struct rb_root_cached	pending_tree;	/* RB tree of active tgs */
	unsigned int		nr_pending;	/* # queued in the tree */
	unsigned long		first_pending_disptime;	/* disptime of the first tg */
	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
};

enum tg_state_flags {
	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
	/*
	 * The sq's iops queue is empty, and a bio is about to be enqueued
	 * to the first qnode's bios_iops list.
	 */
	THROTL_TG_IOPS_WAS_EMPTY = 1 << 2,
	THROTL_TG_CANCELING	= 1 << 3,	/* starts to cancel bio */
};

struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_queue member */
	struct rb_node rb_node;

	/* throtl_data this group belongs to */
	struct throtl_data *td;

	/* this group's service queue */
	struct throtl_service_queue service_queue;

	/*
	 * qnode_on_self is used when bios are directly queued to this
	 * throtl_grp so that local bios compete fairly with bios
	 * dispatched from children. qnode_on_parent is used when bios are
	 * dispatched from this throtl_grp into its parent and will compete
	 * with the sibling qnode_on_parents and the parent's
	 * qnode_on_self.
	 */
	struct throtl_qnode qnode_on_self[2];
	struct throtl_qnode qnode_on_parent[2];

	/*
	 * Dispatch time in jiffies. This is the estimated time when group
	 * will unthrottle and is ready to dispatch more bio. It is used as
	 * key to sort active groups in service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* are there any throtl rules between this group and td? */
	bool has_rules_bps[2];
	bool has_rules_iops[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/*
	 * Number of bytes/bios dispatched in the current slice.
	 * When a new configuration is submitted while some bios are still
	 * throttled, first calculate the carryover: the amount of bytes/IOs
	 * already waited for under the previous configuration. Then,
	 * [bytes/io]_disp are set to the negative of that carryover and are
	 * used to calculate the wait time under the new configuration (see
	 * the illustrative example below this struct).
	 */
	int64_t bytes_disp[2];
	int io_disp[2];

	unsigned long last_check_time;

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	struct blkg_rwstat stat_bytes;
	struct blkg_rwstat stat_ios;
};

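/*
 * Illustrative carryover example (numbers are made up): suppose a 2MB
 * write is throttled at 1MB/s and has already waited for one second,
 * i.e. 1MB worth of budget has been earned but not yet consumed. If the
 * limit is then raised to 2MB/s, bytes_disp[WRITE] is set to roughly
 * -1MB, so the remaining wait is computed only for the outstanding 1MB
 * under the new limit instead of for the whole bio again.
 */
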
extern struct blkcg_policy blkcg_policy_throtl;

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

/*
 * Internal throttling interface
 */
#ifndef CONFIG_BLK_DEV_THROTTLING
static inline void blk_throtl_exit(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
void blk_throtl_exit(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct gendisk *disk);

static inline bool blk_throtl_activated(struct request_queue *q)
{
	/*
	 * q->td guarantees that the blk-throttle module is already loaded,
	 * and the plid of blk-throttle is assigned.
	 * blkcg_policy_enabled() guarantees that the policy is activated
	 * in the request_queue.
	 */
	return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl);
}

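/*
 * Called on the bio submission path (from submit_bio_noacct() via
 * blk_throtl_bio()): decides whether @bio must be handed to
 * __blk_throtl_bio(), and accounts per-blkg byte/IO stats on the
 * legacy (non-default) cgroup hierarchy.
 */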
static inline bool blk_should_throtl(struct bio *bio)
{
	struct throtl_grp *tg;
	int rw = bio_data_dir(bio);

	if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
		return false;

	tg = blkg_to_tg(bio->bi_blkg);
	if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
		if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
			bio_set_flag(bio, BIO_CGROUP_ACCT);
			blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
					bio->bi_iter.bi_size);
		}
		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
	}

	/* iops limit is always counted */
	if (tg->has_rules_iops[rw])
		return true;

	if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
		return true;

	return false;
}

static inline bool blk_throtl_bio(struct bio *bio)
{
	/*
	 * block throttling takes effect if the policy is activated
	 * in the bio's request_queue.
	 */
	if (!blk_should_throtl(bio))
		return false;

	return __blk_throtl_bio(bio);
}
#endif /* CONFIG_BLK_DEV_THROTTLING */

#endif