cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
cgroup_task_dead() is called from finish_task_switch(), which runs with
preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
function needs to acquire css_set_lock, which is a regular spinlock and thus
a sleeping lock on RT kernels, leading to "sleeping function called from
invalid context" warnings.
css_set_lock is too large in scope to convert to a raw_spinlock. However, the
unlinking operations don't need to run synchronously; they only need to
complete after the task is done running.
On PREEMPT_RT, defer the work through irq_work. While the work doesn't need
to happen immediately, it can't be delayed indefinitely either: the dead task
pins its cgroup, and a task_struct can stay pinned indefinitely. Use the lazy
variant of irq_work to allow batching and lower the impact while still
ensuring timely completion.
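
For reference, a minimal sketch of the deferral pattern under the same
assumptions; every identifier here (deferred_item, deferred_items,
deferred_iwork, defer_item, deferred_init) is hypothetical and not a name
used by the patch below.

#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/slab.h>

/* Hypothetical item type; the patch links task_structs instead. */
struct deferred_item {
	struct llist_node lnode;
};

static DEFINE_PER_CPU(struct llist_head, deferred_items);
static DEFINE_PER_CPU(struct irq_work, deferred_iwork);

/* Runs later, in a context where the deferred work may take regular locks. */
static void deferred_iwork_fn(struct irq_work *iwork)
{
	struct llist_node *first = llist_del_all(this_cpu_ptr(&deferred_items));
	struct deferred_item *item, *next;

	llist_for_each_entry_safe(item, next, first, lnode)
		kfree(item);	/* stand-in for the real per-item work */
}

/* Called from the non-preemptible path; only lock-less operations here. */
static void defer_item(struct deferred_item *item)
{
	llist_add(&item->lnode, this_cpu_ptr(&deferred_items));
	irq_work_queue(this_cpu_ptr(&deferred_iwork));
}

/* One-time per-CPU setup, mirroring what the patch does in cgroup_rt_init(). */
static void __init deferred_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		init_llist_head(per_cpu_ptr(&deferred_items, cpu));
		per_cpu(deferred_iwork, cpu) =
			IRQ_WORK_INIT_LAZY(deferred_iwork_fn);
	}
}

Enqueueing (llist_add() plus irq_work_queue()) is lock-less and safe from the
non-preemptible switch path; the list is drained later in the irq_work
callback, where css_set_lock can be taken.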
v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add an
explanation of why the work can't be delayed indefinitely (Sebastian Andrzej
Siewior).
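
Illustrative only, assuming a hypothetical callback example_fn(): both
initializers below come from <linux/irq_work.h>; the lazy variant defers
execution (e.g. to the next tick) instead of raising an interrupt
immediately, which is what allows batching.

#include <linux/irq_work.h>

static void example_fn(struct irq_work *work) { }

static struct irq_work immediate_work = IRQ_WORK_INIT(example_fn);
static struct irq_work lazy_work = IRQ_WORK_INIT_LAZY(example_fn);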
Fixes: d245698d72 ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@kernel.org>
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
 	struct css_set __rcu *cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+	struct llist_node cg_dead_lnode;
+#endif /* CONFIG_PREEMPT_RT */
+#endif /* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32 closid;
 	u32 rmid;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));
 
 	get_user_ns(init_cgroup_ns.user_ns);
+	cgroup_rt_init();
 
 	cgroup_lock();
 
@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
 {
 	struct css_set *cset;
 	unsigned long flags;
@@ -7016,6 +7018,57 @@ void cgroup_task_dead(struct task_struct *tsk)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+	struct llist_node *lnode;
+	struct task_struct *task, *next;
+
+	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+		do_cgroup_task_dead(task);
+		put_task_struct(task);
+	}
+}
+
+static void __init cgroup_rt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+		per_cpu(cgrp_dead_tasks_iwork, cpu) =
+			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+	}
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	get_task_struct(task);
+	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
 void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;