cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
cgroup_task_dead() is called from finish_task_switch(), which runs with
preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
function needs to acquire css_set_lock, which is a regular spinlock and thus
a sleeping lock on RT kernels, leading to "sleeping function called from
invalid context" warnings.
css_set_lock is too large in scope to convert to a raw_spinlock. However, the
unlinking operations don't need to run synchronously; they only need to
complete after the task is done running.
On PREEMPT_RT, defer the work through irq_work. While the work doesn't need
to happen immediately, it can't be delayed indefinitely either: the dead task
pins its cgroup, and a task_struct can stay pinned indefinitely. Use the lazy
variant of irq_work to allow batching and lower the impact while still
ensuring timely completion.
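
For reference, a minimal sketch of the deferral pattern under the same
assumptions; every identifier here (deferred_item, deferred_items,
deferred_iwork, defer_item, deferred_init) is hypothetical and not a name
used by the patch below.

#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/slab.h>

/* Hypothetical item type; the patch links task_structs instead. */
struct deferred_item {
	struct llist_node lnode;
};

static DEFINE_PER_CPU(struct llist_head, deferred_items);
static DEFINE_PER_CPU(struct irq_work, deferred_iwork);

/* Runs later, in a context where the deferred work may take regular locks. */
static void deferred_iwork_fn(struct irq_work *iwork)
{
	struct llist_node *first = llist_del_all(this_cpu_ptr(&deferred_items));
	struct deferred_item *item, *next;

	llist_for_each_entry_safe(item, next, first, lnode)
		kfree(item);	/* stand-in for the real per-item work */
}

/* Called from the non-preemptible path; only lock-less operations here. */
static void defer_item(struct deferred_item *item)
{
	llist_add(&item->lnode, this_cpu_ptr(&deferred_items));
	irq_work_queue(this_cpu_ptr(&deferred_iwork));
}

/* One-time per-CPU setup, mirroring what the patch does in cgroup_rt_init(). */
static void __init deferred_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		init_llist_head(per_cpu_ptr(&deferred_items, cpu));
		per_cpu(deferred_iwork, cpu) =
			IRQ_WORK_INIT_LAZY(deferred_iwork_fn);
	}
}

Enqueueing (llist_add() plus irq_work_queue()) is lock-less and safe from the
non-preemptible switch path; the list is drained later in the irq_work
callback, where css_set_lock can be taken.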
v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add an
explanation of why the work can't be delayed indefinitely (Sebastian Andrzej
Siewior).
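
Illustrative only, assuming a hypothetical callback example_fn(): both
initializers below come from <linux/irq_work.h>; the lazy variant defers
execution (e.g. to the next tick) instead of raising an interrupt
immediately, which is what allows batching.

#include <linux/irq_work.h>

static void example_fn(struct irq_work *work) { }

static struct irq_work immediate_work = IRQ_WORK_INIT(example_fn);
static struct irq_work lazy_work = IRQ_WORK_INIT_LAZY(example_fn);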
Fixes: d245698d72 ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@kernel.org>
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
 	struct css_set __rcu *cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+	struct llist_node cg_dead_lnode;
+#endif /* CONFIG_PREEMPT_RT */
+#endif /* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32 closid;
 	u32 rmid;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));
 
 	get_user_ns(init_cgroup_ns.user_ns);
+	cgroup_rt_init();
 
 	cgroup_lock();
 
@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
 {
 	struct css_set *cset;
 	unsigned long flags;
@@ -7016,6 +7018,57 @@ void cgroup_task_dead(struct task_struct *tsk)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+	struct llist_node *lnode;
+	struct task_struct *task, *next;
+
+	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+		do_cgroup_task_dead(task);
+		put_task_struct(task);
+	}
+}
+
+static void __init cgroup_rt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+		per_cpu(cgrp_dead_tasks_iwork, cpu) =
+			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+	}
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	get_task_struct(task);
+	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
 void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;