sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
Bypass mode routes tasks through fallback dispatch queues. This was originally
a single global DSQ; commit b7b3b2dbae ("sched_ext: Split the global DSQ per
NUMA node") split it into per-node DSQs to resolve NUMA-related livelocks.
Dan Schatzberg found that per-node DSQs can still livelock when many threads
are pinned to different small subsets of CPUs: each CPU must scan past many
incompatible tasks to find a runnable one, causing severe contention on
systems with high CPU counts.
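To see why, consider the shape of a shared-DSQ consume path: every CPU in the
node walks the same list under the DSQ lock and must skip each task whose
cpumask excludes it. The following is a minimal sketch of that pattern, not
the kernel's actual consume_dispatch_q(), and task_unlink_sketch() is a
hypothetical stand-in for the real dequeue helpers:

	/*
	 * With N tasks pinned to disjoint small CPU subsets, most entries are
	 * incompatible with any given CPU, so each consume attempt scans O(N)
	 * tasks while holding the shared lock -- the reported livelock.
	 */
	static struct task_struct *consume_sketch(struct scx_dispatch_q *dsq, int cpu)
	{
		struct task_struct *p;

		raw_spin_lock(&dsq->lock);
		list_for_each_entry(p, &dsq->list, scx.dsq_list.node) {
			if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
				task_unlink_sketch(p, dsq);	/* hypothetical */
				raw_spin_unlock(&dsq->lock);
				return p;
			}
			/* incompatible with this CPU: keep scanning */
		}
		raw_spin_unlock(&dsq->lock);
		return NULL;
	}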
Switch to per-CPU bypass DSQs. Each task queues on its current CPU. Default
idle CPU selection and direct dispatch handle most cases well.
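Conceptually, the enqueue-side change is just which fallback queue a bypassed
task lands on (distilled from the diff below):

	/* before: one shared DSQ per NUMA node */
	dsq = find_global_dsq(sch, p);

	/* after: the bypass DSQ of the task's current CPU */
	dsq = &task_rq(p)->scx.bypass_dsq;

Because each CPU consumes only its own bypass_dsq, the consume path never
scans tasks that cannot run locally.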
This introduces a failure mode on over-saturated systems where tasks
concentrate on one CPU: if the BPF scheduler severely skews placement before
triggering bypass, that CPU's queue may become too long to drain, causing RCU
stalls. A load balancer in a future patch will address this. The bypass DSQ is
kept separate from the local DSQ to enable that load balancing: local DSQs are
protected by rq locks, which prevents efficient scanning and transfer across
CPUs and is especially problematic when the system is already contended.
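As a rough illustration of that locking difference, a future balancer could
peek at a remote CPU's bypass DSQ using only the DSQ's own lock.
bypass_lb_should_pull() below is a hypothetical sketch, not part of this
patch:

	static bool bypass_lb_should_pull(int src_cpu)
	{
		struct scx_dispatch_q *dsq = &cpu_rq(src_cpu)->scx.bypass_dsq;
		unsigned long flags;
		bool pull;

		/*
		 * No rq lock is needed to peek: the bypass DSQ carries its own
		 * raw_spinlock_t. Scanning a local DSQ instead would require
		 * taking src_cpu's rq lock, serializing with its scheduling
		 * hot path -- exactly what hurts on an already contended box.
		 */
		raw_spin_lock_irqsave(&dsq->lock, flags);
		pull = dsq->nr > 1;
		raw_spin_unlock_irqrestore(&dsq->lock, flags);
		return pull;
	}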
v2: Clarified why bypass DSQ is separate from local DSQ (Andrea Righi).
Reported-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -57,6 +57,7 @@ enum scx_dsq_id_flags {
 	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
 	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
 	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
+	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
 	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
 	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
 };
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1298,7 +1298,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 
 	if (scx_rq_bypassing(rq)) {
 		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
-		goto global;
+		goto bypass;
 	}
 
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1356,6 +1356,9 @@ local:
 global:
 	dsq = find_global_dsq(sch, p);
 	goto enqueue;
+bypass:
+	dsq = &task_rq(p)->scx.bypass_dsq;
+	goto enqueue;
 
 enqueue:
 	/*
@@ -2154,8 +2157,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (consume_global_dsq(sch, rq))
 		goto has_tasks;
 
-	if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
-	    scx_rq_bypassing(rq) || !scx_rq_online(rq))
+	if (scx_rq_bypassing(rq)) {
+		if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+			goto has_tasks;
+		else
+			goto no_tasks;
+	}
+
+	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
 		goto no_tasks;
 
 	dspc->rq = rq;
@@ -5371,6 +5380,7 @@ void __init init_sched_ext_class(void)
 		int n = cpu_to_node(cpu);
 
 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,6 +808,7 @@ struct scx_rq {
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
+	struct scx_dispatch_q	bypass_dsq;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
 