sched_ext: Add scx_cpu0 example scheduler

Add scx_cpu0, a simple scheduler that queues all tasks to a single DSQ and
only dispatches them from CPU0 in FIFO order. This is useful for testing bypass
behavior when many tasks are concentrated on a single CPU. If the load balancer
doesn't work, bypass mode can trigger task hangs or RCU stalls as the queue is
long and there's only one CPU working on it.

v2: Check whether task is on CPU0 at enqueue using scx_bpf_task_cpu() instead
    of nr_cpus_allowed (Andrea Righi).

Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Cc: Emil Tsalapatis <etsal@meta.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Author: Tejun Heo <tj@kernel.org>
Date:   2025-11-11 09:18:13 -10:00
Commit: c948d9f80c (parent: 582f700e1b)
3 changed files, 195 insertions(+), 1 deletion(-)

tools/sched_ext/Makefile

@@ -187,7 +187,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
-c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
$(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(BINDIR)/%: \

tools/sched_ext/scx_cpu0.bpf.c (new file)

@@ -0,0 +1,88 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* A CPU0 scheduler.
*
* This scheduler queues all tasks to a shared DSQ and only dispatches them on
* CPU0 in FIFO order. This is useful for testing bypass behavior when many
* tasks are concentrated on a single CPU. If the load balancer doesn't work,
* bypass mode can trigger task hangs or RCU stalls as the queue is long and
* there's only one CPU working on it.
*
* - Statistics tracking how many tasks are queued to local and CPU0 DSQs.
* - Termination notification for userspace.
*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
*/
#include <scx/common.bpf.h>
char _license[] SEC("license") = "GPL";
const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
UEI_DEFINE(uei);
/*
* We create a custom DSQ with ID 0 that we dispatch to and consume from on
* CPU0.
*/
#define DSQ_CPU0 0
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u64));
	__uint(max_entries, 2);			/* [local, cpu0] */
} stats SEC(".maps");
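
/* Bump the per-CPU counter at @idx; userspace sums the per-CPU copies. */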
static void stat_inc(u32 idx)
{
	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
	if (cnt_p)
		(*cnt_p)++;
}
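
/*
 * Always steer wakeups to CPU0. Tasks that can't run on CPU0 are dealt with
 * in cpu0_enqueue() below.
 */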
s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	return 0;
}

void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * select_cpu() always picks CPU0, so if @p is not on CPU0 at this
	 * point, it can't run on CPU0. Queue it on the local DSQ of whichever
	 * CPU it's currently on.
	 */
	if (scx_bpf_task_cpu(p) != 0) {
		stat_inc(0);	/* count local queueing */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return;
	}

	stat_inc(1);	/* count cpu0 queueing */
	scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags);
}
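
/*
 * Only CPU0 consumes DSQ_CPU0. Other CPUs find nothing to pull here and run
 * only what is already on their local DSQs.
 */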
void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
{
	if (cpu == 0)
		scx_bpf_dsq_move_to_local(DSQ_CPU0);
}
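
/* Create the shared DSQ at init time; -1 means no NUMA node preference. */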
s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
{
	return scx_bpf_create_dsq(DSQ_CPU0, -1);
}
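
/* Record the exit reason so the userspace loader can report it. */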
void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(cpu0_ops,
	       .select_cpu = (void *)cpu0_select_cpu,
	       .enqueue = (void *)cpu0_enqueue,
	       .dispatch = (void *)cpu0_dispatch,
	       .init = (void *)cpu0_init,
	       .exit = (void *)cpu0_exit,
	       .name = "cpu0");

tools/sched_ext/scx_cpu0.c (new file)

@@ -0,0 +1,106 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
*/
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <assert.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include "scx_cpu0.bpf.skel.h"
const char help_fmt[] =
"A cpu0 sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-v]\n"
"\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;

static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (level == LIBBPF_DEBUG && !verbose)
		return 0;
	return vfprintf(stderr, format, args);
}

static void sigint_handler(int sig)
{
	exit_req = 1;
}
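
/* stats is a BPF_MAP_TYPE_PERCPU_ARRAY; sum each counter across all CPUs. */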
static void read_stats(struct scx_cpu0 *skel, __u64 *stats)
{
	int nr_cpus = libbpf_num_possible_cpus();
	assert(nr_cpus > 0);
	__u64 cnts[2][nr_cpus];
	__u32 idx;

	memset(stats, 0, sizeof(stats[0]) * 2);

	for (idx = 0; idx < 2; idx++) {
		int ret, cpu;

		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
					  &idx, cnts[idx]);
		if (ret < 0)
			continue;
		for (cpu = 0; cpu < nr_cpus; cpu++)
			stats[idx] += cnts[idx][cpu];
	}
}

int main(int argc, char **argv)
{
	struct scx_cpu0 *skel;
	struct bpf_link *link;
	__u32 opt;
	__u64 ecode;

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);
restart:
	skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);

	skel->rodata->nr_cpus = libbpf_num_possible_cpus();

	while ((opt = getopt(argc, argv, "vh")) != -1) {
		switch (opt) {
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, help_fmt, basename(argv[0]));
			return opt != 'h';
		}
	}

	SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei);
	link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0);

	while (!exit_req && !UEI_EXITED(skel, uei)) {
		__u64 stats[2];

		read_stats(skel, stats);
		printf("local=%llu cpu0=%llu\n", stats[0], stats[1]);
		fflush(stdout);
		sleep(1);
	}

	bpf_link__destroy(link);
	ecode = UEI_REPORT(skel, uei);
	scx_cpu0__destroy(skel);

	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;
}
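
One quick way to exercise the scenario the commit message describes is to pile CPU-bound tasks onto DSQ_CPU0 while scx_cpu0 is attached, then stop the scheduler so bypass mode has to drain a long queue that was being serviced by a single CPU. A minimal, hypothetical hog generator for that purpose (not part of this commit) could look like the following sketch:

/* hog.c: fork N busy loops; under scx_cpu0 they all pile onto CPU0 */
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int i, nr = argc > 1 ? atoi(argv[1]) : 64;

	for (i = 0; i < nr; i++) {
		if (fork() == 0)
			for (;;)
				;	/* burn CPU until killed */
	}
	pause();	/* parent idles; kill the process group to clean up */
	return 0;
}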