mirror of
https://github.com/torvalds/linux.git
synced 2025-12-07 20:06:24 +00:00
Pull rseq updates from Thomas Gleixner:
"A large overhaul of the restartable sequences and CID management:
The recent enablement of RSEQ in glibc resulted in regressions which
are caused by the related overhead. It turned out that the decision to
invoke the exit to user work was not really a decision. More or less
each context switch caused that. There is a long list of small issues
which sums up nicely and results in a 3-4% regression in I/O
benchmarks.
The other detail which caused issues due to extra work in context
switch and task migration is the CID (memory context ID) management.
It also requires the use of task work to consolidate the CID space,
which is executed in the context of an arbitrary task and results in
sporadic uncontrolled exit latencies.
The rewrite addresses this by:
- Removing deprecated and long unsupported functionality
- Moving the related data into dedicated data structures which are
optimized for fast path processing.
- Caching values so actual decisions can be made
- Replacing the current implementation with an optimized inlined
variant.
- Separating fast and slow path for architectures which use the
generic entry code, so that only fault and error handling goes into
the TIF_NOTIFY_RESUME handler.
- Rewriting the CID management so that it becomes mostly invisible in
the context switch path. That moves the work of switching modes
into the fork/exit path, which is a reasonable tradeoff. That work
is only required when a process creates more threads than the
cpuset it is allowed to run on or when enough threads exit after
that. An artificial thread pool benchmark which triggers this did
not degrade, it actually improved significantly.
The main effect in migration heavy scenarios is that runqueue lock
held time and therefore contention goes down significantly"
* tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
sched/mmcid: Switch over to the new mechanism
sched/mmcid: Implement deferred mode change
irqwork: Move data struct to a types header
sched/mmcid: Provide CID ownership mode fixup functions
sched/mmcid: Provide new scheduler CID mechanism
sched/mmcid: Introduce per task/CPU ownership infrastructure
sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex
sched/mmcid: Provide precomputed maximal value
sched/mmcid: Move initialization out of line
signal: Move MMCID exit out of sighand lock
sched/mmcid: Convert mm CID mask to a bitmap
cpumask: Cache num_possible_cpus()
sched/mmcid: Use cpumask_weighted_or()
cpumask: Introduce cpumask_weighted_or()
sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
sched/mmcid: Move scheduler code out of global header
sched: Fixup whitespace damage
sched/mmcid: Cacheline align MM CID storage
sched/mmcid: Use proper data structures
sched/mmcid: Revert the complex CID management
...
266 lines
7.5 KiB
C
266 lines
7.5 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/init_task.h>
|
|
#include <linux/export.h>
|
|
#include <linux/mqueue.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/sysctl.h>
|
|
#include <linux/sched/rt.h>
|
|
#include <linux/sched/task.h>
|
|
#include <linux/sched/ext.h>
|
|
#include <linux/init.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/audit.h>
|
|
#include <linux/numa.h>
|
|
#include <linux/scs.h>
|
|
#include <linux/plist.h>
|
|
|
|
#include <linux/uaccess.h>
|
|
|
|
static struct signal_struct init_signals = {
|
|
.nr_threads = 1,
|
|
.thread_head = LIST_HEAD_INIT(init_task.thread_node),
|
|
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit),
|
|
.shared_pending = {
|
|
.list = LIST_HEAD_INIT(init_signals.shared_pending.list),
|
|
.signal = {{0}}
|
|
},
|
|
.multiprocess = HLIST_HEAD_INIT,
|
|
.rlim = INIT_RLIMITS,
|
|
#ifdef CONFIG_CGROUPS
|
|
.cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem),
|
|
#endif
|
|
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
|
|
.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
|
|
#ifdef CONFIG_POSIX_TIMERS
|
|
.posix_timers = HLIST_HEAD_INIT,
|
|
.ignored_posix_timers = HLIST_HEAD_INIT,
|
|
.cputimer = {
|
|
.cputime_atomic = INIT_CPUTIME_ATOMIC,
|
|
},
|
|
#endif
|
|
INIT_CPU_TIMERS(init_signals)
|
|
.pids = {
|
|
[PIDTYPE_PID] = &init_struct_pid,
|
|
[PIDTYPE_TGID] = &init_struct_pid,
|
|
[PIDTYPE_PGID] = &init_struct_pid,
|
|
[PIDTYPE_SID] = &init_struct_pid,
|
|
},
|
|
INIT_PREV_CPUTIME(init_signals)
|
|
};
|
|
|
|
static struct sighand_struct init_sighand = {
|
|
.count = REFCOUNT_INIT(1),
|
|
.action = { { { .sa_handler = SIG_DFL, } }, },
|
|
.siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
|
|
.signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
|
|
};
|
|
|
|
#ifdef CONFIG_SHADOW_CALL_STACK
/*
 * Statically allocated shadow call stack of SCS_SIZE bytes, stored as an
 * array of longs. The last slot is preset to SCS_END_MAGIC; all other slots
 * are implicitly zero.
 */
unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
	[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
};
#endif
|
|
|
|
/* init to 2 - one for init_task, one to ensure it is never freed */
|
|
static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
|
|
|
|
/*
|
|
* The initial credentials for the initial task
|
|
*/
|
|
static struct cred init_cred = {
|
|
.usage = ATOMIC_INIT(4),
|
|
.uid = GLOBAL_ROOT_UID,
|
|
.gid = GLOBAL_ROOT_GID,
|
|
.suid = GLOBAL_ROOT_UID,
|
|
.sgid = GLOBAL_ROOT_GID,
|
|
.euid = GLOBAL_ROOT_UID,
|
|
.egid = GLOBAL_ROOT_GID,
|
|
.fsuid = GLOBAL_ROOT_UID,
|
|
.fsgid = GLOBAL_ROOT_GID,
|
|
.securebits = SECUREBITS_DEFAULT,
|
|
.cap_inheritable = CAP_EMPTY_SET,
|
|
.cap_permitted = CAP_FULL_SET,
|
|
.cap_effective = CAP_FULL_SET,
|
|
.cap_bset = CAP_FULL_SET,
|
|
.user = INIT_USER,
|
|
.user_ns = &init_user_ns,
|
|
.group_info = &init_groups,
|
|
.ucounts = &init_ucounts,
|
|
};
|
|
|
|
/*
|
|
* Set up the first task table, touch at your own risk!. Base=0,
|
|
* limit=0x1fffff (=2MB)
|
|
*/
|
|
struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
|
|
#ifdef CONFIG_THREAD_INFO_IN_TASK
|
|
.thread_info = INIT_THREAD_INFO(init_task),
|
|
.stack_refcount = REFCOUNT_INIT(1),
|
|
#endif
|
|
.__state = 0,
|
|
.stack = init_stack,
|
|
.usage = REFCOUNT_INIT(2),
|
|
.flags = PF_KTHREAD,
|
|
.prio = MAX_PRIO - 20,
|
|
.static_prio = MAX_PRIO - 20,
|
|
.normal_prio = MAX_PRIO - 20,
|
|
.policy = SCHED_NORMAL,
|
|
.cpus_ptr = &init_task.cpus_mask,
|
|
.user_cpus_ptr = NULL,
|
|
.cpus_mask = CPU_MASK_ALL,
|
|
.max_allowed_capacity = SCHED_CAPACITY_SCALE,
|
|
.nr_cpus_allowed= NR_CPUS,
|
|
.mm = NULL,
|
|
.active_mm = &init_mm,
|
|
.faults_disabled_mapping = NULL,
|
|
.restart_block = {
|
|
.fn = do_no_restart_syscall,
|
|
},
|
|
.se = {
|
|
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
|
|
},
|
|
.rt = {
|
|
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
|
|
.time_slice = RR_TIMESLICE,
|
|
},
|
|
.tasks = LIST_HEAD_INIT(init_task.tasks),
|
|
#ifdef CONFIG_SMP
|
|
.pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
|
|
#endif
|
|
#ifdef CONFIG_CGROUP_SCHED
|
|
.sched_task_group = &root_task_group,
|
|
#endif
|
|
#ifdef CONFIG_SCHED_CLASS_EXT
|
|
.scx = {
|
|
.dsq_list.node = LIST_HEAD_INIT(init_task.scx.dsq_list.node),
|
|
.sticky_cpu = -1,
|
|
.holding_cpu = -1,
|
|
.runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node),
|
|
.runnable_at = INITIAL_JIFFIES,
|
|
.ddsp_dsq_id = SCX_DSQ_INVALID,
|
|
.slice = SCX_SLICE_DFL,
|
|
},
|
|
#endif
|
|
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
|
|
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
|
|
.real_parent = &init_task,
|
|
.parent = &init_task,
|
|
.children = LIST_HEAD_INIT(init_task.children),
|
|
.sibling = LIST_HEAD_INIT(init_task.sibling),
|
|
.group_leader = &init_task,
|
|
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
|
|
RCU_POINTER_INITIALIZER(cred, &init_cred),
|
|
.comm = INIT_TASK_COMM,
|
|
.thread = INIT_THREAD,
|
|
.fs = &init_fs,
|
|
.files = &init_files,
|
|
#ifdef CONFIG_IO_URING
|
|
.io_uring = NULL,
|
|
#endif
|
|
.signal = &init_signals,
|
|
.sighand = &init_sighand,
|
|
.nsproxy = &init_nsproxy,
|
|
.pending = {
|
|
.list = LIST_HEAD_INIT(init_task.pending.list),
|
|
.signal = {{0}}
|
|
},
|
|
.blocked = {{0}},
|
|
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
|
|
.journal_info = NULL,
|
|
INIT_CPU_TIMERS(init_task)
|
|
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
|
|
.timer_slack_ns = 50000, /* 50 usec default slack */
|
|
.thread_pid = &init_struct_pid,
|
|
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
|
|
#ifdef CONFIG_AUDIT
|
|
.loginuid = INVALID_UID,
|
|
.sessionid = AUDIT_SID_UNSET,
|
|
#endif
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
|
|
.perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
|
|
#endif
|
|
#ifdef CONFIG_PREEMPT_RCU
|
|
.rcu_read_lock_nesting = 0,
|
|
.rcu_read_unlock_special.s = 0,
|
|
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
|
|
.rcu_blocked_node = NULL,
|
|
#endif
|
|
#ifdef CONFIG_TASKS_RCU
|
|
.rcu_tasks_holdout = false,
|
|
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
|
|
.rcu_tasks_idle_cpu = -1,
|
|
.rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
|
|
#endif
|
|
#ifdef CONFIG_TASKS_TRACE_RCU
|
|
.trc_reader_nesting = 0,
|
|
.trc_reader_special.s = 0,
|
|
.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
|
|
.trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
|
|
#endif
|
|
#ifdef CONFIG_CPUSETS
|
|
.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
|
|
&init_task.alloc_lock),
|
|
#endif
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
.pi_waiters = RB_ROOT_CACHED,
|
|
.pi_top_task = NULL,
|
|
#endif
|
|
INIT_PREV_CPUTIME(init_task)
|
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
|
.vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount),
|
|
.vtime.starttime = 0,
|
|
.vtime.state = VTIME_SYS,
|
|
#endif
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
.numa_preferred_nid = NUMA_NO_NODE,
|
|
.numa_group = NULL,
|
|
.numa_faults = NULL,
|
|
#endif
|
|
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
|
.kasan_depth = 1,
|
|
#endif
|
|
#ifdef CONFIG_KCSAN
|
|
.kcsan_ctx = {
|
|
.scoped_accesses = {LIST_POISON1, NULL},
|
|
},
|
|
#endif
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
.softirqs_enabled = 1,
|
|
#endif
|
|
#ifdef CONFIG_LOCKDEP
|
|
.lockdep_depth = 0, /* no locks held yet */
|
|
.curr_chain_key = INITIAL_CHAIN_KEY,
|
|
.lockdep_recursion = 0,
|
|
#endif
|
|
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
|
|
.ret_stack = NULL,
|
|
.tracing_graph_pause = ATOMIC_INIT(0),
|
|
#endif
|
|
#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
|
|
.trace_recursion = 0,
|
|
#endif
|
|
#ifdef CONFIG_LIVEPATCH
|
|
.patch_state = KLP_TRANSITION_IDLE,
|
|
#endif
|
|
#ifdef CONFIG_SECURITY
|
|
.security = NULL,
|
|
#endif
|
|
#ifdef CONFIG_SECCOMP_FILTER
|
|
.seccomp = { .filter_count = ATOMIC_INIT(0) },
|
|
#endif
|
|
#ifdef CONFIG_SCHED_MM_CID
|
|
.mm_cid = { .cid = MM_CID_UNSET, },
|
|
#endif
|
|
};
|
|
EXPORT_SYMBOL(init_task);
|
|
|
|
/*
|
|
* Initial thread structure. Alignment of this is handled by a special
|
|
* linker map entry.
|
|
*/
|
|
#ifndef CONFIG_THREAD_INFO_IN_TASK
|
|
struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task);
|
|
#endif
|