Mirror of https://github.com/torvalds/linux.git (synced 2025-12-07 11:56:58 +00:00)
Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Scalability and load-balancing improvements:
- Enable scheduler feature NEXT_BUDDY (Mel Gorman)
- Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman)
- Skip sched_balance_running cmpxchg when balance is not due (Tim
Chen)
- Implement generic code for architecture specific sched domain NUMA
distances (Tim Chen)
- Optimize the NUMA distances of the sched-domains builds of Intel
Granite Rapids (GNR) and Clearwater Forest (CWF) platforms (Tim
Chen)
- Implement proportional newidle balance: a randomized algorithm that
runs newidle balancing proportional to its success rate. (Peter
Zijlstra)
Scheduler infrastructure changes:
- Implement the 'sched_change' scoped_guard() pattern for the entire
scheduler (Peter Zijlstra)
- More broadly utilize the sched_change guard (Peter Zijlstra)
- Add support to pick functions to take runqueue-flags (Joel
Fernandes)
- Provide and use set_need_resched_current() (Peter Zijlstra)
Fair scheduling enhancements:
- Forfeit vruntime on yield (Fernand Sieber)
- Only update stats for allowed CPUs when looking for dst group (Adam
Li)
CPU-core scheduling enhancements:
- Optimize core cookie matching check (Fernand Sieber)
Deadline scheduler fixes:
- Only set free_cpus for online runqueues (Doug Berger)
- Fix dl_server time accounting (Peter Zijlstra)
- Fix dl_server stop condition (Peter Zijlstra)
Proxy scheduling fixes:
- Yield the donor task (Fernand Sieber)
Fixes and cleanups:
- Fix do_set_cpus_allowed() locking (Peter Zijlstra)
- Fix migrate_disable_switch() locking (Peter Zijlstra)
- Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
(Hao Jia)
- Increase sched_tick_remote timeout (Phil Auld)
- sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth
Hegde)
- sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)"
* tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
sched: Provide and use set_need_resched_current()
sched/fair: Proportional newidle balance
sched/fair: Small cleanup to update_newidle_cost()
sched/fair: Small cleanup to sched_balance_newidle()
sched/fair: Revert max_newidle_lb_cost bump
sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals
sched/fair: Enable scheduler feature NEXT_BUDDY
sched: Increase sched_tick_remote timeout
sched/fair: Have SD_SERIALIZE affect newidle balancing
sched/fair: Skip sched_balance_running cmpxchg when balance is not due
sched/deadline: Minor cleanup in select_task_rq_dl()
sched/deadline: Use cpumask_weight_and() in dl_bw_cpus
sched/deadline: Document dl_server
sched/deadline: Fix dl_server stop condition
sched/deadline: Fix dl_server time accounting
sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
sched/eevdf: Fix min_vruntime vs avg_vruntime
sched/core: Add comment explaining force-idle vruntime snapshots
sched/core: Optimize core cookie matching check
sched/proxy: Yield the donor task
...
@@ -199,8 +199,7 @@ block:
* return to userspace schedule() to block.
*/
__set_current_state(TASK_UNINTERRUPTIBLE);
set_tsk_need_resched(tsk);
set_preempt_need_resched();
set_need_resched_current();
}
}
out:
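The hunk above is representative of most call-site changes in this pull: an open-coded pair of set_tsk_need_resched() and set_preempt_need_resched() on the current task is collapsed into the new set_need_resched_current() helper (its definition appears in the sched.h hunk further down). A minimal sketch of the conversion, with illustrative function names, assuming the caller already runs with interrupts disabled as the helper's lockdep assertion requires:

/* Illustrative sketch only -- mirrors the pattern used throughout the series. */
static void kick_resched_old(void)
{
        set_tsk_need_resched(current);
        set_preempt_need_resched();
}

static void kick_resched_new(void)
{
        set_need_resched_current();     /* asserts irqs are disabled */
}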
@@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick

extern int arch_sched_node_distance(int from, int to);

#endif /* _ASM_X86_TOPOLOGY_H */
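arch_sched_node_distance() is the per-architecture hook behind the 'generic code for architecture specific sched domain NUMA distances' item in the merge message; x86 overrides it below for GNR/CWF. The generic fallback is not part of this excerpt, so the sketch below is only an assumption about its likely shape: a weak default that returns the firmware-provided distance unchanged.

/* Hypothetical sketch of the generic default, not the actual topology.c code. */
int __weak arch_sched_node_distance(int from, int to)
{
        return node_distance(from, to);
}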
@@ -515,6 +515,76 @@ static void __init build_sched_topology(void)
set_sched_topology(topology);
}

#ifdef CONFIG_NUMA
static int sched_avg_remote_distance;
static int avg_remote_numa_distance(void)
{
int i, j;
int distance, nr_remote, total_distance;

if (sched_avg_remote_distance > 0)
return sched_avg_remote_distance;

nr_remote = 0;
total_distance = 0;
for_each_node_state(i, N_CPU) {
for_each_node_state(j, N_CPU) {
distance = node_distance(i, j);

if (distance >= REMOTE_DISTANCE) {
nr_remote++;
total_distance += distance;
}
}
}
if (nr_remote)
sched_avg_remote_distance = total_distance / nr_remote;
else
sched_avg_remote_distance = REMOTE_DISTANCE;

return sched_avg_remote_distance;
}

int arch_sched_node_distance(int from, int to)
{
int d = node_distance(from, to);

switch (boot_cpu_data.x86_vfm) {
case INTEL_GRANITERAPIDS_X:
case INTEL_ATOM_DARKMONT_X:
if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
d < REMOTE_DISTANCE)
return d;

/*
* With SNC enabled, there could be too many levels of remote
* NUMA node distances, creating NUMA domain levels
* including local nodes and partial remote nodes.
*
* Trim finer distance tuning for NUMA nodes in remote package
* for the purpose of building sched domains. Group NUMA nodes
* in the remote package in the same sched group.
* Simplify NUMA domains and avoid extra NUMA levels including
* different remote NUMA nodes and local nodes.
*
* GNR and CWF don't expect systems with more than 2 packages
* and more than 2 hops between packages. Single average remote
* distance won't be appropriate if there are more than 2
* packages as average distance to different remote packages
* could be different.
*/
WARN_ONCE(topology_max_packages() > 2,
"sched: Expect only up to 2 packages for GNR or CWF, "
"but saw %d packages when building sched domains.",
topology_max_packages());

d = avg_remote_numa_distance();
}
return d;
}
#endif /* CONFIG_NUMA */

void set_cpu_sibling_map(int cpu)
{
bool has_smt = __max_threads_per_core > 1;
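To see why averaging all remote distances simplifies the sched-domain build, consider a hypothetical two-package system with SNC enabled, i.e. two NUMA nodes per package. The standalone sketch below (made-up SLIT values) applies the same averaging rule as avg_remote_numa_distance(): before, the remote distances 21 and 23 would each create their own NUMA level mixing local and partial-remote nodes; after, every remote node reports one common distance, so the whole remote package lands in a single sched group.

#include <stdio.h>

#define REMOTE_DISTANCE 20

/* Hypothetical SLIT: nodes 0,1 in package A; nodes 2,3 in package B. */
static const int dist[4][4] = {
        { 10, 12, 21, 23 },
        { 12, 10, 23, 21 },
        { 21, 23, 10, 12 },
        { 23, 21, 12, 10 },
};

int main(void)
{
        int i, j, nr_remote = 0, total = 0, avg;

        for (i = 0; i < 4; i++)
                for (j = 0; j < 4; j++)
                        if (dist[i][j] >= REMOTE_DISTANCE) {
                                total += dist[i][j];
                                nr_remote++;
                        }

        avg = nr_remote ? total / nr_remote : REMOTE_DISTANCE;
        printf("remote distances {21, 23} collapse to %d\n", avg);     /* prints 22 */
        return 0;
}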
@@ -348,6 +348,11 @@ _label: \
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond

#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \
__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
{ return (void *)1; }

#define __GUARD_IS_ERR(_ptr) \
({ \
unsigned long _rc = (__force unsigned long)(_ptr); \
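DEFINE_CLASS_IS_UNCONDITIONAL() marks a guard class whose constructor cannot fail (its lock_ptr is a constant non-NULL cookie), and __GUARD_IS_ERR() lets pointer-based guards distinguish ERR_PTR() results. For orientation, here is how a guard class is normally defined and consumed with the existing cleanup.h machinery; the my_mutex name is purely illustrative (the real mutex guard already ships with the kernel):

/* Usage sketch of the guard pattern these macros extend. */
DEFINE_GUARD(my_mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))

static void update_counter(struct mutex *lock, int *counter)
{
        guard(my_mutex)(lock);          /* released on every return path */
        (*counter)++;
}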
@@ -637,8 +637,8 @@ struct sched_rt_entity {
#endif
} __randomize_layout;

typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
struct rq_flags;
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);

struct sched_dl_entity {
struct rb_node rb_node;
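The dl_server_pick_f callback gains a struct rq_flags * argument, matching the 'Add support to pick functions to take runqueue-flags' item: the pick path can now drop and re-pin the runqueue lock and report a retry instead of being forced to pick atomically. The sketch below only illustrates the new signature; the callback name and the assumption that the fair-class pick helper takes the same rf parameter are not confirmed by this excerpt:

/* Hypothetical server-pick callback under the new typedef. */
static struct task_struct *
example_server_pick(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
        /* rf allows the underlying class pick to unlock/relock the rq. */
        return pick_task_fair(dl_se->rq, rf);
}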
@@ -685,20 +685,22 @@ struct sched_dl_entity {
*
* @dl_server tells if this is a server entity.
*
* @dl_defer tells if this is a deferred or regular server. For
* now only defer server exists.
*
* @dl_defer_armed tells if the deferrable server is waiting
* for the replenishment timer to activate it.
*
* @dl_server_active tells if the dlserver is active(started).
* dlserver is started on first cfs enqueue on an idle runqueue
* and is stopped when a dequeue results in 0 cfs tasks on the
* runqueue. In other words, dlserver is active only when cpu's
* runqueue has atleast one cfs task.
*
* @dl_defer tells if this is a deferred or regular server. For
* now only defer server exists.
*
* @dl_defer_armed tells if the deferrable server is waiting
* for the replenishment timer to activate it.
*
* @dl_defer_running tells if the deferrable server is actually
* running, skipping the defer phase.
*
* @dl_defer_idle tracks idle state
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;

@@ -709,6 +711,7 @@ struct sched_dl_entity {
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
unsigned int dl_defer_idle : 1;

/*
* Bandwidth enforcement timer. Each -deadline task has its

@@ -730,9 +733,6 @@ struct sched_dl_entity {
* dl_server_update().
*
* @rq the runqueue this server is for
*
* @server_has_tasks() returns true if @server_pick return a
* runnable task.
*/
struct rq *rq;
dl_server_pick_f server_pick_task;
@@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);

/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);

/**
* set_cpus_allowed_ptr - set CPU affinity mask of a task
@@ -2058,6 +2058,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}

static inline void set_need_resched_current(void)
{
lockdep_assert_irqs_disabled();
set_tsk_need_resched(current);
set_preempt_need_resched();
}

/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
@@ -92,6 +92,9 @@ struct sched_domain {
unsigned int nr_balance_failed; /* initialise to 0 */

/* idle_balance() stats */
unsigned int newidle_call;
unsigned int newidle_success;
unsigned int newidle_ratio;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
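newidle_call, newidle_success and newidle_ratio are the bookkeeping behind the 'proportional newidle balance' change: newidle balancing on a domain is attempted with a probability that follows its recent success rate, using the per-CPU sched_rnd_state PRNG seeded in sched_init_smp() (visible later in this diff). The exact fair.c logic is not shown here, so the gate below is only a conceptual sketch that assumes newidle_ratio is kept as a 0..128 success ratio:

/* Conceptual sketch of a success-proportional, randomized gate. */
static bool newidle_worth_trying(struct sched_domain *sd)
{
        u32 r = prandom_u32_state(this_cpu_ptr(&sched_rnd_state)) & 127;

        /* The more often newidle balancing recently helped, the more often we run it. */
        return r < sd->newidle_ratio;
}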
@@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
do_set_cpus_allowed(tsk, cs_mask);
set_cpus_allowed_force(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);

static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
unsigned long flags;

if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}

scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
set_cpus_allowed_force(p, mask);

/* It's safe because the task is inactive. */
raw_spin_lock_irqsave(&p->pi_lock, flags);
do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)

@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
unsigned long flags;
int ret = 0;

if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {

@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
kthread_fetch_affinity(kthread, affinity);

/* It's safe because the task is inactive. */
raw_spin_lock_irqsave(&p->pi_lock, flags);
do_set_cpus_allowed(p, affinity);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
set_cpus_allowed_force(p, affinity);

mutex_unlock(&kthreads_hotplug_lock);
out:
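Both kthread call sites drop the open-coded pi_lock section around do_set_cpus_allowed() in favour of a scoped guard around set_cpus_allowed_force(), which internally takes the runqueue lock (see the scoped __task_rq_lock guard in the sched/core.c hunk further down) and, per its header comment, deliberately discards any user-requested affinity. A usage sketch contrasting it with the general-purpose interface:

/* Sketch of the two entry points after this series. */
static void pin_kthread_hard(struct task_struct *kt, int cpu)
{
        /* Forced affinity for kthread_bind()-style users; drops user affinity. */
        scoped_guard (raw_spinlock_irqsave, &kt->pi_lock)
                set_cpus_allowed_force(kt, cpumask_of(cpu));
}

static int move_task_politely(struct task_struct *p, const struct cpumask *mask)
{
        /* Preferred interface: validates the mask and respects user affinity. */
        return set_cpus_allowed_ptr(p, mask);
}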
@@ -70,12 +70,10 @@ void rcu_qs(void)
*/
void rcu_sched_clock_irq(int user)
{
if (user) {
if (user)
rcu_qs();
} else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
set_tsk_need_resched(current);
set_preempt_need_resched();
}
else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail)
set_need_resched_current();
}

/*
@@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user)
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
/* Idle and userspace execution already are quiescent states. */
if (!rcu_is_cpu_rrupt_from_idle() && !user) {
set_tsk_need_resched(current);
set_preempt_need_resched();
}
if (!rcu_is_cpu_rrupt_from_idle() && !user)
set_need_resched_current();
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
rcu_flavor_sched_clock_irq(user);

@@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work)
/* Perform RCU core processing work for the current CPU. */
static __latent_entropy void rcu_core(void)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;

@@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void)
if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
set_tsk_need_resched(current);
set_preempt_need_resched();
guard(irqsave)();
set_need_resched_current();
}

/* Update RCU state based on any recent quiescent states. */

@@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
local_irq_save(flags);
guard(irqsave)();
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
local_irq_restore(flags);
}

rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());

@@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void)
__this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
/* Store .exp before .rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
set_tsk_need_resched(current);
set_preempt_need_resched();
set_need_resched_current();
}

#ifdef CONFIG_PREEMPT_RCU

@@ -753,8 +753,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
// Also if no expediting and no possible deboosting,
// slow is OK. Plus nohz_full CPUs eventually get
// tick enabled.
set_tsk_need_resched(current);
set_preempt_need_resched();
set_need_resched_current();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
cpu_online(rdp->cpu)) {

@@ -813,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user)
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
if (rcu_preempt_need_deferred_qs(t)) {
set_tsk_need_resched(t);
set_preempt_need_resched();
}
if (rcu_preempt_need_deferred_qs(t))
set_need_resched_current();
} else if (rcu_preempt_need_deferred_qs(t)) {
rcu_preempt_deferred_qs(t); /* Report deferred QS. */
return;

@@ -763,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
* progress and it could be we're stuck in kernel space without context
* switches for an entirely unreasonable amount of time.
*/
set_tsk_need_resched(current);
set_preempt_need_resched();
set_need_resched_current();
}

static bool csd_lock_suppress_rcu_stall;
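Several of the RCU hunks above also swap open-coded local_irq_save()/local_irq_restore() pairs for guard(irqsave)(), so the interrupt state is restored automatically when the scope ends and the flags local disappears. A minimal before/after sketch of that conversion (function names are illustrative):

/* Before: explicit flags plus a restore on every exit path. */
static void do_work_old(struct rcu_node *rnp, struct rcu_data *rdp)
{
        unsigned long flags;

        local_irq_save(flags);
        rcu_accelerate_cbs_unlocked(rnp, rdp);
        local_irq_restore(flags);
}

/* After: the guard restores IRQ state when it goes out of scope. */
static void do_work_new(struct rcu_node *rnp, struct rcu_data *rdp)
{
        guard(irqsave)();
        rcu_accelerate_cbs_unlocked(rnp, rdp);
}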
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
|
||||
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
||||
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
|
||||
|
||||
#ifdef CONFIG_SCHED_PROXY_EXEC
|
||||
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
|
||||
@@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state);
|
||||
*
|
||||
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
|
||||
*
|
||||
* is set by activate_task() and cleared by deactivate_task(), under
|
||||
* rq->lock. Non-zero indicates the task is runnable, the special
|
||||
* is set by activate_task() and cleared by deactivate_task()/block_task(),
|
||||
* under rq->lock. Non-zero indicates the task is runnable, the special
|
||||
* ON_RQ_MIGRATING state is used for migration without holding both
|
||||
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
|
||||
*
|
||||
@@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
*/
|
||||
uclamp_rq_inc(rq, p, flags);
|
||||
|
||||
rq->queue_mask |= p->sched_class->queue_mask;
|
||||
p->sched_class->enqueue_task(rq, p, flags);
|
||||
|
||||
psi_enqueue(p, flags);
|
||||
@@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
* and mark the task ->sched_delayed.
|
||||
*/
|
||||
uclamp_rq_dec(rq, p);
|
||||
rq->queue_mask |= p->sched_class->queue_mask;
|
||||
return p->sched_class->dequeue_task(rq, p, flags);
|
||||
}
|
||||
|
||||
@@ -2169,37 +2172,6 @@ inline int task_curr(const struct task_struct *p)
|
||||
return cpu_curr(task_cpu(p)) == p;
|
||||
}
|
||||
|
||||
/*
|
||||
* ->switching_to() is called with the pi_lock and rq_lock held and must not
|
||||
* mess with locking.
|
||||
*/
|
||||
void check_class_changing(struct rq *rq, struct task_struct *p,
|
||||
const struct sched_class *prev_class)
|
||||
{
|
||||
if (prev_class != p->sched_class && p->sched_class->switching_to)
|
||||
p->sched_class->switching_to(rq, p);
|
||||
}
|
||||
|
||||
/*
|
||||
* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
|
||||
* use the balance_callback list if you want balancing.
|
||||
*
|
||||
* this means any call to check_class_changed() must be followed by a call to
|
||||
* balance_callback().
|
||||
*/
|
||||
void check_class_changed(struct rq *rq, struct task_struct *p,
|
||||
const struct sched_class *prev_class,
|
||||
int oldprio)
|
||||
{
|
||||
if (prev_class != p->sched_class) {
|
||||
if (prev_class->switched_from)
|
||||
prev_class->switched_from(rq, p);
|
||||
|
||||
p->sched_class->switched_to(rq, p);
|
||||
} else if (oldprio != p->prio || dl_task(p))
|
||||
p->sched_class->prio_changed(rq, p, oldprio);
|
||||
}
|
||||
|
||||
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
struct task_struct *donor = rq->donor;
|
||||
@@ -2362,7 +2334,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
|
||||
}
|
||||
|
||||
static void
|
||||
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
|
||||
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
|
||||
|
||||
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
@@ -2377,10 +2349,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
|
||||
if (p->cpus_ptr != &p->cpus_mask)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Violates locking rules! See comment in __do_set_cpus_allowed().
|
||||
*/
|
||||
__do_set_cpus_allowed(p, &ac);
|
||||
scoped_guard (task_rq_lock, p)
|
||||
do_set_cpus_allowed(p, &ac);
|
||||
}
|
||||
|
||||
void ___migrate_enable(void)
|
||||
@@ -2613,7 +2583,8 @@ static int migration_cpu_stop(void *data)
|
||||
*/
|
||||
WARN_ON_ONCE(!pending->stop_pending);
|
||||
preempt_disable();
|
||||
task_rq_unlock(rq, p, &rf);
|
||||
rq_unlock(rq, &rf);
|
||||
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
|
||||
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
|
||||
&pending->arg, &pending->stop_work);
|
||||
preempt_enable();
|
||||
@@ -2622,7 +2593,8 @@ static int migration_cpu_stop(void *data)
|
||||
out:
|
||||
if (pending)
|
||||
pending->stop_pending = false;
|
||||
task_rq_unlock(rq, p, &rf);
|
||||
rq_unlock(rq, &rf);
|
||||
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
|
||||
|
||||
if (complete)
|
||||
complete_all(&pending->done);
|
||||
@@ -2693,56 +2665,19 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
|
||||
}
|
||||
|
||||
static void
|
||||
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
|
||||
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
|
||||
{
|
||||
struct rq *rq = task_rq(p);
|
||||
bool queued, running;
|
||||
|
||||
/*
|
||||
* This here violates the locking rules for affinity, since we're only
|
||||
* supposed to change these variables while holding both rq->lock and
|
||||
* p->pi_lock.
|
||||
*
|
||||
* HOWEVER, it magically works, because ttwu() is the only code that
|
||||
* accesses these variables under p->pi_lock and only does so after
|
||||
* smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
|
||||
* before finish_task().
|
||||
*
|
||||
* XXX do further audits, this smells like something putrid.
|
||||
*/
|
||||
if (ctx->flags & SCA_MIGRATE_DISABLE)
|
||||
WARN_ON_ONCE(!p->on_cpu);
|
||||
else
|
||||
lockdep_assert_held(&p->pi_lock);
|
||||
|
||||
queued = task_on_rq_queued(p);
|
||||
running = task_current_donor(rq, p);
|
||||
|
||||
if (queued) {
|
||||
/*
|
||||
* Because __kthread_bind() calls this on blocked tasks without
|
||||
* holding rq->lock.
|
||||
*/
|
||||
lockdep_assert_rq_held(rq);
|
||||
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
|
||||
scoped_guard (sched_change, p, DEQUEUE_SAVE) {
|
||||
p->sched_class->set_cpus_allowed(p, ctx);
|
||||
mm_set_cpus_allowed(p->mm, ctx->new_mask);
|
||||
}
|
||||
if (running)
|
||||
put_prev_task(rq, p);
|
||||
|
||||
p->sched_class->set_cpus_allowed(p, ctx);
|
||||
mm_set_cpus_allowed(p->mm, ctx->new_mask);
|
||||
|
||||
if (queued)
|
||||
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
|
||||
if (running)
|
||||
set_next_task(rq, p);
|
||||
}
|
||||
|
||||
/*
|
||||
* Used for kthread_bind() and select_fallback_rq(), in both cases the user
|
||||
* affinity (if any) should be destroyed too.
|
||||
*/
|
||||
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
|
||||
void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
|
||||
{
|
||||
struct affinity_context ac = {
|
||||
.new_mask = new_mask,
|
||||
@@ -2754,7 +2689,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
__do_set_cpus_allowed(p, &ac);
|
||||
scoped_guard (__task_rq_lock, p)
|
||||
do_set_cpus_allowed(p, &ac);
|
||||
|
||||
/*
|
||||
* Because this is called with p->pi_lock held, it is not possible
|
||||
@@ -2792,7 +2728,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
|
||||
* Use pi_lock to protect content of user_cpus_ptr
|
||||
*
|
||||
* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
|
||||
* do_set_cpus_allowed().
|
||||
* set_cpus_allowed_force().
|
||||
*/
|
||||
raw_spin_lock_irqsave(&src->pi_lock, flags);
|
||||
if (src->user_cpus_ptr) {
|
||||
@@ -3064,8 +3000,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
|
||||
unsigned int dest_cpu;
|
||||
int ret = 0;
|
||||
|
||||
update_rq_clock(rq);
|
||||
|
||||
if (kthread || is_migration_disabled(p)) {
|
||||
/*
|
||||
* Kernel threads are allowed on online && !active CPUs,
|
||||
@@ -3120,7 +3054,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
|
||||
goto out;
|
||||
}
|
||||
|
||||
__do_set_cpus_allowed(p, ctx);
|
||||
do_set_cpus_allowed(p, ctx);
|
||||
|
||||
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
|
||||
|
||||
@@ -3529,13 +3463,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
|
||||
}
|
||||
fallthrough;
|
||||
case possible:
|
||||
/*
|
||||
* XXX When called from select_task_rq() we only
|
||||
* hold p->pi_lock and again violate locking order.
|
||||
*
|
||||
* More yuck to audit.
|
||||
*/
|
||||
do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
|
||||
set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
|
||||
state = fail;
|
||||
break;
|
||||
case fail:
|
||||
@@ -3777,7 +3705,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
|
||||
ttwu_do_wakeup(p);
|
||||
ret = 1;
|
||||
}
|
||||
__task_rq_unlock(rq, &rf);
|
||||
__task_rq_unlock(rq, p, &rf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -4231,7 +4159,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
||||
* __schedule(). See the comment for smp_mb__after_spinlock().
|
||||
*
|
||||
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
|
||||
* schedule()'s deactivate_task() has 'happened' and p will no longer
|
||||
* schedule()'s block_task() has 'happened' and p will no longer
|
||||
* care about it's own p->state. See the comment in __schedule().
|
||||
*/
|
||||
smp_acquire__after_ctrl_dep();
|
||||
@@ -4370,7 +4298,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
|
||||
ret = func(p, arg);
|
||||
|
||||
if (rq)
|
||||
rq_unlock(rq, &rf);
|
||||
__task_rq_unlock(rq, p, &rf);
|
||||
|
||||
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
|
||||
return ret;
|
||||
@@ -5692,7 +5620,7 @@ static void sched_tick_remote(struct work_struct *work)
|
||||
* reasonable amount of time.
|
||||
*/
|
||||
u64 delta = rq_clock_task(rq) - curr->se.exec_start;
|
||||
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
|
||||
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30);
|
||||
}
|
||||
curr->sched_class->task_tick(rq, curr, 0);
|
||||
|
||||
@@ -5916,19 +5844,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
|
||||
const struct sched_class *start_class = prev->sched_class;
|
||||
const struct sched_class *class;
|
||||
|
||||
#ifdef CONFIG_SCHED_CLASS_EXT
|
||||
/*
|
||||
* SCX requires a balance() call before every pick_task() including when
|
||||
* waking up from SCHED_IDLE. If @start_class is below SCX, start from
|
||||
* SCX instead. Also, set a flag to detect missing balance() call.
|
||||
*/
|
||||
if (scx_enabled()) {
|
||||
rq->scx.flags |= SCX_RQ_BAL_PENDING;
|
||||
if (sched_class_above(&ext_sched_class, start_class))
|
||||
start_class = &ext_sched_class;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We must do the balancing pass before put_prev_task(), such
|
||||
* that when we release the rq->lock the task is in the same
|
||||
@@ -5972,7 +5887,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
|
||||
/* Assume the next prioritized class is idle_sched_class */
|
||||
if (!p) {
|
||||
p = pick_task_idle(rq);
|
||||
p = pick_task_idle(rq, rf);
|
||||
put_prev_set_next_task(rq, prev, p);
|
||||
}
|
||||
|
||||
@@ -5984,11 +5899,15 @@ restart:
|
||||
|
||||
for_each_active_class(class) {
|
||||
if (class->pick_next_task) {
|
||||
p = class->pick_next_task(rq, prev);
|
||||
p = class->pick_next_task(rq, prev, rf);
|
||||
if (unlikely(p == RETRY_TASK))
|
||||
goto restart;
|
||||
if (p)
|
||||
return p;
|
||||
} else {
|
||||
p = class->pick_task(rq);
|
||||
p = class->pick_task(rq, rf);
|
||||
if (unlikely(p == RETRY_TASK))
|
||||
goto restart;
|
||||
if (p) {
|
||||
put_prev_set_next_task(rq, prev, p);
|
||||
return p;
|
||||
@@ -6018,7 +5937,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
|
||||
return a->core_cookie == b->core_cookie;
|
||||
}
|
||||
|
||||
static inline struct task_struct *pick_task(struct rq *rq)
|
||||
/*
|
||||
* Careful; this can return RETRY_TASK, it does not include the retry-loop
|
||||
* itself due to the whole SMT pick retry thing below.
|
||||
*/
|
||||
static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
|
||||
{
|
||||
const struct sched_class *class;
|
||||
struct task_struct *p;
|
||||
@@ -6026,7 +5949,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
|
||||
rq->dl_server = NULL;
|
||||
|
||||
for_each_active_class(class) {
|
||||
p = class->pick_task(rq);
|
||||
p = class->pick_task(rq, rf);
|
||||
if (p)
|
||||
return p;
|
||||
}
|
||||
@@ -6041,7 +5964,7 @@ static void queue_core_balance(struct rq *rq);
|
||||
static struct task_struct *
|
||||
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
{
|
||||
struct task_struct *next, *p, *max = NULL;
|
||||
struct task_struct *next, *p, *max;
|
||||
const struct cpumask *smt_mask;
|
||||
bool fi_before = false;
|
||||
bool core_clock_updated = (rq == rq->core);
|
||||
@@ -6126,7 +6049,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
* and there are no cookied tasks running on siblings.
|
||||
*/
|
||||
if (!need_sync) {
|
||||
next = pick_task(rq);
|
||||
restart_single:
|
||||
next = pick_task(rq, rf);
|
||||
if (unlikely(next == RETRY_TASK))
|
||||
goto restart_single;
|
||||
if (!next->core_cookie) {
|
||||
rq->core_pick = NULL;
|
||||
rq->core_dl_server = NULL;
|
||||
@@ -6146,6 +6072,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
*
|
||||
* Tie-break prio towards the current CPU
|
||||
*/
|
||||
restart_multi:
|
||||
max = NULL;
|
||||
for_each_cpu_wrap(i, smt_mask, cpu) {
|
||||
rq_i = cpu_rq(i);
|
||||
|
||||
@@ -6157,7 +6085,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
|
||||
update_rq_clock(rq_i);
|
||||
|
||||
rq_i->core_pick = p = pick_task(rq_i);
|
||||
p = pick_task(rq_i, rf);
|
||||
if (unlikely(p == RETRY_TASK))
|
||||
goto restart_multi;
|
||||
|
||||
rq_i->core_pick = p;
|
||||
rq_i->core_dl_server = rq_i->dl_server;
|
||||
|
||||
if (!max || prio_less(max, p, fi_before))
|
||||
@@ -6179,7 +6111,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
if (cookie)
|
||||
p = sched_core_find(rq_i, cookie);
|
||||
if (!p)
|
||||
p = idle_sched_class.pick_task(rq_i);
|
||||
p = idle_sched_class.pick_task(rq_i, rf);
|
||||
}
|
||||
|
||||
rq_i->core_pick = p;
|
||||
@@ -6812,6 +6744,7 @@ static void __sched notrace __schedule(int sched_mode)
|
||||
|
||||
local_irq_disable();
|
||||
rcu_note_context_switch(preempt);
|
||||
migrate_disable_switch(rq, prev);
|
||||
|
||||
/*
|
||||
* Make sure that signal_pending_state()->signal_pending() below
|
||||
@@ -6918,7 +6851,6 @@ keep_resched:
|
||||
*/
|
||||
++*switch_count;
|
||||
|
||||
migrate_disable_switch(rq, prev);
|
||||
psi_account_irqtime(rq, prev, next);
|
||||
psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
|
||||
prev->se.sched_delayed);
|
||||
@@ -7326,7 +7258,7 @@ void rt_mutex_post_schedule(void)
|
||||
*/
|
||||
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
|
||||
{
|
||||
int prio, oldprio, queued, running, queue_flag =
|
||||
int prio, oldprio, queue_flag =
|
||||
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
|
||||
const struct sched_class *prev_class, *next_class;
|
||||
struct rq_flags rf;
|
||||
@@ -7388,64 +7320,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
|
||||
prev_class = p->sched_class;
|
||||
next_class = __setscheduler_class(p->policy, prio);
|
||||
|
||||
if (prev_class != next_class && p->se.sched_delayed)
|
||||
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
|
||||
if (prev_class != next_class)
|
||||
queue_flag |= DEQUEUE_CLASS;
|
||||
|
||||
queued = task_on_rq_queued(p);
|
||||
running = task_current_donor(rq, p);
|
||||
if (queued)
|
||||
dequeue_task(rq, p, queue_flag);
|
||||
if (running)
|
||||
put_prev_task(rq, p);
|
||||
|
||||
/*
|
||||
* Boosting condition are:
|
||||
* 1. -rt task is running and holds mutex A
|
||||
* --> -dl task blocks on mutex A
|
||||
*
|
||||
* 2. -dl task is running and holds mutex A
|
||||
* --> -dl task blocks on mutex A and could preempt the
|
||||
* running task
|
||||
*/
|
||||
if (dl_prio(prio)) {
|
||||
if (!dl_prio(p->normal_prio) ||
|
||||
(pi_task && dl_prio(pi_task->prio) &&
|
||||
dl_entity_preempt(&pi_task->dl, &p->dl))) {
|
||||
p->dl.pi_se = pi_task->dl.pi_se;
|
||||
queue_flag |= ENQUEUE_REPLENISH;
|
||||
scoped_guard (sched_change, p, queue_flag) {
|
||||
/*
|
||||
* Boosting condition are:
|
||||
* 1. -rt task is running and holds mutex A
|
||||
* --> -dl task blocks on mutex A
|
||||
*
|
||||
* 2. -dl task is running and holds mutex A
|
||||
* --> -dl task blocks on mutex A and could preempt the
|
||||
* running task
|
||||
*/
|
||||
if (dl_prio(prio)) {
|
||||
if (!dl_prio(p->normal_prio) ||
|
||||
(pi_task && dl_prio(pi_task->prio) &&
|
||||
dl_entity_preempt(&pi_task->dl, &p->dl))) {
|
||||
p->dl.pi_se = pi_task->dl.pi_se;
|
||||
scope->flags |= ENQUEUE_REPLENISH;
|
||||
} else {
|
||||
p->dl.pi_se = &p->dl;
|
||||
}
|
||||
} else if (rt_prio(prio)) {
|
||||
if (dl_prio(oldprio))
|
||||
p->dl.pi_se = &p->dl;
|
||||
if (oldprio < prio)
|
||||
scope->flags |= ENQUEUE_HEAD;
|
||||
} else {
|
||||
p->dl.pi_se = &p->dl;
|
||||
if (dl_prio(oldprio))
|
||||
p->dl.pi_se = &p->dl;
|
||||
if (rt_prio(oldprio))
|
||||
p->rt.timeout = 0;
|
||||
}
|
||||
} else if (rt_prio(prio)) {
|
||||
if (dl_prio(oldprio))
|
||||
p->dl.pi_se = &p->dl;
|
||||
if (oldprio < prio)
|
||||
queue_flag |= ENQUEUE_HEAD;
|
||||
} else {
|
||||
if (dl_prio(oldprio))
|
||||
p->dl.pi_se = &p->dl;
|
||||
if (rt_prio(oldprio))
|
||||
p->rt.timeout = 0;
|
||||
|
||||
p->sched_class = next_class;
|
||||
p->prio = prio;
|
||||
}
|
||||
|
||||
p->sched_class = next_class;
|
||||
p->prio = prio;
|
||||
|
||||
check_class_changing(rq, p, prev_class);
|
||||
|
||||
if (queued)
|
||||
enqueue_task(rq, p, queue_flag);
|
||||
if (running)
|
||||
set_next_task(rq, p);
|
||||
|
||||
check_class_changed(rq, p, prev_class, oldprio);
|
||||
out_unlock:
|
||||
/* Avoid rq from going away on us: */
|
||||
preempt_disable();
|
||||
|
||||
rq_unpin_lock(rq, &rf);
|
||||
__balance_callbacks(rq);
|
||||
raw_spin_rq_unlock(rq);
|
||||
rq_repin_lock(rq, &rf);
|
||||
__task_rq_unlock(rq, p, &rf);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
@@ -8084,26 +8003,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
|
||||
*/
|
||||
void sched_setnuma(struct task_struct *p, int nid)
|
||||
{
|
||||
bool queued, running;
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
|
||||
rq = task_rq_lock(p, &rf);
|
||||
queued = task_on_rq_queued(p);
|
||||
running = task_current_donor(rq, p);
|
||||
|
||||
if (queued)
|
||||
dequeue_task(rq, p, DEQUEUE_SAVE);
|
||||
if (running)
|
||||
put_prev_task(rq, p);
|
||||
|
||||
p->numa_preferred_nid = nid;
|
||||
|
||||
if (queued)
|
||||
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
|
||||
if (running)
|
||||
set_next_task(rq, p);
|
||||
task_rq_unlock(rq, p, &rf);
|
||||
guard(task_rq_lock)(p);
|
||||
scoped_guard (sched_change, p, DEQUEUE_SAVE)
|
||||
p->numa_preferred_nid = nid;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
@@ -8141,18 +8043,15 @@ static int __balance_push_cpu_stop(void *arg)
|
||||
struct rq_flags rf;
|
||||
int cpu;
|
||||
|
||||
raw_spin_lock_irq(&p->pi_lock);
|
||||
rq_lock(rq, &rf);
|
||||
|
||||
update_rq_clock(rq);
|
||||
|
||||
if (task_rq(p) == rq && task_on_rq_queued(p)) {
|
||||
scoped_guard (raw_spinlock_irq, &p->pi_lock) {
|
||||
cpu = select_fallback_rq(rq->cpu, p);
|
||||
rq = __migrate_task(rq, &rf, p, cpu);
|
||||
}
|
||||
|
||||
rq_unlock(rq, &rf);
|
||||
raw_spin_unlock_irq(&p->pi_lock);
|
||||
rq_lock(rq, &rf);
|
||||
update_rq_clock(rq);
|
||||
if (task_rq(p) == rq && task_on_rq_queued(p))
|
||||
rq = __migrate_task(rq, &rf, p, cpu);
|
||||
rq_unlock(rq, &rf);
|
||||
}
|
||||
|
||||
put_task_struct(p);
|
||||
|
||||
@@ -8591,6 +8490,8 @@ void __init sched_init_smp(void)
|
||||
{
|
||||
sched_init_numa(NUMA_NO_NODE);
|
||||
|
||||
prandom_init_once(&sched_rnd_state);
|
||||
|
||||
/*
|
||||
* There's no userspace yet to cause hotplug operations; hence all the
|
||||
* CPU masks are stable and all blatant races in the below code cannot
|
||||
@@ -9207,38 +9108,23 @@ static void sched_change_group(struct task_struct *tsk)
|
||||
*/
|
||||
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
|
||||
{
|
||||
int queued, running, queue_flags =
|
||||
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
|
||||
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
|
||||
bool resched = false;
|
||||
struct rq *rq;
|
||||
|
||||
CLASS(task_rq_lock, rq_guard)(tsk);
|
||||
rq = rq_guard.rq;
|
||||
|
||||
update_rq_clock(rq);
|
||||
|
||||
running = task_current_donor(rq, tsk);
|
||||
queued = task_on_rq_queued(tsk);
|
||||
|
||||
if (queued)
|
||||
dequeue_task(rq, tsk, queue_flags);
|
||||
if (running)
|
||||
put_prev_task(rq, tsk);
|
||||
|
||||
sched_change_group(tsk);
|
||||
if (!for_autogroup)
|
||||
scx_cgroup_move_task(tsk);
|
||||
|
||||
if (queued)
|
||||
enqueue_task(rq, tsk, queue_flags);
|
||||
if (running) {
|
||||
set_next_task(rq, tsk);
|
||||
/*
|
||||
* After changing group, the running task may have joined a
|
||||
* throttled one but it's still the running task. Trigger a
|
||||
* resched to make sure that task can still run.
|
||||
*/
|
||||
resched_curr(rq);
|
||||
scoped_guard (sched_change, tsk, queue_flags) {
|
||||
sched_change_group(tsk);
|
||||
if (!for_autogroup)
|
||||
scx_cgroup_move_task(tsk);
|
||||
if (scope->running)
|
||||
resched = true;
|
||||
}
|
||||
|
||||
if (resched)
|
||||
resched_curr(rq);
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state *
|
||||
@@ -10894,37 +10780,75 @@ void sched_mm_cid_fork(struct task_struct *t)
}
#endif /* CONFIG_SCHED_MM_CID */

#ifdef CONFIG_SCHED_CLASS_EXT
void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
struct sched_enq_and_set_ctx *ctx)
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);

struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);

/*
* Must exclusively use matched flags since this is both dequeue and
* enqueue.
*/
WARN_ON_ONCE(flags & 0xFFFF0000);

lockdep_assert_rq_held(rq);

if (!(flags & DEQUEUE_NOCLOCK)) {
update_rq_clock(rq);
flags |= DEQUEUE_NOCLOCK;
}

if (flags & DEQUEUE_CLASS) {
if (p->sched_class->switching_from)
p->sched_class->switching_from(rq, p);
}

*ctx = (struct sched_change_ctx){
.p = p,
.flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current_donor(rq, p),
};

if (!(flags & DEQUEUE_CLASS)) {
if (p->sched_class->get_prio)
ctx->prio = p->sched_class->get_prio(rq, p);
else
ctx->prio = p->prio;
}

if (ctx->queued)
dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);

if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
p->sched_class->switched_from(rq, p);

return ctx;
}

void sched_change_end(struct sched_change_ctx *ctx)
{
struct task_struct *p = ctx->p;
struct rq *rq = task_rq(p);

lockdep_assert_rq_held(rq);

*ctx = (struct sched_enq_and_set_ctx){
.p = p,
.queue_flags = queue_flags,
.queued = task_on_rq_queued(p),
.running = task_current(rq, p),
};

update_rq_clock(rq);
if (ctx->queued)
dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
if (ctx->running)
put_prev_task(rq, p);
}

void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
{
struct rq *rq = task_rq(ctx->p);

lockdep_assert_rq_held(rq);
if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);

if (ctx->queued)
enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
enqueue_task(rq, p, ctx->flags);
if (ctx->running)
set_next_task(rq, ctx->p);
set_next_task(rq, p);

if (ctx->flags & ENQUEUE_CLASS) {
if (p->sched_class->switched_to)
p->sched_class->switched_to(rq, p);
} else {
p->sched_class->prio_changed(rq, p, ctx->prio);
}
}
#endif /* CONFIG_SCHED_CLASS_EXT */
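sched_change_begin()/sched_change_end() are the pair backing the scoped_guard(sched_change, ...) form used in rt_mutex_setprio(), sched_setnuma(), sched_move_task() and __do_set_cpus_allowed() earlier in this diff: dequeue/put_prev happens on entry, enqueue/set_next on exit, with class-switch and priority-change notifications handled in one place. A caller-side sketch, assuming rq->lock is already held as the lockdep assertion requires and that the guard exposes the context as 'scope' (as the scope->flags and scope->running uses above suggest):

/* Illustrative caller of the sched_change guard. */
static void set_numa_preference(struct rq *rq, struct task_struct *p, int nid)
{
        lockdep_assert_rq_held(rq);

        scoped_guard (sched_change, p, DEQUEUE_SAVE) {
                /* The task is dequeued/put here if needed ... */
                p->numa_preferred_nid = nid;
                /* ... and re-enqueued/set_next_task()'ed when the scope closes. */
        }
}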
@@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
* cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target CPU
* @online: the online state of the deadline runqueue
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpudl_clear(struct cpudl *cp, int cpu)
void cpudl_clear(struct cpudl *cp, int cpu, bool online)
{
int old_idx, new_cpu;
unsigned long flags;

@@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu)
if (old_idx == IDX_INVALID) {
/*
* Nothing to remove if old_idx was invalid.
* This could happen if a rq_offline_dl is
* This could happen if rq_online_dl or rq_offline_dl is
* called for a CPU without -dl tasks running.
*/
} else {

@@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu)
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
cpudl_heapify(cp, old_idx);

cpumask_set_cpu(cpu, cp->free_cpus);
}
if (likely(online))
__cpumask_set_cpu(cpu, cp->free_cpus);
else
__cpumask_clear_cpu(cpu, cp->free_cpus);

raw_spin_unlock_irqrestore(&cp->lock, flags);
}

@@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx;
cpudl_heapify_up(cp, new_idx);
cpumask_clear_cpu(cpu, cp->free_cpus);
__cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
cp->elements[old_idx].dl = dl;
cpudl_heapify(cp, old_idx);

@@ -237,26 +241,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
raw_spin_unlock_irqrestore(&cp->lock, flags);
}

/*
* cpudl_set_freecpu - Set the cpudl.free_cpus
* @cp: the cpudl max-heap context
* @cpu: rd attached CPU
*/
void cpudl_set_freecpu(struct cpudl *cp, int cpu)
{
cpumask_set_cpu(cpu, cp->free_cpus);
}

/*
* cpudl_clear_freecpu - Clear the cpudl.free_cpus
* @cp: the cpudl max-heap context
* @cpu: rd attached CPU
*/
void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
{
cpumask_clear_cpu(cpu, cp->free_cpus);
}

/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context

@@ -19,8 +19,6 @@ struct cpudl {

int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
void cpudl_clear(struct cpudl *cp, int cpu, bool online);
int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
@@ -125,20 +125,11 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
int cpus;

RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");

if (cpumask_subset(rd->span, cpu_active_mask))
return cpumask_weight(rd->span);

cpus = 0;

for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;

return cpus;
return cpumask_weight_and(rd->span, cpu_active_mask);
}

static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
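cpumask_weight_and() is an existing cpumask helper that returns the number of bits set in the AND of two masks, so both the cpumask_subset() fast path and the manual for_each_cpu_and() loop in the old dl_bw_cpus() collapse into one call. A small sketch of the equivalence:

/* Counting CPUs present in both 'span' and 'active', two equivalent ways. */
static unsigned int count_span_active(const struct cpumask *span,
                                      const struct cpumask *active)
{
        unsigned int cpus = 0;
        int cpu;

        for_each_cpu_and(cpu, span, active)
                cpus++;

        WARN_ON_ONCE(cpus != cpumask_weight_and(span, active));
        return cpus;
}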
@@ -405,7 +396,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
|
||||
* up, and checks if the task is still in the "ACTIVE non contending"
|
||||
* state or not (in the second case, it updates running_bw).
|
||||
*/
|
||||
static void task_non_contending(struct sched_dl_entity *dl_se)
|
||||
static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
|
||||
{
|
||||
struct hrtimer *timer = &dl_se->inactive_timer;
|
||||
struct rq *rq = rq_of_dl_se(dl_se);
|
||||
@@ -444,10 +435,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
|
||||
} else {
|
||||
struct task_struct *p = dl_task_of(dl_se);
|
||||
|
||||
if (dl_task(p))
|
||||
if (dl_task)
|
||||
sub_running_bw(dl_se, dl_rq);
|
||||
|
||||
if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
|
||||
if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
|
||||
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
|
||||
|
||||
if (READ_ONCE(p->__state) == TASK_DEAD)
|
||||
@@ -1166,8 +1157,17 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
|
||||
sched_clock_tick();
|
||||
update_rq_clock(rq);
|
||||
|
||||
if (!dl_se->dl_runtime)
|
||||
/*
|
||||
* Make sure current has propagated its pending runtime into
|
||||
* any relevant server through calling dl_server_update() and
|
||||
* friends.
|
||||
*/
|
||||
rq->donor->sched_class->update_curr(rq);
|
||||
|
||||
if (dl_se->dl_defer_idle) {
|
||||
dl_server_stop(dl_se);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
if (dl_se->dl_defer_armed) {
|
||||
/*
|
||||
@@ -1416,10 +1416,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta
|
||||
}
|
||||
|
||||
static inline void
|
||||
update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
|
||||
int flags);
|
||||
update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags);
|
||||
|
||||
static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
|
||||
{
|
||||
bool idle = rq->curr == rq->idle;
|
||||
s64 scaled_delta_exec;
|
||||
|
||||
if (unlikely(delta_exec <= 0)) {
|
||||
@@ -1440,6 +1441,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
|
||||
|
||||
dl_se->runtime -= scaled_delta_exec;
|
||||
|
||||
if (dl_se->dl_defer_idle && !idle)
|
||||
dl_se->dl_defer_idle = 0;
|
||||
|
||||
/*
|
||||
* The fair server can consume its runtime while throttled (not queued/
|
||||
* running as regular CFS).
|
||||
@@ -1449,6 +1453,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
|
||||
* starting a new period, pushing the activation.
|
||||
*/
|
||||
if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) {
|
||||
/*
|
||||
* Non-servers would never get time accounted while throttled.
|
||||
*/
|
||||
WARN_ON_ONCE(!dl_server(dl_se));
|
||||
|
||||
/*
|
||||
* While the server is marked idle, do not push out the
|
||||
* activation further, instead wait for the period timer
|
||||
* to lapse and stop the server.
|
||||
*/
|
||||
if (dl_se->dl_defer_idle && idle) {
|
||||
/*
|
||||
* The timer is at the zero-laxity point, this means
|
||||
* dl_server_stop() / dl_server_start() can happen
|
||||
* while now < deadline. This means update_dl_entity()
|
||||
* will not replenish. Additionally start_dl_timer()
|
||||
* will be set for 'deadline - runtime'. Negative
|
||||
* runtime will not do.
|
||||
*/
|
||||
dl_se->runtime = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the server was previously activated - the starving condition
|
||||
* took place, it this point it went away because the fair scheduler
|
||||
@@ -1461,6 +1488,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
|
||||
|
||||
replenish_dl_new_period(dl_se, dl_se->rq);
|
||||
|
||||
if (idle)
|
||||
dl_se->dl_defer_idle = 1;
|
||||
|
||||
/*
|
||||
* Not being able to start the timer seems problematic. If it could not
|
||||
* be started for whatever reason, we need to "unthrottle" the DL server
|
||||
@@ -1543,38 +1573,213 @@ throttle:
|
||||
* as time available for the fair server, avoiding a penalty for the
|
||||
* rt scheduler that did not consumed that time.
|
||||
*/
|
||||
void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
|
||||
void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
|
||||
{
|
||||
s64 delta_exec;
|
||||
|
||||
if (!rq->fair_server.dl_defer)
|
||||
return;
|
||||
|
||||
/* no need to discount more */
|
||||
if (rq->fair_server.runtime < 0)
|
||||
return;
|
||||
|
||||
delta_exec = rq_clock_task(rq) - p->se.exec_start;
|
||||
if (delta_exec < 0)
|
||||
return;
|
||||
|
||||
rq->fair_server.runtime -= delta_exec;
|
||||
|
||||
if (rq->fair_server.runtime < 0) {
|
||||
rq->fair_server.dl_defer_running = 0;
|
||||
rq->fair_server.runtime = 0;
|
||||
}
|
||||
|
||||
p->se.exec_start = rq_clock_task(rq);
|
||||
if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer)
|
||||
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
|
||||
}
|
||||
|
||||
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
|
||||
{
|
||||
/* 0 runtime = fair server disabled */
|
||||
if (dl_se->dl_runtime)
|
||||
if (dl_se->dl_server_active && dl_se->dl_runtime)
|
||||
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
|
||||
}
|
||||
|
||||
/*
|
||||
* dl_server && dl_defer:
|
||||
*
|
||||
* 6
|
||||
* +--------------------+
|
||||
* v |
|
||||
* +-------------+ 4 +-----------+ 5 +------------------+
|
||||
* +-> | A:init | <--- | D:running | -----> | E:replenish-wait |
|
||||
* | +-------------+ +-----------+ +------------------+
|
||||
* | | | 1 ^ ^ |
|
||||
* | | 1 +----------+ | 3 |
|
||||
* | v | |
|
||||
* | +--------------------------------+ 2 |
|
||||
* | | | ----+ |
|
||||
* | 8 | B:zero_laxity-wait | | |
|
||||
* | | | <---+ |
|
||||
* | +--------------------------------+ |
|
||||
* | | ^ ^ 2 |
|
||||
* | | 7 | 2 +--------------------+
|
||||
* | v |
|
||||
* | +-------------+ |
|
||||
* +-- | C:idle-wait | -+
|
||||
* +-------------+
|
||||
* ^ 7 |
|
||||
* +---------+
|
||||
*
|
||||
*
|
||||
* [A] - init
|
||||
* dl_server_active = 0
|
||||
* dl_throttled = 0
|
||||
* dl_defer_armed = 0
|
||||
* dl_defer_running = 0/1
|
||||
* dl_defer_idle = 0
|
||||
*
|
||||
* [B] - zero_laxity-wait
|
||||
* dl_server_active = 1
|
||||
* dl_throttled = 1
|
||||
* dl_defer_armed = 1
|
||||
* dl_defer_running = 0
|
||||
* dl_defer_idle = 0
|
||||
*
|
||||
* [C] - idle-wait
|
||||
* dl_server_active = 1
|
||||
* dl_throttled = 1
|
||||
* dl_defer_armed = 1
|
||||
* dl_defer_running = 0
|
||||
* dl_defer_idle = 1
|
||||
*
|
||||
* [D] - running
|
||||
* dl_server_active = 1
|
||||
* dl_throttled = 0
|
||||
* dl_defer_armed = 0
|
||||
* dl_defer_running = 1
|
||||
* dl_defer_idle = 0
|
||||
*
|
||||
* [E] - replenish-wait
|
||||
* dl_server_active = 1
|
||||
* dl_throttled = 1
|
||||
* dl_defer_armed = 0
|
||||
* dl_defer_running = 1
|
||||
* dl_defer_idle = 0
|
||||
*
|
||||
*
|
||||
* [1] A->B, A->D
|
||||
* dl_server_start()
|
||||
* dl_server_active = 1;
|
||||
* enqueue_dl_entity()
|
||||
* update_dl_entity(WAKEUP)
|
||||
* if (!dl_defer_running)
|
||||
* dl_defer_armed = 1;
|
||||
* dl_throttled = 1;
|
||||
* if (dl_throttled && start_dl_timer())
|
||||
* return; // [B]
|
||||
* __enqueue_dl_entity();
|
||||
* // [D]
|
||||
*
|
||||
* // deplete server runtime from client-class
|
||||
* [2] B->B, C->B, E->B
|
||||
* dl_server_update()
|
||||
* update_curr_dl_se() // idle = false
|
||||
* if (dl_defer_idle)
|
||||
* dl_defer_idle = 0;
|
||||
* if (dl_defer && dl_throttled && dl_runtime_exceeded())
|
||||
* dl_defer_running = 0;
|
||||
* hrtimer_try_to_cancel(); // stop timer
|
||||
* replenish_dl_new_period()
|
||||
* // fwd period
|
||||
* dl_throttled = 1;
|
||||
* dl_defer_armed = 1;
|
||||
* start_dl_timer(); // restart timer
|
||||
* // [B]
|
||||
*
|
||||
* // timer actually fires means we have runtime
|
||||
* [3] B->D
|
||||
* dl_server_timer()
|
||||
* if (dl_defer_armed)
|
||||
* dl_defer_running = 1;
|
||||
* enqueue_dl_entity(REPLENISH)
|
||||
* replenish_dl_entity()
|
||||
* // fwd period
|
||||
* if (dl_throttled)
|
||||
* dl_throttled = 0;
|
||||
* if (dl_defer_armed)
|
||||
* dl_defer_armed = 0;
|
||||
* __enqueue_dl_entity();
|
||||
* // [D]
|
||||
*
|
||||
* // schedule server
|
||||
* [4] D->A
|
||||
* pick_task_dl()
|
||||
* p = server_pick_task();
|
||||
* if (!p)
|
||||
* dl_server_stop()
|
||||
* dequeue_dl_entity();
|
||||
* hrtimer_try_to_cancel();
|
||||
* dl_defer_armed = 0;
|
||||
* dl_throttled = 0;
|
||||
* dl_server_active = 0;
|
||||
* // [A]
|
||||
* return p;
|
||||
*
|
||||
* // server running
|
||||
* [5] D->E
|
||||
* update_curr_dl_se()
|
||||
* if (dl_runtime_exceeded())
|
||||
* dl_throttled = 1;
|
||||
* dequeue_dl_entity();
|
||||
* start_dl_timer();
|
||||
* // [E]
|
||||
*
|
||||
* // server replenished
|
||||
* [6] E->D
|
||||
* dl_server_timer()
|
||||
* enqueue_dl_entity(REPLENISH)
|
||||
* replenish_dl_entity()
|
||||
* fwd-period
|
||||
* if (dl_throttled)
|
||||
* dl_throttled = 0;
|
||||
* __enqueue_dl_entity();
|
||||
* // [D]
|
||||
*
|
||||
* // deplete server runtime from idle
|
||||
* [7] B->C, C->C
|
||||
* dl_server_update_idle()
|
||||
* update_curr_dl_se() // idle = true
|
||||
* if (dl_defer && dl_throttled && dl_runtime_exceeded())
|
||||
* if (dl_defer_idle)
|
||||
* return;
|
||||
* dl_defer_running = 0;
|
||||
* hrtimer_try_to_cancel();
|
||||
* replenish_dl_new_period()
|
||||
* // fwd period
|
||||
* dl_throttled = 1;
|
||||
* dl_defer_armed = 1;
|
||||
* dl_defer_idle = 1;
|
||||
* start_dl_timer(); // restart timer
|
||||
* // [C]
|
||||
*
|
||||
* // stop idle server
|
||||
* [8] C->A
|
||||
* dl_server_timer()
|
||||
* if (dl_defer_idle)
|
||||
* dl_server_stop();
|
||||
* // [A]
|
||||
*
|
||||
*
|
||||
* digraph dl_server {
|
||||
* "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"]
|
||||
* "A:init" -> "D:running" [label="1:dl_server_start"]
|
||||
* "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
|
||||
* "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
|
||||
* "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"]
|
||||
* "C:idle-wait" -> "A:init" [label="8:dl_server_timer"]
|
||||
* "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
|
||||
* "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
|
||||
* "D:running" -> "A:init" [label="4:pick_task_dl"]
|
||||
* "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"]
|
||||
* "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
|
||||
* "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"]
|
||||
* }
|
||||
*
|
||||
*
|
||||
* Notes:
|
||||
*
|
||||
 * - When there are fair tasks running, the most likely loop is [2]->[2]:
 *   the dl_server never actually runs and the timer never fires.
 *
 * - When there is actual fair starvation, the timer fires and starts the
 *   dl_server. This will then throttle and replenish like a normal DL
 *   task. Notably, it will not 'defer' again.
 *
 * - When idle, it will push the activation forward once and then wait
 *   for the timer to hit or a non-idle update to restart things.
 */
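
/*
 * Editor's example -- a hypothetical walk through the diagram above; the
 * 3ms/10ms parameters are illustrative only and not taken from this patch:
 *
 *   t=0    dl_server_start()               A->B  deferred; timer aimed at the
 *                                                zero-laxity point (t=7ms for
 *                                                3ms runtime in a 10ms period)
 *   ...    fair tasks keep making progress B->B  dl_server_update() forwards
 *                                                the period and re-arms the timer
 *   ...    fair tasks get starved, so the  B->D  the pending timer fires and the
 *          timer is no longer pushed out         server runs fair tasks as DL
 *   +3ms   server runtime exhausted        D->E  throttled until replenishment
 *   next   dl_server_timer() replenishes   E->D  keeps serving while starvation
 *   period                                       persists
 *   later  no fair task left to pick       D->A  pick_task_dl() -> dl_server_stop()
 */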
void dl_server_start(struct sched_dl_entity *dl_se)
|
||||
{
|
||||
struct rq *rq = dl_se->rq;
|
||||
@@ -1582,6 +1787,11 @@ void dl_server_start(struct sched_dl_entity *dl_se)
|
||||
if (!dl_server(dl_se) || dl_se->dl_server_active)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Update the current task to 'now'.
|
||||
*/
|
||||
rq->donor->sched_class->update_curr(rq);
|
||||
|
||||
if (WARN_ON_ONCE(!cpu_online(cpu_of(rq))))
|
||||
return;
|
||||
|
||||
@@ -1600,6 +1810,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
|
||||
hrtimer_try_to_cancel(&dl_se->dl_timer);
|
||||
dl_se->dl_defer_armed = 0;
|
||||
dl_se->dl_throttled = 0;
|
||||
dl_se->dl_defer_idle = 0;
|
||||
dl_se->dl_server_active = 0;
|
||||
}
|
||||
|
||||
@@ -1811,7 +2022,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
|
||||
if (!dl_rq->dl_nr_running) {
|
||||
dl_rq->earliest_dl.curr = 0;
|
||||
dl_rq->earliest_dl.next = 0;
|
||||
cpudl_clear(&rq->rd->cpudl, rq->cpu);
|
||||
cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online);
|
||||
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
|
||||
} else {
|
||||
struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
|
||||
@@ -2048,7 +2259,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
|
||||
* or "inactive")
|
||||
*/
|
||||
if (flags & DEQUEUE_SLEEP)
|
||||
task_non_contending(dl_se);
|
||||
task_non_contending(dl_se, true);
|
||||
}
|
||||
|
||||
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
|
||||
@@ -2143,7 +2354,7 @@ static void yield_task_dl(struct rq *rq)
|
||||
* it and the bandwidth timer will wake it up and will give it
|
||||
* new scheduling parameters (thanks to dl_yielded=1).
|
||||
*/
|
||||
rq->curr->dl.dl_yielded = 1;
|
||||
rq->donor->dl.dl_yielded = 1;
|
||||
|
||||
update_rq_clock(rq);
|
||||
update_curr_dl(rq);
|
||||
@@ -2173,7 +2384,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
|
||||
struct rq *rq;
|
||||
|
||||
if (!(flags & WF_TTWU))
|
||||
goto out;
|
||||
return cpu;
|
||||
|
||||
rq = cpu_rq(cpu);
|
||||
|
||||
@@ -2211,7 +2422,6 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
out:
|
||||
return cpu;
|
||||
}
|
||||
|
||||
@@ -2355,7 +2565,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
|
||||
* __pick_next_task_dl - Helper to pick the next -deadline task to run.
|
||||
* @rq: The runqueue to pick the next task from.
|
||||
*/
|
||||
static struct task_struct *__pick_task_dl(struct rq *rq)
|
||||
static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf)
|
||||
{
|
||||
struct sched_dl_entity *dl_se;
|
||||
struct dl_rq *dl_rq = &rq->dl;
|
||||
@@ -2369,7 +2579,7 @@ again:
|
||||
WARN_ON_ONCE(!dl_se);
|
||||
|
||||
if (dl_server(dl_se)) {
|
||||
p = dl_se->server_pick_task(dl_se);
|
||||
p = dl_se->server_pick_task(dl_se, rf);
|
||||
if (!p) {
|
||||
dl_server_stop(dl_se);
|
||||
goto again;
|
||||
@@ -2382,9 +2592,9 @@ again:
|
||||
return p;
|
||||
}
|
||||
|
||||
static struct task_struct *pick_task_dl(struct rq *rq)
|
||||
static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf)
|
||||
{
|
||||
return __pick_task_dl(rq);
|
||||
return __pick_task_dl(rq, rf);
|
||||
}
|
||||
|
||||
static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
|
||||
@@ -2883,9 +3093,10 @@ static void rq_online_dl(struct rq *rq)
|
||||
if (rq->dl.overloaded)
|
||||
dl_set_overload(rq);
|
||||
|
||||
cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
|
||||
if (rq->dl.dl_nr_running > 0)
|
||||
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
|
||||
else
|
||||
cpudl_clear(&rq->rd->cpudl, rq->cpu, true);
|
||||
}
|
||||
|
||||
/* Assumes rq->lock is held */
|
||||
@@ -2894,8 +3105,7 @@ static void rq_offline_dl(struct rq *rq)
|
||||
if (rq->dl.overloaded)
|
||||
dl_clear_overload(rq);
|
||||
|
||||
cpudl_clear(&rq->rd->cpudl, rq->cpu);
|
||||
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
|
||||
cpudl_clear(&rq->rd->cpudl, rq->cpu, false);
|
||||
}
|
||||
|
||||
void __init init_sched_dl_class(void)
|
||||
@@ -2973,7 +3183,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
|
||||
* will reset the task parameters.
|
||||
*/
|
||||
if (task_on_rq_queued(p) && p->dl.dl_runtime)
|
||||
task_non_contending(&p->dl);
|
||||
task_non_contending(&p->dl, false);
|
||||
|
||||
/*
|
||||
* In case a task is setscheduled out from SCHED_DEADLINE we need to
|
||||
@@ -3045,23 +3255,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
|
||||
}
|
||||
}
|
||||
|
||||
static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
return p->dl.deadline;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the scheduling parameters of a -deadline task changed,
|
||||
* a push or pull operation might be needed.
|
||||
*/
|
||||
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
|
||||
int oldprio)
|
||||
static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
|
||||
{
|
||||
if (!task_on_rq_queued(p))
|
||||
return;
|
||||
|
||||
/*
|
||||
* This might be too much, but unfortunately
|
||||
* we don't have the old deadline value, and
|
||||
* we can't argue if the task is increasing
|
||||
* or lowering its prio, so...
|
||||
*/
|
||||
if (!rq->dl.overloaded)
|
||||
if (p->dl.deadline == old_deadline)
|
||||
return;
|
||||
|
||||
if (dl_time_before(old_deadline, p->dl.deadline))
|
||||
deadline_queue_pull_task(rq);
|
||||
|
||||
if (task_current_donor(rq, p)) {
|
||||
@@ -3094,6 +3305,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
|
||||
|
||||
DEFINE_SCHED_CLASS(dl) = {
|
||||
|
||||
.queue_mask = 8,
|
||||
|
||||
.enqueue_task = enqueue_task_dl,
|
||||
.dequeue_task = dequeue_task_dl,
|
||||
.yield_task = yield_task_dl,
|
||||
@@ -3116,6 +3329,7 @@ DEFINE_SCHED_CLASS(dl) = {
|
||||
.task_tick = task_tick_dl,
|
||||
.task_fork = task_fork_dl,
|
||||
|
||||
.get_prio = get_prio_dl,
|
||||
.prio_changed = prio_changed_dl,
|
||||
.switched_from = switched_from_dl,
|
||||
.switched_to = switched_to_dl,
|
||||
|
||||
@@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
|
||||
|
||||
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
|
||||
s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
|
||||
struct sched_entity *last, *first, *root;
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
unsigned long flags;
|
||||
@@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
last = __pick_last_entity(cfs_rq);
|
||||
if (last)
|
||||
right_vruntime = last->vruntime;
|
||||
min_vruntime = cfs_rq->min_vruntime;
|
||||
zero_vruntime = cfs_rq->zero_vruntime;
|
||||
raw_spin_rq_unlock_irqrestore(rq, flags);
|
||||
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
|
||||
SPLIT_NS(left_deadline));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
|
||||
SPLIT_NS(left_vruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
|
||||
SPLIT_NS(min_vruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
|
||||
SPLIT_NS(zero_vruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
|
||||
SPLIT_NS(avg_vruntime(cfs_rq)));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
|
||||
|
||||
@@ -1474,7 +1474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
|
||||
static void yield_task_scx(struct rq *rq)
|
||||
{
|
||||
struct scx_sched *sch = scx_root;
|
||||
struct task_struct *p = rq->curr;
|
||||
struct task_struct *p = rq->donor;
|
||||
|
||||
if (SCX_HAS_OP(sch, yield))
|
||||
SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
|
||||
@@ -1485,7 +1485,7 @@ static void yield_task_scx(struct rq *rq)
|
||||
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
|
||||
{
|
||||
struct scx_sched *sch = scx_root;
|
||||
struct task_struct *from = rq->curr;
|
||||
struct task_struct *from = rq->donor;
|
||||
|
||||
if (SCX_HAS_OP(sch, yield))
|
||||
return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
|
||||
@@ -2047,7 +2047,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
rq->scx.flags |= SCX_RQ_IN_BALANCE;
|
||||
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
|
||||
rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
|
||||
|
||||
if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
|
||||
unlikely(rq->scx.cpu_released)) {
|
||||
@@ -2153,42 +2153,6 @@ has_tasks:
|
||||
return true;
|
||||
}
|
||||
|
||||
static int balance_scx(struct rq *rq, struct task_struct *prev,
|
||||
struct rq_flags *rf)
|
||||
{
|
||||
int ret;
|
||||
|
||||
rq_unpin_lock(rq, rf);
|
||||
|
||||
ret = balance_one(rq, prev);
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
/*
|
||||
* When core-sched is enabled, this ops.balance() call will be followed
|
||||
* by pick_task_scx() on this CPU and the SMT siblings. Balance the
|
||||
* siblings too.
|
||||
*/
|
||||
if (sched_core_enabled(rq)) {
|
||||
const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
|
||||
int scpu;
|
||||
|
||||
for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
|
||||
struct rq *srq = cpu_rq(scpu);
|
||||
struct task_struct *sprev = srq->curr;
|
||||
|
||||
WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
|
||||
update_rq_clock(srq);
|
||||
balance_one(srq, sprev);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rq_repin_lock(rq, rf);
|
||||
|
||||
maybe_queue_balance_callback(rq);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void process_ddsp_deferred_locals(struct rq *rq)
|
||||
{
|
||||
struct task_struct *p;
|
||||
@@ -2368,41 +2332,23 @@ static struct task_struct *first_local_task(struct rq *rq)
|
||||
struct task_struct, scx.dsq_list.node);
|
||||
}
|
||||
|
||||
static struct task_struct *pick_task_scx(struct rq *rq)
|
||||
static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
|
||||
{
|
||||
struct task_struct *prev = rq->curr;
|
||||
bool keep_prev, kick_idle = false;
|
||||
struct task_struct *p;
|
||||
bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
|
||||
bool kick_idle = false;
|
||||
|
||||
/*
|
||||
* WORKAROUND:
|
||||
*
|
||||
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
|
||||
* have gone through balance_scx(). Unfortunately, there currently is a
|
||||
* bug where fair could say yes on balance() but no on pick_task(),
|
||||
* which then ends up calling pick_task_scx() without preceding
|
||||
* balance_scx().
|
||||
*
|
||||
* Keep running @prev if possible and avoid stalling from entering idle
|
||||
* without balancing.
|
||||
*
|
||||
* Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
|
||||
* if pick_task_scx() is called without preceding balance_scx().
|
||||
*/
|
||||
if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
|
||||
if (prev->scx.flags & SCX_TASK_QUEUED) {
|
||||
keep_prev = true;
|
||||
} else {
|
||||
keep_prev = false;
|
||||
kick_idle = true;
|
||||
}
|
||||
} else if (unlikely(keep_prev &&
|
||||
prev->sched_class != &ext_sched_class)) {
|
||||
/*
|
||||
* Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
|
||||
* conditional on scx_enabled() and may have been skipped.
|
||||
*/
|
||||
rq_modified_clear(rq);
|
||||
rq_unpin_lock(rq, rf);
|
||||
balance_one(rq, prev);
|
||||
rq_repin_lock(rq, rf);
|
||||
maybe_queue_balance_callback(rq);
|
||||
if (rq_modified_above(rq, &ext_sched_class))
|
||||
return RETRY_TASK;
|
||||
|
||||
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
|
||||
if (unlikely(keep_prev &&
|
||||
prev->sched_class != &ext_sched_class)) {
|
||||
WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
|
||||
keep_prev = false;
|
||||
}
|
||||
@@ -2997,7 +2943,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
|
||||
p, p->scx.weight);
|
||||
}
|
||||
|
||||
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
|
||||
static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -3270,6 +3216,8 @@ static void scx_cgroup_unlock(void) {}
|
||||
* their current sched_class. Call them directly from sched core instead.
|
||||
*/
|
||||
DEFINE_SCHED_CLASS(ext) = {
|
||||
.queue_mask = 1,
|
||||
|
||||
.enqueue_task = enqueue_task_scx,
|
||||
.dequeue_task = dequeue_task_scx,
|
||||
.yield_task = yield_task_scx,
|
||||
@@ -3277,7 +3225,6 @@ DEFINE_SCHED_CLASS(ext) = {
|
||||
|
||||
.wakeup_preempt = wakeup_preempt_scx,
|
||||
|
||||
.balance = balance_scx,
|
||||
.pick_task = pick_task_scx,
|
||||
|
||||
.put_prev_task = put_prev_task_scx,
|
||||
@@ -3818,11 +3765,10 @@ static void scx_bypass(bool bypass)
|
||||
*/
|
||||
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
|
||||
scx.runnable_node) {
|
||||
struct sched_enq_and_set_ctx ctx;
|
||||
|
||||
/* cycling deq/enq is enough, see the function comment */
|
||||
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
|
||||
sched_enq_and_set_task(&ctx);
|
||||
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
|
||||
/* nothing */ ;
|
||||
}
|
||||
}
|
||||
|
||||
/* resched to restore ticks and idle state */
|
||||
@@ -3972,22 +3918,20 @@ static void scx_disable_workfn(struct kthread_work *work)
|
||||
|
||||
scx_task_iter_start(&sti);
|
||||
while ((p = scx_task_iter_next_locked(&sti))) {
|
||||
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
|
||||
const struct sched_class *old_class = p->sched_class;
|
||||
const struct sched_class *new_class =
|
||||
__setscheduler_class(p->policy, p->prio);
|
||||
struct sched_enq_and_set_ctx ctx;
|
||||
|
||||
if (old_class != new_class && p->se.sched_delayed)
|
||||
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
|
||||
update_rq_clock(task_rq(p));
|
||||
|
||||
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
|
||||
if (old_class != new_class)
|
||||
queue_flags |= DEQUEUE_CLASS;
|
||||
|
||||
p->sched_class = new_class;
|
||||
check_class_changing(task_rq(p), p, old_class);
|
||||
scoped_guard (sched_change, p, queue_flags) {
|
||||
p->sched_class = new_class;
|
||||
}
|
||||
|
||||
sched_enq_and_set_task(&ctx);
|
||||
|
||||
check_class_changed(task_rq(p), p, old_class, p->prio);
|
||||
scx_exit_task(p);
|
||||
}
|
||||
scx_task_iter_stop(&sti);
|
||||
@@ -4751,26 +4695,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
||||
percpu_down_write(&scx_fork_rwsem);
|
||||
scx_task_iter_start(&sti);
|
||||
while ((p = scx_task_iter_next_locked(&sti))) {
|
||||
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
|
||||
const struct sched_class *old_class = p->sched_class;
|
||||
const struct sched_class *new_class =
|
||||
__setscheduler_class(p->policy, p->prio);
|
||||
struct sched_enq_and_set_ctx ctx;
|
||||
|
||||
if (!tryget_task_struct(p))
|
||||
continue;
|
||||
|
||||
if (old_class != new_class && p->se.sched_delayed)
|
||||
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
|
||||
if (old_class != new_class)
|
||||
queue_flags |= DEQUEUE_CLASS;
|
||||
|
||||
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
|
||||
scoped_guard (sched_change, p, queue_flags) {
|
||||
p->scx.slice = SCX_SLICE_DFL;
|
||||
p->sched_class = new_class;
|
||||
}
|
||||
|
||||
p->scx.slice = SCX_SLICE_DFL;
|
||||
p->sched_class = new_class;
|
||||
check_class_changing(task_rq(p), p, old_class);
|
||||
|
||||
sched_enq_and_set_task(&ctx);
|
||||
|
||||
check_class_changed(task_rq(p), p, old_class, p->prio);
|
||||
put_task_struct(p);
|
||||
}
|
||||
scx_task_iter_stop(&sti);
|
||||
|
||||
@@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
|
||||
|
||||
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
return (s64)(se->vruntime - cfs_rq->min_vruntime);
|
||||
return (s64)(se->vruntime - cfs_rq->zero_vruntime);
|
||||
}
|
||||
|
||||
#define __node_2_se(node) \
|
||||
@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
*
|
||||
* Which we track using:
|
||||
*
|
||||
* v0 := cfs_rq->min_vruntime
|
||||
* v0 := cfs_rq->zero_vruntime
|
||||
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
|
||||
* \Sum w_i := cfs_rq->avg_load
|
||||
*
|
||||
* Since min_vruntime is a monotonic increasing variable that closely tracks
|
||||
* the per-task service, these deltas: (v_i - v), will be in the order of the
|
||||
* maximal (virtual) lag induced in the system due to quantisation.
|
||||
* Since zero_vruntime closely tracks the per-task service, these
|
||||
* deltas: (v_i - v), will be in the order of the maximal (virtual) lag
|
||||
* induced in the system due to quantisation.
|
||||
*
|
||||
* Also, we use scale_load_down() to reduce the size.
|
||||
*
|
||||
@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
||||
avg = div_s64(avg, load);
|
||||
}
|
||||
|
||||
return cfs_rq->min_vruntime + avg;
|
||||
return cfs_rq->zero_vruntime + avg;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
|
||||
load += weight;
|
||||
}
|
||||
|
||||
return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
|
||||
return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
|
||||
}
|
||||
|
||||
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
@@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
return vruntime_eligible(cfs_rq, se->vruntime);
|
||||
}
|
||||
|
||||
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
|
||||
static void update_zero_vruntime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
u64 min_vruntime = cfs_rq->min_vruntime;
|
||||
/*
|
||||
* open coded max_vruntime() to allow updating avg_vruntime
|
||||
*/
|
||||
s64 delta = (s64)(vruntime - min_vruntime);
|
||||
if (delta > 0) {
|
||||
avg_vruntime_update(cfs_rq, delta);
|
||||
min_vruntime = vruntime;
|
||||
}
|
||||
return min_vruntime;
|
||||
}
|
||||
u64 vruntime = avg_vruntime(cfs_rq);
|
||||
s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
|
||||
|
||||
static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct sched_entity *se = __pick_root_entity(cfs_rq);
|
||||
struct sched_entity *curr = cfs_rq->curr;
|
||||
u64 vruntime = cfs_rq->min_vruntime;
|
||||
avg_vruntime_update(cfs_rq, delta);
|
||||
|
||||
if (curr) {
|
||||
if (curr->on_rq)
|
||||
vruntime = curr->vruntime;
|
||||
else
|
||||
curr = NULL;
|
||||
}
|
||||
|
||||
if (se) {
|
||||
if (!curr)
|
||||
vruntime = se->min_vruntime;
|
||||
else
|
||||
vruntime = min_vruntime(vruntime, se->min_vruntime);
|
||||
}
|
||||
|
||||
/* ensure we never gain time by being placed backwards. */
|
||||
cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
|
||||
cfs_rq->zero_vruntime = vruntime;
|
||||
}
|
||||
|
||||
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
|
||||
@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
|
||||
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
avg_vruntime_add(cfs_rq, se);
|
||||
update_zero_vruntime(cfs_rq);
|
||||
se->min_vruntime = se->vruntime;
|
||||
se->min_slice = se->slice;
|
||||
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
|
||||
@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
|
||||
&min_vruntime_cb);
|
||||
avg_vruntime_sub(cfs_rq, se);
|
||||
update_zero_vruntime(cfs_rq);
|
||||
}
|
||||
|
||||
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
|
||||
@@ -955,6 +929,16 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
|
||||
if (cfs_rq->nr_queued == 1)
|
||||
return curr && curr->on_rq ? curr : se;
|
||||
|
||||
/*
|
||||
* Picking the ->next buddy will affect latency but not fairness.
|
||||
*/
|
||||
if (sched_feat(PICK_BUDDY) &&
|
||||
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
|
||||
/* ->next will never be delayed */
|
||||
WARN_ON_ONCE(cfs_rq->next->sched_delayed);
|
||||
return cfs_rq->next;
|
||||
}
|
||||
|
||||
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
|
||||
curr = NULL;
|
||||
|
||||
@@ -1193,6 +1177,8 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
|
||||
return delta_exec;
|
||||
}
|
||||
|
||||
static void set_next_buddy(struct sched_entity *se);
|
||||
|
||||
/*
|
||||
* Used by other classes to account runtime.
|
||||
*/
|
||||
@@ -1226,7 +1212,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
||||
|
||||
curr->vruntime += calc_delta_fair(delta_exec, curr);
|
||||
resched = update_deadline(cfs_rq, curr);
|
||||
update_min_vruntime(cfs_rq);
|
||||
|
||||
if (entity_is_task(curr)) {
|
||||
/*
|
||||
@@ -1239,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
||||
* against fair_server such that it can account for this time
|
||||
* and possibly avoid running this period.
|
||||
*/
|
||||
if (dl_server_active(&rq->fair_server))
|
||||
dl_server_update(&rq->fair_server, delta_exec);
|
||||
dl_server_update(&rq->fair_server, delta_exec);
|
||||
}
|
||||
|
||||
account_cfs_rq_runtime(cfs_rq, delta_exec);
|
||||
@@ -3808,15 +3792,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
||||
if (!curr)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
cfs_rq->nr_queued++;
|
||||
|
||||
/*
|
||||
* The entity's vruntime has been adjusted, so let's check
|
||||
* whether the rq-wide min_vruntime needs updated too. Since
|
||||
* the calculations above require stable min_vruntime rather
|
||||
* than up-to-date one, we do the update at the end of the
|
||||
* reweight process.
|
||||
*/
|
||||
update_min_vruntime(cfs_rq);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5429,15 +5404,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
|
||||
update_cfs_group(se);
|
||||
|
||||
/*
|
||||
* Now advance min_vruntime if @se was the entity holding it back,
|
||||
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
|
||||
* put back on, and if we advance min_vruntime, we'll be placed back
|
||||
* further than we started -- i.e. we'll be penalized.
|
||||
*/
|
||||
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
|
||||
update_min_vruntime(cfs_rq);
|
||||
|
||||
if (flags & DEQUEUE_DELAYED)
|
||||
finish_delayed_dequeue_entity(se);
|
||||
|
||||
@@ -5512,16 +5478,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct sched_entity *se;
|
||||
|
||||
/*
|
||||
* Picking the ->next buddy will affect latency but not fairness.
|
||||
*/
|
||||
if (sched_feat(PICK_BUDDY) &&
|
||||
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
|
||||
/* ->next will never be delayed */
|
||||
WARN_ON_ONCE(cfs_rq->next->sched_delayed);
|
||||
return cfs_rq->next;
|
||||
}
|
||||
|
||||
se = pick_eevdf(cfs_rq);
|
||||
if (se->sched_delayed) {
|
||||
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
|
||||
@@ -7003,12 +6959,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
h_nr_idle = 1;
|
||||
}
|
||||
|
||||
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
|
||||
/* Account for idle runtime */
|
||||
if (!rq->nr_running)
|
||||
dl_server_update_idle_time(rq, rq->curr);
|
||||
if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
|
||||
dl_server_start(&rq->fair_server);
|
||||
}
|
||||
|
||||
/* At this point se is NULL and we are at root level*/
|
||||
add_nr_running(rq, 1);
|
||||
@@ -7035,8 +6987,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
hrtick_update(rq);
|
||||
}
|
||||
|
||||
static void set_next_buddy(struct sched_entity *se);
|
||||
|
||||
/*
|
||||
* Basically dequeue_task_fair(), except it can deal with dequeue_entity()
|
||||
* failing half-way through and resume the dequeue later.
|
||||
@@ -8712,15 +8662,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context
|
||||
set_task_max_allowed_capacity(p);
|
||||
}
|
||||
|
||||
static int
|
||||
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
{
|
||||
if (sched_fair_runnable(rq))
|
||||
return 1;
|
||||
|
||||
return sched_balance_newidle(rq, rf) != 0;
|
||||
}
|
||||
|
||||
static void set_next_buddy(struct sched_entity *se)
|
||||
{
|
||||
for_each_sched_entity(se) {
|
||||
@@ -8732,16 +8673,81 @@ static void set_next_buddy(struct sched_entity *se)
|
||||
}
|
||||
}
|
||||
|
||||
enum preempt_wakeup_action {
|
||||
PREEMPT_WAKEUP_NONE, /* No preemption. */
|
||||
PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */
|
||||
PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */
|
||||
PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */
|
||||
};
|
||||
|
||||
static inline bool
|
||||
set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags,
|
||||
struct sched_entity *pse, struct sched_entity *se)
|
||||
{
|
||||
/*
|
||||
* Keep existing buddy if the deadline is sooner than pse.
|
||||
* The older buddy may be cache cold and completely unrelated
|
||||
 * to the current wakeup but that is unpredictable, whereas
 * obeying the deadline is more in line with EEVDF objectives.
|
||||
*/
|
||||
if (cfs_rq->next && entity_before(cfs_rq->next, pse))
|
||||
return false;
|
||||
|
||||
set_next_buddy(pse);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
|
||||
* strictly enforced because the hint is either misunderstood or
|
||||
* multiple tasks must be woken up.
|
||||
*/
|
||||
static inline enum preempt_wakeup_action
|
||||
preempt_sync(struct rq *rq, int wake_flags,
|
||||
struct sched_entity *pse, struct sched_entity *se)
|
||||
{
|
||||
u64 threshold, delta;
|
||||
|
||||
/*
|
||||
* WF_SYNC without WF_TTWU is not expected so warn if it happens even
|
||||
* though it is likely harmless.
|
||||
*/
|
||||
WARN_ON_ONCE(!(wake_flags & WF_TTWU));
|
||||
|
||||
threshold = sysctl_sched_migration_cost;
|
||||
delta = rq_clock_task(rq) - se->exec_start;
|
||||
if ((s64)delta < 0)
|
||||
delta = 0;
|
||||
|
||||
/*
|
||||
* WF_RQ_SELECTED implies the tasks are stacking on a CPU when they
|
||||
* could run on other CPUs. Reduce the threshold before preemption is
|
||||
* allowed to an arbitrary lower value as it is more likely (but not
|
||||
* guaranteed) the waker requires the wakee to finish.
|
||||
*/
|
||||
if (wake_flags & WF_RQ_SELECTED)
|
||||
threshold >>= 2;
|
||||
|
||||
/*
|
||||
* As WF_SYNC is not strictly obeyed, allow some runtime for batch
|
||||
* wakeups to be issued.
|
||||
*/
|
||||
if (entity_before(pse, se) && delta >= threshold)
|
||||
return PREEMPT_WAKEUP_RESCHED;
|
||||
|
||||
return PREEMPT_WAKEUP_NONE;
|
||||
}
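
/*
 * Editor's example (illustrative numbers, not part of the patch): with the
 * default sysctl_sched_migration_cost of 500us, a plain WF_SYNC wakeup only
 * forces a reschedule once the waker has already run for at least 500us
 * (about 125us when WF_RQ_SELECTED signals the tasks are stacking) and the
 * wakee's deadline is earlier; a waker that has only just started keeps
 * running, which leaves it room to issue the rest of a batch of wakeups
 * before it finally sleeps.
 */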
|
||||
|
||||
/*
|
||||
* Preempt the current task with a newly woken task if needed:
|
||||
*/
|
||||
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
|
||||
{
|
||||
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
|
||||
struct task_struct *donor = rq->donor;
|
||||
struct sched_entity *se = &donor->se, *pse = &p->se;
|
||||
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
|
||||
int cse_is_idle, pse_is_idle;
|
||||
bool do_preempt_short = false;
|
||||
|
||||
if (unlikely(se == pse))
|
||||
return;
|
||||
@@ -8755,10 +8761,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
|
||||
if (task_is_throttled(p))
|
||||
return;
|
||||
|
||||
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
|
||||
set_next_buddy(pse);
|
||||
}
|
||||
|
||||
/*
|
||||
* We can come here with TIF_NEED_RESCHED already set from new task
|
||||
* wake up path.
|
||||
@@ -8790,7 +8792,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
|
||||
* When non-idle entity preempt an idle entity,
|
||||
* don't give idle entity slice protection.
|
||||
*/
|
||||
do_preempt_short = true;
|
||||
preempt_action = PREEMPT_WAKEUP_SHORT;
|
||||
goto preempt;
|
||||
}
|
||||
|
||||
@@ -8809,27 +8811,74 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
|
||||
* If @p has a shorter slice than current and @p is eligible, override
|
||||
* current's slice protection in order to allow preemption.
|
||||
*/
|
||||
do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
|
||||
if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
|
||||
preempt_action = PREEMPT_WAKEUP_SHORT;
|
||||
goto pick;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ignore wakee preemption on WF_FORK as it is less likely that
|
||||
 * there is shared data as exec often follows fork. Do not
|
||||
* preempt for tasks that are sched_delayed as it would violate
|
||||
* EEVDF to forcibly queue an ineligible task.
|
||||
*/
|
||||
if ((wake_flags & WF_FORK) || pse->sched_delayed)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If @p potentially is completing work required by current then
|
||||
* consider preemption.
|
||||
*
|
||||
* Reschedule if waker is no longer eligible. */
|
||||
if (in_task() && !entity_eligible(cfs_rq, se)) {
|
||||
preempt_action = PREEMPT_WAKEUP_RESCHED;
|
||||
goto preempt;
|
||||
}
|
||||
|
||||
/* Prefer picking wakee soon if appropriate. */
|
||||
if (sched_feat(NEXT_BUDDY) &&
|
||||
set_preempt_buddy(cfs_rq, wake_flags, pse, se)) {
|
||||
|
||||
/*
|
||||
* Decide whether to obey WF_SYNC hint for a new buddy. Old
|
||||
* buddies are ignored as they may not be relevant to the
|
||||
* waker and less likely to be cache hot.
|
||||
*/
|
||||
if (wake_flags & WF_SYNC)
|
||||
preempt_action = preempt_sync(rq, wake_flags, pse, se);
|
||||
}
|
||||
|
||||
switch (preempt_action) {
|
||||
case PREEMPT_WAKEUP_NONE:
|
||||
return;
|
||||
case PREEMPT_WAKEUP_RESCHED:
|
||||
goto preempt;
|
||||
case PREEMPT_WAKEUP_SHORT:
|
||||
fallthrough;
|
||||
case PREEMPT_WAKEUP_PICK:
|
||||
break;
|
||||
}
|
||||
|
||||
pick:
|
||||
/*
|
||||
* If @p has become the most eligible task, force preemption.
|
||||
*/
|
||||
if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
|
||||
if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
|
||||
goto preempt;
|
||||
|
||||
if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
|
||||
if (sched_feat(RUN_TO_PARITY))
|
||||
update_protect_slice(cfs_rq, se);
|
||||
|
||||
return;
|
||||
|
||||
preempt:
|
||||
if (do_preempt_short)
|
||||
if (preempt_action == PREEMPT_WAKEUP_SHORT)
|
||||
cancel_protect_slice(se);
|
||||
|
||||
resched_curr_lazy(rq);
|
||||
}
|
||||
|
||||
static struct task_struct *pick_task_fair(struct rq *rq)
|
||||
static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
|
||||
{
|
||||
struct sched_entity *se;
|
||||
struct cfs_rq *cfs_rq;
|
||||
@@ -8873,7 +8922,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
|
||||
int new_tasks;
|
||||
|
||||
again:
|
||||
p = pick_task_fair(rq);
|
||||
p = pick_task_fair(rq, rf);
|
||||
if (!p)
|
||||
goto idle;
|
||||
se = &p->se;
|
||||
@@ -8952,14 +9001,10 @@ idle:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
|
||||
static struct task_struct *
|
||||
fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
|
||||
{
|
||||
return pick_next_task_fair(rq, prev, NULL);
|
||||
}
|
||||
|
||||
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
|
||||
{
|
||||
return pick_task_fair(dl_se->rq);
|
||||
return pick_task_fair(dl_se->rq, rf);
|
||||
}
|
||||
|
||||
void fair_server_init(struct rq *rq)
|
||||
@@ -8990,7 +9035,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
|
||||
*/
|
||||
static void yield_task_fair(struct rq *rq)
|
||||
{
|
||||
struct task_struct *curr = rq->curr;
|
||||
struct task_struct *curr = rq->donor;
|
||||
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
||||
struct sched_entity *se = &curr->se;
|
||||
|
||||
@@ -9014,7 +9059,18 @@ static void yield_task_fair(struct rq *rq)
|
||||
*/
|
||||
rq_clock_skip_update(rq);
|
||||
|
||||
se->deadline += calc_delta_fair(se->slice, se);
|
||||
/*
|
||||
* Forfeit the remaining vruntime, only if the entity is eligible. This
|
||||
* condition is necessary because in core scheduling we prefer to run
|
||||
* ineligible tasks rather than force idling. If this happens we may
|
||||
* end up in a loop where the core scheduler picks the yielding task,
|
||||
* which yields immediately again; without the condition the vruntime
|
||||
* ends up quickly running away.
|
||||
*/
|
||||
if (entity_eligible(cfs_rq, se)) {
|
||||
se->vruntime = se->deadline;
|
||||
se->deadline += calc_delta_fair(se->slice, se);
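/*
 * Editor's note: for an eligible entity this forfeits whatever service it
 * was still owed in the current slice (vruntime jumps ahead to the old
 * deadline) and then grants a fresh deadline one full slice of virtual
 * time later, instead of only pushing the deadline out as the removed
 * line above did.
 */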
|
||||
}
|
||||
}
|
||||
|
||||
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
|
||||
@@ -10678,7 +10734,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
|
||||
if (sd->flags & SD_ASYM_CPUCAPACITY)
|
||||
sgs->group_misfit_task_load = 1;
|
||||
|
||||
for_each_cpu(i, sched_group_span(group)) {
|
||||
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
|
||||
struct rq *rq = cpu_rq(i);
|
||||
unsigned int local;
|
||||
|
||||
@@ -11729,6 +11785,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This flag serializes load-balancing passes over large domains
|
||||
* (above the NODE topology level) - only one load-balancing instance
|
||||
* may run at a time, to reduce overhead on very large systems with
|
||||
* lots of CPUs and large NUMA distances.
|
||||
*
|
||||
* - Note that load-balancing passes triggered while another one
|
||||
* is executing are skipped and not re-tried.
|
||||
*
|
||||
* - Also note that this does not serialize rebalance_domains()
|
||||
* execution, as non-SD_SERIALIZE domains will still be
|
||||
* load-balanced in parallel.
|
||||
*/
|
||||
static atomic_t sched_balance_running = ATOMIC_INIT(0);
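
/*
 * Editor's note: atomic_try_cmpxchg_acquire() in sched_balance_rq() takes
 * this flag only when it was 0, and its acquire ordering pairs with the
 * atomic_set_release() on the exit path, so a new SD_SERIALIZE pass
 * observes everything the previous owner did; a failed attempt simply
 * skips the pass, as described above.
 */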
|
||||
|
||||
/*
|
||||
* Check this_cpu to ensure it is balanced within domain. Attempt to move
|
||||
* tasks if there is an imbalance.
|
||||
@@ -11754,6 +11825,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
|
||||
.fbq_type = all,
|
||||
.tasks = LIST_HEAD_INIT(env.tasks),
|
||||
};
|
||||
bool need_unlock = false;
|
||||
|
||||
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
|
||||
|
||||
@@ -11765,6 +11837,14 @@ redo:
|
||||
goto out_balanced;
|
||||
}
|
||||
|
||||
if (!need_unlock && (sd->flags & SD_SERIALIZE)) {
|
||||
int zero = 0;
|
||||
if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1))
|
||||
goto out_balanced;
|
||||
|
||||
need_unlock = true;
|
||||
}
|
||||
|
||||
group = sched_balance_find_src_group(&env);
|
||||
if (!group) {
|
||||
schedstat_inc(sd->lb_nobusyg[idle]);
|
||||
@@ -12005,6 +12085,9 @@ out_one_pinned:
|
||||
sd->balance_interval < sd->max_interval)
|
||||
sd->balance_interval *= 2;
|
||||
out:
|
||||
if (need_unlock)
|
||||
atomic_set_release(&sched_balance_running, 0);
|
||||
|
||||
return ld_moved;
|
||||
}
|
||||
|
||||
@@ -12129,21 +12212,6 @@ out_unlock:
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This flag serializes load-balancing passes over large domains
|
||||
* (above the NODE topology level) - only one load-balancing instance
|
||||
* may run at a time, to reduce overhead on very large systems with
|
||||
* lots of CPUs and large NUMA distances.
|
||||
*
|
||||
* - Note that load-balancing passes triggered while another one
|
||||
* is executing are skipped and not re-tried.
|
||||
*
|
||||
* - Also note that this does not serialize rebalance_domains()
|
||||
* execution, as non-SD_SERIALIZE domains will still be
|
||||
* load-balanced in parallel.
|
||||
*/
|
||||
static atomic_t sched_balance_running = ATOMIC_INIT(0);
|
||||
|
||||
/*
|
||||
* Scale the max sched_balance_rq interval with the number of CPUs in the system.
|
||||
* This trades load-balance latency on larger machines for less cross talk.
|
||||
@@ -12153,30 +12221,43 @@ void update_max_interval(void)
|
||||
max_load_balance_interval = HZ*num_online_cpus()/10;
|
||||
}
|
||||
|
||||
static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
|
||||
static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
|
||||
{
|
||||
sd->newidle_call++;
|
||||
sd->newidle_success += success;
|
||||
|
||||
if (sd->newidle_call >= 1024) {
|
||||
sd->newidle_ratio = sd->newidle_success;
|
||||
sd->newidle_call /= 2;
|
||||
sd->newidle_success /= 2;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
|
||||
{
|
||||
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
|
||||
unsigned long now = jiffies;
|
||||
|
||||
if (cost)
|
||||
update_newidle_stats(sd, success);
|
||||
|
||||
if (cost > sd->max_newidle_lb_cost) {
|
||||
/*
|
||||
* Track max cost of a domain to make sure to not delay the
|
||||
* next wakeup on the CPU.
|
||||
*
|
||||
* sched_balance_newidle() bumps the cost whenever newidle
|
||||
* balance fails, and we don't want things to grow out of
|
||||
* control. Use the sysctl_sched_migration_cost as the upper
|
||||
 * limit, plus a little extra to avoid off-by-ones.
|
||||
*/
|
||||
sd->max_newidle_lb_cost =
|
||||
min(cost, sysctl_sched_migration_cost + 200);
|
||||
sd->last_decay_max_lb_cost = jiffies;
|
||||
} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
|
||||
sd->max_newidle_lb_cost = cost;
|
||||
sd->last_decay_max_lb_cost = now;
|
||||
|
||||
} else if (time_after(now, next_decay)) {
|
||||
/*
|
||||
* Decay the newidle max times by ~1% per second to ensure that
|
||||
* it is not outdated and the current max cost is actually
|
||||
* shorter.
|
||||
*/
|
||||
sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
|
||||
sd->last_decay_max_lb_cost = jiffies;
|
||||
|
||||
sd->last_decay_max_lb_cost = now;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -12199,7 +12280,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
/* Earliest time when we have to do rebalance again */
|
||||
unsigned long next_balance = jiffies + 60*HZ;
|
||||
int update_next_balance = 0;
|
||||
int need_serialize, need_decay = 0;
|
||||
int need_decay = 0;
|
||||
u64 max_cost = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
@@ -12208,7 +12289,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
* Decay the newidle max times here because this is a regular
|
||||
* visit to all the domains.
|
||||
*/
|
||||
need_decay = update_newidle_cost(sd, 0);
|
||||
need_decay = update_newidle_cost(sd, 0, 0);
|
||||
max_cost += sd->max_newidle_lb_cost;
|
||||
|
||||
/*
|
||||
@@ -12223,13 +12304,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
}
|
||||
|
||||
interval = get_sd_balance_interval(sd, busy);
|
||||
|
||||
need_serialize = sd->flags & SD_SERIALIZE;
|
||||
if (need_serialize) {
|
||||
if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (time_after_eq(jiffies, sd->last_balance + interval)) {
|
||||
if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
|
||||
/*
|
||||
@@ -12243,9 +12317,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
sd->last_balance = jiffies;
|
||||
interval = get_sd_balance_interval(sd, busy);
|
||||
}
|
||||
if (need_serialize)
|
||||
atomic_set_release(&sched_balance_running, 0);
|
||||
out:
|
||||
if (time_after(next_balance, sd->last_balance + interval)) {
|
||||
next_balance = sd->last_balance + interval;
|
||||
update_next_balance = 1;
|
||||
@@ -12824,18 +12895,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
|
||||
|
||||
rcu_read_lock();
|
||||
sd = rcu_dereference_check_sched_domain(this_rq->sd);
|
||||
if (!sd) {
|
||||
rcu_read_unlock();
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!get_rd_overloaded(this_rq->rd) ||
|
||||
(sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
|
||||
this_rq->avg_idle < sd->max_newidle_lb_cost) {
|
||||
|
||||
if (sd)
|
||||
update_next_balance(sd, &next_balance);
|
||||
update_next_balance(sd, &next_balance);
|
||||
rcu_read_unlock();
|
||||
|
||||
goto out;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
rq_modified_clear(this_rq);
|
||||
raw_spin_rq_unlock(this_rq);
|
||||
|
||||
t0 = sched_clock_cpu(this_cpu);
|
||||
@@ -12851,6 +12925,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
|
||||
break;
|
||||
|
||||
if (sd->flags & SD_BALANCE_NEWIDLE) {
|
||||
unsigned int weight = 1;
|
||||
|
||||
if (sched_feat(NI_RANDOM)) {
|
||||
/*
|
||||
* Throw a 1k sided dice; and only run
|
||||
* newidle_balance according to the success
|
||||
* rate.
|
||||
*/
|
||||
u32 d1k = sched_rng() % 1024;
|
||||
weight = 1 + sd->newidle_ratio;
|
||||
if (d1k > weight) {
|
||||
update_newidle_stats(sd, 0);
|
||||
continue;
|
||||
}
|
||||
weight = (1024 + weight/2) / weight;
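/*
 * Editor's example (illustrative numbers, not from the patch): a domain
 * with newidle_ratio == 255 -- roughly a 25% success rate on the 0..1024
 * scale kept by update_newidle_stats() -- gets weight == 256, so the dice
 * lets the balance attempt through only about a quarter of the time; the
 * line above then rescales weight to (1024 + 128) / 256 == 4 (integer
 * math), so each sampled success is counted four times and the success
 * statistics stay roughly unbiased despite most attempts being skipped.
 */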
|
||||
}
|
||||
|
||||
pulled_task = sched_balance_rq(this_cpu, this_rq,
|
||||
sd, CPU_NEWLY_IDLE,
|
||||
@@ -12862,13 +12952,10 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
|
||||
t0 = t1;
|
||||
|
||||
/*
|
||||
* Failing newidle means it is not effective;
|
||||
* bump the cost so we end up doing less of it.
|
||||
* Track max cost of a domain to make sure to not delay the
|
||||
* next wakeup on the CPU.
|
||||
*/
|
||||
if (!pulled_task)
|
||||
domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
|
||||
|
||||
update_newidle_cost(sd, domain_cost);
|
||||
update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -12893,8 +12980,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
|
||||
if (this_rq->cfs.h_nr_queued && !pulled_task)
|
||||
pulled_task = 1;
|
||||
|
||||
/* Is there a task of a high priority class? */
|
||||
if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
|
||||
/* If a higher prio class was modified, restart the pick */
|
||||
if (rq_modified_above(this_rq, &fair_sched_class))
|
||||
pulled_task = -1;
|
||||
|
||||
out:
|
||||
@@ -13012,7 +13099,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
|
||||
}
|
||||
|
||||
/*
 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
 * Consider any infeasible weight scenario. Take for instance two tasks,
 * each bound to their respective sibling, one with weight 1 and one with
 * weight 2. Then the lower weight task will run ahead of the higher weight
 * task without bound.
 *
 * This utterly destroys the concept of a shared time base.
 *
 * Remember; all this is about proportionally fair scheduling, where each
 * task receives:
 *
 *            w_i
 *   dt_i = ----------- dt                                             (1)
 *          \Sum_j w_j
 *
 * which we do by tracking a virtual time, s_i:
 *
 *          1
 *   s_i = --- d[t]_i                                                  (2)
 *         w_i
 *
 * Where d[t] is a delta of discrete time, while dt is an infinitesimal.
 * The immediate corollary is that the ideal schedule S, were (2) to use
 * an infinitesimal delta, is:
 *
 *            1
 *   S = ----------- dt                                                (3)
 *       \Sum_i w_i
 *
 * From which we can define the lag, or deviation from the ideal, as:
 *
 *   lag(i) = S - s_i                                                  (4)
 *
 * And since the one and only purpose is to approximate S, we get that:
 *
 *   \Sum_i w_i lag(i) := 0                                            (5)
 *
 * If this were not so, we no longer converge to S, and we can no longer
 * claim our scheduler has any of the properties we derive from S. This is
 * exactly what you did above, you broke it!
 *
 *
 * Let's continue for a while though, to see if there is anything useful to
 * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i:
 *
 *       \Sum_i w_i s_i
 *   S = --------------                                                (6)
 *         \Sum_i w_i
 *
 * Which gives us a way to compute S, given our s_i. Now, if you've read
 * our code, you know that we do not in fact do this; the reason for this
 * is two-fold. Firstly, computing S in that way requires a 64bit division
 * for every time we'd use it (see 12), and secondly, this only describes
 * the steady-state, it doesn't handle dynamics.
 *
 * Anyway, in (6): s_i -> x + (s_i - x), to get:
 *
 *           \Sum_i w_i (s_i - x)
 *   S - x = --------------------                                      (7)
 *                \Sum_i w_i
 *
 * Which shows that S and s_i transform alike (which makes perfect sense
 * given that S is basically the (weighted) average of s_i).
 *
 * So the thing to remember is that the above is strictly UP. It is
 * possible to generalize to multiple runqueues -- however it gets really
 * yuck when you have to add affinity support, as illustrated by our very
 * first counter-example.
 *
 * Luckily I think we can avoid needing a full multi-queue variant for
 * core-scheduling (or load-balancing). The crucial observation is that we
 * only actually need this comparison in the presence of forced-idle; only
 * then do we need to tell if the stalled rq has higher priority over the
 * other.
 *
 * [XXX assumes SMT2; better consider the more general case, I suspect
 * it'll work out because our comparison is always between 2 rqs and the
 * answer is only interesting if one of them is forced-idle]
 *
 * And (under assumption of SMT2) when there is forced-idle, there is only
 * a single queue, so everything works like normal.
 *
 * Let, for our runqueue 'k':
 *
 *   T_k = \Sum_i w_i s_i
 *   W_k = \Sum_i w_i ; for all i of k                                 (8)
 *
 * Then we can write (6) like:
 *
 *         T_k
 *   S_k = ---                                                         (9)
 *         W_k
 *
 * From which immediately follows that:
 *
 *           T_k + T_l
 *   S_k+l = ---------                                                (10)
 *           W_k + W_l
 *
 * On which we can define a combined lag:
 *
 *   lag_k+l(i) := S_k+l - s_i                                        (11)
 *
 * And that gives us the tools to compare tasks across a combined runqueue.
 *
 *
 * Combined this gives the following:
 *
 * a) when a runqueue enters force-idle, sync it against its sibling rq(s)
 *    using (7); this only requires storing single 'time'-stamps.
 *
 * b) when comparing tasks between 2 runqueues of which one is forced-idle,
 *    compare the combined lag, per (11).
 *
 * Now, of course cgroups (I so hate them) make this more interesting in
 * that a) seems to suggest we need to iterate all cgroups on a CPU at such
 * boundaries, but I think we can avoid that. The force-idle is for the
 * whole CPU, all its rqs. So we can mark it in the root and lazily
 * propagate downward on demand.
 */
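
/*
 * Editor's worked example of (8)-(11), with made-up numbers: let sibling k
 * have T_k = 30, W_k = 3 and sibling l have T_l = 10, W_l = 1. Then
 * S_k = S_l = 10 and S_k+l = (30 + 10) / (3 + 1) = 10; a task with s_i = 8
 * has combined lag +2 and wins over one with s_i = 12 (combined lag -2),
 * regardless of which of the two runqueues either task sits on.
 */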
|
||||
|
||||
/*
|
||||
* So this sync is basically a relative reset of S to 0.
|
||||
*
|
||||
* So with 2 queues, when one goes idle, we drop them both to 0 and one
|
||||
* then increases due to not being idle, and the idle one builds up lag to
|
||||
* get re-elected. So far so simple, right?
|
||||
*
|
||||
* When there's 3, we can have the situation where 2 run and one is idle,
|
||||
* we sync to 0 and let the idle one build up lag to get re-election. Now
|
||||
* suppose another one also drops idle. At this point dropping all to 0
|
||||
* again would destroy the built-up lag from the queue that was already
|
||||
* idle, not good.
|
||||
*
|
||||
* So instead of syncing everything, we can:
|
||||
*
|
||||
* less := !((s64)(s_a - s_b) <= 0)
|
||||
*
|
||||
 *   (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b
 *                             == v_a - (v_b + S_a - S_b)
|
||||
*
|
||||
* IOW, we can recast the (lag) comparison to a one-sided difference.
|
||||
* So if then, instead of syncing the whole queue, sync the idle queue
|
||||
* against the active queue with S_a + S_b at the point where we sync.
|
||||
*
|
||||
* (XXX consider the implication of living in a cyclic group: N / 2^n N)
|
||||
*
|
||||
* This gives us means of syncing single queues against the active queue,
|
||||
* and for already idle queues to preserve their build-up lag.
|
||||
*
|
||||
* Of course, then we get the situation where there's 2 active and one
|
||||
* going idle, who do we pick to sync against? Theory would have us sync
|
||||
* against the combined S, but as we've already demonstrated, there is no
|
||||
* such thing in infeasible weight scenarios.
|
||||
*
|
||||
* One thing I've considered; and this is where that core_active rudiment
|
||||
* came from, is having active queues sync up between themselves after
|
||||
* every tick. This limits the observed divergence due to the work
|
||||
* conservancy.
|
||||
*
|
||||
* On top of that, we can improve upon things by employing (10) here.
|
||||
*/
|
||||
|
||||
/*
|
||||
* se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
|
||||
*/
|
||||
static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
|
||||
bool forceidle)
|
||||
@@ -13026,7 +13276,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
|
||||
cfs_rq->forceidle_seq = fi_seq;
|
||||
}
|
||||
|
||||
cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
|
||||
cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13079,11 +13329,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
|
||||
|
||||
/*
|
||||
* Find delta after normalizing se's vruntime with its cfs_rq's
|
||||
* min_vruntime_fi, which would have been updated in prior calls
|
||||
* zero_vruntime_fi, which would have been updated in prior calls
|
||||
* to se_fi_update().
|
||||
*/
|
||||
delta = (s64)(sea->vruntime - seb->vruntime) +
|
||||
(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
|
||||
(s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
|
||||
|
||||
return delta > 0;
|
||||
}
|
||||
@@ -13145,11 +13395,14 @@ static void task_fork_fair(struct task_struct *p)
|
||||
* the current task.
|
||||
*/
|
||||
static void
|
||||
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
|
||||
prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
|
||||
{
|
||||
if (!task_on_rq_queued(p))
|
||||
return;
|
||||
|
||||
if (p->prio == oldprio)
|
||||
return;
|
||||
|
||||
if (rq->cfs.nr_queued == 1)
|
||||
return;
|
||||
|
||||
@@ -13161,8 +13414,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
|
||||
if (task_current_donor(rq, p)) {
|
||||
if (p->prio > oldprio)
|
||||
resched_curr(rq);
|
||||
} else
|
||||
} else {
|
||||
wakeup_preempt(rq, p, 0);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
@@ -13246,6 +13500,12 @@ static void attach_task_cfs_rq(struct task_struct *p)
|
||||
attach_entity_cfs_rq(se);
|
||||
}
|
||||
|
||||
static void switching_from_fair(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
if (p->se.sched_delayed)
|
||||
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
|
||||
}
|
||||
|
||||
static void switched_from_fair(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
detach_task_cfs_rq(p);
|
||||
@@ -13319,7 +13579,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
|
||||
void init_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
|
||||
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
|
||||
cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
|
||||
raw_spin_lock_init(&cfs_rq->removed.lock);
|
||||
}
|
||||
|
||||
@@ -13620,6 +13880,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
|
||||
*/
|
||||
DEFINE_SCHED_CLASS(fair) = {
|
||||
|
||||
.queue_mask = 2,
|
||||
|
||||
.enqueue_task = enqueue_task_fair,
|
||||
.dequeue_task = dequeue_task_fair,
|
||||
.yield_task = yield_task_fair,
|
||||
@@ -13628,11 +13890,10 @@ DEFINE_SCHED_CLASS(fair) = {
|
||||
.wakeup_preempt = check_preempt_wakeup_fair,
|
||||
|
||||
.pick_task = pick_task_fair,
|
||||
.pick_next_task = __pick_next_task_fair,
|
||||
.pick_next_task = pick_next_task_fair,
|
||||
.put_prev_task = put_prev_task_fair,
|
||||
.set_next_task = set_next_task_fair,
|
||||
|
||||
.balance = balance_fair,
|
||||
.select_task_rq = select_task_rq_fair,
|
||||
.migrate_task_rq = migrate_task_rq_fair,
|
||||
|
||||
@@ -13647,6 +13908,7 @@ DEFINE_SCHED_CLASS(fair) = {
|
||||
|
||||
.reweight_task = reweight_task_fair,
|
||||
.prio_changed = prio_changed_fair,
|
||||
.switching_from = switching_from_fair,
|
||||
.switched_from = switched_from_fair,
|
||||
.switched_to = switched_to_fair,
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true)
|
||||
 * wakeup-preemption), since it's likely going to consume data we
|
||||
* touched, increases cache locality.
|
||||
*/
|
||||
SCHED_FEAT(NEXT_BUDDY, false)
|
||||
SCHED_FEAT(NEXT_BUDDY, true)
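
/*
 * Editor's note: as with any scheduler feature this default can still be
 * flipped at run time through the sched features debugfs knob, e.g.
 * "echo NO_NEXT_BUDDY > /sys/kernel/debug/sched/features" (exact path
 * depends on kernel config and a mounted debugfs).
 */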
|
||||
|
||||
/*
|
||||
* Allow completely ignoring cfs_rq->next; which can be set from various
|
||||
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
|
||||
SCHED_FEAT(UTIL_EST, true)
|
||||
|
||||
SCHED_FEAT(LATENCY_WARN, false)
|
||||
|
||||
/*
|
||||
* Do newidle balancing proportional to its success rate using randomization.
|
||||
*/
|
||||
SCHED_FEAT(NI_RANDOM, true)
|
||||
|
||||
@@ -452,9 +452,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}

static void update_curr_idle(struct rq *rq);

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
dl_server_update_idle_time(rq, prev);
update_curr_idle(rq);
scx_update_idle(rq, false, true);
}

@@ -466,7 +468,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
next->se.exec_start = rq_clock_task(rq);
}

struct task_struct *pick_task_idle(struct rq *rq)
struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
{
scx_update_idle(rq, true, false);
return rq->idle;

@@ -496,21 +498,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
*/
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
update_curr_idle(rq);
}

static void switched_to_idle(struct rq *rq, struct task_struct *p)
static void switching_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}

static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (p->prio == oldprio)
return;

BUG();
}

static void update_curr_idle(struct rq *rq)
{
struct sched_entity *se = &rq->idle->se;
u64 now = rq_clock_task(rq);
s64 delta_exec;

delta_exec = now - se->exec_start;
if (unlikely(delta_exec <= 0))
return;

se->exec_start = now;

dl_server_update_idle(&rq->fair_server, delta_exec);
}

/*

@@ -518,6 +535,8 @@ static void update_curr_idle(struct rq *rq)
*/
DEFINE_SCHED_CLASS(idle) = {

.queue_mask = 0,

/* no enqueue/yield_task for idle tasks */

/* dequeue is not valid, we print a debug message there: */

@@ -536,6 +555,6 @@ DEFINE_SCHED_CLASS(idle) = {
.task_tick = task_tick_idle,

.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.switching_to = switching_to_idle,
.update_curr = update_curr_idle,
};
@@ -1490,7 +1490,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)

static void yield_task_rt(struct rq *rq)
{
requeue_task_rt(rq, rq->curr, 0);
requeue_task_rt(rq, rq->donor, 0);
}

static int find_lowest_rq(struct task_struct *task);

@@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}

static struct task_struct *pick_task_rt(struct rq *rq)
static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *p;

@@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* us to initiate a push or pull.
*/
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;

if (p->prio == oldprio)
return;

if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we

@@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)

DEFINE_SCHED_CLASS(rt) = {

.queue_mask = 4,

.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,

@@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = {

.get_rr_interval = get_rr_interval_rt,

.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
.prio_changed = prio_changed_rt,

.update_curr = update_curr_rt,
@@ -5,6 +5,7 @@
#ifndef _KERNEL_SCHED_SCHED_H
#define _KERNEL_SCHED_SCHED_H

#include <linux/prandom.h>
#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/cpufreq.h>

@@ -20,7 +21,6 @@
#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
#include <linux/sched/topology.h>

#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/bug.h>

@@ -405,6 +405,7 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
* naturally thottled to once per period, avoiding high context switch
* workloads from spamming the hrtimer program/cancel paths.
*/
extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);

@@ -412,8 +413,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);

extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
extern void fair_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,

@@ -682,10 +681,10 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;

u64 min_vruntime;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 min_vruntime_fi;
u64 zero_vruntime_fi;
#endif

struct rb_root_cached tasks_timeline;

@@ -780,7 +779,6 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */

@@ -1120,6 +1118,8 @@ struct rq {
/* runqueue lock: */
raw_spinlock_t __lock;

/* Per class runqueue modification mask; bits in class order. */
unsigned int queue_mask;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;

@@ -1349,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
}

DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);

static inline u32 sched_rng(void)
{
return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
}

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)

@@ -1432,6 +1438,9 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
if (!sched_core_enabled(rq))
return true;

if (rq->core->core_cookie == p->core_cookie)
return true;

for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
if (!available_idle_cpu(cpu)) {
idle_core = false;

@@ -1443,7 +1452,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
* A CPU in an idle core is always the best choice for tasks with
* cookies.
*/
return idle_core || rq->core->core_cookie == p->core_cookie;
return idle_core;
}

static inline bool sched_group_cookie_match(struct rq *rq,

@@ -1827,7 +1836,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);

static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
static inline void
__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);

@@ -1839,8 +1849,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock(rq);
__task_rq_unlock(rq, p, rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}

@@ -1849,6 +1858,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)

DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
_T->rq = __task_rq_lock(_T->lock, &_T->rf),
__task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)

static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
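As a reference for the new guard above, this is the shape of a caller converted to it; the helper name is hypothetical, while the guard class, its rq/rf members and the new three-argument __task_rq_unlock() are from the hunks above.

/*
 * Illustrative only: take task_rq(p)'s lock via the __task_rq_lock guard
 * and let cleanup release it on every exit path.
 */
static void frobnicate_task(struct task_struct *p)	/* hypothetical helper */
{
	CLASS(__task_rq_lock, rq_guard)(p);	/* locks and pins task_rq(p) */
	struct rq *rq = rq_guard.rq;

	update_rq_clock(rq);
	/* ... inspect or modify rq/p state under rq->lock ... */
}	/* __task_rq_unlock(rq, p, &rq_guard.rf) runs automatically here */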
@@ -2342,8 +2356,7 @@ extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
*
* DEQUEUE_SLEEP - task is no longer runnable
* ENQUEUE_WAKEUP - task just became runnable
* SLEEP/WAKEUP - task is no-longer/just-became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs

@@ -2356,34 +2369,46 @@ extern const u32 sched_prio_to_wmult[40];
*
* MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
*
* DELAYED - de/re-queue a sched_delayed task
*
* CLASS - going to update p->sched_class; makes sched_change call the
* various switch methods.
*
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
* XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but
* SCHED_DEADLINE seems to rely on this for now.
*/

#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
#define DEQUEUE_THROTTLE 0x800
#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */

#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04
#define ENQUEUE_NOCLOCK 0x08
#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */

#define ENQUEUE_HEAD 0x10
#define ENQUEUE_REPLENISH 0x20
#define ENQUEUE_MIGRATED 0x40
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
#define ENQUEUE_DELAYED 0x200
#define ENQUEUE_RQ_SELECTED 0x400
#define DEQUEUE_SPECIAL 0x00010000
#define DEQUEUE_THROTTLE 0x00020000

#define ENQUEUE_WAKEUP 0x0001
#define ENQUEUE_RESTORE 0x0002
#define ENQUEUE_MOVE 0x0004
#define ENQUEUE_NOCLOCK 0x0008

#define ENQUEUE_MIGRATING 0x0010
#define ENQUEUE_DELAYED 0x0020
#define ENQUEUE_CLASS 0x0040

#define ENQUEUE_HEAD 0x00010000
#define ENQUEUE_REPLENISH 0x00020000
#define ENQUEUE_MIGRATED 0x00040000
#define ENQUEUE_INITIAL 0x00080000
#define ENQUEUE_RQ_SELECTED 0x00100000

#define RETRY_TASK ((void *)-1UL)
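Not part of the patch, but a compile-time restatement of the "Matches" comments above: after the renumbering, every paired dequeue/enqueue flag still shares the same bit value, which is what lets the common low bits be passed through unchanged.

/*
 * Illustrative sanity checks for the paired DEQUEUE and ENQUEUE flags.
 */
static inline void check_queue_flag_pairs(void)
{
	BUILD_BUG_ON(DEQUEUE_SLEEP     != ENQUEUE_WAKEUP);
	BUILD_BUG_ON(DEQUEUE_SAVE      != ENQUEUE_RESTORE);
	BUILD_BUG_ON(DEQUEUE_MOVE      != ENQUEUE_MOVE);
	BUILD_BUG_ON(DEQUEUE_NOCLOCK   != ENQUEUE_NOCLOCK);
	BUILD_BUG_ON(DEQUEUE_MIGRATING != ENQUEUE_MIGRATING);
	BUILD_BUG_ON(DEQUEUE_DELAYED   != ENQUEUE_DELAYED);
	BUILD_BUG_ON(DEQUEUE_CLASS     != ENQUEUE_CLASS);
}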
@@ -2400,16 +2425,61 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
/*
* idle: 0
* ext: 1
* fair: 2
* rt: 4
* dl: 8
* stop: 16
*/
unsigned int queue_mask;

/*
* move_queued_task/activate_task/enqueue_task: rq->lock
* ttwu_do_activate/activate_task/enqueue_task: rq->lock
* wake_up_new_task/activate_task/enqueue_task: task_rq_lock
* ttwu_runnable/enqueue_task: task_rq_lock
* proxy_task_current: rq->lock
* sched_change_end
*/
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
/*
* move_queued_task/deactivate_task/dequeue_task: rq->lock
* __schedule/block_task/dequeue_task: rq->lock
* proxy_task_current: rq->lock
* wait_task_inactive: task_rq_lock
* sched_change_begin
*/
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);

/*
* do_sched_yield: rq->lock
*/
void (*yield_task) (struct rq *rq);
/*
* yield_to: rq->lock (double)
*/
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);

/*
* move_queued_task: rq->lock
* __migrate_swap_task: rq->lock
* ttwu_do_activate: rq->lock
* ttwu_runnable: task_rq_lock
* wake_up_new_task: task_rq_lock
*/
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);

/*
* schedule/pick_next_task/prev_balance: rq->lock
*/
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
struct task_struct *(*pick_task)(struct rq *rq);

/*
* schedule/pick_next_task: rq->lock
*/
struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
*
@@ -2419,55 +2489,123 @@ struct sched_class {
* set_next_task_first(next);
* }
*/
struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf);

/*
* sched_change:
* __schedule: rq->lock
*/
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);

/*
* select_task_rq: p->pi_lock
* sched_exec: p->pi_lock
*/
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);

/*
* set_task_cpu: p->pi_lock || rq->lock (ttwu like)
*/
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

/*
* ttwu_do_activate: rq->lock
* wake_up_new_task: task_rq_lock
*/
void (*task_woken)(struct rq *this_rq, struct task_struct *task);

/*
* do_set_cpus_allowed: task_rq_lock + sched_change
*/
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);

/*
* sched_set_rq_{on,off}line: rq->lock
*/
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);

/*
* push_cpu_stop: p->pi_lock && rq->lock
*/
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);

/*
* hrtick: rq->lock
* sched_tick: rq->lock
* sched_tick_remote: rq->lock
*/
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
/*
* sched_cgroup_fork: p->pi_lock
*/
void (*task_fork)(struct task_struct *p);
/*
* finish_task_switch: no locks
*/
void (*task_dead)(struct task_struct *p);

/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
* sched_change
*/
void (*switching_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
u64 oldprio);

/*
* set_load_weight: task_rq_lock + sched_change
* __setscheduler_parms: task_rq_lock + sched_change
*/
void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);

/*
* sched_rr_get_interval: task_rq_lock
*/
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);

/*
* task_sched_runtime: task_rq_lock
*/
void (*update_curr)(struct rq *rq);

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* sched_change_group: task_rq_lock + sched_change
*/
void (*task_change_group)(struct task_struct *p);
#endif

#ifdef CONFIG_SCHED_CORE
/*
* pick_next_task: rq->lock
* try_steal_cookie: rq->lock (double)
*/
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};

/*
* Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
*/
static inline void rq_modified_clear(struct rq *rq)
{
rq->queue_mask = 0;
}

static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class)
{
unsigned int mask = class->queue_mask;
return rq->queue_mask & ~((mask << 1) - 1);
}

static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
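A worked example of the mask arithmetic above (an editor's sketch, not code from the patch): it assumes the enqueue path ORs the enqueuing class's queue_mask bit into rq->queue_mask, which this excerpt does not show.

/*
 * How a pick that drops rq->lock can detect that a higher class became
 * runnable in the meantime.
 */
static struct task_struct *example_pick(struct rq *rq, struct rq_flags *rf)
{
	struct task_struct *p;

	rq_modified_clear(rq);				/* rq->queue_mask = 0 */

	p = fair_sched_class.pick_task(rq, rf);		/* may drop/retake rq->lock */

	/*
	 * fair's queue_mask is 2, so ~((2 << 1) - 1) == ~3 keeps only the
	 * rt/dl/stop bits (4, 8, 16).  If any of those got set while the
	 * lock was dropped, the pick is stale and must be redone.
	 */
	if (rq_modified_above(rq, &fair_sched_class))
		return RETRY_TASK;

	return p;
}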
@@ -2579,8 +2717,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_queued > 0;
}

extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_task_idle(struct rq *rq);
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf);
extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);

#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02

@@ -2610,7 +2749,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
/*
* See do_set_cpus_allowed() above for the rcu_head usage.
* See set_cpus_allowed_force() above for the rcu_head usage.
*/
int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));

@@ -3875,32 +4014,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);

extern void check_class_changing(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class);
extern void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio);

extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);

#ifdef CONFIG_SCHED_CLASS_EXT
/*
* Used by SCX in the enable/disable paths to move tasks between sched_classes
* and establish invariants.
* The 'sched_change' pattern is the safe, easy and slow way of changing a
* task's scheduling properties. It dequeues a task, such that the scheduler
* is fully unaware of it; at which point its properties can be modified;
* after which it is enqueued again.
*
* Typically this must be called while holding task_rq_lock, since most/all
* properties are serialized under those locks. There is currently one
* exception to this rule in sched/ext which only holds rq->lock.
*/
struct sched_enq_and_set_ctx {

/*
* This structure is a temporary, used to preserve/convey the queueing state
* of the task between sched_change_begin() and sched_change_end(). Ensuring
* the task's queueing state is idempotent across the operation.
*/
struct sched_change_ctx {
u64 prio;
struct task_struct *p;
int queue_flags;
int flags;
bool queued;
bool running;
};

void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
struct sched_enq_and_set_ctx *ctx);
void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
void sched_change_end(struct sched_change_ctx *ctx);

#endif /* CONFIG_SCHED_CLASS_EXT */
DEFINE_CLASS(sched_change, struct sched_change_ctx *,
sched_change_end(_T),
sched_change_begin(p, flags),
struct task_struct *p, unsigned int flags)

DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)

#include "ext.h"
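The real users of this guard are the conversions later in the series (set_user_nice, __sched_setscheduler, and friends); the following is only a minimal sketch of the scoped_guard() shape, with a made-up helper that changes a task's class under DEQUEUE_CLASS.

/*
 * Sketch: change scheduling properties under the sched_change guard.
 * The helper name and its arguments are hypothetical.
 */
static void example_switch_class(struct task_struct *p,
				 const struct sched_class *new_class, int prio)
{
	guard(task_rq_lock)(p);		/* properties are serialized here */

	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_CLASS) {
		/*
		 * Inside the scope p is dequeued and not the running task,
		 * so its scheduling properties can be rewritten safely.
		 */
		p->sched_class = new_class;
		p->prio = prio;
	}
	/*
	 * sched_change_end() re-enqueues p and, because of DEQUEUE_CLASS,
	 * invokes the switching_from/switched_to style methods.
	 */
}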
@@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)

rq = __task_rq_lock(p, &rf);
psi_task_change(p, p->psi_flags, 0);
__task_rq_unlock(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
}
@@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}

static struct task_struct *pick_task_stop(struct rq *rq)
static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf)
{
if (!sched_stop_runnable(rq))
return NULL;

@@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}

static void switched_to_stop(struct rq *rq, struct task_struct *p)
static void switching_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}

static void
prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (p->prio == oldprio)
return;

BUG(); /* how!?, what priority? */
}

@@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq)
*/
DEFINE_SCHED_CLASS(stop) = {

.queue_mask = 16,

.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,

@@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = {
.task_tick = task_tick_stop,

.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.switching_to = switching_to_stop,
.update_curr = update_curr_stop,
};
@@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p)

void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
struct rq *rq;
int old_prio;

if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)

@@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
CLASS(task_rq_lock, rq_guard)(p);
rq = rq_guard.rq;

update_rq_clock(rq);
guard(task_rq_lock)(p);

/*
* The RT priorities are set via sched_setscheduler(), but we still

@@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice)
return;
}

queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
put_prev_task(rq, p);

p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);

if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);

/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
*/
p->sched_class->prio_changed(rq, p, old_prio);
scoped_guard (sched_change, p, DEQUEUE_SAVE) {
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
}
}
EXPORT_SYMBOL(set_user_nice);

@@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p,
bool user, bool pi)
{
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
int retval, oldprio, newprio;
const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;

@@ -695,38 +674,27 @@ change:
prev_class = p->sched_class;
next_class = __setscheduler_class(policy, newprio);

if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
if (prev_class != next_class)
queue_flags |= DEQUEUE_CLASS;

queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
scoped_guard (sched_change, p, queue_flags) {

if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
__setscheduler_params(p, attr);
p->sched_class = next_class;
p->prio = newprio;
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
__setscheduler_params(p, attr);
p->sched_class = next_class;
p->prio = newprio;
}
__setscheduler_uclamp(p, attr);

if (scope->queued) {
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
if (oldprio < p->prio)
scope->flags |= ENQUEUE_HEAD;
}
}
__setscheduler_uclamp(p, attr);
check_class_changing(rq, p, prev_class);

if (queued) {
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
if (oldprio < p->prio)
queue_flags |= ENQUEUE_HEAD;

enqueue_task(rq, p, queue_flags);
}
if (running)
set_next_task(rq, p);

check_class_changed(rq, p, prev_class, oldprio);

/* Avoid rq from going away on us: */
preempt_disable();

@@ -1351,7 +1319,7 @@ static void do_sched_yield(void)
rq = this_rq_lock_irq(&rf);

schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
rq->donor->sched_class->yield_task(rq);

preempt_disable();
rq_unlock_irq(rq, &rf);

@@ -1420,12 +1388,13 @@ EXPORT_SYMBOL(yield);
*/
int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct task_struct *curr;
struct rq *rq, *p_rq;
int yielded = 0;

scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
rq = this_rq();
curr = rq->donor;

again:
p_rq = task_rq(p);
@@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;

/*
* sched_domains_numa_distance is derived from sched_numa_node_distance
* and provides a simplified view of NUMA distances used specifically
* for building NUMA scheduling domains.
*/
static int sched_domains_numa_levels;
static int sched_numa_node_levels;

int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static int *sched_numa_node_distance;
static struct cpumask ***sched_domains_numa_masks;
#endif /* CONFIG_NUMA */

@@ -1662,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,

.last_balance = jiffies,
.balance_interval = sd_weight,

/* 50% success rate */
.newidle_call = 512,
.newidle_success = 256,
.newidle_ratio = 512,

.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
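Read as a fixed-point fraction (assuming a 1024 scale, which the initial values suggest rather than state), the sd_init() defaults above work out to:

	newidle_ratio = (newidle_success * 1024) / newidle_call
	              = (256 * 1024) / 512 = 512		/* i.e. ~50% */

which is what the "50% success rate" comment refers to; the ratio is then expected to track the real success rate as newidle attempts succeed or fail.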
@@ -1845,10 +1858,10 @@ bool find_numa_distance(int distance)
return true;

rcu_read_lock();
distances = rcu_dereference(sched_domains_numa_distance);
distances = rcu_dereference(sched_numa_node_distance);
if (!distances)
goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
for (i = 0; i < sched_numa_node_levels; i++) {
if (distances[i] == distance) {
found = true;
break;

@@ -1924,14 +1937,34 @@ static void init_numa_topology_type(int offline_node)

#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
void sched_init_numa(int offline_node)
/*
* An architecture could modify its NUMA distance, to change
* grouping of NUMA nodes and number of NUMA levels when creating
* NUMA level sched domains.
*
* A NUMA level is created for each unique
* arch_sched_node_distance.
*/
static int numa_node_dist(int i, int j)
{
struct sched_domain_topology_level *tl;
unsigned long *distance_map;
return node_distance(i, j);
}

int arch_sched_node_distance(int from, int to)
__weak __alias(numa_node_dist);

static bool modified_sched_node_distance(void)
{
return numa_node_dist != arch_sched_node_distance;
}

static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
int **dist, int *levels)
{
unsigned long *distance_map __free(bitmap) = NULL;
int nr_levels = 0;
int i, j;
int *distances;
struct cpumask ***masks;

/*
* O(nr_nodes^2) de-duplicating selection sort -- in order to find the

@@ -1939,17 +1972,16 @@ void sched_init_numa(int offline_node)
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
if (!distance_map)
return;
return -ENOMEM;

bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for_each_cpu_node_but(i, offline_node) {
for_each_cpu_node_but(j, offline_node) {
int distance = node_distance(i, j);
int distance = n_dist(i, j);

if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
bitmap_free(distance_map);
return;
return -EINVAL;
}

bitmap_set(distance_map, distance, 1);

@@ -1962,18 +1994,46 @@ void sched_init_numa(int offline_node)
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);

distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
if (!distances) {
bitmap_free(distance_map);
return;
}
if (!distances)
return -ENOMEM;

for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
distances[i] = j;
}
rcu_assign_pointer(sched_domains_numa_distance, distances);
*dist = distances;
*levels = nr_levels;

bitmap_free(distance_map);
return 0;
}
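Since the default above is only a __weak alias of node_distance(), an architecture can supply its own arch_sched_node_distance() to regroup nodes for the NUMA sched-domain levels. A hypothetical override might coarsen distances so that, say, sub-NUMA-cluster nodes of one package fold into a single level; the policy below is purely illustrative, only the function signature and the LOCAL_DISTANCE/REMOTE_DISTANCE constants are given.

/*
 * Example architecture override (illustrative policy, not from the patch):
 * treat "near" distances as local so those nodes share one NUMA level.
 */
int arch_sched_node_distance(int from, int to)
{
	int d = node_distance(from, to);

	if (d < REMOTE_DISTANCE)
		return LOCAL_DISTANCE;

	return d;
}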
void sched_init_numa(int offline_node)
{
struct sched_domain_topology_level *tl;
int nr_levels, nr_node_levels;
int i, j;
int *distances, *domain_distances;
struct cpumask ***masks;

/* Record the NUMA distances from SLIT table */
if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
&nr_node_levels))
return;

/* Record modified NUMA distances for building sched domains */
if (modified_sched_node_distance()) {
if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
&domain_distances, &nr_levels)) {
kfree(distances);
return;
}
} else {
domain_distances = distances;
nr_levels = nr_node_levels;
}
rcu_assign_pointer(sched_numa_node_distance, distances);
WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
WRITE_ONCE(sched_numa_node_levels, nr_node_levels);

/*
* 'nr_levels' contains the number of unique distances

@@ -1991,6 +2051,8 @@ void sched_init_numa(int offline_node)
*
* We reset it to 'nr_levels' at the end of this function.
*/
rcu_assign_pointer(sched_domains_numa_distance, domain_distances);

sched_domains_numa_levels = 0;

masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);

@@ -2016,10 +2078,13 @@ void sched_init_numa(int offline_node)
masks[i][j] = mask;

for_each_cpu_node_but(k, offline_node) {
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
if (sched_debug() &&
(arch_sched_node_distance(j, k) !=
arch_sched_node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");

if (node_distance(j, k) > sched_domains_numa_distance[i])
if (arch_sched_node_distance(j, k) >
sched_domains_numa_distance[i])
continue;

cpumask_or(mask, mask, cpumask_of_node(k));

@@ -2059,7 +2124,6 @@ void sched_init_numa(int offline_node)
sched_domain_topology = tl;

sched_domains_numa_levels = nr_levels;
WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);

init_numa_topology_type(offline_node);
}

@@ -2067,14 +2131,18 @@ void sched_init_numa(int offline_node)

static void sched_reset_numa(void)
{
int nr_levels, *distances;
int nr_levels, *distances, *dom_distances = NULL;
struct cpumask ***masks;

nr_levels = sched_domains_numa_levels;
sched_numa_node_levels = 0;
sched_domains_numa_levels = 0;
sched_max_numa_distance = 0;
sched_numa_topology_type = NUMA_DIRECT;
distances = sched_domains_numa_distance;
distances = sched_numa_node_distance;
if (sched_numa_node_distance != sched_domains_numa_distance)
dom_distances = sched_domains_numa_distance;
rcu_assign_pointer(sched_numa_node_distance, NULL);
rcu_assign_pointer(sched_domains_numa_distance, NULL);
masks = sched_domains_numa_masks;
rcu_assign_pointer(sched_domains_numa_masks, NULL);

@@ -2083,6 +2151,7 @@ static void sched_reset_numa(void)

synchronize_rcu();
kfree(distances);
kfree(dom_distances);
for (i = 0; i < nr_levels && masks; i++) {
if (!masks[i])
continue;

@@ -2129,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
continue;

/* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
if (arch_sched_node_distance(j, node) <=
sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}