Files
linux/kernel/entry/common.c
Thomas Gleixner 32034df66b rseq: Switch to TIF_RSEQ if supported
TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
with the RSEQ fast path depending on it, but not really handling it.

Define a separate TIF_RSEQ in the generic TIF space and enable the full
separation of fast and slow path for architectures which utilize that.

That avoids the hassle with invocations of resume_user_mode_work() from
hypervisors, which clear TIF_NOTIFY_RESUME. It makes the therefore required
re-evaluation at the end of vcpu_run() a NOOP on architectures which
utilize the generic TIF space and have a separate TIF_RSEQ.

The hypervisor TIF handling does not include the separate TIF_RSEQ as there
is no point in doing so. The guest does neither know nor care about the VMM
host applications RSEQ state. That state is only relevant when the ioctl()
returns to user space.

The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
handling, but this only happens within exit_to_user_mode_loop(), so
arguably the hypervisor ioctl() code is long done when this happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.903622031@linutronix.de
2025-11-04 08:35:37 +01:00

267 lines
7.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/irq-entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/tick.h>
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
#else
#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
#endif
static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
* items have been completed.
*/
while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
local_irq_enable_exit_to_user(ti_work);
if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
arch_do_signal_or_restart(regs);
if (ti_work & _TIF_NOTIFY_RESUME)
resume_user_mode_work(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
/*
* Disable interrupts and reevaluate the work flags as they
* might have changed while interrupts and preemption was
* enabled above.
*/
local_irq_disable_exit_to_user();
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
return ti_work;
}
/**
* exit_to_user_mode_loop - do any pending work before leaving to user space
* @regs: Pointer to pt_regs on entry stack
* @ti_work: TIF work flags as read by the caller
*/
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
for (;;) {
ti_work = __exit_to_user_mode_loop(regs, ti_work);
if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
return ti_work;
ti_work = read_thread_flags();
}
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
irqentry_state_t ret = {
.exit_rcu = false,
};
if (user_mode(regs)) {
irqentry_enter_from_user_mode(regs);
return ret;
}
/*
* If this entry hit the idle task invoke ct_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
* processing on return which enables interrupts.
*
* Scheduler ticks in the idle task can mark quiescent state and
* terminate a grace period, if and only if the timer interrupt is
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
* interrupt to invoke ct_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
* Unconditionally invoke ct_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
* this part when enabled.
*/
if (!IS_ENABLED(CONFIG_TINY_RCU) &&
(is_idle_task(current) || arch_in_rcu_eqs())) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
ct_irq_enter();
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
ret.exit_rcu = true;
return ret;
}
/*
* If RCU is watching then RCU only wants to check whether it needs
* to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
* already contains a warning when RCU is not watching, so no point
* in having another one here.
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
return ret;
}
/**
* arch_irqentry_exit_need_resched - Architecture specific need resched function
*
* Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed.
* Defaults return true.
*
* The main purpose is to permit arch to avoid preemption of a task from an IRQ.
*/
static inline bool arch_irqentry_exit_need_resched(void);
#ifndef arch_irqentry_exit_need_resched
static inline bool arch_irqentry_exit_need_resched(void) { return true; }
#endif
void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
if (need_resched() && arch_irqentry_exit_need_resched())
preempt_schedule_irq();
}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
return;
raw_irqentry_exit_cond_resched();
}
#endif
#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
lockdep_assert_irqs_disabled();
/* Check whether this returns to user mode */
if (user_mode(regs)) {
irqentry_exit_to_user_mode(regs);
} else if (!regs_irqs_disabled(regs)) {
/*
* If RCU was not watching on entry this needs to be done
* carefully and needs the same ordering of lockdep/tracing
* and RCU as the return to user mode path.
*/
if (state.exit_rcu) {
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
instrumentation_end();
ct_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
instrumentation_begin();
if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
} else {
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
*/
if (state.exit_rcu)
ct_irq_exit();
}
}
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
irqentry_state_t irq_state;
irq_state.lockdep = lockdep_hardirqs_enabled();
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
ct_nmi_enter();
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
return irq_state;
}
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
instrumentation_begin();
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
}
instrumentation_end();
ct_nmi_exit();
lockdep_hardirq_exit();
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
__nmi_exit();
}