/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ #ifndef _LINUX_RSEQ_H #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ #include #include void __rseq_handle_slowpath(struct pt_regs *regs); static __always_inline bool rseq_v2(struct task_struct *t) { return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1); } /* Invoked from resume_user_mode_work() */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { if (current->rseq.event.slowpath) __rseq_handle_slowpath(regs); } else { if (current->rseq.event.sched_switch && current->rseq.event.has_rseq) __rseq_handle_slowpath(regs); } } void __rseq_signal_deliver(int sig, struct pt_regs *regs); /* * Invoked from signal delivery to fixup based on the register context before * switching to the signal delivery context. */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { if (rseq_v2(current)) { /* has_rseq is implied in rseq_v2() */ if (current->rseq.event.user_irq) __rseq_signal_deliver(ksig->sig, regs); } else { if (current->rseq.event.has_rseq) __rseq_signal_deliver(ksig->sig, regs); } } static inline void rseq_raise_notify_resume(struct task_struct *t) { set_tsk_thread_flag(t, TIF_RSEQ); } /* Invoked from context switch to force evaluation on exit to user */ static __always_inline void rseq_sched_switch_event(struct task_struct *t) { struct rseq_event *ev = &t->rseq.event; /* * Only apply the user_irq optimization for RSEQ ABI V2 registrations. * Legacy users like TCMalloc rely on the original ABI V1 behaviour * which updates IDs on every context swtich. */ if (rseq_v2(t)) { /* * Avoid a boat load of conditionals by using simple logic to * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be * raised. * * It's required when the CPU or MM CID has changed or the entry * was via interrupt from user space. ev->has_rseq does not have * to be evaluated here because rseq_v2() implies has_rseq. */ bool raise = ev->user_irq | ev->ids_changed; if (raise) { ev->sched_switch = true; rseq_raise_notify_resume(t); } } else { if (ev->has_rseq) { t->rseq.event.ids_changed = true; t->rseq.event.sched_switch = true; rseq_raise_notify_resume(t); } } } /* * Invoked from __set_task_cpu() when a task migrates or from * mm_cid_schedin() when the CID changes to enforce an IDs update. * * This does not raise TIF_NOTIFY_RESUME as that happens in * rseq_sched_switch_event(). */ static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) { t->rseq.event.ids_changed = true; } /* Enforce a full update after RSEQ registration and when execve() failed */ static inline void rseq_force_update(void) { if (current->rseq.event.has_rseq) { current->rseq.event.ids_changed = true; current->rseq.event.sched_switch = true; rseq_raise_notify_resume(current); } } /* * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, * which clears TIF_NOTIFY_RESUME on architectures that don't use the * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. * * To avoid updating user space RSEQ in that case just to do it eventually * again before returning to user space, because __rseq_handle_slowpath() * does nothing when invoked with NULL register state. * * After returning from guest mode, before exiting to userspace, hypervisors * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. */ static inline void rseq_virt_userspace_exit(void) { /* * The generic optimization for deferring RSEQ updates until the next * exit relies on having a dedicated TIF_RSEQ. */ if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && current->rseq.event.sched_switch) rseq_raise_notify_resume(current); } static inline void rseq_reset(struct task_struct *t) { /* Protect against preemption and membarrier IPI */ guard(irqsave)(); memset(&t->rseq, 0, sizeof(t->rseq)); t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } static inline void rseq_execve(struct task_struct *t) { rseq_reset(t); } /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. * * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault * on the COW page on exit to user space, when the child stays on the same * CPU as the parent. That's obviously not guaranteed, but in overcommit * scenarios it is more likely and optimizes for the fork/exec case without * taking the fault. */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) rseq_reset(t); else t->rseq = current->rseq; } /* * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq * registration. This is the active rseq area size rounded up to next * power of 2, which guarantees that the rseq structure will always be * aligned on the nearest power of two large enough to contain it, even * as it grows. */ static inline unsigned int rseq_alloc_align(void) { return 1U << get_count_order(offsetof(struct rseq, end)); } #else /* CONFIG_RSEQ */ static inline bool rseq_v2(struct task_struct *t) { return false; } static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } #endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); #else /* CONFIG_DEBUG_RSEQ */ static inline void rseq_syscall(struct pt_regs *regs) { } #endif /* !CONFIG_DEBUG_RSEQ */ #ifdef CONFIG_RSEQ_SLICE_EXTENSION void rseq_syscall_enter_work(long syscall); int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3); #else /* CONFIG_RSEQ_SLICE_EXTENSION */ static inline void rseq_syscall_enter_work(long syscall) { } static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) { return -ENOTSUPP; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ #endif /* _LINUX_RSEQ_H */