Diffstat (limited to 'arch/x86/entry')
 arch/x86/entry/Makefile                |  14
 arch/x86/entry/calling.h               |  40
 arch/x86/entry/common.c                | 638
 arch/x86/entry/entry_32.S              |   4
 arch/x86/entry/entry_64.S              | 143
 arch/x86/entry/syscall_x32.c           |   7
 arch/x86/entry/syscalls/syscall_32.tbl |   5
 arch/x86/entry/syscalls/syscall_64.tbl |   5
 arch/x86/entry/vdso/vma.c              |   5
 9 files changed, 237 insertions(+), 624 deletions(-)
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index b7a5790d8d63..08bf95dbc911 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -7,12 +7,20 @@ KASAN_SANITIZE := n
 UBSAN_SANITIZE := n
 KCOV_INSTRUMENT := n
 
-CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
-CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
-CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_common.o += -fno-stack-protector
+CFLAGS_syscall_64.o += -fno-stack-protector
+CFLAGS_syscall_32.o += -fno-stack-protector
+CFLAGS_syscall_x32.o += -fno-stack-protector
 
 CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
 CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,)
+
 obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
 obj-y += common.o
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 4208c1e3f601..98e4d8886f11 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
 #include <asm/percpu.h>
 #include <asm/asm-offsets.h>
 #include <asm/processor-flags.h>
+#include <asm/inst.h>
 
 /*
 
@@ -341,6 +342,12 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+	rdgsbase \save_reg
+	GET_PERCPU_BASE \scratch_reg
+	wrgsbase \scratch_reg
+.endm
+
 #else /* CONFIG_X86_64 */
 # undef UNWIND_HINT_IRET_REGS
 # define UNWIND_HINT_IRET_REGS
@@ -351,3 +358,36 @@ For 32-bit we have the following conventions - kernel is built with
 	call stackleak_erase
 #endif
 .endm
+
+#ifdef CONFIG_SMP
+
+/*
+ * CPU/node NR is loaded from the limit (size) field of a special segment
+ * descriptor entry in GDT.
+ */
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
+	movq	$__CPUNODE_SEG, \reg
+	lsl	\reg, \reg
+.endm
+
+/*
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
+ * We normally use %gs for accessing per-CPU data, but we are setting up
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
+ */
+.macro GET_PERCPU_BASE reg:req
+	ALTERNATIVE \
+		"LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
+		"RDPID	\reg", \
+		X86_FEATURE_RDPID
+	andq	$VDSO_CPUNODE_MASK, \reg
+	movq	__per_cpu_offset(, \reg, 8), \reg
+.endm
+
+#else
+
+.macro GET_PERCPU_BASE reg:req
+	movq	pcpu_unit_offsets(%rip), \reg
+.endm
+
+#endif /* CONFIG_SMP */
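[Annotation] GET_PERCPU_BASE has to find this CPU's per-CPU base before %gs is usable, which is why it is asm. The following C rendering is a sketch of the lookup it performs, not kernel code: rdpid_insn() and gdt_segment_limit() are hypothetical stand-ins for the RDPID and LSL instructions.

	/*
	 * Sketch only. The real macro runs before the kernel %gs base is
	 * established, so no per-CPU accessors may be used there.
	 */
	static unsigned long get_percpu_base_sketch(void)
	{
		unsigned long cpu_node;

		if (cpu_feature_enabled(X86_FEATURE_RDPID))
			cpu_node = rdpid_insn();		/* RDPID reads MSR_TSC_AUX */
		else
			cpu_node = gdt_segment_limit(__CPUNODE_SEG); /* LSL on the GDT entry */

		cpu_node &= VDSO_CPUNODE_MASK;		/* strip the NUMA node bits */
		return __per_cpu_offset[cpu_node];	/* this CPU's per-CPU base */
	}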
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index e83b3f14897c..48512c7944e7 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -10,20 +10,13 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
+#include <linux/entry-common.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
-#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/signal.h>
 #include <linux/export.h>
-#include <linux/context_tracking.h>
-#include <linux/user-return-notifier.h>
 #include <linux/nospec.h>
-#include <linux/uprobes.h>
-#include <linux/livepatch.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
 
@@ -42,343 +35,12 @@
 #include <asm/syscall.h>
 #include <asm/irq_stack.h>
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
-/* Check that the stack and regs on entry from user mode are sane. */
-static void check_user_regs(struct pt_regs *regs)
-{
-	if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
-		/*
-		 * Make sure that the entry code gave us a sensible EFLAGS
-		 * register. Native because we want to check the actual CPU
-		 * state, not the interrupt state as imagined by Xen.
-		 */
-		unsigned long flags = native_save_fl();
-		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
-				      X86_EFLAGS_NT));
-
-		/* We think we came from user mode. Make sure pt_regs agrees. */
-		WARN_ON_ONCE(!user_mode(regs));
-
-		/*
-		 * All entries from user mode (except #DF) should be on the
-		 * normal thread stack and should have user pt_regs in the
-		 * correct location.
-		 */
-		WARN_ON_ONCE(!on_thread_stack());
-		WARN_ON_ONCE(regs != task_pt_regs(current));
-	}
-}
-
-#ifdef CONFIG_CONTEXT_TRACKING
-/**
- * enter_from_user_mode - Establish state when coming from user mode
- *
- * Syscall entry disables interrupts, but user mode is traced as interrupts
- * enabled. Also with NO_HZ_FULL RCU might be idle.
- *
- * 1) Tell lockdep that interrupts are disabled
- * 2) Invoke context tracking if enabled to reactivate RCU
- * 3) Trace interrupts off state
- */
-static noinstr void enter_from_user_mode(void)
-{
-	enum ctx_state state = ct_state();
-
-	lockdep_hardirqs_off(CALLER_ADDR0);
-	user_exit_irqoff();
-
-	instrumentation_begin();
-	CT_WARN_ON(state != CONTEXT_USER);
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-}
-#else
-static __always_inline void enter_from_user_mode(void)
-{
-	lockdep_hardirqs_off(CALLER_ADDR0);
-	instrumentation_begin();
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-}
-#endif
-
-/**
- * exit_to_user_mode - Fixup state when exiting to user mode
- *
- * Syscall exit enables interrupts, but the kernel state is interrupts
- * disabled when this is invoked. Also tell RCU about it.
- *
- * 1) Trace interrupts on state
- * 2) Invoke context tracking if enabled to adjust RCU state
- * 3) Clear CPU buffers if CPU is affected by MDS and the migitation is on.
- * 4) Tell lockdep that interrupts are enabled
- */
-static __always_inline void exit_to_user_mode(void)
-{
-	instrumentation_begin();
-	trace_hardirqs_on_prepare();
-	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-	instrumentation_end();
-
-	user_enter_irqoff();
-	mds_user_clear_cpu_buffers();
-	lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
-	if (arch == AUDIT_ARCH_X86_64) {
-		audit_syscall_entry(regs->orig_ax, regs->di,
-				    regs->si, regs->dx, regs->r10);
-	} else
-#endif
-	{
-		audit_syscall_entry(regs->orig_ax, regs->bx,
-				    regs->cx, regs->dx, regs->si);
-	}
-}
-
-/*
- * Returns the syscall nr to run (which should match regs->orig_ax) or -1
- * to skip the syscall.
- */
-static long syscall_trace_enter(struct pt_regs *regs)
-{
-	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-
-	struct thread_info *ti = current_thread_info();
-	unsigned long ret = 0;
-	u32 work;
-
-	work = READ_ONCE(ti->flags);
-
-	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
-		ret = tracehook_report_syscall_entry(regs);
-		if (ret || (work & _TIF_SYSCALL_EMU))
-			return -1L;
-	}
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Do seccomp after ptrace, to catch any tracer changes.
-	 */
-	if (work & _TIF_SECCOMP) {
-		struct seccomp_data sd;
-
-		sd.arch = arch;
-		sd.nr = regs->orig_ax;
-		sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
-		if (arch == AUDIT_ARCH_X86_64) {
-			sd.args[0] = regs->di;
-			sd.args[1] = regs->si;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->r10;
-			sd.args[4] = regs->r8;
-			sd.args[5] = regs->r9;
-		} else
-#endif
-		{
-			sd.args[0] = regs->bx;
-			sd.args[1] = regs->cx;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->si;
-			sd.args[4] = regs->di;
-			sd.args[5] = regs->bp;
-		}
-
-		ret = __secure_computing(&sd);
-		if (ret == -1)
-			return ret;
-	}
-#endif
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_enter(regs, regs->orig_ax);
-
-	do_audit_syscall_entry(regs, arch);
-
-	return ret ?: regs->orig_ax;
-}
-
-#define EXIT_TO_USERMODE_LOOP_FLAGS				\
-	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
-	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
-
-static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
-{
-	/*
-	 * In order to return to user mode, we need to have IRQs off with
-	 * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags
-	 * can be set at any time on preemptible kernels if we have IRQs on,
-	 * so we need to loop. Disabling preemption wouldn't help: doing the
-	 * work to clear some of the flags can sleep.
-	 */
-	while (true) {
-		/* We have work to do. */
-		local_irq_enable();
-
-		if (cached_flags & _TIF_NEED_RESCHED)
-			schedule();
-
-		if (cached_flags & _TIF_UPROBE)
-			uprobe_notify_resume(regs);
-
-		if (cached_flags & _TIF_PATCH_PENDING)
-			klp_update_patch_state(current);
-
-		/* deal with pending signal delivery */
-		if (cached_flags & _TIF_SIGPENDING)
-			do_signal(regs);
-
-		if (cached_flags & _TIF_NOTIFY_RESUME) {
-			clear_thread_flag(TIF_NOTIFY_RESUME);
-			tracehook_notify_resume(regs);
-			rseq_handle_notify_resume(NULL, regs);
-		}
-
-		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
-			fire_user_return_notifiers();
-
-		/* Disable IRQs and retry */
-		local_irq_disable();
-
-		cached_flags = READ_ONCE(current_thread_info()->flags);
-
-		if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
-			break;
-	}
-}
-
-static void __prepare_exit_to_usermode(struct pt_regs *regs)
-{
-	struct thread_info *ti = current_thread_info();
-	u32 cached_flags;
-
-	addr_limit_user_check();
-
-	lockdep_assert_irqs_disabled();
-	lockdep_sys_exit();
-
-	cached_flags = READ_ONCE(ti->flags);
-
-	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
-		exit_to_usermode_loop(regs, cached_flags);
-
-	/* Reload ti->flags; we may have rescheduled above. */
-	cached_flags = READ_ONCE(ti->flags);
-
-	if (unlikely(cached_flags & _TIF_IO_BITMAP))
-		tss_update_io_bitmap();
-
-	fpregs_assert_state_consistent();
-	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
-		switch_fpu_return();
-
-#ifdef CONFIG_COMPAT
-	/*
-	 * Compat syscalls set TS_COMPAT. Make sure we clear it before
-	 * returning to user mode. We need to clear it *after* signal
-	 * handling, because syscall restart has a fixup for compat
-	 * syscalls. The fixup is exercised by the ptrace_syscall_32
-	 * selftest.
-	 *
-	 * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
-	 * special case only applies after poking regs and before the
-	 * very next return to user mode.
-	 */
-	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
-#endif
-}
-
-__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
-{
-	instrumentation_begin();
-	__prepare_exit_to_usermode(regs);
-	instrumentation_end();
-	exit_to_user_mode();
-}
-
-#define SYSCALL_EXIT_WORK_FLAGS				\
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
-	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
-
-static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
-{
-	bool step;
-
-	audit_syscall_exit(regs);
-
-	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
-		trace_sys_exit(regs, regs->ax);
-
-	/*
-	 * If TIF_SYSCALL_EMU is set, we only get here because of
-	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
-	 * We already reported this syscall instruction in
-	 * syscall_trace_enter().
-	 */
-	step = unlikely(
-		(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
-		== _TIF_SINGLESTEP);
-	if (step || cached_flags & _TIF_SYSCALL_TRACE)
-		tracehook_report_syscall_exit(regs, step);
-}
-
-static void __syscall_return_slowpath(struct pt_regs *regs)
-{
-	struct thread_info *ti = current_thread_info();
-	u32 cached_flags = READ_ONCE(ti->flags);
-
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
-
-	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
-	    WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
-		local_irq_enable();
-
-	rseq_syscall(regs);
-
-	/*
-	 * First do one-time work. If these work items are enabled, we
-	 * want to run them exactly once per syscall exit with IRQs on.
-	 */
-	if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
-		syscall_slow_exit_work(regs, cached_flags);
-
-	local_irq_disable();
-	__prepare_exit_to_usermode(regs);
-}
-
-/*
- * Called with IRQs on and fully valid regs. Returns with IRQs off in a
- * state such that we can immediately switch to user mode.
- */
-__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
-{
-	instrumentation_begin();
-	__syscall_return_slowpath(regs);
-	instrumentation_end();
-	exit_to_user_mode();
-}
-
 #ifdef CONFIG_X86_64
 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 {
-	struct thread_info *ti;
+	nr = syscall_enter_from_user_mode(regs, nr);
 
-	check_user_regs(regs);
-
-	enter_from_user_mode();
 	instrumentation_begin();
-
-	local_irq_enable();
-	ti = current_thread_info();
-	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
-		nr = syscall_trace_enter(regs);
-
 	if (likely(nr < NR_syscalls)) {
 		nr = array_index_nospec(nr, NR_syscalls);
 		regs->ax = sys_call_table[nr](regs);
@@ -390,66 +52,55 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 		regs->ax = x32_sys_call_table[nr](regs);
 #endif
 	}
-	__syscall_return_slowpath(regs);
-
 	instrumentation_end();
-	exit_to_user_mode();
+	syscall_exit_to_user_mode(regs);
 }
 #endif
 
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-/*
- * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
- * all entry and exit work and returns with IRQs off. This function is
- * extremely hot in workloads that use it, and it's usually called from
- * do_fast_syscall_32, so forcibly inline it to improve performance.
- */
-static void do_syscall_32_irqs_on(struct pt_regs *regs)
+static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
 {
-	struct thread_info *ti = current_thread_info();
 	unsigned int nr = (unsigned int)regs->orig_ax;
 
-#ifdef CONFIG_IA32_EMULATION
-	ti->status |= TS_COMPAT;
-#endif
-
-	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
-		/*
-		 * Subtlety here: if ptrace pokes something larger than
-		 * 2^32-1 into orig_ax, this truncates it. This may or
-		 * may not be necessary, but it matches the old asm
-		 * behavior.
-		 */
-		nr = syscall_trace_enter(regs);
-	}
+	if (IS_ENABLED(CONFIG_IA32_EMULATION))
+		current_thread_info()->status |= TS_COMPAT;
+	/*
+	 * Subtlety here: if ptrace pokes something larger than 2^32-1 into
+	 * orig_ax, the unsigned int return value truncates it. This may
+	 * or may not be necessary, but it matches the old asm behavior.
+	 */
+	return (unsigned int)syscall_enter_from_user_mode(regs, nr);
+}
 
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
+						  unsigned int nr)
+{
 	if (likely(nr < IA32_NR_syscalls)) {
+		instrumentation_begin();
 		nr = array_index_nospec(nr, IA32_NR_syscalls);
 		regs->ax = ia32_sys_call_table[nr](regs);
+		instrumentation_end();
 	}
-
-	__syscall_return_slowpath(regs);
 }
 
 /* Handles int $0x80 */
 __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
 {
-	check_user_regs(regs);
+	unsigned int nr = syscall_32_enter(regs);
 
-	enter_from_user_mode();
-	instrumentation_begin();
-
-	local_irq_enable();
-	do_syscall_32_irqs_on(regs);
-
-	instrumentation_end();
-	exit_to_user_mode();
+	do_syscall_32_irqs_on(regs, nr);
+	syscall_exit_to_user_mode(regs);
 }
 
-static bool __do_fast_syscall_32(struct pt_regs *regs)
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 {
+	unsigned int nr = syscall_32_enter(regs);
 	int res;
 
+	instrumentation_begin();
 	/* Fetch EBP from where the vDSO stashed it. */
 	if (IS_ENABLED(CONFIG_X86_64)) {
 		/*
@@ -462,17 +113,18 @@ static bool __do_fast_syscall_32(struct pt_regs *regs)
 		res = get_user(*(u32 *)&regs->bp,
 		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
 	}
+	instrumentation_end();
 
 	if (res) {
 		/* User code screwed up. */
 		regs->ax = -EFAULT;
-		local_irq_disable();
-		__prepare_exit_to_usermode(regs);
+		syscall_exit_to_user_mode(regs);
 		return false;
 	}
 
 	/* Now this is just like a normal syscall. */
-	do_syscall_32_irqs_on(regs);
+	do_syscall_32_irqs_on(regs, nr);
+	syscall_exit_to_user_mode(regs);
 	return true;
 }
 
@@ -485,9 +137,6 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
 	 */
 	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
 		vdso_image_32.sym_int80_landing_pad;
-	bool success;
-
-	check_user_regs(regs);
 
 	/*
 	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
@@ -496,17 +145,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
 	 */
 	regs->ip = landing_pad;
 
-	enter_from_user_mode();
-	instrumentation_begin();
-
-	local_irq_enable();
-	success = __do_fast_syscall_32(regs);
-
-	instrumentation_end();
-	exit_to_user_mode();
-
-	/* If it failed, keep it simple: use IRET. */
-	if (!success)
+	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
+	if (!__do_fast_syscall_32(regs))
 		return 0;
 
 #ifdef CONFIG_X86_64
@@ -558,197 +198,38 @@ SYSCALL_DEFINE0(ni_syscall)
 	return -ENOSYS;
 }
 
-/**
- * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
- *			     RCU handling
- * @regs:	Pointer to pt_regs of interrupted context
- *
- * Invokes:
- *  - lockdep irqflag state tracking as low level ASM entry disabled
- *    interrupts.
- *
- *  - Context tracking if the exception hit user mode.
- *
- *  - The hardirq tracer to keep the state consistent as low level ASM
- *    entry disabled interrupts.
- *
- * For kernel mode entries RCU handling is done conditional. If RCU is
- * watching then the only RCU requirement is to check whether the tick has
- * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
- * invoked on entry and rcu_irq_exit() on exit.
- *
- * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
- * solves the problem of kernel mode pagefaults which can schedule, which
- * is not possible after invoking rcu_irq_enter() without undoing it.
- *
- * For user mode entries enter_from_user_mode() must be invoked to
- * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
- * would not be possible.
- *
- * Returns: True if RCU has been adjusted on a kernel entry
- *	    False otherwise
- *
- * The return value must be fed into the rcu_exit argument of
- * idtentry_exit_cond_rcu().
- */
-bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
+noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
 {
-	if (user_mode(regs)) {
-		check_user_regs(regs);
-		enter_from_user_mode();
-		return false;
-	}
+	bool irq_state = lockdep_hardirqs_enabled();
 
-	/*
-	 * If this entry hit the idle task invoke rcu_irq_enter() whether
-	 * RCU is watching or not.
-	 *
-	 * Interupts can nest when the first interrupt invokes softirq
-	 * processing on return which enables interrupts.
-	 *
-	 * Scheduler ticks in the idle task can mark quiescent state and
-	 * terminate a grace period, if and only if the timer interrupt is
-	 * not nested into another interrupt.
-	 *
-	 * Checking for __rcu_is_watching() here would prevent the nesting
-	 * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
-	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
-	 * assume that it is the first interupt and eventually claim
-	 * quiescient state and end grace periods prematurely.
-	 *
-	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
-	 * consistent.
-	 *
-	 * TINY_RCU does not support EQS, so let the compiler eliminate
-	 * this part when enabled.
-	 */
-	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
-		/*
-		 * If RCU is not watching then the same careful
-		 * sequence vs. lockdep and tracing is required
-		 * as in enter_from_user_mode().
-		 */
-		lockdep_hardirqs_off(CALLER_ADDR0);
-		rcu_irq_enter();
-		instrumentation_begin();
-		trace_hardirqs_off_finish();
-		instrumentation_end();
-
-		return true;
-	}
+	__nmi_enter();
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	lockdep_hardirq_enter();
+	rcu_nmi_enter();
 
-	/*
-	 * If RCU is watching then RCU only wants to check whether it needs
-	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
-	 * already contains a warning when RCU is not watching, so no point
-	 * in having another one here.
-	 */
 	instrumentation_begin();
-	rcu_irq_enter_check_tick();
-	/* Use the combo lockdep/tracing function */
-	trace_hardirqs_off();
+	trace_hardirqs_off_finish();
+	ftrace_nmi_enter();
 	instrumentation_end();
 
-	return false;
-}
-
-static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
-{
-	if (may_sched && !preempt_count()) {
-		/* Sanity check RCU and thread stack */
-		rcu_irq_exit_check_preempt();
-		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
-			WARN_ON_ONCE(!on_thread_stack());
-		if (need_resched())
-			preempt_schedule_irq();
-	}
-	/* Covers both tracing and lockdep */
-	trace_hardirqs_on();
+	return irq_state;
 }
 
-/**
- * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
- *			    handling
- * @regs:	Pointer to pt_regs (exception entry regs)
- * @rcu_exit:	Invoke rcu_irq_exit() if true
- *
- * Depending on the return target (kernel/user) this runs the necessary
- * preemption and work checks if possible and reguired and returns to
- * the caller with interrupts disabled and no further work pending.
- *
- * This is the last action before returning to the low level ASM code which
- * just needs to return to the appropriate context.
- *
- * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
- * function must be fed into the @rcu_exit argument.
- */
-void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
+noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
 {
-	lockdep_assert_irqs_disabled();
-
-	/* Check whether this returns to user mode */
-	if (user_mode(regs)) {
-		prepare_exit_to_usermode(regs);
-	} else if (regs->flags & X86_EFLAGS_IF) {
-		/*
-		 * If RCU was not watching on entry this needs to be done
-		 * carefully and needs the same ordering of lockdep/tracing
-		 * and RCU as the return to user mode path.
-		 */
-		if (rcu_exit) {
-			instrumentation_begin();
-			/* Tell the tracer that IRET will enable interrupts */
-			trace_hardirqs_on_prepare();
-			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-			instrumentation_end();
-			rcu_irq_exit();
-			lockdep_hardirqs_on(CALLER_ADDR0);
-			return;
-		}
-
-		instrumentation_begin();
-		idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION));
-		instrumentation_end();
-	} else {
-		/*
-		 * IRQ flags state is correct already. Just tell RCU if it
-		 * was not watching on entry.
-		 */
-		if (rcu_exit)
-			rcu_irq_exit();
+	instrumentation_begin();
+	ftrace_nmi_exit();
+	if (restore) {
+		trace_hardirqs_on_prepare();
+		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
 	}
-}
-
-/**
- * idtentry_enter_user - Handle state tracking on idtentry from user mode
- * @regs:	Pointer to pt_regs of interrupted context
- *
- * Invokes enter_from_user_mode() to establish the proper context for
- * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
- */
-void noinstr idtentry_enter_user(struct pt_regs *regs)
-{
-	check_user_regs(regs);
-	enter_from_user_mode();
-}
-
-/**
- * idtentry_exit_user - Handle return from exception to user mode
- * @regs:	Pointer to pt_regs (exception entry regs)
- *
- * Runs the necessary preemption and work checks and returns to the caller
- * with interrupts disabled and no further work pending.
- *
- * This is the last action before returning to the low level ASM code which
- * just needs to return to the appropriate context.
- *
- * Counterpart to idtentry_enter_user().
- */
-void noinstr idtentry_exit_user(struct pt_regs *regs)
-{
-	lockdep_assert_irqs_disabled();
+	instrumentation_end();
 
-	prepare_exit_to_usermode(regs);
+	rcu_nmi_exit();
+	lockdep_hardirq_exit();
+	if (restore)
+		lockdep_hardirqs_on(CALLER_ADDR0);
+	__nmi_exit();
 }
 
 #ifdef CONFIG_XEN_PV
@@ -800,9 +281,10 @@ static void __xen_pv_evtchn_do_upcall(void)
 __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs;
-	bool inhcall, rcu_exit;
+	bool inhcall;
+	irqentry_state_t state;
 
-	rcu_exit = idtentry_enter_cond_rcu(regs);
+	state = irqentry_enter(regs);
 	old_regs = set_irq_regs(regs);
 
 	instrumentation_begin();
@@ -812,13 +294,13 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 
 	inhcall = get_and_clear_inhcall();
-	if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
+	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
 		instrumentation_begin();
-		idtentry_exit_cond_resched(regs, true);
+		irqentry_exit_cond_resched();
 		instrumentation_end();
 		restore_inhcall(inhcall);
 	} else {
-		idtentry_exit_cond_rcu(regs, rcu_exit);
+		irqentry_exit(regs, state);
 	}
 }
 #endif /* CONFIG_XEN_PV */
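[Annotation] Everything deleted above now lives in the generic entry code under kernel/entry/common.c. The following is a hand-simplified sketch of what the two calls that bracket every syscall do; names and signatures are abbreviated from the generic code of this era, so treat it as an outline rather than the real implementation.

	static long syscall_enter_from_user_mode_sketch(struct pt_regs *regs, long nr)
	{
		enter_from_user_mode(regs);	/* lockdep, context tracking, RCU */
		instrumentation_begin();
		local_irq_enable();
		if (READ_ONCE(current_thread_info()->flags) & SYSCALL_ENTER_WORK)
			nr = syscall_trace_enter(regs, nr); /* ptrace, seccomp, audit */
		instrumentation_end();
		return nr;
	}

	static void syscall_exit_to_user_mode_sketch(struct pt_regs *regs)
	{
		instrumentation_begin();
		syscall_exit_work(regs);	/* audit, tracepoints, ptrace report */
		local_irq_disable();
		exit_to_user_mode_loop(regs);	/* signals, resched, notify-resume */
		instrumentation_end();
		exit_to_user_mode();		/* lockdep, context tracking, MDS clear */
	}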
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 2d0bd5d5f032..29b7d52143e9 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -846,7 +846,7 @@ SYM_CODE_START(ret_from_fork)
 
 2:
 	/* When we fork, we trace the syscall return in the child, too. */
 	movl	%esp, %eax
-	call	syscall_return_slowpath
+	call	syscall_exit_to_user_mode
 	jmp	.Lsyscall_32_done
 
 	/* kernel thread */
@@ -854,7 +854,7 @@ SYM_CODE_START(ret_from_fork)
 	CALL_NOSPEC ebx
 	/*
 	 * A kernel thread is allowed to return here after successfully
-	 * calling do_execve(). Exit to userspace to complete the execve()
+	 * calling kernel_execve(). Exit to userspace to complete the execve()
 	 * syscall.
 	 */
 	movl	$0, PT_EAX(%esp)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2a00c97e53f..70dea9337816 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -38,6 +38,7 @@
 #include <asm/frame.h>
 #include <asm/trapnr.h>
 #include <asm/nospec-branch.h>
+#include <asm/fsgsbase.h>
 #include <linux/err.h>
 
 #include "calling.h"
@@ -283,7 +284,7 @@ SYM_CODE_START(ret_from_fork)
 2:
 	UNWIND_HINT_REGS
 	movq	%rsp, %rdi
-	call	syscall_return_slowpath	/* returns with IRQs disabled */
+	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
 	jmp	swapgs_restore_regs_and_return_to_usermode
 
 1:
@@ -293,7 +294,7 @@ SYM_CODE_START(ret_from_fork)
 	CALL_NOSPEC rbx
 	/*
 	 * A kernel thread is allowed to return here after successfully
-	 * calling do_execve(). Exit to userspace to complete the execve()
+	 * calling kernel_execve(). Exit to userspace to complete the execve()
 	 * syscall.
 	 */
 	movq	$0, RAX(%rsp)
@@ -426,10 +427,7 @@ SYM_CODE_START(\asmsym)
 	testb	$3, CS-ORIG_RAX(%rsp)
 	jnz	.Lfrom_usermode_switch_stack_\@
 
-	/*
-	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
-	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
-	 */
+	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
 	call	paranoid_entry
 
 	UNWIND_HINT_REGS
@@ -458,10 +456,7 @@ SYM_CODE_START(\asmsym)
 	UNWIND_HINT_IRET_REGS offset=8
 	ASM_CLAC
 
-	/*
-	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
-	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
-	 */
+	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
 	call	paranoid_entry
 	UNWIND_HINT_REGS
 
@@ -798,24 +793,21 @@ SYM_CODE_END(xen_failsafe_callback)
 #endif /* CONFIG_XEN_PV */
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
- * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Save all registers in pt_regs. Return GSBASE related information
+ * in EBX depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE	R/EBX
+ *     N        0 -> SWAPGS on exit
+ *              1 -> no SWAPGS on exit
+ *
+ *     Y        GSBASE value at entry, must be restored in paranoid_exit
 */
SYM_CODE_START_LOCAL(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
 	PUSH_AND_CLEAR_REGS save_ret=1
 	ENCODE_FRAME_POINTER 8
-	movl	$1, %ebx
-	movl	$MSR_GS_BASE, %ecx
-	rdmsr
-	testl	%edx, %edx
-	js	1f				/* negative -> in kernel */
-	SWAPGS
-	xorl	%ebx, %ebx
 
-1:
 	/*
 	 * Always stash CR3 in %r14. This value will be restored,
 	 * verbatim, at exit. Needed if paranoid_entry interrupted
@@ -825,16 +817,60 @@ SYM_CODE_START_LOCAL(paranoid_entry)
 	 * This is also why CS (stashed in the "iret frame" by the
 	 * hardware at entry) can not be used: this may be a return
	 * to kernel code, but with a user CR3 value.
+	 *
+	 * Switching CR3 does not depend on kernel GSBASE so it can
+	 * be done before switching to the kernel GSBASE. This is
+	 * required for FSGSBASE because the kernel GSBASE has to
+	 * be retrieved from a kernel internal table.
 	 */
 	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 
 	/*
+	 * Handling GSBASE depends on the availability of FSGSBASE.
+	 *
+	 * Without FSGSBASE the kernel enforces that negative GSBASE
+	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
+	 * can be made about the GSBASE value when entering from user
+	 * space.
+	 */
+	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
+
+	/*
+	 * Read the current GSBASE and store it in %rbx unconditionally,
+	 * retrieve and set the current CPUs kernel GSBASE. The stored value
+	 * has to be restored in paranoid_exit unconditionally.
+	 *
+	 * The MSR write ensures that no subsequent load is based on a
+	 * mispredicted GSBASE. No extra FENCE required.
+	 */
+	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
+	ret
+
+.Lparanoid_entry_checkgs:
+	/* EBX = 1 -> kernel GSBASE active, no restore required */
+	movl	$1, %ebx
+	/*
+	 * The kernel-enforced convention is a negative GSBASE indicates
+	 * a kernel value. No SWAPGS needed on entry and exit.
+	 */
+	movl	$MSR_GS_BASE, %ecx
+	rdmsr
+	testl	%edx, %edx
+	jns	.Lparanoid_entry_swapgs
+	ret
+
+.Lparanoid_entry_swapgs:
+	SWAPGS
+
+	/*
 	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
 	 * unconditional CR3 write, even in the PTI case. So do an lfence
 	 * to prevent GS speculation, regardless of whether PTI is enabled.
 	 */
 	FENCE_SWAPGS_KERNEL_ENTRY
 
+	/* EBX = 0 -> SWAPGS required on exit */
+	xorl	%ebx, %ebx
+
 	ret
SYM_CODE_END(paranoid_entry)
 
@@ -845,23 +881,45 @@ SYM_CODE_END(paranoid_entry)
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
+ * be complicated. Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * R/EBX contains the GSBASE related information depending on the
+ * availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE	R/EBX
+ *     N        0 -> SWAPGS on exit
+ *              1 -> no SWAPGS on exit
 *
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ *     Y        User space GSBASE, must be restored unconditionally
 */
SYM_CODE_START_LOCAL(paranoid_exit)
 	UNWIND_HINT_REGS
-	testl	%ebx, %ebx			/* swapgs needed? */
-	jnz	.Lparanoid_exit_no_swapgs
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
+	/*
+	 * The order of operations is important. RESTORE_CR3 requires
+	 * kernel GSBASE.
+	 *
+	 * NB to anyone to try to optimize this code: this code does
+	 * not execute at all for exceptions from user mode. Those
+	 * exceptions go through error_exit instead.
+	 */
+	RESTORE_CR3	scratch_reg=%rax save_reg=%r14
+
+	/* Handle the three GSBASE cases */
+	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
+
+	/* With FSGSBASE enabled, unconditionally restore GSBASE */
+	wrgsbase	%rbx
+	jmp		restore_regs_and_return_to_kernel
+
+.Lparanoid_exit_checkgs:
+	/* On non-FSGSBASE systems, conditionally do SWAPGS */
+	testl		%ebx, %ebx
+	jnz		restore_regs_and_return_to_kernel
+
+	/* We are returning to a context with user GSBASE */
 	SWAPGS_UNSAFE_STACK
-	jmp	restore_regs_and_return_to_kernel
-.Lparanoid_exit_no_swapgs:
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
-	jmp	restore_regs_and_return_to_kernel
+	jmp		restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
 
/*
@@ -1266,10 +1324,27 @@ end_repeat_nmi:
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
 
-	testl	%ebx, %ebx			/* swapgs needed? */
+	/*
+	 * The above invocation of paranoid_entry stored the GSBASE
+	 * related information in R/EBX depending on the availability
+	 * of FSGSBASE.
+	 *
+	 * If FSGSBASE is enabled, restore the saved GSBASE value
+	 * unconditionally, otherwise take the conditional SWAPGS path.
+	 */
+	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
+
+	wrgsbase	%rbx
+	jmp	nmi_restore
+
+nmi_no_fsgsbase:
+	/* EBX == 0 -> invoke SWAPGS */
+	testl	%ebx, %ebx
 	jnz	nmi_restore
+
nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
+
nmi_restore:
 	POP_REGS
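[Annotation] For readability, here is the GSBASE decision that paranoid_entry/paranoid_exit implement in asm, restated as C. This is a sketch only: this_cpu_kernel_gsbase is a made-up placeholder for the per-CPU value GET_PERCPU_BASE looks up, while rdgsbase()/wrgsbase(), __rdmsr() and native_swapgs() are the existing helpers from <asm/fsgsbase.h>, <asm/msr.h> and <asm/processor.h>.

	/* Returns what ends up in %rbx/%ebx for paranoid_exit. */
	static unsigned long paranoid_entry_gsbase_sketch(void)
	{
		if (cpu_feature_enabled(X86_FEATURE_FSGSBASE)) {
			unsigned long save = rdgsbase();	/* any value possible */

			wrgsbase(this_cpu_kernel_gsbase);	/* placeholder name */
			return save;		/* paranoid_exit: wrgsbase(save) */
		}

		/* Non-FSGSBASE convention: negative GSBASE means kernel value */
		if ((long)__rdmsr(MSR_GS_BASE) < 0)
			return 1;		/* no SWAPGS on exit */

		native_swapgs();		/* user GSBASE was live */
		return 0;			/* SWAPGS again on exit */
	}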
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 3d8d70d3896c..1583831f61a9 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -8,6 +8,13 @@
 #include <asm/unistd.h>
 #include <asm/syscall.h>
 
+/*
+ * Reuse the 64-bit entry points for the x32 versions that occupy different
+ * slots in the syscall table.
+ */
+#define __x32_sys_getsockopt	__x64_sys_getsockopt
+#define __x32_sys_setsockopt	__x64_sys_setsockopt
+
 #define __SYSCALL_64(nr, sym)
 
 #define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *);
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index d8f8a1a69ed1..e31a75262c9c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -376,8 +376,8 @@
 362	i386	connect			sys_connect
 363	i386	listen			sys_listen
 364	i386	accept4			sys_accept4
-365	i386	getsockopt		sys_getsockopt		compat_sys_getsockopt
-366	i386	setsockopt		sys_setsockopt		compat_sys_setsockopt
+365	i386	getsockopt		sys_getsockopt		sys_getsockopt
+366	i386	setsockopt		sys_setsockopt		sys_setsockopt
 367	i386	getsockname		sys_getsockname
 368	i386	getpeername		sys_getpeername
 369	i386	sendto			sys_sendto
@@ -440,6 +440,7 @@
 433	i386	fspick			sys_fspick
 434	i386	pidfd_open		sys_pidfd_open
 435	i386	clone3			sys_clone3
+436	i386	close_range		sys_close_range
 437	i386	openat2			sys_openat2
 438	i386	pidfd_getfd		sys_pidfd_getfd
 439	i386	faccessat2		sys_faccessat2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78847b32e137..9d82078c949a 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -357,6 +357,7 @@
 433	common	fspick			sys_fspick
 434	common	pidfd_open		sys_pidfd_open
 435	common	clone3			sys_clone3
+436	common	close_range		sys_close_range
 437	common	openat2			sys_openat2
 438	common	pidfd_getfd		sys_pidfd_getfd
 439	common	faccessat2		sys_faccessat2
@@ -396,8 +397,8 @@
 538	x32	sendmmsg		compat_sys_sendmmsg
 539	x32	process_vm_readv	compat_sys_process_vm_readv
 540	x32	process_vm_writev	compat_sys_process_vm_writev
-541	x32	setsockopt		compat_sys_setsockopt
-542	x32	getsockopt		compat_sys_getsockopt
+541	x32	setsockopt		sys_setsockopt
+542	x32	getsockopt		sys_getsockopt
 543	x32	io_setup		compat_sys_io_setup
 544	x32	io_submit		compat_sys_io_submit
 545	x32	execveat		compat_sys_execveat
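[Annotation] The tables wire up the new close_range() syscall (nr 436 on both ABIs) and route getsockopt()/setsockopt() to the native handlers now that the sockopt paths no longer need compat wrappers. A minimal userspace sketch of close_range(), going through syscall(2) since a libc wrapper may not exist yet:

	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_close_range
	#define __NR_close_range 436	/* matches the table entries above */
	#endif

	int main(void)
	{
		/* Close every fd >= 3, e.g. before exec'ing an untrusted child. */
		if (syscall(__NR_close_range, 3U, ~0U, 0U) == -1)
			return 1;
		return 0;
	}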
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index ea7c1f0b79df..9185cb1d13b9 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
 	struct mm_struct *mm = task->mm;
 	struct vm_area_struct *vma;
 
-	if (mmap_write_lock_killable(mm))
-		return -EINTR;
+	mmap_read_lock(mm);
 
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		unsigned long size = vma->vm_end - vma->vm_start;
@@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
 			zap_page_range(vma, vma->vm_start, size);
 	}
 
-	mmap_write_unlock(mm);
+	mmap_read_unlock(mm);
 	return 0;
 }
 #else
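[Annotation] vdso_join_timens() zaps the vdso/vvar mappings so the next fault installs the namespace-specific time page; zapping needs the mmap lock only for read (madvise(MADV_DONTNEED) zaps under the read lock the same way), which is what the switch above exploits. A userspace sketch of the operation that eventually drives this path:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	#ifndef CLONE_NEWTIME
	#define CLONE_NEWTIME	0x00000080
	#endif

	int main(void)
	{
		/*
		 * Requires CAP_SYS_ADMIN. The caller itself stays in its old
		 * time namespace; children fork()ed afterwards join the new
		 * one and get the timens vdso data page.
		 */
		if (unshare(CLONE_NEWTIME)) {
			perror("unshare(CLONE_NEWTIME)");
			return 1;
		}
		return 0;
	}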