diff options
Diffstat (limited to 'arch/x86')
50 files changed, 789 insertions, 719 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1720eabc31ec..57a14206ad8f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -283,7 +283,6 @@ config X86 select RTC_LIB select RTC_MC146818_LIB select SPARSE_IRQ - select SRCU select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT @@ -1938,7 +1937,6 @@ config X86_SGX depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC depends on CRYPTO=y depends on CRYPTO_SHA256=y - select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA select XARRAY_MULTI diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index b70559b821df..2106a2bd152b 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -3,9 +3,14 @@ core-y += arch/x86/crypto/ # # Disable SSE and other FP/SIMD instructions to match normal x86 +# This is required to work around issues in older LLVM versions, but breaks +# GCC versions < 11. See: +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652 # +ifeq ($(CONFIG_CC_IS_CLANG),y) KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 +endif ifeq ($(CONFIG_X86_32),y) START := 0x8048000 diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 8c45b198b62f..bccea57dee81 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -923,6 +923,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) /* Event overflow */ handled++; + status &= ~mask; perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) @@ -933,8 +934,6 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); - - status &= ~mask; } /* diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e2975a32d443..d7da28fada87 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -8,7 +8,7 @@ #define ALT_FLAGS_SHIFT 16 -#define ALT_FLAG_NOT BIT(0) +#define ALT_FLAG_NOT (1 << 0) #define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index cbaf174d8efd..b3af2d45bbbb 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -125,6 +125,8 @@ #define INTEL_FAM6_LUNARLAKE_M 0xBD +#define INTEL_FAM6_ARROWLAKE 0xC6 + /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 52788f79786f..255a78d9d906 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -49,7 +49,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * simple as possible. * Must be called with preemption disabled. */ -static void __resctrl_sched_in(void) +static inline void __resctrl_sched_in(struct task_struct *tsk) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); u32 closid = state->default_closid; @@ -61,13 +61,13 @@ static void __resctrl_sched_in(void) * Else use the closid/rmid assigned to this cpu. */ if (static_branch_likely(&rdt_alloc_enable_key)) { - tmp = READ_ONCE(current->closid); + tmp = READ_ONCE(tsk->closid); if (tmp) closid = tmp; } if (static_branch_likely(&rdt_mon_enable_key)) { - tmp = READ_ONCE(current->rmid); + tmp = READ_ONCE(tsk->rmid); if (tmp) rmid = tmp; } @@ -88,17 +88,17 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) return val * scale; } -static inline void resctrl_sched_in(void) +static inline void resctrl_sched_in(struct task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) - __resctrl_sched_in(); + __resctrl_sched_in(tsk); } void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else -static inline void resctrl_sched_in(void) {} +static inline void resctrl_sched_in(struct task_struct *tsk) {} static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index b8357d6ecd47..b63be696b776 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -128,8 +128,9 @@ struct snp_psc_desc { struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; } __packed; -/* Guest message request error code */ +/* Guest message request error codes */ #define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32) +#define SNP_GUEST_REQ_ERR_BUSY BIT_ULL(33) #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index cb1ee53ad3b1..770dcf75eaa9 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -261,20 +261,22 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, }; -#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(9, 0) +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) /* - * For AVIC, the max index allowed for physical APIC ID - * table is 0xff (255). + * For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as + * 0xff is a broadcast to all CPUs, i.e. can't be targeted individually. */ #define AVIC_MAX_PHYSICAL_ID 0XFEULL /* - * For x2AVIC, the max index allowed for physical APIC ID - * table is 0x1ff (511). + * For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511). */ #define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL +static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); +static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); + #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index d13d71af5cf6..0a49a8de9f3c 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -18,32 +18,26 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_unrolled(void *to, const void *from, unsigned len); +rep_movs_alternative(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len) +copy_user_generic(void *to, const void *from, unsigned long len) { - unsigned ret; - + stac(); /* - * If CPU has ERMS feature, use copy_user_enhanced_fast_string. - * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. - * Otherwise, use copy_user_generic_unrolled. + * If CPU has FSRM feature, use 'rep movs'. + * Otherwise, use rep_movs_alternative. */ - alternative_call_2(copy_user_generic_unrolled, - copy_user_generic_string, - X86_FEATURE_REP_GOOD, - copy_user_enhanced_fast_string, - X86_FEATURE_ERMS, - ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), - "=d" (len)), - "1" (to), "2" (from), "3" (len) - : "memory", "rcx", "r8", "r9", "r10", "r11"); - return ret; + asm volatile( + "1:\n\t" + ALTERNATIVE("rep movsb", + "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT + : : "memory", "rax", "r8", "r9", "r10", "r11"); + clac(); + return len; } static __always_inline __must_check unsigned long @@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -extern long __copy_user_nocache(void *dst, const void __user *src, - unsigned size, int zerorest); - +extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); @@ -69,8 +61,12 @@ static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + long ret; kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 0); + stac(); + ret = __copy_user_nocache(dst, src, size); + clac(); + return ret; } static inline int @@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) */ __must_check unsigned long -clear_user_original(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_rep_good(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_erms(void __user *addr, unsigned long len); +rep_stos_alternative(void __user *addr, unsigned long len); static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) { @@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr */ asm volatile( "1:\n\t" - ALTERNATIVE_3("rep stosb", - "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM), - "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS), - "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD)) + ALTERNATIVE("rep stosb", + "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT - : "a" (0) - /* rep_good clobbers %rdx */ - : "rdx"); + : "a" (0)); clac(); diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h index 6daa9b0c8d11..a3c29b1496c8 100644 --- a/arch/x86/include/asm/xen/cpuid.h +++ b/arch/x86/include/asm/xen/cpuid.h @@ -89,11 +89,21 @@ * Sub-leaf 2: EAX: host tsc frequency in kHz */ +#define XEN_CPUID_TSC_EMULATED (1u << 0) +#define XEN_CPUID_HOST_TSC_RELIABLE (1u << 1) +#define XEN_CPUID_RDTSCP_INSTR_AVAIL (1u << 2) + +#define XEN_CPUID_TSC_MODE_DEFAULT (0) +#define XEN_CPUID_TSC_MODE_ALWAYS_EMULATE (1u) +#define XEN_CPUID_TSC_MODE_NEVER_EMULATE (2u) +#define XEN_CPUID_TSC_MODE_PVRDTSCP (3u) + /* * Leaf 5 (0x40000x04) * HVM-specific features * Sub-leaf 0: EAX: Features * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) + * Sub-leaf 0: ECX: domain id (iff EAX has XEN_HVM_CPUID_DOMID_PRESENT flag) */ #define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */ #define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */ @@ -102,12 +112,16 @@ #define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ #define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */ /* - * Bits 55:49 from the IO-APIC RTE and bits 11:5 from the MSI address can be - * used to store high bits for the Destination ID. This expands the Destination - * ID field from 8 to 15 bits, allowing to target APIC IDs up 32768. + * With interrupt format set to 0 (non-remappable) bits 55:49 from the + * IO-APIC RTE and bits 11:5 from the MSI address can be used to store + * high bits for the Destination ID. This expands the Destination ID + * field from 8 to 15 bits, allowing to target APIC IDs up 32768. */ #define XEN_HVM_CPUID_EXT_DEST_ID (1u << 5) -/* Per-vCPU event channel upcalls */ +/* + * Per-vCPU event channel upcalls work correctly with physical IRQs + * bound to event channels. + */ #define XEN_HVM_CPUID_UPCALL_VECTOR (1u << 6) /* diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1c38174b5f01..0dac4ab5b55b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -146,7 +146,11 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } - if (madt->header.revision >= 5) + + /* ACPI 6.3 and newer support the online capable bit. */ + if (acpi_gbl_FADT.header.revision > 6 || + (acpi_gbl_FADT.header.revision == 6 && + acpi_gbl_FADT.minor_revision >= 3)) acpi_support_online_capable = true; default_acpi_madt_oem_check(madt->header.oem_id, @@ -193,7 +197,8 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags) if (lapic_flags & ACPI_MADT_ENABLED) return true; - if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + if (!acpi_support_online_capable || + (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) return true; return false; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 380753b14cab..1547781e505b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -880,6 +880,15 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) } } #endif + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + */ + clear_cpu_cap(c, X86_FEATURE_XSAVES); } static void init_amd_zn(struct cpuinfo_x86 *c) @@ -920,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + /* AMD FSRM also implies FSRS */ + if (cpu_has(c, X86_FEATURE_FSRM)) + set_cpu_cap(c, X86_FEATURE_FSRS); + /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7832a69d170e..2eec60f50057 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2355,6 +2355,7 @@ static void mce_restart(void) { mce_timer_delete_all(); on_each_cpu(mce_cpu_restart, NULL, 1); + mce_schedule_work(); } /* Toggle features for corrected errors */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f36dc2f796c5..f1197366a97d 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -358,12 +358,16 @@ static void __init ms_hyperv_init_platform(void) * To mirror what Windows does we should extract CPU management * features and use the ReservedIdentityBit to detect if Linux is the * root partition. But that requires negotiating CPU management - * interface (a process to be finalized). + * interface (a process to be finalized). For now, use the privilege + * flag as the indicator for running as root. * - * For now, use the privilege flag as the indicator for running as - * root. + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. */ - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) { + if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { hv_root_partition = true; pr_info("Hyper-V: running as root partition\n"); } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index eb07d4435391..b44c487727d4 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -368,7 +368,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, { struct resctrl_schema *s; struct rdtgroup *rdtgrp; - struct rdt_domain *dom; struct rdt_resource *r; char *tok, *resname; int ret = 0; @@ -397,10 +396,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - list_for_each_entry(s, &resctrl_schema_all, list) { - list_for_each_entry(dom, &s->res->domains, list) - memset(dom->staged_config, 0, sizeof(dom->staged_config)); - } + rdt_staged_configs_clear(); while ((tok = strsep(&buf, "\n")) != NULL) { resname = strim(strsep(&tok, ":")); @@ -445,6 +441,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, } out: + rdt_staged_configs_clear(); rdtgroup_kn_unlock(of->kn); cpus_read_unlock(); return ret ?: nbytes; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 8edecc5763d8..85ceaf9a31ac 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -555,5 +555,6 @@ void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); void __init mbm_config_rftype_init(const char *config); +void rdt_staged_configs_clear(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index e2c1599d1b37..6ad33f355861 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -78,6 +78,19 @@ void rdt_last_cmd_printf(const char *fmt, ...) va_end(ap); } +void rdt_staged_configs_clear(void) +{ + struct rdt_resource *r; + struct rdt_domain *dom; + + lockdep_assert_held(&rdtgroup_mutex); + + for_each_alloc_capable_rdt_resource(r) { + list_for_each_entry(dom, &r->domains, list) + memset(dom->staged_config, 0, sizeof(dom->staged_config)); + } +} + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -314,7 +327,7 @@ static void update_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. */ - resctrl_sched_in(); + resctrl_sched_in(current); } /* @@ -530,7 +543,7 @@ static void _update_task_closid_rmid(void *task) * Otherwise, the MSR is updated when the task is scheduled in. */ if (task == current) - resctrl_sched_in(); + resctrl_sched_in(task); } static void update_task_closid_rmid(struct task_struct *t) @@ -3107,7 +3120,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { struct resctrl_schema *s; struct rdt_resource *r; - int ret; + int ret = 0; + + rdt_staged_configs_clear(); list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; @@ -3119,20 +3134,22 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } else { ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) - return ret; + goto out; } ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); - return ret; + goto out; } } rdtgrp->mode = RDT_MODE_SHAREABLE; - return 0; +out: + rdt_staged_configs_clear(); + return ret; } static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 714166cc25f2..0bab497c9436 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1118,21 +1118,20 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, zerofrom = offsetof(struct xregs_state, extended_state_area); /* - * The ptrace buffer is in non-compacted XSAVE format. In - * non-compacted format disabled features still occupy state space, - * but there is no state to copy from in the compacted - * init_fpstate. The gap tracking will zero these states. - */ - mask = fpstate->user_xfeatures; - - /* - * Dynamic features are not present in init_fpstate. When they are - * in an all zeros init state, remove those from 'mask' to zero - * those features in the user buffer instead of retrieving them - * from init_fpstate. + * This 'mask' indicates which states to copy from fpstate. + * Those extended states that are not present in fpstate are + * either disabled or initialized: + * + * In non-compacted format, disabled features still occupy + * state space but there is no state to copy from in the + * compacted init_fpstate. The gap tracking will zero these + * states. + * + * The extended features have an all zeroes init state. Thus, + * remove them from 'mask' to zero those features in the user + * buffer instead of retrieving them from init_fpstate. */ - if (fpu_state_size_dynamic()) - mask &= (header.xfeatures | xinit->header.xcomp_bv); + mask = header.xfeatures; for_each_extended_xfeature(i, mask) { /* @@ -1151,9 +1150,8 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { - copy_feature(header.xfeatures & BIT_ULL(i), &to, + membuf_write(&to, __raw_xsave_addr(xsave, i), - __raw_xsave_addr(xinit, i), xstate_sizes[i]); } /* diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 1265ad519249..fb4f1e01b64a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -136,10 +136,12 @@ SYM_TYPED_FUNC_START(ftrace_stub) RET SYM_FUNC_END(ftrace_stub) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_TYPED_FUNC_START(ftrace_stub_graph) CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(ftrace_stub_graph) +#endif #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 470c128759ea..708c87b88cc1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -212,7 +212,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(); + resctrl_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4e34b3b68ebd..bb65a68b4b49 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -656,7 +656,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(); + resctrl_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 679026a640ef..3f664ab277c4 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2183,9 +2183,6 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned struct ghcb *ghcb; int ret; - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return -ENODEV; - if (!fw_err) return -EINVAL; @@ -2212,15 +2209,26 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned if (ret) goto e_put; - if (ghcb->save.sw_exit_info_2) { - /* Number of expected pages are returned in RBX */ - if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && - ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) - input->data_npages = ghcb_get_rbx(ghcb); + *fw_err = ghcb->save.sw_exit_info_2; + switch (*fw_err) { + case 0: + break; - *fw_err = ghcb->save.sw_exit_info_2; + case SNP_GUEST_REQ_ERR_BUSY: + ret = -EAGAIN; + break; + case SNP_GUEST_REQ_INVALID_LEN: + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + input->data_npages = ghcb_get_rbx(ghcb); + ret = -ENOSPC; + break; + } + fallthrough; + default: ret = -EIO; + break; } e_put: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ef80d361b463..10622cf2b30f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -33,8 +33,8 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } -static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } -static __init void get_rtc_noop(struct timespec64 *now) { } +static int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +static void get_rtc_noop(struct timespec64 *now) { } static __initconst const struct of_device_id of_cmos_match[] = { { .compatible = "motorola,mc146818" }, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8e578311ca9d..89ca7f4c1464 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,7 +46,6 @@ config KVM select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO - select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 042dee556125..995eb5054360 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -368,9 +368,39 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG - && ioapic->irr & (1 << index)) - ioapic_service(ioapic, index, false); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && + ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) { + /* + * Pending status in irr may be outdated: the IRQ line may have + * already been deasserted by a device while the IRQ was masked. + * This occurs, for instance, if the interrupt is handled in a + * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this + * case the guest acknowledges the interrupt to the device in + * its threaded irq handler, i.e. after the EOI but before + * unmasking, so at the time of unmasking the IRQ line is + * already down but our pending irr bit is still set. In such + * cases, injecting this pending interrupt to the guest is + * buggy: the guest will receive an extra unwanted interrupt. + * + * So we need to check here if the IRQ is actually still pending. + * As we are generally not able to probe the IRQ line status + * directly, we do it through irqfd resampler. Namely, we clear + * the pending status and notify the resampler that this interrupt + * is done, without actually injecting it into the guest. If the + * IRQ line is actually already deasserted, we are done. If it is + * still asserted, a new interrupt will be shortly triggered + * through irqfd and injected into the guest. + * + * If, however, it's not possible to resample (no irqfd resampler + * registered for this irq), then unconditionally inject this + * pending interrupt into the guest, so the guest will not miss + * an interrupt, although may get an extra unwanted interrupt. + */ + if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index)) + ioapic->irr &= ~(1 << index); + else + ioapic_service(ioapic, index, false); + } if (e->fields.delivery_mode == APIC_DM_FIXED) { struct kvm_lapic_irq irq; diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index 287e98ef9df3..6272dabec02d 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -12,6 +12,11 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, int hv_remote_flush_tlb(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); #else /* !CONFIG_HYPERV */ +static inline int hv_remote_flush_tlb(struct kvm *kvm) +{ + return -EOPNOTSUPP; +} + static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ca684979e90d..cfc8ab773025 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,19 +27,38 @@ #include "irq.h" #include "svm.h" -/* AVIC GATAG is encoded using VM and VCPU IDs */ -#define AVIC_VCPU_ID_BITS 8 -#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) +/* + * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID, + * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry + * if an interrupt can't be delivered, e.g. because the vCPU isn't running. + * + * For the vCPU ID, use however many bits are currently allowed for the max + * guest physical APIC ID (limited by the size of the physical ID table), and + * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the + * size of the GATag is defined by hardware (32 bits), but is an opaque value + * as far as hardware is concerned. + */ +#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK -#define AVIC_VM_ID_BITS 24 -#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) -#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) +#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) +#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) -#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ - (y & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ + ((vcpu_id) & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG(vm_id, vcpu_id) \ +({ \ + u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \ + \ + WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \ + WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ + ga_tag; \ +}) + +static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u); + static bool force_avic; module_param_unsafe(force_avic, bool, 0444); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 252e7f37e4e2..f25bc3cbb250 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3729,7 +3729,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3753,6 +3753,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) svm->current_vmcb->asid_generation--; } +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + hpa_t root_tdp = vcpu->arch.mmu->root.hpa; + + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly + * flush the NPT mappings via hypercall as flushing the ASID only + * affects virtual to physical mappings, it does not invalidate guest + * physical to host physical mappings. + */ + if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) + hyperv_flush_guest_mapping(root_tdp); + + svm_flush_tlb_asid(vcpu); +} + +static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB + * flushes should be routed to hv_remote_flush_tlb() without requesting + * a "regular" remote flush. Reaching this point means either there's + * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of + * which might be fatal to the guest. Yell, but try to recover. + */ + if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) + hv_remote_flush_tlb(vcpu->kvm); + + svm_flush_tlb_asid(vcpu); +} + static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4745,10 +4776,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .flush_tlb_all = svm_flush_tlb_current, + .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, .flush_tlb_gva = svm_flush_tlb_gva, - .flush_tlb_guest = svm_flush_tlb_current, + .flush_tlb_guest = svm_flush_tlb_asid, .vcpu_pre_run = svm_vcpu_pre_run, .vcpu_run = svm_vcpu_run, diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index cff838f15db5..786d46d73a8e 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -6,6 +6,8 @@ #ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__ #define __ARCH_X86_KVM_SVM_ONHYPERV_H__ +#include <asm/mshyperv.h> + #if IS_ENABLED(CONFIG_HYPERV) #include "kvm_onhyperv.h" @@ -15,6 +17,14 @@ static struct kvm_x86_ops svm_x86_ops; int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); +static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) +{ + struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; + + return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB && + !!hve->hv_enlightenments_control.enlightened_npt_tlb; +} + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; @@ -80,6 +90,11 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu) } #else +static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7c4f5ca405c7..768487611db7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2903,7 +2903,7 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - bool ia32e; + bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || @@ -2923,12 +2923,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, vmcs12->host_ia32_perf_global_ctrl))) return -EINVAL; -#ifdef CONFIG_X86_64 - ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); -#else - ia32e = false; -#endif - if (ia32e) { if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; @@ -3022,7 +3016,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { - bool ia32e; + bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -3048,6 +3042,13 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL; + if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) + return -EINVAL; + + if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || + CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) + return -EINVAL; + /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: @@ -3059,7 +3060,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, */ if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { - ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || CC(((vmcs12->guest_cr0 & X86_CR0_PG) && @@ -3868,7 +3868,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) exit_qual = 0; } - if (ex->has_error_code) { + /* + * Unlike AMD's Paged Real Mode, which reports an error code on #PF + * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the + * "has error code" flags on VM-Exit if the CPU is in Real Mode. + */ + if (ex->has_error_code && is_protmode(vcpu)) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index f550540ed54e..631fd7da2bc3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -262,7 +262,7 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) * eIBRS has its own protection against poisoned RSB, so it doesn't * need the RSB filling sequence. But it does need to be enabled, and a * single call to retire, before the first unbalanced RET. - */ + */ FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ X86_FEATURE_RSB_VMEXIT_LITE @@ -311,7 +311,7 @@ SYM_FUNC_END(vmx_do_nmi_irqoff) * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed * @fault: %true if the VMREAD faulted, %false if it failed - + * * Save and restore volatile registers across a call to vmread_error(). Note, * all parameters are passed on the stack. */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bcac3efcde41..d2d6e1b6c788 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -874,7 +874,7 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; - else { + else { int mask = 0, match = 0; if (enable_ept && (eb & (1u << PF_VECTOR))) { @@ -1282,7 +1282,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) } } - if (vmx->nested.need_vmcs12_to_shadow_sync) + if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); if (vmx->guest_state_loaded) @@ -5049,10 +5049,10 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; - /* - * An IRQ must not be injected into L2 if it's supposed to VM-Exit, - * e.g. if the IRQ arrived asynchronously after checking nested events. - */ + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. + */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return -EBUSY; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7713420abab0..3d852ce84920 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4432,6 +4432,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -8903,6 +8904,8 @@ restart: } if (ctxt->have_exception) { + WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write); + vcpu->mmio_needed = false; r = 1; inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { @@ -9906,13 +9909,20 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + /* + * Suppress the error code if the vCPU is in Real Mode, as Real Mode + * exceptions don't report error codes. The presence of an error code + * is carried with the exception and only stripped when the exception + * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do + * report an error code despite the CPU being in Real Mode. + */ + vcpu->arch.exception.has_error_code &= is_protmode(vcpu); + trace_kvm_inj_exception(vcpu->arch.exception.vector, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.injected); - if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) - vcpu->arch.exception.error_code = false; static_call(kvm_x86_inject_exception)(vcpu); } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4f1a40a86534..01932af64193 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y) endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o + lib-y += copy_user_64.o copy_user_uncached_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ecbfb4dd3b01..f74a3e704a1c 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms) * Input: * rdi destination * rcx count + * rax is zero * * Output: * rcx: uncleared bytes or 0 if successful. */ -SYM_FUNC_START(clear_user_original) - /* - * Copy only the lower 32 bits of size as that is enough to handle the rest bytes, - * i.e., no need for a 'q' suffix and thus a REX prefix. - */ - mov %ecx,%eax - shr $3,%rcx - jz .Lrest_bytes +SYM_FUNC_START(rep_stos_alternative) + cmpq $64,%rcx + jae .Lunrolled - # do the qwords first - .p2align 4 -.Lqwords: - movq $0,(%rdi) - lea 8(%rdi),%rdi - dec %rcx - jnz .Lqwords + cmp $8,%ecx + jae .Lword -.Lrest_bytes: - and $7, %eax - jz .Lexit + testl %ecx,%ecx + je .Lexit - # now do the rest bytes -.Lbytes: - movb $0,(%rdi) +.Lclear_user_tail: +0: movb %al,(%rdi) inc %rdi - dec %eax - jnz .Lbytes - + dec %rcx + jnz .Lclear_user_tail .Lexit: - /* - * %rax still needs to be cleared in the exception case because this function is called - * from inline asm and the compiler expects %rax to be zero when exiting the inline asm, - * in case it might reuse it somewhere. - */ - xor %eax,%eax - RET - -.Lqwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %eax - add %rax,%rcx - jmp .Lexit - -.Lbytes_exception: - mov %eax,%ecx - jmp .Lexit - - _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception) - _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception) -SYM_FUNC_END(clear_user_original) -EXPORT_SYMBOL(clear_user_original) - -/* - * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is - * present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. - */ -SYM_FUNC_START(clear_user_rep_good) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lprep: - # copy lower 32-bits for rest bytes - mov %ecx, %edx - shr $3, %rcx - jz .Lrep_good_rest_bytes - -.Lrep_good_qwords: - rep stosq - -.Lrep_good_rest_bytes: - and $7, %edx - jz .Lrep_good_exit - -.Lrep_good_bytes: - mov %edx, %ecx - rep stosb - -.Lrep_good_exit: - # see .Lexit comment above - xor %eax, %eax RET -.Lrep_good_qwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %edx - add %rdx, %rcx - jmp .Lrep_good_exit + _ASM_EXTABLE_UA( 0b, .Lexit) - _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception) - _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit) -SYM_FUNC_END(clear_user_rep_good) -EXPORT_SYMBOL(clear_user_rep_good) +.Lword: +1: movq %rax,(%rdi) + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lclear_user_tail -/* - * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. - * - */ -SYM_FUNC_START(clear_user_erms) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lerms_bytes: - rep stosb - -.Lerms_exit: - xorl %eax,%eax + .p2align 4 +.Lunrolled: +10: movq %rax,(%rdi) +11: movq %rax,8(%rdi) +12: movq %rax,16(%rdi) +13: movq %rax,24(%rdi) +14: movq %rax,32(%rdi) +15: movq %rax,40(%rdi) +16: movq %rax,48(%rdi) +17: movq %rax,56(%rdi) + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lclear_user_tail RET - _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit) -SYM_FUNC_END(clear_user_erms) -EXPORT_SYMBOL(clear_user_erms) + /* + * If we take an exception on any of the + * word stores, we know that %rcx isn't zero, + * so we can just go to the tail clearing to + * get the exact count. + * + * The unrolled case might end up clearing + * some bytes twice. Don't care. + * + * We could use the value in %rdi to avoid + * a second fault on the exact count case, + * but do we really care? No. + * + * Finally, we could try to align %rdi at the + * top of the unrolling. But unaligned stores + * just aren't that common or expensive. + */ + _ASM_EXTABLE_UA( 1b, .Lclear_user_tail) + _ASM_EXTABLE_UA(10b, .Lclear_user_tail) + _ASM_EXTABLE_UA(11b, .Lclear_user_tail) + _ASM_EXTABLE_UA(12b, .Lclear_user_tail) + _ASM_EXTABLE_UA(13b, .Lclear_user_tail) + _ASM_EXTABLE_UA(14b, .Lclear_user_tail) + _ASM_EXTABLE_UA(15b, .Lclear_user_tail) + _ASM_EXTABLE_UA(16b, .Lclear_user_tail) + _ASM_EXTABLE_UA(17b, .Lclear_user_tail) +SYM_FUNC_END(rep_stos_alternative) +EXPORT_SYMBOL(rep_stos_alternative) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 9dec1b38a98f..4fc5c2de2de4 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,404 +7,108 @@ */ #include <linux/linkage.h> -#include <asm/current.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/cpufeatures.h> -#include <asm/alternative.h> #include <asm/asm.h> -#include <asm/smap.h> #include <asm/export.h> -#include <asm/trapnr.h> - -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - - _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align) - _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align) -.endm /* - * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro - * code for rep movsq - * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_generic_unrolled) - ASM_STAC - cmpl $8,%edx - jb .Lcopy_user_short_string_bytes - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz copy_user_short_string -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movq %r8,(%rdi) -6: movq %r9,1*8(%rdi) -7: movq %r10,2*8(%rdi) -8: movq %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movq %r8,4*8(%rdi) -14: movq %r9,5*8(%rdi) -15: movq %r10,6*8(%rdi) -16: movq %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b - jmp copy_user_short_string - -30: shll $6,%ecx - addl %ecx,%edx - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 30b) - _ASM_EXTABLE_CPY(2b, 30b) - _ASM_EXTABLE_CPY(3b, 30b) - _ASM_EXTABLE_CPY(4b, 30b) - _ASM_EXTABLE_CPY(5b, 30b) - _ASM_EXTABLE_CPY(6b, 30b) - _ASM_EXTABLE_CPY(7b, 30b) - _ASM_EXTABLE_CPY(8b, 30b) - _ASM_EXTABLE_CPY(9b, 30b) - _ASM_EXTABLE_CPY(10b, 30b) - _ASM_EXTABLE_CPY(11b, 30b) - _ASM_EXTABLE_CPY(12b, 30b) - _ASM_EXTABLE_CPY(13b, 30b) - _ASM_EXTABLE_CPY(14b, 30b) - _ASM_EXTABLE_CPY(15b, 30b) - _ASM_EXTABLE_CPY(16b, 30b) -SYM_FUNC_END(copy_user_generic_unrolled) -EXPORT_SYMBOL(copy_user_generic_unrolled) - -/* Some CPUs run faster using the string copy instructions. - * This is also a lot simpler. Use them when possible. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. + * rep_movs_alternative - memory copy with exception handling. + * This version is for CPUs that don't have FSRM (Fast Short Rep Movs) * * Input: * rdi destination * rsi source - * rdx count + * rcx count * * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_generic_string) - ASM_STAC - cmpl $8,%edx - jb 2f /* less than 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx -1: rep movsq -2: movl %edx,%ecx -3: rep movsb - xorl %eax,%eax - ASM_CLAC - RET - -11: leal (%rdx,%rcx,8),%ecx -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 11b) - _ASM_EXTABLE_CPY(3b, 12b) -SYM_FUNC_END(copy_user_generic_string) -EXPORT_SYMBOL(copy_user_generic_string) - -/* - * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. - * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. - * - * Input: - * rdi destination - * rsi source - * rdx count + * rcx uncopied bytes or 0 if successful. * - * Output: - * eax uncopied bytes or 0 if successful. + * NOTE! The calling convention is very intentionally the same as + * for 'rep movs', so that we can rewrite the function call with + * just a plain 'rep movs' on machines that have FSRM. But to make + * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely. */ -SYM_FUNC_START(copy_user_enhanced_fast_string) - ASM_STAC - /* CPUs without FSRM should avoid rep movsb for short copies */ - ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM - movl %edx,%ecx -1: rep movsb - xorl %eax,%eax - ASM_CLAC +SYM_FUNC_START(rep_movs_alternative) + cmpq $64,%rcx + jae .Lunrolled + + cmp $8,%ecx + jae .Lword + + testl %ecx,%ecx + je .Lexit + +.Lcopy_user_tail: +0: movb (%rsi),%al +1: movb %al,(%rdi) + inc %rdi + inc %rsi + dec %rcx + jne .Lcopy_user_tail +.Lexit: RET -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 12b) -SYM_FUNC_END(copy_user_enhanced_fast_string) -EXPORT_SYMBOL(copy_user_enhanced_fast_string) - -/* - * Try to copy last bytes and clear the rest if needed. - * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - * Don't try to copy the tail if machine check happened - * - * Input: - * eax trap number written by ex_handler_copy() - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) - cmp $X86_TRAP_MC,%eax - je 3f - - movl %edx,%ecx -1: rep movsb -2: mov %ecx,%eax - ASM_CLAC + _ASM_EXTABLE_UA( 0b, .Lexit) + _ASM_EXTABLE_UA( 1b, .Lexit) + + .p2align 4 +.Lword: +2: movq (%rsi),%rax +3: movq %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lcopy_user_tail + + _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail) + _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) + + .p2align 4 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +14: movq %r8,(%rdi) +15: movq %r9,8(%rdi) +16: movq %r10,16(%rdi) +17: movq %r11,24(%rdi) +20: movq 32(%rsi),%r8 +21: movq 40(%rsi),%r9 +22: movq 48(%rsi),%r10 +23: movq 56(%rsi),%r11 +24: movq %r8,32(%rdi) +25: movq %r9,40(%rdi) +26: movq %r10,48(%rdi) +27: movq %r11,56(%rdi) + addq $64,%rsi + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lcopy_user_tail RET -3: - movl %edx,%eax - ASM_CLAC - RET - - _ASM_EXTABLE_CPY(1b, 2b) - -.Lcopy_user_handle_align: - addl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - -SYM_CODE_END(.Lcopy_user_handle_tail) - -/* - * Finish memcpy of less than 64 bytes. #AC should already be set. - * - * Input: - * rdi destination - * rsi source - * rdx count (< 64) - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(copy_user_short_string) - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .Lcopy_user_short_string_bytes -18: movq (%rsi),%r8 -19: movq %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -.Lcopy_user_short_string_bytes: - andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xor %eax,%eax - ASM_CLAC - RET - -40: leal (%rdx,%rcx,8),%edx - jmp 60f -50: movl %ecx,%edx /* ecx is zerorest also */ -60: jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(18b, 40b) - _ASM_EXTABLE_CPY(19b, 40b) - _ASM_EXTABLE_CPY(21b, 50b) - _ASM_EXTABLE_CPY(22b, 50b) -SYM_CODE_END(copy_user_short_string) - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination out of cache for more performance. - * - * Note: Cached memory copy is used when destination or size is not - * naturally aligned. That is: - * - Require 8-byte alignment when size is 8 bytes or larger. - * - Require 4-byte alignment when size is 4 bytes. - */ -SYM_FUNC_START(__copy_user_nocache) - ASM_STAC - - /* If size is less than 8 bytes, go to 4-byte copy */ - cmpl $8,%edx - jb .L_4b_nocache_copy_entry - - /* If destination is not 8-byte aligned, "cache" copy to align it */ - ALIGN_DESTINATION - - /* Set 4x8-byte copy count and remainder */ - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz .L_8b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 4x8-byte nocache loop-copy */ -.L_4x8b_nocache_copy_loop: -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz .L_4x8b_nocache_copy_loop - - /* Set 8-byte copy count and remainder */ -.L_8b_nocache_copy_entry: - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .L_4b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 8-byte nocache loop-copy */ -.L_8b_nocache_copy_loop: -20: movq (%rsi),%r8 -21: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz .L_8b_nocache_copy_loop - - /* If no byte left, we're done */ -.L_4b_nocache_copy_entry: - andl %edx,%edx - jz .L_finish_copy - - /* If destination is not 4-byte aligned, go to byte copy: */ - movl %edi,%ecx - andl $3,%ecx - jnz .L_1b_cache_copy_entry - - /* Set 4-byte copy count (1 or 0) and remainder */ - movl %edx,%ecx - andl $3,%edx - shrl $2,%ecx - jz .L_1b_cache_copy_entry /* jump if count is 0 */ - - /* Perform 4-byte nocache copy: */ -30: movl (%rsi),%r8d -31: movnti %r8d,(%rdi) - leaq 4(%rsi),%rsi - leaq 4(%rdi),%rdi - - /* If no bytes left, we're done: */ - andl %edx,%edx - jz .L_finish_copy - - /* Perform byte "cache" loop-copy for the remainder */ -.L_1b_cache_copy_entry: - movl %edx,%ecx -.L_1b_cache_copy_loop: -40: movb (%rsi),%al -41: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_1b_cache_copy_loop - - /* Finished copying; fence the prior stores */ -.L_finish_copy: - xorl %eax,%eax - ASM_CLAC - sfence - RET - -.L_fixup_4x8b_copy: - shll $6,%ecx - addl %ecx,%edx - jmp .L_fixup_handle_tail -.L_fixup_8b_copy: - lea (%rdx,%rcx,8),%rdx - jmp .L_fixup_handle_tail -.L_fixup_4b_copy: - lea (%rdx,%rcx,4),%rdx - jmp .L_fixup_handle_tail -.L_fixup_1b_copy: - movl %ecx,%edx -.L_fixup_handle_tail: - sfence - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) - _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) -SYM_FUNC_END(__copy_user_nocache) -EXPORT_SYMBOL(__copy_user_nocache) + _ASM_EXTABLE_UA(10b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(11b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(12b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(13b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(14b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(15b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(16b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(17b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(20b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(21b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(22b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(23b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(24b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(25b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(26b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) +SYM_FUNC_END(rep_movs_alternative) +EXPORT_SYMBOL(rep_movs_alternative) diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S new file mode 100644 index 000000000000..5c5f38d32672 --- /dev/null +++ b/arch/x86/lib/copy_user_uncached_64.S @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org> + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/export.h> + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * + * This copies from user space into kernel space, but the kernel + * space accesses can take a machine check exception, so they too + * need exception handling. + * + * Note: only 32-bit and 64-bit stores have non-temporal versions, + * and we only use aligned versions. Any unaligned parts at the + * start or end of the copy will be done using normal cached stores. + * + * Input: + * rdi destination + * rsi source + * edx count + * + * Output: + * rax uncopied bytes or 0 if successful. + */ +SYM_FUNC_START(__copy_user_nocache) + /* If destination is not 7-byte aligned, we'll have to align it */ + testb $7,%dil + jne .Lalign + +.Lis_aligned: + cmp $64,%edx + jb .Lquadwords + + .p2align 4,0x90 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +20: movnti %r8,(%rdi) +21: movnti %r9,8(%rdi) +22: movnti %r10,16(%rdi) +23: movnti %r11,24(%rdi) +30: movq 32(%rsi),%r8 +31: movq 40(%rsi),%r9 +32: movq 48(%rsi),%r10 +33: movq 56(%rsi),%r11 +40: movnti %r8,32(%rdi) +41: movnti %r9,40(%rdi) +42: movnti %r10,48(%rdi) +43: movnti %r11,56(%rdi) + + addq $64,%rsi + addq $64,%rdi + sub $64,%edx + cmp $64,%edx + jae .Lunrolled + +/* + * First set of user mode loads have been done + * without any stores, so if they fail, we can + * just try the non-unrolled loop. + */ +_ASM_EXTABLE_UA(10b, .Lquadwords) +_ASM_EXTABLE_UA(11b, .Lquadwords) +_ASM_EXTABLE_UA(12b, .Lquadwords) +_ASM_EXTABLE_UA(13b, .Lquadwords) + +/* + * The second set of user mode loads have been + * done with 32 bytes stored to the destination, + * so we need to take that into account before + * falling back to the unrolled loop. + */ +_ASM_EXTABLE_UA(30b, .Lfixup32) +_ASM_EXTABLE_UA(31b, .Lfixup32) +_ASM_EXTABLE_UA(32b, .Lfixup32) +_ASM_EXTABLE_UA(33b, .Lfixup32) + +/* + * An exception on a write means that we're + * done, but we need to update the count + * depending on where in the unrolled loop + * we were. + */ +_ASM_EXTABLE_UA(20b, .Ldone0) +_ASM_EXTABLE_UA(21b, .Ldone8) +_ASM_EXTABLE_UA(22b, .Ldone16) +_ASM_EXTABLE_UA(23b, .Ldone24) +_ASM_EXTABLE_UA(40b, .Ldone32) +_ASM_EXTABLE_UA(41b, .Ldone40) +_ASM_EXTABLE_UA(42b, .Ldone48) +_ASM_EXTABLE_UA(43b, .Ldone56) + +.Lquadwords: + cmp $8,%edx + jb .Llong +50: movq (%rsi),%rax +51: movnti %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%edx + jmp .Lquadwords + +/* + * If we fail on the last full quadword, we will + * not try to do any byte-wise cached accesses. + * We will try to do one more 4-byte uncached + * one, though. + */ +_ASM_EXTABLE_UA(50b, .Llast4) +_ASM_EXTABLE_UA(51b, .Ldone0) + +.Llong: + test $4,%dl + je .Lword +60: movl (%rsi),%eax +61: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx +.Lword: + sfence + test $2,%dl + je .Lbyte +70: movw (%rsi),%ax +71: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lbyte: + test $1,%dl + je .Ldone +80: movb (%rsi),%al +81: movb %al,(%rdi) + dec %edx +.Ldone: + mov %edx,%eax + RET + +/* + * If we fail on the last four bytes, we won't + * bother with any fixups. It's dead, Jim. Note + * that there's no need for 'sfence' for any + * of this, since the exception will have been + * serializing. + */ +_ASM_EXTABLE_UA(60b, .Ldone) +_ASM_EXTABLE_UA(61b, .Ldone) +_ASM_EXTABLE_UA(70b, .Ldone) +_ASM_EXTABLE_UA(71b, .Ldone) +_ASM_EXTABLE_UA(80b, .Ldone) +_ASM_EXTABLE_UA(81b, .Ldone) + +/* + * This is the "head needs aliging" case when + * the destination isn't 8-byte aligned. The + * 4-byte case can be done uncached, but any + * smaller alignment is done with regular stores. + */ +.Lalign: + test $1,%dil + je .Lalign_word + test %edx,%edx + je .Ldone +90: movb (%rsi),%al +91: movb %al,(%rdi) + inc %rsi + inc %rdi + dec %edx +.Lalign_word: + test $2,%dil + je .Lalign_long + cmp $2,%edx + jb .Lbyte +92: movw (%rsi),%ax +93: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lalign_long: + test $4,%dil + je .Lis_aligned + cmp $4,%edx + jb .Lword +94: movl (%rsi),%eax +95: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx + jmp .Lis_aligned + +/* + * If we fail on the initial alignment accesses, + * we're all done. Again, no point in trying to + * do byte-by-byte probing if the 4-byte load + * fails - we're not doing any uncached accesses + * any more. + */ +_ASM_EXTABLE_UA(90b, .Ldone) +_ASM_EXTABLE_UA(91b, .Ldone) +_ASM_EXTABLE_UA(92b, .Ldone) +_ASM_EXTABLE_UA(93b, .Ldone) +_ASM_EXTABLE_UA(94b, .Ldone) +_ASM_EXTABLE_UA(95b, .Ldone) + +/* + * Exception table fixups for faults in the middle + */ +.Ldone56: sub $8,%edx +.Ldone48: sub $8,%edx +.Ldone40: sub $8,%edx +.Ldone32: sub $8,%edx +.Ldone24: sub $8,%edx +.Ldone16: sub $8,%edx +.Ldone8: sub $8,%edx +.Ldone0: + mov %edx,%eax + RET + +.Lfixup32: + addq $32,%rsi + addq $32,%rdi + sub $32,%edx + jmp .Lquadwords + +.Llast4: +52: movl (%rsi),%eax +53: movnti %eax,(%rdi) + sfence + sub $4,%edx + mov %edx,%eax + RET +_ASM_EXTABLE_UA(52b, .Ldone0) +_ASM_EXTABLE_UA(53b, .Ldone0) + +SYM_FUNC_END(__copy_user_nocache) +EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index a64017602010..8f95fb267caa 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -11,13 +11,6 @@ .section .noinstr.text, "ax" /* - * We build a jump to memcpy_orig by default which gets NOPped out on - * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which - * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs - * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. - */ - -/* * memcpy - Copy a memory block. * * Input: @@ -27,17 +20,21 @@ * * Output: * rax original destination + * + * The FSRM alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep movsb' itself is small enough to replace the call, but the + * two register moves blow up the code. And one of them is "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. */ SYM_TYPED_FUNC_START(__memcpy) - ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memcpy_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM movq %rdi, %rax movq %rdx, %rcx - shrq $3, %rcx - andl $7, %edx - rep movsq - movl %edx, %ecx rep movsb RET SYM_FUNC_END(__memcpy) @@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) -/* - * memcpy_erms() - enhanced fast string memcpy. This is faster and - * simpler than memcpy. Use memcpy_erms when possible. - */ -SYM_FUNC_START_LOCAL(memcpy_erms) - movq %rdi, %rax - movq %rdx, %rcx - rep movsb - RET -SYM_FUNC_END(memcpy_erms) - SYM_FUNC_START_LOCAL(memcpy_orig) movq %rdi, %rax diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6143b1a6fa2c..7c59a704c458 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -18,27 +18,22 @@ * rdx count (bytes) * * rax original destination + * + * The FSRS alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep stosb' itself is small enough to replace the call, but all + * the register moves blow up the code. And two of them are "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. */ SYM_FUNC_START(__memset) - /* - * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended - * to use it when possible. If not available, use fast string instructions. - * - * Otherwise, use original memset function. - */ - ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memset_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS movq %rdi,%r9 + movb %sil,%al movq %rdx,%rcx - andl $7,%edx - shrq $3,%rcx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - imulq %rsi,%rax - rep stosq - movl %edx,%ecx rep stosb movq %r9,%rax RET @@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset) -/* - * ISO C memset - set a memory block to a byte value. This function uses - * enhanced rep stosb to override the fast string function. - * The code is simpler and shorter than the fast string function as well. - * - * rdi destination - * rsi value (char) - * rdx count (bytes) - * - * rax original destination - */ -SYM_FUNC_START_LOCAL(memset_erms) - movq %rdi,%r9 - movb %sil,%al - movq %rdx,%rcx - rep stosb - movq %r9,%rax - RET -SYM_FUNC_END(memset_erms) - SYM_FUNC_START_LOCAL(memset_orig) movq %rdi,%r10 diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 6c1f8ac5e721..c3a5bbc0b41e 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) { unsigned long flushed, dest = (unsigned long) dst; - long rc = __copy_user_nocache(dst, src, size, 0); + long rc; + + stac(); + rc = __copy_user_nocache(dst, src, size); + clac(); /* * __copy_user_nocache() uses non-temporal stores for the bulk diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 7316a8224259..e91500a80963 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -10,6 +10,7 @@ #include <asm/fixmap.h> #include <asm/desc.h> #include <asm/kasan.h> +#include <asm/setup.h> static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); @@ -29,6 +30,12 @@ static __init void init_cea_offsets(void) unsigned int max_cea; unsigned int i, j; + if (!kaslr_enabled()) { + for_each_possible_cpu(i) + per_cpu(_cea_offset, i) = i; + return; + } + max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE; /* O(sodding terrible) */ diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 88cccd65029d..c6efcf559d88 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -600,7 +600,8 @@ void __init sme_enable(struct boot_params *bp) cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | ((u64)bp->ext_cmd_line_ptr << 32)); - cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0) + return; if (!strncmp(buffer, cmdline_on, sizeof(buffer))) sme_me_mask = me_mask; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 615a76d70019..bf5161dcf89e 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -7,6 +7,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/amd_nb.h> #include <asm/hpet.h> #include <asm/pci_x86.h> @@ -824,3 +825,23 @@ static void rs690_fix_64bit_dma(struct pci_dev *pdev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma); #endif + +#ifdef CONFIG_AMD_NB + +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2 0x10136008 +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK 0x00000080L + +static void quirk_clear_strap_no_soft_reset_dev2_f0(struct pci_dev *dev) +{ + u32 data; + + if (!amd_smn_read(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, &data)) { + data &= ~AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK; + if (amd_smn_write(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, data)) + pci_err(dev, "Failed to write data 0x%x\n", data); + } else { + pci_err(dev, "Failed to read data\n"); + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b8, quirk_clear_strap_no_soft_reset_dev2_f0); +#endif diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 17f09dc26381..82fec66d46d2 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -69,8 +69,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_string.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3c5b52fbe4a7..a9ec8c9f5c5d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_PV_DOM0) += vga.o +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index bb59cc6ddb2d..093b78c8bbec 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1390,7 +1390,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) x86_platform.set_legacy_features = xen_dom0_set_legacy_features; - xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_init_vga(info, xen_start_info->console.dom0.info_size, + &boot_params.screen_info); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index bcae606bbc5c..ada3868c02c2 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -43,6 +43,19 @@ void __init xen_pvh_init(struct boot_params *boot_params) x86_init.oem.banner = xen_banner; xen_efi_init(boot_params); + + if (xen_initial_domain()) { + struct xen_platform_op op = { + .cmd = XENPF_get_dom0_console, + }; + int ret = HYPERVISOR_platform_op(&op); + + if (ret > 0) + xen_init_vga(&op.u.dom0_console, + min(ret * sizeof(char), + sizeof(op.u.dom0_console)), + &boot_params->screen_info); + } } void __init mem_map_via_hcall(struct boot_params *boot_params_p) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 1d597364b49d..b74ac2562cfb 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -20,6 +20,7 @@ #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <asm/xen/cpuid.h> #include <xen/events.h> #include <xen/features.h> @@ -503,11 +504,7 @@ static int __init xen_tsc_safe_clocksource(void) /* Leaf 4, sub-leaf 0 (0x40000x03) */ cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx); - /* tsc_mode = no_emulate (2) */ - if (ebx != 2) - return 0; - - return 1; + return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE; } static void __init xen_time_init(void) diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 14ea32e734d5..d97adab8420f 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -9,10 +9,9 @@ #include "xen-ops.h" -void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size, + struct screen_info *screen_info) { - struct screen_info *screen_info = &boot_params.screen_info; - /* This is drawn from a dump from vgacon:startup in * standard Linux. */ screen_info->orig_video_mode = 3; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a8bb972193d..a10903785a33 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -108,11 +108,12 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_PV_DOM0 -void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size, + struct screen_info *); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, - size_t size) + size_t size, struct screen_info *si) { } #endif |