Diffstat (limited to 'arch/x86')
97 files changed, 1619 insertions, 807 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f9920f1341c8..088af7c84e5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -107,6 +107,8 @@ config X86 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_CFI_CLANG if X86_64 + select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_USE_BUILTIN_BSWAP @@ -257,6 +259,7 @@ config X86 select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL select HAVE_PREEMPT_DYNAMIC_CALL select HAVE_RSEQ + select HAVE_RUST if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL select HAVE_UNSTABLE_SCHED_CLOCK @@ -284,6 +287,7 @@ config X86 select PROC_PID_ARCH_STATUS if PROC_FS select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI + select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE config INSTRUCTION_DECODER def_bool y @@ -448,6 +452,11 @@ config X86_X2APIC This allows 32-bit apic IDs (so it can support very large systems), and accesses the local apic via MSRs not via mmio. + Some Intel systems circa 2022 and later are locked into x2APIC mode + and can not fall back to the legacy APIC modes if SGX or TDX are + enabled in the BIOS. They will be unable to boot without enabling + this option. + If you don't know what to do here, say N. config X86_MPPARSE @@ -1919,7 +1928,7 @@ endchoice config X86_SGX bool "Software Guard eXtensions (SGX)" - depends on X86_64 && CPU_SUP_INTEL + depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC depends on CRYPTO=y depends on CRYPTO_SHA256=y select SRCU @@ -2569,7 +2578,7 @@ menuconfig APM 1) make sure that you have enough swap space and that it is enabled. - 2) pass the "no-hlt" option to the kernel + 2) pass the "idle=poll" option to the kernel 3) switch on floating point emulation in the kernel and pass the "no387" option to the kernel 4) pass the "floppy=nodma" option to the kernel diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7854685c5f25..2d7e640674c6 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -68,6 +68,7 @@ export BITS # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 ifeq ($(CONFIG_X86_KERNEL_IBT),y) # @@ -155,8 +156,17 @@ else cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic KBUILD_CFLAGS += $(cflags-y) + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona + rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 + rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom + rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic + KBUILD_RUSTFLAGS += $(rustflags-y) + KBUILD_CFLAGS += -mno-red-zone KBUILD_CFLAGS += -mcmodel=kernel + KBUILD_RUSTFLAGS += -Cno-redzone=y + KBUILD_RUSTFLAGS += -Ccode-model=kernel endif # @@ -286,10 +296,6 @@ vdso_install: archprepare: checkbin checkbin: -ifndef CONFIG_CC_HAS_ASM_GOTO - @echo Compiler lacks asm-goto support. - @exit 1 -endif ifdef CONFIG_RETPOLINE ifeq ($(RETPOLINE_CFLAGS),) @echo "You are building kernel with non-retpoline compiler." 
>&2 diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 02e1dea11d94..8518ae214c9b 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -19,13 +19,13 @@ static inline bool constant_test_bit(int nr, const void *addr) { - const u32 *p = (const u32 *)addr; + const u32 *p = addr; return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0; } static inline bool variable_test_bit(int nr, const void *addr) { bool v; - const u32 *p = (const u32 *)addr; + const u32 *p = addr; asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr)); return v; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 4910bf230d7b..62208ec04ca4 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -132,7 +132,17 @@ void snp_set_page_private(unsigned long paddr); void snp_set_page_shared(unsigned long paddr); void sev_prep_identity_maps(unsigned long top_level_pgt); #else -static inline void sev_enable(struct boot_params *bp) { } +static inline void sev_enable(struct boot_params *bp) +{ + /* + * bp->cc_blob_address should only be set by boot/compressed kernel. + * Initialize it to 0 unconditionally (thus here in this stub too) to + * ensure that uninitialized values from buggy bootloaders aren't + * propagated. + */ + if (bp) + bp->cc_blob_address = 0; +} static inline void sev_es_shutdown_ghcb(void) { } static inline bool sev_es_check_ghcb_fault(unsigned long address) { diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 52f989f6acc2..c93930d5ccbd 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -277,6 +277,14 @@ void sev_enable(struct boot_params *bp) bool snp; /* + * bp->cc_blob_address should only be set by boot/compressed kernel. + * Initialize it to 0 to ensure that uninitialized values from + * buggy bootloaders aren't propagated. + */ + if (bp) + bp->cc_blob_address = 0; + + /* * Setup/preliminary detection of SNP. This will be sanity-checked * against CPUID/MSR values later. */ diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index d9fc7139fd46..581296255b39 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -14,7 +14,6 @@ CONFIG_CPU_FREQ=y # x86 xen specific config options CONFIG_XEN_PVH=y -CONFIG_XEN_MAX_DOMAIN_MEMORY=500 CONFIG_XEN_SAVE_RESTORE=y # CONFIG_XEN_DEBUG_FS is not set CONFIG_XEN_MCE_LOG=y diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 802d71582689..4a43e072d2d1 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "blowfish-x86_64-asm.S" .text @@ -141,7 +142,7 @@ SYM_FUNC_START(__blowfish_enc_blk) RET; SYM_FUNC_END(__blowfish_enc_blk) -SYM_FUNC_START(blowfish_dec_blk) +SYM_TYPED_FUNC_START(blowfish_dec_blk) /* input: * %rdi: ctx * %rsi: dst @@ -332,7 +333,7 @@ SYM_FUNC_START(__blowfish_enc_blk_4way) RET; SYM_FUNC_END(__blowfish_enc_blk_4way) -SYM_FUNC_START(blowfish_dec_blk_4way) +SYM_TYPED_FUNC_START(blowfish_dec_blk_4way) /* input: * %rdi: ctx * %rsi: dst diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 682338e7e2a3..4dd19819053a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -311,7 +311,7 @@ SYM_CODE_START(entry_INT80_compat) * Interrupts are off on entry. 
*/ ASM_CLAC /* Do this early to minimize exposure */ - SWAPGS + ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV /* * User tracing code (ptrace or signal handlers) might assume that diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 12f6c4d714cd..381d3333b996 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO # @@ -153,6 +153,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..c20d8cd47c48 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2102,6 +2102,15 @@ static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +EVENT_ATTR_STR(mem-loads, mem_ld_grt, "event=0xd0,umask=0x5,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_grt, "event=0xd0,umask=0x6"); + +static struct attribute *grt_mem_attrs[] = { + EVENT_PTR(mem_ld_grt), + EVENT_PTR(mem_st_grt), + NULL +}; + static struct extra_reg intel_grt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), @@ -4052,8 +4061,9 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) /* Disable guest PEBS if host PEBS is enabled. */ arr[pebs_enable].guest = 0; } else { - /* Disable guest PEBS for cross-mapped PEBS counters. */ + /* Disable guest PEBS thoroughly for cross-mapped PEBS counters. 
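As an aside, a tiny userspace sketch of the masking arithmetic this core.c hunk adds, with made-up mask values: clearing the cross-mapped counters out of both the guest's PEBS_ENABLE and GLOBAL_CTRL views ensures a counter masked from PEBS cannot still count via GLOBAL_CTRL.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t pebs_guest   = 0x0f; /* guest PEBS_ENABLE view (hypothetical) */
        uint64_t global_guest = 0x0f; /* guest GLOBAL_CTRL view (hypothetical) */
        uint64_t cross_mapped = 0x05; /* host_cross_mapped_mask (hypothetical) */

        pebs_guest   &= ~cross_mapped; /* disable cross-mapped PEBS counters */
        global_guest &= ~cross_mapped; /* ... and their GLOBAL_CTRL bits too */
        global_guest |= pebs_guest;    /* keep bits for counters PEBS still uses */

        printf("pebs=%#llx global=%#llx\n",
               (unsigned long long)pebs_guest, (unsigned long long)global_guest);
        return 0;
}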
*/ arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask; /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ arr[global_ctrl].guest |= arr[pebs_enable].guest; } @@ -5974,6 +5984,36 @@ __init int intel_pmu_init(void) name = "Tremont"; break; + case INTEL_FAM6_ALDERLAKE_N: + x86_pmu.mid_ack = true; + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints; + x86_pmu.extra_regs = intel_grt_extra_regs; + + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + + intel_pmu_pebs_data_source_grt(); + x86_pmu.pebs_latency_data = adl_latency_data_small; + x86_pmu.get_event_constraints = tnt_get_event_constraints; + x86_pmu.limit_period = spr_limit_period; + td_attr = tnt_events_attrs; + mem_attr = grt_mem_attrs; + extra_attr = nhm_format_attr; + pr_cont("Gracemont events, "); + name = "gracemont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: @@ -6291,10 +6331,8 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; - x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.flags |= PMU_FL_INSTR_LATENCY; x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; @@ -6318,7 +6356,6 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: - case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: /* @@ -6337,10 +6374,8 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; - x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.flags |= PMU_FL_INSTR_LATENCY; x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; x86_pmu.lbr_pt_coexist = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba60427caa6d..ac973c6f82ad 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -110,13 +110,18 @@ void __init intel_pmu_pebs_data_source_skl(bool pmem) __intel_pmu_pebs_data_source_skl(pmem, pebs_data_source); } -static void __init intel_pmu_pebs_data_source_grt(u64 *data_source) +static void __init __intel_pmu_pebs_data_source_grt(u64 *data_source) { data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD); } +void __init intel_pmu_pebs_data_source_grt(void) +{ + __intel_pmu_pebs_data_source_grt(pebs_data_source); +} + void __init intel_pmu_pebs_data_source_adl(void) { u64 *data_source; @@ -127,7 +132,7 @@ void __init intel_pmu_pebs_data_source_adl(void) data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); - intel_pmu_pebs_data_source_grt(data_source); + __intel_pmu_pebs_data_source_grt(data_source); } static u64 
precise_store_data(u64 status) @@ -291,6 +296,7 @@ static u64 load_latency_data(struct perf_event *event, u64 status) static u64 store_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; + union perf_mem_data_src src; u64 val; dse.val = status; @@ -304,7 +310,14 @@ static u64 store_latency_data(struct perf_event *event, u64 status) val |= P(BLK, NA); - return val; + /* + * the pebs_data_source table is only for loads + * so override the mem_op to say STORE instead + */ + src.val = val; + src.mem_op = P(OP,STORE); + + return src.val; } struct pebs_record_core { @@ -822,7 +835,7 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { struct event_constraint intel_grt_pebs_event_constraints[] = { /* Allow all events as PEBS with no flags */ - INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf), + INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0x3), INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf), EVENT_CONSTRAINT_END }; @@ -2262,6 +2275,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_BRANCH_STACK | PERF_SAMPLE_TIME; x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.pebs_capable = ~0ULL; pebs_qual = "-baseline"; x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; } else { diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4f70fb6c2c1e..47fca6a7a8bc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1097,6 +1097,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { reg->config = mask; + + /* + * The Arch LBR HW can retrieve the common branch types + * from the LBR_INFO. It doesn't require the high overhead + * SW disassemble. + * Enable the branch type by default for the Arch LBR. + */ + reg->reg |= X86_BR_TYPE_SAVE; return 0; } diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index ce440011cc4e..1ef4f7861e2e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -841,6 +841,22 @@ int snb_pci2phy_map_init(int devid) return 0; } +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * SNB IMC counters are 32-bit and are laid out back to back + * in MMIO space. Therefore we must use a 32-bit accessor function + * using readq() from uncore_mmio_read_counter() causes problems + * because it is reading 64-bit at a time. This is okay for the + * uncore_perf_event_update() function because it drops the upper + * 32-bits but not okay for plain uncore_read_counter() as invoked + * in uncore_pmu_event_start(). 
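A minimal little-endian userspace sketch (counter values made up) of why a 64-bit read is wrong for this layout: with two 32-bit counters back to back, a readq() pulls the neighbouring counter into the high half instead of zeroes, whereas a readl() widened to u64 is safe.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint32_t mmio[2] = { 0x11111111u, 0x22222222u }; /* adjacent counters */
        uint64_t wide;

        memcpy(&wide, mmio, sizeof(wide)); /* what a readq() would fetch */
        printf("64-bit load: %#018llx (neighbour in the high half)\n",
               (unsigned long long)wide);
        printf("32-bit load: %#018llx\n", (unsigned long long)mmio[0]);
        return 0;
}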
+ */ + return (u64)readl(box->io_addr + hwc->event_base); +} + static struct pmu snb_uncore_imc_pmu = { .task_ctx_nr = perf_invalid_context, .event_init = snb_uncore_imc_event_init, @@ -860,7 +876,7 @@ static struct intel_uncore_ops snb_uncore_imc_ops = { .disable_event = snb_uncore_imc_disable_event, .enable_event = snb_uncore_imc_enable_event, .hw_config = snb_uncore_imc_hw_config, - .read_counter = uncore_mmio_read_counter, + .read_counter = snb_uncore_imc_read_counter, }; static struct intel_uncore_type snb_uncore_imc = { diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ba3d24a6a4ec..266143abcbd8 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1516,6 +1516,8 @@ void intel_pmu_pebs_data_source_skl(bool pmem); void intel_pmu_pebs_data_source_adl(void); +void intel_pmu_pebs_data_source_grt(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h index e003a01b7c67..1dd14381bcb6 100644 --- a/arch/x86/include/asm/acrn.h +++ b/arch/x86/include/asm/acrn.h @@ -10,6 +10,15 @@ /* Bit 0 indicates whether guest VM is privileged */ #define ACRN_FEATURE_PRIVILEGED_VM BIT(0) +/* + * Timing Information. + * This leaf returns the current TSC frequency in kHz. + * + * EAX: (Virtual) TSC frequency in kHz. + * EBX, ECX, EDX: RESERVED (reserved fields are set to zero). + */ +#define ACRN_CPUID_TIMING_INFO 0x40000010 + void acrn_setup_intr_handler(void (*handler)(void)); void acrn_remove_intr_handler(void); @@ -21,6 +30,11 @@ static inline u32 acrn_cpuid_base(void) return 0; } +static inline unsigned long acrn_get_tsc_khz(void) +{ + return cpuid_eax(ACRN_CPUID_TIMING_INFO); +} + /* * Hypercalls for ACRN * diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 973c6bd17f98..2edf68475fec 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } +static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) +{ + bool oldbit; + + asm volatile("testb %2,%1" + CC_SET(nz) + : CC_OUT(nz) (oldbit) + : "m" (((unsigned char *)addr)[nr >> 3]), + "i" (1 << (nr & 7)) + :"memory"); + + return oldbit; +} + static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; @@ -226,17 +240,37 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr) variable_test_bit(nr, addr); } +static __always_inline bool +arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : + variable_test_bit(nr, addr); +} + +static __always_inline unsigned long variable__ffs(unsigned long word) +{ + asm("rep; bsf %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + /** * __ffs - find first set bit in word * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +#define __ffs(word) \ + (__builtin_constant_p(word) ? 
\ + (unsigned long)__builtin_ctzl(word) : \ + variable__ffs(word)) + +static __always_inline unsigned long variable_ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) - : "rm" (word)); + : "r" (~word)); return word; } @@ -246,13 +280,10 @@ static __always_inline unsigned long __ffs(unsigned long word) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static __always_inline unsigned long ffz(unsigned long word) -{ - asm("rep; bsf %1,%0" - : "=r" (word) - : "r" (~word)); - return word; -} +#define ffz(word) \ + (__builtin_constant_p(word) ? \ + (unsigned long)__builtin_ctzl(~word) : \ + variable_ffz(word)) /* * __fls: find last set bit in word @@ -271,18 +302,7 @@ static __always_inline unsigned long __fls(unsigned long word) #undef ADDR #ifdef __KERNEL__ -/** - * ffs - find first set bit in word - * @x: the word to search - * - * This is defined the same way as the libc and compiler builtin ffs - * routines, therefore differs in spirit from the other bitops. - * - * ffs(value) returns 0 if value is 0 or the position of the first - * set bit if value is nonzero. The first (least significant) bit - * is at position 1. - */ -static __always_inline int ffs(int x) +static __always_inline int variable_ffs(int x) { int r; @@ -313,6 +333,19 @@ static __always_inline int ffs(int x) } /** + * ffs - find first set bit in word + * @x: the word to search + * + * This is defined the same way as the libc and compiler builtin ffs + * routines, therefore differs in spirit from the other bitops. + * + * ffs(value) returns 0 if value is 0 or the position of the first + * set bit if value is nonzero. The first (least significant) bit + * is at position 1. + */ +#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) + +/** * fls - find last set bit in word * @x: the word to search * diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h new file mode 100644 index 000000000000..58dacd90daef --- /dev/null +++ b/arch/x86/include/asm/cfi.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CFI_H +#define _ASM_X86_CFI_H + +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2022 Google LLC + */ + +#include <linux/cfi.h> + +#ifdef CONFIG_CFI_CLANG +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); +#else +static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + return BUG_TRAP_TYPE_NONE; +} +#endif /* CONFIG_CFI_CLANG */ + +#endif /* _ASM_X86_CFI_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 8cbf623f0ecf..b472ef76826a 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -94,4 +94,6 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, return p1 & p2; } +extern u64 x86_read_arch_cap_msr(void); + #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ea34cc31b047..1a85e1fb0922 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -155,20 +155,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO) - -/* - * Workaround for the sake of BPF compilation which utilizes kernel - * headers, but clang does not support ASM GOTO and fails the build. - */ -#ifndef __BPF_TRACING__ -#warning "Compiler lacks ASM_GOTO support. 
Add -D __BPF_TRACING__ to your compiler arguments" -#endif - -#define static_cpu_has(bit) boot_cpu_has(bit) - -#else - /* * Static testing of CPU features. Used the same as boot_cpu_has(). It * statically patches the target code for additional performance. Use @@ -208,7 +194,6 @@ t_no: boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) -#endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 235dc85c91c3..ef4775c6db01 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -457,7 +457,8 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 503622627400..991e31cfde94 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -64,4 +64,6 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) +#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + #endif diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 689880eca9ba..9b08082a5d9f 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -31,6 +31,16 @@ #define __noendbr __attribute__((nocf_check)) +/* + * Create a dummy function pointer reference to prevent objtool from marking + * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by + * apply_ibt_endbr()). 
+ */ +#define IBT_NOSEAL(fname) \ + ".pushsection .discard.ibt_endbr_noseal\n\t" \ + _ASM_PTR fname "\n\t" \ + ".popsection\n\t" + static inline __attribute_const__ u32 gen_endbr(void) { u32 endbr; @@ -84,6 +94,7 @@ extern __noendbr void ibt_restore(u64 save); #ifndef __ASSEMBLY__ #define ASM_ENDBR +#define IBT_NOSEAL(name) #define __noendbr diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index def6ca121111..5d75fe229342 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -27,6 +27,7 @@ * _X - regular server parts * _D - micro server parts * _N,_P - other mobile parts + * _S - other client parts * * Historical OPTDIFFs: * @@ -112,6 +113,10 @@ #define INTEL_FAM6_RAPTORLAKE 0xB7 #define INTEL_FAM6_RAPTORLAKE_P 0xBA +#define INTEL_FAM6_RAPTORLAKE_S 0xBF + +#define INTEL_FAM6_METEORLAKE 0xAC +#define INTEL_FAM6_METEORLAKE_L 0xAA /* "Small Core" Processors (Atom) */ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 63f818aedf77..147cb8fdda92 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -203,7 +203,7 @@ IRQ_CONSTRAINTS, regs, vector); \ } -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK /* * Macro to invoke __do_softirq on the irq stack. This is only called from * task context when bottom halves are about to be reenabled and soft diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5ffa578cafe1..aa381ab69a19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -53,7 +53,7 @@ #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) /* memory slots that are not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 3 +#define KVM_INTERNAL_MEM_SLOTS 3 #define KVM_HALT_POLL_NS_DEFAULT 200000 @@ -729,6 +729,7 @@ struct kvm_vcpu_arch { struct fpu_guest guest_fpu; u64 xcr0; + u64 guest_supported_xcr0; struct kvm_pio_request pio; void *pio_data; diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 73ca20049835..f484d656d34e 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -43,6 +43,18 @@ #endif /* __ASSEMBLY__ */ +#define __CFI_TYPE(name) \ + SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \ + .fill 11, 1, 0x90 ASM_NL \ + .byte 0xb8 ASM_NL \ + .long __kcfi_typeid_##name ASM_NL \ + SYM_FUNC_END(__cfi_##name) + +/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ +#define SYM_TYPED_FUNC_START(name) \ + SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + ENDBR + /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 97198001e567..6115bb3d5795 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -95,7 +95,7 @@ static inline unsigned char current_lock_cmos_reg(void) unsigned char rtc_cmos_read(unsigned char addr); void rtc_cmos_write(unsigned char val, unsigned char addr); -extern int mach_set_rtc_mmss(const struct timespec64 *now); +extern int mach_set_cmos_time(const struct timespec64 *now); extern void mach_get_cmos_time(struct timespec64 *now); #define RTC_IRQ 8 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index cc73061e7255..6e986088817d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,7 @@ #define 
MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */ #define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38) #define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT) +#define MCI_STATUS_MSCOD(m) (((m) >> 16) & 0xffff) /* AMD-specific bits */ #define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 0c3d3440fe27..74ecc2bd6cd0 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -9,6 +9,7 @@ struct ucode_patch { struct list_head plist; void *data; /* Intel uses only this one */ + unsigned int size; u32 patch_id; u16 equiv_cpu; }; @@ -32,9 +33,6 @@ enum ucode_state { }; struct microcode_ops { - enum ucode_state (*request_microcode_user) (int cpu, - const void __user *buf, size_t size); - enum ucode_state (*request_microcode_fw) (int cpu, struct device *, bool refresh_fw); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6674bdb096f3..1e086b37a307 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -155,6 +155,11 @@ * Return Stack Buffer Predictions. */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ + #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* * Writeback and invalidate the @@ -1054,4 +1059,12 @@ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 +/* x2APIC locked status */ +#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD +#define LEGACY_XAPIC_DISABLED BIT(0) /* + * x2APIC mode is locked and + * disabling x2APIC will cause + * a #GP + */ + #endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e64fd20778b6..c936ce9f0c47 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -35,33 +35,56 @@ #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* + * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN. + */ +#define __FILL_RETURN_SLOT \ + ANNOTATE_INTRA_FUNCTION_CALL; \ + call 772f; \ + int3; \ +772: + +/* + * Stuff the entire RSB. + * * Google experimented with loop-unrolling and this turned out to be * the optimal version - two calls, each with their own speculation * trap should their return address end up getting used, in a loop. */ -#define __FILL_RETURN_BUFFER(reg, nr, sp) \ - mov $(nr/2), reg; \ -771: \ - ANNOTATE_INTRA_FUNCTION_CALL; \ - call 772f; \ -773: /* speculation trap */ \ - UNWIND_HINT_EMPTY; \ - pause; \ - lfence; \ - jmp 773b; \ -772: \ - ANNOTATE_INTRA_FUNCTION_CALL; \ - call 774f; \ -775: /* speculation trap */ \ - UNWIND_HINT_EMPTY; \ - pause; \ - lfence; \ - jmp 775b; \ -774: \ - add $(BITS_PER_LONG/8) * 2, sp; \ - dec reg; \ - jnz 771b; \ - /* barrier for jnz misprediction */ \ +#ifdef CONFIG_X86_64 +#define __FILL_RETURN_BUFFER(reg, nr) \ + mov $(nr/2), reg; \ +771: \ + __FILL_RETURN_SLOT \ + __FILL_RETURN_SLOT \ + add $(BITS_PER_LONG/8) * 2, %_ASM_SP; \ + dec reg; \ + jnz 771b; \ + /* barrier for jnz misprediction */ \ + lfence; +#else +/* + * i386 doesn't unconditionally have LFENCE, as such it can't + * do a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr) \ + .rept nr; \ + __FILL_RETURN_SLOT; \ + .endr; \ + add $(BITS_PER_LONG/8) * nr, %_ASM_SP; +#endif + +/* + * Stuff a single RSB slot. 
+ * + * To mitigate Post-Barrier RSB speculation, one CALL instruction must be + * forced to retire before letting a RET instruction execute. + * + * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed + * before this point. + */ +#define __FILL_ONE_RETURN \ + __FILL_RETURN_SLOT \ + add $(BITS_PER_LONG/8), %_ASM_SP; \ lfence; #ifdef __ASSEMBLY__ @@ -132,28 +155,15 @@ #endif .endm -.macro ISSUE_UNBALANCED_RET_GUARD - ANNOTATE_INTRA_FUNCTION_CALL - call .Lunbalanced_ret_guard_\@ - int3 -.Lunbalanced_ret_guard_\@: - add $(BITS_PER_LONG/8), %_ASM_SP - lfence -.endm - /* * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP * monstrosity above, manually. */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2 -.ifb \ftr2 - ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr -.else - ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2 -.endif - __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) -.Lunbalanced_\@: - ISSUE_UNBALANCED_RET_GUARD +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS) + ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \ + __stringify(__FILL_ONE_RETURN), \ftr2 + .Lskip_rsb_\@: .endm diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 964442b99245..2a0b8dd4ec33 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -743,6 +743,7 @@ extern void default_banner(void); word 771b; \ .byte ptype; \ .byte 772b-771b; \ + _ASM_ALIGN; \ .popsection diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 89df6c6617f5..f3d601574730 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -294,6 +294,7 @@ extern struct paravirt_patch_template pv_ops; " .byte " type "\n" \ " .byte 772b-771b\n" \ " .short " clobber "\n" \ + _ASM_ALIGN "\n" \ ".popsection\n" /* Generate patchable code, with the default asm parameters. */ @@ -328,7 +329,7 @@ int paravirt_disable_iospace(void); * Unfortunately, this is a relatively slow operation for modern CPUs, * because it cannot necessarily determine what the destination * address is. In this case, the address is a runtime constant, so at - * the very least we can patch the call to e a simple direct call, or + * the very least we can patch the call to a simple direct call, or, * ideally, patch an inline implementation into the callsite. (Direct * calls are essentially free, because the call and return addresses * are completely predictable.) @@ -339,10 +340,10 @@ int paravirt_disable_iospace(void); * on the stack. All caller-save registers (eax,edx,ecx) are expected * to be modified (either clobbered or used for return values). * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameters going on %rdi, %rsi, + * conventions, returning at %rax, with parameters going in %rdi, %rsi, * %rdx, and %rcx. Note that for this reason, x86_64 does not need any * special handling for dealing with 4 arguments, unlike i386. - * However, x86_64 also have to clobber all caller saved registers, which + * However, x86_64 also has to clobber all caller saved registers, which * unfortunately, are quite a bit (r8 - r11) * * The call instruction itself is marked by placing its start address @@ -360,22 +361,22 @@ int paravirt_disable_iospace(void); * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. 
* It could be extended to more arguments, but there would be little * to be gained from that. For each number of arguments, there are - * the two VCALL and CALL variants for void and non-void functions. + * two VCALL and CALL variants for void and non-void functions. * * When there is a return value, the invoker of the macro must specify * the return type. The macro then uses sizeof() on that type to - * determine whether its a 32 or 64 bit value, and places the return + * determine whether it's a 32 or 64 bit value and places the return * in the right register(s) (just %eax for 32-bit, and %edx:%eax for - * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * 64-bit). For x86_64 machines, it just returns in %rax regardless of * the return value size. * - * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments; * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments * in low,high order * * Small structures are passed and returned in registers. The macro * calling convention can't directly deal with this, so the wrapper - * functions must do this. + * functions must do it. * * These PVOP_* macros are only defined within this header. This * means that all uses must be wrapped in inline functions. This also @@ -414,8 +415,17 @@ int paravirt_disable_iospace(void); "=c" (__ecx) #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) -/* void functions are still allowed [re]ax for scratch */ +/* + * void functions are still allowed [re]ax for scratch. + * + * The ZERO_CALL_USED REGS feature may end up zeroing out callee-saved + * registers. Make sure we model this with the appropriate clobbers. + */ +#ifdef CONFIG_ZERO_CALL_USED_REGS +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), PVOP_VCALL_CLOBBERS +#else #define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#endif #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS #define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index d60ed0668a59..d24b04ebf950 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -81,6 +81,15 @@ static void __resctrl_sched_in(void) } } +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + unsigned int scale = boot_cpu_data.x86_cache_occ_scale; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + val /= scale; + return val * scale; +} + static inline void resctrl_sched_in(void) { if (static_branch_likely(&rdt_enable_key)) diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 8a9eba191516..7fa611216417 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -11,7 +11,7 @@ #define __CLOBBERS_MEM(clb...) 
"memory", ## clb -#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO) +#ifndef __GCC_ASM_FLAG_OUTPUTS__ /* Use asm goto */ @@ -27,7 +27,7 @@ cc_label: c = true; \ c; \ }) -#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) */ /* Use flags output or a set instruction */ @@ -40,7 +40,7 @@ cc_label: c = true; \ c; \ }) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) */ #define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 4a23e52fe0ee..ebc271bb6d8e 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -195,7 +195,7 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void snp_abort(void); +void __init __noreturn snp_abort(void); int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 81a0211a372d..a73bced40e24 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -21,16 +21,6 @@ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); -static inline struct cpumask *cpu_llc_shared_mask(int cpu) -{ - return per_cpu(cpu_llc_shared_map, cpu); -} - -static inline struct cpumask *cpu_l2c_shared_mask(int cpu) -{ - return per_cpu(cpu_l2c_shared_map, cpu); -} - DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); @@ -172,6 +162,16 @@ extern int safe_smp_processor_id(void); # define safe_smp_processor_id() smp_processor_id() #endif +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return per_cpu(cpu_llc_shared_map, cpu); +} + +static inline struct cpumask *cpu_l2c_shared_mask(int cpu) +{ + return per_cpu(cpu_l2c_shared_map, cpu); +} + #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() static inline int wbinvd_on_all_cpus(void) @@ -179,6 +179,11 @@ static inline int wbinvd_on_all_cpus(void) wbinvd(); return 0; } + +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return (struct cpumask *)cpumask_of(0); +} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 913e593a3b45..1ec6a9ea2328 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -448,7 +448,7 @@ do { \ #ifdef CONFIG_X86_32 /* - * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error. + * Unlike the normal CMPXCHG, use output GPR for both success/fail and error. * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are * hardcoded by CMPXCHG8B, leaving only ESI and EDI. 
If the compiler uses * both ESI and EDI for the memory operand, compilation will fail if the error @@ -461,11 +461,12 @@ do { \ __typeof__(*(_ptr)) __new = (_new); \ asm volatile("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ - "mov $0, %%ecx\n\t" \ - "setz %%cl\n" \ + "mov $0, %[result]\n\t" \ + "setz %b[result]\n" \ "2:\n" \ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \ - : [result]"=c" (__result), \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \ + %[result]) \ + : [result] "=q" (__result), \ "+A" (__old), \ [ptr] "+m" (*_ptr) \ : "b" ((u32)__new), \ @@ -502,9 +503,6 @@ strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); -unsigned long __must_check clear_user(void __user *mem, unsigned long len); -unsigned long __must_check __clear_user(void __user *mem, unsigned long len); - #ifdef CONFIG_ARCH_HAS_COPY_MC unsigned long __must_check copy_mc_to_kernel(void *to, const void *from, unsigned len); @@ -526,6 +524,8 @@ extern struct movsl_mask { #define ARCH_HAS_NOCACHE_UACCESS 1 #ifdef CONFIG_X86_32 +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); # include <asm/uaccess_32.h> #else # include <asm/uaccess_64.h> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 45697e04d771..d13d71af5cf6 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -79,4 +79,49 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) kasan_check_write(dst, size); return __copy_user_flushcache(dst, src, size); } + +/* + * Zero Userspace. + */ + +__must_check unsigned long +clear_user_original(void __user *addr, unsigned long len); +__must_check unsigned long +clear_user_rep_good(void __user *addr, unsigned long len); +__must_check unsigned long +clear_user_erms(void __user *addr, unsigned long len); + +static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) +{ + might_fault(); + stac(); + + /* + * No memory constraint because it doesn't change any memory gcc + * knows about. + */ + asm volatile( + "1:\n\t" + ALTERNATIVE_3("rep stosb", + "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM), + "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS), + "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT + : "a" (0) + /* rep_good clobbers %rdx */ + : "rdx"); + + clac(); + + return size; +} + +static __always_inline unsigned long clear_user(void __user *to, unsigned long n) +{ + if (access_ok(to, n)) + return __clear_user(to, n); + return n; +} #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index 8338b0432b50..46b4f1f7f354 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -77,58 +77,18 @@ static inline unsigned long find_zero(unsigned long mask) * and the next page not being mapped, take the exception and * return zeroes in the non-existing part. 
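For reference, a userspace model (little-endian assumed, helper name hypothetical) of the arithmetic the new EX_TYPE_ZEROPAD fixup performs: re-read from the aligned word, then shift so the bytes that did exist land at the bottom and the missing high bytes read as zero.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static unsigned long zeropad(const unsigned char *base, unsigned long addr)
{
        unsigned long offset = addr & (sizeof(long) - 1);
        unsigned long data;

        memcpy(&data, base + (addr - offset), sizeof(data)); /* aligned load */
        return data >> (offset * 8); /* shifted-in high bytes are zero */
}

int main(void)
{
        unsigned char buf[16] = "abcdefgh"; /* pretend the page ends after 'h' */

        printf("%#lx\n", zeropad(buf, 5)); /* 'f','g','h' then zero padding */
        return 0;
}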
*/ -#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT - static inline unsigned long load_unaligned_zeropad(const void *addr) { - unsigned long offset, data; unsigned long ret; - asm_volatile_goto( + asm volatile( "1: mov %[mem], %[ret]\n" - - _ASM_EXTABLE(1b, %l[do_exception]) - - : [ret] "=r" (ret) - : [mem] "m" (*(unsigned long *)addr) - : : do_exception); - - return ret; - -do_exception: - offset = (unsigned long)addr & (sizeof(long) - 1); - addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); - data = *(unsigned long *)addr; - ret = data >> offset * 8; - - return ret; -} - -#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - -static inline unsigned long load_unaligned_zeropad(const void *addr) -{ - unsigned long offset, data; - unsigned long ret, err = 0; - - asm( "1: mov %[mem], %[ret]\n" "2:\n" - - _ASM_EXTABLE_FAULT(1b, 2b) - - : [ret] "=&r" (ret), "+a" (err) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD) + : [ret] "=r" (ret) : [mem] "m" (*(unsigned long *)addr)); - if (unlikely(err)) { - offset = (unsigned long)addr & (sizeof(long) - 1); - addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); - data = *(unsigned long *)addr; - ret = data >> offset * 8; - } - return ret; } -#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - #endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a20a5ebfacd7..1286a73ebdbc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -139,6 +139,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o +obj-$(CONFIG_CFI_CLANG) += cfi.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 62f6b8b7c4a5..5cadcea035e0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -453,6 +453,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) return ret; i += ret; + /* + * The compiler is supposed to EMIT an INT3 after every unconditional + * JMP instruction due to AMD BTC. However, if the compiler is too old + * or SLS isn't enabled, we still need an INT3 after indirect JMPs + * even on Intel. 
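A small sketch of how the padding decision in this alternative.c hunk plays out (opcode values per the x86 encoding; sizes made up): one INT3 directly after an unconditional JMP, NOPs for the rest of the slot.

#include <stdio.h>
#include <stddef.h>

#define JMP32_OPCODE 0xe9
#define INT3_OPCODE  0xcc
#define NOP1         0x90

static void pad_slot(unsigned char op, unsigned char *bytes, size_t i, size_t len)
{
        if (op == JMP32_OPCODE && i < len)
                bytes[i++] = INT3_OPCODE; /* speculation stop after the JMP */
        while (i < len)
                bytes[i++] = NOP1;        /* fill the remainder with NOPs */
}

int main(void)
{
        unsigned char slot[6] = { JMP32_OPCODE, 0, 0, 0, 0, 0 };

        pad_slot(slot[0], slot, 5, sizeof(slot)); /* 5-byte jmp rel32 + 1 pad */
        printf("pad byte: %#x\n", slot[5]);       /* 0xcc */
        return 0;
}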
+ */ + if (op == JMP32_INSN_OPCODE && i < insn->length) + bytes[i++] = INT3_INSN_OPCODE; + for (; i < insn->length;) bytes[i++] = BYTES_NOP1; @@ -1319,22 +1328,23 @@ struct bp_patching_desc { atomic_t refs; }; -static struct bp_patching_desc *bp_desc; +static struct bp_patching_desc bp_desc; static __always_inline -struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +struct bp_patching_desc *try_get_desc(void) { - /* rcu_dereference */ - struct bp_patching_desc *desc = __READ_ONCE(*descp); + struct bp_patching_desc *desc = &bp_desc; - if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) + if (!arch_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } -static __always_inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_desc(void) { + struct bp_patching_desc *desc = &bp_desc; + smp_mb__before_atomic(); arch_atomic_dec(&desc->refs); } @@ -1367,15 +1377,15 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc: + * bp_desc with non-zero refcount: * - * bp_desc = desc INT3 + * bp_desc.refs = 1 INT3 * WMB RMB - * write INT3 if (desc) + * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); - desc = try_get_desc(&bp_desc); + desc = try_get_desc(); if (!desc) return 0; @@ -1429,7 +1439,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(desc); + put_desc(); return ret; } @@ -1460,18 +1470,20 @@ static int tp_vec_nr; */ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - struct bp_patching_desc desc = { - .vec = tp, - .nr_entries = nr_entries, - .refs = ATOMIC_INIT(1), - }; unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); - smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ + bp_desc.vec = tp; + bp_desc.nr_entries = nr_entries; + + /* + * Corresponds to the implicit memory barrier in try_get_desc() to + * ensure reading a non-zero refcount provides up to date bp_desc data. + */ + atomic_set_release(&bp_desc.refs, 1); /* * Corresponding read barrier in int3 notifier for making sure the @@ -1559,12 +1571,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries text_poke_sync(); /* - * Remove and synchronize_rcu(), except we have a very primitive - * refcount based completion. + * Remove and wait for refs to be zero. */ - WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ - if (!atomic_dec_and_test(&desc.refs)) - atomic_cond_read_acquire(&desc.refs, !VAL); + if (!atomic_dec_and_test(&bp_desc.refs)) + atomic_cond_read_acquire(&bp_desc.refs, !VAL); } static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 194d54eed537..19a0207e529f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -53,7 +53,7 @@ static u32 *iommu_gatt_base; /* Remapping table */ * of only flushing when an mapping is reused. With it true the GART is * flushed for every mapping. Problem is that doing the lazy flush seems * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). + * has been also seen with Qlogic at least). 
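Returning to the text_poke_bp_batch() change above: the try_get_desc()/put_desc() pair is essentially atomic_inc_not_zero() used as a primitive completion. A C11-atomics userspace model of that pattern (names hypothetical):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int refs; /* stands in for bp_desc.refs */

static bool try_get(void)
{
        int old = atomic_load(&refs);

        while (old) {
                if (atomic_compare_exchange_weak(&refs, &old, old + 1))
                        return true; /* armed: bp_desc contents are stable */
        }
        return false;                /* zero: no patching in progress */
}

int main(void)
{
        printf("unarmed: %d\n", try_get());                    /* 0 */
        atomic_store_explicit(&refs, 1, memory_order_release); /* arm */
        printf("armed:   %d\n", try_get());                    /* 1 */
        atomic_fetch_sub(&refs, 2);                            /* drop both refs */
        return 0;
}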
*/ static int iommu_fullflush = 1; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 7a5630d904b2..4feaa670d578 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -36,7 +36,7 @@ /* * Using 512M as goal, in case kexec will load kernel_big * that will do the on-position decompress, and could overlap with - * with the gart aperture that is used. + * the gart aperture that is used. * Sequence: * kernel_small * ==> kexec (with kdump trigger path or gart still enabled) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6d303d1d276c..c6876d3ea4b1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,6 +61,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include <asm/irq_regs.h> +#include <asm/cpu.h> unsigned int num_processors; @@ -1751,11 +1752,26 @@ EXPORT_SYMBOL_GPL(x2apic_mode); enum { X2APIC_OFF, - X2APIC_ON, X2APIC_DISABLED, + /* All states below here have X2APIC enabled */ + X2APIC_ON, + X2APIC_ON_LOCKED }; static int x2apic_state; +static bool x2apic_hw_locked(void) +{ + u64 ia32_cap; + u64 msr; + + ia32_cap = x86_read_arch_cap_msr(); + if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + return (msr & LEGACY_XAPIC_DISABLED); + } + return false; +} + static void __x2apic_disable(void) { u64 msr; @@ -1793,6 +1809,10 @@ static int __init setup_nox2apic(char *str) apicid); return 0; } + if (x2apic_hw_locked()) { + pr_warn("APIC locked in x2apic mode, can't disable\n"); + return 0; + } pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } @@ -1807,10 +1827,18 @@ early_param("nox2apic", setup_nox2apic); void x2apic_setup(void) { /* - * If x2apic is not in ON state, disable it if already enabled + * Try to make the AP's APIC state match that of the BSP, but if the + * BSP is unlocked and the AP is locked then there is a state mismatch. + * Warn about the mismatch in case a GP fault occurs due to a locked AP + * trying to be turned off. + */ + if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked()) + pr_warn("x2apic lock mismatch between BSP and AP.\n"); + /* + * If x2apic is not in ON or LOCKED state, disable it if already enabled * from BIOS. */ - if (x2apic_state != X2APIC_ON) { + if (x2apic_state < X2APIC_ON) { __x2apic_disable(); return; } @@ -1831,6 +1859,11 @@ static __init void x2apic_disable(void) if (x2apic_id >= 255) panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + if (x2apic_hw_locked()) { + pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id); + return; + } + __x2apic_disable(); register_lapic_address(mp_lapic_addr); } @@ -1889,7 +1922,10 @@ void __init check_x2apic(void) if (x2apic_enabled()) { pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; - x2apic_state = X2APIC_ON; + if (x2apic_hw_locked()) + x2apic_state = X2APIC_ON_LOCKED; + else + x2apic_state = X2APIC_ON; } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c new file mode 100644 index 000000000000..8674a5c0c031 --- /dev/null +++ b/arch/x86/kernel/cfi.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2022 Google LLC + */ +#include <asm/cfi.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <linux/string.h> + +/* + * Returns the target address and the expected type when regs->ip points + * to a compiler-generated CFI trap. 
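A quick self-contained check of the encoding convention the decoder below relies on: the compiler stores the negated type hash as the movl immediate, so negating the decoded immediate (modular 32-bit arithmetic) recovers the expected type. The hash value here is made up.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t type = 0x7b8f69dau; /* hypothetical KCFI type hash */
        uint32_t imm  = -type;       /* what the movl encodes */

        printf("imm=%#010x recovered=%#010x match=%d\n",
               imm, (uint32_t)-imm, (uint32_t)-imm == type);
        return 0;
}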
+ */ +static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, + u32 *type) +{ + char buffer[MAX_INSN_SIZE]; + struct insn insn; + int offset = 0; + + *target = *type = 0; + + /* + * The compiler generates the following instruction sequence + * for indirect call checks: + * + * Â movl -<id>, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * We can decode the expected type and the target address from the + * movl/addl instructions. + */ + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0xBA) + return false; + + *type = -(u32)insn.immediate.value; + + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0x3) + return false; + + /* Read the target address from the register. */ + offset = insn_get_modrm_rm_off(&insn, regs); + if (offset < 0) + return false; + + *target = *(unsigned long *)((void *)regs + offset); + + return true; +} + +/* + * Checks if a ud2 trap is because of a CFI failure, and handles the trap + * if needed. Returns a bug_trap_type value similarly to report_bug. + */ +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + unsigned long target; + u32 type; + + if (!is_cfi_trap(regs->ip)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, regs->ip); + + return report_cfi_failure(regs, regs->ip, &target, type); +} + +/* + * Ensure that __kcfi_typeid_ symbols are emitted for functions that may + * not be indirectly called with all configurations. 
+ */ +__ADDRESSABLE(__memcpy) diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 23f5f27b5a02..485441b7f030 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -28,6 +28,9 @@ static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + + x86_platform.calibrate_tsc = acrn_get_tsc_khz; + x86_platform.calibrate_cpu = acrn_get_tsc_khz; } static bool acrn_x2apic_available(void) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 510d85261132..da7c361f47e0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -433,7 +433,8 @@ static void __init mmio_select_mitigation(void) u64 ia32_cap; if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - cpu_mitigations_off()) { + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } @@ -538,6 +539,8 @@ out: pr_info("TAA: %s\n", taa_strings[taa_mitigation]); if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); } static void __init md_clear_select_mitigation(void) @@ -2275,6 +2278,9 @@ static ssize_t tsx_async_abort_show_state(char *buf) static ssize_t mmio_stale_data_show_state(char *buf) { + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2421,6 +2427,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: @@ -2480,7 +2487,10 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 64a73f415f03..3e508f239098 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1135,7 +1135,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) -#define NO_EIBRS_PBRSB BIT(9) +#define NO_MMIO BIT(9) +#define NO_EIBRS_PBRSB BIT(10) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1158,6 +1159,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1176,9 
+1182,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1193,18 +1199,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), {} }; @@ -1358,10 +1364,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Affected CPU list is generally enough to enumerate the vulnerability, * but for virtualization case check for ARCH_CAP MSR bits also, VMM may * not want the guest to enumerate the bug. + * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. 
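The whitelist entries above feed the new three-way decision in cpu_set_bug_bits() just below: ARCH_CAP-immune CPUs are skipped, blacklisted CPUs get X86_BUG_MMIO_STALE_DATA, and CPUs that are neither blacklisted nor whitelisted get X86_BUG_MMIO_UNKNOWN. A standalone sketch of that logic, with stand-in booleans replacing cpu_matches() and arch_cap_mmio_immune():

#include <stdbool.h>
#include <stdio.h>

enum mmio_status { MMIO_IMMUNE, MMIO_STALE_DATA, MMIO_UNKNOWN };

static enum mmio_status classify_mmio(bool arch_cap_immune,
				      bool in_blacklist, bool in_whitelist)
{
	if (arch_cap_immune)
		return MMIO_IMMUNE;
	if (in_blacklist)
		return MMIO_STALE_DATA;	/* X86_BUG_MMIO_STALE_DATA */
	if (!in_whitelist)
		return MMIO_UNKNOWN;	/* X86_BUG_MMIO_UNKNOWN */

	return MMIO_IMMUNE;		/* whitelisted with NO_MMIO */
}

int main(void)
{
	/* unrecognized CPU without ARCH_CAP coverage: unknown status */
	printf("%d\n", classify_mmio(false, false, false));
	return 0;
}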
*/ - if (cpu_matches(cpu_vuln_blacklist, MMIO) && - !arch_cap_mmio_immune(ia32_cap)) - setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 993697e71854..03851240c3e3 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/tboot.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> #include <asm/processor.h> #include <asm/vmx.h> -#include "cpu.h" #undef pr_fmt #define pr_fmt(fmt) "x86/cpu: " fmt diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 717192915f28..8ed341714686 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -29,15 +29,26 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; + int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; + /* + * Even if the ->validation_bits are set for address mask, + * to be extra safe, check and reject an error radius '0', + * and fall back to the default page size. + */ + if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) + lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT); + else + lsb = PAGE_SHIFT; + mce_setup(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8b2fcdfa6d31..e7410e98fc1f 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -788,6 +788,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, kfree(patch); return -EINVAL; } + patch->size = *patch_size; mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); proc_id = mc_hdr->processor_rev_id; @@ -869,7 +870,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); return ret; } @@ -924,12 +925,6 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - return UCODE_ERROR; -} - static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -938,7 +933,6 @@ static void microcode_fini_cpu_amd(int cpu) } static struct microcode_ops microcode_amd_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ad57e0e4d674..6a41cee242f6 100644 --- 
a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -491,7 +491,7 @@ wait_for_siblings: */ static int microcode_reload_late(void) { - int ret; + int old = boot_cpu_data.microcode, ret; pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); pr_err("You should switch to early loading, if possible.\n"); @@ -503,7 +503,8 @@ static int microcode_reload_late(void) if (ret == 0) microcode_check(); - pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); + pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); return ret; } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 025c8f0cd948..1fcbd671f1df 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -916,24 +916,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - struct iov_iter iter; - struct iovec iov; - - if (is_blacklisted(cpu)) - return UCODE_NFOUND; - - iov.iov_base = (void __user *)buf; - iov.iov_len = size; - iov_iter_init(&iter, WRITE, &iov, 1, size); - - return generic_load_microcode(cpu, &iter); -} - static struct microcode_ops microcode_intel_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index bb1c3f5f60c8..de62b0b87ced 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -147,7 +147,6 @@ static inline void cache_alloc_hsw_probe(void) r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; r->alloc_capable = true; - r->alloc_enabled = true; rdt_alloc_capable = true; } @@ -211,7 +210,6 @@ static bool __get_mem_config_intel(struct rdt_resource *r) thread_throttle_mode_init(); r->alloc_capable = true; - r->alloc_enabled = true; return true; } @@ -242,7 +240,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->data_width = 4; r->alloc_capable = true; - r->alloc_enabled = true; return true; } @@ -261,7 +258,6 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; - r->alloc_enabled = true; } static void rdt_get_cdp_config(int level) @@ -300,7 +296,7 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) * that can be written to QOS_MSRs. * There are currently no SKUs which support non linear delay values. */ -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { if (r->membw.delay_linear) return MAX_MBA_BW - bw; @@ -401,7 +397,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int i; @@ -410,12 +406,17 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) * Initialize the Control MSRs to having no control. 
* For Cache Allocation: Set all bits in cbm * For Memory Allocation: Set b/w requested to 100% - * and the bandwidth in MBps to U32_MAX */ - for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) { + for (i = 0; i < hw_res->num_closid; i++, dc++) *dc = r->default_ctrl; - *dm = MBA_MAX_MBPS; - } +} + +static void domain_free(struct rdt_hw_domain *hw_dom) +{ + kfree(hw_dom->arch_mbm_total); + kfree(hw_dom->arch_mbm_local); + kfree(hw_dom->ctrl_val); + kfree(hw_dom); } static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) @@ -423,23 +424,15 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; - u32 *dc, *dm; + u32 *dc; dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), GFP_KERNEL); if (!dc) return -ENOMEM; - dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val), - GFP_KERNEL); - if (!dm) { - kfree(dc); - return -ENOMEM; - } - hw_dom->ctrl_val = dc; - hw_dom->mbps_val = dm; - setup_default_ctrlval(r, dc, dm); + setup_default_ctrlval(r, dc); m.low = 0; m.high = hw_res->num_closid; @@ -447,39 +440,31 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +/** + * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * @num_rmid: The size of the MBM counter array + * @hw_dom: The domain that owns the allocated arrays + */ +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) { size_t tsize; - if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - } if (is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); + tsize = sizeof(*hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_total) return -ENOMEM; - } } if (is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); + tsize = sizeof(*hw_dom->arch_mbm_local); + hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_local) { + kfree(hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = NULL; return -ENOMEM; } } - if (is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); - } - return 0; } @@ -502,6 +487,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct list_head *add_pos = NULL; struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + int err; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { @@ -527,25 +513,22 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(hw_dom); + domain_free(hw_dom); return; } - if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); - kfree(hw_dom); + if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + domain_free(hw_dom); return; } list_add_tail(&d->list, add_pos); - /* - * If resctrl is mounted, add - * 
per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - mkdir_mondata_subdir_allrdtgrp(r, d); + err = resctrl_online_domain(r, d); + if (err) { + list_del(&d->list); + domain_free(hw_dom); + } } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -563,27 +546,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - rmdir_mondata_subdir_allrdtgrp(r, d->id); + resctrl_offline_domain(r, d); list_del(&d->list); - if (r->mon_capable && is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. - */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } /* * rdt_domain "d" is going to be freed below, so clear @@ -591,13 +555,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (d->plr) d->plr->d = NULL; + domain_free(hw_dom); - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); - kfree(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 87666275eed9..1dafbdc5ac31 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -61,6 +61,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; unsigned long bw_val; @@ -72,6 +73,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + cfg->new_ctrl = bw_val; cfg->have_new_ctrl = true; @@ -261,14 +268,13 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type) static bool apply_config(struct rdt_hw_domain *hw_dom, struct resctrl_staged_config *cfg, u32 idx, - cpumask_var_t cpu_mask, bool mba_sc) + cpumask_var_t cpu_mask) { struct rdt_domain *dom = &hw_dom->d_resctrl; - u32 *dc = !mba_sc ? 
hw_dom->ctrl_val : hw_dom->mbps_val; - if (cfg->new_ctrl != dc[idx]) { + if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - dc[idx] = cfg->new_ctrl; + hw_dom->ctrl_val[idx] = cfg->new_ctrl; return true; } @@ -276,6 +282,27 @@ static bool apply_config(struct rdt_hw_domain *hw_dom, return false; } +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, t); + struct msr_param msr_param; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; + + hw_dom->ctrl_val[idx] = cfg_val; + + msr_param.res = r; + msr_param.low = idx; + msr_param.high = idx + 1; + hw_res->msr_update(d, &msr_param, r); + + return 0; +} + int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; @@ -284,14 +311,12 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; - bool mba_sc; int cpu; u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - mba_sc = is_mba_sc(r); msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); @@ -301,7 +326,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) continue; idx = get_config_index(closid, t); - if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc)) + if (!apply_config(hw_dom, cfg, idx, cpu_mask)) continue; if (!msr_param.res) { @@ -315,11 +340,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) } } - /* - * Avoid writing the control msr with control values when - * MBA software controller is enabled - */ - if (cpumask_empty(cpu_mask) || mba_sc) + if (cpumask_empty(cpu_mask)) goto done; cpu = get_cpu(); /* Update resource control msr on this CPU if it's in cpu_mask. */ @@ -406,6 +427,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. 
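A minimal sketch of the routing this series establishes, with simplified structures in place of the kernel's rdt_domain and staged configs: when the MBA software controller is active, a bandwidth write lands in mbps_val[] and never reaches the MSR path.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_CLOSID 4

struct domain {
	uint32_t mbps_val[NUM_CLOSID];	/* software controller targets, MBps */
	uint32_t ctrl_val[NUM_CLOSID];	/* values destined for the MSR */
};

/* Mirrors the parse_bw() change: mba_sc writes bypass the MSR path. */
static void write_bw(struct domain *d, unsigned int closid,
		     uint32_t bw, bool mba_sc)
{
	if (mba_sc)
		d->mbps_val[closid] = bw;	/* consumed by update_mba_bw() */
	else
		d->ctrl_val[closid] = bw;	/* programmed via msr_update */
}

int main(void)
{
	struct domain d = { { 0 }, { 0 } };

	write_bw(&d, 1, 2048, true);
	printf("mbps=%u ctrl=%u\n", d.mbps_val[1], d.ctrl_val[1]);
	return 0;
}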
+ */ + if (is_mba_sc(r)) + continue; + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret) goto out; @@ -433,9 +462,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); u32 idx = get_config_index(closid, type); - if (!is_mba_sc(r)) - return hw_dom->ctrl_val[idx]; - return hw_dom->mbps_val[idx]; + return hw_dom->ctrl_val[idx]; } static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) @@ -450,8 +477,12 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo if (sep) seq_puts(s, ";"); - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; @@ -518,7 +549,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; - struct rdt_hw_resource *hw_res; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -538,8 +568,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) domid = md.u.domid; evtid = md.u.evtid; - hw_res = &rdt_resources_all[resid]; - r = &hw_res->r_resctrl; + r = &rdt_resources_all[resid].r_resctrl; d = rdt_find_domain(r, domid, NULL); if (IS_ERR_OR_NULL(d)) { ret = -ENOENT; @@ -548,12 +577,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) mon_event_read(&rr, r, d, rdtgrp, evtid, false); - if (rr.val & RMID_VAL_ERROR) + if (rr.err == -EIO) seq_puts(m, "Error\n"); - else if (rr.val & RMID_VAL_UNAVAIL) + else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); else - seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale); + seq_printf(m, "%llu\n", rr.val); out: rdtgroup_kn_unlock(of->kn); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 1d647188a43b..5f7128686cfd 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -22,21 +22,12 @@ #define L2_QOS_CDP_ENABLE 0x01ULL -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 - #define CQM_LIMBOCHECK_INTERVAL 1000 #define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 @@ -74,7 +65,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * @list: entry in &rdt_resource->evt_list */ struct mon_evt { - u32 evtid; + enum resctrl_event_id evtid; char *name; struct list_head list; }; @@ -91,9 +82,9 @@ struct mon_evt { union mon_data_bits { void *priv; struct { - unsigned int rid : 10; - unsigned int evtid : 8; - unsigned int domid : 14; + unsigned int rid : 10; + enum resctrl_event_id evtid : 8; + unsigned int domid : 14; } u; }; @@ -101,12 +92,12 @@ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; struct rdt_domain *d; - int evtid; + enum resctrl_event_id evtid; bool first; + int err; u64 val; }; -extern unsigned int resctrl_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; @@ -288,35 +279,45 @@ struct rftype { /** * struct 
mbm_state - status for each MBM counter in each domain - * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it - * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw: The most recent bandwidth in MBps * @delta_bw: Difference between the current and previous bandwidth * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { - u64 chunks; - u64 prev_msr; - u64 prev_bw_msr; + u64 prev_bw_bytes; u32 prev_bw; u32 delta_bw; bool delta_comp; }; /** + * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s + * return value. + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to + * find this struct. + */ +struct arch_mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share * a resource * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps + * @arch_mbm_total: arch private state for MBM total bandwidth + * @arch_mbm_local: arch private state for MBM local bandwidth * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_domain { struct rdt_domain d_resctrl; u32 *ctrl_val; - u32 *mbps_val; + struct arch_mbm_state *arch_mbm_total; + struct arch_mbm_state *arch_mbm_local; }; static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) @@ -459,14 +460,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); for_each_rdt_resource(r) \ if (r->mon_capable) -#define for_each_alloc_enabled_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_enabled) - -#define for_each_mon_enabled_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->mon_enabled) - /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { @@ -530,10 +523,6 @@ void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - unsigned int dom_id); -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); @@ -542,8 +531,6 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index eaf25a234ff5..efe0c30d3a12 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -16,8 +16,12 @@ */ #include <linux/module.h> +#include <linux/sizes.h> #include <linux/slab.h> + #include <asm/cpu_device_id.h> 
+#include <asm/resctrl.h> + #include "internal.h" struct rmid_entry { @@ -37,8 +41,8 @@ static LIST_HEAD(rmid_free_lru); * @rmid_limbo_count count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that - * may have a occupancy value > intel_cqm_threshold. User can change - * the threshold occupancy value. + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. */ static unsigned int rmid_limbo_count; @@ -59,10 +63,15 @@ bool rdt_mon_capable; unsigned int rdt_mon_features; /* - * This is the threshold cache occupancy at which we will consider an + * This is the threshold cache occupancy in bytes at which we will consider an * RMID available for re-allocation. */ -unsigned int resctrl_cqm_threshold; +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. + */ +unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) @@ -137,9 +146,54 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid) return entry; } -static u64 __rmid_read(u32 rmid, u32 eventid) +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, + u32 rmid, + enum resctrl_event_id eventid) { - u64 val; + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + return NULL; + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &hw_dom->arch_mbm_total[rmid]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &hw_dom->arch_mbm_local[rmid]; + } + + /* Never expect to get here */ + WARN_ON_ONCE(1); + + return NULL; +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) + memset(am, 0, sizeof(*am)); +} + +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +{ + u64 shift = 64 - width, chunks; + + chunks = (cur_msr << shift) - (prev_msr << shift); + return chunks >> shift; +} + +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 *val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + u64 msr_val, chunks; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; /* * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured @@ -150,16 +204,26 @@ static u64 __rmid_read(u32 rmid, u32 eventid) * are error bits. 
*/ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - return val; -} + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } -static bool rmid_dirty(struct rmid_entry *entry) -{ - u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + *val = chunks * hw_res->mon_scale; - return val >= resctrl_cqm_threshold; + return 0; } /* @@ -170,11 +234,11 @@ static bool rmid_dirty(struct rmid_entry *entry) */ void __check_limbo(struct rdt_domain *d, bool force_free) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rmid_entry *entry; - struct rdt_resource *r; u32 crmid = 1, nrmid; - - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool rmid_dirty; + u64 val = 0; /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that @@ -188,7 +252,15 @@ void __check_limbo(struct rdt_domain *d, bool force_free) break; entry = __rmid_entry(nrmid); - if (force_free || !rmid_dirty(entry)) { + + if (resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + } + + if (force_free || !rmid_dirty) { clear_bit(entry->rmid, d->rmid_busy_llc); if (!--entry->busy) { rmid_limbo_count--; @@ -227,19 +299,19 @@ int alloc_rmid(void) static void add_rmid_to_limbo(struct rmid_entry *entry) { - struct rdt_resource *r; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu; - u64 val; - - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + int cpu, err; + u64 val = 0; entry->busy = 0; cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); - if (val <= resctrl_cqm_threshold) + err = resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, + &val); + if (err || val <= resctrl_rmid_realloc_threshold) continue; } @@ -277,24 +349,18 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +static int __mon_event_count(u32 rmid, struct rmid_read *rr) { - u64 shift = 64 - width, chunks; + struct mbm_state *m; + u64 tval = 0; - chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >> shift; -} + if (rr->first) + resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); -static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); - struct mbm_state *m; - u64 chunks, tval; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + if (rr->err) + return rr->err; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { - return tval; - } switch (rr->evtid) { case QOS_L3_OCCUP_EVENT_ID: rr->val += tval; @@ -308,48 +374,47 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) default: /* * Code would never reach here because an invalid - * event id would fail the __rmid_read. + * event id would fail in resctrl_arch_rmid_read(). 
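The wrap-around handling in resctrl_arch_rmid_read() is worth seeing in isolation. A user-space sketch of mbm_overflow_count() with a 24-bit counter, the MBM_CNTR_WIDTH_BASE value:

#include <stdint.h>
#include <stdio.h>

/*
 * Same arithmetic as mbm_overflow_count(): shifting both samples up by
 * (64 - width) discards the non-counter bits, and the unsigned
 * subtraction stays correct even when the hardware counter wrapped
 * between the two reads.
 */
static uint64_t overflow_count(uint64_t prev, uint64_t cur, unsigned int width)
{
	uint64_t shift = 64 - width;

	return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
	/* 24-bit counter wrapped: 0xfffff0 -> 0x000010 is 0x20 chunks */
	printf("0x%llx\n",
	       (unsigned long long)overflow_count(0xfffff0, 0x10, 24));
	return 0;
}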
*/ - return RMID_VAL_ERROR; + return -EINVAL; } if (rr->first) { memset(m, 0, sizeof(struct mbm_state)); - m->prev_bw_msr = m->prev_msr = tval; return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width); - m->chunks += chunks; - m->prev_msr = tval; - - rr->val += get_corrected_mbm_count(rmid, m->chunks); + rr->val += tval; return 0; } /* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. + * and delta bandwidth in MBps. The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. */ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) { - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); struct mbm_state *m = &rr->d->mbm_local[rmid]; - u64 tval, cur_bw, chunks; + u64 cur_bw, bytes, cur_bytes; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; - chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width); - cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20; + cur_bw = bytes / SZ_1M; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); m->delta_comp = false; m->prev_bw = cur_bw; - m->prev_bw_msr = tval; } /* @@ -361,11 +426,11 @@ void mon_event_count(void *info) struct rdtgroup *rdtgrp, *entry; struct rmid_read *rr = info; struct list_head *head; - u64 ret_val; + int ret; rdtgrp = rr->rgrp; - ret_val = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -377,13 +442,17 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { if (__mon_event_count(entry->mon.rmid, rr) == 0) - ret_val = 0; + ret = 0; } } - /* Report error if none of rmid_reads are successful */ - if (ret_val) - rr->val = ret_val; + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. 
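With the byte conversion now done centrally in resctrl_arch_rmid_read(), mbm_bw_count() reduces to a delta of cumulative byte counts taken once per second. A sketch of that computation; the delta_comp gating is collapsed into an unconditional delta here for brevity:

#include <stdint.h>
#include <stdio.h>

#define SZ_1M (1024 * 1024)

struct mbm_state {
	uint64_t prev_bw_bytes;	/* last cumulative byte count (rr->val) */
	uint32_t prev_bw;	/* last bandwidth in MBps */
	uint32_t delta_bw;
};

/* Must be called once per second for prev_bw to be in MBps. */
static void bw_count(struct mbm_state *m, uint64_t cur_bytes)
{
	uint64_t bytes = cur_bytes - m->prev_bw_bytes;
	uint32_t cur_bw = bytes / SZ_1M;

	m->prev_bw_bytes = cur_bytes;
	m->delta_bw = cur_bw > m->prev_bw ? cur_bw - m->prev_bw
					  : m->prev_bw - cur_bw;
	m->prev_bw = cur_bw;
}

int main(void)
{
	struct mbm_state m = { 0, 0, 0 };

	bw_count(&m, 100ULL * SZ_1M);
	bw_count(&m, 350ULL * SZ_1M);
	printf("bw=%u MBps, delta=%u\n", m.prev_bw, m.delta_bw);
	return 0;
}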
+ */ + if (ret == 0) + rr->err = 0; } /* @@ -420,10 +489,8 @@ void mon_event_count(void *info) */ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { - u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; + u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_hw_resource *hw_r_mba; - struct rdt_hw_domain *hw_dom_mba; u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; @@ -433,8 +500,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) if (!is_mbm_local_enabled()) return; - hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; - r_mba = &hw_r_mba->r_resctrl; + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + closid = rgrp->closid; rmid = rgrp->mon.rmid; pmbm_data = &dom_mbm->mbm_local[rmid]; @@ -444,16 +511,13 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) pr_warn_once("Failure to get domain for MBA update\n"); return; } - hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); + user_bw = dom_mba->mbps_val[closid]; delta_bw = pmbm_data->delta_bw; - /* - * resctrl_arch_get_config() chooses the mbps/ctrl value to return - * based on is_mba_sc(). For now, reach into the hw_dom. - */ - cur_msr_val = hw_dom_mba->ctrl_val[closid]; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); /* * For Ctrl groups read data from child monitor groups. @@ -488,9 +552,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) return; } - cur_msr = hw_r_mba->msr_base + closid; - wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); - hw_dom_mba->ctrl_val[closid] = new_msr_val; + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); /* * Delta values are updated dynamically package wise for each @@ -523,10 +585,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) */ if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + rr.val = 0; __mon_event_count(rmid, &rr); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + rr.val = 0; __mon_event_count(rmid, &rr); /* @@ -686,9 +750,10 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - unsigned int cl_size = boot_cpu_data.x86_cache_size; + unsigned int threshold; int ret; + resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; @@ -705,10 +770,14 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->num_rmid; - /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ - resctrl_cqm_threshold /= hw_res->mon_scale; + /* + * Because num_rmid may not be a power of two, round the value + * to the nearest multiple of hw_res->mon_scale so it matches a + * value the hardware will measure. mon_scale may not be a power of 2. 
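resctrl_arch_round_mon_val(), called on the next line, is not shown in this diff; assuming it simply snaps a byte count to a whole number of mon_scale-sized chunks, the operation looks like this:

#include <stdio.h>

/*
 * Assumed behaviour of resctrl_arch_round_mon_val(): truncate to a
 * multiple of mon_scale, i.e. to a value the counters can report.
 */
static unsigned int round_mon_val(unsigned int val, unsigned int mon_scale)
{
	return (val / mon_scale) * mon_scale;
}

int main(void)
{
	/* e.g. a 36MB limit split over 56 RMIDs, with a 64KB upscale */
	printf("%u\n", round_mon_val(36 * 1024 * 1024 / 56, 65536));
	return 0;
}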
+ */ + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); ret = dom_data_init(r); if (ret) @@ -717,7 +786,6 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) l3_mon_evt_init(r); r->mon_capable = true; - r->mon_enabled = true; return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index db813f819ad6..d961ae3ed96e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -420,6 +420,7 @@ static int pseudo_lock_fn(void *_rdtgrp) struct pseudo_lock_region *plr = rdtgrp->plr; u32 rmid_p, closid_p; unsigned long i; + u64 saved_msr; #ifdef CONFIG_KASAN /* * The registers used for local register variables are also used @@ -463,6 +464,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ + saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); @@ -514,7 +516,7 @@ static int pseudo_lock_fn(void *_rdtgrp) __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -835,7 +837,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * First determine which cpus have pseudo-locked regions * associated with them. */ - for_each_alloc_enabled_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { list_for_each_entry(d_i, &r->domains, list) { if (d_i->plr) cpumask_or(cpu_with_psl, cpu_with_psl, @@ -871,6 +873,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) static int measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; + u32 saved_low, saved_high; unsigned long i; u64 start, end; void *mem_r; @@ -879,6 +882,7 @@ static int measure_cycles_lat_fn(void *_plr) /* * Disable hardware prefetchers. */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); mem_r = READ_ONCE(plr->kmem); /* @@ -895,7 +899,7 @@ static int measure_cycles_lat_fn(void *_plr) end = rdtsc_ordered(); trace_pseudo_lock_mem_latency((u32)(end - start)); } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -940,6 +944,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; struct perf_event *miss_event, *hit_event; int hit_pmcnum, miss_pmcnum; + u32 saved_low, saved_high; unsigned int line_size; unsigned int size; unsigned long i; @@ -973,6 +978,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, /* * Disable hardware prefetchers. 
*/ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); /* Initialize rest of local variables */ @@ -1031,7 +1037,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, */ rmb(); /* Re-enable hardware prefetchers */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); out_hit: perf_event_release_kernel(hit_event); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f276aff521e8..e5a48f05e787 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - - seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale); + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); return 0; } @@ -1055,7 +1052,6 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct rdt_hw_resource *hw_res; unsigned int bytes; int ret; @@ -1063,11 +1059,10 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, if (ret) return ret; - if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + if (bytes > resctrl_rmid_realloc_limit) return -EINVAL; - hw_res = resctrl_to_arch_res(of->kn->parent->priv); - resctrl_cqm_threshold = bytes / hw_res->mon_scale; + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); return nbytes; } @@ -1356,11 +1351,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct resctrl_schema *schema; + enum resctrl_conf_type type; struct rdtgroup *rdtgrp; struct rdt_resource *r; struct rdt_domain *d; unsigned int size; int ret = 0; + u32 closid; bool sep; u32 ctrl; @@ -1386,8 +1383,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, goto out; } + closid = rdtgrp->closid; + list_for_each_entry(schema, &resctrl_schema_all, list) { r = schema->res; + type = schema->conf_type; sep = false; seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(d, &r->domains, list) { @@ -1396,9 +1396,12 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - ctrl = resctrl_arch_get_config(r, d, - rdtgrp->closid, - schema->conf_type); + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -1756,7 +1759,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (ret) goto out_destroy; - /* loop over enabled controls, these are all alloc_enabled */ + /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; fflags = r->fflags | RF_CTRL_INFO; @@ -1765,7 +1768,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) goto out_destroy; } - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { fflags = r->fflags | RF_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); @@ -1889,26 +1892,61 @@ void 
rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + /* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. * MBA software controller is supported only if * MBM is supported and MBA is in linear scale. */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + + return (is_mbm_local_enabled() && + r->alloc_capable && is_mba_linear()); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + */ static int set_mba_sc(bool mba_sc) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - struct rdt_hw_domain *hw_dom; + u32 num_closid = resctrl_arch_get_num_closid(r); struct rdt_domain *d; + int i; - if (!is_mbm_enabled() || !is_mba_linear() || - mba_sc == is_mba_sc(r)) + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) return -EINVAL; r->membw.mba_sc = mba_sc; + list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); - setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val); + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; } return 0; @@ -2106,7 +2144,7 @@ static int schemata_list_create(void) struct rdt_resource *r; int ret = 0; - for_each_alloc_enabled_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { if (resctrl_arch_get_cdp_enabled(r->rid)) { ret = schemata_list_add(r, CDP_CODE); if (ret) @@ -2261,7 +2299,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->enable_cdpl2 = true; return 0; case Opt_mba_mbps: - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (!supports_mba_mbps()) return -EINVAL; ctx->enable_mba_mbps = true; return 0; @@ -2452,7 +2490,7 @@ static void rdt_kill_sb(struct super_block *sb) set_mba_sc(false); /*Put everything back to default values. */ - for_each_alloc_enabled_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); cdp_disable_all(); rmdir_all_sub(); @@ -2499,14 +2537,12 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, * Remove all subdirectories of mon_data of ctrl_mon groups * and monitor groups with given domain id. */ -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id) { struct rdtgroup *prgrp, *crgrp; char name[32]; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { sprintf(name, "mon_%s_%02d", r->name, dom_id); kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); @@ -2565,16 +2601,13 @@ out_destroy: * Add all subdirectories of mon_data for "ctrl_mon" groups * and "monitor" groups with given domain id. 
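set_mba_sc() above now only flips the flag and reseeds every domain's mbps_val[] array; nothing is written to hardware on the mode switch. A condensed sketch with flat arrays instead of the kernel's domain list:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MBA_MAX_MBPS UINT32_MAX	/* "no software limit" seed value */
#define NUM_CLOSID 4
#define NUM_DOMAINS 2

struct domain {
	uint32_t mbps_val[NUM_CLOSID];
};

/* Reset all MBps targets so no stale goals survive the mode switch. */
static int set_mba_sc(struct domain *doms, int ndoms,
		      bool *mba_sc, bool enable)
{
	if (enable == *mba_sc)
		return -1;	/* -EINVAL in the kernel */

	*mba_sc = enable;
	for (int d = 0; d < ndoms; d++)
		for (int i = 0; i < NUM_CLOSID; i++)
			doms[d].mbps_val[i] = MBA_MAX_MBPS;

	return 0;
}

int main(void)
{
	struct domain doms[NUM_DOMAINS] = { 0 };
	bool mba_sc = false;

	set_mba_sc(doms, NUM_DOMAINS, &mba_sc, true);
	printf("%u\n", doms[0].mbps_val[0]);
	return 0;
}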
*/ -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d) +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; struct list_head *head; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; mkdir_mondata_subdir(parent_kn, d, r, prgrp); @@ -2642,7 +2675,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, * Create the subdirectories for each domain. Note that all events * in a domain like L3 are grouped into a resource whose domain is L3 */ - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); if (ret) goto out_destroy; @@ -2786,14 +2819,19 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) } /* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r) +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + cfg->new_ctrl = r->default_ctrl; cfg->have_new_ctrl = true; } } @@ -2808,7 +2846,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA) { - rdtgroup_init_mba(r); + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; } else { ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) @@ -3236,6 +3276,110 @@ out: return ret; } +static void domain_destroy_mon_state(struct rdt_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + if (!r->mon_capable) + return; + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); + + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. 
+ */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); +} + +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + int err; + + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + /* RDT_RESOURCE_MBA is never mon_capable */ + return mba_sc_domain_allocate(r, d); + + if (!r->mon_capable) + return 0; + + err = domain_setup_mon_state(r, d); + if (err) + return err; + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + if (is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* If resctrl is mounted, add per domain monitor data directories. */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); + + return 0; +} + /* * rdtgroup_init - rdtgroup initialization * diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 24c1bb8eb196..9f13d724172e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,9 @@ #include "encls.h" #include "sgx.h" +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + #define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) /* * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to @@ -344,8 +347,11 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, } va_page = sgx_encl_grow(encl, false); - if (IS_ERR(va_page)) + if (IS_ERR(va_page)) { + if (PTR_ERR(va_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; goto err_out_epc; + } if (va_page) list_add(&va_page->list, &encl->va_pages); @@ -914,7 +920,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, } /** - * sgx_encl_get_backing() - Pin the backing storage + * __sgx_encl_get_backing() - Pin the backing storage * @encl: an enclave pointer * @page_index: enclave page index * @backing: data for accessing backing storage for the page @@ -926,7 +932,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, * 0 on success, * -errno otherwise. 
*/ -static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, +static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); @@ -1001,7 +1007,7 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) } /** - * sgx_encl_alloc_backing() - allocate a new backing storage page + * sgx_encl_alloc_backing() - create a new backing storage page * @encl: an enclave pointer * @page_index: enclave page index * @backing: data for accessing backing storage for the page @@ -1009,7 +1015,9 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) * When called from ksgxd, sets the active memcg from one of the * mms in the enclave's mm_list prior to any backing page allocation, * in order to ensure that shmem page allocations are charged to the - * enclave. + * enclave. Create a backing page for loading data back into an EPC page with + * ELDU. This function takes a reference on a new backing page which + * must be dropped with a corresponding call to sgx_encl_put_backing(). * * Return: * 0 on success, @@ -1022,7 +1030,7 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, struct mem_cgroup *memcg = set_active_memcg(encl_memcg); int ret; - ret = sgx_encl_get_backing(encl, page_index, backing); + ret = __sgx_encl_get_backing(encl, page_index, backing); set_active_memcg(memcg); mem_cgroup_put(encl_memcg); @@ -1040,15 +1048,17 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, * It is the caller's responsibility to ensure that it is appropriate to use * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is * not used correctly, this will cause an allocation which is not accounted for. + * This function takes a reference on an existing backing page which must be + * dropped with a corresponding call to sgx_encl_put_backing(). * * Return: * 0 on success, * -errno otherwise. */ -int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - return sgx_encl_get_backing(encl, page_index, backing); + return __sgx_encl_get_backing(encl, page_index, backing); } /** diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index a65a952116fd..f94ff14c9486 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -107,8 +107,6 @@ bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); -int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); void sgx_encl_put_backing(struct sgx_backing *backing); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 515e2a5f25bb..0aad028f04d4 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -49,9 +49,13 @@ static LIST_HEAD(sgx_dirty_page_list); * Reset post-kexec EPC pages to the uninitialized state. The pages are removed * from the input list, and made available for the page allocator. SECS pages * prepending their children in the input list are left intact. 
+ * + * Return 0 when sanitization was successful or kthread was stopped, and the + * number of unsanitized pages otherwise. */ -static void __sgx_sanitize_pages(struct list_head *dirty_page_list) +static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) { + unsigned long left_dirty = 0; struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; @@ -59,7 +63,7 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) /* dirty_page_list is thread-local, no need for a lock: */ while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) - return; + return 0; page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); @@ -92,12 +96,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) } else { /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); + left_dirty++; } cond_resched(); } list_splice(&dirty, dirty_page_list); + return left_dirty; } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -395,10 +401,7 @@ static int ksgxd(void *p) * required for SECS pages, whose child pages blocked EREMOVE. */ __sgx_sanitize_pages(&sgx_dirty_page_list); - __sgx_sanitize_pages(&sgx_dirty_page_list); - - /* sanity check: */ - WARN_ON(!list_empty(&sgx_dirty_page_list)); + WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index afae4dd77495..b3dba35f466e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -128,7 +128,7 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) /* No access to the user space stack of other tasks. Ignore. */ break; default: - printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", + printk("%sCode: Unable to access opcode bytes at 0x%lx.\n", loglvl, prologue); break; } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 68b38925a74f..44f937015e1e 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -264,11 +264,11 @@ static __init void early_pci_serial_init(char *s) bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); /* - * Verify it is a UART type device + * Verify it is a 16550-UART type device */ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || - (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ { + (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) { if (!force) return; } @@ -276,22 +276,22 @@ static __init void early_pci_serial_init(char *s) /* * Determine if it is IO or memory mapped */ - if (bar0 & 0x01) { + if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { /* it is IO mapped */ serial_in = io_serial_in; serial_out = io_serial_out; - early_serial_base = bar0&0xfffffffc; + early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_IO); + cmdreg|PCI_COMMAND_IO); } else { /* It is memory mapped - assume 32-bit alignment */ serial_in = mem32_serial_in; serial_out = mem32_serial_out; /* WARNING! 
assuming the address is always in the first 4G */ early_serial_base = - (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_MEMORY); + cmdreg|PCI_COMMAND_MEMORY); } /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e5dd6da78713..01833ebf5e8e 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,7 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK void do_softirq_own_stack(void) { struct irq_stack *irqstk; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 74167dc5f55e..4c3c27b6aea3 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -505,7 +505,7 @@ static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); if (p->ainsn.jcc.type >= 0xe) - match = match && (regs->flags & X86_EFLAGS_ZF); + match = match || (regs->flags & X86_EFLAGS_ZF); } __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert)); } diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 586f718b8e95..349046434513 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -4,11 +4,8 @@ */ #include <linux/platform_device.h> #include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/bcd.h> #include <linux/export.h> #include <linux/pnp.h> -#include <linux/of.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -20,26 +17,23 @@ /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details. */ volatile unsigned long cmos_lock; EXPORT_SYMBOL(cmos_lock); #endif /* CONFIG_X86_32 */ -/* For two digit years assume time is always after that */ -#define CMOS_YEARS_OFFS 2000 - DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); /* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * In order to set the CMOS clock precisely, mach_set_cmos_time has to be * called 500 ms after the second nowtime has started, because when * nowtime is written into the registers of the CMOS clock, it will * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ -int mach_set_rtc_mmss(const struct timespec64 *now) +int mach_set_cmos_time(const struct timespec64 *now) { unsigned long long nowtime = now->tv_sec; struct rtc_time tm; @@ -62,8 +56,7 @@ int mach_set_rtc_mmss(const struct timespec64 *now) void mach_get_cmos_time(struct timespec64 *now) { - unsigned int status, year, mon, day, hour, min, sec, century = 0; - unsigned long flags; + struct rtc_time tm; /* * If pm_trace abused the RTC as storage, set the timespec to 0, @@ -74,51 +67,13 @@ void mach_get_cmos_time(struct timespec64 *now) return; } - spin_lock_irqsave(&rtc_lock, flags); - - /* - * If UIP is clear, then we have >= 244 microseconds before - * RTC registers will be updated. Spec sheet says that this - * is the reliable way to read RTC - registers. If UIP is set - * then the register access might be invalid. 
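Worth spelling out for the kprobes hunk above: for Jcc condition types 0xe/0xf (JLE and its inverse JG), the architectural "taken" condition is SF != OF OR ZF = 1, so the ZF term must be OR-ed into the match; the change from && to || restores that. A minimal stand-alone rendering of the predicate (EFLAGS bit positions per the SDM; the names here are illustrative, not kernel code):

	#define MY_EFLAGS_ZF	0x0040UL	/* bit 6 */
	#define MY_EFLAGS_SF	0x0080UL	/* bit 7 */
	#define MY_EFLAGS_OF	0x0800UL	/* bit 11 */

	/* Jcc type 0xe (JLE): taken when ZF=1 or SF != OF; type 0xf
	 * (JG) is the same test with the result inverted by the caller. */
	static int jle_taken(unsigned long flags)
	{
		int match = !!(flags & MY_EFLAGS_SF) ^ !!(flags & MY_EFLAGS_OF);

		return match || (flags & MY_EFLAGS_ZF);
	}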
- */ - while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - - status = CMOS_READ(RTC_CONTROL); - WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); - - spin_unlock_irqrestore(&rtc_lock, flags); - - if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - sec = bcd2bin(sec); - min = bcd2bin(min); - hour = bcd2bin(hour); - day = bcd2bin(day); - mon = bcd2bin(mon); - year = bcd2bin(year); + if (mc146818_get_time(&tm)) { + pr_err("Unable to read current time from RTC\n"); + now->tv_sec = now->tv_nsec = 0; + return; } - if (century) { - century = bcd2bin(century); - year += century * 100; - } else - year += CMOS_YEARS_OFFS; - - now->tv_sec = mktime64(year, mon, day, hour, min, sec); + now->tv_sec = rtc_tm_to_time64(&tm); now->tv_nsec = 0; } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 63dc626627a0..a428c62330d3 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -701,7 +701,13 @@ e_term: void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) return; /* @@ -717,7 +723,13 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) return; /* Invalidate the memory pages before they are marked shared in the RMP table. 
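A note on the open-coded check in the sev.c hunks above: cc_platform_has() may be compiled with jump tables and may consult boot_cpu_data, neither of which is safe while early boot code still runs identity mapped, so the test reads the cached SEV status word directly. A stand-alone sketch of the shape of that test (the bit position follows the MSR_AMD64_SEV layout, bit 2 for SNP, but treat the constant as an assumption here):

	#define SEV_STATUS_SNP_ENABLED	(1ULL << 2)	/* assumed MSR_AMD64_SEV bit */

	/* Safe while identity mapped: one load and a mask, with no
	 * indirect branches or jump tables involved. */
	static inline int snp_enabled_early(unsigned long long sev_status)
	{
		return !!(sev_status & SEV_STATUS_SNP_ENABLED);
	}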
*/ @@ -2100,7 +2112,7 @@ bool __init snp_init(struct boot_params *bp) return true; } -void __init snp_abort(void) +void __init __noreturn snp_abort(void) { sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d62b2cb85cea..178015a820f0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -63,6 +63,7 @@ #include <asm/insn-eval.h> #include <asm/vdso.h> #include <asm/tdx.h> +#include <asm/cfi.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -313,7 +314,8 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { regs->ip += LEN_UD2; handled = true; } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 38185aedf7d1..0ea57da92940 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -93,22 +93,27 @@ static struct orc_entry *orc_find(unsigned long ip); static struct orc_entry *orc_ftrace_find(unsigned long ip) { struct ftrace_ops *ops; - unsigned long caller; + unsigned long tramp_addr, offset; ops = ftrace_ops_trampoline(ip); if (!ops) return NULL; + /* Set tramp_addr to the start of the code copied by the trampoline */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) - caller = (unsigned long)ftrace_regs_call; + tramp_addr = (unsigned long)ftrace_regs_caller; else - caller = (unsigned long)ftrace_call; + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; /* Prevent unlikely recursion */ - if (ip == caller) + if (ip == tramp_addr) return NULL; - return orc_find(caller); + return orc_find(tramp_addr); } #else static struct orc_entry *orc_ftrace_find(unsigned long ip) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e84ee5cdbd8c..57353519bc11 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -138,7 +138,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, - .set_wallclock = mach_set_rtc_mmss, + .set_wallclock = mach_set_cmos_time, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 75dcf7a72605..2796dde06302 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -315,7 +315,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; - u64 guest_supported_xcr0; best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { @@ -327,10 +326,16 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - guest_supported_xcr0 = + vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); - vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0; + /* + * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if + * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't + * supported by the host.
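To make the cpuid.c change above concrete: user_xfeatures controls which xsave components KVM_GET_XSAVE/KVM_SET_XSAVE will transfer, and legacy FP+SSE state must remain transferable even when CPUID hides XSAVE from the guest. A self-contained sketch of the invariant (bit positions 0 and 1 are the architectural x87 and SSE xfeatures; the macro names are illustrative):

	#define XF_FP	(1ULL << 0)	/* x87 state */
	#define XF_SSE	(1ULL << 1)	/* SSE state */

	static unsigned long long user_xfeatures(unsigned long long guest_xcr0)
	{
		/* FP+SSE stay saveable/restorable whatever CPUID exposes */
		return guest_xcr0 | XF_FP | XF_SSE;
	}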
+ */ + vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 | + XFEATURE_MASK_FPSSE; kvm_update_pv_runtime(vcpu); @@ -897,8 +902,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; } break; - case 9: - break; case 0xa: { /* Architectural Performance Monitoring */ union cpuid10_eax eax; union cpuid10_edx edx; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b4eeb7c75dfa..aacb28c83e43 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -326,7 +326,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); ".align " __stringify(FASTOP_SIZE) " \n\t" \ ".type " name ", @function \n\t" \ name ":\n\t" \ - ASM_ENDBR + ASM_ENDBR \ + IBT_NOSEAL(name) #define FOP_FUNC(name) \ __FOP_FUNC(#name) @@ -446,27 +447,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); FOP_END /* Special case for SETcc - 1 instruction per cc */ - -/* - * Depending on .config the SETcc functions look like: - * - * ENDBR [4 bytes; CONFIG_X86_KERNEL_IBT] - * SETcc %al [3 bytes] - * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK] - * INT3 [1 byte; CONFIG_SLS] - */ -#define SETCC_ALIGN 16 - #define FOP_SETCC(op) \ - ".align " __stringify(SETCC_ALIGN) " \n\t" \ - ".type " #op ", @function \n\t" \ - #op ": \n\t" \ - ASM_ENDBR \ + FOP_FUNC(op) \ #op " %al \n\t" \ - __FOP_RET(#op) \ - ".skip " __stringify(SETCC_ALIGN) " - (.-" #op "), 0xcc \n\t" + FOP_RET(op) -__FOP_START(setcc, SETCC_ALIGN) +FOP_START(setcc) FOP_SETCC(seto) FOP_SETCC(setno) FOP_SETCC(setc) @@ -493,7 +479,7 @@ FOP_END; /* * XXX: inoutclob user must know where the argument is being expanded. - * Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault. + * Using asm goto would allow us to remove _fault. */ #define asm_safe(insn, inoutclob...) \ ({ \ @@ -1079,7 +1065,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt) static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; - void (*fop)(void) = (void *)em_setcc + SETCC_ALIGN * (condition & 0xf); + void (*fop)(void) = (void *)em_setcc + FASTOP_SIZE * (condition & 0xf); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; asm("push %[flags]; popf; " CALL_NOSPEC @@ -4146,6 +4132,9 @@ static int em_xsetbv(struct x86_emulate_ctxt *ctxt) { u32 eax, ecx, edx; + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)) + return emulate_ud(ctxt); + eax = reg_read(ctxt, VCPU_REGS_RAX); edx = reg_read(ctxt, VCPU_REGS_RDX); ecx = reg_read(ctxt, VCPU_REGS_RCX); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index eccddb136954..3552e6af3684 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1596,6 +1596,8 @@ static void __rmap_add(struct kvm *kvm, rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); rmap_count = pte_list_add(cache, spte, rmap_head); + if (rmap_count > kvm->stat.max_mmu_rmap_size) + kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) { kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_with_address( @@ -2914,7 +2916,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. 
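The mmu_notifier_count/mmu_notifier_seq renames in the hunks that follow (to mmu_invalidate_in_progress/mmu_invalidate_seq) implement a small retry protocol: a page fault snapshots the sequence count, performs its lockless host-side lookup, then under mmu_lock retries if an invalidation started or completed in between. A minimal stand-alone model of the staleness test (field names mimic the diff; the struct itself is illustrative, not the KVM API):

	struct inv_state {
		unsigned long seq;	/* bumped when an invalidation completes */
		long in_progress;	/* non-zero while one is running */
	};

	/* Called under the lock invalidators also take: non-zero means
	 * the earlier snapshot may be stale and the fault must retry. */
	static int inv_retry(const struct inv_state *s, unsigned long snapshot)
	{
		if (s->in_progress)
			return 1;
		return s->seq != snapshot;
	}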
*/ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; __direct_pte_prefetch(vcpu, sp, sptep); @@ -2928,7 +2930,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * * There are several ways to safely use this helper: * - * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * @@ -3056,7 +3058,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return; /* - * mmu_notifier_retry() was successful and mmu_lock is held, so + * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ fault->goal_level = fault->req_level; @@ -4203,7 +4205,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4227,7 +4229,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); r = kvm_faultin_pfn(vcpu, fault); @@ -5361,19 +5363,6 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } -static bool need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & SPTE_BASE_ADDR_MASK) - return true; - old ^= shadow_nx_mask; - new ^= shadow_nx_mask; - return (old & ~new & SPTE_PERM_MASK) != 0; -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) { @@ -5519,7 +5508,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; - if (need_remote_flush(entry, *spte)) + if (is_shadow_present_pte(entry)) flush = true; ++spte; } @@ -6055,7 +6044,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6069,7 +6058,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end - gfn_start); - kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end); write_unlock(&kvm->mmu_lock); } @@ -6085,47 +6074,18 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, - false); + slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + 
kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); } - - /* - * Flush TLBs if any SPTEs had to be write-protected to ensure that - * guest writes are reflected in the dirty bitmap before the memslot - * update completes, i.e. before enabling dirty logging is visible to - * userspace. - * - * Perform the TLB flush outside the mmu_lock to reduce the amount of - * time the lock is held. However, this does mean that another CPU can - * now grab mmu_lock and encounter a write-protected SPTE while CPUs - * still have a writable mapping for the associated GFN in their TLB. - * - * This is safe but requires KVM to be careful when making decisions - * based on the write-protection status of an SPTE. Specifically, KVM - * also write-protects SPTEs to monitor changes to guest page tables - * during shadow paging, and must guarantee no CPUs can write to those - * page before the lock is dropped. As mentioned in the previous - * paragraph, a write-protected SPTE is no guarantee that CPU cannot - * perform writes. So to determine if a TLB flush is truly required, KVM - * will clear a separate software-only bit (MMU-writable) and skip the - * flush if-and-only-if this bit was already clear. - * - * See is_writable_pte() for more details. - */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) @@ -6493,32 +6453,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ - flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); + slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); } /* + * The caller will flush the TLBs after this function returns. + * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap. */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } void kvm_mmu_zap_all(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f5958071220c..39e0205e7300 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -589,7 +589,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. 
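The write-protect and clear-dirty hunks above drop the per-helper flush decision entirely; the memslot update path now issues one unconditional flush after the helpers run (the long comment added to kvm_mmu_slot_apply_flags() near the end of the x86.c hunks explains why). The resulting call shape, sketched with the in-tree prototypes assumed:

	struct kvm;
	struct kvm_memory_slot;
	enum { PG_LEVEL_4K = 1 };	/* assumption: matches the x86 definition */

	void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
					      const struct kvm_memory_slot *slot,
					      int start_level);
	void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
						const struct kvm_memory_slot *slot);

	static void enable_dirty_logging_sketch(struct kvm *kvm,
						const struct kvm_memory_slot *slot)
	{
		kvm_mmu_slot_remove_write_access(kvm, slot, PG_LEVEL_4K); /* no flush inside */
		kvm_arch_flush_remote_tlbs_memslot(kvm, slot);	/* the single flush */
	}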
*/ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; if (sp->role.direct) @@ -838,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); r = kvm_faultin_pfn(vcpu, fault); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f3744eea45f5..7670c13ce251 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -343,7 +343,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, } /* - * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: + * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: * * 1. To intercept writes for dirty logging. KVM write-protects huge pages * so that they can be split down into the dirty logging @@ -361,8 +361,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * read-only memslot or guest memory backed by a read-only VMA. Writes to * such pages are disallowed entirely. * - * To keep track of why a given SPTE is write-protected, KVM uses 2 - * software-only bits in the SPTE: + * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this + * case, the SPTE is access-protected, not just write-protected! + * + * For cases #1 and #4, KVM can safely make such SPTEs writable without taking + * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. + * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits + * in the SPTE: * * shadow_mmu_writable_mask, aka MMU-writable - * Cleared on SPTEs that KVM is currently write-protecting for shadow paging @@ -391,7 +396,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging * (which does not clear the MMU-writable bit), does not flush TLBs before * dropping the lock, as it only needs to synchronize guest writes with the - * dirty bitmap. + * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for + * access-tracking via the clear_young() MMU notifier also does not flush TLBs. * * So, there is the problem: clearing the MMU-writable bit can encounter a * write-protected SPTE while CPUs still have writable mappings for that SPTE diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ddd4367d4826..7eaf96064cb0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2570,7 +2570,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * bits which we consider mandatory enabled. * The CR0_READ_SHADOW is what L2 should have expected to read given * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we + * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we * have more bits than L1 expected.
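The spte.h comment above is easier to follow with the two software bits written out: dirty-logging and access-tracking protection (cases 1 and 4) leave both bits set and can be undone locklessly, while shadow paging and read-only memory (cases 2 and 3) clear one of them. A toy model of that test (the bit positions are placeholders, not the real shadow_*_mask values):

	#define SPTE_W			(1ULL << 1)	/* hardware writable bit */
	#define SPTE_MMU_WRITABLE	(1ULL << 57)	/* placeholder position */
	#define SPTE_HOST_WRITABLE	(1ULL << 58)	/* placeholder position */

	/* Cases 1/4: both software bits still set, so the hardware W bit
	 * may be restored without mmu_lock. Cases 2/3 clear one of them. */
	static int can_fast_restore_write(unsigned long long spte)
	{
		return (spte & SPTE_MMU_WRITABLE) && (spte & SPTE_HOST_WRITABLE);
	}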
*/ vmx_set_cr0(vcpu, vmcs12->guest_cr0); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d7f8331d6f7e..c9b49a09e6b5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -843,8 +843,7 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, - MSR_IA32_SPEC_CTRL); + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); } unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 205ebdc2b11b..b0c47b41c264 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1011,15 +1011,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); -static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.guest_fpu.fpstate->user_xfeatures; -} - #ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) { - return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC; + return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; } #endif @@ -1042,7 +1037,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) * saving. However, xcr0 bit 0 is always set, even if the * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). */ - valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP; + valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) return 1; @@ -1070,6 +1065,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { + /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { kvm_inject_gp(vcpu, 0); @@ -1557,12 +1553,32 @@ static const u32 msr_based_features_all[] = { static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; +/* + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. 
These include: + * 10 - MISC_PACKAGE_CTRLS + * 11 - ENERGY_FILTERING_CTL + * 12 - DOITM + * 18 - FB_CLEAR_CTRL + * 21 - XAPIC_DISABLE_STATUS + * 23 - OVERCLOCKING_STATUS + */ + +#define KVM_SUPPORTED_ARCH_CAP \ + (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ + ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ + ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ + ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) + static u64 kvm_get_arch_capabilities(void) { u64 data = 0; - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); + data &= KVM_SUPPORTED_ARCH_CAP; + } /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1610,9 +1626,6 @@ static u64 kvm_get_arch_capabilities(void) */ } - /* Guests don't need to know "Fill buffer clear control" exists */ - data &= ~ARCH_CAP_FB_CLEAR_CTRL; - return data; } @@ -10652,7 +10665,8 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) case KVM_MP_STATE_INIT_RECEIVED: break; default: - return -EINTR; + WARN_ON_ONCE(1); + break; } return 1; } @@ -11093,9 +11107,22 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - if (!lapic_in_kernel(vcpu) && - mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + switch (mp_state->mp_state) { + case KVM_MP_STATE_UNINITIALIZED: + case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: + case KVM_MP_STATE_INIT_RECEIVED: + case KVM_MP_STATE_SIPI_RECEIVED: + if (!lapic_in_kernel(vcpu)) + goto out; + break; + + case KVM_MP_STATE_RUNNABLE: + break; + + default: goto out; + } /* * KVM_MP_STATE_INIT_RECEIVED means the processor is in @@ -11563,7 +11590,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) - goto fail_free_pio_data; + goto fail_free_mce_banks; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, @@ -11617,7 +11644,6 @@ free_wbinvd_dirty_mask: fail_free_mce_banks: kfree(vcpu->arch.mce_banks); kfree(vcpu->arch.mci_ctl2_banks); -fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: kvm_free_lapic(vcpu); @@ -12473,6 +12499,50 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } else { kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); } + + /* + * Unconditionally flush the TLBs after enabling dirty logging. + * A flush is almost always going to be necessary (see below), + * and unconditionally flushing allows the helpers to omit + * the subtly complex checks when removing write access. + * + * Do the flush outside of mmu_lock to reduce the amount of + * time mmu_lock is held. Flushing after dropping mmu_lock is + * safe as KVM only needs to guarantee the slot is fully + * write-protected before returning to userspace, i.e. before + * userspace can consume the dirty status. + * + * Flushing outside of mmu_lock requires KVM to be careful when + * making decisions based on writable status of an SPTE, e.g. a + * !writable SPTE doesn't guarantee a CPU can't perform writes. + * + * Specifically, KVM also write-protects guest page tables to + * monitor changes when using shadow paging, and must guarantee + * no CPUs can write to those pages before mmu_lock is dropped.
+ * Because CPUs may have stale TLB entries at this point, a + * !writable SPTE doesn't guarantee CPUs can't perform writes. + * + * KVM also allows making SPTES writable outside of mmu_lock, + * e.g. to allow dirty logging without taking mmu_lock. + * + * To handle these scenarios, KVM uses a separate software-only + * bit (MMU-writable) to track if a SPTE is !writable due to + * a guest page table being write-protected (KVM clears the + * MMU-writable flag when write-protecting for shadow paging). + * + * The use of MMU-writable is also the primary motivation for + * the unconditional flush. Because KVM must guarantee that a + * CPU doesn't contain stale, writable TLB entries for a + * !MMU-writable SPTE, KVM must flush if it encounters any + * MMU-writable SPTE regardless of whether the actual hardware + * writable bit was set. I.e. KVM is almost guaranteed to need + * to flush, while unconditionally flushing allows the "remove + * write access" helpers to ignore MMU-writable entirely. + * + * See is_writable_pte() for more details (the case involving + * access-tracked SPTEs is particularly relevant). + */ + kvm_arch_flush_remote_tlbs_memslot(kvm, new); } } diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index fe59b8ac4fcc..ecbfb4dd3b01 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ #include <linux/linkage.h> +#include <asm/asm.h> #include <asm/export.h> /* @@ -50,3 +51,140 @@ SYM_FUNC_START(clear_page_erms) RET SYM_FUNC_END(clear_page_erms) EXPORT_SYMBOL_GPL(clear_page_erms) + +/* + * Default clear user-space. + * Input: + * rdi destination + * rcx count + * + * Output: + * rcx: uncleared bytes or 0 if successful. + */ +SYM_FUNC_START(clear_user_original) + /* + * Copy only the lower 32 bits of size as that is enough to handle the rest bytes, + * i.e., no need for a 'q' suffix and thus a REX prefix. + */ + mov %ecx,%eax + shr $3,%rcx + jz .Lrest_bytes + + # do the qwords first + .p2align 4 +.Lqwords: + movq $0,(%rdi) + lea 8(%rdi),%rdi + dec %rcx + jnz .Lqwords + +.Lrest_bytes: + and $7, %eax + jz .Lexit + + # now do the rest bytes +.Lbytes: + movb $0,(%rdi) + inc %rdi + dec %eax + jnz .Lbytes + +.Lexit: + /* + * %rax still needs to be cleared in the exception case because this function is called + * from inline asm and the compiler expects %rax to be zero when exiting the inline asm, + * in case it might reuse it somewhere. + */ + xor %eax,%eax + RET + +.Lqwords_exception: + # convert remaining qwords back into bytes to return to caller + shl $3, %rcx + and $7, %eax + add %rax,%rcx + jmp .Lexit + +.Lbytes_exception: + mov %eax,%ecx + jmp .Lexit + + _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception) + _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception) +SYM_FUNC_END(clear_user_original) +EXPORT_SYMBOL(clear_user_original) + +/* + * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is + * present. + * Input: + * rdi destination + * rcx count + * + * Output: + * rcx: uncleared bytes or 0 if successful. 
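For readers who do not speak AT&T asm, clear_user_original above is this loop: zero eight bytes at a time, then the sub-qword tail, with the exception table entries converting a fault into "bytes not cleared". A C rendering of the fault-free path (illustrative only; the fault accounting cannot be expressed in portable C):

	static unsigned long clear_user_sketch(void *dst, unsigned long count)
	{
		unsigned long long *q = dst;
		unsigned char *b;
		unsigned long tail = count & 7;		/* the 'and $7, %eax' part */

		for (count >>= 3; count; count--)	/* the .Lqwords loop */
			*q++ = 0;

		for (b = (unsigned char *)q; tail; tail--)	/* the .Lbytes loop */
			*b++ = 0;

		return 0;	/* on a fault the asm returns the remaining count */
	}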
+ */ +SYM_FUNC_START(clear_user_rep_good) + # call the original thing for less than a cacheline + cmp $64, %rcx + jb clear_user_original + +.Lprep: + # copy lower 32-bits for rest bytes + mov %ecx, %edx + shr $3, %rcx + jz .Lrep_good_rest_bytes + +.Lrep_good_qwords: + rep stosq + +.Lrep_good_rest_bytes: + and $7, %edx + jz .Lrep_good_exit + +.Lrep_good_bytes: + mov %edx, %ecx + rep stosb + +.Lrep_good_exit: + # see .Lexit comment above + xor %eax, %eax + RET + +.Lrep_good_qwords_exception: + # convert remaining qwords back into bytes to return to caller + shl $3, %rcx + and $7, %edx + add %rdx, %rcx + jmp .Lrep_good_exit + + _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception) + _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit) +SYM_FUNC_END(clear_user_rep_good) +EXPORT_SYMBOL(clear_user_rep_good) + +/* + * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present. + * Input: + * rdi destination + * rcx count + * + * Output: + * rcx: uncleared bytes or 0 if successful. + * + */ +SYM_FUNC_START(clear_user_erms) + # call the original thing for less than a cacheline + cmp $64, %rcx + jb clear_user_original + +.Lerms_bytes: + rep stosb + +.Lerms_exit: + xorl %eax,%eax + RET + + _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit) +SYM_FUNC_END(clear_user_erms) +EXPORT_SYMBOL(clear_user_erms) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index d0d7b9bc6cad..dd8cd8831251 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -2,6 +2,7 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/errno.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> @@ -27,7 +28,7 @@ * Output: * rax original destination */ -SYM_FUNC_START(__memcpy) +SYM_TYPED_FUNC_START(__memcpy) ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ "jmp memcpy_erms", X86_FEATURE_ERMS diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index ad0139d25401..f1bb18617156 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -44,7 +44,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) * called from other contexts. 
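SYM_TYPED_FUNC_START in the memcpy_64.S hunk above attaches a kCFI type hash to the assembly symbol so that C callers which reach __memcpy through a function pointer pass the caller-side type check rather than trapping. A sketch of the call shape this exists for (plain C; the check itself is emitted by the compiler under CONFIG_CFI_CLANG):

	#include <stddef.h>

	typedef void *(*memcpy_fn)(void *dst, const void *src, size_t len);

	void *call_through_pointer(memcpy_fn fn, void *dst, const void *src, size_t n)
	{
		/* With kCFI the compiler verifies, right before this indirect
		 * call, that fn points at code carrying memcpy_fn's type hash;
		 * SYM_TYPED_FUNC_START gives the asm symbol that hash. */
		return fn(dst, src, n);
	}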
*/ pagefault_disable(); - ret = __copy_from_user_inatomic(to, from, n); + ret = raw_copy_from_user(to, from, n); pagefault_enable(); return ret; diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0ae6cf804197..6c1f8ac5e721 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -14,46 +14,6 @@ * Zero Userspace */ -unsigned long __clear_user(void __user *addr, unsigned long size) -{ - long __d0; - might_fault(); - /* no memory constraint because it doesn't change any memory gcc knows - about */ - stac(); - asm volatile( - " testq %[size8],%[size8]\n" - " jz 4f\n" - " .align 16\n" - "0: movq $0,(%[dst])\n" - " addq $8,%[dst]\n" - " decl %%ecx ; jnz 0b\n" - "4: movq %[size1],%%rcx\n" - " testl %%ecx,%%ecx\n" - " jz 2f\n" - "1: movb $0,(%[dst])\n" - " incq %[dst]\n" - " decl %%ecx ; jnz 1b\n" - "2:\n" - - _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1]) - _ASM_EXTABLE_UA(1b, 2b) - - : [size8] "=&c"(size), [dst] "=&D" (__d0) - : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); - clac(); - return size; -} -EXPORT_SYMBOL(__clear_user); - -unsigned long clear_user(void __user *to, unsigned long n) -{ - if (access_ok(to, n)) - return __clear_user(to, n); - return n; -} -EXPORT_SYMBOL(clear_user); - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f8220fd2c169..829c1409ffbd 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -4,10 +4,12 @@ KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_mem_encrypt.o := n KCOV_INSTRUMENT_mem_encrypt_amd.o := n KCOV_INSTRUMENT_mem_encrypt_identity.o := n +KCOV_INSTRUMENT_pgprot.o := n KASAN_SANITIZE_mem_encrypt.o := n KASAN_SANITIZE_mem_encrypt_amd.o := n KASAN_SANITIZE_mem_encrypt_identity.o := n +KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions # reference __initdata sections. @@ -17,6 +19,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg CFLAGS_REMOVE_mem_encrypt_amd.o = -pg CFLAGS_REMOVE_mem_encrypt_identity.o = -pg +CFLAGS_REMOVE_pgprot.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 331310c29349..60814e110a54 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -41,6 +41,59 @@ static bool ex_handler_default(const struct exception_table_entry *e, return true; } +/* + * This is the *very* rare case where we do a "load_unaligned_zeropad()" + * and it's a page crosser into a non-existent page. + * + * This happens when we optimistically load a pathname a word-at-a-time + * and the name is less than the full word and the next page is not + * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC. + * + * NOTE! The faulting address is always a 'mov mem,reg' type instruction + * of size 'long', and the exception fixup must always point to right + * after the instruction. 
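A worked example of the fixup described above, for an 8-byte load_unaligned_zeropad() that starts 2 bytes before the end of the last mapped page: offset = addr & 7 = 6, so the handler reloads the aligned word that sits entirely inside the page and shifts right by 6 * 8 bits, leaving the two mapped bytes in the low positions and zeroes above. A little-endian user-space model of that computation (assumption: 8-byte long):

	#include <string.h>

	static unsigned long zeropad_fixup(const unsigned char *page_end,
					   unsigned int offset)
	{
		unsigned long word;

		/* The aligned reload at (addr & ~mask): its last byte is the
		 * final byte of the mapped page, so it cannot fault. */
		memcpy(&word, page_end - sizeof(word), sizeof(word));

		/* Shift the wanted bytes down; what would have come from the
		 * missing page arrives as zeroes (little-endian). */
		return word >> (offset * 8);
	}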
+ */ +static bool ex_handler_zeropad(const struct exception_table_entry *e, + struct pt_regs *regs, + unsigned long fault_addr) +{ + struct insn insn; + const unsigned long mask = sizeof(long) - 1; + unsigned long offset, addr, next_ip, len; + unsigned long *reg; + + next_ip = ex_fixup_addr(e); + len = next_ip - regs->ip; + if (len > MAX_INSN_SIZE) + return false; + + if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN)) + return false; + if (insn.length != len) + return false; + + if (insn.opcode.bytes[0] != 0x8b) + return false; + if (insn.opnd_bytes != sizeof(long)) + return false; + + addr = (unsigned long) insn_get_addr_ref(&insn, regs); + if (addr == ~0ul) + return false; + + offset = addr & mask; + addr = addr & ~mask; + if (fault_addr != addr + sizeof(long)) + return false; + + reg = insn_get_modrm_reg_ptr(&insn, regs); + if (!reg) + return false; + + *reg = *(unsigned long *)addr >> (offset * 8); + return ex_handler_default(e, regs); +} + static bool ex_handler_fault(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -217,6 +270,8 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, return ex_handler_sgx(e, regs, trapnr); case EX_TYPE_UCOPY_LEN: return ex_handler_ucopy_len(e, regs, trapnr, reg, imm); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(e, regs, fault_addr); } BUG(); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fa71a5d12e87..a498ae1fbe66 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -769,6 +769,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; + /* This is a racy snapshot, but it's better than nothing. */ + int cpu = raw_smp_processor_id(); if (!unhandled_signal(tsk, SIGSEGV)) return; @@ -782,6 +784,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, print_vma_addr(KERN_CONT " in ", regs->ip); + /* + * Dump the likely CPU where the fatal segfault happened. + * This can help identify faulty hardware. 
+ */ + printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, + topology_core_id(cpu), topology_physical_package_id(cpu)); + + printk(KERN_CONT "\n"); show_opcodes(regs, loglvl); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 39c5246964a9..0fe690ebc269 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -645,7 +645,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, pages++; spin_lock(&init_mm.page_table_lock); - prot = __pgprot(pgprot_val(prot) | __PAGE_KERNEL_LARGE); + prot = __pgprot(pgprot_val(prot) | _PAGE_PSE); set_pte_init((pte_t *)pud, pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index d5ef64ddd35e..66a209f7eb86 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -62,6 +62,7 @@ static bool __read_mostly pat_bp_initialized; static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __initdata pat_force_disabled = !IS_ENABLED(CONFIG_X86_PAT); static bool __read_mostly pat_bp_enabled; static bool __read_mostly pat_cm_initialized; @@ -86,6 +87,7 @@ void pat_disable(const char *msg_reason) static int __init nopat(char *str) { pat_disable("PAT support disabled via boot option."); + pat_force_disabled = true; return 0; } early_param("nopat", nopat); @@ -272,7 +274,7 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -void init_cache_modes(void) +void __init init_cache_modes(void) { u64 pat = 0; @@ -313,6 +315,12 @@ void init_cache_modes(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } else if (!pat_force_disabled && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) { + /* + * Clearly PAT is enabled underneath. Allow pat_enabled() to + * reflect this. + */ + pat_bp_enabled = true; } __init_cache_modes(pat); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index c1f6c1c51d99..99620428ad78 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -419,7 +419,9 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) OPTIMIZER_HIDE_VAR(reg); emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); } else { - EMIT2(0xFF, 0xE0 + reg); + EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ + if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) + EMIT1(0xCC); /* int3 */ } *pprog = prog; @@ -662,7 +664,7 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, */ emit_mov_imm32(&prog, false, dst_reg, imm32_lo); } else { - /* movabsq %rax, imm64 */ + /* movabsq rax, imm64 */ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); EMIT(imm32_lo, 4); EMIT(imm32_hi, 4); @@ -1751,34 +1753,60 @@ emit_jmp: static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { - int i; + int i, j, arg_size, nr_regs; /* Store function arguments to stack. * For a function that accepts two pointers the sequence will be: * mov QWORD PTR [rbp-0x10],rdi * mov QWORD PTR [rbp-0x8],rsi */ - for (i = 0; i < min(nr_args, 6); i++) - emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]), - BPF_REG_FP, - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, - -(stack_size - i * 8)); + for (i = 0, j = 0; i < min(nr_args, 6); i++) { + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { + nr_regs = (m->arg_size[i] + 7) / 8; + arg_size = 8; + } else { + nr_regs = 1; + arg_size = m->arg_size[i]; + } + + while (nr_regs) { + emit_stx(prog, bytes_to_bpf_size(arg_size), + BPF_REG_FP, + j == 5 ? 
X86_REG_R9 : BPF_REG_1 + j, + -(stack_size - j * 8)); + nr_regs--; + j++; + } + } } static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { - int i; + int i, j, arg_size, nr_regs; /* Restore function arguments from stack. * For a function that accepts two pointers the sequence will be: * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] */ - for (i = 0; i < min(nr_args, 6); i++) - emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]), - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, - BPF_REG_FP, - -(stack_size - i * 8)); + for (i = 0, j = 0; i < min(nr_args, 6); i++) { + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { + nr_regs = (m->arg_size[i] + 7) / 8; + arg_size = 8; + } else { + nr_regs = 1; + arg_size = m->arg_size[i]; + } + + while (nr_regs) { + emit_ldx(prog, bytes_to_bpf_size(arg_size), + j == 5 ? X86_REG_R9 : BPF_REG_1 + j, + BPF_REG_FP, + -(stack_size - j * 8)); + nr_regs--; + j++; + } + } } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, @@ -1810,6 +1838,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (p->aux->sleepable) { enter = __bpf_prog_enter_sleepable; exit = __bpf_prog_exit_sleepable; + } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) { + enter = __bpf_prog_enter_struct_ops; + exit = __bpf_prog_exit_struct_ops; } else if (p->expected_attach_type == BPF_LSM_CGROUP) { enter = __bpf_prog_enter_lsm_cgroup; exit = __bpf_prog_exit_lsm_cgroup; @@ -2013,13 +2044,14 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call) + void *func_addr) { - int ret, i, nr_args = m->nr_args; + int ret, i, nr_args = m->nr_args, extra_nregs = 0; int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + void *orig_call = func_addr; u8 **branches = NULL; u8 *prog; bool save_ret; @@ -2028,6 +2060,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (nr_args > 6) return -ENOTSUPP; + for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) { + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) + extra_nregs += (m->arg_size[i] + 7) / 8 - 1; + } + if (nr_args + extra_nregs > 6) + return -ENOTSUPP; + stack_size += extra_nregs * 8; + /* Generated trampoline stack layout: * * RBP + 8 [ return address ] @@ -2040,7 +2080,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * [ ... 
] * RBP - regs_off [ reg_arg1 ] program's ctx pointer * - * RBP - args_off [ args count ] always + * RBP - args_off [ arg regs count ] always * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * @@ -2083,21 +2123,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ EMIT1(0x53); /* push rbx */ - /* Store number of arguments of the traced function: - * mov rax, nr_args + /* Store number of argument registers of the traced function: + * mov rax, nr_args + extra_nregs * mov QWORD PTR [rbp - args_off], rax */ - emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args); + emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off); if (flags & BPF_TRAMP_F_IP_ARG) { /* Store IP address of the traced function: - * mov rax, QWORD PTR [rbp + 8] - * sub rax, X86_PATCH_SIZE + * movabsq rax, func_addr * mov QWORD PTR [rbp - ip_off], rax */ - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); - EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE); + emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); } @@ -2209,7 +2247,7 @@ cleanup: return ret; } -static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) +static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf) { u8 *jg_reloc, *prog = *pprog; int pivot, err, jg_bytes = 1; @@ -2225,12 +2263,12 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a]); err = emit_cond_near_jump(&prog, /* je func */ - (void *)progs[a], prog, + (void *)progs[a], image + (prog - buf), X86_JE); if (err) return err; - emit_indirect_jump(&prog, 2 /* rdx */, prog); + emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf)); *pprog = prog; return 0; @@ -2255,7 +2293,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) jg_reloc = prog; err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */ - progs); + progs, image, buf); if (err) return err; @@ -2269,7 +2307,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes); err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */ - b, progs); + b, progs, image, buf); if (err) return err; @@ -2289,12 +2327,12 @@ static int cmp_ips(const void *a, const void *b) return 0; } -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { - u8 *prog = image; + u8 *prog = buf; sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL); - return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs); + return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } struct x64_jit_data { diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 6e598bd78eef..ebc98a68c400 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -169,7 +169,7 @@ static void __init do_add_efi_memmap(void) } /* - * Given add_efi_memmap defaults to 0 and there there is no alternative + * Given add_efi_memmap defaults to 0 and there is no alternative * e820 mechanism for soft-reserved memory, import the full EFI memory * map if soft reservations are present and enabled. 
Otherwise, the * mechanism to disable the kernel's consideration of EFI_MEMORY_SP is diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 31c634a22818..58a200dc762d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -55,6 +55,10 @@ ifdef CONFIG_RETPOLINE PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) endif +ifdef CONFIG_CFI_CLANG +PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) +endif + CFLAGS_REMOVE_purgatory.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_purgatory.o += $(PURGATORY_CFLAGS) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index e2c5b296120d..2925074b9a58 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -56,6 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "^(xen_irq_disable_direct_reloc$|" "xen_save_fl_direct_reloc$|" "VDSO|" + "__kcfi_typeid_|" "__crc_)", /* diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h index 68fd2cf526fd..f6e9f84397e7 100644 --- a/arch/x86/um/shared/sysdep/syscalls_32.h +++ b/arch/x86/um/shared/sysdep/syscalls_32.h @@ -6,10 +6,9 @@ #include <asm/unistd.h> #include <sysdep/ptrace.h> -typedef long syscall_handler_t(struct pt_regs); +typedef long syscall_handler_t(struct syscall_args); extern syscall_handler_t *sys_call_table[]; #define EXECUTE_SYSCALL(syscall, regs) \ - ((long (*)(struct syscall_args)) \ - (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) + ((*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index ac8eee093f9c..66162eafd8e8 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -65,9 +65,6 @@ static int get_free_idx(struct task_struct* task) struct thread_struct *t = &task->thread; int idx; - if (!t->arch.tls_array) - return GDT_ENTRY_TLS_MIN; - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) if (!t->arch.tls_array[idx].present) return idx + GDT_ENTRY_TLS_MIN; @@ -240,9 +237,6 @@ static int get_tls_entry(struct task_struct *task, struct user_desc *info, { struct thread_struct *t = &task->thread; - if (!t->arch.tls_array) - goto clear; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return -EINVAL; diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 8c0396fd0e6f..6fbe97c52c99 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -65,7 +65,7 @@ quiet_cmd_vdso = VDSO $@ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv +VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv -z noexecstack GCOV_PROFILE := n # diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 0ed2e487a693..9b1a58dda935 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -765,6 +765,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + static const struct trap_info zero = { }; unsigned out; trace_xen_cpu_load_idt(desc); @@ -774,7 +775,7 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); out = xen_convert_trap_info(desc, traps, false); - memset(&traps[out], 0, sizeof(traps[0])); + traps[out] = zero; xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) |