diff options
Diffstat (limited to 'arch')
126 files changed, 944 insertions, 525 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 2bb30673d8e6..ecfd3520b676 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -632,13 +632,12 @@ config HAS_LTO_CLANG def_bool y # Clang >= 11: https://github.com/ClangBuiltLinux/linux/issues/510 depends on CC_IS_CLANG && CLANG_VERSION >= 110000 && LD_IS_LLD - depends on $(success,test $(LLVM) -eq 1) depends on $(success,test $(LLVM_IAS) -eq 1) depends on $(success,$(NM) --help | head -n 1 | grep -qi llvm) depends on $(success,$(AR) --help | head -n 1 | grep -qi llvm) depends on ARCH_SUPPORTS_LTO_CLANG depends on !FTRACE_MCOUNT_USE_RECORDMCOUNT - depends on !KASAN + depends on !KASAN || KASAN_HW_TAGS depends on !GCOV_KERNEL help The compiler and Kconfig options support building with Clang's diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 853aab5ab327..5da96f5df48f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -348,6 +348,7 @@ config ARCH_EP93XX select ARM_AMBA imply ARM_PATCH_PHYS_VIRT select ARM_VIC + select GENERIC_IRQ_MULTI_HANDLER select AUTO_ZRELADDR select CLKDEV_LOOKUP select CLKSRC_MMIO diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index acb464547a54..84a1cea1f43b 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -11,6 +11,7 @@ #include <xen/xen.h> #include <xen/interface/memory.h> +#include <xen/grant_table.h> #include <xen/page.h> #include <xen/swiotlb-xen.h> @@ -109,7 +110,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, map_ops[i].status = GNTST_general_error; unmap.host_addr = map_ops[i].host_addr, unmap.handle = map_ops[i].handle; - map_ops[i].handle = ~0; + map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) unmap.dev_bus_addr = map_ops[i].dev_bus_addr; else @@ -130,7 +131,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, return 0; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -145,7 +145,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return 0; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn, unsigned long nr_pages) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1f212b47a48a..e4e1b6550115 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -810,6 +810,16 @@ config QCOM_FALKOR_ERRATUM_E1041 If unsure, say Y. +config NVIDIA_CARMEL_CNP_ERRATUM + bool "NVIDIA Carmel CNP: CNP on Carmel semantically different than ARM cores" + default y + help + If CNP is enabled on Carmel cores, non-sharable TLBIs on a core will not + invalidate shared TLB entries installed by a different core, as it would + on standard ARM cores. + + If unsure, say Y. + config SOCIONEXT_SYNQUACER_PREITS bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y @@ -1055,8 +1065,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_HUGE_PMD_SHARE - config ARCH_HAS_CACHE_LINE_SIZE def_bool y @@ -1157,8 +1165,8 @@ config XEN config FORCE_MAX_ZONEORDER int - default "14" if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE) - default "12" if (ARM64_16K_PAGES && TRANSPARENT_HUGEPAGE) + default "14" if ARM64_64K_PAGES + default "12" if ARM64_16K_PAGES default "11" help The kernel memory allocator divides physically contiguous memory @@ -1855,12 +1863,6 @@ config CMDLINE_FROM_BOOTLOADER the boot loader doesn't provide any, the default kernel command string provided in CMDLINE will be used. -config CMDLINE_EXTEND - bool "Extend bootloader kernel arguments" - help - The command-line arguments provided by the boot loader will be - appended to the default kernel command string. - config CMDLINE_FORCE bool "Always use the default kernel command string" help diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h index 93a161b3bf3f..dc52b733675d 100644 --- a/arch/arm64/include/asm/checksum.h +++ b/arch/arm64/include/asm/checksum.h @@ -37,7 +37,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) } while (--n > 0); sum += ((sum >> 32) | (sum << 32)); - return csum_fold((__force u32)(sum >> 32)); + return csum_fold((__force __wsum)(sum >> 32)); } #define ip_fast_csum ip_fast_csum diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index b77d997b173b..c40f2490cd7b 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -66,7 +66,8 @@ #define ARM64_WORKAROUND_1508412 58 #define ARM64_HAS_LDAPR 59 #define ARM64_KVM_PROTECTED_MODE 60 +#define ARM64_WORKAROUND_NVIDIA_CARMEL_CNP 61 -#define ARM64_NCAPS 61 +#define ARM64_NCAPS 62 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 22d933e9b59e..a7ab84f781f7 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -47,10 +47,10 @@ #define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 +#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 #define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 #define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 @@ -183,16 +183,16 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) extern void __kvm_flush_vm_context(void); +extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level); extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); -extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu); extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); -extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_get_gic_config(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index c0450828378b..32ae676236b6 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt); void __debug_switch_to_guest(struct kvm_vcpu *vcpu); void __debug_switch_to_host(struct kvm_vcpu *vcpu); +#ifdef __KVM_NVHE_HYPERVISOR__ +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); +#endif + void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); @@ -97,7 +102,8 @@ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ -void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); +void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c759faf7a1ff..0aabc3be9a75 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -328,6 +328,11 @@ static inline void *phys_to_virt(phys_addr_t x) #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL) +#define page_to_virt(x) ({ \ + __typeof__(x) __page = x; \ + void *__addr = __va(page_to_phys(__page)); \ + (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\ +}) #define virt_to_page(x) pfn_to_page(virt_to_pfn(x)) #else #define page_to_virt(x) ({ \ diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 70ce8c1d2b07..bd02e99b1a4c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -63,23 +63,6 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) extern u64 idmap_t0sz; extern u64 idmap_ptrs_per_pgd; -static inline bool __cpu_uses_extended_idmap(void) -{ - if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52)) - return false; - - return unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS)); -} - -/* - * True if the extended ID map requires an extra level of translation table - * to be configured. - */ -static inline bool __cpu_uses_extended_idmap_level(void) -{ - return ARM64_HW_PGTABLE_LEVELS(64 - idmap_t0sz) > CONFIG_PGTABLE_LEVELS; -} - /* * Ensure TCR.T0SZ is set to the provided value. */ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 046be789fbb4..9a65fb528110 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -66,7 +66,6 @@ extern bool arm64_use_ng_mappings; #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define PAGE_KERNEL __pgprot(PROT_NORMAL) -#define PAGE_KERNEL_TAGGED __pgprot(PROT_NORMAL_TAGGED) #define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define PAGE_KERNEL_ROX __pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e17b96d0e4b5..47027796c2f9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -486,6 +486,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) #define pgprot_device(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN) +#define pgprot_tagged(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED)) +#define pgprot_mhp pgprot_tagged /* * DMA allocations for non-coherent devices use what the Arm architecture calls * "Normal non-cacheable" memory, which permits speculation, unaligned accesses diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ca2cd75d3286..efc10e9041a0 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -251,6 +251,8 @@ unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); +asmlinkage void arm64_preempt_schedule_irq(void); + #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index dfd4edbfe360..d4a5fca984c3 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -796,6 +796,11 @@ #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX 0x7 + #ifdef CONFIG_ARM64_PA_BITS_52 #define ID_AA64MMFR0_PARANGE_MAX ID_AA64MMFR0_PARANGE_52 #else @@ -961,14 +966,17 @@ #define ID_PFR1_PROGMOD_SHIFT 0 #if defined(CONFIG_ARM64_4K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN4_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #elif defined(CONFIG_ARM64_16K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN16_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0xF #elif defined(CONFIG_ARM64_64K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN64_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #endif #define MVFR2_FPMISC_SHIFT 4 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 9f4e3b266f21..6623c99f0984 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -55,6 +55,8 @@ void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec void arch_release_task_struct(struct task_struct *tsk); +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src); #endif diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 506a1cd37973..e2c20c036442 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -526,6 +526,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { 1, 0), }, #endif +#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM + { + /* NVIDIA Carmel */ + .desc = "NVIDIA Carmel CNP erratum", + .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, + ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + }, +#endif { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 066030717a4c..2a5d9854d664 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1321,7 +1321,10 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) * may share TLB entries with a CPU stuck in the crashed * kernel. */ - if (is_kdump_kernel()) + if (is_kdump_kernel()) + return false; + + if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 77605aec25fe..51fcf99d5351 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -353,7 +353,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) * with the CLIDR_EL1 fields to avoid triggering false warnings * when there is a mismatch across the CPUs. Keep track of the * effective value of the CTR_EL0 in our internal records for - * acurate sanity check and feature enablement. + * accurate sanity check and feature enablement. */ info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index e6e284265f19..58303a9ec32c 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -64,5 +64,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + *ppos += count; + return count; } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 66b0e0b66e31..840bda1869e9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -319,7 +319,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) */ adrp x5, __idmap_text_end clz x5, x5 - cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? + cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough? b.ge 1f // .. then skip VA range extension adr_l x6, idmap_t0sz @@ -655,8 +655,10 @@ SYM_FUNC_END(__secondary_too_slow) SYM_FUNC_START(__enable_mmu) mrs x2, ID_AA64MMFR0_EL1 ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED - b.ne __no_granule_support + cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN + b.lt __no_granule_support + cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX + b.gt __no_granule_support update_early_cpu_boot_status 0, x2, x3 adrp x2, idmap_pg_dir phys_to_ttbr x1, x1 diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index dffb16682330..83f1c4b92095 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -163,33 +163,36 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init void parse_cmdline(void) +static __init const u8 *get_bootargs_cmdline(void) { - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - const u8 *prop; - void *fdt; - int node; + const u8 *prop; + void *fdt; + int node; - fdt = get_early_fdt_ptr(); - if (!fdt) - goto out; + fdt = get_early_fdt_ptr(); + if (!fdt) + return NULL; - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return NULL; - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + return NULL; - __parse_cmdline(prop, true); + return strlen(prop) ? prop : NULL; +} - if (!IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - return; - } +static __init void parse_cmdline(void) +{ + const u8 *prop = get_bootargs_cmdline(); -out: - __parse_cmdline(CONFIG_CMDLINE, true); + if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + __parse_cmdline(CONFIG_CMDLINE, true); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) + __parse_cmdline(prop, true); } /* Keep checkers quiet */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 23f1a557bd9f..5aa9ed1e9ec6 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -101,6 +101,9 @@ KVM_NVHE_ALIAS(__stop___kvm_ex_table); /* Array containing bases of nVHE per-CPU memory regions. */ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); +/* PMU available static key */ +KVM_NVHE_ALIAS(kvm_arm_pmu_available); + #endif /* CONFIG_KVM */ #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 7d2318f80955..4658fcf88c2b 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -460,7 +460,7 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline u32 armv8pmu_read_evcntr(int idx) +static inline u64 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 325c83b1a24d..6e60aa3b5ea9 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -57,6 +57,8 @@ #include <asm/processor.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> +#include <asm/switch_to.h> +#include <asm/system_misc.h> #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ad20981dfda4..d55bdfb7789c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -194,8 +194,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) #ifdef CONFIG_STACKTRACE -void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, - struct task_struct *task, struct pt_regs *regs) +noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { struct stackframe frame; @@ -203,8 +204,8 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, start_backtrace(&frame, regs->regs[29], regs->pc); else if (task == current) start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)arch_stack_walk); + (unsigned long)__builtin_frame_address(1), + (unsigned long)__builtin_return_address(0)); else start_backtrace(&frame, thread_saved_fp(task), thread_saved_pc(task)); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fc4c95dd2d26..7f06ba76698d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) last_ran = this_cpu_ptr(mmu->last_vcpu_ran); /* + * We guarantee that both TLBs and I-cache are private to each + * vcpu. If detecting that a vcpu from the same VM has + * previously run on the same physical CPU, call into the + * hypervisor code to nuke the relevant contexts. + * * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ if (*last_ran != vcpu->vcpu_id) { - kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu); + kvm_call_hyp(__kvm_flush_cpu_context, mmu); *last_ran = vcpu->vcpu_id; } diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index b0afad7a99c6..e831d3dfd50d 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -85,8 +85,10 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // If the hyp context is loaded, go straight to hyp_panic get_loaded_vcpu x0, x1 - cbz x0, hyp_panic + cbnz x0, 1f + b hyp_panic +1: // The hyp context is saved so make sure it is restored to allow // hyp_panic to run at hyp and, subsequently, panic to run in the host. // This makes use of __guest_exit to avoid duplication but sets the @@ -94,7 +96,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // current state is saved to the guest context but it will only be // accurate if the guest had been completely restored. adr_this_cpu x0, kvm_hyp_ctxt, x1 - adr x1, hyp_panic + adr_l x1, hyp_panic str x1, [x0, #CPU_XREG_OFFSET(30)] get_vcpu_ptr x1, x0 @@ -146,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Now restore the hyp regs restore_callee_saved_regs x2 - set_loaded_vcpu xzr, x1, x2 + set_loaded_vcpu xzr, x2, x3 alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 54f4860cd87c..6c1f51f25eb3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -90,15 +90,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - write_sysreg(0, pmselr_el0); - write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) { + write_sysreg(0, pmselr_el0); + write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + } write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } static inline void __deactivate_traps_common(void) { write_sysreg(0, hstr_el2); - write_sysreg(0, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) + write_sysreg(0, pmuserenr_el0); } static inline void ___activate_traps(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 91a711aa8382..f401724f12ef 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); } -void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); +} + +void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +{ __debug_switch_to_guest_common(vcpu); } -void __debug_switch_to_host(struct kvm_vcpu *vcpu) +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); +} + +void __debug_switch_to_host(struct kvm_vcpu *vcpu) +{ __debug_switch_to_host_common(vcpu); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 6585a7cbbc56..5d94584840cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -71,7 +71,8 @@ SYM_FUNC_START(__host_enter) SYM_FUNC_END(__host_enter) /* - * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); + * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + * u64 elr, u64 par); */ SYM_FUNC_START(__hyp_do_panic) /* Prepare and exit to the host's panic funciton. */ @@ -82,9 +83,11 @@ SYM_FUNC_START(__hyp_do_panic) hyp_kimg_va lr, x6 msr elr_el2, lr - /* Set the panic format string. Use the, now free, LR as scratch. */ - ldr lr, =__hyp_panic_string - hyp_kimg_va lr, x6 + mov x29, x0 + + /* Load the format string into x0 and arguments into x1-7 */ + ldr x0, =__hyp_panic_string + hyp_kimg_va x0, x6 /* Load the format arguments into x1-7. */ mov x6, x3 @@ -94,9 +97,7 @@ SYM_FUNC_START(__hyp_do_panic) mrs x5, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ - cmp x0, xzr - mov x0, lr - b.eq __host_enter_without_restoring + cbz x29, __host_enter_without_restoring b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f012f8665ecc..936328207bde 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } -static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt) +static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); - __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); + __kvm_flush_cpu_context(kern_hyp_va(mmu)); } static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) @@ -67,9 +67,9 @@ static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt) write_sysreg_el2(tmp, SYS_SCTLR); } -static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) { - cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2(); + cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) @@ -115,10 +115,10 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), - HANDLE_FUNC(__kvm_tlb_flush_local_vmid), + HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2), + HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__vgic_v3_read_vmcr), HANDLE_FUNC(__vgic_v3_write_vmcr), HANDLE_FUNC(__vgic_v3_init_lrs), diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f3d0e9eca56c..68ab6b4d5141 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); __sysreg_save_state_nvhe(host_ctxt); + /* + * We must flush and disable the SPE buffer for nVHE, as + * the translation regime(EL1&0) is going to be loaded with + * that of the guest. And we must do this before we change the + * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and + * before we load guest Stage1. + */ + __debug_save_host_buffers_nvhe(vcpu); __adjust_pc(vcpu); @@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) __fpsimd_save_fpexc32(vcpu); + __debug_switch_to_host(vcpu); /* * This must come after restoring the host sysregs, since a non-VHE * system may enable SPE here and make use of the TTBRs. */ - __debug_switch_to_host(vcpu); + __debug_restore_host_buffers_nvhe(vcpu); if (pmu_switch_needed) __pmu_switch_to_host(host_ctxt); @@ -257,7 +266,6 @@ void __noreturn hyp_panic(void) u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg_par(); - bool restore_host = true; struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; @@ -271,7 +279,7 @@ void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } - __hyp_do_panic(restore_host, spsr, elr, par); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index fbde89a2c6e8..229b06748c20 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4d177ce1d536..926fc07074f5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -223,6 +223,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; if (!table) { + data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); data->addr += kvm_granule_size(level); goto out; } diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 80406f463c28..ee3682b9873c 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -405,9 +405,45 @@ void __vgic_v3_init_lrs(void) __gic_v3_set_lr(0, i); } -u64 __vgic_v3_get_ich_vtr_el2(void) +/* + * Return the GIC CPU configuration: + * - [31:0] ICH_VTR_EL2 + * - [62:32] RES0 + * - [63] MMIO (GICv2) capable + */ +u64 __vgic_v3_get_gic_config(void) { - return read_gicreg(ICH_VTR_EL2); + u64 val, sre = read_gicreg(ICC_SRE_EL1); + unsigned long flags = 0; + + /* + * To check whether we have a MMIO-based (GICv2 compatible) + * CPU interface, we need to disable the system register + * view. To do that safely, we have to prevent any interrupt + * from firing (which would be deadly). + * + * Note that this only makes sense on VHE, as interrupts are + * already masked for nVHE as part of the exception entry to + * EL2. + */ + if (has_vhe()) + flags = local_daif_save(); + + write_gicreg(0, ICC_SRE_EL1); + isb(); + + val = read_gicreg(ICC_SRE_EL1); + + write_gicreg(sre, ICC_SRE_EL1); + isb(); + + if (has_vhe()) + local_daif_restore(flags); + + val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); + val |= read_gicreg(ICH_VTR_EL2); + + return val; } u64 __vgic_v3_read_vmcr(void) diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index fd7895945bbc..66f17349f0c3 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 77cb2d28f2a4..8711894db8c2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1312,8 +1312,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ - if (memslot->base_gfn + memslot->npages >= - (kvm_phys_size(kvm) >> PAGE_SHIFT)) + if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; mmap_read_lock(current->mm); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index d45b8b9a4415..739164324afe 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -11,6 +11,8 @@ #include <asm/kvm_emulate.h> +DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + static int kvm_is_in_guest(void) { return kvm_get_running_vcpu() != NULL; @@ -48,6 +50,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { + /* + * Check if HW_PERF_EVENTS are supported by checking the number of + * hardware performance counters. This could ensure the presence of + * a physical PMU and CONFIG_PERF_EVENT is selected. + */ + if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0) + static_branch_enable(&kvm_arm_pmu_available); + return perf_register_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e9ec08b0b070..e32c6e139a09 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -823,16 +823,6 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return val & mask; } -bool kvm_arm_support_pmu_v3(void) -{ - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - return (perf_num_counters() > 0); -} - int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_has_pmu(vcpu)) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 47f3f035f3ea..bd354cd45d28 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -311,23 +311,24 @@ int kvm_set_ipa_limit(void) } switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { - default: - case 1: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; - case 0: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT: kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); break; - case 2: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN ... ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX: kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); break; + default: + kvm_err("Unsupported value for TGRAN_2, giving up\n"); + return -EINVAL; } kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); - WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, - "KVM IPA Size Limit (%d bits) is smaller than default size\n", - kvm_ipa_limit); - kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); + kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, + ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? + " (Reduced IPA size, limited VM/VMM compatibility)" : "")); return 0; } @@ -356,6 +357,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } } mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 52915b342351..6f530925a231 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -574,9 +574,13 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); */ int vgic_v3_probe(const struct gic_kvm_info *info) { - u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); + u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); + bool has_v2; int ret; + has_v2 = ich_vtr_el2 >> 63; + ich_vtr_el2 = (u32)ich_vtr_el2; + /* * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... @@ -594,13 +598,15 @@ int vgic_v3_probe(const struct gic_kvm_info *info) gicv4_enable ? "en" : "dis"); } + kvm_vgic_global_state.vcpu_base = 0; + if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); - kvm_vgic_global_state.vcpu_base = 0; + } else if (!has_v2) { + pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); - kvm_vgic_global_state.vcpu_base = 0; } else { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0ace5e68efba..3685e12aba9b 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -219,17 +219,40 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) int pfn_valid(unsigned long pfn) { - phys_addr_t addr = pfn << PAGE_SHIFT; + phys_addr_t addr = PFN_PHYS(pfn); - if ((addr >> PAGE_SHIFT) != pfn) + /* + * Ensure the upper PAGE_SHIFT bits are clear in the + * pfn. Else it might lead to false positives when + * some of the upper bits are set, but the lower bits + * match a valid pfn. + */ + if (PHYS_PFN(addr) != pfn) return 0; #ifdef CONFIG_SPARSEMEM +{ + struct mem_section *ms; + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - if (!valid_section(__pfn_to_section(pfn))) + ms = __pfn_to_section(pfn); + if (!valid_section(ms)) return 0; + + /* + * ZONE_DEVICE memory does not have the memblock entries. + * memblock_is_map_memory() check for ZONE_DEVICE based + * addresses will always fail. Even the normal hotplugged + * memory will never have MEMBLOCK_NOMAP flag set in their + * memblock entries. Skip memblock search for all non early + * memory sections covering all of hotplug memory including + * both normal and ZONE_DEVICE based. + */ + if (!early_section(ms)) + return pfn_section_valid(ms, pfn); +} #endif return memblock_is_map_memory(addr); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3802cfbdd20d..5d9550fdb9cf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -40,7 +40,7 @@ #define NO_BLOCK_MAPPINGS BIT(0) #define NO_CONT_MAPPINGS BIT(1) -u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN); u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; u64 __section(".mmuoff.data.write") vabits_actual; @@ -512,7 +512,8 @@ static void __init map_mem(pgd_t *pgdp) * if MTE is present. Otherwise, it has the same attributes as * PAGE_KERNEL. */ - __map_memblock(pgdp, start, end, PAGE_KERNEL_TAGGED, flags); + __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL), + flags); } /* @@ -1447,6 +1448,22 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; + u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + u64 end_linear_pa = __pa(PAGE_END - 1); + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + /* + * Check for a wrap, it is possible because of randomized linear + * mapping the start physical address is actually bigger than + * the end physical address. In this case set start to zero + * because [0, end_linear_pa] range must still be able to cover + * all addressable physical addresses. + */ + if (start_linear_pa > end_linear_pa) + start_linear_pa = 0; + } + + WARN_ON(start_linear_pa > end_linear_pa); /* * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] @@ -1454,8 +1471,9 @@ struct range arch_get_mappable_range(void) * range which can be mapped inside this linear mapping range, must * also be derived from its end points. */ - mhp_range.start = __pa(_PAGE_OFFSET(vabits_actual)); - mhp_range.end = __pa(PAGE_END - 1); + mhp_range.start = start_linear_pa; + mhp_range.end = end_linear_pa; + return mhp_range; } diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index ae2b1c7b3b5c..ef2bb9bd9605 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -9,7 +9,7 @@ int arch_check_ftrace_location(struct kprobe *p) return 0; } -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h index 6c6f16e409a8..0d23c0049301 100644 --- a/arch/ia64/include/asm/syscall.h +++ b/arch/ia64/include/asm/syscall.h @@ -32,7 +32,7 @@ static inline void syscall_rollback(struct task_struct *task, static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { - return regs->r10 == -1 ? regs->r8:0; + return regs->r10 == -1 ? -regs->r8:0; } static inline long syscall_get_return_value(struct task_struct *task, diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 8b5b8e6bc9d9..dd5bfed52031 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -59,7 +59,7 @@ show_##name(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ u32 cpu=dev->id; \ - return sprintf(buf, "%lx\n", name[cpu]); \ + return sprintf(buf, "%llx\n", name[cpu]); \ } #define store(name) \ @@ -86,9 +86,9 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu); - printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]); - printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]); - printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n", + printk(KERN_DEBUG "err_type_info=%llx,\n", err_type_info[cpu]); + printk(KERN_DEBUG "err_struct_info=%llx,\n", err_struct_info[cpu]); + printk(KERN_DEBUG "err_data_buffer=%llx, %llx, %llx.\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -117,8 +117,8 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]); - printk(KERN_DEBUG "capabilities=%lx,\n", capabilities[cpu]); - printk(KERN_DEBUG "resources=%lx\n", resources[cpu]); + printk(KERN_DEBUG "capabilities=%llx,\n", capabilities[cpu]); + printk(KERN_DEBUG "resources=%llx\n", resources[cpu]); #endif return size; } @@ -131,7 +131,7 @@ show_virtual_to_phys(struct device *dev, struct device_attribute *attr, char *buf) { unsigned int cpu=dev->id; - return sprintf(buf, "%lx\n", phys_addr[cpu]); + return sprintf(buf, "%llx\n", phys_addr[cpu]); } static ssize_t @@ -145,7 +145,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG - printk("Virtual address %lx is not existing.\n",virt_addr); + printk("Virtual address %llx is not existing.\n", virt_addr); #endif return -EINVAL; } @@ -163,7 +163,7 @@ show_err_data_buffer(struct device *dev, { unsigned int cpu=dev->id; - return sprintf(buf, "%lx, %lx, %lx\n", + return sprintf(buf, "%llx, %llx, %llx\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -178,13 +178,13 @@ store_err_data_buffer(struct device *dev, int ret; #ifdef ERR_INJ_DEBUG - printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n", + printk("write err_data_buffer=[%llx,%llx,%llx] on cpu%d\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3, cpu); #endif - ret=sscanf(buf, "%lx, %lx, %lx", + ret = sscanf(buf, "%llx, %llx, %llx", &err_data_buffer[cpu].data1, &err_data_buffer[cpu].data2, &err_data_buffer[cpu].data3); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index d4cae2fc69ca..adf6521525f4 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1824,7 +1824,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = (void *)__get_free_pages(GFP_KERNEL, + data = (void *)__get_free_pages(GFP_ATOMIC, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index c3490ee2daa5..e14f5653393a 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -2013,27 +2013,39 @@ static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data) { struct syscall_get_set_args *args = data; struct pt_regs *pt = args->regs; - unsigned long *krbs, cfm, ndirty; + unsigned long *krbs, cfm, ndirty, nlocals, nouts; int i, count; if (unw_unwind_to_user(info) < 0) return; + /* + * We get here via a few paths: + * - break instruction: cfm is shared with caller. + * syscall args are in out= regs, locals are non-empty. + * - epsinstruction: cfm is set by br.call + * locals don't exist. + * + * For both cases argguments are reachable in cfm.sof - cfm.sol. + * CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ] + */ cfm = pt->cr_ifs; + nlocals = (cfm >> 7) & 0x7f; /* aka sol */ + nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */ krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8; ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19)); count = 0; if (in_syscall(pt)) - count = min_t(int, args->n, cfm & 0x7f); + count = min_t(int, args->n, nouts); + /* Iterate over outs. */ for (i = 0; i < count; i++) { + int j = ndirty + nlocals + i + args->i; if (args->rw) - *ia64_rse_skip_regs(krbs, ndirty + i + args->i) = - args->args[i]; + *ia64_rse_skip_regs(krbs, j) = args->args[i]; else - args->args[i] = *ia64_rse_skip_regs(krbs, - ndirty + i + args->i); + args->args[i] = *ia64_rse_skip_regs(krbs, j); } if (!args->rw) { diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 1234834cc4c4..1f98947fe715 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -176,7 +176,7 @@ SECTIONS .fill : { FILL(0); BYTE(0); - . = ALIGN(8); + STRUCT_ALIGN(); } __appended_dtb = .; /* leave space for appended DTB */ diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index eacc9102c251..f1d029bf906e 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -73,9 +73,10 @@ void __patch_exception(int exc, unsigned long addr); #endif #define OP_RT_RA_MASK 0xffff0000UL -#define LIS_R2 0x3c020000UL -#define ADDIS_R2_R12 0x3c4c0000UL -#define ADDI_R2_R2 0x38420000UL +#define LIS_R2 (PPC_INST_ADDIS | __PPC_RT(R2)) +#define ADDIS_R2_R12 (PPC_INST_ADDIS | __PPC_RT(R2) | __PPC_RA(R12)) +#define ADDI_R2_R2 (PPC_INST_ADDI | __PPC_RT(R2) | __PPC_RA(R2)) + static inline unsigned long ppc_function_entry(void *func) { diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h index 7897d16e0990..727d4b321937 100644 --- a/arch/powerpc/include/asm/cpu_has_feature.h +++ b/arch/powerpc/include/asm/cpu_has_feature.h @@ -7,7 +7,7 @@ #include <linux/bug.h> #include <asm/cputable.h> -static inline bool early_cpu_has_feature(unsigned long feature) +static __always_inline bool early_cpu_has_feature(unsigned long feature) { return !!((CPU_FTRS_ALWAYS & feature) || (CPU_FTRS_POSSIBLE & cur_cpu_spec->cpu_features & feature)); @@ -46,7 +46,7 @@ static __always_inline bool cpu_has_feature(unsigned long feature) return static_branch_likely(&cpu_feature_keys[i]); } #else -static inline bool cpu_has_feature(unsigned long feature) +static __always_inline bool cpu_has_feature(unsigned long feature) { return early_cpu_has_feature(feature); } diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index aedfba29e43a..e8d09a841373 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -410,7 +410,6 @@ DECLARE_INTERRUPT_HANDLER(altivec_assist_exception); DECLARE_INTERRUPT_HANDLER(CacheLockingException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointRoundException); -DECLARE_INTERRUPT_HANDLER(unrecoverable_exception); DECLARE_INTERRUPT_HANDLER(WatchdogException); DECLARE_INTERRUPT_HANDLER(kernel_bad_stack); @@ -437,6 +436,8 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode); DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException); +void unrecoverable_exception(struct pt_regs *regs); + void replay_system_reset(void); void replay_soft_interrupts(void); diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 975ba260006a..1499e928ea6a 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -195,7 +195,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) #define TRAP_FLAGS_MASK 0x11 #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) #define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap |= 1) +#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) #endif #define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs)) #define NV_REG_POISON 0xdeadbeefdeadbeefUL @@ -210,7 +210,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) #define TRAP_FLAGS_MASK 0x1F #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) #define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap |= 1) +#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) #define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0) #define IS_MCHECK_EXC(regs) (((regs)->trap & 4) != 0) #define IS_DEBUG_EXC(regs) (((regs)->trap & 8) != 0) diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index fdab93428372..9d1fbd8be1c7 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -71,6 +71,16 @@ static inline void disable_kernel_vsx(void) { msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); } +#else +static inline void enable_kernel_vsx(void) +{ + BUILD_BUG(); +} + +static inline void disable_kernel_vsx(void) +{ + BUILD_BUG(); +} #endif #ifdef CONFIG_SPE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 60d3051a8bc8..8082b690e874 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -466,7 +466,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real) ld r10,PACAKMSR(r13) /* get MSR value for kernel */ /* MSR[RI] is clear iff using SRR regs */ - .if IHSRR == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION xori r10,r10,MSR_RI END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 2ef3c4051bb9..c475a229a42a 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -436,7 +436,6 @@ again: return ret; } -void unrecoverable_exception(struct pt_regs *regs); void preempt_schedule_irq(void); notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 1583fd1c6010..a44a30b0688c 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -2170,7 +2170,7 @@ DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException) * in the MSR is 0. This indicates that SRR0/1 are live, and that * we therefore lost state by taking this exception. */ -DEFINE_INTERRUPT_HANDLER(unrecoverable_exception) +void unrecoverable_exception(struct pt_regs *regs) { pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", regs->trap, regs->nip, regs->msr); diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index a6e29f880e0e..d21d08140a5e 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -65,3 +65,14 @@ V_FUNCTION_END(__kernel_clock_getres) V_FUNCTION_BEGIN(__kernel_time) cvdso_call_time __c_kernel_time V_FUNCTION_END(__kernel_time) + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ +_GLOBAL(_restgpr_31_x) +_GLOBAL(_rest32gpr_31_x) + lwz r0,4(r11) + lwz r31,-4(r11) + mtlr r0 + mr r1,r11 + blr diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 85d626b8ce5e..87d7b52f278f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -93,7 +93,6 @@ config RISCV select PCI_MSI if PCI select RISCV_INTC select RISCV_TIMER if RISCV_SBI - select SPARSEMEM_STATIC if 32BIT select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK @@ -154,7 +153,8 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y depends on MMU - select SPARSEMEM_VMEMMAP_ENABLE + select SPARSEMEM_STATIC if 32BIT && SPARSMEM + select SPARSEMEM_VMEMMAP_ENABLE if 64BIT config ARCH_SELECT_MEMORY_MODEL def_bool ARCH_SPARSEMEM_ENABLE diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 7efcece8896c..e1b2690b6e45 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -31,6 +31,8 @@ config SOC_CANAAN select SIFIVE_PLIC select ARCH_HAS_RESET_CONTROLLER select PINCTRL + select COMMON_CLK + select COMMON_CLK_K210 help This enables support for Canaan Kendryte K210 SoC platform hardware. diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index 27e005fca584..2a652b0c987d 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -9,4 +9,20 @@ long long __lshrti3(long long a, int b); long long __ashrti3(long long a, int b); long long __ashlti3(long long a, int b); + +#define DECLARE_DO_ERROR_INFO(name) asmlinkage void name(struct pt_regs *regs) + +DECLARE_DO_ERROR_INFO(do_trap_unknown); +DECLARE_DO_ERROR_INFO(do_trap_insn_misaligned); +DECLARE_DO_ERROR_INFO(do_trap_insn_fault); +DECLARE_DO_ERROR_INFO(do_trap_insn_illegal); +DECLARE_DO_ERROR_INFO(do_trap_load_fault); +DECLARE_DO_ERROR_INFO(do_trap_load_misaligned); +DECLARE_DO_ERROR_INFO(do_trap_store_misaligned); +DECLARE_DO_ERROR_INFO(do_trap_store_fault); +DECLARE_DO_ERROR_INFO(do_trap_ecall_u); +DECLARE_DO_ERROR_INFO(do_trap_ecall_s); +DECLARE_DO_ERROR_INFO(do_trap_ecall_m); +DECLARE_DO_ERROR_INFO(do_trap_break); + #endif /* _ASM_RISCV_PROTOTYPES_H */ diff --git a/arch/riscv/include/asm/irq.h b/arch/riscv/include/asm/irq.h index 9807ad164015..e4c435509983 100644 --- a/arch/riscv/include/asm/irq.h +++ b/arch/riscv/include/asm/irq.h @@ -12,4 +12,6 @@ #include <asm-generic/irq.h> +extern void __init init_IRQ(void); + #endif /* _ASM_RISCV_IRQ_H */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 3a240037bde2..021ed64ee608 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -71,6 +71,7 @@ int riscv_of_processor_hartid(struct device_node *node); int riscv_of_parent_hartid(struct device_node *node); extern void riscv_fill_hwcap(void); +extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index cb4abb639e8d..09ad4e923510 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -119,6 +119,11 @@ extern int regs_query_register_offset(const char *name); extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer); +int do_syscall_trace_enter(struct pt_regs *regs); +void do_syscall_trace_exit(struct pt_regs *regs); + /** * regs_get_register() - get register value from its offset * @regs: pt_regs from which register value is gotten diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 99895d9c3bdd..d7027411dde8 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -51,10 +51,10 @@ enum sbi_ext_rfence_fid { SBI_EXT_RFENCE_REMOTE_FENCE_I = 0, SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, - SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID, - SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, + SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, + SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, }; enum sbi_ext_hsm_fid { diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h index 81de51e6aa32..507cae273bc6 100644 --- a/arch/riscv/include/asm/timex.h +++ b/arch/riscv/include/asm/timex.h @@ -88,4 +88,6 @@ static inline int read_current_timer(unsigned long *timer_val) return 0; } +extern void time_init(void); + #endif /* _ASM_RISCV_TIMEX_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 3dc0abde988a..647a47f5484a 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -8,6 +8,7 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) endif +CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) extra-y += head.o extra-y += vmlinux.lds diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index e6372490aa0b..17ca5e923bb0 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -2,39 +2,41 @@ #include <linux/kprobes.h> -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe *p; + struct pt_regs *regs; struct kprobe_ctlblk *kcb; p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) return; + regs = ftrace_get_regs(fregs); kcb = get_kprobe_ctlblk(); if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { - unsigned long orig_ip = instruction_pointer(&(regs->regs)); + unsigned long orig_ip = instruction_pointer(regs); - instruction_pointer_set(&(regs->regs), ip); + instruction_pointer_set(regs, ip); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, &(regs->regs))) { + if (!p->pre_handler || !p->pre_handler(p, regs)) { /* * Emulate singlestep (and also recover regs->pc) * as if there is a nop */ - instruction_pointer_set(&(regs->regs), + instruction_pointer_set(regs, (unsigned long)p->addr + MCOUNT_INSN_SIZE); if (unlikely(p->post_handler)) { kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, &(regs->regs), 0); + p->post_handler(p, regs, 0); } - instruction_pointer_set(&(regs->regs), orig_ip); + instruction_pointer_set(regs, orig_ip); } /* diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index a2ec18662fee..7e2c78e2ca6b 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -256,8 +256,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) * normal page fault. */ regs->epc = (unsigned long) cur->addr; - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 6f728e731bed..f9cd57c9c67d 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -10,6 +10,7 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/tick.h> #include <linux/ptrace.h> diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index f4a7db3d309e..d3bf756321a5 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -116,7 +116,7 @@ void sbi_clear_ipi(void) EXPORT_SYMBOL(sbi_clear_ipi); /** - * sbi_set_timer_v01() - Program the timer for next timer event. + * __sbi_set_timer_v01() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. * * Return: None diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index e85bacff1b50..f8f15332caa2 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -147,7 +147,8 @@ static void __init init_resources(void) bss_res.end = __pa_symbol(__bss_stop) - 1; bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt) * sizeof(*mem_res); + /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */ + mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt + 1) * sizeof(*mem_res); mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); if (!mem_res) panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 8a5cf99c0776..1b432264f7ef 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <asm/sbi.h> #include <asm/processor.h> +#include <asm/timex.h> unsigned long riscv_timebase; EXPORT_SYMBOL_GPL(riscv_timebase); diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 3ed2c23601a0..0879b5df11b9 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/irq.h> +#include <asm/asm-prototypes.h> #include <asm/bug.h> #include <asm/processor.h> #include <asm/ptrace.h> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 3fc18f469efb..4f85c6d0ddf8 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -155,7 +155,7 @@ static void __init kasan_populate(void *start, void *end) memset(start, KASAN_SHADOW_INIT, end - start); } -void __init kasan_shallow_populate(void *start, void *end) +static void __init kasan_shallow_populate(void *start, void *end) { unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); @@ -187,6 +187,8 @@ void __init kasan_shallow_populate(void *start, void *end) } vaddr += PAGE_SIZE; } + + local_flush_tlb_all(); } void __init kasan_init(void) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 02056b024091..dc0b69058ac4 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -275,9 +275,9 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_IP_NF_IPTABLES=m @@ -298,7 +298,6 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m @@ -481,7 +480,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set @@ -581,7 +579,6 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m -# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -635,6 +632,7 @@ CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m @@ -714,12 +712,8 @@ CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_BLAKE2S=m CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -731,7 +725,6 @@ CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_SM4=m @@ -796,12 +789,9 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y CONFIG_SLUB_DEBUG_ON=y CONFIG_SLUB_STATS=y -CONFIG_DEBUG_KMEMLEAK=y -CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m @@ -838,6 +828,7 @@ CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y CONFIG_FTRACE_STARTUP_TEST=y # CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_DEBUG_ENTRY=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y @@ -861,4 +852,3 @@ CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_DEBUG_ENTRY=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index bac721a501da..320379da96d9 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -266,9 +266,9 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_IP_NF_IPTABLES=m @@ -289,7 +289,6 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m @@ -473,7 +472,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set # CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set @@ -573,7 +571,6 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m -# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -623,6 +620,7 @@ CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m @@ -703,12 +701,8 @@ CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_BLAKE2S=m CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -720,7 +714,6 @@ CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_SM4=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index acf982a2ae4c..76123a4b26ab 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -26,7 +26,6 @@ CONFIG_CRASH_DUMP=y # CONFIG_SECCOMP is not set # CONFIG_GCC_PLUGINS is not set CONFIG_PARTITION_ADVANCED=y -CONFIG_IBM_PARTITION=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_COMPACTION is not set # CONFIG_MIGRATION is not set @@ -61,11 +60,9 @@ CONFIG_RAW_DRIVER=y # CONFIG_HID is not set # CONFIG_VIRTIO_MENU is not set # CONFIG_VHOST_MENU is not set -# CONFIG_SURFACE_PLATFORMS is not set # CONFIG_IOMMU_SUPPORT is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set -CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_LSM="yama,loadpin,safesetid,integrity" diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index b04f6a794cdf..5cea629c548e 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -14,12 +14,12 @@ struct s390_idle_data { seqcount_t seqcount; - unsigned long long idle_count; - unsigned long long idle_time; - unsigned long long clock_idle_enter; - unsigned long long clock_idle_exit; - unsigned long long timer_idle_enter; - unsigned long long timer_idle_exit; + unsigned long idle_count; + unsigned long idle_time; + unsigned long clock_idle_enter; + unsigned long clock_idle_exit; + unsigned long timer_idle_enter; + unsigned long timer_idle_exit; unsigned long mt_cycles_enter[8]; }; diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 053fe8b8dec7..a75d94a9bcb2 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -202,7 +202,7 @@ extern unsigned int s390_pci_no_rid; ----------------------------------------------------------------------------- */ /* Base stuff */ int zpci_create_device(u32 fid, u32 fh, enum zpci_state state); -void zpci_remove_device(struct zpci_dev *zdev); +void zpci_remove_device(struct zpci_dev *zdev, bool set_error); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index c4e23e925665..f6326c6d2abe 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -98,10 +98,10 @@ extern unsigned char ptff_function_mask[16]; /* Query TOD offset result */ struct ptff_qto { - unsigned long long physical_clock; - unsigned long long tod_offset; - unsigned long long logical_tod_offset; - unsigned long long tod_epoch_difference; + unsigned long physical_clock; + unsigned long tod_offset; + unsigned long logical_tod_offset; + unsigned long tod_epoch_difference; } __packed; static inline int ptff_query(unsigned int nr) @@ -151,9 +151,9 @@ struct ptff_qui { rc; \ }) -static inline unsigned long long local_tick_disable(void) +static inline unsigned long local_tick_disable(void) { - unsigned long long old; + unsigned long old; old = S390_lowcore.clock_comparator; S390_lowcore.clock_comparator = clock_comparator_max; @@ -161,7 +161,7 @@ static inline unsigned long long local_tick_disable(void) return old; } -static inline void local_tick_enable(unsigned long long comp) +static inline void local_tick_enable(unsigned long comp) { S390_lowcore.clock_comparator = comp; set_clock_comparator(S390_lowcore.clock_comparator); @@ -169,9 +169,9 @@ static inline void local_tick_enable(unsigned long long comp) #define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ -typedef unsigned long long cycles_t; +typedef unsigned long cycles_t; -static inline unsigned long long get_tod_clock(void) +static inline unsigned long get_tod_clock(void) { union tod_clock clk; @@ -179,10 +179,10 @@ static inline unsigned long long get_tod_clock(void) return clk.tod; } -static inline unsigned long long get_tod_clock_fast(void) +static inline unsigned long get_tod_clock_fast(void) { #ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - unsigned long long clk; + unsigned long clk; asm volatile("stckf %0" : "=Q" (clk) : : "cc"); return clk; @@ -208,9 +208,9 @@ extern union tod_clock tod_clock_base; * Therefore preemption must be disabled, otherwise the returned * value is not guaranteed to be monotonic. */ -static inline unsigned long long get_tod_clock_monotonic(void) +static inline unsigned long get_tod_clock_monotonic(void) { - unsigned long long tod; + unsigned long tod; preempt_disable_notrace(); tod = get_tod_clock() - tod_clock_base.tod; @@ -237,7 +237,7 @@ static inline unsigned long long get_tod_clock_monotonic(void) * -> ns = (th * 125) + ((tl * 125) >> 9); * */ -static inline unsigned long long tod_to_ns(unsigned long long todval) +static inline unsigned long tod_to_ns(unsigned long todval) { return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } @@ -249,10 +249,10 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) * * Returns: true if a is later than b */ -static inline int tod_after(unsigned long long a, unsigned long long b) +static inline int tod_after(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a > (long long) b; + return (long) a > (long) b; return a > b; } @@ -263,10 +263,10 @@ static inline int tod_after(unsigned long long a, unsigned long long b) * * Returns: true if a is later than b */ -static inline int tod_after_eq(unsigned long long a, unsigned long long b) +static inline int tod_after_eq(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a >= (long long) b; + return (long) a >= (long) b; return a >= b; } diff --git a/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h b/arch/s390/include/uapi/asm/hwctrset.h index 3d8284b95f87..3d8284b95f87 100644 --- a/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h +++ b/arch/s390/include/uapi/asm/hwctrset.h diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 812073ea073e..4bf1ee293f2b 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -47,7 +47,7 @@ void account_idle_time_irq(void) void arch_cpu_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - unsigned long long idle_time; + unsigned long idle_time; unsigned long psw_mask; /* Wait for external, I/O or machine check interrupt. */ @@ -73,7 +73,7 @@ static ssize_t show_idle_count(struct device *dev, struct device_attribute *attr, char *buf) { struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned long long idle_count; + unsigned long idle_count; unsigned int seq; do { @@ -82,14 +82,14 @@ static ssize_t show_idle_count(struct device *dev, if (READ_ONCE(idle->clock_idle_enter)) idle_count++; } while (read_seqcount_retry(&idle->seqcount, seq)); - return sprintf(buf, "%llu\n", idle_count); + return sprintf(buf, "%lu\n", idle_count); } DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); static ssize_t show_idle_time(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned long long now, idle_time, idle_enter, idle_exit, in_idle; + unsigned long now, idle_time, idle_enter, idle_exit, in_idle; struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); unsigned int seq; @@ -109,14 +109,14 @@ static ssize_t show_idle_time(struct device *dev, } } idle_time += in_idle; - return sprintf(buf, "%llu\n", idle_time >> 12); + return sprintf(buf, "%lu\n", idle_time >> 12); } DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); u64 arch_cpu_idle_time(int cpu) { struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); - unsigned long long now, idle_enter, idle_exit, in_idle; + unsigned long now, idle_enter, idle_exit, in_idle; unsigned int seq; do { diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 0eb1d1cc53a8..b3beef64d3d4 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -269,7 +269,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) case CPUMF_CTR_SET_MAX: /* The counter could not be associated to a counter set */ return -EINVAL; - }; + } /* Initialize for using the CPU-measurement counter facility */ if (!atomic_inc_not_zero(&num_events)) { diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index db4877bbb9aa..2e3e7edbe3a0 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -26,12 +26,10 @@ #include <asm/timex.h> #include <asm/debug.h> -#include <asm/perf_cpum_cf_diag.h> +#include <asm/hwctrset.h> #define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ -#define CF_DIAG_MIN_INTERVAL 60 /* Minimum counter set read */ /* interval in seconds */ -static unsigned long cf_diag_interval = CF_DIAG_MIN_INTERVAL; static unsigned int cf_diag_cpu_speed; static debug_info_t *cf_diag_dbg; @@ -729,7 +727,6 @@ static DEFINE_MUTEX(cf_diag_ctrset_mutex); static struct cf_diag_ctrset { unsigned long ctrset; /* Bit mask of counter set to read */ cpumask_t mask; /* CPU mask to read from */ - time64_t lastread; /* Epoch counter set last read */ } cf_diag_ctrset; static void cf_diag_ctrset_clear(void) @@ -866,27 +863,16 @@ static int cf_diag_all_read(unsigned long arg) { struct cf_diag_call_on_cpu_parm p; cpumask_var_t mask; - time64_t now; - int rc = 0; + int rc; debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - now = ktime_get_seconds(); - if (cf_diag_ctrset.lastread + cf_diag_interval > now) { - debug_sprintf_event(cf_diag_dbg, 5, "%s now %lld " - " lastread %lld\n", __func__, now, - cf_diag_ctrset.lastread); - rc = -EAGAIN; - goto out; - } else { - cf_diag_ctrset.lastread = now; - } + p.sets = cf_diag_ctrset.ctrset; cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1); rc = cf_diag_all_copy(arg, mask); -out: free_cpumask_var(mask); debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc); return rc; @@ -982,7 +968,7 @@ static int cf_diag_all_start(void) */ static size_t cf_diag_needspace(unsigned int sets) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); size_t bytes = 0; int i; @@ -998,6 +984,7 @@ static size_t cf_diag_needspace(unsigned int sets) sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__, bytes); + put_cpu_ptr(&cpu_cf_events); return bytes; } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 06bcfa636638..165da961f901 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -68,10 +68,10 @@ EXPORT_SYMBOL(s390_epoch_delta_notifier); unsigned char ptff_function_mask[16]; -static unsigned long long lpar_offset; -static unsigned long long initial_leap_seconds; -static unsigned long long tod_steering_end; -static long long tod_steering_delta; +static unsigned long lpar_offset; +static unsigned long initial_leap_seconds; +static unsigned long tod_steering_end; +static long tod_steering_delta; /* * Get time offsets with PTFF @@ -96,7 +96,7 @@ void __init time_early_init(void) /* get initial leap seconds */ if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) - initial_leap_seconds = (unsigned long long) + initial_leap_seconds = (unsigned long) ((long) qui.old_leap * 4096000000L); } @@ -222,7 +222,7 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, static u64 read_tod_clock(struct clocksource *cs) { - unsigned long long now, adj; + unsigned long now, adj; preempt_disable(); /* protect from changes to steering parameters */ now = get_tod_clock(); @@ -362,7 +362,7 @@ static inline int check_sync_clock(void) * Apply clock delta to the global data structures. * This is called once on the CPU that performed the clock sync. */ -static void clock_sync_global(unsigned long long delta) +static void clock_sync_global(unsigned long delta) { unsigned long now, adj; struct ptff_qto qto; @@ -378,7 +378,7 @@ static void clock_sync_global(unsigned long long delta) -(adj >> 15) : (adj >> 15); tod_steering_delta += delta; if ((abs(tod_steering_delta) >> 48) != 0) - panic("TOD clock sync offset %lli is too large to drift\n", + panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); vdso_data->arch_data.tod_steering_end = tod_steering_end; @@ -394,7 +394,7 @@ static void clock_sync_global(unsigned long long delta) * Apply clock delta to the per-CPU data structures of this CPU. * This is called for each online CPU after the call to clock_sync_global. */ -static void clock_sync_local(unsigned long long delta) +static void clock_sync_local(unsigned long delta) { /* Add the delta to the clock comparator. */ if (S390_lowcore.clock_comparator != clock_comparator_max) { @@ -418,7 +418,7 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long long clock_delta; + unsigned long clock_delta; }; /* @@ -538,7 +538,7 @@ static int stpinfo_valid(void) static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - unsigned long long clock_delta, flags; + u64 clock_delta, flags; static int first; int rc; @@ -720,8 +720,8 @@ static ssize_t ctn_id_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%016llx\n", - *(unsigned long long *) stp_info.ctnid); + ret = sprintf(buf, "%016lx\n", + *(unsigned long *) stp_info.ctnid); mutex_unlock(&stp_mutex); return ret; } @@ -794,7 +794,7 @@ static ssize_t leap_seconds_scheduled_show(struct device *dev, if (!stzi.lsoib.p) return sprintf(buf, "0,0\n"); - return sprintf(buf, "%llu,%d\n", + return sprintf(buf, "%lu,%d\n", tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, stzi.lsoib.nlso - stzi.lsoib.also); } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index e7ce447651b9..bfcc327acc6b 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -76,8 +76,6 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c } info = info->next; } - if (cpumask_empty(&mask)) - cpumask_copy(&mask, cpumask_of(cpu)); break; case TOPOLOGY_MODE_PACKAGE: cpumask_copy(&mask, cpu_present_mask); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 73c7afcc0527..f216a1b2f825 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -214,7 +214,7 @@ void vtime_flush(struct task_struct *tsk) avg_steal = S390_lowcore.avg_steal_timer / 2; if ((s64) steal > 0) { S390_lowcore.steal_timer = 0; - account_steal_time(steal); + account_steal_time(cputime_to_nsecs(steal)); avg_steal += steal; } S390_lowcore.avg_steal_timer = avg_steal; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index e3183bd05910..d548d60caed2 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1287,7 +1287,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) /* already expired? */ if (cputm >> 63) return 0; - return min(sltime, tod_to_ns(cputm)); + return min_t(u64, sltime, tod_to_ns(cputm)); } } else if (cpu_timer_interrupts_enabled(vcpu)) { sltime = kvm_s390_get_cpu_timer(vcpu); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 600881d894dd..91064077526d 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -682,16 +682,36 @@ int zpci_disable_device(struct zpci_dev *zdev) } EXPORT_SYMBOL_GPL(zpci_disable_device); -void zpci_remove_device(struct zpci_dev *zdev) +/* zpci_remove_device - Removes the given zdev from the PCI core + * @zdev: the zdev to be removed from the PCI core + * @set_error: if true the device's error state is set to permanent failure + * + * Sets a zPCI device to a configured but offline state; the zPCI + * device is still accessible through its hotplug slot and the zPCI + * API but is removed from the common code PCI bus, making it + * no longer available to drivers. + */ +void zpci_remove_device(struct zpci_dev *zdev, bool set_error) { struct zpci_bus *zbus = zdev->zbus; struct pci_dev *pdev; + if (!zdev->zbus->bus) + return; + pdev = pci_get_slot(zbus->bus, zdev->devfn); if (pdev) { - if (pdev->is_virtfn) - return zpci_iov_remove_virtfn(pdev, zdev->vfn); + if (set_error) + pdev->error_state = pci_channel_io_perm_failure; + if (pdev->is_virtfn) { + zpci_iov_remove_virtfn(pdev, zdev->vfn); + /* balance pci_get_slot */ + pci_dev_put(pdev); + return; + } pci_stop_and_remove_bus_device_locked(pdev); + /* balance pci_get_slot */ + pci_dev_put(pdev); } } @@ -765,7 +785,7 @@ void zpci_release_device(struct kref *kref) struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); if (zdev->zbus->bus) - zpci_remove_device(zdev); + zpci_remove_device(zdev, false); switch (zdev->state) { case ZPCI_FN_STATE_ONLINE: diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index b4162da4e8a2..ac0c65cdd69d 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -76,13 +76,10 @@ void zpci_event_error(void *data) static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); - struct pci_dev *pdev = NULL; enum zpci_state state; + struct pci_dev *pdev; int ret; - if (zdev && zdev->zbus->bus) - pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); - zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); @@ -124,8 +121,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0303: /* Deconfiguration requested */ if (!zdev) break; - if (pdev) - zpci_remove_device(zdev); + zpci_remove_device(zdev, false); ret = zpci_disable_device(zdev); if (ret) @@ -140,12 +136,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0304: /* Configured -> Standby|Reserved */ if (!zdev) break; - if (pdev) { - /* Give the driver a hint that the function is - * already unusable. */ - pdev->error_state = pci_channel_io_perm_failure; - zpci_remove_device(zdev); - } + /* Give the driver a hint that the function is + * already unusable. + */ + zpci_remove_device(zdev, true); zdev->fh = ccdf->fh; zpci_disable_device(zdev); diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a2433ae8a65e..4efd39aacb9f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) regs->ax = -EFAULT; instrumentation_end(); - syscall_exit_to_user_mode(regs); + local_irq_disable(); + irqentry_exit_to_user_mode(regs); return false; } diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 541fdaf64045..0051cf5c792d 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6ddeed3cd2ac..18df17129695 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -81,7 +81,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); -DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); +/* + * This one is magic, it will get called even when PMU init fails (because + * there is no PMU), in which case it should simply return NULL. + */ +DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -1944,13 +1948,6 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } -static inline struct perf_guest_switch_msr * -perf_guest_get_msrs_nop(int *nr) -{ - *nr = 0; - return NULL; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2025,7 +2022,7 @@ static int __init init_hw_perf_events(void) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) - x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop; + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; x86_pmu_static_call_update(); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5bac48d5c18e..37ce38403cb8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3659,11 +3659,16 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) + return -EINVAL; + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3676,6 +3681,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so don't account twice diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ebae1826403..d32b302719fe 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2010,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d */ if (!pebs_status && cpuc->pebs_enabled && !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; + pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index a0f839aa144d..98b4dae5e8bc 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); +int insn_fetch_from_user_inatomic(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); bool insn_decode(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 877a4025d8da..3768819693e5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -884,12 +884,29 @@ struct kvm_hv_syndbg { u64 options; }; +/* Current state of Hyper-V TSC page clocksource */ +enum hv_tsc_page_status { + /* TSC page was not set up or disabled */ + HV_TSC_PAGE_UNSET = 0, + /* TSC page MSR was written by the guest, update pending */ + HV_TSC_PAGE_GUEST_CHANGED, + /* TSC page MSR was written by KVM userspace, update pending */ + HV_TSC_PAGE_HOST_CHANGED, + /* TSC page was properly set up and is currently active */ + HV_TSC_PAGE_SET, + /* TSC page is currently being updated and therefore is inactive */ + HV_TSC_PAGE_UPDATING, + /* TSC page was set up with an inaccessible GPA */ + HV_TSC_PAGE_BROKEN, +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; + enum hv_tsc_page_status hv_tsc_page_status; /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; @@ -931,6 +948,12 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +struct kvm_x86_msr_filter { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; +}; + #define APICV_INHIBIT_REASON_DISABLE 0 #define APICV_INHIBIT_REASON_HYPERV 1 #define APICV_INHIBIT_REASON_NESTED 2 @@ -963,7 +986,7 @@ struct kvm_arch { struct kvm_pit *vpit; atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; - struct kvm_apic_map *apic_map; + struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_page_done; @@ -1025,18 +1048,13 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool bus_lock_detection_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; + struct kvm_x86_msr_filter __rcu *msr_filter; - struct { - u8 count; - bool default_allow:1; - struct msr_bitmap_range ranges[16]; - } msr_filter; - - bool bus_lock_detection_enabled; - - struct kvm_pmu_event_filter *pmu_event_filter; + struct kvm_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index dc6d149bf851..f1b9ed5efaa9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -551,15 +551,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, *size = fpu_kernel_xstate_size; } -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ - static inline void native_load_sp0(unsigned long sp0) { diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 2c35f1c01a2d..b6a9d51d1d79 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(void); void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); +void entry_SYSCALL_compat_safe_stack(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d8324a236696..409f661481e1 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -94,6 +94,8 @@ struct pt_regs { #include <asm/paravirt_types.h> #endif +#include <asm/proto.h> + struct cpuinfo_x86; struct task_struct; @@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp + +static inline bool ip_within_syscall_gap(struct pt_regs *regs) +{ + bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + +#ifdef CONFIG_IA32_EMULATION + ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && + regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); +#endif + + return ret; +} #endif static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8b58d6975d5d..0bc9b0895f33 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -58,9 +58,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "pushf; pop %0; " __ASM_CLAC "\n\t" - "1:" + ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; @@ -69,9 +68,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "push %0; popf\n\t" - "1:" + ALTERNATIVE("", "push %0; popf\n\t", + X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0d751d5da702..06b740bae431 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -205,10 +205,23 @@ static inline int arch_within_stack_frames(const void * const stack, #endif +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ + +#ifndef __ASSEMBLY__ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ + +#define arch_set_restart_data(restart) \ + do { restart->arch_data = current_thread_info()->status; } while (0) + #endif -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_32 #define in_ia32_syscall() true diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index bda4f2a36868..4f26700f314d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2342,6 +2342,11 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c3b60c37c728..73ff4dd426a8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1032,6 +1032,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 373e5fa3ce1f..51c7f5271aee 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,7 +12,7 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01ca3b4..78bb0fae3982 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); -out: - local_irq_restore(flags); + if (READ_ONCE(*ptr) == val) + safe_halt(); + + local_irq_enable(); + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa593743acf6..1fc0962c89c0 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 84c1821819af..04a780abb512 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu) cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); } -static __always_inline bool on_vc_stack(unsigned long sp) +static __always_inline bool on_vc_stack(struct pt_regs *regs) { + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); } @@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs) old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); /* Make room on the IST stack */ - if (on_vc_stack(regs->sp)) + if (on_vc_stack(regs)) new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); else new_ist = old_ist - sizeof(old_ist); @@ -248,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) int res; if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user(ctxt->regs, buffer); + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); if (!res) { ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; @@ -1248,13 +1258,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; - lockdep_assert_irqs_disabled(); - /* * Handle #DB before calling into !noinstr code to avoid recursive #DB. */ @@ -1263,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) return; } + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); instrumentation_begin(); /* @@ -1325,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) out: instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); return; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..f306e85a08a6 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { - /* - * This function is fundamentally broken as currently - * implemented. - * - * The idea is that we want to trigger a call to the - * restart_block() syscall and that we want in_ia32_syscall(), - * in_x32_syscall(), etc. to match whatever they were in the - * syscall being restarted. We assume that the syscall - * instruction at (regs->ip - 2) matches whatever syscall - * instruction we used to enter in the first place. - * - * The problem is that we can get here when ptrace pokes - * syscall-like values into regs even if we're not in a syscall - * at all. - * - * For now, we maintain historical behavior and guess based on - * stored state. We could do better by saving the actual - * syscall arch in restart_block or (with caveats on x32) by - * checking if regs->ip points to 'int $0x80'. The current - * behavior is incorrect if a tracer has a different bitness - * than the tracee. - */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7f5aec758f0e..ac1874a2a70e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ - if (regs->ip >= (unsigned long)entry_SYSCALL_64 && - regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + if (ip_within_syscall_gap(regs)) { sp = this_cpu_read(cpu_current_top_of_stack); goto sync; } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2a1d47f47eee..a1202536fc57 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -13,7 +13,7 @@ #define orc_warn_current(args...) \ ({ \ - if (state->task == current) \ + if (state->task == current && !state->error) \ orc_warn(args); \ }) @@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off, return false; if (state->full_regs) { - *val = ((unsigned long *)state->regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); return true; } if (state->prev_regs) { - *val = ((unsigned long *)state->prev_regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); return true; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 58fa8c029867..f98370a39936 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -520,10 +520,10 @@ static u64 get_time_ref_counter(struct kvm *kvm) u64 tsc; /* - * The guest has not set up the TSC page or the clock isn't - * stable, fall back to get_kvmclock_ns. + * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, + * is broken, disabled or being updated. */ - if (!hv->tsc_ref.tsc_sequence) + if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); @@ -1077,6 +1077,21 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, return true; } +/* + * Don't touch TSC page values if the guest has opted for TSC emulation after + * migration. KVM doesn't fully support reenlightenment notifications and TSC + * access emulation and Hyper-V is known to expect the values in TSC page to + * stay constant before TSC access emulation is disabled from guest side + * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC + * frequency and guest visible TSC value across migration (and prevent it when + * TSC scaling is unsupported). + */ +static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) +{ + return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && + hv->hv_tsc_emulation_control; +} + void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { @@ -1087,7 +1102,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) return; mutex_lock(&hv->hv_lock); @@ -1101,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) + goto out_err; + + if (tsc_seq && tsc_page_update_unsafe(hv)) { + if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; + } /* * While we're computing and writing the parameters, force the @@ -1110,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - goto out_unlock; + goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - goto out_unlock; + goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - goto out_unlock; + goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. @@ -1131,8 +1155,45 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; - kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; + goto out_unlock; + +out_err: + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; +out_unlock: + mutex_unlock(&hv->hv_lock); +} + +void kvm_hv_invalidate_tsc_page(struct kvm *kvm) +{ + struct kvm_hv *hv = to_kvm_hv(kvm); + u64 gfn; + + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || + tsc_page_update_unsafe(hv)) + return; + + mutex_lock(&hv->hv_lock); + + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + goto out_unlock; + + /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */ + if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET) + hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING; + + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + + hv->tsc_ref.tsc_sequence = 0; + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + out_unlock: mutex_unlock(&hv->hv_lock); } @@ -1193,8 +1254,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { + if (!host) + hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; + else + hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + } else { + hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; + } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(kvm, @@ -1229,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: + if (data && !host) + return 1; + hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e951af1fcb2c..60547d5cb6d7 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -133,6 +133,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock); +void kvm_hv_invalidate_tsc_page(struct kvm *kvm); void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bfacb7c..cc369b9ad8f1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - kvm_wait_lapic_expire(vcpu); + /* + * Ensure the guest's timer has truly expired before posting an + * interrupt. Open code the relevant checks to avoid querying + * lapic_timer_int_injected(), which will be false since the + * interrupt isn't yet injected. Waiting until after injecting + * is not an option since that won't help a posted interrupt. + */ + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } @@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ec4fc28b325a..1f6f98c76bdf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) { /* diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index e5f148106e20..b3ed302c1a35 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -21,6 +21,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) } /* + * Return the TDP iterator to the root PT and allow it to continue its + * traversal over the paging structure from there. + */ +void tdp_iter_restart(struct tdp_iter *iter) +{ + iter->yielded_gfn = iter->next_last_level_gfn; + iter->level = iter->root_level; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ @@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; - iter->yielded_gfn = iter->next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->level = root_level; - iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; - - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); - tdp_iter_refresh_sptep(iter); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; + iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); - iter->valid = true; + tdp_iter_restart(iter); } /* @@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) -{ - return iter->pt_path[iter->root_level - 1]; -} - diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 4cc177d75c4a..b1748b988d3a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -36,6 +36,8 @@ struct tdp_iter { int min_level; /* The iterator's current level within the paging structure */ int level; + /* The address space ID, i.e. SMM vs. regular. */ + int as_id; /* A snapshot of the value at sptep */ u64 old_spte; /* @@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); +void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c926c6b899a1..462b1f71c77f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) -{ - return sp->role.smm ? 1 : 0; -} - static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); @@ -301,11 +296,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * * Given a page table that has been removed from the TDP paging structure, * iterates through the page table to clear SPTEs and free child page tables. + * + * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU + * protection. Since this thread removed it from the paging structure, + * this thread will be responsible for ensuring the page is freed. Hence the + * early rcu_dereferences in the function. */ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, +static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, bool shared) { - struct kvm_mmu_page *sp = sptep_to_sp(pt); + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; u64 old_child_spte; @@ -318,7 +318,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = pt + i; + sptep = rcu_dereference(pt) + i; gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); if (shared) { @@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, cpu_relax(); } } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; /* * Marking the SPTE as a removed SPTE is not @@ -481,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - u64 *root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -498,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, new_spte) != iter->old_spte) return false; - handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, true); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return true; } @@ -527,7 +534,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * here since the SPTE is going from non-present * to non-present. */ - WRITE_ONCE(*iter->sptep, 0); + WRITE_ONCE(*rcu_dereference(iter->sptep), 0); return true; } @@ -553,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - tdp_ptep_t root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -570,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); - __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, false); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level); } @@ -648,9 +651,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, - iter->next_last_level_gfn); + tdp_iter_restart(iter); return true; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index baee91c1e936..58a45bb139f8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs { { .index = MSR_INVALID, .always = false }, }; -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -bool npt_enabled = true; -#else -bool npt_enabled; -#endif - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444); static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; module_param(pause_filter_count_max, ushort, 0444); -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); +/* + * Use nested page tables by default. Note, NPT may get forced off by + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. + */ +bool npt_enabled = true; +module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; @@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void) goto err; } - if (!boot_cpu_has(X86_FEATURE_NPT)) + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) npt_enabled = false; - if (npt_enabled && !npt) + if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 50810d471462..32cf8287d4a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6580,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) int i, nr_msrs; struct perf_guest_switch_msr *msrs; + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs); - if (!msrs) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a20ce60152e..fe806e894212 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1526,35 +1526,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; struct kvm *kvm = vcpu->kvm; - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; - u32 count = kvm->arch.msr_filter.count; - u32 i; - bool r = kvm->arch.msr_filter.default_allow; + bool allowed; int idx; + u32 i; - /* MSR filtering not set up or x2APIC enabled, allow everything */ - if (!count || (index >= 0x800 && index <= 0x8ff)) + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) return true; - /* Prevent collision with set_msr_filter */ idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < count; i++) { + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { u32 start = ranges[i].base; u32 end = start + ranges[i].nmsrs; u32 flags = ranges[i].flags; unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - r = !!test_bit(index - start, bitmap); + allowed = !!test_bit(index - start, bitmap); break; } } +out: srcu_read_unlock(&kvm->srcu, idx); - return r; + return allowed; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -2551,6 +2560,8 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + kvm_hv_invalidate_tsc_page(kvm); + spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ @@ -5352,25 +5363,34 @@ split_irqchip_unlock: return r; } -static void kvm_clear_msr_filter(struct kvm *kvm) +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} + +static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) { u32 i; - u32 count = kvm->arch.msr_filter.count; - struct msr_bitmap_range ranges[16]; - mutex_lock(&kvm->lock); - kvm->arch.msr_filter.count = 0; - memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); - mutex_unlock(&kvm->lock); - synchronize_srcu(&kvm->srcu); + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); - for (i = 0; i < count; i++) - kfree(ranges[i].bitmap); + kfree(msr_filter); } -static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) { - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; struct msr_bitmap_range range; unsigned long *bitmap = NULL; size_t bitmap_size; @@ -5404,11 +5424,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user goto err; } - /* Everything ok, add this range identifier to our global pool */ - ranges[kvm->arch.msr_filter.count] = range; - /* Make sure we filled the array before we tell anyone to walk it */ - smp_wmb(); - kvm->arch.msr_filter.count++; + /* Everything ok, add this range identifier. */ + msr_filter->ranges[msr_filter->count] = range; + msr_filter->count++; return 0; err: @@ -5419,10 +5437,11 @@ err: static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) { struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_x86_msr_filter *new_filter, *old_filter; struct kvm_msr_filter filter; bool default_allow; - int r = 0; bool empty = true; + int r = 0; u32 i; if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) @@ -5435,25 +5454,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (empty && !default_allow) return -EINVAL; - kvm_clear_msr_filter(kvm); - - kvm->arch.msr_filter.default_allow = default_allow; + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; - /* - * Protect from concurrent calls to this function that could trigger - * a TOCTOU violation on kvm->arch.msr_filter.count. - */ - mutex_lock(&kvm->lock); for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(kvm, &filter.ranges[i]); - if (r) - break; + r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } } + mutex_lock(&kvm->lock); + + /* The per-VM filter is protected by kvm->lock... */ + old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); + + rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); mutex_unlock(&kvm->lock); - return r; + return 0; } long kvm_arch_vm_ioctl(struct file *filp, @@ -6603,7 +6629,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); @@ -10601,7 +10627,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } else { if (!slot || !slot->npages) - return 0; + return NULL; old_npages = slot->npages; hva = slot->userspace_addr; @@ -10634,8 +10660,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - u32 i; - if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10651,8 +10675,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } static_call_cond(kvm_x86_vm_destroy)(kvm); - for (i = 0; i < kvm->arch.msr_filter.count; i++) - kfree(kvm->arch.msr_filter.ranges[i].bitmap); + kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4229950a5d78..bb0b3fe1e0a0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } +static unsigned long insn_get_effective_ip(struct pt_regs *regs) +{ + unsigned long seg_base = 0; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + return seg_base + regs->ip; +} + /** * insn_fetch_from_user() - Copy instruction bytes from user-space memory * @regs: Structure with register values as seen when entering kernel mode @@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { - unsigned long seg_base = 0; + unsigned long ip; int not_copied; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) { - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - if (seg_base == -1L) - return 0; - } + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; + + not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory + * while in atomic code + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. This function must be used in atomic context. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long ip; + int not_copied; + + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - MAX_INSN_SIZE); + not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); return MAX_INSN_SIZE - not_copied; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6926d0ca6c71..b35fc8023884 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2011,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { @@ -2033,9 +2044,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ @@ -2225,7 +2244,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2317,7 +2336,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 1ac8578258af..b42bfdab01a9 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -27,7 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>"); MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); -MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); static bool force; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index a3cc33091f46..17d80f751fcb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -741,7 +741,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, map_ops[i].status = GNTST_general_error; unmap[0].host_addr = map_ops[i].host_addr, unmap[0].handle = map_ops[i].handle; - map_ops[i].handle = ~0; + map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr; else @@ -751,7 +751,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, kmap_ops[i].status = GNTST_general_error; unmap[1].host_addr = kmap_ops[i].host_addr, unmap[1].handle = kmap_ops[i].handle; - kmap_ops[i].handle = ~0; + kmap_ops[i].handle = INVALID_GRANT_HANDLE; if (kmap_ops[i].flags & GNTMAP_device_map) unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr; else @@ -776,7 +776,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, out: return ret; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -802,7 +801,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); #ifdef CONFIG_XEN_DEBUG_FS #include <linux/debugfs.h> |